Merge pull request #469 from WrightonLabCSU/bugfix/distill-bugs

madeline-scyphers · web-flow · commit 79dd74db3e60 · 2025-11-21T00:01:01.000-07:00
bugfix: Fix distill bugs with genome stats and metabolism summary
diff --git a/bin/distill.py b/bin/distill.py
@@ -113,52 +113,12 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN):
     return rrna_frame
 
 
-def summarize_trnas(trnas_df, groupby_column=FASTA_COLUMN):
-    # first build the frame
-    combos = {(line.type, line.codon, line.note) for _, line in trnas_df.iterrows()}
-    frame_rows = list()
-    for combo in combos:
-        if combo[2] == 'pseudo':
-            gene_id = '%s, pseudo (%s)'
-            gene_description = '%s pseudo tRNA with %s Codon'
-        else:
-            gene_id = '%s (%s)'
-            gene_description = '%s tRNA with %s Codon'
-        gene_id = gene_id % (combo[0], combo[1])
-        gene_description = gene_description % (combo[0], combo[1])
-        module_description = '%s tRNA' % combo[0]
-        frame_rows.append([gene_id, gene_description, module_description, 'tRNA', 'tRNA', ''])
-    trna_frame = pd.DataFrame(frame_rows, columns=FRAME_COLUMNS)
-    trna_frame = trna_frame.sort_values(COL_GENE_ID)
-    # then fill it in
-    trna_frame = trna_frame.set_index(COL_GENE_ID)
-    for group, frame in trnas_df.groupby(groupby_column):
-        gene_ids = list()
-        for index, line in frame.iterrows():
-            if line.note == 'pseudo':
-                gene_id = '%s, pseudo (%s)'
-            else:
-                gene_id = '%s (%s)'
-            gene_ids.append(gene_id % (line.type, line.codon))
-        trna_frame[group] = pd.Series(Counter(gene_ids))
-    trna_frame = trna_frame.reset_index()
-    trna_frame = trna_frame.fillna(0)
-    return trna_frame
-
-
-def make_genome_summary(annotations, genome_summary_frame, logger, trna_frame=None, rrna_frame=None, groupby_column=FASTA_COLUMN):
+def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN):
+    
     summary_frames = list()
     # get ko summaries
     summary_frames.append(fill_genome_summary_frame(annotations, genome_summary_frame.copy(), groupby_column, logger))
 
-    # add rRNAs
-    if rrna_frame is not None:
-        summary_frames.append(summarize_rrnas(rrna_frame, groupby_column))
-
-    # add tRNAs
-    if trna_frame is not None:
-        summary_frames.append(summarize_trnas(trna_frame, groupby_column))
-
     # merge summary frames
     summarized_genomes = pd.concat(summary_frames, sort=False)
     return summarized_genomes
@@ -187,7 +147,7 @@ def split_names_to_long(col:pd.Series):
     return pd.DataFrame(splits, columns=col_names, index=dex).fillna('')
 
 
-def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
+def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_frames=tuple()):
     # turn all this into an xlsx
     with pd.ExcelWriter(output_file) as writer:
         for sheet, frame in summarized_genomes.groupby(COL_SHEET, sort=False):
@@ -197,6 +157,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
             split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1)
             frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS],  split_genes], axis=1)
             frame.to_excel(writer, sheet_name=sheet, index=False)
+        for extra_frame in extra_frames:
+            if extra_frame is not None and not extra_frame.empty:
+                extra_frame.to_excel(writer, sheet_name=extra_frame[COL_SHEET].iloc[0], index=False)
 
 
 # TODO: add assembly stats like N50, longest contig, total assembled length etc
@@ -311,7 +274,6 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
     else:
         quast_frame = pd.read_csv(quast_path, sep='\t')
 
-
     distil_sheets_names = []
     if "default" in distil_topics:
         distil_sheets_names = [
@@ -371,10 +333,9 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
         summarized_genomes = fill_genome_summary_frame_gene_names(annotations, genome_summary_form, groupby_column, logger)
     else:
         logger.info(f'distillate_gene_names flag is {distillate_gene_names}. Giving counts instead of gene names in genome metabolism summary')
-        summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, trna_frame, rrna_frame,
-                                                 groupby_column)
+        summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, groupby_column)
     summarized_genomes.to_csv('summarized_genomes.tsv', sep='\t', index=None)
-    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
+    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary, extra_frames=[rrna_frame, trna_frame])
     logger.info('Generated genome metabolism summary')
 
     
diff --git a/modules/local/collect_rna/rrna_collect.nf b/modules/local/collect_rna/rrna_collect.nf
@@ -53,7 +53,7 @@ process RRNA_COLLECT {
     else:
         collected_data = []
         for gene_id, input_fastas_counts in gene_type_counts.items():
-            row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': '', 'subcategory': ''}
+            row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': 'rRNA', 'subcategory': ''}
             for input_fasta in input_fastas: row[input_fasta] = input_fastas_counts.get(input_fasta, 0)
             collected_data.append(row)
         collected_df = pd.DataFrame(collected_data)
diff --git a/modules/local/distill/distill.nf b/modules/local/distill/distill.nf
@@ -8,11 +8,12 @@ process SUMMARIZE {
 
     input:
     path( ch_combined_annotations, stageAs: "raw-annotations.tsv" )
-    path( ch_rrna_combined, stageAs: "rrna_combined.tsv" )
-    path( ch_trna_combined, stageAs: "trna_combined.tsv" )
+    path( ch_rrna_collected, stageAs: "rrna_combined.tsv" )
+    path( ch_trna_collected, stageAs: "trna_combined.tsv" )
+    path( ch_quast_stats )
     val( distill_topic )
     val( distill_ecosystem )
-    path( distill_custom )
+    val( distill_custom )
 
     output:
     path( "metabolism_summary.xlsx" ), emit: distillate
@@ -25,7 +26,7 @@ process SUMMARIZE {
     # export constants for script
     export FASTA_COLUMN="${params.CONSTANTS.FASTA_COLUMN}"
 
-    distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_combined}' --trna_path '${ch_trna_combined}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}"
+    distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_collected}' --trna_path '${ch_trna_collected}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}" --quast_path '${ch_quast_stats}'
 
     """
 }
diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf
@@ -29,8 +29,8 @@ workflow ANNOTATE {
 
     main:
     n_fastas = 0
-    ch_rrna_combined = default_sheet
-    ch_trna_combined = default_sheet
+    ch_rrna_collected = default_sheet
+    ch_trna_collected = default_sheet
     ch_combined_annotations = default_sheet
 
     if (params.rename || call) {
@@ -95,14 +95,15 @@ workflow ANNOTATE {
 
     if (params.qc){
         QC( ch_fasta, default_sheet, ch_combined_annotations, ch_collected_fna, call )
-        ch_rrna_combined = QC.out.ch_rrna_combined
-        ch_trna_combined = QC.out.ch_trna_combined
+        ch_rrna_collected = QC.out.ch_rrna_collected
+        ch_trna_collected = QC.out.ch_trna_collected
         ch_combined_annotations = QC.out.ch_final_annots
     }
 
     emit:
-    ch_rrna_combined
-    ch_trna_combined
+    ch_rrna_collected
+    ch_trna_collected
     ch_combined_annotations
+    ch_quast_stats
 
 }
diff --git a/subworkflows/local/collect_rna.nf b/subworkflows/local/collect_rna.nf
@@ -72,28 +72,28 @@ workflow COLLECT_RNA {
         // Create sheet for rrnas from the collected rRNAs or provided rRNAs
         // Run RRNA_COLLECT to generate a combined TSV for all fastas
         RRNA_COLLECT( ch_collected_rRNAs )
-        ch_rrna_sheet = RRNA_COLLECT.out.rrna_collected_out
+        ch_rrna_collected = RRNA_COLLECT.out.rrna_collected_out
         ch_rrna_combined = RRNA_COLLECT.out.rrna_combined_out
     } else {
-        ch_rrna_sheet = default_sheet
+        ch_rrna_collected = default_sheet
         ch_rrna_combined = default_sheet
     }
     if (run_trna_collect) {
         // Create sheet for trnas from the collected tRNAs or provided tRNAs
         // Run TRNA_COLLECT to generate a combined TSV for all fastas
         TRNA_COLLECT( ch_collected_tRNAs )
-        ch_trna_sheet = TRNA_COLLECT.out.trna_collected_out
+        ch_trna_collected = TRNA_COLLECT.out.trna_collected_out
         ch_trna_combined = TRNA_COLLECT.out.trna_combined_out
     } else {
-        ch_trna_sheet = default_sheet
+        ch_trna_collected = default_sheet
         ch_trna_combined = default_sheet
     }
 
 
     emit:
-    ch_rrna_sheet
+    ch_rrna_collected
     ch_rrna_combined
-    ch_trna_sheet
+    ch_trna_collected
     ch_trna_combined
 
 }
diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf
@@ -20,8 +20,8 @@ workflow QC {
     main:
 
     COLLECT_RNA( ch_fasta, default_sheet, call )
-    ch_rrna_combined = COLLECT_RNA.out.ch_rrna_combined
-    ch_trna_combined = COLLECT_RNA.out.ch_trna_combined
+    ch_rrna_collected = COLLECT_RNA.out.ch_rrna_collected
+    ch_trna_collected = COLLECT_RNA.out.ch_trna_collected
 
 
     // Add Bin Quality to annotations
@@ -71,6 +71,6 @@ workflow QC {
 
     emit:
     ch_final_annots
-    ch_rrna_combined
-    ch_trna_combined
+    ch_rrna_collected
+    ch_trna_collected
 }
diff --git a/workflows/dram.nf b/workflows/dram.nf
@@ -252,8 +252,9 @@ workflow DRAM {
 
                 SUMMARIZE(
                     ch_final_annots,
-                    ANNOTATE.out.ch_rrna_combined,
-                    ANNOTATE.out.ch_trna_combined,
+                    ANNOTATE.out.ch_rrna_collected,
+                    ANNOTATE.out.ch_trna_collected,
+                    ANNOTATE.out.ch_quast_stats,
                     distill_topic,
                     distill_ecosystem,
                     distill_custom