bugfix: Fix distill bugs with genome stats and metabolism summary

madeline-scyphers · madeline-scyphers · commit 419322b0d3b5 · 2025-11-20T23:57:05.000-07:00
genome_stats would error out if certain arguments were not provided.
Also fixed that the metabolism summary was adding extra columns to
some tabs that were not needed.
Also fixed that quast stats were not being passed to the distill script.
diff --git a/bin/distill.py b/bin/distill.py
@@ -113,52 +113,12 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN):
     return rrna_frame
 
 
-def summarize_trnas(trnas_df, groupby_column=FASTA_COLUMN):
-    # first build the frame
-    combos = {(line.type, line.codon, line.note) for _, line in trnas_df.iterrows()}
-    frame_rows = list()
-    for combo in combos:
-        if combo[2] == 'pseudo':
-            gene_id = '%s, pseudo (%s)'
-            gene_description = '%s pseudo tRNA with %s Codon'
-        else:
-            gene_id = '%s (%s)'
-            gene_description = '%s tRNA with %s Codon'
-        gene_id = gene_id % (combo[0], combo[1])
-        gene_description = gene_description % (combo[0], combo[1])
-        module_description = '%s tRNA' % combo[0]
-        frame_rows.append([gene_id, gene_description, module_description, 'tRNA', 'tRNA', ''])
-    trna_frame = pd.DataFrame(frame_rows, columns=FRAME_COLUMNS)
-    trna_frame = trna_frame.sort_values(COL_GENE_ID)
-    # then fill it in
-    trna_frame = trna_frame.set_index(COL_GENE_ID)
-    for group, frame in trnas_df.groupby(groupby_column):
-        gene_ids = list()
-        for index, line in frame.iterrows():
-            if line.note == 'pseudo':
-                gene_id = '%s, pseudo (%s)'
-            else:
-                gene_id = '%s (%s)'
-            gene_ids.append(gene_id % (line.type, line.codon))
-        trna_frame[group] = pd.Series(Counter(gene_ids))
-    trna_frame = trna_frame.reset_index()
-    trna_frame = trna_frame.fillna(0)
-    return trna_frame
-
-
-def make_genome_summary(annotations, genome_summary_frame, logger, trna_frame=None, rrna_frame=None, groupby_column=FASTA_COLUMN):
+def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN):
+    
     summary_frames = list()
     # get ko summaries
     summary_frames.append(fill_genome_summary_frame(annotations, genome_summary_frame.copy(), groupby_column, logger))
 
-    # add rRNAs
-    if rrna_frame is not None:
-        summary_frames.append(summarize_rrnas(rrna_frame, groupby_column))
-
-    # add tRNAs
-    if trna_frame is not None:
-        summary_frames.append(summarize_trnas(trna_frame, groupby_column))
-
     # merge summary frames
     summarized_genomes = pd.concat(summary_frames, sort=False)
     return summarized_genomes
@@ -187,7 +147,7 @@ def split_names_to_long(col:pd.Series):
     return pd.DataFrame(splits, columns=col_names, index=dex).fillna('')
 
 
-def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
+def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_frames=tuple()):
     # turn all this into an xlsx
     with pd.ExcelWriter(output_file) as writer:
         for sheet, frame in summarized_genomes.groupby(COL_SHEET, sort=False):
@@ -197,6 +157,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
             split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1)
             frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS],  split_genes], axis=1)
             frame.to_excel(writer, sheet_name=sheet, index=False)
+        for extra_frame in extra_frames:
+            if extra_frame is not None and not extra_frame.empty:
+                extra_frame.to_excel(writer, sheet_name=extra_frame[COL_SHEET].iloc[0], index=False)
 
 
 # TODO: add assembly stats like N50, longest contig, total assembled length etc
@@ -311,7 +274,6 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
     else:
         quast_frame = pd.read_csv(quast_path, sep='\t')
 
-
     distil_sheets_names = []
     if "default" in distil_topics:
         distil_sheets_names = [
@@ -371,10 +333,9 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
         summarized_genomes = fill_genome_summary_frame_gene_names(annotations, genome_summary_form, groupby_column, logger)
     else:
         logger.info(f'distillate_gene_names flag is {distillate_gene_names}. Giving counts instead of gene names in genome metabolism summary')
-        summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, trna_frame, rrna_frame,
-                                                 groupby_column)
+        summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, groupby_column)
     summarized_genomes.to_csv('summarized_genomes.tsv', sep='\t', index=None)
-    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
+    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary, extra_frames=[rrna_frame, trna_frame])
     logger.info('Generated genome metabolism summary')
 
     
diff --git a/modules/local/collect_rna/rrna_collect.nf b/modules/local/collect_rna/rrna_collect.nf
@@ -53,7 +53,7 @@ process RRNA_COLLECT {
     else:
         collected_data = []
         for gene_id, input_fastas_counts in gene_type_counts.items():
-            row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': '', 'subcategory': ''}
+            row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': 'rRNA', 'subcategory': ''}
             for input_fasta in input_fastas: row[input_fasta] = input_fastas_counts.get(input_fasta, 0)
             collected_data.append(row)
         collected_df = pd.DataFrame(collected_data)
diff --git a/modules/local/distill/distill.nf b/modules/local/distill/distill.nf
@@ -8,11 +8,12 @@ process SUMMARIZE {
 
     input:
     path( ch_combined_annotations, stageAs: "raw-annotations.tsv" )
-    path( ch_rrna_combined, stageAs: "rrna_combined.tsv" )
-    path( ch_trna_combined, stageAs: "trna_combined.tsv" )
+    path( ch_rrna_collected, stageAs: "rrna_combined.tsv" )
+    path( ch_trna_collected, stageAs: "trna_combined.tsv" )
+    path( ch_quast_stats )
     val( distill_topic )
     val( distill_ecosystem )
-    path( distill_custom )
+    val( distill_custom )
 
     output:
     path( "metabolism_summary.xlsx" ), emit: distillate
@@ -25,7 +26,7 @@ process SUMMARIZE {
     # export constants for script
     export FASTA_COLUMN="${params.CONSTANTS.FASTA_COLUMN}"
 
-    distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_combined}' --trna_path '${ch_trna_combined}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}"
+    distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_collected}' --trna_path '${ch_trna_collected}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}" --quast_path '${ch_quast_stats}'
 
     """
 }
diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf
@@ -29,8 +29,8 @@ workflow ANNOTATE {
 
     main:
     n_fastas = 0
-    ch_rrna_combined = default_sheet
-    ch_trna_combined = default_sheet
+    ch_rrna_collected = default_sheet
+    ch_trna_collected = default_sheet
     ch_combined_annotations = default_sheet
 
     if (params.rename || call) {
@@ -95,14 +95,15 @@ workflow ANNOTATE {
 
     if (params.qc){
         QC( ch_fasta, default_sheet, ch_combined_annotations, ch_collected_fna, call )
-        ch_rrna_combined = QC.out.ch_rrna_combined
-        ch_trna_combined = QC.out.ch_trna_combined
+        ch_rrna_collected = QC.out.ch_rrna_collected
+        ch_trna_collected = QC.out.ch_trna_collected
         ch_combined_annotations = QC.out.ch_final_annots
     }
 
     emit:
-    ch_rrna_combined
-    ch_trna_combined
+    ch_rrna_collected
+    ch_trna_collected
     ch_combined_annotations
+    ch_quast_stats
 
 }
diff --git a/subworkflows/local/collect_rna.nf b/subworkflows/local/collect_rna.nf
@@ -72,28 +72,28 @@ workflow COLLECT_RNA {
         // Create sheet for rrnas from the collected rRNAs or provided rRNAs
         // Run RRNA_COLLECT to generate a combined TSV for all fastas
         RRNA_COLLECT( ch_collected_rRNAs )
-        ch_rrna_sheet = RRNA_COLLECT.out.rrna_collected_out
+        ch_rrna_collected = RRNA_COLLECT.out.rrna_collected_out
         ch_rrna_combined = RRNA_COLLECT.out.rrna_combined_out
     } else {
-        ch_rrna_sheet = default_sheet
+        ch_rrna_collected = default_sheet
         ch_rrna_combined = default_sheet
     }
     if (run_trna_collect) {
         // Create sheet for trnas from the collected tRNAs or provided tRNAs
         // Run TRNA_COLLECT to generate a combined TSV for all fastas
         TRNA_COLLECT( ch_collected_tRNAs )
-        ch_trna_sheet = TRNA_COLLECT.out.trna_collected_out
+        ch_trna_collected = TRNA_COLLECT.out.trna_collected_out
         ch_trna_combined = TRNA_COLLECT.out.trna_combined_out
     } else {
-        ch_trna_sheet = default_sheet
+        ch_trna_collected = default_sheet
         ch_trna_combined = default_sheet
     }
 
 
     emit:
-    ch_rrna_sheet
+    ch_rrna_collected
     ch_rrna_combined
-    ch_trna_sheet
+    ch_trna_collected
     ch_trna_combined
 
 }
diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf
@@ -20,8 +20,8 @@ workflow QC {
     main:
 
     COLLECT_RNA( ch_fasta, default_sheet, call )
-    ch_rrna_combined = COLLECT_RNA.out.ch_rrna_combined
-    ch_trna_combined = COLLECT_RNA.out.ch_trna_combined
+    ch_rrna_collected = COLLECT_RNA.out.ch_rrna_collected
+    ch_trna_collected = COLLECT_RNA.out.ch_trna_collected
 
 
     // Add Bin Quality to annotations
@@ -71,6 +71,6 @@ workflow QC {
 
     emit:
     ch_final_annots
-    ch_rrna_combined
-    ch_trna_combined
+    ch_rrna_collected
+    ch_trna_collected
 }
diff --git a/workflows/dram.nf b/workflows/dram.nf
@@ -252,8 +252,9 @@ workflow DRAM {
 
                 SUMMARIZE(
                     ch_final_annots,
-                    ANNOTATE.out.ch_rrna_combined,
-                    ANNOTATE.out.ch_trna_combined,
+                    ANNOTATE.out.ch_rrna_collected,
+                    ANNOTATE.out.ch_trna_collected,
+                    ANNOTATE.out.ch_quast_stats,
                     distill_topic,
                     distill_ecosystem,
                     distill_custom