Skip to content

Commit 419322b

Browse files
bugfix: Fix distill bugs with genome stats and metabolism summary
genome_stats would error out if certain arguments were not provided. Also fixed the metabolism summary adding extra columns that were not needed to some tabs, and fixed quast stats not being passed to the distill script.
1 parent 3c20b59 commit 419322b

7 files changed

Lines changed: 34 additions & 70 deletions

File tree

bin/distill.py

Lines changed: 8 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -113,52 +113,12 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN):
113113
return rrna_frame
114114

115115

116-
def summarize_trnas(trnas_df, groupby_column=FASTA_COLUMN):
117-
# first build the frame
118-
combos = {(line.type, line.codon, line.note) for _, line in trnas_df.iterrows()}
119-
frame_rows = list()
120-
for combo in combos:
121-
if combo[2] == 'pseudo':
122-
gene_id = '%s, pseudo (%s)'
123-
gene_description = '%s pseudo tRNA with %s Codon'
124-
else:
125-
gene_id = '%s (%s)'
126-
gene_description = '%s tRNA with %s Codon'
127-
gene_id = gene_id % (combo[0], combo[1])
128-
gene_description = gene_description % (combo[0], combo[1])
129-
module_description = '%s tRNA' % combo[0]
130-
frame_rows.append([gene_id, gene_description, module_description, 'tRNA', 'tRNA', ''])
131-
trna_frame = pd.DataFrame(frame_rows, columns=FRAME_COLUMNS)
132-
trna_frame = trna_frame.sort_values(COL_GENE_ID)
133-
# then fill it in
134-
trna_frame = trna_frame.set_index(COL_GENE_ID)
135-
for group, frame in trnas_df.groupby(groupby_column):
136-
gene_ids = list()
137-
for index, line in frame.iterrows():
138-
if line.note == 'pseudo':
139-
gene_id = '%s, pseudo (%s)'
140-
else:
141-
gene_id = '%s (%s)'
142-
gene_ids.append(gene_id % (line.type, line.codon))
143-
trna_frame[group] = pd.Series(Counter(gene_ids))
144-
trna_frame = trna_frame.reset_index()
145-
trna_frame = trna_frame.fillna(0)
146-
return trna_frame
147-
148-
149-
def make_genome_summary(annotations, genome_summary_frame, logger, trna_frame=None, rrna_frame=None, groupby_column=FASTA_COLUMN):
116+
def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN):
117+
150118
summary_frames = list()
151119
# get ko summaries
152120
summary_frames.append(fill_genome_summary_frame(annotations, genome_summary_frame.copy(), groupby_column, logger))
153121

154-
# add rRNAs
155-
if rrna_frame is not None:
156-
summary_frames.append(summarize_rrnas(rrna_frame, groupby_column))
157-
158-
# add tRNAs
159-
if trna_frame is not None:
160-
summary_frames.append(summarize_trnas(trna_frame, groupby_column))
161-
162122
# merge summary frames
163123
summarized_genomes = pd.concat(summary_frames, sort=False)
164124
return summarized_genomes
@@ -187,7 +147,7 @@ def split_names_to_long(col:pd.Series):
187147
return pd.DataFrame(splits, columns=col_names, index=dex).fillna('')
188148

189149

190-
def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
150+
def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_frames=tuple()):
191151
# turn all this into an xlsx
192152
with pd.ExcelWriter(output_file) as writer:
193153
for sheet, frame in summarized_genomes.groupby(COL_SHEET, sort=False):
@@ -197,6 +157,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
197157
split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1)
198158
frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS], split_genes], axis=1)
199159
frame.to_excel(writer, sheet_name=sheet, index=False)
160+
for extra_frame in extra_frames:
161+
if extra_frame is not None and not extra_frame.empty:
162+
extra_frame.to_excel(writer, sheet_name=extra_frame[COL_SHEET].iloc[0], index=False)
200163

201164

202165
# TODO: add assembly stats like N50, longest contig, total assembled length etc
@@ -311,7 +274,6 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
311274
else:
312275
quast_frame = pd.read_csv(quast_path, sep='\t')
313276

314-
315277
distil_sheets_names = []
316278
if "default" in distil_topics:
317279
distil_sheets_names = [
@@ -371,10 +333,9 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
371333
summarized_genomes = fill_genome_summary_frame_gene_names(annotations, genome_summary_form, groupby_column, logger)
372334
else:
373335
logger.info(f'distillate_gene_names flag is {distillate_gene_names}. Giving counts instead of gene names in genome metabolism summary')
374-
summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, trna_frame, rrna_frame,
375-
groupby_column)
336+
summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, groupby_column)
376337
summarized_genomes.to_csv('summarized_genomes.tsv', sep='\t', index=None)
377-
write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
338+
write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary, extra_frames=[rrna_frame, trna_frame])
378339
logger.info('Generated genome metabolism summary')
379340

380341

modules/local/collect_rna/rrna_collect.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ process RRNA_COLLECT {
5353
else:
5454
collected_data = []
5555
for gene_id, input_fastas_counts in gene_type_counts.items():
56-
row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': '', 'subcategory': ''}
56+
row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': 'rRNA', 'subcategory': ''}
5757
for input_fasta in input_fastas: row[input_fasta] = input_fastas_counts.get(input_fasta, 0)
5858
collected_data.append(row)
5959
collected_df = pd.DataFrame(collected_data)

modules/local/distill/distill.nf

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@ process SUMMARIZE {
88

99
input:
1010
path( ch_combined_annotations, stageAs: "raw-annotations.tsv" )
11-
path( ch_rrna_combined, stageAs: "rrna_combined.tsv" )
12-
path( ch_trna_combined, stageAs: "trna_combined.tsv" )
11+
path( ch_rrna_collected, stageAs: "rrna_combined.tsv" )
12+
path( ch_trna_collected, stageAs: "trna_combined.tsv" )
13+
path( ch_quast_stats )
1314
val( distill_topic )
1415
val( distill_ecosystem )
15-
path( distill_custom )
16+
val( distill_custom )
1617

1718
output:
1819
path( "metabolism_summary.xlsx" ), emit: distillate
@@ -25,7 +26,7 @@ process SUMMARIZE {
2526
# export constants for script
2627
export FASTA_COLUMN="${params.CONSTANTS.FASTA_COLUMN}"
2728
28-
distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_combined}' --trna_path '${ch_trna_combined}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}"
29+
distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_collected}' --trna_path '${ch_trna_collected}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}" --quast_path '${ch_quast_stats}'
2930
3031
"""
3132
}

subworkflows/local/annotate.nf

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ workflow ANNOTATE {
2929

3030
main:
3131
n_fastas = 0
32-
ch_rrna_combined = default_sheet
33-
ch_trna_combined = default_sheet
32+
ch_rrna_collected = default_sheet
33+
ch_trna_collected = default_sheet
3434
ch_combined_annotations = default_sheet
3535

3636
if (params.rename || call) {
@@ -95,14 +95,15 @@ workflow ANNOTATE {
9595

9696
if (params.qc){
9797
QC( ch_fasta, default_sheet, ch_combined_annotations, ch_collected_fna, call )
98-
ch_rrna_combined = QC.out.ch_rrna_combined
99-
ch_trna_combined = QC.out.ch_trna_combined
98+
ch_rrna_collected = QC.out.ch_rrna_collected
99+
ch_trna_collected = QC.out.ch_trna_collected
100100
ch_combined_annotations = QC.out.ch_final_annots
101101
}
102102

103103
emit:
104-
ch_rrna_combined
105-
ch_trna_combined
104+
ch_rrna_collected
105+
ch_trna_collected
106106
ch_combined_annotations
107+
ch_quast_stats
107108

108109
}

subworkflows/local/collect_rna.nf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,28 +72,28 @@ workflow COLLECT_RNA {
7272
// Create sheet for rrnas from the collected rRNAs or provided rRNAs
7373
// Run RRNA_COLLECT to generate a combined TSV for all fastas
7474
RRNA_COLLECT( ch_collected_rRNAs )
75-
ch_rrna_sheet = RRNA_COLLECT.out.rrna_collected_out
75+
ch_rrna_collected = RRNA_COLLECT.out.rrna_collected_out
7676
ch_rrna_combined = RRNA_COLLECT.out.rrna_combined_out
7777
} else {
78-
ch_rrna_sheet = default_sheet
78+
ch_rrna_collected = default_sheet
7979
ch_rrna_combined = default_sheet
8080
}
8181
if (run_trna_collect) {
8282
// Create sheet for trnas from the collected tRNAs or provided tRNAs
8383
// Run TRNA_COLLECT to generate a combined TSV for all fastas
8484
TRNA_COLLECT( ch_collected_tRNAs )
85-
ch_trna_sheet = TRNA_COLLECT.out.trna_collected_out
85+
ch_trna_collected = TRNA_COLLECT.out.trna_collected_out
8686
ch_trna_combined = TRNA_COLLECT.out.trna_combined_out
8787
} else {
88-
ch_trna_sheet = default_sheet
88+
ch_trna_collected = default_sheet
8989
ch_trna_combined = default_sheet
9090
}
9191

9292

9393
emit:
94-
ch_rrna_sheet
94+
ch_rrna_collected
9595
ch_rrna_combined
96-
ch_trna_sheet
96+
ch_trna_collected
9797
ch_trna_combined
9898

9999
}

subworkflows/local/qc.nf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ workflow QC {
2020
main:
2121

2222
COLLECT_RNA( ch_fasta, default_sheet, call )
23-
ch_rrna_combined = COLLECT_RNA.out.ch_rrna_combined
24-
ch_trna_combined = COLLECT_RNA.out.ch_trna_combined
23+
ch_rrna_collected = COLLECT_RNA.out.ch_rrna_collected
24+
ch_trna_collected = COLLECT_RNA.out.ch_trna_collected
2525

2626

2727
// Add Bin Quality to annotations
@@ -71,6 +71,6 @@ workflow QC {
7171

7272
emit:
7373
ch_final_annots
74-
ch_rrna_combined
75-
ch_trna_combined
74+
ch_rrna_collected
75+
ch_trna_collected
7676
}

workflows/dram.nf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,9 @@ workflow DRAM {
252252

253253
SUMMARIZE(
254254
ch_final_annots,
255-
ANNOTATE.out.ch_rrna_combined,
256-
ANNOTATE.out.ch_trna_combined,
255+
ANNOTATE.out.ch_rrna_collected,
256+
ANNOTATE.out.ch_trna_collected,
257+
ANNOTATE.out.ch_quast_stats,
257258
distill_topic,
258259
distill_ecosystem,
259260
distill_custom

0 commit comments

Comments
 (0)