Skip to content

Commit 79dd74d

Browse files
Merge pull request #469 from WrightonLabCSU/bugfix/distill-bugs
bugfix: Fix distill bugs with genome stats, and metabolism summary
2 parents 3c20b59 + 419322b commit 79dd74d

7 files changed

Lines changed: 34 additions & 70 deletions

File tree

bin/distill.py

Lines changed: 8 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -113,52 +113,12 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN):
113113
return rrna_frame
114114

115115

116-
def summarize_trnas(trnas_df, groupby_column=FASTA_COLUMN):
117-
# first build the frame
118-
combos = {(line.type, line.codon, line.note) for _, line in trnas_df.iterrows()}
119-
frame_rows = list()
120-
for combo in combos:
121-
if combo[2] == 'pseudo':
122-
gene_id = '%s, pseudo (%s)'
123-
gene_description = '%s pseudo tRNA with %s Codon'
124-
else:
125-
gene_id = '%s (%s)'
126-
gene_description = '%s tRNA with %s Codon'
127-
gene_id = gene_id % (combo[0], combo[1])
128-
gene_description = gene_description % (combo[0], combo[1])
129-
module_description = '%s tRNA' % combo[0]
130-
frame_rows.append([gene_id, gene_description, module_description, 'tRNA', 'tRNA', ''])
131-
trna_frame = pd.DataFrame(frame_rows, columns=FRAME_COLUMNS)
132-
trna_frame = trna_frame.sort_values(COL_GENE_ID)
133-
# then fill it in
134-
trna_frame = trna_frame.set_index(COL_GENE_ID)
135-
for group, frame in trnas_df.groupby(groupby_column):
136-
gene_ids = list()
137-
for index, line in frame.iterrows():
138-
if line.note == 'pseudo':
139-
gene_id = '%s, pseudo (%s)'
140-
else:
141-
gene_id = '%s (%s)'
142-
gene_ids.append(gene_id % (line.type, line.codon))
143-
trna_frame[group] = pd.Series(Counter(gene_ids))
144-
trna_frame = trna_frame.reset_index()
145-
trna_frame = trna_frame.fillna(0)
146-
return trna_frame
147-
148-
149-
def make_genome_summary(annotations, genome_summary_frame, logger, trna_frame=None, rrna_frame=None, groupby_column=FASTA_COLUMN):
116+
def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN):
117+
150118
summary_frames = list()
151119
# get ko summaries
152120
summary_frames.append(fill_genome_summary_frame(annotations, genome_summary_frame.copy(), groupby_column, logger))
153121

154-
# add rRNAs
155-
if rrna_frame is not None:
156-
summary_frames.append(summarize_rrnas(rrna_frame, groupby_column))
157-
158-
# add tRNAs
159-
if trna_frame is not None:
160-
summary_frames.append(summarize_trnas(trna_frame, groupby_column))
161-
162122
# merge summary frames
163123
summarized_genomes = pd.concat(summary_frames, sort=False)
164124
return summarized_genomes
@@ -187,7 +147,7 @@ def split_names_to_long(col:pd.Series):
187147
return pd.DataFrame(splits, columns=col_names, index=dex).fillna('')
188148

189149

190-
def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
150+
def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_frames=tuple()):
191151
# turn all this into an xlsx
192152
with pd.ExcelWriter(output_file) as writer:
193153
for sheet, frame in summarized_genomes.groupby(COL_SHEET, sort=False):
@@ -197,6 +157,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
197157
split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1)
198158
frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS], split_genes], axis=1)
199159
frame.to_excel(writer, sheet_name=sheet, index=False)
160+
for extra_frame in extra_frames:
161+
if extra_frame is not None and not extra_frame.empty:
162+
extra_frame.to_excel(writer, sheet_name=extra_frame[COL_SHEET].iloc[0], index=False)
200163

201164

202165
# TODO: add assembly stats like N50, longest contig, total assembled length etc
@@ -311,7 +274,6 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
311274
else:
312275
quast_frame = pd.read_csv(quast_path, sep='\t')
313276

314-
315277
distil_sheets_names = []
316278
if "default" in distil_topics:
317279
distil_sheets_names = [
@@ -371,10 +333,9 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
371333
summarized_genomes = fill_genome_summary_frame_gene_names(annotations, genome_summary_form, groupby_column, logger)
372334
else:
373335
logger.info(f'distillate_gene_names flag is {distillate_gene_names}. Giving counts instead of gene names in genome metabolism summary')
374-
summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, trna_frame, rrna_frame,
375-
groupby_column)
336+
summarized_genomes = make_genome_summary(annotations, genome_summary_form, logger, groupby_column)
376337
summarized_genomes.to_csv('summarized_genomes.tsv', sep='\t', index=None)
377-
write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
338+
write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary, extra_frames=[rrna_frame, trna_frame])
378339
logger.info('Generated genome metabolism summary')
379340

380341

modules/local/collect_rna/rrna_collect.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ process RRNA_COLLECT {
5353
else:
5454
collected_data = []
5555
for gene_id, input_fastas_counts in gene_type_counts.items():
56-
row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': '', 'subcategory': ''}
56+
row = {'gene_id': gene_id, 'gene_description': f"{gene_id} gene", 'category': 'rRNA', 'topic_ecosystem': 'rRNA', 'subcategory': ''}
5757
for input_fasta in input_fastas: row[input_fasta] = input_fastas_counts.get(input_fasta, 0)
5858
collected_data.append(row)
5959
collected_df = pd.DataFrame(collected_data)

modules/local/distill/distill.nf

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@ process SUMMARIZE {
88

99
input:
1010
path( ch_combined_annotations, stageAs: "raw-annotations.tsv" )
11-
path( ch_rrna_combined, stageAs: "rrna_combined.tsv" )
12-
path( ch_trna_combined, stageAs: "trna_combined.tsv" )
11+
path( ch_rrna_collected, stageAs: "rrna_combined.tsv" )
12+
path( ch_trna_collected, stageAs: "trna_combined.tsv" )
13+
path( ch_quast_stats )
1314
val( distill_topic )
1415
val( distill_ecosystem )
15-
path( distill_custom )
16+
val( distill_custom )
1617

1718
output:
1819
path( "metabolism_summary.xlsx" ), emit: distillate
@@ -25,7 +26,7 @@ process SUMMARIZE {
2526
# export constants for script
2627
export FASTA_COLUMN="${params.CONSTANTS.FASTA_COLUMN}"
2728
28-
distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_combined}' --trna_path '${ch_trna_combined}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}"
29+
distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_collected}' --trna_path '${ch_trna_collected}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}" --quast_path '${ch_quast_stats}'
2930
3031
"""
3132
}

subworkflows/local/annotate.nf

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ workflow ANNOTATE {
2929

3030
main:
3131
n_fastas = 0
32-
ch_rrna_combined = default_sheet
33-
ch_trna_combined = default_sheet
32+
ch_rrna_collected = default_sheet
33+
ch_trna_collected = default_sheet
3434
ch_combined_annotations = default_sheet
3535

3636
if (params.rename || call) {
@@ -95,14 +95,15 @@ workflow ANNOTATE {
9595

9696
if (params.qc){
9797
QC( ch_fasta, default_sheet, ch_combined_annotations, ch_collected_fna, call )
98-
ch_rrna_combined = QC.out.ch_rrna_combined
99-
ch_trna_combined = QC.out.ch_trna_combined
98+
ch_rrna_collected = QC.out.ch_rrna_collected
99+
ch_trna_collected = QC.out.ch_trna_collected
100100
ch_combined_annotations = QC.out.ch_final_annots
101101
}
102102

103103
emit:
104-
ch_rrna_combined
105-
ch_trna_combined
104+
ch_rrna_collected
105+
ch_trna_collected
106106
ch_combined_annotations
107+
ch_quast_stats
107108

108109
}

subworkflows/local/collect_rna.nf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,28 +72,28 @@ workflow COLLECT_RNA {
7272
// Create sheet for rrnas from the collected rRNAs or provided rRNAs
7373
// Run RRNA_COLLECT to generate a combined TSV for all fastas
7474
RRNA_COLLECT( ch_collected_rRNAs )
75-
ch_rrna_sheet = RRNA_COLLECT.out.rrna_collected_out
75+
ch_rrna_collected = RRNA_COLLECT.out.rrna_collected_out
7676
ch_rrna_combined = RRNA_COLLECT.out.rrna_combined_out
7777
} else {
78-
ch_rrna_sheet = default_sheet
78+
ch_rrna_collected = default_sheet
7979
ch_rrna_combined = default_sheet
8080
}
8181
if (run_trna_collect) {
8282
// Create sheet for trnas from the collected tRNAs or provided tRNAs
8383
// Run TRNA_COLLECT to generate a combined TSV for all fastas
8484
TRNA_COLLECT( ch_collected_tRNAs )
85-
ch_trna_sheet = TRNA_COLLECT.out.trna_collected_out
85+
ch_trna_collected = TRNA_COLLECT.out.trna_collected_out
8686
ch_trna_combined = TRNA_COLLECT.out.trna_combined_out
8787
} else {
88-
ch_trna_sheet = default_sheet
88+
ch_trna_collected = default_sheet
8989
ch_trna_combined = default_sheet
9090
}
9191

9292

9393
emit:
94-
ch_rrna_sheet
94+
ch_rrna_collected
9595
ch_rrna_combined
96-
ch_trna_sheet
96+
ch_trna_collected
9797
ch_trna_combined
9898

9999
}

subworkflows/local/qc.nf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ workflow QC {
2020
main:
2121

2222
COLLECT_RNA( ch_fasta, default_sheet, call )
23-
ch_rrna_combined = COLLECT_RNA.out.ch_rrna_combined
24-
ch_trna_combined = COLLECT_RNA.out.ch_trna_combined
23+
ch_rrna_collected = COLLECT_RNA.out.ch_rrna_collected
24+
ch_trna_collected = COLLECT_RNA.out.ch_trna_collected
2525

2626

2727
// Add Bin Quality to annotations
@@ -71,6 +71,6 @@ workflow QC {
7171

7272
emit:
7373
ch_final_annots
74-
ch_rrna_combined
75-
ch_trna_combined
74+
ch_rrna_collected
75+
ch_trna_collected
7676
}

workflows/dram.nf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,9 @@ workflow DRAM {
252252

253253
SUMMARIZE(
254254
ch_final_annots,
255-
ANNOTATE.out.ch_rrna_combined,
256-
ANNOTATE.out.ch_trna_combined,
255+
ANNOTATE.out.ch_rrna_collected,
256+
ANNOTATE.out.ch_trna_collected,
257+
ANNOTATE.out.ch_quast_stats,
257258
distill_topic,
258259
distill_ecosystem,
259260
distill_custom

0 commit comments

Comments
 (0)