@@ -113,52 +113,12 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN):
113113 return rrna_frame
114114
115115
def _trna_gene_id(trna_type, codon, note):
    """Return the summary gene ID for one tRNA, marking pseudo-tRNAs."""
    template = '%s, pseudo (%s)' if note == 'pseudo' else '%s (%s)'
    return template % (trna_type, codon)


def summarize_trnas(trnas_df, groupby_column=FASTA_COLUMN):
    """Count tRNAs per group and lay them out as genome-summary rows.

    Args:
        trnas_df: DataFrame of tRNA hits. Each row must expose ``type``,
            ``codon`` and ``note`` values, and the frame must contain
            ``groupby_column``.
        groupby_column: Column of ``trnas_df`` whose values become the
            per-group count columns of the result.

    Returns:
        A DataFrame with the ``FRAME_COLUMNS`` columns (one row per distinct
        tRNA gene ID, sorted by ``COL_GENE_ID``) plus one column per group
        holding how many times that gene ID occurs in the group. Missing
        values are filled with 0.
    """
    # Key the rows on the formatted gene ID rather than on the raw
    # (type, codon, note) tuple. The ID only depends on whether the note is
    # 'pseudo', so tuples that differ in any other note value (including NaN
    # notes, which never compare equal to each other) would otherwise create
    # duplicate gene-ID rows that each receive the full count.
    rows_by_gene_id = {}
    for _, line in trnas_df.iterrows():
        gene_id = _trna_gene_id(line.type, line.codon, line.note)
        if gene_id in rows_by_gene_id:
            continue
        if line.note == 'pseudo':
            gene_description = '%s pseudo tRNA with %s Codon' % (line.type, line.codon)
        else:
            gene_description = '%s tRNA with %s Codon' % (line.type, line.codon)
        module_description = '%s tRNA' % line.type
        rows_by_gene_id[gene_id] = [gene_id, gene_description, module_description, 'tRNA', 'tRNA', '']

    trna_frame = pd.DataFrame(list(rows_by_gene_id.values()), columns=FRAME_COLUMNS)
    # Index by gene ID so the per-group counts below align by label.
    trna_frame = trna_frame.sort_values(COL_GENE_ID).set_index(COL_GENE_ID)

    for group, frame in trnas_df.groupby(groupby_column):
        counts = Counter(
            _trna_gene_id(line.type, line.codon, line.note)
            for _, line in frame.iterrows()
        )
        trna_frame[group] = pd.Series(counts)

    # Gene IDs absent from a group come out of the alignment as NaN.
    return trna_frame.reset_index().fillna(0)
147-
148-
def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN):
    """Build the genome summary table from the annotations.

    Args:
        annotations: Annotations passed through to
            ``fill_genome_summary_frame``.
        genome_summary_frame: Summary form; a copy of it is handed to
            ``fill_genome_summary_frame``.
        logger: Logger passed through to ``fill_genome_summary_frame``.
        groupby_column: Grouping column passed through to
            ``fill_genome_summary_frame``.

    Returns:
        The result of concatenating the filled summary frame with
        ``pd.concat(..., sort=False)``.
    """
    # get ko summaries
    ko_summary = fill_genome_summary_frame(
        annotations, genome_summary_frame.copy(), groupby_column, logger
    )
    # merge summary frames
    return pd.concat([ko_summary], sort=False)
@@ -187,7 +147,7 @@ def split_names_to_long(col:pd.Series):
187147 return pd .DataFrame (splits , columns = col_names , index = dex ).fillna ('' )
188148
189149
190- def write_summarized_genomes_to_xlsx (summarized_genomes , output_file ):
150+ def write_summarized_genomes_to_xlsx (summarized_genomes , output_file , extra_frames = tuple () ):
191151 # turn all this into an xlsx
192152 with pd .ExcelWriter (output_file ) as writer :
193153 for sheet , frame in summarized_genomes .groupby (COL_SHEET , sort = False ):
@@ -197,6 +157,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file):
197157 split_genes = pd .concat ([split_names_to_long (frame [i ].astype (str )) for i in gene_columns ], axis = 1 )
198158 frame = pd .concat ([frame [CONSTANT_DISTILLATE_COLUMNS ], split_genes ], axis = 1 )
199159 frame .to_excel (writer , sheet_name = sheet , index = False )
160+ for extra_frame in extra_frames :
161+ if extra_frame is not None and not extra_frame .empty :
162+ extra_frame .to_excel (writer , sheet_name = extra_frame [COL_SHEET ].iloc [0 ], index = False )
200163
201164
202165# TODO: add assembly stats like N50, longest contig, total assembled length etc
@@ -311,7 +274,6 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
311274 else :
312275 quast_frame = pd .read_csv (quast_path , sep = '\t ' )
313276
314-
315277 distil_sheets_names = []
316278 if "default" in distil_topics :
317279 distil_sheets_names = [
@@ -371,10 +333,9 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby
371333 summarized_genomes = fill_genome_summary_frame_gene_names (annotations , genome_summary_form , groupby_column , logger )
372334 else :
373335 logger .info (f'distillate_gene_names flag is { distillate_gene_names } . Giving counts instead of gene names in genome metabolism summary' )
374- summarized_genomes = make_genome_summary (annotations , genome_summary_form , logger , trna_frame , rrna_frame ,
375- groupby_column )
336+ summarized_genomes = make_genome_summary (annotations , genome_summary_form , logger , groupby_column )
376337 summarized_genomes .to_csv ('summarized_genomes.tsv' , sep = '\t ' , index = None )
377- write_summarized_genomes_to_xlsx (summarized_genomes , genome_summary )
338+ write_summarized_genomes_to_xlsx (summarized_genomes , genome_summary , extra_frames = [ rrna_frame , trna_frame ] )
378339 logger .info ('Generated genome metabolism summary' )
379340
380341
0 commit comments