33from copy import deepcopy
44import functools as ft
55import logging
6+ import numpy as np
67from os import PathLike
78from pathlib import Path
89from pathlib import PurePath
@@ -83,6 +84,13 @@ def main():
8384 "integers. Must be same length as <NUM_SPLITS>. If omitted will "
8485 "default to randomized seeds."
8586 )
87+ p_process_datasets .add_argument (
88+ '-e' , '--exclude_improve_drug_id' , dest = 'EXCL_DRUGS_LIST' ,
89+ type = _improve_drug_id_list ,
90+ default = None ,
91+ help = 'define a list of improve_drug_id/improve_chem_id[s] that '
92+ 'should be excluded from the reference datasets.'
93+ )
8694
8795 p_all = command_parsers .add_parser (
8896 "all" ,
@@ -183,7 +191,7 @@ def process_datasets(args):
183191 columns = {'improve_drug_id' : 'improve_chem_id' },
184192 inplace = True ,
185193 )
186- response_data ['improve_sample_id' ] = "SAMPLE_ID_ " + response_data ['improve_sample_id' ].astype (int ).astype (str )
194+ response_data ['improve_sample_id' ] = "SAMPLE-ID- " + response_data ['improve_sample_id' ].astype (int ).astype (str )
187195 # exporting the drug response data to 'y_data/response.tsv'
188196 outfile_path = args .WORKDIR .joinpath ("data_out" , "y_data" , "response.tsv" )
189197 response_data .to_csv (
@@ -201,12 +209,12 @@ def process_datasets(args):
201209 #-------------------------------------------------------------------
202210
203211
204- split_data_sets (
205- args = args ,
206- data_sets = data_sets ,
207- data_sets_info = data_sets_info ,
208- response_data = response_data
209- )
212+ # split_data_sets(
213+ # args=args,
214+ # data_sets=data_sets,
215+ # data_sets_info=data_sets_info,
216+ # response_data=response_data
217+ # )
210218
211219 #-------------------------------------------------------------------
212220 # getting common / reference gene symbols
@@ -276,6 +284,9 @@ def process_datasets(args):
276284 )
277285
278286 merged_transcriptomics = merged_transcriptomics [merged_transcriptomics ['entrez_id' ] != 0 ]
287+ merged_transcriptomics = merged_transcriptomics .fillna (0 ).T .reset_index ()
288+ for i in range (0 ,3 ):
289+ merged_transcriptomics .iloc [i ,0 ] = np .nan
279290
280291 # writing the expression datatable to '/x_data/*_expression.tsv'
281292 outfile_path = args .WORKDIR .joinpath (
@@ -287,12 +298,11 @@ def process_datasets(args):
287298 # This back fills NAs with 0s - the assumed "neutral" value for
288299 # gene expression data
289300 (merged_transcriptomics
290- .fillna (0 )
291- .transpose ()
292301 .to_csv (
293302 path_or_buf = outfile_path ,
294303 sep = '\t ' ,
295- header = False
304+ header = False ,
305+ index = False
296306 )
297307 )
298308
@@ -332,6 +342,9 @@ def process_datasets(args):
332342 'gene_symbol' ,
333343 merged_copy_number .pop ('gene_symbol' )
334344 )
345+ merged_copy_number = merged_copy_number .T .reset_index ()
346+ for i in range (0 ,3 ):
347+ merged_copy_number .iloc [i ,0 ] = np .nan
335348
336349 # writing the expression datatable to '/x_data/*_copy_number.tsv'
337350 outfile_path = args .WORKDIR .joinpath (
@@ -340,11 +353,11 @@ def process_datasets(args):
340353 "cancer_copy_number.tsv"
341354 )
342355 (merged_copy_number
343- .transpose ()
344356 .to_csv (
345357 path_or_buf = outfile_path ,
346358 sep = '\t ' ,
347- header = False
359+ header = False ,
360+ index = False
348361 )
349362 )
350363
@@ -369,6 +382,9 @@ def process_datasets(args):
369382 'gene_symbol' ,
370383 discretized_copy_number .pop ('gene_symbol' )
371384 )
385+ discretized_copy_number = discretized_copy_number .T .reset_index ()
386+ for i in range (0 ,3 ):
387+ discretized_copy_number .iloc [i ,0 ] = np .nan
372388
373389 # writing the expression datatable to '/x_data/*_copy_number.tsv'
374390 outfile_path = args .WORKDIR .joinpath (
@@ -377,11 +393,11 @@ def process_datasets(args):
377393 "cancer_discretized_copy_number.tsv"
378394 )
379395 (discretized_copy_number
380- .transpose ()
381396 .to_csv (
382397 path_or_buf = outfile_path ,
383398 sep = '\t ' ,
384- header = False
399+ header = False ,
400+ index = False
385401 )
386402 )
387403
@@ -398,6 +414,13 @@ def process_datasets(args):
398414
399415 concat_drugs = pd .concat (dfs_to_merge .values ())
400416 out_df = concat_drugs [['improve_drug_id' ,'canSMILES' ]].drop_duplicates ()
417+
418+ if args .EXCL_DRUGS_LIST is not None :
419+ logger .info (
420+ f"Removing all chemical compunds with ids: '{ args .EXCL_DRUGS_LIST } '"
421+ )
422+ out_df = out_df [~ out_df ['improve_drug_id' ].isin (args .EXCL_DRUGS_LIST )]
423+
401424 out_df .rename (
402425 columns = {'improve_drug_id' : 'improve_chem_id' },
403426 inplace = True ,
@@ -437,7 +460,7 @@ def process_datasets(args):
437460 # retrieving unique mutations (the above creates multiplicates) &
438461 # adding a prefix to the improve_sample_id
439462 unique_mutations = merged_mutations [['entrez_id' , 'improve_sample_id' , 'mutation' ]].drop_duplicates ()
440- unique_mutations ['improve_sample_id' ] = 'SAMPLE_ID_ ' + unique_mutations ['improve_sample_id' ].astype (str )
463+ unique_mutations ['improve_sample_id' ] = 'SAMPLE-ID- ' + unique_mutations ['improve_sample_id' ].astype (str )
441464
442465 # counting the mutations per entrez_id/improve_sample_id pair and
443466 # aggregating it into a pivot table (also filling NAs with 0s)
@@ -474,17 +497,21 @@ def process_datasets(args):
474497 # removing some rows where we don't have a 'gene_symbol' for the
475498 # entrez id
476499 mutation_counts = mutation_counts [mutation_counts ['gene_symbol' ].notna ()]
500+ mutation_counts = mutation_counts .T .reset_index ()
501+ for i in range (0 ,3 ):
502+ mutation_counts .iloc [i ,0 ] = np .nan
477503
478504 # writing the dataframe to the mutation counts mastertable
479505 outfile_path = args .WORKDIR .joinpath (
480506 "data_out" ,
481507 "x_data" ,
482508 "cancer_mutation_count.tsv"
483509 )
484- mutation_counts .T . to_csv (
510+ mutation_counts .to_csv (
485511 path_or_buf = outfile_path ,
486512 sep = '\t ' ,
487- header = False
513+ header = False ,
514+ index = False
488515 )
489516
490517def split_data_sets (
@@ -518,7 +545,7 @@ def split_data_sets(
518545 columns = {'improve_drug_id' : 'improve_chem_id' },
519546 inplace = True ,
520547 )
521- drug_response_rows ['improve_sample_id' ] = "SAMPLE_ID_ " + drug_response_rows ['improve_sample_id' ].astype (int ).astype (str )
548+ drug_response_rows ['improve_sample_id' ] = "SAMPLE-ID- " + drug_response_rows ['improve_sample_id' ].astype (int ).astype (str )
522549 row_nums = pd .merge (
523550 response_data ,
524551 drug_response_rows ,
@@ -563,7 +590,7 @@ def split_data_sets(
563590 columns = {'improve_drug_id' : 'improve_chem_id' },
564591 inplace = True ,
565592 )
566- train_keys ['improve_sample_id' ] = "SAMPLE_ID_ " + train_keys ['improve_sample_id' ].astype (int ).astype (str )
593+ train_keys ['improve_sample_id' ] = "SAMPLE-ID- " + train_keys ['improve_sample_id' ].astype (int ).astype (str )
567594 row_nums = pd .merge (
568595 response_data ,
569596 train_keys ,
@@ -601,7 +628,7 @@ def split_data_sets(
601628 columns = {'improve_drug_id' : 'improve_chem_id' },
602629 inplace = True ,
603630 )
604- test_keys ['improve_sample_id' ] = "SAMPLE_ID_ " + test_keys ['improve_sample_id' ].astype (int ).astype (str )
631+ test_keys ['improve_sample_id' ] = "SAMPLE-ID- " + test_keys ['improve_sample_id' ].astype (int ).astype (str )
605632 row_nums = pd .merge (
606633 response_data ,
607634 test_keys ,
@@ -632,7 +659,7 @@ def split_data_sets(
632659 columns = {'improve_drug_id' : 'improve_chem_id' },
633660 inplace = True ,
634661 )
635- val_keys ['improve_sample_id' ] = "SAMPLE_ID_ " + val_keys ['improve_sample_id' ].astype (int ).astype (str )
662+ val_keys ['improve_sample_id' ] = "SAMPLE-ID- " + val_keys ['improve_sample_id' ].astype (int ).astype (str )
636663 row_nums = pd .merge (
637664 response_data ,
638665 val_keys ,
@@ -679,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
679706 data_sets [data_set ]
680707 .format (data_type = data_type )
681708 .transpose ()
682- .add_prefix ('SAMPLE_ID_ ' , axis = 1 )
709+ .add_prefix ('SAMPLE-ID- ' , axis = 1 )
683710 )
684711
685712 merged_data = None
@@ -805,6 +832,15 @@ def _random_seed_list(list: str) -> list:
805832 list_ = list .split (',' )
806833 return [int (item ) for item in list_ ]
807834
835+ def _improve_drug_id_list (list : str ) -> list :
836+ if not isinstance (list , str ):
837+ raise TypeError (
838+ f"'exclude_improve_drug_id' must be of type str. Supplied argument "
839+ f"is of type { type (list )} ."
840+ )
841+ list_ = list .split (',' )
842+ return list_
843+
808844
809845if __name__ == '__main__' :
810846 try : main ()
0 commit comments