@@ -183,6 +183,7 @@ def process_datasets(args):
183183 columns = {'improve_drug_id' : 'improve_chem_id' },
184184 inplace = True ,
185185 )
186+ response_data ['improve_sample_id' ] = "SAMPLE_ID_" + response_data ['improve_sample_id' ].astype (int ).astype (str )
186187 # exporting the drug response data to 'y_data/response.tsv'
187188 outfile_path = args .WORKDIR .joinpath ("data_out" , "y_data" , "response.tsv" )
188189 response_data .to_csv (
@@ -200,7 +201,6 @@ def process_datasets(args):
200201 #-------------------------------------------------------------------
201202
202203
203- # TODO: potentially change vars to be read from `args`
204204 split_data_sets (
205205 args = args ,
206206 data_sets = data_sets ,
@@ -433,8 +433,10 @@ def process_datasets(args):
433433 how = 'outer' ),
434434 dfs_to_merge .values ())
435435
436- # retrieving unique mutations (the above creates multiplicates)
436+ # retrieving unique mutations (the above creates duplicates) &
437+ # adding a prefix to the improve_sample_id
437438 unique_mutations = merged_mutations [['entrez_id' , 'improve_sample_id' , 'mutation' ]].drop_duplicates ()
439+ unique_mutations ['improve_sample_id' ] = 'SAMPLE_ID_' + unique_mutations ['improve_sample_id' ].astype (str )
438440
439441 # counting the mutations per entrez_id/improve_sample_id pair and
440442 # aggregating it into a pivot table (also filling NAs with 0s)
@@ -505,7 +507,7 @@ def split_data_sets(
505507 logger .info (f'creating splits for { data_set } ...' )
506508 # getting "<DATASET>_all.txt"
507509 drug_response_rows = (
508- data_sets ['mpnst' ]
510+ data_sets [data_set ]
509511 .experiments [
510512 ['improve_sample_id' , 'improve_drug_id' , "time" , "study" ]
511513 ]
@@ -515,6 +517,7 @@ def split_data_sets(
515517 columns = {'improve_drug_id' : 'improve_chem_id' },
516518 inplace = True ,
517519 )
520+ drug_response_rows ['improve_sample_id' ] = "SAMPLE_ID_" + drug_response_rows ['improve_sample_id' ].astype (int ).astype (str )
518521 row_nums = pd .merge (
519522 response_data ,
520523 drug_response_rows ,
@@ -559,6 +562,7 @@ def split_data_sets(
559562 columns = {'improve_drug_id' : 'improve_chem_id' },
560563 inplace = True ,
561564 )
565+ train_keys ['improve_sample_id' ] = "SAMPLE_ID_" + train_keys ['improve_sample_id' ].astype (int ).astype (str )
562566 row_nums = pd .merge (
563567 response_data ,
564568 train_keys ,
@@ -596,6 +600,7 @@ def split_data_sets(
596600 columns = {'improve_drug_id' : 'improve_chem_id' },
597601 inplace = True ,
598602 )
603+ test_keys ['improve_sample_id' ] = "SAMPLE_ID_" + test_keys ['improve_sample_id' ].astype (int ).astype (str )
599604 row_nums = pd .merge (
600605 response_data ,
601606 test_keys ,
@@ -626,6 +631,7 @@ def split_data_sets(
626631 columns = {'improve_drug_id' : 'improve_chem_id' },
627632 inplace = True ,
628633 )
634+ val_keys ['improve_sample_id' ] = "SAMPLE_ID_" + val_keys ['improve_sample_id' ].astype (int ).astype (str )
629635 row_nums = pd .merge (
630636 response_data ,
631637 val_keys ,
@@ -669,7 +675,10 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
669675 getattr (data_sets [data_set ], data_type , None ) is not None
670676 ):
671677 dfs_to_merge .append (
672- data_sets [data_set ].format (data_type = data_type ).transpose ()
678+ data_sets [data_set ]
679+ .format (data_type = data_type )
680+ .transpose ()
681+ .add_prefix ('SAMPLE_ID_' , axis = 1 )
673682 )
674683
675684 merged_data = None
@@ -697,7 +706,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
697706 )
698707
699708 # Casting col and row indices back to int
700- merged_data .columns .astype (int )
709+ # merged_data.columns.astype(int)
701710 if not merged_data .index .dtype == int :
702711 merged_data .index = merged_data .index .astype (int )
703712
0 commit comments