@@ -174,10 +174,8 @@ def retrieve_drug_info(compound_name):
174174 return np .nan , np .nan , np .nan , np .nan , np .nan , np .nan
175175
176176 data = response .json ()
177- #print(data)
178177 if "PropertyTable" in data :
179178 properties = data ["PropertyTable" ]["Properties" ][0 ]
180- #print(properties)
181179 pubchem_id = properties .get ('CID' ,np .nan )
182180 canSMILES = properties .get ("CanonicalSMILES" , np .nan )
183181 isoSMILES = properties .get ("IsomericSMILES" , np .nan )
@@ -259,9 +257,6 @@ def merge_drug_info(d_df,drug_map):
259257 pd.DataFrame
260258 The merged dataframe containing combined drug information.
261259 """
262- #print(drug_map)
263- #print(d_df.columns)
264- #print(d_df)
265260 print (d_df ['isoSMILES' ].dtype , drug_map ['isoSMILES' ].dtype )
266261 d_df ['isoSMILES' ] = d_df ['isoSMILES' ].astype (str )
267262 drug_map ['isoSMILES' ] = drug_map ['isoSMILES' ].astype (str )
@@ -337,10 +332,9 @@ def add_improve_id(previous_df, new_df):
337332 """
338333 if not previous_df .empty and 'improve_drug_id' in previous_df .columns :
339334 id_list = [int (val .replace ('SMI_' , '' )) for val in previous_df ['improve_drug_id' ].tolist () if pd .notnull (val ) and val .startswith ('SMI_' )]
340- max_id = max (id_list ) if id_list else 0 # Default to 0 if the list is empty
335+ max_id = max (id_list ) if id_list else 0
341336 else :
342- max_id = 0 # Default value if the DataFrame is empty or doesn't have the column
343- # max_id = max([int(val.replace('SMI_', '')) for val in previous_df['improve_drug_id'].tolist() if pd.notnull(val) and val.startswith('SMI_')])
337+ max_id = 0
344338 # Identify isoSMILES in the new dataframe that don't exist in the old dataframe
345339 unique_new_smiles = set (new_df ['isoSMILES' ]) - set (previous_df ['isoSMILES' ])
346340 # Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
@@ -370,24 +364,9 @@ def map_exp_to_improve(exp_path):#df,improve_map_file):
370364 pd.DataFrame
371365 Dataframe read from the tab-separated file at `exp_path`, with constant 'source' and 'study' columns added.
372366 """
373- mapped_df = pd .read_csv (exp_path ,sep = '\t ' ) # Map sample_id to improve_sample_id
374- #mapped_df = pd.merge(df, improve[['other_id', 'improve_sample_id']], left_on='sample_id', right_on='other_id', how='left')
375- #mapped_df.drop(columns=['sample_id', 'other_id'], inplace=True)
376- #mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
367+ mapped_df = pd .read_csv (exp_path ,sep = '\t ' )
377368 mapped_df ['source' ] = 'synapse'
378369 mapped_df ['study' ] = 'BeatAML'
379- #mapped_df= mapped_df.rename(columns={'Drug':'improve_sample_id',
380- # 'IC50':'ic50',
381- # 'EC50':'ec50',
382- # 'EC50se':'ec50se',
383- # 'Einf':'einf',
384- # 'HS':'hs',
385- # 'AAC1':'aac1',
386- # 'AUC1':'auc1',
387- # 'DSS1':'dss1',
388- # 'R2fit':'r2fit'
389- # }
390- # )
391370 return mapped_df
392371
393372
@@ -445,12 +424,21 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
445424 mapped_df .rename (columns = {"hgvsc" : "mutation" }, inplace = True )
446425 mapped_df .rename (columns = {"labId" : "sample_id" }, inplace = True )
447426 mapped_df .rename (columns = {"Entrez_Gene_Id" : "entrez_id" }, inplace = True )
448-
449- elif data_type == "mutation" :
450- df = df [['dbgap_sample_id' ,'hgvsc' , 'hgvsp' , 'gene' , 'variant_classification' ,'t_vaf' , 'refseq' , 'symbol' ]]
451- mapped_df = df .merge (genes , left_on = 'symbol' , right_on = 'gene_symbol' , how = 'left' ).reindex (
452- columns = ['hgvsc' , 'entrez_id' , "dbgap_sample_id" ,"variant_classification" ])
453427
428+ variant_mapping = {
429+ 'frameshift_variant' : 'Frameshift_Variant' ,
430+ 'missense_variant' : 'Missense_Mutation' ,
431+ 'stop_gained' : 'Nonsense_Mutation' ,
432+ 'inframe_deletion' : 'In_Frame_Del' ,
433+ 'protein_altering_variant' : 'Protein_Altering_Variant' ,
434+ 'splice_acceptor_variant' : 'Splice_Site' ,
435+ 'splice_donor_variant' : 'Splice_Site' ,
436+ 'start_lost' : 'Start_Codon_Del' ,
437+ 'inframe_insertion' : 'In_Frame_Ins' ,
438+ 'stop_lost' : 'Nonstop_Mutation'
439+ }
440+
441+ mapped_df ['variant_classification' ] = mapped_df ['variant_classification' ].map (variant_mapping )
454442
455443 elif data_type == "proteomics" :
456444 mapped_ids ['sampleID' ] = mapped_ids ['sampleID' ].str .split ('_' ).apply (lambda x : x [2 ])
@@ -473,7 +461,6 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
473461 inplace = True
474462 )
475463
476-
477464 mapped_df = pd .merge (mapped_df , improve [['other_id' , 'improve_sample_id' ]],
478465 left_on = 'sample_id' ,
479466 right_on = 'other_id' ,
@@ -482,7 +469,7 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
482469 mapped_df ['source' ] = 'synapse'
483470 mapped_df ['study' ] = 'BeatAML'
484471
485- final_dataframe = mapped_df .dropna ()#pd.dropna(mapped_df,0)
472+ final_dataframe = mapped_df .dropna ()
486473 return final_dataframe
487474
488475
@@ -659,8 +646,6 @@ def generate_drug_list(drug_map_path,drug_path):
659646
660647
661648 t_df = pd .read_csv ('tpm_' + transcriptomics_file , sep = '\t ' )
662- # t_df.index = t_df.stable_id#display_label
663- # t_df = t_df.iloc[:, 4:]
664649 t_df = t_df .reset_index ().rename (columns = {'stable_id' : 'Gene' })
665650 t_df = pd .melt (t_df , id_vars = ['Gene' ], var_name = 'sample_id' , value_name = 'transcriptomics' )
666651 print (improve_map_file )
@@ -724,7 +709,5 @@ def generate_drug_list(drug_map_path,drug_path):
724709 exp_res = map_exp_to_improve (drug_path )
725710 exp_res .to_csv ("/tmp/beataml_experiments.tsv" , index = False , sep = '\t ' )
726711
727- #drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")
728-
729712# print("Finished Pipeline")
730713
0 commit comments