99import argparse
1010import time
1111
12- def download_from_github (raw_url , save_path ):
13- """
14- Download a file from a raw GitHub URL and save to the specified path.
15-
16- Parameters
17- ----------
18- raw_url : str
19- The raw GitHub URL pointing to the file to be downloaded.
20- save_path : str
21- The local path where the downloaded file will be saved.
22-
23- Returns
24- -------
25- None
26- """
27- response = requests .get (raw_url )
28- with open (save_path , 'wb' ) as f :
29- f .write (response .content )
30- return
12+ # def download_from_github(raw_url, save_path):
13+ # """
14+ # Download a file from a raw GitHub URL and save to the specified path.
15+
16+ # Parameters
17+ # ----------
18+ # raw_url : str
19+ # The raw GitHub URL pointing to the file to be downloaded.
20+ # save_path : str
21+ # The local path where the downloaded file will be saved.
22+
23+ # Returns
24+ # -------
25+ # None
26+ # """
27+ # response = requests.get(raw_url)
28+ # with open(save_path, 'wb') as f:
29+ # f.write(response.content)
30+ # return
3131
3232def retrieve_figshare_data (url ):
3333 """
@@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name):
178178 properties = data ["PropertyTable" ]["Properties" ][0 ]
179179 pubchem_id = properties .get ('CID' ,np .nan )
180180 canSMILES = properties .get ("CanonicalSMILES" , np .nan )
181- isoSMILES = properties .get ("IsomericSMILES" , np .nan )
181+ # isoSMILES = properties.get("IsomericSMILES", np.nan)
182182 InChIKey = properties .get ("InChIKey" , np .nan )
183183 formula = properties .get ("MolecularFormula" , np .nan )
184184 weight = properties .get ("MolecularWeight" , np .nan )
185185
186- return pubchem_id , canSMILES , isoSMILES , InChIKey , formula , weight
186+ return pubchem_id , canSMILES , InChIKey , formula , weight
187187 else :
188- return np .nan , np .nan , np .nan , np .nan , np .nan , np . nan
188+ return np .nan , np .nan , np .nan , np .nan , np .nan
189189
190190
191191def update_dataframe_with_pubchem (d_df ):
@@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df):
230230 if row ['chem_name' ] in data_dict and not all (pd .isna (val ) for val in data_dict [row ['chem_name' ]]):
231231 values = data_dict [row ['chem_name' ]]
232232 else :
233- values = data_dict .get (row ['other_name' ], (np .nan , np .nan , np .nan , np .nan , np .nan , np . nan ))
233+ values = data_dict .get (row ['other_name' ], (np .nan , np .nan , np .nan , np .nan , np .nan ))
234234
235235 d_df .at [idx , 'pubchem_id' ] = values [0 ]
236236 d_df .at [idx , "canSMILES" ] = values [1 ]
237- d_df .at [idx , "isoSMILES" ] = values [2 ]
238- d_df .at [idx , "InChIKey" ] = values [3 ]
239- d_df .at [idx , "formula" ] = values [4 ]
240- d_df .at [idx , "weight" ] = values [5 ]
237+ # d_df.at[idx, "isoSMILES"] = values[2]
238+ d_df .at [idx , "InChIKey" ] = values [2 ]
239+ d_df .at [idx , "formula" ] = values [3 ]
240+ d_df .at [idx , "weight" ] = values [4 ]
241241
242242 return d_df
243243
@@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map):
250250 d_df : pd.DataFrame
251251 Main drug dataframe containing drug-related columns.
252252 drug_map : pd.DataFrame
253- Mapping dataframe containing drug information and the column 'isoSMILES '.
253+ Mapping dataframe containing drug information and the column 'canSMILES'.
254254
255255 Returns
256256 -------
257257 pd.DataFrame
258258 The merged dataframe containing combined drug information.
259259 """
260- print (d_df ['isoSMILES' ].dtype , drug_map ['isoSMILES' ].dtype )
261- d_df ['isoSMILES ' ] = d_df ['isoSMILES ' ].astype (str )
262- drug_map ['isoSMILES ' ] = drug_map ['isoSMILES ' ].astype (str )
263- result_df = d_df .merge (drug_map [['isoSMILES ' , 'improve_drug_id' ]], on = 'isoSMILES ' , how = 'left' )
260+ # print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
261+ d_df ['canSMILES ' ] = d_df ['canSMILES ' ].astype (str )
262+ drug_map ['canSMILES ' ] = drug_map ['canSMILES ' ].astype (str )
263+ result_df = d_df .merge (drug_map [['canSMILES ' , 'improve_drug_id' ]], on = 'canSMILES ' , how = 'left' )
264264 return result_df
265265
266266def format_drug_map (drug_map_path ):
267267 """
268268 Format and clean up the drug mapping file.
269269
270- Reads a drug map file, removes duplicates based on the 'isoSMILES ' column,
270+ Reads a drug map file, removes duplicates based on the 'canSMILES' column,
271271 and returns the cleaned dataframe.
272272
273273 Parameters
@@ -282,11 +282,11 @@ def format_drug_map(drug_map_path):
282282 """
283283 if drug_map_path :
284284 drug_map = pd .read_csv (drug_map_path , sep = "\t " )
285- drug_map = drug_map .drop_duplicates (subset = 'isoSMILES ' , keep = 'first' )
285+ drug_map = drug_map .drop_duplicates (subset = 'canSMILES ' , keep = 'first' )
286286 else :
287287 drug_map = pd .DataFrame (columns = [
288- 'improve_drug_id' , 'chem_name' , 'pubchem_id' , 'canSMILES' ,
289- 'isoSMILES ' , 'InChIKey' , 'formula' , 'weight'
288+ 'improve_drug_id' , 'chem_name' , 'pubchem_id' ,
289+ 'canSMILES ' , 'InChIKey' , 'formula' , 'weight'
290290 ])
291291 return drug_map
292292
@@ -316,7 +316,7 @@ def format_drug_df(drug_path):
316316
317317def add_improve_id (previous_df , new_df ):
318318 """
319- Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES ' not present in the previous dataframe.
319+ Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' values not present in the previous dataframe.
320320
321321 Parameters
322322 ----------
@@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df):
335335 max_id = max (id_list ) if id_list else 0
336336 else :
337337 max_id = 0
338- # Identify isoSMILES in the new dataframe that don't exist in the old dataframe
339- unique_new_smiles = set (new_df ['isoSMILES ' ]) - set (previous_df ['isoSMILES ' ])
340- # Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
341- mask = (new_df ['isoSMILES ' ].isin (unique_new_smiles )) & (new_df ['improve_drug_id' ].isna ())
338+ # Identify canSMILES in the new dataframe that don't exist in the old dataframe
339+ unique_new_smiles = set (new_df ['canSMILES ' ]) - set (previous_df ['canSMILES ' ])
340+ # Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN
341+ mask = (new_df ['canSMILES ' ].isin (unique_new_smiles )) & (new_df ['improve_drug_id' ].isna ())
342342 id_map = {}
343343 for smiles in unique_new_smiles :
344344 max_id += 1
345345 id_map [smiles ] = f"SMI_{ max_id } "
346- # Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id
347- new_df .loc [mask , 'improve_drug_id' ] = new_df ['isoSMILES ' ].map (id_map )
346+ # Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id
347+ new_df .loc [mask , 'improve_drug_id' ] = new_df ['canSMILES ' ].map (id_map )
348348 return new_df
349349
350350
@@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
466466 right_on = 'other_id' ,
467467 how = 'left' )
468468 mapped_df .insert (0 , 'improve_sample_id' , mapped_df .pop ('improve_sample_id' ))
469+
470+ print (mapped_df .to_string ())
471+ mapped_df ['improve_sample_id' ] = mapped_df ['improve_sample_id' ].astype (int )
472+ mapped_df ['entrez_id' ] = mapped_df ['entrez_id' ].fillna (0 )
473+ mapped_df ['entrez_id' ] = mapped_df ['entrez_id' ].astype (int )
469474 mapped_df ['source' ] = 'synapse'
470475 mapped_df ['study' ] = 'BeatAML'
476+ mapped_df = mapped_df .drop_duplicates ()
471477
472478 final_dataframe = mapped_df .dropna ()
473479 return final_dataframe
@@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path):
541547 d_res = add_improve_id (drug_map , d_res )
542548 #Drug Data
543549 #print(d_res)
544- drug_res = d_res [["improve_drug_id" ,"chem_name" ,"pubchem_id" ,"formula" ,"weight" ,"InChIKey" ,"canSMILES" , "isoSMILES" ]]
550+ drug_res = d_res [["improve_drug_id" ,"chem_name" ,"pubchem_id" ,"formula" ,"weight" ,"InChIKey" ,"canSMILES" ]]
545551 drug_res = drug_res .drop_duplicates ()
546552 drug_res .to_csv ("/tmp/beataml_drugs.tsv" ,sep = "\t " , index = False )
547553
@@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path):
587593# 'syn32533104',
588594# 'syn32529921',
589595 'syn26642974' ,
590- 'syn26427390'
596+ 'syn26427390' ,
597+ 'syn64126458' ,
598+ 'syn64126462' ,
599+ 'syn64126463' ,
600+ 'syn64126464' ,
601+ 'syn64126468'
591602 ]
592603 print ("Downloading Files from Synapse" )
593604 for entity_id in entity_ids :
@@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path):
597608 #gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47"
598609 #entrez_map_file = retrieve_figshare_data(gene_url)
599610
600- additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
611+ # additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
601612 sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
602- download_from_github (additional_mapping_url , sample_mapping_file )
613+ # download_from_github(additional_mapping_url, sample_mapping_file)
603614
604- supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
615+ # supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
605616 supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
606- download_from_github (supplementary_url , supplimentary_file )
617+ # download_from_github(supplementary_url, supplimentary_file)
607618
608619
609620 if args .samples :
@@ -619,26 +630,26 @@ def generate_drug_list(drug_map_path,drug_path):
619630 else :
620631 print ("Drug File Provided. Proceeding with build." )
621632 original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
622- original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
623- download_from_github (original_drug_url , original_drug_file )
624- generate_drug_list (args .drugFile , original_drug_file ) ##this doesn't exist, need to add
633+ # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
634+ # download_from_github(original_drug_url, original_drug_file)
635+ generate_drug_list (args .drugFile , original_drug_file )
625636 if args .omics :
626637 if args .genes is None or args .curSamples is None :
627638 print ('Cannot process omics without sample mapping and gene mapping files' )
628639 exit ()
629640 else :
630641 improve_map_file = args .curSamples
631642 transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values
632- transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
633- download_from_github (transcriptomics_url , transcriptomics_file )
643+ # transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
644+ # download_from_github(transcriptomics_url, transcriptomics_file)
634645
635646 mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
636- mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
637- download_from_github (mutations_url , mutations_file )
647+ # mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
648+ # download_from_github(mutations_url, mutations_file)
638649
639650 mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
640- mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
641- download_from_github (mutation_map_url , mutation_map_file )
651+ # mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
652+ # download_from_github(mutation_map_url, mutation_map_file)
642653 # New Transcriptomics Data
643654 print ("Starting Transcriptomics Data" )
644655 ##first run conversion tool
@@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path):
680691 imp_samp_map = pd .read_csv (args .curSamples )
681692 imp_drug_map = pd .read_csv (args .drugFile ,sep = '\t ' )
682693 original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
683- original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
694+ # original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
684695 # Generate Raw Drugs File to use in Curve fitting algorithm
685- download_from_github (original_drug_url , original_drug_file )
696+ # download_from_github(original_drug_url, original_drug_file)
686697 # Experiment Data
687698 updated_raw_drug_file = "beatAML_drug_raw.tsv"
688699 generate_raw_drug_file (original_drug_file ,sample_mapping_file , updated_raw_drug_file ,supplimentary_file )
0 commit comments