Skip to content

Commit 163000a

Browse files
committed
SMILES moved to Canonical. IsoSMILES dropped
1 parent 02b65aa commit 163000a

4 files changed

Lines changed: 74 additions & 69 deletions

File tree

build/beatAML/GetBeatAML.py

Lines changed: 65 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,25 @@
99
import argparse
1010
import time
1111

12-
def download_from_github(raw_url, save_path):
13-
"""
14-
Download a file from a raw GitHub URL and save to the specified path.
15-
16-
Parameters
17-
----------
18-
raw_url : str
19-
The raw GitHub URL pointing to the file to be downloaded.
20-
save_path : str
21-
The local path where the downloaded file will be saved.
22-
23-
Returns
24-
-------
25-
None
26-
"""
27-
response = requests.get(raw_url)
28-
with open(save_path, 'wb') as f:
29-
f.write(response.content)
30-
return
12+
# def download_from_github(raw_url, save_path):
13+
# """
14+
# Download a file from a raw GitHub URL and save to the specified path.
15+
16+
# Parameters
17+
# ----------
18+
# raw_url : str
19+
# The raw GitHub URL pointing to the file to be downloaded.
20+
# save_path : str
21+
# The local path where the downloaded file will be saved.
22+
23+
# Returns
24+
# -------
25+
# None
26+
# """
27+
# response = requests.get(raw_url)
28+
# with open(save_path, 'wb') as f:
29+
# f.write(response.content)
30+
# return
3131

3232
def retrieve_figshare_data(url):
3333
"""
@@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name):
178178
properties = data["PropertyTable"]["Properties"][0]
179179
pubchem_id = properties.get('CID',np.nan)
180180
canSMILES = properties.get("CanonicalSMILES", np.nan)
181-
isoSMILES = properties.get("IsomericSMILES", np.nan)
181+
# isoSMILES = properties.get("IsomericSMILES", np.nan)
182182
InChIKey = properties.get("InChIKey", np.nan)
183183
formula = properties.get("MolecularFormula", np.nan)
184184
weight = properties.get("MolecularWeight", np.nan)
185185

186-
return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight
186+
return pubchem_id, canSMILES, InChIKey, formula, weight
187187
else:
188-
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
188+
return np.nan, np.nan, np.nan, np.nan, np.nan
189189

190190

191191
def update_dataframe_with_pubchem(d_df):
@@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df):
230230
if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
231231
values = data_dict[row['chem_name']]
232232
else:
233-
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan))
233+
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan))
234234

235235
d_df.at[idx, 'pubchem_id'] = values[0]
236236
d_df.at[idx, "canSMILES"] = values[1]
237-
d_df.at[idx, "isoSMILES"] = values[2]
238-
d_df.at[idx, "InChIKey"] = values[3]
239-
d_df.at[idx, "formula"] = values[4]
240-
d_df.at[idx, "weight"] = values[5]
237+
# d_df.at[idx, "isoSMILES"] = values[2]
238+
d_df.at[idx, "InChIKey"] = values[2]
239+
d_df.at[idx, "formula"] = values[3]
240+
d_df.at[idx, "weight"] = values[4]
241241

242242
return d_df
243243

@@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map):
250250
d_df : pd.DataFrame
251251
Main drug dataframe containing drug-related columns.
252252
drug_map : pd.DataFrame
253-
Mapping dataframe containing drug information and the column 'isoSMILES'.
253+
Mapping dataframe containing drug information and the column 'canSMILES'.
254254
255255
Returns
256256
-------
257257
pd.DataFrame
258258
The merged dataframe containing combined drug information.
259259
"""
260-
print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
261-
d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
262-
drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
263-
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
260+
# print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
261+
d_df['canSMILES'] = d_df['canSMILES'].astype(str)
262+
drug_map['canSMILES'] = drug_map['canSMILES'].astype(str)
263+
result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left')
264264
return result_df
265265

266266
def format_drug_map(drug_map_path):
267267
"""
268268
Format and clean up the drug mapping file.
269269
270-
Reads a drug map file, removes duplicates based on the 'isoSMILES' column,
270+
Reads a drug map file, removes duplicates based on the 'canSMILES' column,
271271
and returns the cleaned dataframe.
272272
273273
Parameters
@@ -282,11 +282,11 @@ def format_drug_map(drug_map_path):
282282
"""
283283
if drug_map_path:
284284
drug_map = pd.read_csv(drug_map_path, sep = "\t")
285-
drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first')
285+
drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first')
286286
else:
287287
drug_map = pd.DataFrame(columns=[
288-
'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
289-
'isoSMILES', 'InChIKey', 'formula', 'weight'
288+
'improve_drug_id', 'chem_name', 'pubchem_id',
289+
'canSMILES', 'InChIKey', 'formula', 'weight'
290290
])
291291
return drug_map
292292

@@ -316,7 +316,7 @@ def format_drug_df(drug_path):
316316

317317
def add_improve_id(previous_df, new_df):
318318
"""
319-
Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe.
319+
Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe.
320320
321321
Parameters
322322
----------
@@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df):
335335
max_id = max(id_list) if id_list else 0
336336
else:
337337
max_id = 0
338-
# Identify isoSMILES in the new dataframe that don't exist in the old dataframe
339-
unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
340-
# Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
341-
mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
338+
# Identify canSMILES in the new dataframe that don't exist in the old dataframe
339+
unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES'])
340+
# Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN
341+
mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
342342
id_map = {}
343343
for smiles in unique_new_smiles:
344344
max_id += 1
345345
id_map[smiles] = f"SMI_{max_id}"
346-
# Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id
347-
new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map)
346+
# Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id
347+
new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map)
348348
return new_df
349349

350350

@@ -541,7 +541,7 @@ def generate_drug_list(drug_map_path,drug_path):
541541
d_res = add_improve_id(drug_map, d_res)
542542
#Drug Data
543543
#print(d_res)
544-
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]]
544+
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]]
545545
drug_res = drug_res.drop_duplicates()
546546
drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False)
547547

@@ -587,7 +587,12 @@ def generate_drug_list(drug_map_path,drug_path):
587587
# 'syn32533104',
588588
# 'syn32529921',
589589
'syn26642974',
590-
'syn26427390'
590+
'syn26427390',
591+
'syn64126458',
592+
'syn64126462',
593+
'syn64126463',
594+
'syn64126464',
595+
'syn64126468'
591596
]
592597
print("Downloading Files from Synapse")
593598
for entity_id in entity_ids:
@@ -597,13 +602,13 @@ def generate_drug_list(drug_map_path,drug_path):
597602
#gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47"
598603
#entrez_map_file = retrieve_figshare_data(gene_url)
599604

600-
additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
605+
# additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
601606
sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
602-
download_from_github(additional_mapping_url, sample_mapping_file)
607+
# download_from_github(additional_mapping_url, sample_mapping_file)
603608

604-
supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
609+
# supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
605610
supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
606-
download_from_github(supplementary_url, supplimentary_file)
611+
# download_from_github(supplementary_url, supplimentary_file)
607612

608613

609614
if args.samples:
@@ -619,26 +624,26 @@ def generate_drug_list(drug_map_path,drug_path):
619624
else:
620625
print("Drug File Provided. Proceeding with build.")
621626
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
622-
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
623-
download_from_github(original_drug_url, original_drug_file)
624-
generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add
627+
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
628+
# download_from_github(original_drug_url, original_drug_file)
629+
generate_drug_list(args.drugFile, original_drug_file)
625630
if args.omics:
626631
if args.genes is None or args.curSamples is None:
627632
print('Cannot process omics without sample mapping and gene mapping files')
628633
exit()
629634
else:
630635
improve_map_file = args.curSamples
631636
transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values
632-
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
633-
download_from_github(transcriptomics_url, transcriptomics_file)
637+
# transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
638+
# download_from_github(transcriptomics_url, transcriptomics_file)
634639

635640
mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
636-
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
637-
download_from_github(mutations_url, mutations_file)
641+
# mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
642+
# download_from_github(mutations_url, mutations_file)
638643

639644
mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
640-
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
641-
download_from_github(mutation_map_url, mutation_map_file)
645+
# mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
646+
# download_from_github(mutation_map_url, mutation_map_file)
642647
# New Transcriptomics Data
643648
print("Starting Transcriptomics Data")
644649
##first run conversion tool
@@ -680,9 +685,9 @@ def generate_drug_list(drug_map_path,drug_path):
680685
imp_samp_map = pd.read_csv(args.curSamples)
681686
imp_drug_map = pd.read_csv(args.drugFile,sep='\t')
682687
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
683-
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
688+
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
684689
# Generate Raw Drugs File to use in Curve fitting algorithm
685-
download_from_github(original_drug_url, original_drug_file)
690+
# download_from_github(original_drug_url, original_drug_file)
686691
# Experiment Data
687692
updated_raw_drug_file = "beatAML_drug_raw.tsv"
688693
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def main():
3939
opts = parser.parse_args()
4040

4141
###primary DF
42-
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
42+
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
4343
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
4444

4545
print('Downloading NSC identifiers for nci60 data')
@@ -69,7 +69,7 @@ def main():
6969
upper=[a.upper() for a in smiles['SMILES']]
7070
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
7171
##reduce to SMILES that are not already in the current drugs
72-
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
72+
# ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
7373
# Filter from the full `smiles` frame, keeping only entries whose upper-cased
# SMILES is absent from the current drugs' canSMILES column. The isoSMILES
# filter that used to bind `ssmiles` first is no longer active, and no other
# assignment is visible in this hunk, so filtering `ssmiles` against itself
# would read an unassigned name.
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
7474
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
7575
arr = set(pubchems['CID'])
@@ -102,7 +102,7 @@ def main():
102102
{
103103
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
104104
'canSMILES': [a for a in set(mdf['SMILES'])],
105-
'isoSMILES': [a for a in set(mdf['SMILES'])],
105+
# 'isoSMILES': [a for a in set(mdf['SMILES'])],
106106
'InChIKey': [None for a in range(smicount)],
107107
'formula': [None for a in range(smicount)],
108108
'weight': [None for a in range(smicount)]

build/mpnst/02_get_drug_data.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ if (!is.na(olddrugfiles)) {
103103
chem_name = character(),
104104
pubchem_id = character(),
105105
canSMILES = character(),
106-
isoSMILES = character(),
106+
# isoSMILES = character(),
107107
InChIKey = character(),
108108
formula = character(),
109109
weight = numeric(),
@@ -118,7 +118,7 @@ if (!is.na(olddrugfiles)) {
118118
chem_name = character(),
119119
pubchem_id = character(),
120120
canSMILES = character(),
121-
isoSMILES = character(),
121+
# isoSMILES = character(),
122122
InChIKey = character(),
123123
formula = character(),
124124
weight = numeric(),

build/utils/pubchem_retrieval.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,12 @@ def retrieve_drug_info(compound,ignore_chems,isname=True):
5454

5555
if isname:
5656
urls = {
57-
"properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
57+
"properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
5858
"synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound}/synonyms/JSON"
5959
}
6060
else:
6161
urls = {
62-
"properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
62+
"properties": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/property/CanonicalSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON",
6363
"synonyms": f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{compound}/synonyms/JSON"
6464
}
6565

@@ -188,9 +188,9 @@ def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv",ign
188188
mode = 'a' if file_exists else 'w'
189189
with open(output_filename, mode) as f:
190190
if not file_exists:
191-
f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tisoSMILES\tInChIKey\tformula\tweight\n")
191+
f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tInChIKey\tformula\tweight\n")
192192
for entry in data:
193-
f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry.get('IsomericSMILES', '')}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
193+
f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t{entry['CanonicalSMILES']}\t{entry['InChIKey']}\t{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
194194

195195
with open(ignore_chems,"a") as ig_f:
196196
for entry in data:

0 commit comments

Comments
 (0)