Skip to content

Commit f2120e4

Browse files
authored
Merge pull request #252 from PNNL-CompBio/drop_drugs
Drugs, BeatAML, CanSMILES Updates
2 parents 0054cab + 54dbe7c commit f2120e4

34 files changed

Lines changed: 1284 additions & 127 deletions

build/beatAML/GetBeatAML.py

Lines changed: 71 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,25 @@
99
import argparse
1010
import time
1111

12-
def download_from_github(raw_url, save_path):
13-
"""
14-
Download a file from a raw GitHub URL and save to the specified path.
15-
16-
Parameters
17-
----------
18-
raw_url : str
19-
The raw GitHub URL pointing to the file to be downloaded.
20-
save_path : str
21-
The local path where the downloaded file will be saved.
22-
23-
Returns
24-
-------
25-
None
26-
"""
27-
response = requests.get(raw_url)
28-
with open(save_path, 'wb') as f:
29-
f.write(response.content)
30-
return
12+
# def download_from_github(raw_url, save_path):
13+
# """
14+
# Download a file from a raw GitHub URL and save to the specified path.
15+
16+
# Parameters
17+
# ----------
18+
# raw_url : str
19+
# The raw GitHub URL pointing to the file to be downloaded.
20+
# save_path : str
21+
# The local path where the downloaded file will be saved.
22+
23+
# Returns
24+
# -------
25+
# None
26+
# """
27+
# response = requests.get(raw_url)
28+
# with open(save_path, 'wb') as f:
29+
# f.write(response.content)
30+
# return
3131

3232
def retrieve_figshare_data(url):
3333
"""
@@ -178,14 +178,14 @@ def retrieve_drug_info(compound_name):
178178
properties = data["PropertyTable"]["Properties"][0]
179179
pubchem_id = properties.get('CID',np.nan)
180180
canSMILES = properties.get("CanonicalSMILES", np.nan)
181-
isoSMILES = properties.get("IsomericSMILES", np.nan)
181+
# isoSMILES = properties.get("IsomericSMILES", np.nan)
182182
InChIKey = properties.get("InChIKey", np.nan)
183183
formula = properties.get("MolecularFormula", np.nan)
184184
weight = properties.get("MolecularWeight", np.nan)
185185

186-
return pubchem_id, canSMILES, isoSMILES, InChIKey, formula, weight
186+
return pubchem_id, canSMILES, InChIKey, formula, weight
187187
else:
188-
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
188+
return np.nan, np.nan, np.nan, np.nan, np.nan
189189

190190

191191
def update_dataframe_with_pubchem(d_df):
@@ -230,14 +230,14 @@ def update_dataframe_with_pubchem(d_df):
230230
if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
231231
values = data_dict[row['chem_name']]
232232
else:
233-
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan))
233+
values = data_dict.get(row['other_name'], (np.nan, np.nan, np.nan, np.nan, np.nan))
234234

235235
d_df.at[idx, 'pubchem_id'] = values[0]
236236
d_df.at[idx, "canSMILES"] = values[1]
237-
d_df.at[idx, "isoSMILES"] = values[2]
238-
d_df.at[idx, "InChIKey"] = values[3]
239-
d_df.at[idx, "formula"] = values[4]
240-
d_df.at[idx, "weight"] = values[5]
237+
# d_df.at[idx, "isoSMILES"] = values[2]
238+
d_df.at[idx, "InChIKey"] = values[2]
239+
d_df.at[idx, "formula"] = values[3]
240+
d_df.at[idx, "weight"] = values[4]
241241

242242
return d_df
243243

@@ -250,24 +250,24 @@ def merge_drug_info(d_df,drug_map):
250250
d_df : pd.DataFrame
251251
Main drug dataframe containing drug-related columns.
252252
drug_map : pd.DataFrame
253-
Mapping dataframe containing drug information and the column 'isoSMILES'.
253+
Mapping dataframe containing drug information and the column 'canSMILES'.
254254
255255
Returns
256256
-------
257257
pd.DataFrame
258258
The merged dataframe containing combined drug information.
259259
"""
260-
print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
261-
d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
262-
drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
263-
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
260+
# print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
261+
d_df['canSMILES'] = d_df['canSMILES'].astype(str)
262+
drug_map['canSMILES'] = drug_map['canSMILES'].astype(str)
263+
result_df = d_df.merge(drug_map[['canSMILES', 'improve_drug_id']], on='canSMILES', how='left')
264264
return result_df
265265

266266
def format_drug_map(drug_map_path):
267267
"""
268268
Format and clean up the drug mapping file.
269269
270-
Reads a drug map file, removes duplicates based on the 'isoSMILES' column,
270+
Reads a drug map file, removes duplicates based on the 'canSMILES' column,
271271
and returns the cleaned dataframe.
272272
273273
Parameters
@@ -282,11 +282,11 @@ def format_drug_map(drug_map_path):
282282
"""
283283
if drug_map_path:
284284
drug_map = pd.read_csv(drug_map_path, sep = "\t")
285-
drug_map = drug_map.drop_duplicates(subset='isoSMILES', keep='first')
285+
drug_map = drug_map.drop_duplicates(subset='canSMILES', keep='first')
286286
else:
287287
drug_map = pd.DataFrame(columns=[
288-
'improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES',
289-
'isoSMILES', 'InChIKey', 'formula', 'weight'
288+
'improve_drug_id', 'chem_name', 'pubchem_id',
289+
'canSMILES', 'InChIKey', 'formula', 'weight'
290290
])
291291
return drug_map
292292

@@ -316,7 +316,7 @@ def format_drug_df(drug_path):
316316

317317
def add_improve_id(previous_df, new_df):
318318
"""
319-
Add 'improve_drug_id' to the new dataframe based on unique 'isoSMILES' not present in the previous dataframe.
319+
Add 'improve_drug_id' to the new dataframe based on unique 'canSMILES' not present in the previous dataframe.
320320
321321
Parameters
322322
----------
@@ -335,16 +335,16 @@ def add_improve_id(previous_df, new_df):
335335
max_id = max(id_list) if id_list else 0
336336
else:
337337
max_id = 0
338-
# Identify isoSMILES in the new dataframe that don't exist in the old dataframe
339-
unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
340-
# Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
341-
mask = (new_df['isoSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
338+
# Identify canSMILES in the new dataframe that don't exist in the old dataframe
339+
unique_new_smiles = set(new_df['canSMILES']) - set(previous_df['canSMILES'])
340+
# Identify rows in the new dataframe with canSMILES that are unique and where improve_drug_id is NaN
341+
mask = (new_df['canSMILES'].isin(unique_new_smiles)) & (new_df['improve_drug_id'].isna())
342342
id_map = {}
343343
for smiles in unique_new_smiles:
344344
max_id += 1
345345
id_map[smiles] = f"SMI_{max_id}"
346-
# Apply the mapping to the new dataframe for rows with unique isoSMILES and NaN improve_drug_id
347-
new_df.loc[mask, 'improve_drug_id'] = new_df['isoSMILES'].map(id_map)
346+
# Apply the mapping to the new dataframe for rows with unique canSMILES and NaN improve_drug_id
347+
new_df.loc[mask, 'improve_drug_id'] = new_df['canSMILES'].map(id_map)
348348
return new_df
349349

350350

@@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
466466
right_on='other_id',
467467
how='left')
468468
mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
469+
470+
print(mapped_df.to_string())
471+
mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int)
472+
mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0)
473+
mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int)
469474
mapped_df['source'] = 'synapse'
470475
mapped_df['study'] = 'BeatAML'
476+
mapped_df =mapped_df.drop_duplicates()
471477

472478
final_dataframe = mapped_df.dropna()
473479
return final_dataframe
@@ -541,7 +547,7 @@ def generate_drug_list(drug_map_path,drug_path):
541547
d_res = add_improve_id(drug_map, d_res)
542548
#Drug Data
543549
#print(d_res)
544-
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES","isoSMILES"]]
550+
drug_res = d_res[["improve_drug_id","chem_name","pubchem_id","formula","weight","InChIKey","canSMILES"]]
545551
drug_res = drug_res.drop_duplicates()
546552
drug_res.to_csv("/tmp/beataml_drugs.tsv",sep="\t", index=False)
547553

@@ -587,7 +593,12 @@ def generate_drug_list(drug_map_path,drug_path):
587593
# 'syn32533104',
588594
# 'syn32529921',
589595
'syn26642974',
590-
'syn26427390'
596+
'syn26427390',
597+
'syn64126458',
598+
'syn64126462',
599+
'syn64126463',
600+
'syn64126464',
601+
'syn64126468'
591602
]
592603
print("Downloading Files from Synapse")
593604
for entity_id in entity_ids:
@@ -597,13 +608,13 @@ def generate_drug_list(drug_map_path,drug_path):
597608
#gene_url = "https://figshare.com/ndownloader/files/40576109?private_link=525f7777039f4610ef47"
598609
#entrez_map_file = retrieve_figshare_data(gene_url)
599610

600-
additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
611+
# additional_mapping_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
601612
sample_mapping_file = "beataml_waves1to4_sample_mapping.xlsx"
602-
download_from_github(additional_mapping_url, sample_mapping_file)
613+
# download_from_github(additional_mapping_url, sample_mapping_file)
603614

604-
supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
615+
# supplementary_url = 'https://ars.els-cdn.com/content/image/1-s2.0-S1535610822003129-mmc2.xlsx'
605616
supplimentary_file = '1-s2.0-S1535610822003129-mmc2.xlsx'
606-
download_from_github(supplementary_url, supplimentary_file)
617+
# download_from_github(supplementary_url, supplimentary_file)
607618

608619

609620
if args.samples:
@@ -619,26 +630,26 @@ def generate_drug_list(drug_map_path,drug_path):
619630
else:
620631
print("Drug File Provided. Proceeding with build.")
621632
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
622-
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
623-
download_from_github(original_drug_url, original_drug_file)
624-
generate_drug_list(args.drugFile, original_drug_file) ##this doesn't exist, need to add
633+
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
634+
# download_from_github(original_drug_url, original_drug_file)
635+
generate_drug_list(args.drugFile, original_drug_file)
625636
if args.omics:
626637
if args.genes is None or args.curSamples is None:
627638
print('Cannot process omics without sample mapping and gene mapping files')
628639
exit()
629640
else:
630641
improve_map_file = args.curSamples
631642
transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this is the wrong file, these are the normalize values
632-
transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
633-
download_from_github(transcriptomics_url, transcriptomics_file)
643+
# transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
644+
# download_from_github(transcriptomics_url, transcriptomics_file)
634645

635646
mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
636-
mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
637-
download_from_github(mutations_url, mutations_file)
647+
# mutations_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wes_wv1to4_mutations_dbgap.txt"
648+
# download_from_github(mutations_url, mutations_file)
638649

639650
mutation_map_file = "beataml_waves1to4_sample_mapping.xlsx"
640-
mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
641-
download_from_github(mutation_map_url, mutation_map_file)
651+
# mutation_map_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_sample_mapping.xlsx"
652+
# download_from_github(mutation_map_url, mutation_map_file)
642653
# New Transcriptomics Data
643654
print("Starting Transcriptomics Data")
644655
##first run conversion tool
@@ -680,9 +691,9 @@ def generate_drug_list(drug_map_path,drug_path):
680691
imp_samp_map = pd.read_csv(args.curSamples)
681692
imp_drug_map = pd.read_csv(args.drugFile,sep='\t')
682693
original_drug_file = "beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
683-
original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
694+
# original_drug_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_wv1to4_raw_inhibitor_v4_dbgap.txt"
684695
# Generate Raw Drugs File to use in Curve fitting algorithm
685-
download_from_github(original_drug_url, original_drug_file)
696+
# download_from_github(original_drug_url, original_drug_file)
686697
# Experiment Data
687698
updated_raw_drug_file = "beatAML_drug_raw.tsv"
688699
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def main():
3939
opts = parser.parse_args()
4040

4141
###primary DF
42-
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
42+
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],\
4343
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
4444

4545
print('Downloading NSC identifiers for nci60 data')
@@ -69,7 +69,7 @@ def main():
6969
upper=[a.upper() for a in smiles['SMILES']]
7070
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
7171
##reduce to SMILES that are not already in current drugs
72-
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
72+
# ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
7373
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
7474
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
7575
arr = set(pubchems['CID'])
@@ -102,7 +102,7 @@ def main():
102102
{
103103
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
104104
'canSMILES': [a for a in set(mdf['SMILES'])],
105-
'isoSMILES': [a for a in set(mdf['SMILES'])],
105+
# 'isoSMILES': [a for a in set(mdf['SMILES'])],
106106
'InChIKey': [None for a in range(smicount)],
107107
'formula': [None for a in range(smicount)],
108108
'weight': [None for a in range(smicount)]

build/broad_sanger/04b-nci60-updated.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,11 @@ def main():
107107

108108
finaldf = pl.DataFrame(
109109
{
110-
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
110+
'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
111111
'improve_sample_id':molar['improve_sample_id'],
112112
'Drug':molar['improve_drug_id'],
113-
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
113+
# 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
114+
'study': "NCI60",
114115
'time':molar['time'],
115116
'time_unit':molar['time_unit'],
116117
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import gc
2+
import polars as pl
3+
4+
5+
6+
def main():
    """Strip a fixed set of problem compounds from the broad_sanger tables.

    Loads ``broad_sanger_drugs.tsv`` and ``broad_sanger_experiments.tsv``
    (tab-separated) from the current directory, finds every drug row whose
    lower-cased ``chem_name`` appears in the hard-coded name list, and drops
    all rows carrying the matching ``improve_drug_id`` values from both
    tables. Both files are then overwritten in place. Meant to run before
    the datasets are split.
    """
    drugs_path = "broad_sanger_drugs.tsv"
    experiments_path = "broad_sanger_experiments.tsv"

    drugs = pl.read_csv(drugs_path, separator="\t")
    experiments = pl.read_csv(experiments_path, separator="\t")

    # Names are kept lower-case so they can be compared against the
    # lower-cased chem_name column (case-insensitive match).
    brd_list = [
        'brd-k03911514',
        'brd-k07442505',
        'brd-k13185470',
        'brd-k16130065',
        'brd-k20514654',
        'brd-k27188169',
        'brd-k55473186',
        'yl54',
        'brd-k58730230',
        'brd-k79669418',
        'brd-k99584050',
    ]

    # Collect the improve_drug_id of every drug row whose name is on the list.
    name_is_flagged = pl.col("chem_name").str.to_lowercase().is_in(brd_list)
    flagged_ids = drugs.filter(name_is_flagged)["improve_drug_id"].to_list()

    # The same "id is not flagged" predicate is applied to both tables.
    keep_row = ~pl.col("improve_drug_id").is_in(flagged_ids)
    drugs = drugs.filter(keep_row)
    experiments = experiments.filter(keep_row)

    # Overwrite the inputs with the filtered tables.
    drugs.write_csv(drugs_path, separator="\t")
    experiments.write_csv(experiments_path, separator="\t")


if __name__ == "__main__":
    main()

build/broad_sanger/05_separate_datasets.py renamed to build/broad_sanger/05b_separate_datasets.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55

66
def main():
7-
87
datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
98
omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
109
samples_datatypes = ["samples"] #csv

build/broad_sanger/build_misc.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@ set -euo pipefail
44
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
55

66
cp /tmp/broad_sanger* .
7-
echo "Running 05_separate_datasets.py..."
8-
/opt/venv/bin/python 05_separate_datasets.py
7+
8+
echo "Running 05a_remove_problem_drugs.py..."
9+
/opt/venv/bin/python 05a_remove_problem_drugs.py
10+
11+
echo "Running 05b_separate_datasets.py..."
12+
/opt/venv/bin/python 05b_separate_datasets.py
913

1014
echo "Removing broad_sanger* files..."
1115
rm broad_sanger*

0 commit comments

Comments
 (0)