Skip to content

Commit 721b24b

Browse files
authored
Merge pull request #237 from PNNL-CompBio/split_broad_sanger
Cell Line Datasets Separated, Improved Error Handling for all Build Scripts
2 parents 4025953 + 85ed415 commit 721b24b

39 files changed

Lines changed: 673 additions & 375 deletions

.dockerignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ coderdata/
44
dataSummary/
55
docs/
66
candle_bmd/
7-
schema/
8-
build/local/
7+
build/local/

build/beatAML/GetBeatAML.py

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,8 @@ def retrieve_drug_info(compound_name):
174174
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
175175

176176
data = response.json()
177-
#print(data)
178177
if "PropertyTable" in data:
179178
properties = data["PropertyTable"]["Properties"][0]
180-
#print(properties)
181179
pubchem_id = properties.get('CID',np.nan)
182180
canSMILES = properties.get("CanonicalSMILES", np.nan)
183181
isoSMILES = properties.get("IsomericSMILES", np.nan)
@@ -259,9 +257,6 @@ def merge_drug_info(d_df,drug_map):
259257
pd.DataFrame
260258
The merged dataframe containing combined drug information.
261259
"""
262-
#print(drug_map)
263-
#print(d_df.columns)
264-
#print(d_df)
265260
print(d_df['isoSMILES'].dtype, drug_map['isoSMILES'].dtype)
266261
d_df['isoSMILES'] = d_df['isoSMILES'].astype(str)
267262
drug_map['isoSMILES'] = drug_map['isoSMILES'].astype(str)
@@ -337,10 +332,9 @@ def add_improve_id(previous_df, new_df):
337332
"""
338333
if not previous_df.empty and 'improve_drug_id' in previous_df.columns:
339334
id_list = [int(val.replace('SMI_', '')) for val in previous_df['improve_drug_id'].tolist() if pd.notnull(val) and val.startswith('SMI_')]
340-
max_id = max(id_list) if id_list else 0 # Default to 0 if the list is empty
335+
max_id = max(id_list) if id_list else 0
341336
else:
342-
max_id = 0 # Default value if the DataFrame is empty or doesn't have the column
343-
# max_id = max([int(val.replace('SMI_', '')) for val in previous_df['improve_drug_id'].tolist() if pd.notnull(val) and val.startswith('SMI_')])
337+
max_id = 0
344338
# Identify isoSMILES in the new dataframe that don't exist in the old dataframe
345339
unique_new_smiles = set(new_df['isoSMILES']) - set(previous_df['isoSMILES'])
346340
# Identify rows in the new dataframe with isoSMILES that are unique and where improve_drug_id is NaN
@@ -370,24 +364,9 @@ def map_exp_to_improve(exp_path):#df,improve_map_file):
370364
pd.DataFrame
371365
Mapped dataframe with 'improve_sample_id' added and 'sample_id' removed.
372366
"""
373-
mapped_df = pd.read_csv(exp_path,sep='\t') # Map sample_id to improve_sample_id
374-
#mapped_df = pd.merge(df, improve[['other_id', 'improve_sample_id']], left_on='sample_id', right_on='other_id', how='left')
375-
#mapped_df.drop(columns=['sample_id', 'other_id'], inplace=True)
376-
#mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
367+
mapped_df = pd.read_csv(exp_path,sep='\t')
377368
mapped_df['source'] = 'synapse'
378369
mapped_df['study'] = 'BeatAML'
379-
#mapped_df= mapped_df.rename(columns={'Drug':'improve_sample_id',
380-
# 'IC50':'ic50',
381-
# 'EC50':'ec50',
382-
# 'EC50se':'ec50se',
383-
# 'Einf':'einf',
384-
# 'HS':'hs',
385-
# 'AAC1':'aac1',
386-
# 'AUC1':'auc1',
387-
# 'DSS1':'dss1',
388-
# 'R2fit':'r2fit'
389-
# }
390-
# )
391370
return mapped_df
392371

393372

@@ -445,12 +424,21 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
445424
mapped_df.rename(columns={"hgvsc": "mutation"}, inplace=True)
446425
mapped_df.rename(columns={"labId": "sample_id"}, inplace=True)
447426
mapped_df.rename(columns={"Entrez_Gene_Id": "entrez_id"}, inplace=True)
448-
449-
elif data_type == "mutation":
450-
df = df[['dbgap_sample_id','hgvsc', 'hgvsp', 'gene', 'variant_classification','t_vaf', 'refseq', 'symbol']]
451-
mapped_df = df.merge(genes, left_on='symbol', right_on='gene_symbol', how='left').reindex(
452-
columns=['hgvsc', 'entrez_id', "dbgap_sample_id","variant_classification"])
453427

428+
variant_mapping = {
429+
'frameshift_variant': 'Frameshift_Variant',
430+
'missense_variant': 'Missense_Mutation',
431+
'stop_gained': 'Nonsense_Mutation',
432+
'inframe_deletion': 'In_Frame_Del',
433+
'protein_altering_variant': 'Protein_Altering_Variant',
434+
'splice_acceptor_variant': 'Splice_Site',
435+
'splice_donor_variant': 'Splice_Site',
436+
'start_lost': 'Start_Codon_Del',
437+
'inframe_insertion': 'In_Frame_Ins',
438+
'stop_lost': 'Nonstop_Mutation'
439+
}
440+
441+
mapped_df['variant_classification'] = mapped_df['variant_classification'].map(variant_mapping)
454442

455443
elif data_type == "proteomics":
456444
mapped_ids['sampleID'] = mapped_ids['sampleID'].str.split('_').apply(lambda x: x[2])
@@ -473,7 +461,6 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
473461
inplace=True
474462
)
475463

476-
477464
mapped_df = pd.merge(mapped_df, improve[['other_id', 'improve_sample_id']],
478465
left_on='sample_id',
479466
right_on='other_id',
@@ -482,7 +469,7 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
482469
mapped_df['source'] = 'synapse'
483470
mapped_df['study'] = 'BeatAML'
484471

485-
final_dataframe = mapped_df.dropna()#pd.dropna(mapped_df,0)
472+
final_dataframe = mapped_df.dropna()
486473
return final_dataframe
487474

488475

@@ -659,8 +646,6 @@ def generate_drug_list(drug_map_path,drug_path):
659646

660647

661648
t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
662-
# t_df.index = t_df.stable_id#display_label
663-
# t_df = t_df.iloc[:, 4:]
664649
t_df = t_df.reset_index().rename(columns={'stable_id': 'Gene'})
665650
t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
666651
print(improve_map_file)
@@ -724,7 +709,5 @@ def generate_drug_list(drug_map_path,drug_path):
724709
exp_res = map_exp_to_improve(drug_path)
725710
exp_res.to_csv("/tmp/beataml_experiments.tsv", index=False, sep='\t')
726711

727-
#drug_map_path = retrieve_figshare_data("https://figshare.com/ndownloader/files/43112314?private_link=0ea222d9bd461c756fb0")
728-
729712
# print("Finished Pipeline")
730713

build/beatAML/build_drugs.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,10 @@
1+
#!/bin/bash
# Build the BeatAML drug table and drug descriptors.
# Usage: build_drugs.sh <drugFile>
# Requires SYNAPSE_AUTH_TOKEN in the environment.
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running GetBeatAML.py with token and drugFile $1"
# Quote expansions: unquoted $1 / $SYNAPSE_AUTH_TOKEN would be word-split and
# glob-expanded, silently breaking on paths with spaces or special characters.
python GetBeatAML.py --token "$SYNAPSE_AUTH_TOKEN" --drugs --drugFile "$1"

echo "Running build_drug_desc.py..."
python build_drug_desc.py --drugtable /tmp/beataml_drugs.tsv --desctable /tmp/beataml_drug_descriptors.tsv.gz

build/beatAML/build_exp.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1+
#!/bin/bash
# Build the BeatAML experiments (drug-response) table.
# Usage: build_exp.sh <curSamples> <drugFile>
# Requires SYNAPSE_AUTH_TOKEN in the environment.
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running GetBeatAML.py with token and curSamples $1 and drugFile $2."
# Quote expansions: unquoted $1/$2/$SYNAPSE_AUTH_TOKEN would be word-split and
# glob-expanded, breaking on paths with spaces or special characters.
python GetBeatAML.py --exp --token "$SYNAPSE_AUTH_TOKEN" --curSamples "$1" --drugFile "$2"

build/beatAML/build_omics.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1+
#!/bin/bash
# Build the BeatAML omics tables.
# Usage: build_omics.sh <genes> <curSamples>
# Requires SYNAPSE_AUTH_TOKEN in the environment.
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running GetBeatAML.py with token, curSamples $2, and genes $1."
# Quote expansions: unquoted $1/$2/$SYNAPSE_AUTH_TOKEN would be word-split and
# glob-expanded, breaking on paths with spaces or special characters.
python GetBeatAML.py --token "$SYNAPSE_AUTH_TOKEN" --omics --curSamples "$2" --genes "$1"

build/beatAML/build_samples.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1+
#!/bin/bash
# Build the BeatAML samples table.
# Usage: build_samples.sh <prevSamples>
# Requires SYNAPSE_AUTH_TOKEN in the environment.
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running GetBeatAML.py with token and prevSamples $1."
# Quote expansions: unquoted $1 / $SYNAPSE_AUTH_TOKEN would be word-split and
# glob-expanded, breaking on paths with spaces or special characters.
python GetBeatAML.py --token "$SYNAPSE_AUTH_TOKEN" --samples --prevSamples "$1"

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,21 @@ def main():
122122
merged = pl.concat([mdf,namedf],how='horizontal').select(['SMILES','pubchem_id','nscid','lower_name'])
123123
melted = merged.melt(id_vars=['SMILES','pubchem_id'],value_vars=['nscid','lower_name']).select(['SMILES','pubchem_id','value']).unique()
124124
melted.columns = ['canSMILES','pubchem_id','chem_name']
125-
if newdf.shape[0]>0:
126-
newdf = newdf.join(melted,on='canSMILES',how='inner').select(res.columns)
127-
res = pl.concat([res,newdf],how='vertical')
125+
126+
if newdf.shape[0] > 0:
127+
res = res.with_columns([
128+
pl.col("InChIKey").cast(pl.Utf8),
129+
pl.col("formula").cast(pl.Utf8),
130+
pl.col("weight").cast(pl.Utf8)
131+
])
132+
newdf = newdf.with_columns([
133+
pl.col("InChIKey").cast(pl.Utf8),
134+
pl.col("formula").cast(pl.Utf8),
135+
pl.col("weight").cast(pl.Utf8)
136+
])
137+
138+
newdf = newdf.join(melted, on='canSMILES', how='inner').select(res.columns)
139+
res = pl.concat([res, newdf], how='vertical')
128140
res.write_csv(opts.output,separator='\t')
129141

130142
if __name__=='__main__':
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""Split the combined Broad/Sanger build outputs into per-study files."""
import gc

import polars as pl


def main():
    """Write one filtered copy of each build output per study under /tmp.

    For every study, the merged experiments table is filtered to that study,
    then the samples / omics / drugs tables are reduced to the sample and
    drug identifiers appearing in those experiments. Each frame is written
    out, deleted, and garbage-collected immediately to keep peak memory low.
    """
    studies = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
    omics_types = ["transcriptomics", "proteomics", "copy_number", "mutations"]  # csv inputs
    sample_types = ["samples"]  # csv inputs
    drug_types = ["drugs", "drug_descriptors"]  # tsv inputs

    # Which omics 'source' values belong to each study's cell lines.
    study_sources = {
        "CCLE": ["Broad"],
        "CTRPv2": ["Broad"],
        "PRISM": ["Broad"],
        "GDSCv1": ["Sanger"],
        "GDSCv2": ["Sanger"],
        "FIMM": ["Broad"],
        # gCSI generates its own omics data but it is comparable to CCLE.
        # In future, retrieve gCSI omics.
        "gCSI": ["Broad"],
        "NCI60": ["Broad"],
    }

    for study in studies:
        # Re-read the experiments file on every pass rather than keeping the
        # full table resident across iterations (memory over speed).
        experiments = pl.read_csv("broad_sanger_experiments.tsv", separator="\t")
        experiments = experiments.filter(pl.col("study") == study)

        # Identifiers used to carve this study out of the other tables.
        sample_ids = experiments["improve_sample_id"].unique().to_list()
        drug_ids = experiments["improve_drug_id"].unique().to_list()

        # Write the filtered experiments, then release the frame.
        experiments.write_csv(f"/tmp/{study}_experiments.tsv".lower(), separator="\t")
        del experiments
        gc.collect()

        # Samples: keep rows whose sample id occurs in this study's experiments.
        for datatype in sample_types:
            frame = pl.read_csv(f"broad_sanger_{datatype}.csv")
            frame = frame.filter(pl.col("improve_sample_id").is_in(sample_ids))
            frame.write_csv(f"/tmp/{study}_{datatype}.csv".lower())
            del frame
            gc.collect()

        # Omics: additionally restrict rows to this study's data sources.
        for datatype in omics_types:
            frame = pl.read_csv(f"broad_sanger_{datatype}.csv")
            frame = frame.filter(pl.col("improve_sample_id").is_in(sample_ids))
            frame = frame.filter(pl.col("source").is_in(study_sources[study]))
            frame.write_csv(f"/tmp/{study}_{datatype}.csv".lower())
            del frame
            gc.collect()

        # Drugs: descriptor columns hold mixed values, so force them to text.
        for datatype in drug_types:
            in_path = f"broad_sanger_{datatype}.tsv"
            out_path = f"/tmp/{study}_{datatype}.tsv".lower()
            if datatype == "drug_descriptors":
                frame = pl.read_csv(
                    in_path,
                    separator="\t",
                    dtypes={
                        "improve_drug_id": pl.Utf8,
                        "structural_descriptor": pl.Utf8,
                        "descriptor_value": pl.Utf8,
                    },
                )
            else:
                frame = pl.read_csv(in_path, separator="\t")

            frame = frame.filter(pl.col("improve_drug_id").is_in(drug_ids))
            frame.write_csv(out_path, separator="\t")
            del frame
            gc.collect()


if __name__ == "__main__":
    main()

build/broad_sanger/build_drugs.sh

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1-
/opt/venv/bin/python 03a-nci60Drugs.py
1+
#!/bin/bash
# Build the combined Broad/Sanger drug table and drug descriptors.
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

# All Python steps run from the container's virtualenv.
PYTHON=/opt/venv/bin/python

echo "Running 03a-nci60Drugs.py..."
"$PYTHON" 03a-nci60Drugs.py

echo "Running 03-createDrugFile.R..."
Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM

echo "Running build_drug_desc.py..."
"$PYTHON" build_drug_desc.py --drugtable /tmp/broad_sanger_drugs.tsv --desctable /tmp/broad_sanger_drug_descriptors.tsv.gz

build/broad_sanger/build_exp.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
1-
/opt/venv/bin/python 04-drug_dosage_and_curves.py --drugfile $2 --curSampleFile $1
1+
#!/bin/bash
# Build the Broad/Sanger experiments (dose-response) table.
# Usage: build_exp.sh <curSampleFile> <drugfile>
set -euo pipefail

# Report the failing command and its line before exiting (fires via set -e).
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running 04-drug_dosage_and_curves.py with drugfile $2 and curSampleFile $1"
# Quote expansions: unquoted $1/$2 would be word-split and glob-expanded,
# silently breaking on paths with spaces or special characters.
/opt/venv/bin/python 04-drug_dosage_and_curves.py --drugfile "$2" --curSampleFile "$1"

0 commit comments

Comments
 (0)