Skip to content

Commit 4025953

Browse files
authored
Merge pull request #232 from PNNL-CompBio/nci-fix
Nci fix
2 parents 1fc3e75 + 4895f96 commit 4025953

8 files changed

Lines changed: 97 additions & 42 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ __pycache__
1717
tests/__pycache__
1818
dist
1919
build/lib
20+
build/local

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,23 @@
1313

1414
##drug files
1515
smi_strings='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=1&modificationDate=1710381820000&api=v2&download=true'
16+
#oct 2024
17+
smi_strings = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=3&modificationDate=1727924130457&api=v2&download=true'
18+
1619
pc_ids='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=2&modificationDate=1712766341112&api=v2&download=true'
20+
pc_ids = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=4&modificationDate=1727924129121&api=v2&download=true'
21+
22+
#oct 2024
1723
chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemcal_name.csv?version=1&modificationDate=1710382716000&api=v2&download=true'
24+
25+
chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemical_name.csv?version=1&modificationDate=1727924127004&api=v2'
26+
#oct 2024
1827
cas='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=1&modificationDate=1710381783000&api=v2&download=true'
28+
#oct 2024
29+
cas = 'https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=3&modificationDate=1727924126194&api=v2&download=true'
1930
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
31+
##OCT 2024
32+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'
2033

2134

2235
def main():
@@ -39,13 +52,27 @@ def main():
3952
if not os.path.exists('DOSERESP.csv'):
4053
resp = request.urlretrieve(conc_data,'doseresp.zip')
4154
os.system('unzip doseresp.zip')
42-
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
55+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
4356
pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
57+
smiles = smiles.filter(pl.col("NSC").is_in(dose_resp['NSC']))
4458
##first retreive pubchem data
4559
if opts.test:
4660
arr = rand.sample(list(pubchems['CID']),100)
4761
else:
4862
arr = set(pubchems['CID'])
63+
64+
##first filter to see if there are structures/drugs in teh data already. i dont think this does much.
65+
if os.path.exists(opts.output):
66+
curdrugs = pl.read_csv(opts.output,separator='\t')
67+
# cs = set(curdrugs['isoSMILES'])
68+
smiles = smiles.filter(pl.col('SMILES').is_not_null())
69+
upper=[a.upper() for a in smiles['SMILES']]
70+
smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
71+
##reduce to smiels only in current drugs
72+
ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
73+
ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
74+
pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
75+
arr = set(pubchems['CID'])
4976

5077
print("Querying pubchem from CIDs")
5178
pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)

build/broad_sanger/04a-drugResponseData.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ getDoseRespData<-function(dset,studyName,improve_samples,drug.map){
123123
dplyr::full_join(respDat,by=c('doseNum','exp_id'))%>%
124124
left_join(full.map)%>%
125125
dplyr::select(Drug=improve_drug_id,improve_sample_id=improve_sample_id,DOSE=Dose,GROWTH=Response)%>%
126-
#dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are mM
126+
#dplyr::mutate(DOSE=-log10(Dose/1000))###curve fitting code requires -log10(M), these are uM according to https://github.com/bhklab/PharmacoGx/blob/master/vignettes/CreatingPharmacoSet.Rmd
127127
#rename(GROWTH=RESPONSE)%>%
128128
mutate(source='pharmacoGX')%>%
129129
mutate(study=studyName)|>

build/broad_sanger/04b-nci60-updated.py

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
'''
2-
gets nci60 data from 10/2023 release
2+
gets nci60 data from 10/2024 release
33
44
'''
55

@@ -12,13 +12,17 @@
1212
from urllib import request
1313

1414
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
15+
##OCT 2024
16+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'
17+
1518
cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'
1619

1720
def main():
1821

1922
parser = argparse.ArgumentParser()
2023
parser.add_argument('--sampleFile',dest='samplefile',default=None,help='DepMap sample file')
2124
parser.add_argument('--drugFile',dest='dfile',default=None,help='Drug database')
25+
2226

2327
opts = parser.parse_args()
2428

@@ -31,7 +35,7 @@ def main():
3135
samples = pl.read_csv(samplefile,quote_char='"')
3236
drugs = pl.read_csv(drugfile,separator='\t',quote_char='"')
3337

34-
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
38+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000,ignore_errors=True)
3539

3640
##update drug mapping
3741
drugmapping = pl.DataFrame(
@@ -47,7 +51,10 @@ def main():
4751
###update sample mapping
4852
on = samples[['other_names','improve_sample_id']]
4953
on.columns=['common_name','improve_sample_id']
50-
54+
55+
#there should be 71 cell lines, but there are 163.
56+
# 82 map to the 'other_names'
57+
# 81 map to neither
5158
sampmapping = pl.concat([on[['common_name','improve_sample_id']],samples[['common_name','improve_sample_id']]])
5259

5360
sampmapping = sampmapping.unique()
@@ -64,44 +71,50 @@ def main():
6471

6572

6673
##now we can merge all the data into the dose response data frame
67-
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
74+
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
6875
merged = merged.join(timemapping,on='EXPID',how='left')
6976

7077
##clean up mssing samples
7178
nonulls = merged.filter(pl.col('improve_sample_id').is_not_null())
7279

7380
nulls = merged.filter(pl.col('improve_sample_id').is_null())
7481

75-
newnames = pl.DataFrame(
76-
{
77-
'new_name': [re.split(r' |\(|/', a)[0] for a in nulls['CELL_NAME']],
78-
'CELL_NAME':nulls['CELL_NAME']
79-
}
80-
)
81-
newnames = newnames.unique()
82+
# newnames = pl.DataFrame(
83+
# {
84+
# 'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
85+
# 'CELL_NAME':nulls['CELL_NAME']
86+
# }
87+
# )
88+
# newnames = newnames.unique()
89+
8290

83-
fixed = nulls[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
84-
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
85-
fixed = fixed.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
86-
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
87-
fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
91+
# fixed = nulls[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
92+
# merged.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
93+
# fixed = merged.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
94+
# fixed.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
95+
# fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
8896

89-
merged = pl.concat([nonulls,fixed])
97+
merged = nonulls#pl.concat([nonulls,fixed])
9098

9199
###we get a few more results added, but still missing a bunch
92100
merged = merged.join(drugmapping,on='NSC',how='left')
93101
nulldrugs = merged.filter(pl.col('improve_drug_id').is_null())
94102
nonulls = merged.filter(pl.col('improve_drug_id').is_not_null())
103+
104+
###now update all the concentrations to be in Moles (some are in uM, all are log10)
105+
##some are provided as molecular weights ('v') or other ('s') and we can't compare
106+
molar = merged.filter(pl.col('CONCENTRATION_UNIT')=='M')
107+
95108
finaldf = pl.DataFrame(
96109
{
97-
'source':['NCI60_24' for a in nonulls['improve_drug_id']], ##2024 build
98-
'improve_sample_id':nonulls['improve_sample_id'],
99-
'Drug':nonulls['improve_drug_id'],
100-
'study':['NCI60' for a in nonulls['improve_drug_id']],
101-
'time':nonulls['time'],
102-
'time_unit':nonulls['time_unit'],
103-
'DOSE': [10**a for a in nonulls['CONCENTRATION']],
104-
'GROWTH':nonulls['AVERAGE_PTC']
110+
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
111+
'improve_sample_id':molar['improve_sample_id'],
112+
'Drug':molar['improve_drug_id'],
113+
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
114+
'time':molar['time'],
115+
'time_unit':molar['time_unit'],
116+
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
117+
'GROWTH':molar['AVERAGE_PTC']
105118
}
106119
)
107120
##write to file

build/docker/Dockerfile.mpnst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ RUN /opt/venv/bin/pip install --upgrade pip
3131

3232
# Set environment variables for reticulate
3333
ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
34-
ENV PYTHONPATH="${PYTHONPATH}:/app"
34+
ENV PYTHONPATH=/app#"${PYTHONPATH}:/app"
3535
WORKDIR /app
3636

3737
# Set MPLCONFIGDIR to a writable directory
@@ -51,4 +51,4 @@ RUN /opt/venv/bin/pip3 install -r requirements.txt
5151
RUN Rscript requirements.r
5252

5353
# Set up volume for temporary storage
54-
VOLUME ["/tmp"]
54+
VOLUME ["/tmp"]

build/mpnst/03_get_drug_response_data.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ getDrugDataByParent<-function(parid,sampleId){
8585
data <- fread(synGet(x)$path)|>
8686
filter(response_type=='percent viability')|>
8787
mutate(improve_sample_id=sampleId,
88-
DOSE=exp(dosage)/1000000, ##dosage is log(M), need to move to micromolar
88+
DOSE=(10^dosage)*1000000, ##dosage is log(M), need to move to micromolar
8989
GROWTH=response, #/100,
9090
source = "NF Data Portal",
9191
#CELL = improve_sample_id,

build/mpnst/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ directory. Currently using the test files as input.
1212
`mpnst_samples.csv` file. This pulls from the latest synapse
1313
project metadata table.
1414
```
15-
docker run -v $PWD:/tmp -e -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
15+
docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnst sh build_samples.sh /tmp/build/build_test/test_samples.csv
1616
```
1717

1818
3. Pull the data and map it to the samples. This uses the metadata

build/utils/fit_curve.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def hs_response_curve_original(x, einf, ec50, hs):
4343

4444

4545
HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
46-
HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
46+
#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
47+
HS_BOUNDS_NEG = ([0, -5,-1],[1,3,0]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
4748
def response_curve(x, einf, ec50, hs):
4849
""" transformed the original function with ec50 in -log10(M) instead of M
4950
"""
@@ -116,14 +117,14 @@ def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
116117

117118
return metrics.to_frame(name='metrics').T
118119

119-
def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
120+
121+
def compute_fit_metrics(xdata, ydata, popt, pcov, d1 = -5, d2=3):#d1=4, d2=10):
120122
'''
121-
xdata: dose data in log10(M)
122-
ydata: range from 0 to 1
123+
xdata: dose data in log10( ydata: range from 0 to 1
123124
popt: fit curve metrics
124125
pcov: ??
125-
d1: minimum fixed dose in log10(M)
126-
d2: maximum fixed dose log10(M)
126+
d1: minimum fixed dose in log10(M) ##updated to uM and made range larger
127+
d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
127128
'''
128129
if popt is None:
129130
cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
@@ -146,6 +147,7 @@ def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
146147
int10x = compute_area(xmin, ic10x, *popt)
147148
##old code - assumes a positive hill slope, otherwise doesn't seem to work.
148149
dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
150+
#this auc has fixed doses, so can be (in theory) standardized across datasets
149151
auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
150152
##added by sara, i'm not sure where the above came from
151153
## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
@@ -239,20 +241,32 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
239241

240242
def main():
241243
parser = argparse.ArgumentParser()
242-
parser.add_argument('--input', help='input file')
244+
parser.add_argument('--input', help='input file with the following columns:\
245+
DOSE: dose of drug in uM,\
246+
GROWTH: percentage of cells left,\
247+
study: name of study to group measurements by,\
248+
source: source of the data,\
249+
improve_sample_id: improve_sample_id,\
250+
Drug: improve_drug_id,\
251+
time: time at which measurement was taken,\
252+
time_unit: unit of time')
243253
parser.add_argument('--output', help='prefix of output file')
244254
parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
245-
255+
parser.add_argument('--debug',action='store_true',default=False)
256+
246257
args = parser.parse_args()
247258
print(args.input)
248259
df_all = pd.read_table(args.input)
260+
if args.debug:
261+
df_all = df_all.iloc[0:1000000]
262+
249263
#drop nas
250264
df_all = df_all.dropna()
251-
##pharmacoGX data is micromolar, we need log transformed molar
252-
df_all.DOSE = np.log10(df_all.DOSE*1000000)
265+
##pharmacoGX data is micromolar, we need log transformed data
266+
df_all.DOSE = np.log10(df_all.DOSE)
253267
##need data to be between 0 and 1, not 0 and 100
254268
df_all.GROWTH=df_all.GROWTH/100.00
255-
269+
print(df_all.head)
256270
fname = args.output or 'combined_single_response_agg'
257271
process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)
258272

0 commit comments

Comments
 (0)