Skip to content

Commit c171f0d

Browse files
Merge branch 'main' of https://github.com/PNNL-CompBio/coderdata into novartisPDX-omics
2 parents f31a8a3 + 618b21b commit c171f0d

53 files changed

Lines changed: 14049 additions & 8934 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

LICENSE_DISCLAIMER renamed to DISCLAIMER

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,3 @@
1-
Copyright Battelle Memorial Institute 2025
2-
3-
Redistribution and use in source and binary forms, with or without
4-
modification, are permitted provided that the following conditions are met:
5-
6-
1. Redistributions of source code must retain the above copyright notice, this
7-
list of conditions and the following disclaimer.
8-
9-
2. Redistributions in binary form must reproduce the above copyright notice,
10-
this list of conditions and the following disclaimer in the documentation
11-
and/or other materials provided with the distribution.
12-
13-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14-
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15-
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16-
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
17-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23-
24-
------
251

262
This material was prepared as an account of work sponsored by an agency of the
273
United States Government. Neither the United States Government nor the United

LICENSE

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Copyright Battelle Memorial Institute 2025
2+
3+
Redistribution and use in source and binary forms, with or without
4+
modification, are permitted provided that the following conditions are met:
5+
6+
1. Redistributions of source code must retain the above copyright notice, this
7+
list of conditions and the following disclaimer.
8+
9+
2. Redistributions in binary form must reproduce the above copyright notice,
10+
this list of conditions and the following disclaimer in the documentation
11+
and/or other materials provided with the distribution.
12+
13+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
17+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23+

build/beatAML/GetBeatAML.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,9 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
424424
mapped_df.rename(columns={"hgvsc": "mutation"}, inplace=True)
425425
mapped_df.rename(columns={"labId": "sample_id"}, inplace=True)
426426
mapped_df.rename(columns={"Entrez_Gene_Id": "entrez_id"}, inplace=True)
427+
428+
#remove (gene) information preceeding the colon - this formats it like other datasets.
429+
mapped_df["mutation"] = mapped_df["mutation"].astype(str).str.split(":", n=1).str[-1]
427430

428431
variant_mapping = {
429432
'frameshift_variant': 'Frameshift_Variant',
@@ -662,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path):
662665
print(improve_map_file)
663666
t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
664667
t_df = t_df[t_df.entrez_id.notna()]
668+
t_df = t_df[t_df.entrez_id != 0]
665669
t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
666670
t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')
667671

@@ -673,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path):
673677
p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
674678
p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map)
675679
p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
680+
p_df = p_df[p_df.entrez_id != 0]
676681
p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip')
677682

678683
# New Mutation Data
679684
print("Starting Mutation Data")
680685
m_df = pd.read_csv(mutations_file, sep = '\t')
681-
682686
m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file)
683687
m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]]
688+
m_df = m_df[m_df.entrez_id != 0]
684689
m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip')
685690

686691
if args.exp:

build/beatAML/fit_curve.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
#! /usr/bin/env python3
2+
3+
import os
4+
os.environ['MPLCONFIGDIR'] = '/app/tmp/matplotlib'
5+
import matplotlib
6+
matplotlib.use('Agg')
7+
import matplotlib.pyplot as plt
8+
import matplotlib.font_manager
9+
10+
import sys
11+
import argparse
12+
import numpy as np
13+
import pandas as pd
14+
15+
from tqdm import tqdm
16+
from itertools import islice
17+
from sklearn.metrics import r2_score
18+
from scipy.optimize import curve_fit
19+
import multiprocessing
20+
21+
#import uno_data as ud
22+
23+
def format_coderd_schema(fname):
24+
""" formats output to comply with coderdata schema
25+
"""
26+
df = pd.read_csv(fname,delimiter='\t')
27+
##first rename Drug to improve_drug_id
28+
df2 = df.rename(columns={'Drug':'improve_drug_id'})
29+
new_df = pd.melt(df2,id_vars=['source','improve_sample_id','improve_drug_id','study','time','time_unit'],value_vars=['fit_auc','fit_ic50','fit_ec50','fit_r2','fit_ec50se','fit_einf','fit_hs','aac','auc','dss'],value_name='dose_response_value',var_name='dose_response_metric')
30+
31+
new_df.to_csv(fname,sep='\t',index=False)
32+
33+
HS_BOUNDS_ORIG = ([0, 10**-12, 0], [1, 1, 4])
34+
35+
def hs_response_curve_original(x, einf, ec50, hs):
36+
""" from PharmacoDB supp. https://doi.org/10.1093/nar/gkx911
37+
bounds:
38+
einf: [0, 1] # fraction of cells not susceptible to drug
39+
ec50: [10^-12, 1] # concentration to have half target receptors bound: [1pM, 1M]
40+
hs: [0, 4] # hill slope binding cooperativity
41+
"""
42+
return einf + (1 - einf) / (1 + np.power(x/ec50, hs))
43+
44+
45+
HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
46+
#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
47+
HS_BOUNDS_NEG = ([0, -5,-1],[1,3,1]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
48+
49+
HS_BOUNDS_M = ([0, 0, -4], [1, 12, 4])
50+
51+
def response_curve(x, einf, ec50, hs):
52+
""" transformed the original function with ec50 in -log10(M) instead of M
53+
"""
54+
return einf + (1 - einf) / (1 + 10 ** ((ec50 - x) * hs))
55+
56+
57+
def response_integral(x, einf, ec50, hs):
58+
return (1 - einf) * np.log10(1 + 10 ** ((ec50 - x) * hs)) / hs + x
59+
60+
61+
def compute_area(x1, x2, einf, ec50, hs, mode='trapz'):
62+
popt = (einf, ec50, hs)
63+
if mode == 'trapz':
64+
# trapezoidal numerical integrationcollapse
65+
xx = np.linspace(x1, x2, 100)
66+
yy = response_curve(xx, *popt)
67+
area = np.trapz(yy, xx, dx=0.01)
68+
else:
69+
# the integral function can be expressed analytically
70+
# but sometimes less accurate due to float precision issues
71+
area = response_integral(x2, *popt) - response_integral(x1, *popt)
72+
return area
73+
74+
75+
76+
'''
77+
added back this function as a spot check of data
78+
'''
79+
def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
80+
if save:
81+
font = {'family' : 'normal',
82+
# 'weight' : 'bold',
83+
'size' : 14}
84+
matplotlib.rc('font', **font)
85+
plt.figure(figsize=(12, 6))
86+
87+
print(df_exp)
88+
xdata = df_exp.DOSE.astype(float)
89+
ydata = df_exp.GROWTH.astype(float)
90+
# ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
91+
92+
# print(xdata)
93+
# print(ydata)
94+
95+
popt, pcov = response_curve_fit(xdata, ydata)
96+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
97+
98+
if popt is None:
99+
return metrics
100+
101+
dmin = dmin or xdata.min()
102+
dmax = dmax or xdata.max()
103+
xx = np.linspace(dmin, dmax, 100)
104+
yy = response_curve(xx, *popt)
105+
106+
plt.xlim(dmax, dmin)
107+
plt.ylim(0, np.max([105, np.max(yy)]))
108+
plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
109+
plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
110+
plt.xlabel('Dose (-log10(M))')
111+
plt.ylabel('Growth%')
112+
plt.title(title)
113+
plt.tight_layout()
114+
plt.legend()
115+
if save:
116+
plt.savefig('exp.png', dpi=360)
117+
plt.close()
118+
else:
119+
plt.show()
120+
121+
return metrics.to_frame(name='metrics').T
122+
123+
124+
def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10): #d1 = -5, d2=3):
125+
'''
126+
xdata: dose data in log10(
127+
ydata: range from 0 to 1
128+
popt: fit curve metrics
129+
pcov: ??
130+
d1: minimum fixed dose in log10(M) ##updated to uM and made range larger
131+
d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
132+
'''
133+
if popt is None:
134+
cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
135+
return pd.Series([np.nan] * len(cols), index=cols)
136+
einf, ec50, hs = popt
137+
perr = np.sqrt(np.diag(pcov))
138+
ec50se = perr[1]
139+
xmin = xdata.min()
140+
xmax = xdata.max()
141+
ypred = response_curve(xdata, *popt)
142+
r2 = r2_score(ydata, ypred)
143+
auc1 = compute_area(xmin, xmax, *popt) / (xmax - xmin)
144+
aac1 = 1 - auc1
145+
ic50 = ec50 - np.log10(0.5/(0.5-einf)) / hs if einf < 0.5 else np.nan
146+
ic90 = ec50 - np.log10(0.9/(0.1-einf)) / hs if einf < 0.1 else np.nan
147+
ic10 = ec50 - np.log10(0.1/(0.9-einf)) / hs if einf < 0.9 else np.nan
148+
ic10x = min(ic10, xmax)
149+
150+
##compute area under the ic10 to subtract from total
151+
int10x = compute_area(xmin, ic10x, *popt)
152+
##old code - assumes a positive hill slope, otherwise doesn't seem to work.
153+
dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
154+
#this auc has fixed doses, so can be (in theory) standardized across datasets
155+
auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
156+
##added by sara, i'm not sure where the above came from
157+
## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
158+
## here t = 0.1 and i use the fitted curve values
159+
dss1 = (auc1-0.1*(ic10x-xmin)) / (0.9 * (xmax - xmin)) if xmax > ic50 else 0
160+
dss2 = dss1/(1-einf) ##made this dss2 doesn't change much
161+
metrics = pd.Series({'fit_auc':auc, 'fit_ic50':ic50, 'fit_ec50':ec50,'fit_einf':einf,
162+
'fit_ec50se':ec50se, 'fit_r2':r2, 'einf':einf, 'fit_hs':hs,
163+
'aac':aac1, 'auc':auc1, 'dss':dss2}).round(4)
164+
return metrics
165+
166+
167+
168+
def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS_M):
169+
'''
170+
xdata: log10 molar concetnration
171+
ydata: value between 0 and 1 for response
172+
bounds: these are fixed in code, nto sure what they are for
173+
'''
174+
ydata = ydata.clip(lower=0, upper=1.0)
175+
popt, pcov = None, None
176+
nfev = 100 * 3
177+
while popt is None and nfev < 10000:
178+
# print(nfev)
179+
try:
180+
popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev)
181+
# popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev, method='dogbox')
182+
except RuntimeError:
183+
pass
184+
nfev *= 2
185+
return popt, pcov
186+
187+
188+
def process_df(df, fname, sep='\t', ngroups=None):
189+
# df = df1.copy()
190+
i = 0
191+
header = None
192+
cols = ['source', 'improve_sample_id', 'Drug', 'study']
193+
groups = df.groupby(cols)
194+
f = open(fname, 'w')
195+
for name, group in tqdm(groups):
196+
# print(name)
197+
xdata = group.DOSE.astype(float)
198+
##added the following 3 lines to acocunt for data normalized between 0 and 100 instead of 0 and 1
199+
ydata = group.GROWTH
200+
# if max(ydata)>10:
201+
# ydata = ydata/100.0
202+
ydata.clip(lower=0, upper=1.0).astype(float)
203+
popt, pcov = response_curve_fit(xdata, ydata)
204+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
205+
if header is None:
206+
header = cols + metrics.index.tolist()
207+
print(sep.join(header), file=f)
208+
print(sep.join(name), end=sep, file=f)
209+
print(sep.join([f'{x:.4g}' for x in metrics]), file=f)
210+
i += 1
211+
if ngroups and i >= ngroups:
212+
break
213+
f.close()
214+
215+
216+
def process_single_drug(name_group_tuple):
217+
name, group = name_group_tuple
218+
xdata = group.DOSE.astype(float)
219+
ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
220+
popt, pcov = response_curve_fit(xdata, ydata)
221+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
222+
return name, metrics
223+
224+
def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
225+
cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
226+
groups = df.groupby(cols)
227+
count = count or (4484081 - start)
228+
groups = islice(groups, start, start+count)
229+
cores = multiprocessing.cpu_count()
230+
poolsize = round(cores-1)
231+
print('we have '+str(cores)+' cores and '+str(poolsize)+' threads')
232+
with multiprocessing.Pool(processes=poolsize) as pool:
233+
results = pool.map(process_single_drug, groups)
234+
235+
with open(f'{fname}.{start}', 'w') as f:
236+
header = None
237+
for result in results:
238+
name, metrics = result
239+
if header is None:
240+
header = cols + metrics.index.tolist()
241+
print(sep.join(header), file=f)
242+
print(sep.join(str(n) for n in name), end=sep, file=f)
243+
print(sep.join(f'{x:.4g}' for x in metrics), file=f)
244+
245+
246+
def main():
247+
parser = argparse.ArgumentParser()
248+
parser.add_argument('--input', help='input file with the following columns:\
249+
DOSE: dose of drug in uM,\
250+
GROWTH: percentage of cells left,\
251+
study: name of study to group measurements by,\
252+
source: source of the data,\
253+
improve_sample_id: improve_sample_id,\
254+
Drug: improve_drug_id,\
255+
time: time at which measurement was taken,\
256+
time_unit: unit of time')
257+
parser.add_argument('--output', help='prefix of output file')
258+
parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
259+
parser.add_argument('--debug',action='store_true',default=False)
260+
261+
args = parser.parse_args()
262+
print(args.input)
263+
df_all = pd.read_table(args.input)
264+
if args.debug:
265+
df_all = df_all.iloc[0:1000000]
266+
267+
#drop nas
268+
df_all = df_all.dropna()
269+
##pharmacoGX data is micromolar, we need log transformed data
270+
df_all.DOSE = -1.0 * np.log10(df_all.DOSE/1000000.0)
271+
##need data to be between 0 and 1, not 0 and 100
272+
df_all.GROWTH=df_all.GROWTH/100.00
273+
print(df_all.head)
274+
fname = args.output or 'combined_single_response_agg'
275+
process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)
276+
277+
# if args.beataml == False:
278+
format_coderd_schema(fname+'.0')
279+
280+
if __name__ == '__main__':
281+
main()

0 commit comments

Comments
 (0)