Skip to content

Commit 781891a

Browse files
authored
Merge pull request #403 from PNNL-CompBio/updated-curve
update curve fit to go back to MOLAR
2 parents b028af9 + 86b2a88 commit 781891a

1 file changed

Lines changed: 281 additions & 0 deletions

File tree

build/beatAML/fit_curve.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
#! /usr/bin/env python3
2+
3+
import os
4+
os.environ['MPLCONFIGDIR'] = '/app/tmp/matplotlib'
5+
import matplotlib
6+
matplotlib.use('Agg')
7+
import matplotlib.pyplot as plt
8+
import matplotlib.font_manager
9+
10+
import sys
11+
import argparse
12+
import numpy as np
13+
import pandas as pd
14+
15+
from tqdm import tqdm
16+
from itertools import islice
17+
from sklearn.metrics import r2_score
18+
from scipy.optimize import curve_fit
19+
import multiprocessing
20+
21+
#import uno_data as ud
22+
23+
def format_coderd_schema(fname):
24+
""" formats output to comply with coderdata schema
25+
"""
26+
df = pd.read_csv(fname,delimiter='\t')
27+
##first rename Drug to improve_drug_id
28+
df2 = df.rename(columns={'Drug':'improve_drug_id'})
29+
new_df = pd.melt(df2,id_vars=['source','improve_sample_id','improve_drug_id','study','time','time_unit'],value_vars=['fit_auc','fit_ic50','fit_ec50','fit_r2','fit_ec50se','fit_einf','fit_hs','aac','auc','dss'],value_name='dose_response_value',var_name='dose_response_metric')
30+
31+
new_df.to_csv(fname,sep='\t',index=False)
32+
33+
HS_BOUNDS_ORIG = ([0, 10**-12, 0], [1, 1, 4])
34+
35+
def hs_response_curve_original(x, einf, ec50, hs):
36+
""" from PharmacoDB supp. https://doi.org/10.1093/nar/gkx911
37+
bounds:
38+
einf: [0, 1] # fraction of cells not susceptible to drug
39+
ec50: [10^-12, 1] # concentration to have half target receptors bound: [1pM, 1M]
40+
hs: [0, 4] # hill slope binding cooperativity
41+
"""
42+
return einf + (1 - einf) / (1 + np.power(x/ec50, hs))
43+
44+
45+
HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
46+
#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
47+
HS_BOUNDS_NEG = ([0, -5,-1],[1,3,1]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
48+
49+
HS_BOUNDS_M = ([0, 0, -4], [1, 12, 4])
50+
51+
def response_curve(x, einf, ec50, hs):
52+
""" transformed the original function with ec50 in -log10(M) instead of M
53+
"""
54+
return einf + (1 - einf) / (1 + 10 ** ((ec50 - x) * hs))
55+
56+
57+
def response_integral(x, einf, ec50, hs):
58+
return (1 - einf) * np.log10(1 + 10 ** ((ec50 - x) * hs)) / hs + x
59+
60+
61+
def compute_area(x1, x2, einf, ec50, hs, mode='trapz'):
62+
popt = (einf, ec50, hs)
63+
if mode == 'trapz':
64+
# trapezoidal numerical integrationcollapse
65+
xx = np.linspace(x1, x2, 100)
66+
yy = response_curve(xx, *popt)
67+
area = np.trapz(yy, xx, dx=0.01)
68+
else:
69+
# the integral function can be expressed analytically
70+
# but sometimes less accurate due to float precision issues
71+
area = response_integral(x2, *popt) - response_integral(x1, *popt)
72+
return area
73+
74+
75+
76+
'''
77+
added back this function as a spot check of data
78+
'''
79+
def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
80+
if save:
81+
font = {'family' : 'normal',
82+
# 'weight' : 'bold',
83+
'size' : 14}
84+
matplotlib.rc('font', **font)
85+
plt.figure(figsize=(12, 6))
86+
87+
print(df_exp)
88+
xdata = df_exp.DOSE.astype(float)
89+
ydata = df_exp.GROWTH.astype(float)
90+
# ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
91+
92+
# print(xdata)
93+
# print(ydata)
94+
95+
popt, pcov = response_curve_fit(xdata, ydata)
96+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
97+
98+
if popt is None:
99+
return metrics
100+
101+
dmin = dmin or xdata.min()
102+
dmax = dmax or xdata.max()
103+
xx = np.linspace(dmin, dmax, 100)
104+
yy = response_curve(xx, *popt)
105+
106+
plt.xlim(dmax, dmin)
107+
plt.ylim(0, np.max([105, np.max(yy)]))
108+
plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
109+
plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
110+
plt.xlabel('Dose (-log10(M))')
111+
plt.ylabel('Growth%')
112+
plt.title(title)
113+
plt.tight_layout()
114+
plt.legend()
115+
if save:
116+
plt.savefig('exp.png', dpi=360)
117+
plt.close()
118+
else:
119+
plt.show()
120+
121+
return metrics.to_frame(name='metrics').T
122+
123+
124+
def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10): #d1 = -5, d2=3):
125+
'''
126+
xdata: dose data in log10(
127+
ydata: range from 0 to 1
128+
popt: fit curve metrics
129+
pcov: ??
130+
d1: minimum fixed dose in log10(M) ##updated to uM and made range larger
131+
d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
132+
'''
133+
if popt is None:
134+
cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
135+
return pd.Series([np.nan] * len(cols), index=cols)
136+
einf, ec50, hs = popt
137+
perr = np.sqrt(np.diag(pcov))
138+
ec50se = perr[1]
139+
xmin = xdata.min()
140+
xmax = xdata.max()
141+
ypred = response_curve(xdata, *popt)
142+
r2 = r2_score(ydata, ypred)
143+
auc1 = compute_area(xmin, xmax, *popt) / (xmax - xmin)
144+
aac1 = 1 - auc1
145+
ic50 = ec50 - np.log10(0.5/(0.5-einf)) / hs if einf < 0.5 else np.nan
146+
ic90 = ec50 - np.log10(0.9/(0.1-einf)) / hs if einf < 0.1 else np.nan
147+
ic10 = ec50 - np.log10(0.1/(0.9-einf)) / hs if einf < 0.9 else np.nan
148+
ic10x = min(ic10, xmax)
149+
150+
##compute area under the ic10 to subtract from total
151+
int10x = compute_area(xmin, ic10x, *popt)
152+
##old code - assumes a positive hill slope, otherwise doesn't seem to work.
153+
dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
154+
#this auc has fixed doses, so can be (in theory) standardized across datasets
155+
auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
156+
##added by sara, i'm not sure where the above came from
157+
## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
158+
## here t = 0.1 and i use the fitted curve values
159+
dss1 = (auc1-0.1*(ic10x-xmin)) / (0.9 * (xmax - xmin)) if xmax > ic50 else 0
160+
dss2 = dss1/(1-einf) ##made this dss2 doesn't change much
161+
metrics = pd.Series({'fit_auc':auc, 'fit_ic50':ic50, 'fit_ec50':ec50,'fit_einf':einf,
162+
'fit_ec50se':ec50se, 'fit_r2':r2, 'einf':einf, 'fit_hs':hs,
163+
'aac':aac1, 'auc':auc1, 'dss':dss2}).round(4)
164+
return metrics
165+
166+
167+
168+
def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS_M):
169+
'''
170+
xdata: log10 molar concetnration
171+
ydata: value between 0 and 1 for response
172+
bounds: these are fixed in code, nto sure what they are for
173+
'''
174+
ydata = ydata.clip(lower=0, upper=1.0)
175+
popt, pcov = None, None
176+
nfev = 100 * 3
177+
while popt is None and nfev < 10000:
178+
# print(nfev)
179+
try:
180+
popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev)
181+
# popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev, method='dogbox')
182+
except RuntimeError:
183+
pass
184+
nfev *= 2
185+
return popt, pcov
186+
187+
188+
def process_df(df, fname, sep='\t', ngroups=None):
189+
# df = df1.copy()
190+
i = 0
191+
header = None
192+
cols = ['source', 'improve_sample_id', 'Drug', 'study']
193+
groups = df.groupby(cols)
194+
f = open(fname, 'w')
195+
for name, group in tqdm(groups):
196+
# print(name)
197+
xdata = group.DOSE.astype(float)
198+
##added the following 3 lines to acocunt for data normalized between 0 and 100 instead of 0 and 1
199+
ydata = group.GROWTH
200+
# if max(ydata)>10:
201+
# ydata = ydata/100.0
202+
ydata.clip(lower=0, upper=1.0).astype(float)
203+
popt, pcov = response_curve_fit(xdata, ydata)
204+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
205+
if header is None:
206+
header = cols + metrics.index.tolist()
207+
print(sep.join(header), file=f)
208+
print(sep.join(name), end=sep, file=f)
209+
print(sep.join([f'{x:.4g}' for x in metrics]), file=f)
210+
i += 1
211+
if ngroups and i >= ngroups:
212+
break
213+
f.close()
214+
215+
216+
def process_single_drug(name_group_tuple):
217+
name, group = name_group_tuple
218+
xdata = group.DOSE.astype(float)
219+
ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
220+
popt, pcov = response_curve_fit(xdata, ydata)
221+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
222+
return name, metrics
223+
224+
def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
225+
cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
226+
groups = df.groupby(cols)
227+
count = count or (4484081 - start)
228+
groups = islice(groups, start, start+count)
229+
cores = multiprocessing.cpu_count()
230+
poolsize = round(cores-1)
231+
print('we have '+str(cores)+' cores and '+str(poolsize)+' threads')
232+
with multiprocessing.Pool(processes=poolsize) as pool:
233+
results = pool.map(process_single_drug, groups)
234+
235+
with open(f'{fname}.{start}', 'w') as f:
236+
header = None
237+
for result in results:
238+
name, metrics = result
239+
if header is None:
240+
header = cols + metrics.index.tolist()
241+
print(sep.join(header), file=f)
242+
print(sep.join(str(n) for n in name), end=sep, file=f)
243+
print(sep.join(f'{x:.4g}' for x in metrics), file=f)
244+
245+
246+
def main():
247+
parser = argparse.ArgumentParser()
248+
parser.add_argument('--input', help='input file with the following columns:\
249+
DOSE: dose of drug in uM,\
250+
GROWTH: percentage of cells left,\
251+
study: name of study to group measurements by,\
252+
source: source of the data,\
253+
improve_sample_id: improve_sample_id,\
254+
Drug: improve_drug_id,\
255+
time: time at which measurement was taken,\
256+
time_unit: unit of time')
257+
parser.add_argument('--output', help='prefix of output file')
258+
parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
259+
parser.add_argument('--debug',action='store_true',default=False)
260+
261+
args = parser.parse_args()
262+
print(args.input)
263+
df_all = pd.read_table(args.input)
264+
if args.debug:
265+
df_all = df_all.iloc[0:1000000]
266+
267+
#drop nas
268+
df_all = df_all.dropna()
269+
##pharmacoGX data is micromolar, we need log transformed data
270+
df_all.DOSE = -1.0 * np.log10(df_all.DOSE/1000000.0)
271+
##need data to be between 0 and 1, not 0 and 100
272+
df_all.GROWTH=df_all.GROWTH/100.00
273+
print(df_all.head)
274+
fname = args.output or 'combined_single_response_agg'
275+
process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)
276+
277+
# if args.beataml == False:
278+
format_coderd_schema(fname+'.0')
279+
280+
if __name__ == '__main__':
281+
main()

0 commit comments

Comments
 (0)