Skip to content

Commit 86b2a88

Browse files
committed
update curve fit to go back to MOLAR
previous inputs required uM. We still take uM inputs, but now calculate curves and produce ec50 in -log10(M) values.
1 parent b028af9 commit 86b2a88

1 file changed

Lines changed: 281 additions & 0 deletions

File tree

build/beatAML/fit_curve.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
#! /usr/bin/env python3
2+
3+
import os
4+
os.environ['MPLCONFIGDIR'] = '/app/tmp/matplotlib'
5+
import matplotlib
6+
matplotlib.use('Agg')
7+
import matplotlib.pyplot as plt
8+
import matplotlib.font_manager
9+
10+
import sys
11+
import argparse
12+
import numpy as np
13+
import pandas as pd
14+
15+
from tqdm import tqdm
16+
from itertools import islice
17+
from sklearn.metrics import r2_score
18+
from scipy.optimize import curve_fit
19+
import multiprocessing
20+
21+
#import uno_data as ud
22+
23+
def format_coderd_schema(fname):
24+
""" formats output to comply with coderdata schema
25+
"""
26+
df = pd.read_csv(fname,delimiter='\t')
27+
##first rename Drug to improve_drug_id
28+
df2 = df.rename(columns={'Drug':'improve_drug_id'})
29+
new_df = pd.melt(df2,id_vars=['source','improve_sample_id','improve_drug_id','study','time','time_unit'],value_vars=['fit_auc','fit_ic50','fit_ec50','fit_r2','fit_ec50se','fit_einf','fit_hs','aac','auc','dss'],value_name='dose_response_value',var_name='dose_response_metric')
30+
31+
new_df.to_csv(fname,sep='\t',index=False)
32+
33+
HS_BOUNDS_ORIG = ([0, 10**-12, 0], [1, 1, 4])
34+
35+
def hs_response_curve_original(x, einf, ec50, hs):
36+
""" from PharmacoDB supp. https://doi.org/10.1093/nar/gkx911
37+
bounds:
38+
einf: [0, 1] # fraction of cells not susceptible to drug
39+
ec50: [10^-12, 1] # concentration to have half target receptors bound: [1pM, 1M]
40+
hs: [0, 4] # hill slope binding cooperativity
41+
"""
42+
return einf + (1 - einf) / (1 + np.power(x/ec50, hs))
43+
44+
45+
HS_BOUNDS = ([0, 0, 0], [1, 12, 4])
46+
#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
47+
HS_BOUNDS_NEG = ([0, -5,-1],[1,3,1]) ## made hill slope forced to be negative ##20241017 updated to shift EC50 range
48+
49+
HS_BOUNDS_M = ([0, 0, -4], [1, 12, 4])
50+
51+
def response_curve(x, einf, ec50, hs):
52+
""" transformed the original function with ec50 in -log10(M) instead of M
53+
"""
54+
return einf + (1 - einf) / (1 + 10 ** ((ec50 - x) * hs))
55+
56+
57+
def response_integral(x, einf, ec50, hs):
58+
return (1 - einf) * np.log10(1 + 10 ** ((ec50 - x) * hs)) / hs + x
59+
60+
61+
def compute_area(x1, x2, einf, ec50, hs, mode='trapz'):
62+
popt = (einf, ec50, hs)
63+
if mode == 'trapz':
64+
# trapezoidal numerical integrationcollapse
65+
xx = np.linspace(x1, x2, 100)
66+
yy = response_curve(xx, *popt)
67+
area = np.trapz(yy, xx, dx=0.01)
68+
else:
69+
# the integral function can be expressed analytically
70+
# but sometimes less accurate due to float precision issues
71+
area = response_integral(x2, *popt) - response_integral(x1, *popt)
72+
return area
73+
74+
75+
76+
'''
77+
added back this function as a spot check of data
78+
'''
79+
def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
80+
if save:
81+
font = {'family' : 'normal',
82+
# 'weight' : 'bold',
83+
'size' : 14}
84+
matplotlib.rc('font', **font)
85+
plt.figure(figsize=(12, 6))
86+
87+
print(df_exp)
88+
xdata = df_exp.DOSE.astype(float)
89+
ydata = df_exp.GROWTH.astype(float)
90+
# ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
91+
92+
# print(xdata)
93+
# print(ydata)
94+
95+
popt, pcov = response_curve_fit(xdata, ydata)
96+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
97+
98+
if popt is None:
99+
return metrics
100+
101+
dmin = dmin or xdata.min()
102+
dmax = dmax or xdata.max()
103+
xx = np.linspace(dmin, dmax, 100)
104+
yy = response_curve(xx, *popt)
105+
106+
plt.xlim(dmax, dmin)
107+
plt.ylim(0, np.max([105, np.max(yy)]))
108+
plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
109+
plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
110+
plt.xlabel('Dose (-log10(M))')
111+
plt.ylabel('Growth%')
112+
plt.title(title)
113+
plt.tight_layout()
114+
plt.legend()
115+
if save:
116+
plt.savefig('exp.png', dpi=360)
117+
plt.close()
118+
else:
119+
plt.show()
120+
121+
return metrics.to_frame(name='metrics').T
122+
123+
124+
def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10): #d1 = -5, d2=3):
125+
'''
126+
xdata: dose data in log10(
127+
ydata: range from 0 to 1
128+
popt: fit curve metrics
129+
pcov: ??
130+
d1: minimum fixed dose in log10(M) ##updated to uM and made range larger
131+
d2: maximum fixed dose log10(M) ##updated to uM and made ranger larger
132+
'''
133+
if popt is None:
134+
cols = ['fit_auc','fit_ic50','fit_ec50','fit_ec50se','fit_r2','fit_einf','fit_hs','aac','auc','dss']#'auc ic50 ec50 ec50se R2fit rinf hs aac1 auc1 dss1'.split(' ')
135+
return pd.Series([np.nan] * len(cols), index=cols)
136+
einf, ec50, hs = popt
137+
perr = np.sqrt(np.diag(pcov))
138+
ec50se = perr[1]
139+
xmin = xdata.min()
140+
xmax = xdata.max()
141+
ypred = response_curve(xdata, *popt)
142+
r2 = r2_score(ydata, ypred)
143+
auc1 = compute_area(xmin, xmax, *popt) / (xmax - xmin)
144+
aac1 = 1 - auc1
145+
ic50 = ec50 - np.log10(0.5/(0.5-einf)) / hs if einf < 0.5 else np.nan
146+
ic90 = ec50 - np.log10(0.9/(0.1-einf)) / hs if einf < 0.1 else np.nan
147+
ic10 = ec50 - np.log10(0.1/(0.9-einf)) / hs if einf < 0.9 else np.nan
148+
ic10x = min(ic10, xmax)
149+
150+
##compute area under the ic10 to subtract from total
151+
int10x = compute_area(xmin, ic10x, *popt)
152+
##old code - assumes a positive hill slope, otherwise doesn't seem to work.
153+
dss1 = (0.9 * (ic10x - xmin) - int10x) / (0.9 * (xmax - xmin)) if xmin < ic10x else 0
154+
#this auc has fixed doses, so can be (in theory) standardized across datasets
155+
auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
156+
##added by sara, i'm not sure where the above came from
157+
## orig definition from paper is here: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
158+
## here t = 0.1 and i use the fitted curve values
159+
dss1 = (auc1-0.1*(ic10x-xmin)) / (0.9 * (xmax - xmin)) if xmax > ic50 else 0
160+
dss2 = dss1/(1-einf) ##made this dss2 doesn't change much
161+
metrics = pd.Series({'fit_auc':auc, 'fit_ic50':ic50, 'fit_ec50':ec50,'fit_einf':einf,
162+
'fit_ec50se':ec50se, 'fit_r2':r2, 'einf':einf, 'fit_hs':hs,
163+
'aac':aac1, 'auc':auc1, 'dss':dss2}).round(4)
164+
return metrics
165+
166+
167+
168+
def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS_M):
169+
'''
170+
xdata: log10 molar concetnration
171+
ydata: value between 0 and 1 for response
172+
bounds: these are fixed in code, nto sure what they are for
173+
'''
174+
ydata = ydata.clip(lower=0, upper=1.0)
175+
popt, pcov = None, None
176+
nfev = 100 * 3
177+
while popt is None and nfev < 10000:
178+
# print(nfev)
179+
try:
180+
popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev)
181+
# popt, pcov = curve_fit(response_curve, xdata, ydata, bounds=bounds, max_nfev=nfev, method='dogbox')
182+
except RuntimeError:
183+
pass
184+
nfev *= 2
185+
return popt, pcov
186+
187+
188+
def process_df(df, fname, sep='\t', ngroups=None):
189+
# df = df1.copy()
190+
i = 0
191+
header = None
192+
cols = ['source', 'improve_sample_id', 'Drug', 'study']
193+
groups = df.groupby(cols)
194+
f = open(fname, 'w')
195+
for name, group in tqdm(groups):
196+
# print(name)
197+
xdata = group.DOSE.astype(float)
198+
##added the following 3 lines to acocunt for data normalized between 0 and 100 instead of 0 and 1
199+
ydata = group.GROWTH
200+
# if max(ydata)>10:
201+
# ydata = ydata/100.0
202+
ydata.clip(lower=0, upper=1.0).astype(float)
203+
popt, pcov = response_curve_fit(xdata, ydata)
204+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
205+
if header is None:
206+
header = cols + metrics.index.tolist()
207+
print(sep.join(header), file=f)
208+
print(sep.join(name), end=sep, file=f)
209+
print(sep.join([f'{x:.4g}' for x in metrics]), file=f)
210+
i += 1
211+
if ngroups and i >= ngroups:
212+
break
213+
f.close()
214+
215+
216+
def process_single_drug(name_group_tuple):
217+
name, group = name_group_tuple
218+
xdata = group.DOSE.astype(float)
219+
ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
220+
popt, pcov = response_curve_fit(xdata, ydata)
221+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
222+
return name, metrics
223+
224+
def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
225+
cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
226+
groups = df.groupby(cols)
227+
count = count or (4484081 - start)
228+
groups = islice(groups, start, start+count)
229+
cores = multiprocessing.cpu_count()
230+
poolsize = round(cores-1)
231+
print('we have '+str(cores)+' cores and '+str(poolsize)+' threads')
232+
with multiprocessing.Pool(processes=poolsize) as pool:
233+
results = pool.map(process_single_drug, groups)
234+
235+
with open(f'{fname}.{start}', 'w') as f:
236+
header = None
237+
for result in results:
238+
name, metrics = result
239+
if header is None:
240+
header = cols + metrics.index.tolist()
241+
print(sep.join(header), file=f)
242+
print(sep.join(str(n) for n in name), end=sep, file=f)
243+
print(sep.join(f'{x:.4g}' for x in metrics), file=f)
244+
245+
246+
def main():
247+
parser = argparse.ArgumentParser()
248+
parser.add_argument('--input', help='input file with the following columns:\
249+
DOSE: dose of drug in uM,\
250+
GROWTH: percentage of cells left,\
251+
study: name of study to group measurements by,\
252+
source: source of the data,\
253+
improve_sample_id: improve_sample_id,\
254+
Drug: improve_drug_id,\
255+
time: time at which measurement was taken,\
256+
time_unit: unit of time')
257+
parser.add_argument('--output', help='prefix of output file')
258+
parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
259+
parser.add_argument('--debug',action='store_true',default=False)
260+
261+
args = parser.parse_args()
262+
print(args.input)
263+
df_all = pd.read_table(args.input)
264+
if args.debug:
265+
df_all = df_all.iloc[0:1000000]
266+
267+
#drop nas
268+
df_all = df_all.dropna()
269+
##pharmacoGX data is micromolar, we need log transformed data
270+
df_all.DOSE = -1.0 * np.log10(df_all.DOSE/1000000.0)
271+
##need data to be between 0 and 1, not 0 and 100
272+
df_all.GROWTH=df_all.GROWTH/100.00
273+
print(df_all.head)
274+
fname = args.output or 'combined_single_response_agg'
275+
process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)
276+
277+
# if args.beataml == False:
278+
format_coderd_schema(fname+'.0')
279+
280+
if __name__ == '__main__':
281+
main()

0 commit comments

Comments
 (0)