PNNL-CompBio
diff --git a/‎LICENSE_DISCLAIMER‎ ‎DISCLAIMER‎LICENSE_DISCLAIMER renamed to DISCLAIMER
Lines changed: 0 additions & 24 deletions b/‎LICENSE_DISCLAIMER‎ ‎DISCLAIMER‎LICENSE_DISCLAIMER renamed to DISCLAIMER
Lines changed: 0 additions & 24 deletions
diff --git a/‎LICENSE‎
Lines changed: 23 additions & 0 deletions b/‎LICENSE‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎build/beatAML/GetBeatAML.py‎
Lines changed: 6 additions & 1 deletion b/‎build/beatAML/GetBeatAML.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎build/beatAML/fit_curve.py‎
Lines changed: 281 additions & 0 deletions b/‎build/beatAML/fit_curve.py‎
Lines changed: 281 additions & 0 deletions
@@ -1,27 +1,3 @@
-Copyright Battelle Memorial Institute 2025
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-------
 
 This material was prepared as an account of work sponsored by an agency of the
 United States Government.  Neither the United States Government nor the United
 
@@ -0,0 +1,23 @@
+Copyright Battelle Memorial Institute 2025
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
@@ -424,6 +424,9 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
         mapped_df.rename(columns={"hgvsc": "mutation"}, inplace=True)
         mapped_df.rename(columns={"labId": "sample_id"}, inplace=True)
         mapped_df.rename(columns={"Entrez_Gene_Id": "entrez_id"}, inplace=True)
+        
+        # remove the (gene) information preceding the colon - this formats it like other datasets.
+        mapped_df["mutation"] = mapped_df["mutation"].astype(str).str.split(":", n=1).str[-1]
 
         variant_mapping = {
             'frameshift_variant': 'Frameshift_Variant',
@@ -662,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path):
             print(improve_map_file)
             t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
             t_df = t_df[t_df.entrez_id.notna()]
+            t_df = t_df[t_df.entrez_id != 0]
             t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
             t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')
 
@@ -673,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path):
             p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
             p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map)
             p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
+            p_df = p_df[p_df.entrez_id != 0]
             p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip')
 
             # New Mutation Data
             print("Starting Mutation Data")
             m_df = pd.read_csv(mutations_file, sep = '\t')
-            
             m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file)
             m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]]
+            m_df = m_df[m_df.entrez_id != 0]
             m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip')
 
     if args.exp:
 
@@ -0,0 +1,281 @@
+#! /usr/bin/env python3
+
+import os
+os.environ['MPLCONFIGDIR'] = '/app/tmp/matplotlib'
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.font_manager
+
+import sys
+import argparse
+import numpy as np
+import pandas as pd
+
+from tqdm import tqdm
+from itertools import islice
+from sklearn.metrics import r2_score
+from scipy.optimize import curve_fit
+import multiprocessing
+
+#import uno_data as ud
+
+def format_coderd_schema(fname):
+    """Rewrite a fit-metrics table in the long coderdata layout, in place.
+
+    Reads the tab-separated file ``fname``, renames the ``Drug`` column to
+    ``improve_drug_id``, melts the metric columns into
+    ``dose_response_metric`` / ``dose_response_value`` pairs and writes the
+    result back to ``fname`` (tab-separated, without the index).
+    """
+    key_columns = ['source', 'improve_sample_id', 'improve_drug_id',
+                   'study', 'time', 'time_unit']
+    metric_columns = ['fit_auc', 'fit_ic50', 'fit_ec50', 'fit_r2',
+                      'fit_ec50se', 'fit_einf', 'fit_hs', 'aac', 'auc', 'dss']
+
+    # the fit output calls the drug column 'Drug'; the schema wants improve_drug_id
+    wide = pd.read_csv(fname, sep='\t').rename(columns={'Drug': 'improve_drug_id'})
+    long_form = wide.melt(id_vars=key_columns,
+                          value_vars=metric_columns,
+                          var_name='dose_response_metric',
+                          value_name='dose_response_value')
+
+    long_form.to_csv(fname, sep='\t', index=False)
+
+# (lower, upper) limits for (einf, ec50, hs) matching the bounds listed in
+# hs_response_curve_original's docstring
+HS_BOUNDS_ORIG = ([0, 10**-12, 0], [1, 1, 4])
+
+def hs_response_curve_original(x, einf, ec50, hs):
+    """Hill-slope response curve with the dose and ec50 on a linear scale.
+
+    From the PharmacoDB supplement, https://doi.org/10.1093/nar/gkx911
+    Parameter bounds:
+      einf: [0, 1]       # fraction of cells not susceptible to drug
+      ec50: [10^-12, 1]  # concentration to have half target receptors bound: [1pM, 1M]
+      hs:   [0, 4]       # hill slope binding cooperativity
+    """
+    dose_ratio = np.power(x/ec50, hs)
+    return einf + (1 - einf) / (1 + dose_ratio)
+
+
+# Parameter bounds written as (lower, upper). HS_BOUNDS_M is handed to
+# curve_fit together with response_curve, so each list is ordered
+# (einf, ec50, hs); the other two constants presumably share that layout.
+HS_BOUNDS = ([0, 0, 0], [1, 12, 4]) 
+#HS_BOUNDS_NEG = ([0, -3,-1],[1,8,0]) ## made hill slope forced to be negative
+# NOTE(review): the comment above says the hill slope is forced negative, but
+# the current upper hs bound is 1, so positive slopes are allowed - confirm intent.
+HS_BOUNDS_NEG = ([0, -5,-1],[1,3,1]) ## 20241017 updated to shift EC50 range
+
+# Default bounds used by response_curve_fit; hs may take either sign.
+HS_BOUNDS_M = ([0, 0, -4], [1, 12, 4])
+
+def response_curve(x, einf, ec50, hs):
+    """Hill-slope response curve with ec50 given in -log10(M) instead of M.
+
+    This is the original hill-slope function transformed to that scale:
+    the value is 1 - (1 - einf)/2 at x == ec50 and tends to einf or 1 as
+    (ec50 - x) * hs goes to -inf or +inf respectively.
+    """
+    exponent = (ec50 - x) * hs
+    return einf + (1 - einf) / (1 + 10 ** exponent)
+
+
+def response_integral(x, einf, ec50, hs):
+    """Antiderivative of response_curve with respect to x, evaluated at x.
+
+    compute_area and compute_fit_metrics take differences of this value at
+    two doses to get the area under the fitted curve analytically.
+    """
+    log_term = np.log10(1 + 10 ** ((ec50 - x) * hs))
+    return (1 - einf) * log_term / hs + x
+
+
+def compute_area(x1, x2, einf, ec50, hs, mode='trapz'):
+    """Area under response_curve between the doses x1 and x2.
+
+    x1, x2: integration limits, on the same scale as response_curve's x
+    einf, ec50, hs: fitted curve parameters
+    mode: 'trapz' (default) integrates numerically over 100 evenly spaced
+          points; any other value uses the closed-form response_integral
+    Returns the (signed) area.
+    """
+    popt = (einf, ec50, hs)
+    if mode == 'trapz':
+        # trapezoidal numerical integration
+        xx = np.linspace(x1, x2, 100)
+        yy = response_curve(xx, *popt)
+        # NumPy 2.0 renamed trapz to trapezoid; fall back to the old name on
+        # older releases. The sample points are passed explicitly, so no dx
+        # argument is given (dx is only used when x is omitted).
+        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
+        area = trapezoid(yy, xx)
+    else:
+        # the integral function can be expressed analytically
+        # but sometimes less accurate due to float precision issues
+        area = response_integral(x2, *popt) - response_integral(x1, *popt)
+    return area
+
+
+
+'''
+added back this function as a spot check of data
+'''
+def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
+    """Fit and plot a single dose-response experiment (spot check of the data).
+
+    df_exp: data frame with DOSE and GROWTH columns; GROWTH is multiplied
+            by 100 for plotting, so it is expected as a 0-1 fraction
+    title:  plot title
+    dmin, dmax: dose range over which the fitted curve is drawn; each
+            defaults to the observed minimum / maximum dose
+    save:   if True the figure is written to 'exp.png', otherwise shown
+
+    Returns the output of compute_fit_metrics: the bare Series when the fit
+    failed, otherwise a one-row data frame indexed by 'metrics'.
+    """
+    if save:
+        # NOTE(review): 'normal' does not look like a matplotlib font family
+        # name; presumably 'sans-serif' was meant - confirm before changing.
+        font = {'family' : 'normal',
+                # 'weight' : 'bold',
+                'size'   : 14}
+        matplotlib.rc('font', **font)
+        plt.figure(figsize=(12, 6))
+
+    print(df_exp)
+    xdata = df_exp.DOSE.astype(float)
+    ydata = df_exp.GROWTH.astype(float)
+
+    popt, pcov = response_curve_fit(xdata, ydata)
+    metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
+
+    if popt is None:
+        return metrics
+
+    # compare against None so that an explicit limit of 0 is honoured
+    dmin = xdata.min() if dmin is None else dmin
+    dmax = xdata.max() if dmax is None else dmax
+    xx = np.linspace(dmin, dmax, 100)
+    yy = response_curve(xx, *popt)
+
+    # x axis is reversed: doses are in -log10(M), so larger means more dilute
+    plt.xlim(dmax, dmin)
+    # the curve is plotted in percent, so the limit must use the same scale
+    plt.ylim(0, np.max([105, np.max(yy*100)]))
+    plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
+    plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
+    plt.xlabel('Dose (-log10(M))')
+    plt.ylabel('Growth%')
+    plt.title(title)
+    plt.tight_layout()
+    plt.legend()
+    if save:
+        plt.savefig('exp.png', dpi=360)
+        plt.close()
+    else:
+        plt.show()
+
+    return metrics.to_frame(name='metrics').T
+
+
+def compute_fit_metrics(xdata, ydata, popt, pcov, d1=4, d2=10):
+    '''
+    Summarise one fitted dose-response curve.
+
+    xdata: doses in -log10(M) (the scale response_curve is written for)
+    ydata: responses, range from 0 to 1
+    popt: fitted (einf, ec50, hs), or None when the fit failed
+    pcov: covariance matrix returned by curve_fit; the square root of its
+          diagonal is used as the parameter standard errors
+    d1: lower fixed dose, in -log10(M), for the standardised fit_auc
+    d2: upper fixed dose, in -log10(M), for the standardised fit_auc
+
+    Returns a pandas Series that always carries the same labels in the same
+    order, so rows written under a header taken from the first group stay
+    aligned. Every value is NaN when popt is None; otherwise values are
+    rounded to 4 decimals.
+    '''
+    cols = ['fit_auc', 'fit_ic50', 'fit_ec50', 'fit_einf', 'fit_ec50se',
+            'fit_r2', 'einf', 'fit_hs', 'aac', 'auc', 'dss']
+    if popt is None:
+        return pd.Series([np.nan] * len(cols), index=cols)
+    einf, ec50, hs = popt
+    perr = np.sqrt(np.diag(pcov))
+    ec50se = perr[1]
+    xmin = xdata.min()
+    xmax = xdata.max()
+    ypred = response_curve(xdata, *popt)
+    r2 = r2_score(ydata, ypred)
+    # mean height of the fitted curve over the observed dose range
+    auc1 = compute_area(xmin, xmax, *popt) / (xmax - xmin)
+    aac1 = 1 - auc1
+    # doses at which the fitted curve crosses 0.5 and 0.9; undefined (NaN)
+    # when the curve's floor einf is not below that level
+    ic50 = ec50 - np.log10(0.5/(0.5-einf)) / hs if einf < 0.5 else np.nan
+    ic10 = ec50 - np.log10(0.1/(0.9-einf)) / hs if einf < 0.9 else np.nan
+    ic10x = min(ic10, xmax)
+
+    # this auc has fixed doses, so can be (in theory) standardized across datasets
+    auc = (response_integral(d2, *popt) - response_integral(d1, *popt)) / (d2 - d1)
+    ## DSS definition from the paper: https://static-content.springer.com/esm/art%3A10.1038%2Fsrep05193/MediaObjects/41598_2014_BFsrep05193_MOESM1_ESM.pdf
+    ## here t = 0.1 and the fitted curve values are used
+    dss1 = (auc1-0.1*(ic10x-xmin)) / (0.9 * (xmax - xmin)) if xmax > ic50 else 0
+    dss2 = dss1/(1-einf)
+    metrics = pd.Series({'fit_auc':auc, 'fit_ic50':ic50, 'fit_ec50':ec50,'fit_einf':einf,
+                         'fit_ec50se':ec50se, 'fit_r2':r2, 'einf':einf, 'fit_hs':hs,
+                         'aac':aac1, 'auc':auc1, 'dss':dss2}, index=cols).round(4)
+    return metrics
+
+
+
+def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS_M):
+    '''
+    Fit response_curve to one dose-response series.
+
+     xdata: doses, on the scale response_curve expects
+     ydata: responses; clipped to [0, 1] before fitting
+     bounds: (lower, upper) limits for (einf, ec50, hs) passed to curve_fit
+
+    Returns (popt, pcov) from curve_fit, or (None, None) when every attempt
+    raised RuntimeError.
+    '''
+    clipped = ydata.clip(lower=0, upper=1.0)
+    budget = 100 * 3
+    # retry with a doubled evaluation budget while it stays below 10000
+    while budget < 10000:
+        try:
+            popt, pcov = curve_fit(response_curve, xdata, clipped, bounds=bounds, max_nfev=budget)
+        except RuntimeError:
+            budget *= 2
+        else:
+            return popt, pcov
+    return None, None
+
+
+def process_df(df, fname, sep='\t', ngroups=None):
+    """Fit every (source, improve_sample_id, Drug, study) group, one after another.
+
+    df:      data frame with DOSE and GROWTH columns plus the grouping columns;
+             GROWTH is clipped to [0, 1], so it is expected as a fraction
+    fname:   path of the delimited text file to write
+    sep:     field separator of the output
+    ngroups: if set, stop after this many groups
+
+    Writes a header line followed by one line per group holding the group
+    key and the values returned by compute_fit_metrics.
+    """
+    cols = ['source', 'improve_sample_id', 'Drug', 'study']
+    header = None
+    # the context manager closes the file even if a fit raises
+    with open(fname, 'w') as f:
+        for i, (name, group) in enumerate(tqdm(df.groupby(cols)), start=1):
+            xdata = group.DOSE.astype(float)
+            # keep the clipped values: clip() returns a new Series
+            ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
+            popt, pcov = response_curve_fit(xdata, ydata)
+            metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
+            if header is None:
+                header = cols + metrics.index.tolist()
+                print(sep.join(header), file=f)
+            # group keys may be non-string (e.g. numeric sample ids)
+            print(sep.join(str(n) for n in name), end=sep, file=f)
+            print(sep.join(f'{x:.4g}' for x in metrics), file=f)
+            if ngroups and i >= ngroups:
+                break
+
+
+def process_single_drug(name_group_tuple):
+    """Fit one (group key, group frame) pair as produced by DataFrame.groupby.
+
+    Returns the group key together with the metrics from
+    compute_fit_metrics. Used as the worker function of the process pool
+    in process_df_part.
+    """
+    key, frame = name_group_tuple
+    doses = frame.DOSE.astype(float)
+    growth = frame.GROWTH.clip(lower=0, upper=1.0).astype(float)
+    fit = response_curve_fit(doses, growth)
+    return key, compute_fit_metrics(doses, growth, *fit)
+
+def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
+    """Fit a slice of the dose-response groups in parallel and write the metrics.
+
+    df:      data frame with DOSE and GROWTH columns plus the grouping columns
+             source, improve_sample_id, Drug, study, time and time_unit
+    fname:   output prefix; results are written to '<fname>.<start>'
+    beataml: accepted for the command line interface; not used here
+    sep:     field separator of the output
+    start:   index of the first group to process
+    count:   number of groups to process; None (or 0) means all remaining
+    """
+    cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
+    groups = df.groupby(cols)
+    # an open-ended slice takes every remaining group, however many there are
+    stop = start + count if count else None
+    groups = islice(groups, start, stop)
+    cores = multiprocessing.cpu_count()
+    # leave one core free, but Pool needs at least one worker
+    poolsize = max(1, cores - 1)
+    print('we have '+str(cores)+' cores and '+str(poolsize)+' threads')
+    with multiprocessing.Pool(processes=poolsize) as pool:
+        results = pool.map(process_single_drug, groups)
+
+    with open(f'{fname}.{start}', 'w') as f:
+        header = None
+        for name, metrics in results:
+            if header is None:
+                # the header is taken from the first group's metric labels
+                header = cols + metrics.index.tolist()
+                print(sep.join(header), file=f)
+            print(sep.join(str(n) for n in name), end=sep, file=f)
+            print(sep.join(f'{x:.4g}' for x in metrics), file=f)
+
+
+def main():
+    """Command line entry point: fit curves for every group of the input table.
+
+    Reads the tab-separated --input file, converts DOSE from uM to -log10(M)
+    and GROWTH from percent to a fraction, fits all groups with
+    process_df_part and reshapes the resulting '<output>.0' file with
+    format_coderd_schema.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', help='input file with the following columns:\
+    DOSE: dose of drug in uM,\
+    GROWTH: percentage of cells left,\
+    study: name of study to group measurements by,\
+    source: source of the data,\
+    improve_sample_id: improve_sample_id,\
+    Drug: improve_drug_id,\
+    time: time at which measurement was taken,\
+    time_unit: unit of time')
+    parser.add_argument('--output', help='prefix of output file')
+    parser.add_argument('--beataml', action='store_true', help='Include this if for BeatAML')
+    parser.add_argument('--debug',action='store_true',default=False)
+
+    args = parser.parse_args()
+    print(args.input)
+    df_all = pd.read_table(args.input)
+    if args.debug:
+        # debug runs only look at the first million rows
+        df_all = df_all.iloc[0:1000000]
+
+    # drop rows with missing values
+    df_all = df_all.dropna()
+    ## input doses are micromolar, the fit needs -log10(M)
+    df_all.DOSE = -1.0 * np.log10(df_all.DOSE/1000000.0)
+    ## need data to be between 0 and 1, not 0 and 100
+    df_all.GROWTH=df_all.GROWTH/100.00
+    # call head() so the first rows are printed, not the bound method
+    print(df_all.head())
+    fname = args.output or 'combined_single_response_agg'
+    process_df_part(df_all, fname, beataml=args.beataml)
+
+    # process_df_part starts at group 0, so its output file is '<fname>.0'
+    format_coderd_schema(fname+'.0')
+
+if __name__ == '__main__':
+    main()