Addition of drugs and experiments, alterations to calc_pdx_metrics

RubyFore · RubyFore · commit 08c2f2f373c5 · 2025-07-01T16:19:02.000-07:00
Addition of drug data (~12 drugs could not be matched) and experiments from `calc_pdx_metrics.py`. `calc_pdx_metrics.py` was altered to skip AUC calculations for groups with fewer than 2 points (the linear model would not converge).
diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py
@@ -4,6 +4,33 @@
 import math
 import argparse
 
+
+def get_copy_call(a):
+    """
+    Helper function - determine the copy call for a single copy-number value.
+
+    `a` is shifted by 1e-6 and log2-transformed, then binned on fixed cut-offs.
+    Returns 'deep del', 'het loss', 'diploid', 'gain' or 'amp'; returns
+    float('nan') when `a` is None or NaN.
+    """
+    if a is None:
+        return float('nan')
+
+    # Convert once so numeric strings are accepted by the NaN check as well.
+    a = float(a)
+    if math.isnan(a):
+        return float('nan')
+
+    a_val = math.log2(a + 0.000001)
+    # Exclusive upper bounds on the log2 scale, in ascending order.
+    cutoffs = ((0.5210507, 'deep del'), (0.7311832, 'het loss'),
+               (1.214125, 'diploid'), (1.422233, 'gain'))
+    for upper, call in cutoffs:
+        if a_val < upper:
+            return call
+    return 'amp'
+
+
 def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None):
     """ 
     Download omics data from Synapse at synapseID syn66364488. Requires a synapse token, which requires you to make a Synapse account
@@ -167,4 +194,21 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat
     return(sample_entrez_transcriptomics_df)
 
 
+if __name__ == "__main__":
+    print('in main')
+    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Novartis PDX data")
+    parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
+    parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
+    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
+    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
+    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
+    parser.add_argument('-t', '--token', help='Synapse token')
+
+    args = parser.parse_args()
+    print("Logging into Synapse")
+    PAT = args.token
+    # NOTE(review): -c/-m/-e are parsed but not acted on yet; the Synapse ID below is the one named in download_parse_omics_novPDX's docstring and save_path is left as None - confirm both.
+    genes=pd.read_csv(args.genes)
+    samples = pd.read_csv(args.samples)
 
+    data = download_parse_omics_novPDX('syn66364488', None, PAT)
diff --git a/build/novartispdx/03-drugs-novartispdx.py b/build/novartispdx/03-drugs-novartispdx.py
@@ -0,0 +1,70 @@
+import synapseclient
+import pandas as pd
+import numpy as np
+import argparse
+import os
+# for testing locally
+from utils.pubchem_retrieval import update_dataframe_and_write_tsv
+# for building in docker
+#from pubchem_retrieval import update_dataframe_and_write_tsv
+
+
+def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath):
+    """
+    Collect the Novartis PDX drug names and write the matching drug table to `outputPath`.
+
+    synObject: logged-in Synapse client used to fetch 'syn66276102'.
+    prevDrugFilepath: comma-separated paths of existing drug TSVs, or None/"".
+    outputPath: TSV path written here and/or handed to update_dataframe_and_write_tsv.
+    """
+    file = synObject.get('syn66276102')
+    # read raw drug data from synapse
+    rawDrugData = pd.read_csv(file.path)
+    # split on + operator - there are 2- and one 3- way drug combos in this dataset
+    sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True)
+    # taking the drug names from the first and second column from the split - there is only one
+    # drug name in the 3rd column (one 3-way combo) that is replicated in other treatments as well.
+    # Anything after a '"' or a '-' (dose information) is dropped from each name.
+    alldrugnames = pd.Series(pd.concat([sepDrugNames[0], sepDrugNames[1]]).dropna()).str.split('"', expand=True)[0].str.split("-", expand=True)[0]
+    finalDrugNames = pd.Series(alldrugnames.unique()).str.strip().unique()
+    # get unique drugs, excluding the untreated arm
+    newdrugnames = finalDrugNames[finalDrugNames != 'untreated']
+    print(newdrugnames)
+
+    # use helper functions in pubchem_retrieval.py
+    alldrugs = []
+    # Truthiness rejects both None and ""; `is not ""` would be an identity test on a literal.
+    if prevDrugFilepath:
+        prevdrugs = [pd.read_csv(t,sep='\t') for t in prevDrugFilepath.split(',')]
+        alldrugs = pd.concat(prevdrugs).drop_duplicates()
+
+        # keep every row that shares an improve_drug_id with a name-matched drug
+        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
+        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]
+
+        ##write drugs
+        newdrugs.to_csv(outputPath, sep='\t', index=False)
+
+    if len(alldrugs)==0 or len(newdrugnames)>len(set(newdrugs.improve_drug_id)): #we have more names we didn't match
+        print('Missing drugs in existing file, querying pubchem')
+        update_dataframe_and_write_tsv(newdrugnames,outputPath)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse arguments, log into Synapse, build the drug file.
+    arg_parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Novartis PDX data")
+    arg_parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default = None)
+    arg_parser.add_argument('-o', '--outputPath', help='Output path for updated novartispdx drug file', default = "/tmp/novartispdx_drugs.tsv")
+    arg_parser.add_argument('-t', '--token', help='Synapse token')
+
+    cli_args = arg_parser.parse_args()
+    print("Logging into Synapse")
+    auth_token = cli_args.token
+    print("after PAT assignment")
+    synObject = synapseclient.login(authToken=auth_token)
+    print('after creating synObject')
+    # An empty or omitted -d value means there is no previous drug file.
+    previous_drug_paths = cli_args.prevDrugFilePath or None
+    create_novartis_pdx_drugs_file(
+        synObject, previous_drug_paths, cli_args.outputPath
+    )
diff --git a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py
@@ -0,0 +1,59 @@
+import synapseclient
+import pandas as pd
+import numpy as np
+import argparse
+import os
+
+def get_novartis_pdx_experiments_file(synObject, samples_df, drug_df):
+    """
+    Build the tumor-volume curve table used as input for the calc_pdx_metrics script.
+
+    synObject: logged-in Synapse client used to fetch 'syn66276102'.
+    samples_df, drug_df: sample and drug mapping tables; accepted to match the
+        caller but not used yet (linking to improve ids is still TODO).
+    Returns a DataFrame with columns model_id, time, volume, treatment, experiment, dose.
+    """
+    file1 = synObject.get('syn66276102')
+    rawDrugData = pd.read_csv(file1.path)
+    # STILL NEED TO : link to improve ids.
+    # update a few drug ids for greater inclusion
+    novartispdx_curvefile = rawDrugData[['Model', 'Days Post T0', 'Volume (mm3)', 'Treatment']]
+    novartispdx_curvefile=novartispdx_curvefile.rename({'Model': 'model_id', 'Days Post T0' : 'time', 'Volume (mm3)': 'volume', 'Treatment':'treatment'}, axis=1)
+    novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.lower()
+    novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.replace('"', '')
+    novartispdx_curvefile['treatment']=novartispdx_curvefile['treatment'].str.replace('untreated', 'control')
+    novartispdx_curvefile['experiment'] = novartispdx_curvefile.groupby(['model_id']).ngroup()+1
+    # remove triple combination(s); copy so later column assignments do not target a slice
+    novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['treatment'].str.contains(r'\+.*\+')].copy()
+    # remove dose information appended to some drugs in the treatment column and include in dose column
+    druganddose = novartispdx_curvefile['treatment'].str.split('-', expand=True)
+    druganddose = druganddose.rename({0: 'treatment', 1:'dose'}, axis=1)
+    novartispdx_curvefile['treatment']=druganddose['treatment']
+    novartispdx_curvefile['dose'] = druganddose['dose']
+    # remove pdxs with only one drug treatment (no control)
+    unique_vals_tally = novartispdx_curvefile.groupby('experiment').nunique()
+    todiscard = unique_vals_tally[unique_vals_tally['treatment']==1].index
+    novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(todiscard)]
+    # remove groups with no 'control' treatment
+    has_control = novartispdx_curvefile.groupby('experiment')['treatment'].apply(lambda x: x.str.contains('control').any())
+    missingcontrols = has_control[~has_control.astype(bool)].index
+    return novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(missingcontrols)]
+
+
+if __name__ == "__main__":
+    # Entry point: read the CLI options, log into Synapse and write the curve table.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('-t', '--token', help='Synapse authentication token')
+    arg_parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples')
+    arg_parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples')
+    arg_parser.add_argument('-o', '--output', default = '/tmp/novartispdx_doserep.tsv',help='Output file to be read into curve fitting code')
+    cli_args = arg_parser.parse_args()
+    print("Logging into Synapse")
+    synObject = synapseclient.login(authToken=cli_args.token)
+    # The drug table is tab-separated; the sample table is read with pandas' default separator.
+    drug_df = pd.read_csv(cli_args.drugfile, sep='\t')
+    samples_df = pd.read_csv(cli_args.curSampleFile)
+
+    curve_table = get_novartis_pdx_experiments_file(synObject, samples_df, drug_df)
+    curve_table.to_csv(cli_args.output, sep='\t')
+
diff --git a/build/novartispdx/build_drugs.sh b/build/novartispdx/build_drugs.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running script with token and drugFile $1"
+# for running locally (from build directory); expansions are quoted so paths/tokens are never word-split:
+python3 -m novartispdx.03-drugs-novartispdx --token "$SYNAPSE_AUTH_TOKEN" -d "$1" -o /tmp/novartispdx_drugs.tsv
+# alternative invocation when the script sits in the working directory:
+#python3 03-drugs-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/novartispdx_drugs.tsv
+
+echo "Running build_drug_desc.py..."
+#for running locally: 
+python3 utils/build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz
+#python3 build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz
diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh
diff --git a/build/novartispdx/build_omics.sh b/build/novartispdx/build_omics.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -euo pipefail
+
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+
+echo "Running script with token, curSamples $2, and genes $1."
+# for mutation data (-m); expansions are quoted so paths/tokens are never word-split
+python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -m
+# for expression data (-e)
+python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -e
+# for copy number data (-c)
+python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -c
diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py
@@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True):
     dict: Dictionary containing the AUC value.
     """
     auc = trapz_auc(time, volume)
-    #print(time)
+    print('at line 187')
+    print(time.shape)
+    print(time.dtype)
+    print(np.max(time.astype(int)))
+    print('auc is : ')
+    print(auc)
     if time_normalize:
         auc = auc/np.max(time)
     return {"metric": "auc", "value": auc, 'time':np.max(time)}
@@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name):
         raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'")
     
     data['log_volume'] = np.log(data['volume'])
-    
+    print('drug name is ' + drug_name)
+    data['exp_type'] = data['exp_type'].astype('category')
+    data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True)
+    print(data)
+    print(data['exp_type'].cat.categories)
     # Define the formula for mixed linear model
     formula = 'log_volume ~ time*exp_type'
     
+    #print(data['exp_type'].cat.categories)
     # Fit the model
     model = mixedlm(formula, data, groups=data['model_id'])
     fit = model.fit()
@@ -282,8 +292,9 @@ def lmm(time, volume, treatment, drug_name):
     #interaction_term = 'time:exp_type'
 #    if interaction_term in fit.params:
 #    time_coef_value = fit.params['time']
-    #print(fit.params)
+    print(fit.params)
     i_coef_value = fit.params['time:exp_type[T.'+drug_name+']']
+    #i_coef_value = fit.params['time:exp_type['+drug_name+']']
    # else:
    #     coef_value = None  # Handle the case when the interaction term is not present
     
@@ -341,21 +352,25 @@ def get_drug_stats(df, control='control'):
     for name, group in tqdm(groups):
         # Each group contains multiple treatments and a control
         drugs = set(group.treatment) - set([control])
+        print('line 355')
         print(name[0])
         print(drugs)
         mod = list(set(group.model_id))[0]
 
         ctl_data = group[group.treatment == control]
         ctl_time = np.array(ctl_data.time)
         ctl_volume = np.array(ctl_data.volume)
-
+        if (ctl_volume.shape[0] < 2):
+            continue
         ctl_auc = AUC(ctl_time, ctl_volume)
         for d in drugs:
-            print(d)
-            d_data = group[group.treatment == d]
+            print('is our drug a string or dict?')
+            print(str(d))
+            d_data = group[group.treatment == str(d)]
             treat_time = np.array(d_data.time)
             treat_volume = np.array(d_data.volume)
-
+            if (treat_volume.shape[0] < 2):
+                continue
             # Get ABC for group
             treat_auc = AUC(treat_time, treat_volume)
             treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume)
@@ -368,6 +383,7 @@ def get_drug_stats(df, control='control'):
 
             #llm
             comb = pd.concat([ctl_data, d_data])
+            #print(comb)
             lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
             lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d: