progress on experiments data

RubyFore · RubyFore · commit 9d361120a0ef · 2025-07-14T14:08:34.000-07:00
Added IMPROVE sample and drug IDs and added build_experiments.sh; testing single-drug experiments with LinkML. More work remains for combination experiments.
diff --git a/build/novartispdx/03-drugs-novartispdx.py b/build/novartispdx/03-drugs-novartispdx.py
@@ -15,7 +15,7 @@ def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath):
     rawDrugData = pd.read_csv(file.path)
     # split on + operator - there are 2- and one 3- way drug combos in this dataset
     sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True)
-    ### NEED TO ALSO remove drug names with different dose info
+    
     
   
     # taking the drug names from the first and second column from the split - there is only one 
diff --git a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py
@@ -4,7 +4,9 @@
 import argparse
 import os
 
-def get_novartis_pdx_experiments_file: 
+
+# add improve IDs - for sample and drug
+def get_novartis_pdx_experiments_file(synObject, samples_df): 
     # input for the calc_pdx_metrics script
 
     file1 = synObject.get('syn66276102')
@@ -29,31 +31,38 @@ def get_novartis_pdx_experiments_file:
     todiscard = unique_vals_tally[unique_vals_tally['treatment']==1].index
     novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(todiscard)]
     # remove groups with no 'control' treatment
-    groupeddf = test.groupby('experiment')
+    groupeddf = novartispdx_curvefile.groupby('experiment')
     no_control = groupeddf['treatment'].apply(lambda x: x.str.contains('control').any())
 
     missingcontrols = no_control.reset_index()[no_control.reset_index()['treatment'] ==False]['experiment']
-    finaldf=test[~test['experiment'].isin(missingcontrols)]
-
-    finalcurvefile = finaldf
+    nomissingcontrols=novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(missingcontrols)]
+    #merge on drug names done in calc_pdx_metrics.py
+    #final_w_drugIDs = finaldf.merge(drug_df, how='left',right_on='chem_name', left_on="treatment")
+    final_allIDs = nomissingcontrols.merge(samples_df, how='left', right_on='common_name', left_on='model_id') 
+    print(final_allIDs.head)
+    final_allIDs = final_allIDs.drop('model_id', axis=1)
+    finalDF = final_allIDs.rename({'improve_sample_id':'model_id'}, axis=1)
+    print(finalDF.head)
+    finalcurvefile = finalDF[['model_id', 'time', 'volume', 'treatment', 'experiment', 'dose']]
+    print(finalcurvefile.head)
     return finalcurvefile
-    #finalcurvefile.to_csv('/tmp/novartispdx_doserep.tsv', sep="\t")        
 
 
 if __name__ == "__main__": 
     parser = argparse.ArgumentParser()
     parser.add_argument('-t', '--token', help='Synapse authentication token')
-    parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples')
-    parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples')
-    parser.add_argument('-o', '--output', default = '/tmp/novartispdx_doserep.tsv',help='Output file to be read into curve fitting code')
+    parser.add_argument('-s', '--curSampleFile', default='/tmp/novartispdx_samples.csv', help='Sample mapping file for bladder pdo samples')
+    parser.add_argument('-d', '--drugfile', default='/tmp/novartispdx_drugs.tsv', help='Drug mapping file for bladder pdo samples')
+    parser.add_argument('-o', '--output', default = '/tmp/novartispdx_experiments.tsv',help='Output experiments file')
 
     args = parser.parse_args()
     print("Logging into Synapse")
     PAT = args.token
     synObject = synapseclient.login(authToken=PAT)
-    drug_df = pd.read_csv(args.drugfile, sep='\t')
+    #drug_df = pd.read_csv(args.drugfile, sep='\t')
     samples_df = pd.read_csv(args.curSampleFile)
-
-    doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df, drug_df)
-    doseresponse_data.to_csv(args.output, sep='\t')
+    
+    doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df)
+    print(doseresponse_data.head)
+    doseresponse_data.to_csv('/tmp/novartispdx_curvedata.tsv', columns=list({'model_id', 'time', 'volume', 'treatment','experiment', 'dose'}), sep='\t')
 
diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh
@@ -0,0 +1,5 @@
+
+#python3 04-experiments-novartispdx.py --token $SYNAPSE_AUTH_TOKEN 
+
+python3 -m novartispdx.04-experiments-novartispdx --token $SYNAPSE_AUTH_TOKEN -o ~/Projects/CoderData/dev-environment/novartispdx/novartispdx_curvedata.tsv
+python3 utils/calc_pdx_metrics.py /tmp/novartispdx_curvedata.tsv --drugfile=/tmp/novartispdx_drugs.tsv --outprefix=/tmp/novartispdx
diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py
@@ -275,11 +275,11 @@ def lmm(time, volume, treatment, drug_name):
         raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'")
     
     data['log_volume'] = np.log(data['volume'])
-    print('drug name is ' + drug_name)
+    #print('drug name is ' + drug_name)
     data['exp_type'] = data['exp_type'].astype('category')
     data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True)
-    print(data)
-    print(data['exp_type'].cat.categories)
+    #print(data)
+    #print(data['exp_type'].cat.categories)
     # Define the formula for mixed linear model
     formula = 'log_volume ~ time*exp_type'
     
@@ -327,11 +327,13 @@ def main():
     # source	improve_sample_id	improve_drug_id	study	time	time_unit	dose_response_metric	dose_response_value
 
     combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True)
+    print('COMBOS ARE: ')
+    print(combos[['drug1', 'drug2']])
     combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna()
 
     expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
     expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
-
+    print(expcomb[['improve_drug_1', 'improve_drug_2']])
     expcomb[['source']]='Synapse'
     expcomb[['study']]='MPNST PDX in vivo'
 
@@ -352,9 +354,9 @@ def get_drug_stats(df, control='control'):
     for name, group in tqdm(groups):
         # Each group contains multiple treatments and a control
         drugs = set(group.treatment) - set([control])
-        print('line 355')
-        print(name[0])
-        print(drugs)
+        #print('line 355')
+        #print(name[0])
+        #print(drugs)
         mod = list(set(group.model_id))[0]
 
         ctl_data = group[group.treatment == control]
@@ -364,8 +366,8 @@ def get_drug_stats(df, control='control'):
             continue
         ctl_auc = AUC(ctl_time, ctl_volume)
         for d in drugs:
-            print('is our drug a string or dict?')
-            print(str(d))
+            #print('is our drug a string or dict?')
+            #print(str(d))
             d_data = group[group.treatment == str(d)]
             treat_time = np.array(d_data.time)
             treat_volume = np.array(d_data.volume)