PNNL-CompBio
diff --git a/‎.github/workflows/main.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/main.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎build/README.md‎
Lines changed: 6 additions & 8 deletions b/‎build/README.md‎
Lines changed: 6 additions & 8 deletions
diff --git a/‎build/beatAML/GetBeatAML.py‎
Lines changed: 4 additions & 4 deletions b/‎build/beatAML/GetBeatAML.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎build/hcmi/02-getHCMIData.py‎
Lines changed: 5 additions & 5 deletions b/‎build/hcmi/02-getHCMIData.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎build/utils/build_drug_desc.py‎
Lines changed: 12 additions & 3 deletions b/‎build/utils/build_drug_desc.py‎
Lines changed: 12 additions & 3 deletions
diff --git a/‎build/utils/calc_pdx_metrics.py‎
Lines changed: 33 additions & 34 deletions b/‎build/utils/calc_pdx_metrics.py‎
Lines changed: 33 additions & 34 deletions
@@ -4,6 +4,7 @@ on:
   push:
     tags:
           - '*'  # Triggers the workflow only on version tags
+  workflow_dispatch:  # Allows manual triggering of the workflow
 
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
@@ -44,4 +45,4 @@ jobs:
     steps:
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v4
@@ -10,11 +10,10 @@ are added.
 
 ## build_all.py script
 
-This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare and pypi.
+This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare.
 
 It requires the following authorization tokens to be set in the local environment depending on the use case:   
 `SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Join the [CoderData team](https://www.synapse.org/#!Team:3503472) on Synapse and generate an access token.  
-`PYPI_TOKEN`: This token is required to upload to PyPI.  
 `FIGSHARE_TOKEN`: This token is required to upload to Figshare.  
 `GITHUB_TOKEN`: This token is required to upload to GitHub.  
 
@@ -25,21 +24,20 @@ It requires the following authorization tokens to be set in the local environmen
 - `--omics`: Processes and builds the omics data files.
 - `--drugs`: Processes and builds the drug data files.
 - `--exp`: Processes and builds the experiment data files.
-- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate, figshare, or pypi commands.
+- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate or figshare commands.
 - `--validate`: Validates the generated datasets using the schema check scripts. This is automatically included if data upload occurs.
 - `--figshare`: Uploads the datasets to Figshare. FIGSHARE_TOKEN must be set in local environment.
-- `--pypi`: Uploads the package to PyPI. PYPI_TOKEN must be set in local environment.
 - `--high_mem`: Utilizes high memory mode for concurrent data processing. This has been successfully tested using 32 or more vCPUs. 
 - `--dataset`: Specifies the datasets to process (default='broad_sanger,hcmi,beataml,mpnst,cptac').
-- `--version`: Specifies the version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for figshare and PyPI upload steps. This must be a higher version than previously published versions.
+- `--version`: Specifies the version number for the Figshare upload title (e.g., "0.1.29"). This must be a higher version than previously published versions.
 - `--github-username`: GitHub username matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
 - `--github-email`: GitHub email matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
 
 **Example usage**:  
-- Build all datasets and upload to Figshare and PyPI and GitHub.  
-Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `PYPI_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.  
+- Build all datasets and upload to Figshare and GitHub.  
+Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.  
 ```bash
-python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
+python build/build_all.py --all --high_mem --validate --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
 ```
 
 - Build only the experiment files.  
 
@@ -467,10 +467,10 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
                          how='left')
     mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
 
-    print(mapped_df.to_string())
-    mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int)
-    mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0)
-    mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int)
+    # Fill NaNs with 0, round, and cast the ID columns listed below to int32
+    columns_to_convert = ['improve_sample_id', 'entrez_id']
+    mapped_df[columns_to_convert] = mapped_df[columns_to_convert].fillna(0).round().astype('int32')
+    
     mapped_df['source'] = 'synapse'
     mapped_df['study'] = 'BeatAML'
     mapped_df =mapped_df.drop_duplicates()
 
@@ -474,15 +474,15 @@ def get_copy_call(a):
 
         if math.isnan(a):
             return float('nan')
-        
-        a_val = math.log2(float(a)+0.000001) ###this should not be exponent, should be log!!! 2**float(a)
-        if a_val < 0.0: #0.5210507:
+
+        a_val = math.log2(float(a)+0.000001)
+        if a_val < 0.5210507:
             return 'deep del'
         elif a_val < 0.7311832:
             return 'het loss'
         elif a_val < 1.214125:
             return 'diploid'
-        elif a_val < 1.731183:
+        elif a_val < 1.422233:
             return 'gain'
         else:
             return 'amp'
@@ -697,4 +697,4 @@ def main():
 
 if __name__ == "__main__":
     main()
-    
+    
@@ -64,6 +64,7 @@ def smiles_to_mordred(smiles,nproc=2):
     ##reformat here
     longtab = pd.melt(dd,id_vars='smile',value_vars=values)
     longtab = longtab.rename({'variable':'structural_descriptor','value':'descriptor_value'},axis=1)
+    
     return longtab
 
 def main():
@@ -82,16 +83,24 @@ def main():
     cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan']
     #    isosmiles = list(set(tab.isoSMILES))
     morgs = smiles_to_fingerprint(cansmiles)
-#    print(morgs)
+
     ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates()
-#    print(ids)
+
     id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']]
 
     mords = smiles_to_mordred(cansmiles,nproc=ncors)
 
     id_mord = ids.rename({'canSMILES':'smile'},axis=1).merge(mords)[['improve_drug_id','structural_descriptor','descriptor_value']]
 
-    full = pd.concat([id_morg,id_mord],axis=0)                     
+    full = pd.concat([id_morg,id_mord],axis=0)    
+    
+    # Replace any descriptor value containing one of the following substrings (case-insensitive) with the string "NaN". This list is believed to cover all known cases; extend it if more are found.
+    strings_to_replace = ["min", "max", "invalid", "multiple", "missing"]
+    pattern = '|'.join(strings_to_replace)
+    full['descriptor_value'] = full['descriptor_value'].astype(str)
+    full.loc[full['descriptor_value'].str.contains(pattern, case=False, na=False), 'descriptor_value'] = "NaN"
+
+
     full.to_csv(args.outtable,sep='\t',index=False,compression='gzip')
 
 if __name__=='__main__':
 
@@ -310,91 +310,90 @@ def main():
     singles, combos = get_drug_stats(tab)
 
     ##join with drug ids
-    expsing = singles.rename({'drug':'chem_name','metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','drug_combination_metric','drug_combination_value']]
+    expsing = singles.rename({'drug':'chem_name','metric':'dose_response_metric','value':'dose_response_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','time_unit','time','dose_response_metric','dose_response_value']]
     expsing = expsing.dropna()
+    
+    # source	improve_sample_id	improve_drug_id	study	time	time_unit	dose_response_metric	dose_response_value
 
     combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True)
     combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna()
 
-    expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','drug_combination_metric','drug_combination_value']]
-    expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','drug_combination_metric','drug_combination_value']]
+    expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
+    expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
 
     expcomb[['source']]='Synapse'
     expcomb[['study']]='MPNST PDX in vivo'
 
     expsing[['source']]='Synapse'
     expsing[['study']]='MPNST PDX in vivo'
-    expsing.to_csv(args.outprefix+'_experiments.csv',index=False)
-    expcomb.to_csv(args.outprefix+'_combinations.csv',index=False)
+    expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t")
+    expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t")
 
 
 
-def get_drug_stats(df,control='control'):
+def get_drug_stats(df, control='control'):
     ##for each experiment, call group
-    cols = ['experiment','model_id']
+    cols = ['experiment', 'model_id']
     groups = df.groupby(cols)
     singleres = []
     combores = []
 
-    for name,group in tqdm(groups):
-        #each group contains multiple treatments anda  control
-        drugs = set(group.treatment)-set([control])
+    for name, group in tqdm(groups):
+        # Each group contains multiple treatments and a control
+        drugs = set(group.treatment) - set([control])
         print(name[0])
         print(drugs)
         mod = list(set(group.model_id))[0]
- #       print(set(group.model_id))
-        ctl_data = group[group.treatment==control]
+
+        ctl_data = group[group.treatment == control]
         ctl_time = np.array(ctl_data.time)
         ctl_volume = np.array(ctl_data.volume)
 
-        ctl_auc = AUC(ctl_time,ctl_volume)
+        ctl_auc = AUC(ctl_time, ctl_volume)
         for d in drugs:
             print(d)
-            d_data = group[group.treatment==d]
+            d_data = group[group.treatment == d]
             treat_time = np.array(d_data.time)
             treat_volume = np.array(d_data.volume)
 
-            #get abc for group
-            treat_auc = AUC(treat_time,treat_volume)
-            treat_abc = ABC(ctl_time,ctl_volume,treat_time,treat_volume)
-            #print(f"AUC: {treat_auc}")
-            #print(f"ABC: {treat_abc}")
-            treat_abc.update({'sample':mod,'drug':d,'time_unit':'days'})
+            # Get ABC for group
+            treat_auc = AUC(treat_time, treat_volume)
+            treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume)
+            print(f"treat_time:, {treat_time}")
+            treat_abc.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d:
                 combores.append(treat_abc)
             else:
                 singleres.append(treat_abc)
-            #lmm
-            comb = pd.concat([ctl_data,d_data])
-            lmm_res = lmm(comb.time, comb.volume, comb.treatment,d)
-            lmm_res.update({'sample':mod,'drug':d,'time_unit':'days'})
-            #print(f"LMM: {lmm_res}")
+
+            # Get LMM for group
+            comb = pd.concat([ctl_data, d_data])
+            lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
+            lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d:
                 combores.append(lmm_res)
             else:
                 singleres.append(lmm_res)
 
-            #get tgi for group
-            tg = TGI(ctl_volume,treat_volume,treat_time)
-            tg.update({'sample':mod,'drug':d,'time_unit':'days'})
-            #print(tg)
+            # Get TGI for group
+            tg = TGI(ctl_volume, treat_volume, treat_time)
+            tg.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d:
                 combores.append(tg)
             else:
                 singleres.append(tg)
 
-            
-            #get mRECIST for group
-            mr = mrecist(treat_time,treat_volume)
-            mr.update({'sample':mod,'drug':d,'time_unit':'days'})
+            # Get mRECIST for group
+            mr = mrecist(treat_time, treat_volume)
+            mr.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d:
                 combores.append(mr)
             else:
                 singleres.append(mr)
 
     sing = pd.DataFrame.from_records(singleres)
     comb = pd.DataFrame.from_records(combores)
-    return sing,comb
+    return sing, comb
 
 if __name__=='__main__':
     main()