
Commit 2a4e860

Merge branch 'main' into bladder_pdo
2 parents: e24b429 + 0bfce9a

35 files changed: 535242 additions & 386 deletions

LICENSE renamed to LICENSE_DISCLAIMER
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-Copyright Battelle Memorial Institute
+Copyright Battelle Memorial Institute 2025
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
```

README.md
Lines changed: 2 additions & 0 deletions

```diff
@@ -1,5 +1,7 @@
 ## Cancer Omics Drug Experiment Response Dataset
 
+
+
 There is a recent explosion of deep learning algorithms that tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.
 
 This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the
```

build/beatAML/GetBeatAML.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
     # New Transcriptomics Data
     print("Starting Transcriptomics Data")
     ##first run conversion tool
-    os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
+    os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file,'tpm_'+transcriptomics_file))
 
 
     t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
```
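The new call threads an explicit `--out_file` through to `tpmFromCounts.py`, so the name that the downstream `pd.read_csv` expects (`'tpm_' + transcriptomics_file`) is set in one place rather than relying on the helper's default. A minimal sketch of the same step using `subprocess.run`, which, unlike `os.system`, raises when the conversion script fails; the flag names are assumed from the diff above:

```python
import subprocess

def run_tpm_conversion(transcriptomics_file: str) -> str:
    """Convert a raw counts file to TPM; returns the output filename."""
    out_file = "tpm_" + transcriptomics_file
    subprocess.run(
        ["python", "tpmFromCounts.py",
         "--counts", transcriptomics_file,
         "--out_file", out_file],
        check=True,  # raise CalledProcessError on a non-zero exit
    )
    return out_file
```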

build/broad_sanger/02-broadSangerOmics.R
Lines changed: 5 additions & 1 deletion

```diff
@@ -566,12 +566,16 @@ main<-function(){
   lapply(alltypes,function(dt){
     print(dt)
     temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
+    readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
     tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
-    readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
+    readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
+
+    # readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
     rm(tempd)
     rm(temps)
   })
 
 }
 
 main()
+
```
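The R change stops `rbind`-ing the DepMap and Sanger frames into one combined `broad_sanger_*` file and instead writes one file per source. For readers following the pipeline in Python, a hedged pandas sketch of the same split-write pattern (the frame names are illustrative, not the script's):

```python
import pandas as pd

def write_split(broad_df: pd.DataFrame, sanger_df: pd.DataFrame, dt: str) -> None:
    # pandas, like readr::write_csv, infers gzip compression from the
    # .csv.gz suffix, so each source lands in its own compressed file.
    broad_df.dropna().to_csv(f"/tmp/broad_{dt}.csv.gz", index=False)
    sanger_df.dropna().to_csv(f"/tmp/sanger_{dt}.csv.gz", index=False)
```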

build/broad_sanger/02a-broad_sanger_proteomics.py
Lines changed: 7 additions & 4 deletions

```diff
@@ -48,6 +48,9 @@ def main():
 
     full[['study']] = 'DepMap'
     full[['source']] = 'Broad'
+    ##now save to separate files
+    full.dropna(axis=0)
+    full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
 
 
     ##now get sanger
@@ -69,9 +72,9 @@ def main():
     full2.loc[:,['study']] = 'Sanger'
     full2.loc[:,['source']] = 'Sanger'
 
-    full3 = pd.concat([full,full2])
-    print(full3)
-    full3.dropna(axis=0)
-    full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
+    #full3 = pd.concat([full,full2])
+    #print(full3)
+    full2.dropna(axis=0)
+    full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
 
 main()
```
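One wrinkle carried over from the old code: `full.dropna(axis=0)` and `full2.dropna(axis=0)` discard their results, because pandas `DataFrame.dropna` returns a new frame unless the result is assigned (or `inplace=True` is passed), so no NA rows are actually dropped before the writes. A self-contained demonstration:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": [2.0, 3.0]})
df.dropna(axis=0)       # returns a filtered copy; df itself is unchanged
assert len(df) == 2
df = df.dropna(axis=0)  # assigning the result is what drops the NA row
assert len(df) == 1
```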

build/broad_sanger/05b_separate_datasets.py
Lines changed: 17 additions & 10 deletions

```diff
@@ -5,6 +5,12 @@
 import shutil
 
 def main():
+
+    print("ls:\n")
+    files = os.listdir(".")
+    print(files)
+    print("\n")
+
     datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
     omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
     samples_datatypes = ["samples"] #csv
@@ -13,14 +19,14 @@ def main():
 
 
     dataset_sources = {
-        "CCLE": ["Broad"],
-        "CTRPv2": ["Broad"],
-        "PRISM": ["Broad"],
-        "GDSCv1": ["Sanger"],
-        "GDSCv2": ["Sanger"],
-        "FIMM": ["Broad"],
-        "gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
-        "NCI60": ["Broad"]
+        "CCLE": ["broad"],
+        "CTRPv2": ["broad"],
+        "PRISM": ["broad"],
+        "GDSCv1": ["sanger"],
+        "GDSCv2": ["sanger"],
+        "FIMM": ["broad"],
+        "gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
+        "NCI60": ["broad"]
     }
 
     for dataset in datasets_to_process:
@@ -70,14 +76,15 @@ def main():
 
         #One by one, filter other Omics files, write to file, delete from mem.
         for omics in omics_datatypes:
-            omics_filename_in = f"broad_sanger_{omics}.csv"
+            ds = dataset_sources[dataset][0]
+            omics_filename_in = f"{ds}_{omics}.csv"
             if os.path.isfile(omics_filename_in + ".gz"):
                 omics_filename_in += ".gz"
 
             omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
             omics_df = pl.read_csv(omics_filename_in)
             omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
-            omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
+            # omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
             omics_df.write_csv(omics_filename_out) #csv
 
             #Rewrite as gzipped if needed
```
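With the `source` filter commented out, the per-source separation now happens at file-selection time: each dataset reads the `broad_*` or `sanger_*` table written upstream instead of filtering a combined `broad_sanger_*` table by its `source` column. A hedged sketch of the new resolution logic, abbreviated from the diff:

```python
import os

dataset_sources = {"CCLE": ["broad"], "GDSCv1": ["sanger"]}  # abbreviated

def resolve_omics_file(dataset: str, omics: str) -> str:
    ds = dataset_sources[dataset][0]
    filename = f"{ds}_{omics}.csv"
    if os.path.isfile(filename + ".gz"):  # prefer the gzipped copy if present
        filename += ".gz"
    return filename

print(resolve_omics_file("GDSCv1", "transcriptomics"))  # sanger_transcriptomics.csv[.gz]
```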

build/broad_sanger/build_misc.sh
Lines changed: 2 additions & 3 deletions

```diff
@@ -3,13 +3,12 @@ set -euo pipefail
 
 trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
 
-cp /tmp/broad_sanger* .
+cp /tmp/broad* .
+cp /tmp/sanger* .
 
 echo "Running 05a_remove_problem_drugs.py..."
 /opt/venv/bin/python 05a_remove_problem_drugs.py
 
 echo "Running 05b_separate_datasets.py..."
 /opt/venv/bin/python 05b_separate_datasets.py
 
-echo "Removing broad_sanger* files..."
-rm broad_sanger*
```

build/build_all.py
Lines changed: 8 additions & 6 deletions

```diff
@@ -40,7 +40,7 @@ def main():
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
     parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
     parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
    if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST and beatAML Datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -121,6 +121,8 @@ def process_docker(datasets):
         'mpnst': ['mpnst'],
         'mpnstpdx': ['mpnstpdx'],
         'pancpdo': ['pancpdo'],
+        'bladderpdo': ['bladderpdo'],
+        'sarcpdo': ['sarcpdo'],
         'cptac': ['cptac'],
         'genes': ['genes'],
         'upload': ['upload']
@@ -132,7 +134,7 @@ def process_docker(datasets):
         datasets_to_build.extend(dataset_map.get(dataset, []))
 
     # Build the docker-compose command, adding specific datasets
-    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+    compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -328,9 +330,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
+    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
+            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
 
     ######
     ### Begin Pipeline
@@ -407,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # if args.figshare or args.validate:
     # FigShare File Prefixes:
 
-    prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
+    prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
     broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     if "broad_sanger" in datasets:
         prefixes.extend(broad_sanger_datasets)
```
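Note the compose invocation changes from the v2 CLI plugin form (`docker compose`) to the standalone v1 binary (`docker-compose`). A hedged sketch, not part of the commit, of how a build script could accept either entry point:

```python
import shutil

def compose_prefix() -> list:
    # Prefer the standalone docker-compose binary if installed,
    # else fall back to the docker CLI's compose plugin.
    if shutil.which("docker-compose"):
        return ["docker-compose"]
    return ["docker", "compose"]

# The compose file name here is illustrative.
compose_command = compose_prefix() + ["-f", "docker-compose.yml", "build", "--parallel"]
```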

build/build_dataset.py
Lines changed: 4 additions & 2 deletions

```diff
@@ -44,6 +44,7 @@ def process_docker(dataset,validate):
         'mpnstpdx': ['mpnstpdx'],
         'pancpdo': ['pancpdo'],
         'cptac': ['cptac'],
+        'sarcpdo': ['sarcpdo'],
         'genes': ['genes'],
         'upload': ['upload'],
         'bladderpdo': ['bladderpdo']
@@ -57,7 +58,7 @@
 
     datasets_to_build.extend(dataset_map.get(dataset, []))
 
-    compose_command = ['docker','compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -125,8 +126,9 @@ def process_omics(executor, dataset, should_continue):
         'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'hcmi': ['mutations', 'transcriptomics'],
+        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
+        'sarcpdo': ['mutations', 'transcriptomics'],
         'pancpdo': ['transcriptomics'],
-        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'bladderpdo': ['copy_number', 'mutations', 'transcriptomics']
     }
 
```
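The per-dataset omics map (now including `sarcpdo`, which provides only mutations and transcriptomics) determines which build steps run for each dataset. A small illustrative sketch of how such a map is typically consumed; the function name is hypothetical, not the script's actual call site:

```python
omics_map = {
    'sarcpdo': ['mutations', 'transcriptomics'],
    'mpnstpdx': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
}

def queue_omics_jobs(dataset: str) -> None:
    # One job per datatype the dataset actually provides; unknown
    # datasets fall through to an empty list and queue nothing.
    for datatype in omics_map.get(dataset, []):
        print(f"queueing {datatype} build for {dataset}")

queue_omics_jobs('sarcpdo')
```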

build/cptac/getCptacData.py
Lines changed: 3 additions & 2 deletions

```diff
@@ -288,8 +288,9 @@ def main():
         exit()
 
     # Remove the old values in samples (from prev file)
-    samples.drop(samples.index,inplace=True)
-
+    if 'other_id_source' in samples.columns:
+        samples = samples[samples['other_id_source'] == 'CPTAC3'].copy()
+
     # Create new samples
     if build_samples:
         # Loop through the cancer types to build samples
```
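Instead of dropping every row from the previously loaded samples table, the new code keeps only rows sourced from CPTAC3 and guards against the column being absent on a fresh run. A self-contained illustration of the filter on toy data:

```python
import pandas as pd

samples = pd.DataFrame({
    "other_id_source": ["CPTAC3", "HCMI", "CPTAC3"],
    "other_id": ["s1", "s2", "s3"],
})
if 'other_id_source' in samples.columns:
    # .copy() detaches the slice so later writes don't trigger
    # pandas' SettingWithCopy warning
    samples = samples[samples['other_id_source'] == 'CPTAC3'].copy()
assert list(samples["other_id"]) == ["s1", "s3"]
```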
