Updated Readme for build_dataset.py. Removed unused code

jjacobson95 · jjacobson95 · commit 19548a9afab7 · 2024-10-14T14:52:24.000-07:00
diff --git a/build/README.md b/build/README.md
@@ -48,6 +48,40 @@ python build/build_all.py --all --high_mem --validate --pypi --figshare --versio
 python build/build_all.py --exp
 ```
 
+## build_dataset.py script
+This script builds a single dataset for **debugging purposes only**. It can help determine if a dataset will build correctly in isolation. Note that the sample and drug identifiers generated may not align with those from other datasets, so this script is not suitable for building production datasets.
+
+It requires the following authorization tokens to be set in the local environment depending on the dataset:
+
+`SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Follow the directions above to gain access.
+
+Available arguments:
+- `--dataset`: Required. Name of the dataset to build.
+- `--use_prev_dataset`: Optional. Prefix of the previous dataset for sample and drug ID continuation. The previous dataset files must be in the "local" directory.
+- `--validate`: Optional. Runs the schema checker on the built files.
+- `--continue`: Optional. Continues from where the build left off by skipping existing files in "local" directory.
+Example usage:
+
+Build the broad_sanger dataset:
+```bash
+python build/build_dataset.py --dataset broad_sanger
+```
+Build the mpnst dataset continuing from broad_sanger sample and drug IDs:
+```bash
+python build/build_dataset.py --dataset mpnst --use_prev_dataset broad_sanger
+```
+Build the hcmi dataset and run validation:
+```bash
+python build/build_dataset.py --dataset hcmi --validate
+```
+Build the broad_sanger dataset but skip previously built files in "local" directory:
+```bash
+python build/build_dataset.py --dataset broad_sanger --continue
+```
+
+
+
+
 ## Data Source Reference List
 
 | Dataset | Data Source | Resource | Authors | AACR Reference Number |
diff --git a/build/build_dataset.py b/build/build_dataset.py
@@ -111,7 +111,7 @@ def process_drugs(executor, dataset, use_prev_dataset, should_continue):
     executor.submit(run_docker_cmd, [di, 'sh', 'build_drugs.sh', ','.join(dflist)], filename)
 
 
-def process_omics(executor, dataset, high_mem, should_continue):
+def process_omics(executor, dataset, should_continue):
     '''
     Build the omics files for the specified dataset.
     '''
@@ -158,7 +158,7 @@ def process_omics(executor, dataset, high_mem, should_continue):
     executor.submit(run_docker_cmd, [di, 'sh', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)
 
 
-def process_experiments(executor, dataset, high_mem, should_continue):
+def process_experiments(executor, dataset, should_continue):
     '''
     Build the experiments files for the specified dataset.
     '''
@@ -236,7 +236,6 @@ def main():
     )
     parser.add_argument('--dataset', required=True, help='Name of the dataset to build')
     parser.add_argument('--use_prev_dataset', help='Prefix of the previous dataset for sample and drug ID assignment')
-    parser.add_argument('--high-mem', action='store_true', help='Use high memory mode for parallel processing')
     parser.add_argument('--validate', action='store_true', help='Run schema checker on the built files')
     parser.add_argument('--continue', dest='should_continue', action='store_true', help='Continue from where the build left off by skipping existing files')
 
@@ -265,8 +264,8 @@ def main():
     with ThreadPoolExecutor() as executor:
         
         # Build omics and experiments
-        omics_future = executor.submit(process_omics, executor, args.dataset, args.high_mem, args.should_continue)
-        experiments_future = executor.submit(process_experiments, executor, args.dataset, args.high_mem, args.should_continue)
+        omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
+        experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)
 
         omics_future.result()
         experiments_future.result()