
Commit 0c99ced

Merge branch 'main' into panc_pdo

2 parents: b5b1743 + bd584cc
45 files changed: 1174 additions & 633 deletions


.github/workflows/main.yml

Lines changed: 2 additions & 1 deletion
```diff
@@ -4,6 +4,7 @@ on:
   push:
     tags:
       - '*' # Triggers the workflow only on version tags
+  workflow_dispatch: # Allows manual triggering of the workflow

 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
@@ -44,4 +45,4 @@ jobs:
     steps:
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v4
```

build/README.md

Lines changed: 12 additions & 13 deletions
````diff
@@ -10,11 +10,10 @@ are added.
 
 ## build_all.py script
 
-This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare and pypi.
+This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare.
 
 It requires the following authorization tokens to be set in the local environment depending on the use case:
 `SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Join the [CoderData team](https://www.synapse.org/#!Team:3503472) on Synapse and generate an access token.
-`PYPI_TOKEN`: This token is required to upload to PyPI.
 `FIGSHARE_TOKEN`: This token is required to upload to Figshare.
 `GITHUB_TOKEN`: This token is required to upload to GitHub.
 
@@ -25,21 +24,20 @@ It requires the following authorization tokens to be set in the local environmen
 - `--omics`: Processes and builds the omics data files.
 - `--drugs`: Processes and builds the drug data files.
 - `--exp`: Processes and builds the experiment data files.
-- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate, figshare, or pypi commands.
+- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate or figshare commands.
 - `--validate`: Validates the generated datasets using the schema check scripts. This is automatically included if data upload occurs.
 - `--figshare`: Uploads the datasets to Figshare. FIGSHARE_TOKEN must be set in local environment.
-- `--pypi`: Uploads the package to PyPI. PYPI_TOKEN must be set in local environment.
 - `--high_mem`: Utilizes high memory mode for concurrent data processing. This has been successfully tested using 32 or more vCPUs.
 - `--dataset`: Specifies the datasets to process (default='broad_sanger,hcmi,beataml,mpnst,cptac').
-- `--version`: Specifies the version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for figshare and PyPI upload steps. This must be a higher version than previously published versions.
+- `--version`: Specifies the version number for the Figshare upload title (e.g., "0.1.29"). This must be a higher version than previously published versions.
 - `--github-username`: GitHub username matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
 - `--github-email`: GitHub email matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
 
 **Example usage**:
-- Build all datasets and upload to Figshare and PyPI and GitHub.
-  Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `PYPI_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
+- Build all datasets and upload to Figshare and GitHub.
+  Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
 ```bash
-python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
+python build/build_all.py --all --high_mem --validate --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
 ```
 
 - Build only the experiment files.
@@ -56,21 +54,22 @@ It requires the following authorization tokens to be set in the local environmen
 `SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Follow the directions above to gain access.
 
 Available arguments:
-- `--dataset`: Required. Name of the dataset to build.
+- `--dataset`: Required. Name of the dataset to build. At a minimum, this will build the docker images.
 - `--use_prev_dataset`: Optional. Prefix of the previous dataset for sample and drug ID continuation. The previous dataset files must be in the "local" directory.
-- `--validate`: Optional. Runs the schema checker on the built files.
+- `--build`: Optional. Build the desired dataset.
+- `--validate`: Optional. Run the schema checker on the built files.
 - `--continue`: Optional. Continues from where the build left off by skipping existing files in the "local" directory.
 Example usage:
 
 Build the broad_sanger dataset:
 ```bash
-python build/build_dataset.py --dataset broad_sanger
+python build/build_dataset.py --build --dataset broad_sanger
 ```
 Build the mpnst dataset continuing from broad_sanger sample and drug IDs:
 ```bash
-python build/build_dataset.py --dataset mpnst --use_prev_dataset broad_sanger
+python build/build_dataset.py --build --dataset mpnst --use_prev_dataset broad_sanger
 ```
-Build the hcmi dataset and run validation:
+Build the hcmi dataset and run schema validation:
 ```bash
 python build/build_dataset.py --dataset hcmi --validate
 ```
````

build/beatAML/GetBeatAML.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -466,8 +466,14 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
                              right_on='other_id',
                              how='left')
     mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
+
+    # Replace NaNs, round values, and convert to integers for specified columns
+    columns_to_convert = ['improve_sample_id', 'entrez_id']
+    mapped_df[columns_to_convert] = mapped_df[columns_to_convert].fillna(0).round().astype('int32')
+
     mapped_df['source'] = 'synapse'
     mapped_df['study'] = 'BeatAML'
+    mapped_df = mapped_df.drop_duplicates()
 
     final_dataframe = mapped_df.dropna()
     return final_dataframe
```
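The conversion added here addresses a standard pandas pitfall: a left merge leaves NaN in unmatched ID columns, which silently promotes them to float64, so the columns are filled, rounded, and cast back to integers before writing. A minimal standalone sketch of the same pattern (values are illustrative, not BeatAML data):

```python
import numpy as np
import pandas as pd

# A left merge with missing keys leaves NaN in ID columns and
# silently promotes them to float64.
df = pd.DataFrame({'improve_sample_id': [1.0, np.nan, 3.0],
                   'entrez_id': [7157.0, 672.0, np.nan]})

columns_to_convert = ['improve_sample_id', 'entrez_id']
# fillna(0) gives unmatched rows a sentinel value, round() guards
# against float noise, and astype('int32') restores integer dtype.
df[columns_to_convert] = df[columns_to_convert].fillna(0).round().astype('int32')

print(df['entrez_id'].tolist())  # [7157, 672, 0]
```

Note that rows filled with the 0 sentinel survive the later `dropna()`, so downstream consumers must treat 0 as "unmapped".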

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -70,7 +70,7 @@ def main():
     smiles= pl.DataFrame({'NSC':smiles['NSC'],'upper':upper})#smiles.with_columns(upper=upper)
     ##reduce to smiles only in current drugs
     # ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['isoSMILES']))
-    ssmiles = ssmiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
+    ssmiles = smiles.filter(~pl.col('upper').is_in(curdrugs['canSMILES']))
     pubchems = pubchems.filter(pl.col('NSC').is_in(ssmiles['NSC']))
     arr = set(pubchems['CID'])
 
```

build/broad_sanger/04b-nci60-updated.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -107,10 +107,11 @@ def main():
 
     finaldf = pl.DataFrame(
         {
-            'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
+            'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
             'improve_sample_id':molar['improve_sample_id'],
             'Drug':molar['improve_drug_id'],
-            'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            # 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
+            'study': "NCI60",
             'time':molar['time'],
             'time_unit':molar['time_unit'],
             'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB
```
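The DOSE expression above assumes `CONCENTRATION` stores log10(molar concentration): `10**a` recovers molarity, and the factor of 1,000,000 rescales to micromolar (uM) to match PharmacoDB. A quick check of that arithmetic with hand-picked values:

```python
# log10(M) values corresponding to 1 uM, 10 uM, and 100 uM.
log_molar = [-6.0, -5.0, -4.0]

# Same conversion as the build script: molar -> uM.
dose_um = [(10**a) * 1000000 for a in log_molar]
print(dose_um)  # approximately [1.0, 10.0, 100.0]
```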

build/build_all.py

Lines changed: 16 additions & 31 deletions
```diff
@@ -10,39 +10,37 @@
 import shutil
 import gzip
 from glob import glob
-from packaging import version
 import sys
 
 def main():
     parser=argparse.ArgumentParser(
-        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare and PyPI.",
+        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare.",
         epilog="""Examples of usage:
 
-Build all datasets in a high memory environment, validate them, and upload to Figshare and PyPI:
-python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.29
+Build all datasets in a high memory environment, validate them, and upload to Figshare:
+python build/build_all.py --all --high_mem --validate --figshare --version 0.1.29
 
 Build only experiment files. This assumes preceding steps (docker images, samples, omics, and drugs) have already been completed:
 python build/build_all.py --exp
 
 Validate all local files without building or uploading. These files must be located in ./local. Includes compression/decompression steps.
 python build/build_all.py --validate
 
-Upload the latest data to Figshare and PyPI (ensure tokens are set in the local environment):
-python build/build_all.py --figshare --pypi --version 0.1.30
+Upload the latest data to Figshare (ensure tokens are set in the local environment):
+python build/build_all.py --figshare --version 0.1.30
 """
     )
     parser.add_argument('--docker',dest='docker',default=False,action='store_true', help="Build all docker images.")
     parser.add_argument('--samples',dest='samples',default=False,action='store_true', help="Build all sample files.")
     parser.add_argument('--omics',dest='omics',default=False,action='store_true', help="Build all omics files.")
     parser.add_argument('--drugs',dest='drugs',default=False,action='store_true', help="Build all drug files")
     parser.add_argument('--exp',dest='exp',default=False,action='store_true', help="Build all experiment files.")
-    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if figshare or pypi arguments are included.")
+    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if the figshare argument is included.")
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
-    parser.add_argument('--pypi', action='store_true', help="Update PYPI Package with latest Figshare data. PYPI_TOKEN must be set in local environment.")
-    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate, figshare, or pypi commands.")
+    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands.")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,mpnst,cptac',help='Datasets to process. Defaults to all available.')
-    parser.add_argument('--version', type=str, required=False, help='Version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for Figshare and PyPI upload. This must be a higher version than previously published versions.')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx',help='Datasets to process. Defaults to all available.')
+    parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
 
@@ -120,6 +118,7 @@ def process_docker(datasets):
         'hcmi': ['hcmi'],
         'beataml': ['beataml'],
         'mpnst': ['mpnst'],
+        'mpnstpdx': ['mpnstpdx'],
         'cptac': ['cptac'],
         'genes': ['genes'],
         'upload': ['upload']
@@ -266,8 +265,6 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
     docker_run = ['docker', 'run', '--rm', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '-e', f"VERSION={version}"]
 
     # Add Appropriate Environment Variables
-    if 'PYPI_TOKEN' in env and name == 'PyPI':
-        docker_run.extend(['-e', f"PYPI_TOKEN={env['PYPI_TOKEN']}", 'upload'])
     if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
         docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
     if name == "validate":
@@ -308,16 +305,13 @@ def compress_file(file_path):
     #####
 
     figshare_token = os.getenv('FIGSHARE_TOKEN')
-    pypi_token = os.getenv('PYPI_TOKEN')
     synapse_auth_token = os.getenv('SYNAPSE_AUTH_TOKEN')
     github_token = os.getenv('GITHUB_TOKEN')
 
 
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if args.pypi and not pypi_token:
-        raise ValueError("PYPI_TOKEN environment variable is not set.")
     if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
             raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
@@ -394,7 +388,7 @@ def compress_file(file_path):
     ### Begin Upload and/or validation
     #####
 
-    if args.pypi or args.figshare or args.validate:
+    if args.figshare or args.validate:
         # FigShare File Prefixes:
         prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
         broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
@@ -405,23 +399,18 @@ def compress_file(file_path):
 
 
         figshare_token = os.getenv('FIGSHARE_TOKEN')
-        pypi_token = os.getenv('PYPI_TOKEN')
 
         all_files_dir = 'local/all_files_dir'
         if not os.path.exists(all_files_dir):
             os.makedirs(all_files_dir)
-
-        # Ensure pypi tokens are available
-        if args.pypi and not pypi_token:
-            raise ValueError("Required tokens (PYPI) are not set in environment variables.")
 
         # Ensure figshare tokens are available
         if args.figshare and not figshare_token:
             raise ValueError("Required tokens (FIGSHARE) are not set in environment variables.")
 
         # Ensure version is specified
-        if (args.figshare or args.pypi) and not args.version:
-            raise ValueError("Version must be specified when pushing to pypi or figshare")
+        if args.figshare and not args.version:
+            raise ValueError("Version must be specified when pushing to figshare")
 
         # Move relevant files to a designated directory
         for file in glob(os.path.join("local", '*.*')):
@@ -433,7 +422,7 @@ def compress_file(file_path):
             decompress_file(file)
 
         # Run schema checker - This will always run if uploading data.
-        schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
+        schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
         run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
 
         print("Validation complete. Proceeding with file compression/decompression adjustments")
@@ -453,13 +442,9 @@ def compress_file(file_path):
         figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
         run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
 
-        # Upload to PyPI using Docker
-        if args.pypi and args.version and pypi_token:
-            pypi_command = ['python3', 'scripts/push_to_pypi.py', '-y', '/tmp/figshare_latest.yml', '-d', 'coderdata/download/downloader.py', "-v", args.version]
-            run_docker_upload_cmd(pypi_command, 'all_files_dir', 'PyPI', args.version)
 
         # Push changes to GitHub using Docker
-        if args.version and args.figshare and args.pypi and pypi_token and figshare_token and github_token and args.github_username and args.github_email:
+        if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
             git_command = [
                 'bash', '-c', (
                     f'git config --global user.name "{args.github_username}" '
@@ -476,4 +461,4 @@ def compress_file(file_path):
 
 
 if __name__ == '__main__':
-    main()
+    main()
```
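With the PyPI branches removed, the upload guards collapse to a single dependency rule: pushing to Figshare requires both a token and an explicit `--version`. A sketch of how that guard behaves in isolation (flag names taken from the script; the parsed input is illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--figshare', action='store_true')
parser.add_argument('--version', type=str, required=False)

# Simulate: python build/build_all.py --figshare --version 0.1.30
args = parser.parse_args(['--figshare', '--version', '0.1.30'])

# Mirrors the guard in build_all.py: --figshare without --version fails fast.
if args.figshare and not args.version:
    raise ValueError("Version must be specified when pushing to figshare")
print(args.version)  # 0.1.30
```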

build/build_dataset.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -220,7 +220,7 @@ def run_docker_validate_cmd(cmd_arr, all_files_dir, name):
     Wrapper for 'docker run' command used during validation and uploads.
     '''
     env = os.environ.copy()
-    docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"]
+    docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '--platform=linux/amd64']
     docker_run.extend(['upload'])
     docker_run.extend(cmd_arr)
     print('Executing:', ' '.join(docker_run))
```
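The added `--platform=linux/amd64` pins the container to an amd64 image, which lets the same `upload` image run (under emulation) on arm64 hosts such as Apple Silicon. A sketch of the resulting command construction, with illustrative stand-ins for `os.environ` and the function arguments:

```python
# Illustrative values; in the script these come from os.environ and
# the run_docker_validate_cmd arguments.
env = {'PWD': '/home/user/coderdata'}
all_files_dir = 'all_files_dir'

docker_run = ['docker', 'run', '-v',
              f"{env['PWD']}/local/{all_files_dir}:/tmp",
              '--platform=linux/amd64']
docker_run.extend(['upload'])
print(' '.join(docker_run))
```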

build/cptac/getCptacData.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -380,11 +380,15 @@ def main():
             dat_files[dtype_key] = fdf2
         else:
             dat_files[dtype_key] = fdf.dropna()
+
         print(dtype_key)
 
     # Now concatenate all the cancers into a single file
     for dtype_key, df in dat_files.items():
         print('Saving ' + "cptac_" + dtype_key + '.csv.gz' + ' file')
+        print(df.to_string())
+        df['entrez_id'] = df['entrez_id'].fillna(0)
+        df['entrez_id'] = df['entrez_id'].astype(int)
         df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip')
 
 if __name__ == '__main__':
```

build/docker/Dockerfile.upload

Lines changed: 3 additions & 3 deletions
```diff
@@ -2,10 +2,10 @@ FROM python:3.9
 
 WORKDIR /usr/src/app
 
-RUN python -m pip install --upgrade pip setuptools wheel twine packaging pyyaml requests linkml
+RUN python -m pip install --upgrade pip pyyaml requests linkml
 
 RUN apt-get update && apt-get install -y git
 
 
-COPY ./schema /usr/src/app/schema
-ADD scripts/check_schema.py ./
+RUN git clone https://github.com/PNNL-CompBio/coderdata.git
+WORKDIR /usr/src/app/coderdata
```
