Skip to content

Commit 3cf23f8

Browse files
committed
Merge remote-tracking branch 'refs/remotes/origin/main'
2 parents 481385e + 618b21b commit 3cf23f8

21 files changed

Lines changed: 5109 additions & 2160 deletions

build/build_all.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
6262
print('running...'+filename)
6363
env = os.environ.copy()
6464
if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
65-
print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
65+
print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets')
6666
docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
6767
else:
6868
docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -85,7 +85,7 @@ def run_docker_cmd(cmd_arr,filename):
8585
# All output and errors are logged at local/docker.log
8686
# '''
8787
# compose_file = 'build/docker/docker-compose.yml'
88-
# compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel']
88+
# compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel']
8989
# log_file_path = 'local/docker.log'
9090
# env = os.environ.copy()
9191
# print(f"Docker-compose is building all images. View output in {log_file_path}.")
@@ -125,7 +125,8 @@ def process_docker(datasets):
125125
'sarcpdo': ['sarcpdo'],
126126
'cptac': ['cptac'],
127127
'genes': ['genes'],
128-
'upload': ['upload']
128+
'upload': ['upload'],
129+
'liverpdo': ['liverpdo']
129130
}
130131

131132
# Collect container names to build based on the datasets provided. Always build genes and upload.
@@ -134,7 +135,7 @@ def process_docker(datasets):
134135
datasets_to_build.extend(dataset_map.get(dataset, []))
135136

136137
# Build the docker-compose command, adding specific datasets
137-
compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
138+
compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
138139

139140
log_file_path = 'local/docker.log'
140141
env = os.environ.copy()
@@ -273,7 +274,7 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
273274
docker_run.extend(['upload'])
274275
if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
275276
docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
276-
if name == "Map_Drugs" or name == "Map_Samples":
277+
if name in ["Map_Drugs", "Map_Samples", "Align_Drug_Descriptors"]:
277278
docker_run.extend(['upload'])
278279
if 'GITHUB_TOKEN' in env and name == "GitHub":
279280
docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])
@@ -330,9 +331,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
330331
# Error handling for required tokens
331332
if args.figshare and not figshare_token:
332333
raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
333-
if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
334+
if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo','liverpdo']) and not synapse_auth_token:
334335
if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
335-
raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
336+
raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets.")
336337

337338
######
338339
### Begin Pipeline
@@ -409,7 +410,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
409410
# if args.figshare or args.validate:
410411
# FigShare File Prefixes:
411412

412-
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
413+
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
413414
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
414415
if "broad_sanger" in datasets:
415416
prefixes.extend(broad_sanger_datasets)
@@ -445,6 +446,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
445446

446447
drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
447448
run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
449+
450+
drug_mapping_command_2 = ['python3', 'scripts/align_drug_descriptors.py', '--local_dir', "/tmp", '--version', args.version]
451+
run_docker_upload_cmd(drug_mapping_command_2, 'all_files_dir', 'Align_Drug_Descriptors', args.version)
448452

449453
# Run schema checker - This will always run if uploading data.
450454
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets

build/build_dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
4848
'genes': ['genes'],
4949
'upload': ['upload'],
5050
'crcpdo': ['crcpdo'],
51-
'bladderpdo': ['bladderpdo']
51+
'bladderpdo': ['bladderpdo'],
52+
'liverpdo': ['liverpdo']
5253
}
5354

5455
# Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -59,7 +60,7 @@ def process_docker(dataset,validate):
5960

6061
datasets_to_build.extend(dataset_map.get(dataset, []))
6162

62-
compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
63+
compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build
6364

6465
log_file_path = 'local/docker.log'
6566
env = os.environ.copy()
@@ -131,7 +132,8 @@ def process_omics(executor, dataset, should_continue):
131132
'sarcpdo': ['mutations', 'transcriptomics'],
132133
'pancpdo': ['transcriptomics'],
133134
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
134-
'crcpdo':['copy_number', 'mutations', 'transcriptomics']
135+
'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
136+
'liverpdo':['copy_number', 'mutations', 'transcriptomics']
135137
}
136138

137139
expected_omics = dataset_omics_files.get(dataset, [])

build/docker/Dockerfile.liverpdo

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
FROM r-base:4.4.1
2+
3+
ENV DEBIAN_FRONTEND=noninteractive
4+
5+
# Update package list and install required packages
6+
RUN apt-get update && \
7+
apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
8+
zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
9+
10+
# Download and compile Python 3.10 with shared library support
11+
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
12+
tar -xf Python-3.10.12.tgz && \
13+
cd Python-3.10.12 && \
14+
./configure --enable-optimizations --enable-shared && \
15+
make -j$(nproc) && \
16+
make altinstall && \
17+
cd .. && \
18+
rm -rf Python-3.10.12.tgz Python-3.10.12
19+
20+
# Set Python 3.10 as default
21+
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
22+
ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
23+
24+
# Update library paths for Python shared library
25+
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
26+
27+
# Create a Python virtual environment
28+
#RUN python3 -m venv /opt/venv
29+
#RUN /opt/venv/bin/pip install --upgrade pip
30+
31+
# Set environment variables for reticulate
32+
#ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
33+
ENV PYTHONPATH=/app#"${PYTHONPATH}:/app"
34+
WORKDIR /app
35+
36+
# Set MPLCONFIGDIR to a writable directory and create it.
37+
ENV MPLCONFIGDIR=/app/tmp/matplotlib
38+
RUN mkdir -p /app/tmp/matplotlib
39+
40+
41+
42+
43+
# installing python libraries
44+
ADD build/liverpdo/requirements.txt .
45+
#RUN /opt/venv/bin/pip3 install -r requirements.txt
46+
RUN pip3 install -r requirements.txt
47+
48+
RUN python3 --version
49+
50+
#ENV PATH="/opt/venv/bin:$PATH"
51+
52+
ADD build/liverpdo/*py ./
53+
ADD build/liverpdo/*sh ./
54+
55+
ADD build/utils/* ./

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,13 @@ services:
112112
args:
113113
HTTPS_PROXY: ${HTTPS_PROXY}
114114
platform: linux/amd64
115-
image: crcpdo:latest
115+
image: crcpdo:latest
116+
117+
liverpdo:
118+
build:
119+
context: ../../
120+
dockerfile: build/docker/Dockerfile.liverpdo
121+
args:
122+
HTTPS_PROXY: ${HTTPS_PROXY}
123+
platform: linux/amd64
124+
image: liverpdo:latest

build/hcmi/02-getHCMIData.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -613,10 +613,13 @@ def write_dataframe_to_csv(dataframe, outname):
613613
-------
614614
None
615615
"""
616+
dataframe = dataframe.to_pandas()
617+
dataframe = dataframe.drop_duplicates()
618+
616619
if('gz' in outname):
617-
dataframe.to_pandas().to_csv(outname,compression='gzip',index=False)
620+
dataframe.to_csv(outname,compression='gzip',index=False)
618621
else:
619-
dataframe.to_pandas().to_csv(outname,index=False)
622+
dataframe.to_csv(outname,index=False)
620623
return
621624

622625
def main():

build/hcmi/README.md

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,38 @@
11
## HCMI Data
22

3-
Here we will store the scripts required to process the data from the [Human Cancer Models Initiative](https://ocg.cancer.gov/programs/HCMI)
3+
Here we will store the scripts required to process the data from the
4+
[Human Cancer Models
5+
Initiative](https://ocg.cancer.gov/programs/HCMI).
6+
7+
Currently all data collected is part of the [HCMI-CMDC Project on the
8+
GDC](https://portal.gdc.cancer.gov/analysis_page?app=Projects). To
9+
update:
10+
11+
1. Navigate to the [GDC Data
12+
Portal](https://portal.gdc.cancer.gov/analysis_page?app=Projects),
13+
and select 'HCMI-CMDC'
14+
2. Click on the 'Cases' button, and select the download button where
15+
it lists the number of files.
16+
3. This will download the ENTIRE manifest
17+
4. Filter the manifest for RNASeq, WGS mutations, and copy number
18+
calls using the following command:
19+
```
20+
cat ~/gdc_manifest.2025-07-08.091940.txt | grep 'mask\|copy\|rna_seq\|md5'
21+
| grep 'txt\|maf\|tsv\|md5' > new_manifest.txt
22+
cp new_manifest.txt full_manifest.txt
23+
24+
```
425

526

6-
Currently the tool require two steps to build the data:
27+
Currently the tool requires two scripts to build the data:
728
```
829
python 01-createHCMISamplesFile.py
930
10-
python 02-getHCMIData.py -m transcriptomics_gdc_manifest.txt -t transcriptomics -o transcriptomics.csv
31+
python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o transcriptomics.csv
1132
12-
python 02-getHCMIData.py -m mutations_manifest_gdc.txt -t mutations -o mutations.csv
33+
python 02-getHCMIData.py -m full_manifest.txt -t mutations -o mutations.csv
1334
14-
python 02-getHCMIData.py -m _manifest.txt -t copy_number -o copy_number.csv
35+
python 02-getHCMIData.py -m full_manifest.txt -t copy_number -o copy_number.csv
1536
1637
17-
```
38+
```

0 commit comments

Comments
 (0)