
Commit a69bd57

Merge pull request #413 from PNNL-CompBio/liver_pdo
Liver pdo
2 parents 004b730 + 4a52046

15 files changed: 927 additions & 12 deletions

build/build_all.py

Lines changed: 9 additions & 8 deletions
@@ -40,7 +40,7 @@ def main():
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
     parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
     parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
     if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -85,7 +85,7 @@ def run_docker_cmd(cmd_arr,filename):
     # All output and errors are logged at local/docker.log
     # '''
     # compose_file = 'build/docker/docker-compose.yml'
-    # compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel']
+    # compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel']
     # log_file_path = 'local/docker.log'
     # env = os.environ.copy()
     # print(f"Docker-compose is building all images. View output in {log_file_path}.")
@@ -125,7 +125,8 @@ def process_docker(datasets):
         'sarcpdo': ['sarcpdo'],
         'cptac': ['cptac'],
         'genes': ['genes'],
-        'upload': ['upload']
+        'upload': ['upload'],
+        'liverpdo': ['liverpdo']
     }

     # Collect container names to build based on the datasets provided. Always build genes and upload.
@@ -134,7 +135,7 @@ def process_docker(datasets):
         datasets_to_build.extend(dataset_map.get(dataset, []))

     # Build the docker-compose command, adding specific datasets
-    compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build

     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -330,9 +331,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
 # Error handling for required tokens
 if args.figshare and not figshare_token:
     raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
+if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo','liverpdo']) and not synapse_auth_token:
     if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-        raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
+        raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets.")

 ######
 ### Begin Pipeline
@@ -409,7 +410,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
 # if args.figshare or args.validate:
 # FigShare File Prefixes:

-prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
+prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
 broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
 if "broad_sanger" in datasets:
     prefixes.extend(broad_sanger_datasets)
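
With liverpdo added to the default dataset list and to the token check above, a local build limited to this dataset might look like the sketch below. The --docker and --samples flags are inferred from the args.docker/args.samples handling visible in this diff, and the token value is a placeholder:

    export SYNAPSE_AUTH_TOKEN=<your-synapse-personal-access-token>
    python build/build_all.py --docker --samples --dataset liverpdo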

build/build_dataset.py

Lines changed: 5 additions & 3 deletions
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
         'genes': ['genes'],
         'upload': ['upload'],
         'crcpdo': ['crcpdo'],
-        'bladderpdo': ['bladderpdo']
+        'bladderpdo': ['bladderpdo'],
+        'liverpdo': ['liverpdo']
     }

     # Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -59,7 +60,7 @@ def process_docker(dataset,validate):

     datasets_to_build.extend(dataset_map.get(dataset, []))

-    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build

     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -131,7 +132,8 @@ def process_omics(executor, dataset, should_continue):
         'sarcpdo': ['mutations', 'transcriptomics'],
         'pancpdo': ['transcriptomics'],
         'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
-        'crcpdo':['copy_number', 'mutations', 'transcriptomics']
+        'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
+        'liverpdo':['copy_number', 'mutations', 'transcriptomics']
     }

     expected_omics = dataset_omics_files.get(dataset, [])
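
The new liverpdo entry in dataset_omics_files means validation will expect copy_number, mutations, and transcriptomics outputs for this dataset. A hypothetical single-dataset run, assuming build_dataset.py exposes a dataset argument analogous to build_all.py (its argparse block is not part of this diff), might be:

    python build/build_dataset.py --dataset liverpdo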

build/docker/Dockerfile.liverpdo

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+FROM r-base:4.4.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Update package list and install required packages
+RUN apt-get update && \
+    apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
+    zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
+
+# Download and compile Python 3.10 with shared library support
+RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
+    tar -xf Python-3.10.12.tgz && \
+    cd Python-3.10.12 && \
+    ./configure --enable-optimizations --enable-shared && \
+    make -j$(nproc) && \
+    make altinstall && \
+    cd .. && \
+    rm -rf Python-3.10.12.tgz Python-3.10.12
+
+# Set Python 3.10 as default
+RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
+    ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
+
+# Update library paths for Python shared library
+RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
+
+# Create a Python virtual environment
+#RUN python3 -m venv /opt/venv
+#RUN /opt/venv/bin/pip install --upgrade pip
+
+# Set environment variables for reticulate
+#ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
+ENV PYTHONPATH=/app
+WORKDIR /app
+
+# Set MPLCONFIGDIR to a writable directory and create it.
+ENV MPLCONFIGDIR=/app/tmp/matplotlib
+RUN mkdir -p /app/tmp/matplotlib
+
+# installing python libraries
+ADD build/liverpdo/requirements.txt .
+#RUN /opt/venv/bin/pip3 install -r requirements.txt
+RUN pip3 install -r requirements.txt
+
+RUN python3 --version
+
+#ENV PATH="/opt/venv/bin:$PATH"
+
+ADD build/liverpdo/*py ./
+ADD build/liverpdo/*sh ./
+
+ADD build/utils/* ./
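
A quick way to sanity-check the image outside the build scripts is a direct build from the repository root; the context and tag below are assumptions that mirror the compose service defined in the next file:

    docker build -f build/docker/Dockerfile.liverpdo -t liverpdo:latest .
    docker run --rm liverpdo:latest python3 --version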

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
@@ -112,4 +112,13 @@ services:
       args:
         HTTPS_PROXY: ${HTTPS_PROXY}
     platform: linux/amd64
-    image: crcpdo:latest
+    image: crcpdo:latest
+
+  liverpdo:
+    build:
+      context: ../../
+      dockerfile: build/docker/Dockerfile.liverpdo
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: liverpdo:latest
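
This service mirrors what process_docker now constructs with the updated 'docker compose' syntax; the same build can be run by hand from the repo root:

    docker compose -f build/docker/docker-compose.yml build liverpdo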
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+# import required packages
+import pandas as pd
+import numpy as np
+import os
+import gzip
+import requests
+import argparse
+import synapseclient
+
+
+## Download the samples data from Synapse
+def download_samples_data(synID:str, save_path:str = None, synToken:str = None):
+    """
+    Download samples data from Synapse at synapseID syn64961953. Requires a Synapse token, which requires you to make a Synapse account
+    and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens
+
+    Parameters
+    ----------
+    synID : string
+        SynapseID of the dataset to download. Default is the synapseID of the samples dataset.
+
+    save_path : string
+        Local path where the downloaded file will be saved.
+
+    synToken : string
+        Synapse Personal Access Token of the user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens
+
+    Returns
+    -------
+    str
+        Filepath to the downloaded Excel file
+    """
+    syn = synapseclient.Synapse()
+    syn.login(authToken=synToken)
+
+    # Obtain a pointer and download the data
+    downloaded_data = syn.get(entity=synID, downloadLocation=save_path)
+
+    # Get the path to the local copy of the data file
+    samples_filepath = downloaded_data.path
+    return samples_filepath
+
+
+def map_substring(s, dict_map):
+    # Return the mapped value for the first dictionary key that occurs as a substring of s; NaN if none match.
+    for key in dict_map.keys():
+        if key in s:
+            return dict_map[key]
+    return np.nan
+
+
+### create sample sheet function
+def generate_sample_file(samples_data_path:str = None, prev_samples_path:str = "") -> pd.DataFrame:
+    """
+    Creates the sample file from the samples data Excel file. Checks the input sample file against previous sample files to make sure
+    there are no clashing sample names, and assigns improve_sample_id values starting from where the previous sample sheet left off.
+
+    Parameters
+    ----------
+    samples_data_path : string
+        Path to samples data from https://pmc.ncbi.nlm.nih.gov/articles/PMC10949980/#_ad93_. Supplementary Tables S1-S13
+
+    prev_samples_path : string
+        Path to the previous sample sheet.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing the combined samples data.
+    """
+    # read in the samples Excel file
+    samples_excel = pd.ExcelFile(samples_data_path)
+    clinical_info_df = pd.read_excel(samples_excel, 'S1. Clinical information')  # table with samples information
+
+    # read in the previous sample file
+    if prev_samples_path != "":
+        prev_samples = pd.read_csv(prev_samples_path)
+
+    # formatting table: keep only rows 3:68, since the rest are formatted oddly and hold no information
+    clinical_info_df = clinical_info_df.iloc[3:68]
+    samples_df = pd.DataFrame({'other_id': clinical_info_df.iloc[:, 0]}).reset_index(drop=True)  # first column holds the sample identifiers
+    samples_df['common_name'] = samples_df['other_id']
+    ctype_dict = {'HCC': 'hepatocellular carcinoma',
+                  'ICC': 'intrahepatic cholangiocarcinoma',
+                  'CHC': 'combined hepatocellular-cholangiocarcinoma',
+                  'HB': 'hepatoblastoma'}
+    samples_df['cancer_type'] = samples_df['other_id'].apply(lambda x: map_substring(x, ctype_dict))
+    samples_df['other_id_source'] = "Synapse"
+    samples_df['species'] = "Homo sapiens (Human)"
+    samples_df['model_type'] = "patient derived organoid"
+
+    # check that no other_id clashes with previous sample names
+    if prev_samples_path != "":
+        if samples_df.other_id.isin(prev_samples.other_id).any():
+            print("Duplicate id names detected. Cannot proceed with generating sample sheet until resolved.")
+            exit()
+    if prev_samples_path == "":
+        maxval = 0
+    else:
+        maxval = max(prev_samples.improve_sample_id)
+    samples_df['improve_sample_id'] = samples_df.index + maxval + 1  # index plus 1 creates a counter starting from the previous max value
+    return samples_df
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='###')
+
+    parser.add_argument('-D', '--download', action='store_true', default=False, help='Download RNA seq and sequencing data from GEO and supplemental materials from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#mmc2')
+    parser.add_argument('-t', '--token', type=str, default=None, help='Synapse Token')
+    parser.add_argument('-i', '--synapseID', type=str, default="syn66593307", help='SynapseID of data to download')
+
+    parser.add_argument('-s', '--samples', action='store_true', default=False, help='Only generate samples, requires previous samples')
+    parser.add_argument('-p', '--prevSamples', nargs='?', type=str, default='', const='', help='Use this to provide previous sample file')
+
+    args = parser.parse_args()
+
+    if args.download:
+        if args.token is None:
+            print("No synapse download token was provided. Cannot download data.")
+            exit()
+        else:
+            print("Downloading Files from Synapse.")
+            # Download samples data
+            samples_download_path = download_samples_data(synID=args.synapseID, synToken=args.token, save_path="/tmp")
+
+    if args.samples:
+        # assumes --download ran in this invocation, so samples_download_path is defined
+        if args.prevSamples is None or args.prevSamples == '':
+            print("No previous samples file provided. Starting improve_sample_id from 1. Running sample file generation.")
+            sample_sheet = generate_sample_file(samples_data_path=samples_download_path)
+        else:
+            print("Previous sample sheet {} detected. Running sample file generation and checking for duplicate IDs.".format(args.prevSamples))
+            sample_sheet = generate_sample_file(samples_data_path=samples_download_path, prev_samples_path=args.prevSamples)
+        sample_sheet.to_csv("/tmp/liverpdo_samples.csv", index=False)

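A sketch of how this script might be invoked inside the liverpdo container; the script's filename is not shown in this excerpt, so the name below is a placeholder, and /tmp is the bind-mounted local/ directory per the docker run command in build_all.py. Note that -s relies on -D having run in the same invocation, since samples_download_path is only set by the download step:

    python3 <samples_script>.py -D -t $SYNAPSE_AUTH_TOKEN -s -p /tmp/prev_samples.csv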