Skip to content

Commit a760c17

Browse files
authored
Merge pull request #426 from PNNL-CompBio/novartisPDX-omics
NovartisPDX
2 parents fbab783 + cf87d17 commit a760c17

15 files changed

Lines changed: 670 additions & 3 deletions

build/build_dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
4848
'upload': ['upload'],
4949
'crcpdo': ['crcpdo'],
5050
'bladderpdo': ['bladderpdo'],
51-
'liverpdo': ['liverpdo']
51+
'liverpdo': ['liverpdo'],
52+
'novartispdx': ['novartispdx']
5253
}
5354

5455
# Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -131,6 +132,7 @@ def process_omics(executor, dataset, should_continue):
131132
'pancpdo': ['transcriptomics'],
132133
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
133134
'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
135+
'novartispdx':['copy_number', 'mutations', 'transcriptomics'],
134136
'liverpdo':['copy_number', 'mutations', 'transcriptomics']
135137
}
136138

build/crcpdo/02-omics-crcpdo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
209209
parser = argparse.ArgumentParser(description='###')
210210

211211
# arguments for file paths
212-
parser.add_argument('-g', '--genes', type=str, default=None, help='Path to transcriptomics genes.csv. Can be obtained using this docker container: https://github.com/PNNL-CompBio/coderdata/blob/0225c52b861dcd6902521228731c54a61768bcd6/build/genes/README.md#L4')
212+
parser.add_argument('-g', '--genes', type=str, default=None, help='Path to genes.csv. Can be obtained using this docker container: https://github.com/PNNL-CompBio/coderdata/blob/0225c52b861dcd6902521228731c54a61768bcd6/build/genes/README.md#L4')
213213
parser.add_argument('-i', '--ids', type=str, default=None, help='Path to sample Ids')
214214

215215
# arguments for what data to process
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
FROM r-base:4.4.1

ENV DEBIAN_FRONTEND=noninteractive

# Build dependencies for compiling Python 3.10 from source.
# Clean the apt cache afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
    zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev && \
    rm -rf /var/lib/apt/lists/*

# Download and compile Python 3.10 with shared library support
# (--enable-shared builds libpython, presumably for embedding from R,
# e.g. via reticulate — the original file carried commented-out
# reticulate/venv configuration; confirm before removing --enable-shared).
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
    tar -xf Python-3.10.12.tgz && \
    cd Python-3.10.12 && \
    ./configure --enable-optimizations --enable-shared && \
    make -j$(nproc) && \
    make altinstall && \
    cd .. && \
    rm -rf Python-3.10.12.tgz Python-3.10.12

# Set the freshly built Python 3.10 as the default python3/pip3.
# -f overwrites any python3/pip3 links the base image already provides,
# so the build does not fail if they exist.
RUN ln -sf /usr/local/bin/python3.10 /usr/bin/python3 && \
    ln -sf /usr/local/bin/pip3.10 /usr/bin/pip3

# Register the shared libpython with the dynamic linker.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig

# Make /app importable by the scripts copied below.
# NOTE(review): the original line read ENV PYTHONPATH=/app#"${PYTHONPATH}:/app" —
# Docker does not treat a mid-line '#' as a comment, so PYTHONPATH was set to
# the whole garbled string and /app was never actually on the path.
ENV PYTHONPATH=/app
WORKDIR /app

# Set MPLCONFIGDIR to a writable directory and create it.
ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

# Install python libraries (COPY, not ADD, for plain local files).
COPY build/novartispdx/requirements.txt .
RUN pip3 install -r requirements.txt

# Sanity check: confirm the compiled interpreter is the one on PATH.
RUN python3 --version

COPY build/novartispdx/*py ./
COPY build/novartispdx/*sh ./

COPY build/utils/* ./

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,4 +121,13 @@ services:
121121
args:
122122
HTTPS_PROXY: ${HTTPS_PROXY}
123123
platform: linux/amd64
124-
image: liverpdo:latest
124+
image: liverpdo:latest
125+
126+
novartispdx:
127+
build:
128+
context: ../../
129+
dockerfile: build/docker/Dockerfile.novartispdx
130+
args:
131+
HTTPS_PROXY: ${HTTPS_PROXY}
132+
platform: linux/amd64
133+
image: novartispdx:latest
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import pandas as pd
2+
import synapseclient
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
def get_complete_novartispdx_sample_sheet(synObject):
    """Download and merge the Novartis PDX sample files into one samplesheet.

    Pulls every file under Synapse folder syn66275995 except the drug-info
    files, concatenates them, and reformats the columns to the coderdata
    samplesheet schema.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        An authenticated Synapse client.

    Returns
    -------
    pd.DataFrame
        Columns: other_id, common_name, other_id_source, other_names,
        cancer_type, species, model_type.
    """
    files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file']))
    synIDs = [item['id'] for item in files]

    # These three files in the same folder hold drug info, not samples.
    for drug_info_id in ('syn66276102', 'syn66276098', 'syn66477971'):
        synIDs.remove(drug_info_id)

    # Download each sample file, then concatenate once — repeated
    # pd.concat inside the loop is quadratic in the number of files.
    frames = [pd.read_csv(synObject.get(syn_id).path) for syn_id in synIDs]
    allsamplesheet = pd.concat(frames, ignore_index=True)

    # Rename columns and reformat cancer type from the CANCER_HISTOLOGY column.
    allsamplesheet['other_id'] = allsamplesheet['Sample ID']
    allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID']
    # Lowercase, then drop the first whitespace-delimited word of the
    # histology string and keep the remainder (presumably a tissue/organ
    # prefix — TODO confirm against the source sheets).  The raw string
    # fixes the invalid "\s" escape in the original non-raw pattern.
    allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat=r"^[^\s]*\s", expand=True)[1]
    allsamplesheet['species'] = "Homo Sapiens(human)"
    allsamplesheet['model_type'] = 'patient derived xenograft'
    allsamplesheet['other_id_source'] = 'Synapse'
    allsamplesheet['other_names'] = ''
    finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']]
    return finalsamplesheet
33+
34+
if __name__ == "__main__":

    # Command-line interface for building the Novartis PDX samplesheet.
    arg_parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet")
    arg_parser.add_argument('-t', '--token', type=str, help='Synapse Token')
    arg_parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Use this to provide previous sample file, will run sample file generation")
    cli_args = arg_parser.parse_args()

    print("Logging into Synapse")
    syn_client = synapseclient.login(authToken=cli_args.token)

    # Build the combined samplesheet from Synapse.
    samplesheet = get_complete_novartispdx_sample_sheet(syn_client)

    # Continue improve_sample_id numbering from a previous samplesheet when
    # one was supplied; otherwise start numbering from 1.
    if cli_args.prevSamples:
        prev_max_improve_id = max(pd.read_csv(cli_args.prevSamples).improve_sample_id)
    else:
        prev_max_improve_id = 0

    n_samples = samplesheet.shape[0]
    samplesheet['improve_sample_id'] = range(prev_max_improve_id + 1, prev_max_improve_id + n_samples + 1)

    samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False)
57+
58+

0 commit comments

Comments
 (0)