Skip to content

Commit a760c17

Browse files
authored
Merge pull request #426 from PNNL-CompBio/novartisPDX-omics
NovartisPDX
2 parents fbab783 + cf87d17 commit a760c17

15 files changed

Lines changed: 670 additions & 3 deletions

build/build_dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
4848
'upload': ['upload'],
4949
'crcpdo': ['crcpdo'],
5050
'bladderpdo': ['bladderpdo'],
51-
'liverpdo': ['liverpdo']
51+
'liverpdo': ['liverpdo'],
52+
'novartispdx': ['novartispdx']
5253
}
5354

5455
# Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -131,6 +132,7 @@ def process_omics(executor, dataset, should_continue):
131132
'pancpdo': ['transcriptomics'],
132133
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
133134
'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
135+
'novartispdx':['copy_number', 'mutations', 'transcriptomics'],
134136
'liverpdo':['copy_number', 'mutations', 'transcriptomics']
135137
}
136138

build/crcpdo/02-omics-crcpdo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
209209
parser = argparse.ArgumentParser(description='###')
210210

211211
# arguments for file paths
212-
parser.add_argument('-g', '--genes', type=str, default=None, help='Path to transcriptomics genes.csv. Can be obtained using this docker container: https://github.com/PNNL-CompBio/coderdata/blob/0225c52b861dcd6902521228731c54a61768bcd6/build/genes/README.md#L4')
212+
parser.add_argument('-g', '--genes', type=str, default=None, help='Path to genes.csv. Can be obtained using this docker container: https://github.com/PNNL-CompBio/coderdata/blob/0225c52b861dcd6902521228731c54a61768bcd6/build/genes/README.md#L4')
213213
parser.add_argument('-i', '--ids', type=str, default=None, help='Path to sample Ids')
214214

215215
# arguments for what data to process
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
FROM r-base:4.4.1

ENV DEBIAN_FRONTEND=noninteractive

# Build dependencies for compiling Python 3.10 from source.
# Clean the apt cache afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
    zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev && \
    rm -rf /var/lib/apt/lists/*

# Download and compile Python 3.10 with shared library support
# (--enable-shared builds libpython, presumably for embedding from R,
# e.g. via reticulate — the original file carried commented-out
# reticulate/venv configuration; confirm before removing --enable-shared).
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
    tar -xf Python-3.10.12.tgz && \
    cd Python-3.10.12 && \
    ./configure --enable-optimizations --enable-shared && \
    make -j$(nproc) && \
    make altinstall && \
    cd .. && \
    rm -rf Python-3.10.12.tgz Python-3.10.12

# Set the freshly built Python 3.10 as the default python3/pip3.
# -f overwrites any python3/pip3 links the base image already provides,
# so the build does not fail if they exist.
RUN ln -sf /usr/local/bin/python3.10 /usr/bin/python3 && \
    ln -sf /usr/local/bin/pip3.10 /usr/bin/pip3

# Register the shared libpython with the dynamic linker.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig

# Make /app importable by the scripts copied below.
# NOTE(review): the original line read ENV PYTHONPATH=/app#"${PYTHONPATH}:/app" —
# Docker does not treat a mid-line '#' as a comment, so PYTHONPATH was set to
# the whole garbled string and /app was never actually on the path.
ENV PYTHONPATH=/app
WORKDIR /app

# Set MPLCONFIGDIR to a writable directory and create it.
ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

# Install python libraries (COPY, not ADD, for plain local files).
COPY build/novartispdx/requirements.txt .
RUN pip3 install -r requirements.txt

# Sanity check: confirm the compiled interpreter is the one on PATH.
RUN python3 --version

COPY build/novartispdx/*py ./
COPY build/novartispdx/*sh ./

COPY build/utils/* ./

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,4 +121,13 @@ services:
121121
args:
122122
HTTPS_PROXY: ${HTTPS_PROXY}
123123
platform: linux/amd64
124-
image: liverpdo:latest
124+
image: liverpdo:latest
125+
126+
novartispdx:
127+
build:
128+
context: ../../
129+
dockerfile: build/docker/Dockerfile.novartispdx
130+
args:
131+
HTTPS_PROXY: ${HTTPS_PROXY}
132+
platform: linux/amd64
133+
image: novartispdx:latest
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import pandas as pd
2+
import synapseclient
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
def get_complete_novartispdx_sample_sheet(synObject):
    """Download and merge the Novartis PDX sample files into one samplesheet.

    Pulls every file under Synapse folder syn66275995 except the drug-info
    files, concatenates them, and reformats the columns to the coderdata
    samplesheet schema.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        An authenticated Synapse client.

    Returns
    -------
    pd.DataFrame
        Columns: other_id, common_name, other_id_source, other_names,
        cancer_type, species, model_type.
    """
    files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file']))
    synIDs = [item['id'] for item in files]

    # These three files in the same folder hold drug info, not samples.
    for drug_info_id in ('syn66276102', 'syn66276098', 'syn66477971'):
        synIDs.remove(drug_info_id)

    # Download each sample file, then concatenate once — repeated
    # pd.concat inside the loop is quadratic in the number of files.
    frames = [pd.read_csv(synObject.get(syn_id).path) for syn_id in synIDs]
    allsamplesheet = pd.concat(frames, ignore_index=True)

    # Rename columns and reformat cancer type from the CANCER_HISTOLOGY column.
    allsamplesheet['other_id'] = allsamplesheet['Sample ID']
    allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID']
    # Lowercase, then drop the first whitespace-delimited word of the
    # histology string and keep the remainder (presumably a tissue/organ
    # prefix — TODO confirm against the source sheets).  The raw string
    # fixes the invalid "\s" escape in the original non-raw pattern.
    allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat=r"^[^\s]*\s", expand=True)[1]
    allsamplesheet['species'] = "Homo Sapiens(human)"
    allsamplesheet['model_type'] = 'patient derived xenograft'
    allsamplesheet['other_id_source'] = 'Synapse'
    allsamplesheet['other_names'] = ''
    finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']]
    return finalsamplesheet
33+
34+
if __name__ == "__main__":

    # Command-line interface for building the Novartis PDX samplesheet.
    arg_parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet")
    arg_parser.add_argument('-t', '--token', type=str, help='Synapse Token')
    arg_parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Use this to provide previous sample file, will run sample file generation")
    cli_args = arg_parser.parse_args()

    print("Logging into Synapse")
    syn_client = synapseclient.login(authToken=cli_args.token)

    # Build the combined samplesheet from Synapse.
    samplesheet = get_complete_novartispdx_sample_sheet(syn_client)

    # Continue improve_sample_id numbering from a previous samplesheet when
    # one was supplied; otherwise start numbering from 1.
    if cli_args.prevSamples:
        prev_max_improve_id = max(pd.read_csv(cli_args.prevSamples).improve_sample_id)
    else:
        prev_max_improve_id = 0

    n_samples = samplesheet.shape[0]
    samplesheet['improve_sample_id'] = range(prev_max_improve_id + 1, prev_max_improve_id + n_samples + 1)

    samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False)
57+
58+

0 commit comments

Comments
 (0)