
Commit a69bd57

Merge pull request #413 from PNNL-CompBio/liver_pdo
Liver pdo
2 parents 004b730 + 4a52046

15 files changed: 927 additions & 12 deletions

build/build_all.py

Lines changed: 9 additions & 8 deletions
@@ -40,7 +40,7 @@ def main():
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
     parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
     parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
     if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -85,7 +85,7 @@ def run_docker_cmd(cmd_arr,filename):
     # All output and errors are logged at local/docker.log
     # '''
     # compose_file = 'build/docker/docker-compose.yml'
-    # compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel']
+    # compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel']
     # log_file_path = 'local/docker.log'
     # env = os.environ.copy()
     # print(f"Docker-compose is building all images. View output in {log_file_path}.")
@@ -125,7 +125,8 @@ def process_docker(datasets):
         'sarcpdo': ['sarcpdo'],
         'cptac': ['cptac'],
         'genes': ['genes'],
-        'upload': ['upload']
+        'upload': ['upload'],
+        'liverpdo': ['liverpdo']
     }

     # Collect container names to build based on the datasets provided. Always build genes and upload.
@@ -134,7 +135,7 @@ def process_docker(datasets):
         datasets_to_build.extend(dataset_map.get(dataset, []))

     # Build the docker-compose command, adding specific datasets
-    compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build

     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -330,9 +331,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
 # Error handling for required tokens
 if args.figshare and not figshare_token:
     raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
+if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo','liverpdo']) and not synapse_auth_token:
     if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-        raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
+        raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets.")

 ######
 ### Begin Pipeline
@@ -409,7 +410,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
 # if args.figshare or args.validate:
 # FigShare File Prefixes:

-prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
+prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
 broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
 if "broad_sanger" in datasets:
     prefixes.extend(broad_sanger_datasets)
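
With liverpdo added to the default dataset list and to the token check above, a local build limited to this dataset might look like the sketch below. The --docker and --samples flags are inferred from the args.docker/args.samples handling visible in this diff, and the token value is a placeholder:

    export SYNAPSE_AUTH_TOKEN=<your-synapse-personal-access-token>
    python build/build_all.py --docker --samples --dataset liverpdo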

build/build_dataset.py

Lines changed: 5 additions & 3 deletions
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
         'genes': ['genes'],
         'upload': ['upload'],
         'crcpdo': ['crcpdo'],
-        'bladderpdo': ['bladderpdo']
+        'bladderpdo': ['bladderpdo'],
+        'liverpdo': ['liverpdo']
     }

     # Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -59,7 +60,7 @@ def process_docker(dataset,validate):

     datasets_to_build.extend(dataset_map.get(dataset, []))

-    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build

     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -131,7 +132,8 @@ def process_omics(executor, dataset, should_continue):
         'sarcpdo': ['mutations', 'transcriptomics'],
         'pancpdo': ['transcriptomics'],
         'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
-        'crcpdo':['copy_number', 'mutations', 'transcriptomics']
+        'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
+        'liverpdo':['copy_number', 'mutations', 'transcriptomics']
     }

     expected_omics = dataset_omics_files.get(dataset, [])
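
The new liverpdo entry in dataset_omics_files means validation will expect copy_number, mutations, and transcriptomics outputs for this dataset. A hypothetical single-dataset run, assuming build_dataset.py exposes a dataset argument analogous to build_all.py (its argparse block is not part of this diff), might be:

    python build/build_dataset.py --dataset liverpdo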

build/docker/Dockerfile.liverpdo

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+FROM r-base:4.4.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Update package list and install required packages
+RUN apt-get update && \
+    apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
+    zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
+
+# Download and compile Python 3.10 with shared library support
+RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
+    tar -xf Python-3.10.12.tgz && \
+    cd Python-3.10.12 && \
+    ./configure --enable-optimizations --enable-shared && \
+    make -j$(nproc) && \
+    make altinstall && \
+    cd .. && \
+    rm -rf Python-3.10.12.tgz Python-3.10.12
+
+# Set Python 3.10 as default
+RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
+    ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
+
+# Update library paths for Python shared library
+RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
+
+# Create a Python virtual environment
+#RUN python3 -m venv /opt/venv
+#RUN /opt/venv/bin/pip install --upgrade pip
+
+# Set environment variables for reticulate
+#ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
+ENV PYTHONPATH=/app
+WORKDIR /app
+
+# Set MPLCONFIGDIR to a writable directory and create it.
+ENV MPLCONFIGDIR=/app/tmp/matplotlib
+RUN mkdir -p /app/tmp/matplotlib
+
+# installing python libraries
+ADD build/liverpdo/requirements.txt .
+#RUN /opt/venv/bin/pip3 install -r requirements.txt
+RUN pip3 install -r requirements.txt
+
+RUN python3 --version
+
+#ENV PATH="/opt/venv/bin:$PATH"
+
+ADD build/liverpdo/*py ./
+ADD build/liverpdo/*sh ./
+
+ADD build/utils/* ./
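
A quick way to sanity-check the image outside the build scripts is a direct build from the repository root; the context and tag below are assumptions that mirror the compose service defined in the next file:

    docker build -f build/docker/Dockerfile.liverpdo -t liverpdo:latest .
    docker run --rm liverpdo:latest python3 --version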

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
@@ -112,4 +112,13 @@ services:
       args:
         HTTPS_PROXY: ${HTTPS_PROXY}
     platform: linux/amd64
-    image: crcpdo:latest
+    image: crcpdo:latest
+
+  liverpdo:
+    build:
+      context: ../../
+      dockerfile: build/docker/Dockerfile.liverpdo
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: liverpdo:latest
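
This service mirrors what process_docker now constructs with the updated 'docker compose' syntax; the same build can be run by hand from the repo root:

    docker compose -f build/docker/docker-compose.yml build liverpdo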
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+# import required packages
+import pandas as pd
+import numpy as np
+import os
+import gzip
+import requests
+import argparse
+import synapseclient
+
+
+## Download the samples data from Synapse
+def download_samples_data(synID:str, save_path:str = None, synToken:str = None):
+    """
+    Download samples data from Synapse at synapseID syn64961953. Requires a Synapse token, which requires you to make a Synapse account
+    and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens
+
+    Parameters
+    ----------
+    synID : string
+        SynapseID of the dataset to download. Default is the synapseID of the samples dataset.
+
+    save_path : string
+        Local path where the downloaded file will be saved.
+
+    synToken : string
+        Synapse Personal Access Token of the user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens
+
+    Returns
+    -------
+    str
+        Filepath to the downloaded Excel file
+    """
+    syn = synapseclient.Synapse()
+    syn.login(authToken=synToken)
+
+    # Obtain a pointer and download the data
+    downloaded_data = syn.get(entity=synID, downloadLocation=save_path)
+
+    # Get the path to the local copy of the data file
+    samples_filepath = downloaded_data.path
+    return samples_filepath
+
+
+def map_substring(s, dict_map):
+    # Return the mapped value for the first dictionary key that occurs as a substring of s; NaN if none match.
+    for key in dict_map.keys():
+        if key in s:
+            return dict_map[key]
+    return np.nan
+
+
+### create sample sheet function
+def generate_sample_file(samples_data_path:str = None, prev_samples_path:str = "") -> pd.DataFrame:
+    """
+    Creates the sample file from the samples data Excel file. Checks the input sample file against previous sample files to make sure
+    there are no clashing sample names, and assigns improve_sample_id values starting from where the previous sample sheet left off.
+
+    Parameters
+    ----------
+    samples_data_path : string
+        Path to samples data from https://pmc.ncbi.nlm.nih.gov/articles/PMC10949980/#_ad93_. Supplementary Tables S1-S13
+
+    prev_samples_path : string
+        Path to the previous sample sheet.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing the combined samples data.
+    """
+    # read in the samples Excel file
+    samples_excel = pd.ExcelFile(samples_data_path)
+    clinical_info_df = pd.read_excel(samples_excel, 'S1. Clinical information')  # table with samples information
+
+    # read in the previous sample file
+    if prev_samples_path != "":
+        prev_samples = pd.read_csv(prev_samples_path)
+
+    # formatting table: keep only rows 3:68, since the rest are formatted oddly and hold no information
+    clinical_info_df = clinical_info_df.iloc[3:68]
+    samples_df = pd.DataFrame({'other_id': clinical_info_df.iloc[:, 0]}).reset_index(drop=True)  # first column holds the sample identifiers
+    samples_df['common_name'] = samples_df['other_id']
+    ctype_dict = {'HCC': 'hepatocellular carcinoma',
+                  'ICC': 'intrahepatic cholangiocarcinoma',
+                  'CHC': 'combined hepatocellular-cholangiocarcinoma',
+                  'HB': 'hepatoblastoma'}
+    samples_df['cancer_type'] = samples_df['other_id'].apply(lambda x: map_substring(x, ctype_dict))
+    samples_df['other_id_source'] = "Synapse"
+    samples_df['species'] = "Homo sapiens (Human)"
+    samples_df['model_type'] = "patient derived organoid"
+
+    # check that no other_id clashes with previous sample names
+    if prev_samples_path != "":
+        if samples_df.other_id.isin(prev_samples.other_id).any():
+            print("Duplicate id names detected. Cannot proceed with generating sample sheet until resolved.")
+            exit()
+    if prev_samples_path == "":
+        maxval = 0
+    else:
+        maxval = max(prev_samples.improve_sample_id)
+    samples_df['improve_sample_id'] = samples_df.index + maxval + 1  # index plus 1 creates a counter starting from the previous max value
+    return samples_df
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='###')
+
+    parser.add_argument('-D', '--download', action='store_true', default=False, help='Download RNA seq and sequencing data from GEO and supplemental materials from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#mmc2')
+    parser.add_argument('-t', '--token', type=str, default=None, help='Synapse Token')
+    parser.add_argument('-i', '--synapseID', type=str, default="syn66593307", help='SynapseID of data to download')
+
+    parser.add_argument('-s', '--samples', action='store_true', default=False, help='Only generate samples, requires previous samples')
+    parser.add_argument('-p', '--prevSamples', nargs='?', type=str, default='', const='', help='Use this to provide previous sample file')
+
+    args = parser.parse_args()
+
+    if args.download:
+        if args.token is None:
+            print("No synapse download token was provided. Cannot download data.")
+            exit()
+        else:
+            print("Downloading Files from Synapse.")
+            # Download samples data
+            samples_download_path = download_samples_data(synID=args.synapseID, synToken=args.token, save_path="/tmp")
+
+    if args.samples:
+        # assumes --download ran in this invocation, so samples_download_path is defined
+        if args.prevSamples is None or args.prevSamples == '':
+            print("No previous samples file provided. Starting improve_sample_id from 1. Running sample file generation.")
+            sample_sheet = generate_sample_file(samples_data_path=samples_download_path)
+        else:
+            print("Previous sample sheet {} detected. Running sample file generation and checking for duplicate IDs.".format(args.prevSamples))
+            sample_sheet = generate_sample_file(samples_data_path=samples_download_path, prev_samples_path=args.prevSamples)
+        sample_sheet.to_csv("/tmp/liverpdo_samples.csv", index=False)

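A sketch of how this script might be invoked inside the liverpdo container; the script's filename is not shown in this excerpt, so the name below is a placeholder, and /tmp is the bind-mounted local/ directory per the docker run command in build_all.py. Note that -s relies on -D having run in the same invocation, since samples_download_path is only set by the download step:

    python3 <samples_script>.py -D -t $SYNAPSE_AUTH_TOKEN -s -p /tmp/prev_samples.csv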