Skip to content

Commit 3cf23f8

Browse files
committed
Merge remote-tracking branch 'refs/remotes/origin/main'
2 parents 481385e + 618b21b commit 3cf23f8

21 files changed

Lines changed: 5109 additions & 2160 deletions

build/build_all.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
6262
print('running...'+filename)
6363
env = os.environ.copy()
6464
if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
65-
print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
65+
print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets')
6666
docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
6767
else:
6868
docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -85,7 +85,7 @@ def run_docker_cmd(cmd_arr,filename):
8585
# All output and errors are logged at local/docker.log
8686
# '''
8787
# compose_file = 'build/docker/docker-compose.yml'
88-
# compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel']
88+
# compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel']
8989
# log_file_path = 'local/docker.log'
9090
# env = os.environ.copy()
9191
# print(f"Docker-compose is building all images. View output in {log_file_path}.")
@@ -125,7 +125,8 @@ def process_docker(datasets):
125125
'sarcpdo': ['sarcpdo'],
126126
'cptac': ['cptac'],
127127
'genes': ['genes'],
128-
'upload': ['upload']
128+
'upload': ['upload'],
129+
'liverpdo': ['liverpdo']
129130
}
130131

131132
# Collect container names to build based on the datasets provided. Always build genes and upload.
@@ -134,7 +135,7 @@ def process_docker(datasets):
134135
datasets_to_build.extend(dataset_map.get(dataset, []))
135136

136137
# Build the docker-compose command, adding specific datasets
137-
compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
138+
compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
138139

139140
log_file_path = 'local/docker.log'
140141
env = os.environ.copy()
@@ -273,7 +274,7 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
273274
docker_run.extend(['upload'])
274275
if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
275276
docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
276-
if name == "Map_Drugs" or name == "Map_Samples":
277+
if name in ["Map_Drugs", "Map_Samples", "Align_Drug_Descriptors"]:
277278
docker_run.extend(['upload'])
278279
if 'GITHUB_TOKEN' in env and name == "GitHub":
279280
docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])
@@ -330,9 +331,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
330331
# Error handling for required tokens
331332
if args.figshare and not figshare_token:
332333
raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
333-
if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
334+
if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo','liverpdo']) and not synapse_auth_token:
334335
if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
335-
raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
336+
raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, liverpdo, or sarcpdo datasets.")
336337

337338
######
338339
### Begin Pipeline
@@ -409,7 +410,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
409410
# if args.figshare or args.validate:
410411
# FigShare File Prefixes:
411412

412-
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
413+
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
413414
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
414415
if "broad_sanger" in datasets:
415416
prefixes.extend(broad_sanger_datasets)
@@ -445,6 +446,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
445446

446447
drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
447448
run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
449+
450+
drug_mapping_command_2 = ['python3', 'scripts/align_drug_descriptors.py', '--local_dir', "/tmp", '--version', args.version]
451+
run_docker_upload_cmd(drug_mapping_command_2, 'all_files_dir', 'Align_Drug_Descriptors', args.version)
448452

449453
# Run schema checker - This will always run if uploading data.
450454
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets

build/build_dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def process_docker(dataset,validate):
4848
'genes': ['genes'],
4949
'upload': ['upload'],
5050
'crcpdo': ['crcpdo'],
51-
'bladderpdo': ['bladderpdo']
51+
'bladderpdo': ['bladderpdo'],
52+
'liverpdo': ['liverpdo']
5253
}
5354

5455
# Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -59,7 +60,7 @@ def process_docker(dataset,validate):
5960

6061
datasets_to_build.extend(dataset_map.get(dataset, []))
6162

62-
compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
63+
compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build
6364

6465
log_file_path = 'local/docker.log'
6566
env = os.environ.copy()
@@ -131,7 +132,8 @@ def process_omics(executor, dataset, should_continue):
131132
'sarcpdo': ['mutations', 'transcriptomics'],
132133
'pancpdo': ['transcriptomics'],
133134
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
134-
'crcpdo':['copy_number', 'mutations', 'transcriptomics']
135+
'crcpdo':['copy_number', 'mutations', 'transcriptomics'],
136+
'liverpdo':['copy_number', 'mutations', 'transcriptomics']
135137
}
136138

137139
expected_omics = dataset_omics_files.get(dataset, [])

build/docker/Dockerfile.liverpdo

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
FROM r-base:4.4.1
2+
3+
ENV DEBIAN_FRONTEND=noninteractive
4+
5+
# Update package list and install required packages
6+
RUN apt-get update && \
7+
apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
8+
zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
9+
10+
# Download and compile Python 3.10 with shared library support
11+
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
12+
tar -xf Python-3.10.12.tgz && \
13+
cd Python-3.10.12 && \
14+
./configure --enable-optimizations --enable-shared && \
15+
make -j$(nproc) && \
16+
make altinstall && \
17+
cd .. && \
18+
rm -rf Python-3.10.12.tgz Python-3.10.12
19+
20+
# Set Python 3.10 as default
21+
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
22+
ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
23+
24+
# Update library paths for Python shared library
25+
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
26+
27+
# Create a Python virtual environment
28+
#RUN python3 -m venv /opt/venv
29+
#RUN /opt/venv/bin/pip install --upgrade pip
30+
31+
# Set environment variables for reticulate
32+
#ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
33+
ENV PYTHONPATH=/app#"${PYTHONPATH}:/app"
34+
WORKDIR /app
35+
36+
# Set MPLCONFIGDIR to a writable directory and create it.
37+
ENV MPLCONFIGDIR=/app/tmp/matplotlib
38+
RUN mkdir -p /app/tmp/matplotlib
39+
40+
41+
42+
43+
# installing python libraries
44+
ADD build/liverpdo/requirements.txt .
45+
#RUN /opt/venv/bin/pip3 install -r requirements.txt
46+
RUN pip3 install -r requirements.txt
47+
48+
RUN python3 --version
49+
50+
#ENV PATH="/opt/venv/bin:$PATH"
51+
52+
ADD build/liverpdo/*py ./
53+
ADD build/liverpdo/*sh ./
54+
55+
ADD build/utils/* ./

build/docker/docker-compose.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,13 @@ services:
112112
args:
113113
HTTPS_PROXY: ${HTTPS_PROXY}
114114
platform: linux/amd64
115-
image: crcpdo:latest
115+
image: crcpdo:latest
116+
117+
liverpdo:
118+
build:
119+
context: ../../
120+
dockerfile: build/docker/Dockerfile.liverpdo
121+
args:
122+
HTTPS_PROXY: ${HTTPS_PROXY}
123+
platform: linux/amd64
124+
image: liverpdo:latest

build/hcmi/02-getHCMIData.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -613,10 +613,13 @@ def write_dataframe_to_csv(dataframe, outname):
613613
-------
614614
None
615615
"""
616+
dataframe = dataframe.to_pandas()
617+
dataframe = dataframe.drop_duplicates()
618+
616619
if('gz' in outname):
617-
dataframe.to_pandas().to_csv(outname,compression='gzip',index=False)
620+
dataframe.to_csv(outname,compression='gzip',index=False)
618621
else:
619-
dataframe.to_pandas().to_csv(outname,index=False)
622+
dataframe.to_csv(outname,index=False)
620623
return
621624

622625
def main():

build/hcmi/README.md

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,38 @@
11
## HCMI Data
22

3-
Here we will store the scripts required to process the data from the [Human Cancer Models Initiative](https://ocg.cancer.gov/programs/HCMI)
3+
Here we will store the scripts required to process the data from the
4+
[Human Cancer Models
5+
Initiative](https://ocg.cancer.gov/programs/HCMI).
6+
7+
Currently all data collected is part of the [HCMI-CMDC Project on the
8+
GDC](https://portal.gdc.cancer.gov/analysis_page?app=Projects). To
9+
update:
10+
11+
1. Navigate to the [GDC Data
12+
Portal](https://portal.gdc.cancer.gov/analysis_page?app=Projects),
13+
and select 'HCMI-CMDC'
14+
2. Click on the 'Cases' button, and select the download button where
15+
it lists the number of files.
16+
3. This will download the ENTIRE manifest
17+
4. Filter the manifest for RNASeq, WGS mutations, and copy number
18+
calls using the following command:
19+
```
20+
cat ~/gdc_manifest.2025-07-08.091940.txt | grep 'mask\|copy\|rna_seq\|md5'
21+
| grep 'txt\|maf\|tsv\|md5' > new_manifest.txt
22+
cp new_manifest.txt full_manifest.txt
23+
24+
```
425

526

6-
Currently the tool require two steps to build the data:
27+
Currently the tool requires two scripts to build the data:
728
```
829
python 01-createHCMISamplesFile.py
930
10-
python 02-getHCMIData.py -m transcriptomics_gdc_manifest.txt -t transcriptomics -o transcriptomics.csv
31+
python 02-getHCMIData.py -m full_manifest.txt -t transcriptomics -o transcriptomics.csv
1132
12-
python 02-getHCMIData.py -m mutations_manifest_gdc.txt -t mutations -o mutations.csv
33+
python 02-getHCMIData.py -m full_manifest.txt -t mutations -o mutations.csv
1334
14-
python 02-getHCMIData.py -m _manifest.txt -t copy_number -o copy_number.csv
35+
python 02-getHCMIData.py -m full_manifest.txt -t copy_number -o copy_number.csv
1536
1637
17-
```
38+
```

0 commit comments

Comments
 (0)