Skip to content

Commit 4d74714

Browse files
committed
Added robust methods to download files for broad_sanger omics
1 parent 09fb9e5 commit 4d74714

7 files changed

Lines changed: 152 additions & 40 deletions

File tree

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 98 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,46 @@ library(readr)
55
library(tidyr)
66
library(dplyr)
77
library(rio)
8+
library(httr2)
89

910
Sys.setenv(VROOM_CONNECTION_SIZE=100000000)
1011

12+
13+
# Robust download with retry and optional content-length validation.
#
# Streams `url` to `dest` via httr2, retrying transient failures up to
# `max_tries` times. If the server reports a Content-Length header, the
# on-disk size is checked against it; on mismatch the partial file is
# removed (so a rerun re-downloads it) and an error is raised.
#
# @param url          Remote file URL.
# @param dest         Local path the downloaded bytes are streamed to.
# @param max_tries    Maximum number of attempts (passed to req_retry()).
# @param timeout_secs Per-request timeout in seconds.
# @return `dest`, invisibly.
robust_download_httr2 <- function(url, dest, max_tries = 5, timeout_secs = 120) {
  req <- request(url) |>
    req_timeout(timeout_secs) |>
    req_retry(max_tries = max_tries, retry_on_failure = TRUE)

  # req_perform(path =) streams the body to dest; errors on 4xx/5xx automatically.
  resp <- req |> req_perform(path = dest)

  # Validate content length if provided. resp_header() matches the header
  # name case-insensitively (servers vary between "Content-Length" and
  # "content-length").
  expected <- resp_header(resp, "content-length")
  if (!is.null(expected)) {
    expected <- as.numeric(expected)
    actual <- file.info(dest)$size
    if (is.na(actual) || actual != expected) {
      # Drop the partial file so a later run does not pick it up.
      if (file.exists(dest)) file.remove(dest)
      # NOTE: %.0f, not %d -- both sizes are doubles and sprintf("%d", <double>)
      # itself errors in R, masking the real failure.
      stop(sprintf(
        "Incomplete download for %s: expected %.0f bytes but got %.0f",
        url, expected, actual
      ), call. = FALSE)
    }
  }

  invisible(dest)
}
33+
34+
# Helper to download a ZIP and extract it safely.
#
# Downloads `url` to `dest_zip` with robust_download_httr2() and extracts
# it into `extract_dir`. utils::unzip() signals a corrupt/truncated
# archive as a *warning* ("error 1 in extracting from zip file"), not an
# error, so warnings are promoted to errors here. On any extraction
# failure the (likely corrupt) zip is deleted before re-raising so a
# later run re-downloads it.
#
# @param url          Remote ZIP URL.
# @param dest_zip     Local path for the downloaded archive.
# @param extract_dir  Directory to extract the archive into.
# @param max_tries    Maximum download attempts.
# @param timeout_secs Per-request timeout in seconds.
# @return The extracted file paths (from utils::unzip), invisibly.
download_and_extract_zip_httr2 <- function(url, dest_zip, extract_dir,
                                           max_tries = 5, timeout_secs = 120) {
  robust_download_httr2(url, dest_zip, max_tries = max_tries,
                        timeout_secs = timeout_secs)
  if (!file.exists(dest_zip)) {
    stop(sprintf("Download failed, %s missing", dest_zip), call. = FALSE)
  }

  extracted <- tryCatch({
    utils::unzip(dest_zip, exdir = extract_dir)
  }, warning = function(w) {
    # unzip() warns rather than errors on bad archives -- treat as fatal.
    file.remove(dest_zip)
    stop(sprintf("Failed to unzip %s: %s", dest_zip, conditionMessage(w)),
         call. = FALSE)
  }, error = function(e) {
    file.remove(dest_zip)
    stop(sprintf("Failed to unzip %s: %s", dest_zip, conditionMessage(e)),
         call. = FALSE)
  })

  invisible(extracted)
}
45+
46+
47+
1148
##### DEPMAP FILES
1249

1350
depmap_filenames=list(copy_number='https://figshare.com/ndownloader/files/40448840',
@@ -79,8 +116,11 @@ sanger_files<-function(fi,value){
79116
##and mapping to get it into a unified 3 column schema
80117
if(value=='copy_number'){
81118
#read in file
82-
exp_file <- readr::read_csv(fi) ##already in long form <3 <3 <3
83-
file.remove(fi)
119+
local_cn <- file.path(tempdir(), "sanger_copy_number.csv.gz")
120+
robust_download_httr2(fi, local_cn)
121+
exp_file <- readr::read_csv(local_cn)
122+
# exp_file <- readr::read_csv(fi) ##already in long form <3 <3 <3
123+
# file.remove(fi)
84124
smap<-sanger_samples|>
85125
subset(other_id_source=='Sanger')|>
86126
subset(other_id%in%exp_file$model_id)|>
@@ -149,17 +189,22 @@ sanger_files<-function(fi,value){
149189

150190

151191
}else if(value=='mutations'){ ####IF DATA REPRESENTS MUTATIONS#####
152-
res=download.file(fi,'/tmp/tmp.zip')
153-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
154-
fi= "/tmp/mutations_all_20230202.csv"
155-
if(file.exists("/tmp/tmp.zip"))
156-
file.remove('/tmp/tmp.zip')
157-
158-
exp_file <- readr::read_csv(fi)|>
159-
dplyr::select(gene_symbol,other_id='model_id',effect,mutation='cdna_mutation',source)|>
160-
distinct()
161-
if(file.exists(fi))
162-
file.remove(fi)
192+
# res=download.file(fi,'/tmp/tmp.zip')
193+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
194+
# fi= "/tmp/mutations_all_20230202.csv"
195+
zip_path <- file.path(tempdir(), "sanger_mutations_20230202.zip")
196+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
197+
csv_path <- file.path("/tmp", "mutations_all_20230202.csv")
198+
if (!file.exists(csv_path)) stop("Expected mutations CSV not found after unzip")
199+
200+
201+
exp_file <- readr::read_csv(csv_path) |>
202+
dplyr::select(gene_symbol, other_id = 'model_id', effect, mutation = 'cdna_mutation', source) |>
203+
distinct()
204+
205+
206+
file.remove(csv_path)
207+
if (file.exists(zip_path)) file.remove(zip_path)
163208

164209
smap<-sanger_samples|>
165210
dplyr::select(improve_sample_id,other_id)|>distinct()
@@ -193,12 +238,14 @@ sanger_files<-function(fi,value){
193238
print(head(res))
194239
return(res)
195240
}else if(value=='transcriptomics'){ #if gene expression
196-
res=download.file(fi,'/tmp/tmp.zip')
197-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
198-
fi= "/tmp/rnaseq_tpm_20220624.csv"
199-
if(file.exists("/tmp/tmp.zip"))
200-
file.remove('/tmp/tmp.zip')
201-
exp_file <- readr::read_csv(fi)
241+
# res=download.file(fi,'/tmp/tmp.zip')
242+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
243+
244+
zip_path <- "/tmp/rnaseq_all_20220624.zip"
245+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
246+
csv_path <- file.path("/tmp", "rnaseq_tpm_20220624.csv")
247+
if (!file.exists(csv_path)) stop("Expected transcriptomics CSV not found after unzip")
248+
exp_file <- readr::read_csv(csv_path)
202249

203250
##the rows have metadata
204251
samps<-t(exp_file[1:3,])
@@ -238,7 +285,11 @@ sanger_files<-function(fi,value){
238285
full<-res|>
239286
left_join(smap)
240287
rm(res)
241-
file.remove(fi)
288+
289+
file.remove(csv_path)
290+
if (file.exists(zip_path)) file.remove(zip_path)
291+
292+
242293
}else if(value=='miRNA'){ #if mirna expression
243294
exp_file <- readr::read_csv(fi)
244295

@@ -279,13 +330,20 @@ sanger_files<-function(fi,value){
279330
full<-res
280331

281332
}else if(value=='proteomics'){
282-
res=download.file(fi,'/tmp/tmp.zip')
283-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
333+
# res=download.file(fi,'/tmp/tmp.zip')
334+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
335+
336+
zip_path <- "/tmp/Proteomics_20221214.zip"
337+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
338+
tsv_path <- file.path("/tmp", "Protein_matrix_averaged_zscore_20221214.tsv")
339+
if (!file.exists(tsv_path)) stop("Expected proteomics TSV not found after unzip")
284340

285-
fi='/tmp/Protein_matrix_averaged_zscore_20221214.tsv'
286-
exp_file <- readr::read_tsv(fi,skip=1)[-1,-1]
341+
exp_file <- readr::read_tsv(tsv_path,skip=1)[-1,-1]
287342
colnames(exp_file)[1]<-'other_id'
288-
file.remove(fi)
343+
344+
file.remove(tsv_path)
345+
if (file.exists(zip_path)) file.remove(zip_path)
346+
289347
smap<-sanger_samples|>
290348
dplyr::select(improve_sample_id,other_id)|>distinct()
291349

@@ -339,7 +397,11 @@ depmap_files<-function(fi,value){
339397
##now every data type is parsed slightly differently, so we need to change our formatting
340398
##and mapping to get it into a unified 3 column schema
341399
if(value=='copy_number'){
342-
exp_file <- readr::read_csv(fi)
400+
# exp_file <- readr::read_csv(fi)
401+
local_path <- "/tmp/depmap_copy_number.csv.gz"
402+
robust_download_httr2(fi, local_path)
403+
exp_file <- readr::read_csv(local_path)
404+
343405

344406
print('Long to wide')
345407
res = exp_file|>
@@ -399,7 +461,11 @@ depmap_files<-function(fi,value){
399461

400462

401463
}else if(value=='mutations'){ ####IF DATA REPRESENTS MUTATIONS#####
402-
exp_file <- readr::read_csv(fi)|>
464+
465+
local_mut <- file.path(tempdir(), "depmap_mutations.csv.gz")
466+
robust_download_httr2(fi, local_mut)
467+
468+
exp_file <- readr::read_csv(local_mut)|>
403469
dplyr::select(EntrezGeneID,HgncName,other_id='ModelID',VariantInfo,mutation='DNAChange')|>
404470
distinct()
405471

@@ -439,7 +505,11 @@ depmap_files<-function(fi,value){
439505
print(head(full))
440506
return(full)
441507
}else if(value=='transcriptomics'){ #if gene expression
442-
exp_file <- readr::read_csv(fi)
508+
# exp_file <- readr::read_csv(fi)
509+
local_tx <- file.path(tempdir(), "depmap_transcriptomics.csv.gz")
510+
robust_download_httr2(fi, local_tx)
511+
exp_file <- readr::read_csv(local_tx)
512+
443513
print("wide to long")
444514
res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)),
445515
names_to='gene_entrez',values_to='transcriptomics',

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,37 @@
22
import argparse
33
from zipfile import ZipFile
44
import requests
5+
from requests.adapters import HTTPAdapter
6+
from urllib3.util import Retry
7+
8+
def robust_download(url, dest_path, max_retries=5):
    """Stream a URL to a local file with retries and size validation.

    Transient HTTP failures (429/5xx) and connection errors are retried up
    to ``max_retries`` times with exponential backoff; the body is streamed
    to ``dest_path`` in 1 MiB chunks. If the server reports a
    Content-Length (and no Content-Encoding that would change the byte
    count on the wire), the on-disk size is validated against it. On any
    failure the partial file is removed and ``RuntimeError`` is raised,
    mirroring the httr2 helper added on the R side of this pipeline.

    Args:
        url: Remote file URL.
        dest_path: Local path to write the downloaded bytes to.
        max_retries: Maximum retry attempts for transient failures.

    Raises:
        RuntimeError: If the download fails or the downloaded size does
            not match the server-reported Content-Length.
    """
    import os  # local import: stdlib, keeps this helper self-contained

    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods={"GET", "HEAD"},
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    expected = None
    try:
        # Context-manage the Session so pooled connections are released
        # even when the request raises.
        with requests.Session() as session:
            session.mount("https://", adapter)
            session.mount("http://", adapter)
            with session.get(url, stream=True, timeout=(5, 60)) as r:
                r.raise_for_status()
                # Only trust Content-Length when the body is not
                # transfer-compressed (iter_content decodes gzip/deflate,
                # so written bytes would not match the header then).
                if not r.headers.get("Content-Encoding"):
                    expected = r.headers.get("Content-Length")
                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # filter out keep-alive chunks
                            f.write(chunk)
    except requests.exceptions.RequestException as e:
        # Drop the partial file so a rerun starts clean.
        if os.path.exists(dest_path):
            os.remove(dest_path)
        raise RuntimeError(f"Failed to download {url}: {e}") from e

    if expected is not None:
        actual = os.path.getsize(dest_path)
        if actual != int(expected):
            os.remove(dest_path)
            raise RuntimeError(
                f"Incomplete download for {url}: "
                f"expected {expected} bytes but got {actual}"
            )
30+
531

632
def main():
733
parser = argparse.ArgumentParser()
8-
parser.add_argument('--sample',dest='samplefile',default=None,help='DepMap sample file')
9-
parser.add_argument('--gene',dest='genefile',default=None,help='DepMap sample file')
34+
parser.add_argument('--sample', dest='samplefile', default=None, help='DepMap sample file')
35+
parser.add_argument('--gene', dest='genefile', default=None, help='Gene file')
1036

1137
opts = parser.parse_args()
1238

@@ -53,15 +79,27 @@ def main():
5379
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5480

5581

82+
#old download, (was failing too much)
83+
# sanger_protfile='https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip'
84+
# r = requests.get(sanger_protfile)
85+
# sanger_loc ='/tmp/sp.zip'
86+
# open(sanger_loc , 'wb').write(r.content)
87+
# zf = ZipFile(sanger_loc,'r')
88+
# zf.extractall(path='/tmp/')
89+
# pdat = pd.read_csv('/tmp/Protein_matrix_averaged_zscore_20221214.tsv',sep='\t',skiprows=[0])
90+
5691
##now get sanger
57-
sanger_protfile='https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip'
58-
r = requests.get(sanger_protfile)
59-
sanger_loc ='/tmp/sp.zip'
60-
open(sanger_loc , 'wb').write(r.content)
61-
62-
zf = ZipFile(sanger_loc,'r')
63-
zf.extractall(path='/tmp/')
64-
pdat = pd.read_csv('/tmp/Protein_matrix_averaged_zscore_20221214.tsv',sep='\t',skiprows=[0])
92+
sanger_protfile = "https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip"
93+
sanger_loc = "/tmp/sp.zip"
94+
robust_download(sanger_protfile, sanger_loc)
95+
with ZipFile(sanger_loc, "r") as zf:
96+
zf.extractall(path="/tmp/")
97+
pdat = pd.read_csv(
98+
"/tmp/Protein_matrix_averaged_zscore_20221214.tsv",
99+
sep="\t",
100+
skiprows=[0],
101+
)
102+
65103
vv=pdat.columns[2:]
66104
plong = pd.melt(pdat,id_vars='symbol',value_vars=vv)
67105
pres = plong.rename({'symbol':'other_names','variable':'gene_symbol','value':'proteomics'},axis=1)

build/broad_sanger/omics_requirements.r

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ install.packages("dplyr")
77
install.packages("XML")
88
#install.packages('reticulate')
99
install.packages('tidyr')
10+
install.packages('httr2')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pandas
22
numpy
33
requests
4+
urllib3

build/broad_sanger/requirements.r

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ install.packages("XML")
1010
#install.packages('remotes')
1111
install.packages('reticulate')
1212
install.packages('tidyr')
13+
install.packages('httr2')
1314
#install.packages("BiocManager")
1415
BiocManager::install("PharmacoGx",update=TRUE,ask=FALSE)
1516
BiocManager::install("org.Hs.eg.db",update=TRUE,ask=FALSE)

build/broad_sanger/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ mordredcommunity
1212
rdkit
1313
coderdata==0.1.40
1414
psutil
15-
polars
15+
polars
16+
urllib3

scripts/map_improve_sample_ids.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def main():
412412
help='Build date in YYYY-MM-DD. Default=now.')
413413
parser.add_argument('--version', required=True,
414414
help='Build version. Must be unique per build.')
415-
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,cptac,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
415+
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
416416
help='Comma-separated list of datasets, e.g., beataml,ccle')
417417
parser.add_argument('--local_dir', default='data',
418418
help='Directory containing all CSV/TSV files.')

0 commit comments

Comments
 (0)