@@ -5,9 +5,46 @@ library(readr)
55library(tidyr )
66library(dplyr )
77library(rio )
8+ library(httr2 )
89
910Sys.setenv(VROOM_CONNECTION_SIZE = 100000000 )
1011
12+
#' Download a URL to a local file with retries and a size sanity check
#'
#' Streams the response body to `dest` with httr2, retrying failed attempts,
#' then compares the size on disk with the `Content-Length` header when that
#' comparison is meaningful.
#'
#' @param url URL to fetch.
#' @param dest Path the response body is written to.
#' @param max_tries Maximum number of attempts, passed to `req_retry()`.
#' @param timeout_secs Request timeout in seconds, passed to `req_timeout()`.
#' @return `dest`, invisibly.
#' @details Raises an error (after deleting `dest`) when the size on disk
#'   does not match a usable `Content-Length`; request failures are raised
#'   by `req_perform()` itself.
robust_download_httr2 <- function(url, dest, max_tries = 5,
                                  timeout_secs = 120) {
  req <- httr2::request(url) |>
    httr2::req_timeout(timeout_secs) |>
    httr2::req_retry(max_tries = max_tries, retry_on_failure = TRUE)

  # Streams the body to `dest`; 4xx/5xx responses are raised as errors.
  resp <- httr2::req_perform(req, path = dest)

  content_length <- httr2::resp_header(resp, "content-length")
  content_encoding <- httr2::resp_header(resp, "content-encoding")

  # Content-Length counts bytes on the wire. If the body was sent with a
  # content encoding (e.g. gzip) it may have been decoded before being written
  # to disk, so the two sizes can legitimately differ: skip the check then.
  is_encoded <- !is.null(content_encoding) &&
    !identical(tolower(trimws(content_encoding)), "identity")

  if (!is.null(content_length) && !is_encoded) {
    # A malformed header becomes NA and is ignored instead of reaching
    # `if (NA)`, which would abort with an unrelated error.
    expected <- suppressWarnings(as.numeric(content_length))
    actual <- file.size(dest)
    if (!is.na(expected) && (is.na(actual) || actual != expected)) {
      # Do not leave a file that is known to be truncated or missing data.
      unlink(dest)
      # %.0f instead of %d: sizes are doubles and may exceed the integer
      # range (> 2 GiB), where %d itself raises an error.
      stop(
        sprintf(
          "Incomplete download for %s: expected %.0f bytes but got %.0f",
          url, expected, actual
        ),
        call. = FALSE
      )
    }
  }

  invisible(dest)
}
33+
#' Download a ZIP archive and extract it, failing loudly on a bad archive
#'
#' Downloads `url` to `dest_zip` via `robust_download_httr2()` and extracts
#' the archive into `extract_dir`. `utils::unzip()` signals most extraction
#' problems as warnings rather than errors, so warnings and an empty
#' extraction result are both treated as failures here.
#'
#' @param url URL of the ZIP archive.
#' @param dest_zip Path the archive is downloaded to.
#' @param extract_dir Directory the archive is extracted into.
#' @param max_tries Maximum download attempts, forwarded to the downloader.
#' @param timeout_secs Download timeout in seconds, forwarded likewise.
#' @return Character vector of extracted file paths, invisibly.
#' @details On an extraction failure `dest_zip` is deleted before the error
#'   is raised.
download_and_extract_zip_httr2 <- function(url, dest_zip, extract_dir,
                                           max_tries = 5,
                                           timeout_secs = 120) {
  robust_download_httr2(
    url, dest_zip,
    max_tries = max_tries, timeout_secs = timeout_secs
  )
  if (!file.exists(dest_zip)) {
    stop(sprintf("Download failed, %s missing", dest_zip))
  }

  # Shared failure path: drop the unusable archive, then raise.
  fail_unzip <- function(reason) {
    file.remove(dest_zip)
    stop(sprintf("Failed to unzip %s: %s", dest_zip, reason), call. = FALSE)
  }

  extracted <- tryCatch(
    utils::unzip(dest_zip, exdir = extract_dir),
    # A corrupt or unreadable archive surfaces as a warning, not an error.
    warning = function(w) fail_unzip(conditionMessage(w)),
    error = function(e) fail_unzip(conditionMessage(e))
  )
  if (length(extracted) == 0L) {
    fail_unzip("no files were extracted")
  }

  invisible(extracted)
}
45+
46+
47+
1148# #### DEPMAP FILES
1249
1350depmap_filenames = list (copy_number = ' https://figshare.com/ndownloader/files/40448840' ,
@@ -79,8 +116,11 @@ sanger_files<-function(fi,value){
79116 # #and mapping to get it into a unified 3 column schema
80117 if (value == ' copy_number' ){
81118 # read in file
82- exp_file <- readr :: read_csv(fi ) # #already in long form <3 <3 <3
83- file.remove(fi )
119+ local_cn <- file.path(tempdir(), " sanger_copy_number.csv.gz" )
120+ robust_download_httr2(fi , local_cn )
121+ exp_file <- readr :: read_csv(local_cn )
122+ # exp_file <- readr::read_csv(fi) ##already in long form <3 <3 <3
123+ # file.remove(fi)
84124 smap <- sanger_samples | >
85125 subset(other_id_source == ' Sanger' )| >
86126 subset(other_id %in% exp_file $ model_id )| >
@@ -149,17 +189,22 @@ sanger_files<-function(fi,value){
149189
150190
151191 }else if (value == ' mutations' ){ # ###IF DATA REPRESENTS MUTATIONS#####
152- res = download.file(fi ,' /tmp/tmp.zip' )
153- filist <- unzip(' /tmp/tmp.zip' ,exdir = ' /tmp' )
154- fi = " /tmp/mutations_all_20230202.csv"
155- if (file.exists(" /tmp/tmp.zip" ))
156- file.remove(' /tmp/tmp.zip' )
157-
158- exp_file <- readr :: read_csv(fi )| >
159- dplyr :: select(gene_symbol ,other_id = ' model_id' ,effect ,mutation = ' cdna_mutation' ,source )| >
160- distinct()
161- if (file.exists(fi ))
162- file.remove(fi )
192+ # res=download.file(fi,'/tmp/tmp.zip')
193+ # filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
194+ # fi= "/tmp/mutations_all_20230202.csv"
195+ zip_path <- file.path(tempdir(), " sanger_mutations_20230202.zip" )
196+ download_and_extract_zip_httr2(fi , zip_path , " /tmp" )
197+ csv_path <- file.path(" /tmp" , " mutations_all_20230202.csv" )
198+ if (! file.exists(csv_path )) stop(" Expected mutations CSV not found after unzip" )
199+
200+
201+ exp_file <- readr :: read_csv(csv_path ) | >
202+ dplyr :: select(gene_symbol , other_id = ' model_id' , effect , mutation = ' cdna_mutation' , source ) | >
203+ distinct()
204+
205+
206+ file.remove(csv_path )
207+ if (file.exists(zip_path )) file.remove(zip_path )
163208
164209 smap <- sanger_samples | >
165210 dplyr :: select(improve_sample_id ,other_id )| > distinct()
@@ -193,12 +238,14 @@ sanger_files<-function(fi,value){
193238 print(head(res ))
194239 return (res )
195240 }else if (value == ' transcriptomics' ){ # if gene expression
196- res = download.file(fi ,' /tmp/tmp.zip' )
197- filist <- unzip(' /tmp/tmp.zip' ,exdir = ' /tmp' )
198- fi = " /tmp/rnaseq_tpm_20220624.csv"
199- if (file.exists(" /tmp/tmp.zip" ))
200- file.remove(' /tmp/tmp.zip' )
201- exp_file <- readr :: read_csv(fi )
241+ # res=download.file(fi,'/tmp/tmp.zip')
242+ # filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
243+
244+ zip_path <- " /tmp/rnaseq_all_20220624.zip"
245+ download_and_extract_zip_httr2(fi , zip_path , " /tmp" )
246+ csv_path <- file.path(" /tmp" , " rnaseq_tpm_20220624.csv" )
247+ if (! file.exists(csv_path )) stop(" Expected transcriptomics CSV not found after unzip" )
248+ exp_file <- readr :: read_csv(csv_path )
202249
203250 # #the rows have metadata
204251 samps <- t(exp_file [1 : 3 ,])
@@ -238,7 +285,11 @@ sanger_files<-function(fi,value){
238285 full <- res | >
239286 left_join(smap )
240287 rm(res )
241- file.remove(fi )
288+
289+ file.remove(csv_path )
290+ if (file.exists(zip_path )) file.remove(zip_path )
291+
292+
242293 }else if (value == ' miRNA' ){ # if mirna expression
243294 exp_file <- readr :: read_csv(fi )
244295
@@ -279,13 +330,20 @@ sanger_files<-function(fi,value){
279330 full <- res
280331
281332 }else if (value == ' proteomics' ){
282- res = download.file(fi ,' /tmp/tmp.zip' )
283- filist <- unzip(' /tmp/tmp.zip' ,exdir = ' /tmp' )
333+ # res=download.file(fi,'/tmp/tmp.zip')
334+ # filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
335+
336+ zip_path <- " /tmp/Proteomics_20221214.zip"
337+ download_and_extract_zip_httr2(fi , zip_path , " /tmp" )
338+ tsv_path <- file.path(" /tmp" , " Protein_matrix_averaged_zscore_20221214.tsv" )
339+ if (! file.exists(tsv_path )) stop(" Expected proteomics TSV not found after unzip" )
284340
285- fi = ' /tmp/Protein_matrix_averaged_zscore_20221214.tsv'
286- exp_file <- readr :: read_tsv(fi ,skip = 1 )[- 1 ,- 1 ]
341+ exp_file <- readr :: read_tsv(tsv_path ,skip = 1 )[- 1 ,- 1 ]
287342 colnames(exp_file )[1 ]<- ' other_id'
288- file.remove(fi )
343+
344+ file.remove(tsv_path )
345+ if (file.exists(zip_path )) file.remove(zip_path )
346+
289347 smap <- sanger_samples | >
290348 dplyr :: select(improve_sample_id ,other_id )| > distinct()
291349
@@ -339,7 +397,11 @@ depmap_files<-function(fi,value){
339397 # #now every data type is parsed slightly differently, so we need to change our formatting
340398 # #and mapping to get it into a unified 3 column schema
341399 if (value == ' copy_number' ){
342- exp_file <- readr :: read_csv(fi )
400+ # exp_file <- readr::read_csv(fi)
401+ local_path <- " /tmp/depmap_copy_number.csv.gz"
402+ robust_download_httr2(fi , local_path )
403+ exp_file <- readr :: read_csv(local_path )
404+
343405
344406 print(' Long to wide' )
345407 res = exp_file | >
@@ -399,7 +461,11 @@ depmap_files<-function(fi,value){
399461
400462
401463 }else if (value == ' mutations' ){ # ###IF DATA REPRESENTS MUTATIONS#####
402- exp_file <- readr :: read_csv(fi )| >
464+
465+ local_mut <- file.path(tempdir(), " depmap_mutations.csv.gz" )
466+ robust_download_httr2(fi , local_mut )
467+
468+ exp_file <- readr :: read_csv(local_mut )| >
403469 dplyr :: select(EntrezGeneID ,HgncName ,other_id = ' ModelID' ,VariantInfo ,mutation = ' DNAChange' )| >
404470 distinct()
405471
@@ -439,7 +505,11 @@ depmap_files<-function(fi,value){
439505 print(head(full ))
440506 return (full )
441507 }else if (value == ' transcriptomics' ){ # if gene expression
442- exp_file <- readr :: read_csv(fi )
508+ # exp_file <- readr::read_csv(fi)
509+ local_tx <- file.path(tempdir(), " depmap_transcriptomics.csv.gz" )
510+ robust_download_httr2(fi , local_tx )
511+ exp_file <- readr :: read_csv(local_tx )
512+
443513 print(" wide to long" )
444514 res = tidyr :: pivot_longer(data = exp_file ,cols = c(2 : ncol(exp_file )),
445515 names_to = ' gene_entrez' ,values_to = ' transcriptomics' ,
0 commit comments