Skip to content

Commit 4d74714

Browse files
committed
Added robust methods to download files for broad_sanger omics
1 parent 09fb9e5 commit 4d74714

7 files changed

Lines changed: 152 additions & 40 deletions

File tree

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 98 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,46 @@ library(readr)
55
library(tidyr)
66
library(dplyr)
77
library(rio)
8+
library(httr2)
89

910
Sys.setenv(VROOM_CONNECTION_SIZE=100000000)
1011

12+
13+
# Robust download with retry and optional content-length validation.
#
# Streams `url` to `dest` via httr2, retrying transient failures up to
# `max_tries` times. If the server reports a Content-Length header, the
# on-disk size is checked against it; on mismatch the partial file is
# removed (so a rerun re-downloads it) and an error is raised.
#
# @param url          Remote file URL.
# @param dest         Local path the downloaded bytes are streamed to.
# @param max_tries    Maximum number of attempts (passed to req_retry()).
# @param timeout_secs Per-request timeout in seconds.
# @return `dest`, invisibly.
robust_download_httr2 <- function(url, dest, max_tries = 5, timeout_secs = 120) {
  req <- request(url) |>
    req_timeout(timeout_secs) |>
    req_retry(max_tries = max_tries, retry_on_failure = TRUE)

  # req_perform(path =) streams the body to dest; errors on 4xx/5xx automatically.
  resp <- req |> req_perform(path = dest)

  # Validate content length if provided. resp_header() matches the header
  # name case-insensitively (servers vary between "Content-Length" and
  # "content-length").
  expected <- resp_header(resp, "content-length")
  if (!is.null(expected)) {
    expected <- as.numeric(expected)
    actual <- file.info(dest)$size
    if (is.na(actual) || actual != expected) {
      # Drop the partial file so a later run does not pick it up.
      if (file.exists(dest)) file.remove(dest)
      # NOTE: %.0f, not %d -- both sizes are doubles and sprintf("%d", <double>)
      # itself errors in R, masking the real failure.
      stop(sprintf(
        "Incomplete download for %s: expected %.0f bytes but got %.0f",
        url, expected, actual
      ), call. = FALSE)
    }
  }

  invisible(dest)
}
33+
34+
# Helper to download a ZIP and extract it safely.
#
# Downloads `url` to `dest_zip` with robust_download_httr2() and extracts
# it into `extract_dir`. utils::unzip() signals a corrupt/truncated
# archive as a *warning* ("error 1 in extracting from zip file"), not an
# error, so warnings are promoted to errors here. On any extraction
# failure the (likely corrupt) zip is deleted before re-raising so a
# later run re-downloads it.
#
# @param url          Remote ZIP URL.
# @param dest_zip     Local path for the downloaded archive.
# @param extract_dir  Directory to extract the archive into.
# @param max_tries    Maximum download attempts.
# @param timeout_secs Per-request timeout in seconds.
# @return The extracted file paths (from utils::unzip), invisibly.
download_and_extract_zip_httr2 <- function(url, dest_zip, extract_dir,
                                           max_tries = 5, timeout_secs = 120) {
  robust_download_httr2(url, dest_zip, max_tries = max_tries,
                        timeout_secs = timeout_secs)
  if (!file.exists(dest_zip)) {
    stop(sprintf("Download failed, %s missing", dest_zip), call. = FALSE)
  }

  extracted <- tryCatch({
    utils::unzip(dest_zip, exdir = extract_dir)
  }, warning = function(w) {
    # unzip() warns rather than errors on bad archives -- treat as fatal.
    file.remove(dest_zip)
    stop(sprintf("Failed to unzip %s: %s", dest_zip, conditionMessage(w)),
         call. = FALSE)
  }, error = function(e) {
    file.remove(dest_zip)
    stop(sprintf("Failed to unzip %s: %s", dest_zip, conditionMessage(e)),
         call. = FALSE)
  })

  invisible(extracted)
}
45+
46+
47+
1148
##### DEPMAP FILES
1249

1350
depmap_filenames=list(copy_number='https://figshare.com/ndownloader/files/40448840',
@@ -79,8 +116,11 @@ sanger_files<-function(fi,value){
79116
##and mapping to get it into a unified 3 column schema
80117
if(value=='copy_number'){
81118
#read in file
82-
exp_file <- readr::read_csv(fi) ##already in long form <3 <3 <3
83-
file.remove(fi)
119+
local_cn <- file.path(tempdir(), "sanger_copy_number.csv.gz")
120+
robust_download_httr2(fi, local_cn)
121+
exp_file <- readr::read_csv(local_cn)
122+
# exp_file <- readr::read_csv(fi) ##already in long form <3 <3 <3
123+
# file.remove(fi)
84124
smap<-sanger_samples|>
85125
subset(other_id_source=='Sanger')|>
86126
subset(other_id%in%exp_file$model_id)|>
@@ -149,17 +189,22 @@ sanger_files<-function(fi,value){
149189

150190

151191
}else if(value=='mutations'){ ####IF DATA REPRESENTS MUTATIONS#####
152-
res=download.file(fi,'/tmp/tmp.zip')
153-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
154-
fi= "/tmp/mutations_all_20230202.csv"
155-
if(file.exists("/tmp/tmp.zip"))
156-
file.remove('/tmp/tmp.zip')
157-
158-
exp_file <- readr::read_csv(fi)|>
159-
dplyr::select(gene_symbol,other_id='model_id',effect,mutation='cdna_mutation',source)|>
160-
distinct()
161-
if(file.exists(fi))
162-
file.remove(fi)
192+
# res=download.file(fi,'/tmp/tmp.zip')
193+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
194+
# fi= "/tmp/mutations_all_20230202.csv"
195+
zip_path <- file.path(tempdir(), "sanger_mutations_20230202.zip")
196+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
197+
csv_path <- file.path("/tmp", "mutations_all_20230202.csv")
198+
if (!file.exists(csv_path)) stop("Expected mutations CSV not found after unzip")
199+
200+
201+
exp_file <- readr::read_csv(csv_path) |>
202+
dplyr::select(gene_symbol, other_id = 'model_id', effect, mutation = 'cdna_mutation', source) |>
203+
distinct()
204+
205+
206+
file.remove(csv_path)
207+
if (file.exists(zip_path)) file.remove(zip_path)
163208

164209
smap<-sanger_samples|>
165210
dplyr::select(improve_sample_id,other_id)|>distinct()
@@ -193,12 +238,14 @@ sanger_files<-function(fi,value){
193238
print(head(res))
194239
return(res)
195240
}else if(value=='transcriptomics'){ #if gene expression
196-
res=download.file(fi,'/tmp/tmp.zip')
197-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
198-
fi= "/tmp/rnaseq_tpm_20220624.csv"
199-
if(file.exists("/tmp/tmp.zip"))
200-
file.remove('/tmp/tmp.zip')
201-
exp_file <- readr::read_csv(fi)
241+
# res=download.file(fi,'/tmp/tmp.zip')
242+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
243+
244+
zip_path <- "/tmp/rnaseq_all_20220624.zip"
245+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
246+
csv_path <- file.path("/tmp", "rnaseq_tpm_20220624.csv")
247+
if (!file.exists(csv_path)) stop("Expected transcriptomics CSV not found after unzip")
248+
exp_file <- readr::read_csv(csv_path)
202249

203250
##the rows have metadata
204251
samps<-t(exp_file[1:3,])
@@ -238,7 +285,11 @@ sanger_files<-function(fi,value){
238285
full<-res|>
239286
left_join(smap)
240287
rm(res)
241-
file.remove(fi)
288+
289+
file.remove(csv_path)
290+
if (file.exists(zip_path)) file.remove(zip_path)
291+
292+
242293
}else if(value=='miRNA'){ #if mirna expression
243294
exp_file <- readr::read_csv(fi)
244295

@@ -279,13 +330,20 @@ sanger_files<-function(fi,value){
279330
full<-res
280331

281332
}else if(value=='proteomics'){
282-
res=download.file(fi,'/tmp/tmp.zip')
283-
filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
333+
# res=download.file(fi,'/tmp/tmp.zip')
334+
# filist<-unzip('/tmp/tmp.zip',exdir='/tmp')
335+
336+
zip_path <- "/tmp/Proteomics_20221214.zip"
337+
download_and_extract_zip_httr2(fi, zip_path, "/tmp")
338+
tsv_path <- file.path("/tmp", "Protein_matrix_averaged_zscore_20221214.tsv")
339+
if (!file.exists(tsv_path)) stop("Expected proteomics TSV not found after unzip")
284340

285-
fi='/tmp/Protein_matrix_averaged_zscore_20221214.tsv'
286-
exp_file <- readr::read_tsv(fi,skip=1)[-1,-1]
341+
exp_file <- readr::read_tsv(tsv_path,skip=1)[-1,-1]
287342
colnames(exp_file)[1]<-'other_id'
288-
file.remove(fi)
343+
344+
file.remove(tsv_path)
345+
if (file.exists(zip_path)) file.remove(zip_path)
346+
289347
smap<-sanger_samples|>
290348
dplyr::select(improve_sample_id,other_id)|>distinct()
291349

@@ -339,7 +397,11 @@ depmap_files<-function(fi,value){
339397
##now every data type is parsed slightly differently, so we need to change our formatting
340398
##and mapping to get it into a unified 3 column schema
341399
if(value=='copy_number'){
342-
exp_file <- readr::read_csv(fi)
400+
# exp_file <- readr::read_csv(fi)
401+
local_path <- "/tmp/depmap_copy_number.csv.gz"
402+
robust_download_httr2(fi, local_path)
403+
exp_file <- readr::read_csv(local_path)
404+
343405

344406
print('Long to wide')
345407
res = exp_file|>
@@ -399,7 +461,11 @@ depmap_files<-function(fi,value){
399461

400462

401463
}else if(value=='mutations'){ ####IF DATA REPRESENTS MUTATIONS#####
402-
exp_file <- readr::read_csv(fi)|>
464+
465+
local_mut <- file.path(tempdir(), "depmap_mutations.csv.gz")
466+
robust_download_httr2(fi, local_mut)
467+
468+
exp_file <- readr::read_csv(local_mut)|>
403469
dplyr::select(EntrezGeneID,HgncName,other_id='ModelID',VariantInfo,mutation='DNAChange')|>
404470
distinct()
405471

@@ -439,7 +505,11 @@ depmap_files<-function(fi,value){
439505
print(head(full))
440506
return(full)
441507
}else if(value=='transcriptomics'){ #if gene expression
442-
exp_file <- readr::read_csv(fi)
508+
# exp_file <- readr::read_csv(fi)
509+
local_tx <- file.path(tempdir(), "depmap_transcriptomics.csv.gz")
510+
robust_download_httr2(fi, local_tx)
511+
exp_file <- readr::read_csv(local_tx)
512+
443513
print("wide to long")
444514
res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)),
445515
names_to='gene_entrez',values_to='transcriptomics',

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,37 @@
22
import argparse
33
from zipfile import ZipFile
44
import requests
5+
from requests.adapters import HTTPAdapter
6+
from urllib3.util import Retry
7+
8+
def robust_download(url, dest_path, max_retries=5):
    """Stream a URL to a local file with retries and size validation.

    Transient HTTP failures (429/5xx) and connection errors are retried up
    to ``max_retries`` times with exponential backoff; the body is streamed
    to ``dest_path`` in 1 MiB chunks. If the server reports a
    Content-Length (and no Content-Encoding that would change the byte
    count on the wire), the on-disk size is validated against it. On any
    failure the partial file is removed and ``RuntimeError`` is raised,
    mirroring the httr2 helper added on the R side of this pipeline.

    Args:
        url: Remote file URL.
        dest_path: Local path to write the downloaded bytes to.
        max_retries: Maximum retry attempts for transient failures.

    Raises:
        RuntimeError: If the download fails or the downloaded size does
            not match the server-reported Content-Length.
    """
    import os  # local import: stdlib, keeps this helper self-contained

    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods={"GET", "HEAD"},
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    expected = None
    try:
        # Context-manage the Session so pooled connections are released
        # even when the request raises.
        with requests.Session() as session:
            session.mount("https://", adapter)
            session.mount("http://", adapter)
            with session.get(url, stream=True, timeout=(5, 60)) as r:
                r.raise_for_status()
                # Only trust Content-Length when the body is not
                # transfer-compressed (iter_content decodes gzip/deflate,
                # so written bytes would not match the header then).
                if not r.headers.get("Content-Encoding"):
                    expected = r.headers.get("Content-Length")
                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # filter out keep-alive chunks
                            f.write(chunk)
    except requests.exceptions.RequestException as e:
        # Drop the partial file so a rerun starts clean.
        if os.path.exists(dest_path):
            os.remove(dest_path)
        raise RuntimeError(f"Failed to download {url}: {e}") from e

    if expected is not None:
        actual = os.path.getsize(dest_path)
        if actual != int(expected):
            os.remove(dest_path)
            raise RuntimeError(
                f"Incomplete download for {url}: "
                f"expected {expected} bytes but got {actual}"
            )
30+
531

632
def main():
733
parser = argparse.ArgumentParser()
8-
parser.add_argument('--sample',dest='samplefile',default=None,help='DepMap sample file')
9-
parser.add_argument('--gene',dest='genefile',default=None,help='DepMap sample file')
34+
parser.add_argument('--sample', dest='samplefile', default=None, help='DepMap sample file')
35+
parser.add_argument('--gene', dest='genefile', default=None, help='Gene file')
1036

1137
opts = parser.parse_args()
1238

@@ -53,15 +79,27 @@ def main():
5379
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5480

5581

82+
#old download, (was failing too much)
83+
# sanger_protfile='https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip'
84+
# r = requests.get(sanger_protfile)
85+
# sanger_loc ='/tmp/sp.zip'
86+
# open(sanger_loc , 'wb').write(r.content)
87+
# zf = ZipFile(sanger_loc,'r')
88+
# zf.extractall(path='/tmp/')
89+
# pdat = pd.read_csv('/tmp/Protein_matrix_averaged_zscore_20221214.tsv',sep='\t',skiprows=[0])
90+
5691
##now get sanger
57-
sanger_protfile='https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip'
58-
r = requests.get(sanger_protfile)
59-
sanger_loc ='/tmp/sp.zip'
60-
open(sanger_loc , 'wb').write(r.content)
61-
62-
zf = ZipFile(sanger_loc,'r')
63-
zf.extractall(path='/tmp/')
64-
pdat = pd.read_csv('/tmp/Protein_matrix_averaged_zscore_20221214.tsv',sep='\t',skiprows=[0])
92+
sanger_protfile = "https://cog.sanger.ac.uk/cmp/download/Proteomics_20221214.zip"
93+
sanger_loc = "/tmp/sp.zip"
94+
robust_download(sanger_protfile, sanger_loc)
95+
with ZipFile(sanger_loc, "r") as zf:
96+
zf.extractall(path="/tmp/")
97+
pdat = pd.read_csv(
98+
"/tmp/Protein_matrix_averaged_zscore_20221214.tsv",
99+
sep="\t",
100+
skiprows=[0],
101+
)
102+
65103
vv=pdat.columns[2:]
66104
plong = pd.melt(pdat,id_vars='symbol',value_vars=vv)
67105
pres = plong.rename({'symbol':'other_names','variable':'gene_symbol','value':'proteomics'},axis=1)

build/broad_sanger/omics_requirements.r

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ install.packages("dplyr")
77
install.packages("XML")
88
#install.packages('reticulate')
99
install.packages('tidyr')
10+
install.packages('httr2')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pandas
22
numpy
33
requests
4+
urllib3

build/broad_sanger/requirements.r

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ install.packages("XML")
1010
#install.packages('remotes')
1111
install.packages('reticulate')
1212
install.packages('tidyr')
13+
install.packages('httr2')
1314
#install.packages("BiocManager")
1415
BiocManager::install("PharmacoGx",update=TRUE,ask=FALSE)
1516
BiocManager::install("org.Hs.eg.db",update=TRUE,ask=FALSE)

build/broad_sanger/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ mordredcommunity
1212
rdkit
1313
coderdata==0.1.40
1414
psutil
15-
polars
15+
polars
16+
urllib3

scripts/map_improve_sample_ids.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def main():
412412
help='Build date in YYYY-MM-DD. Default=now.')
413413
parser.add_argument('--version', required=True,
414414
help='Build version. Must be unique per build.')
415-
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,cptac,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
415+
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
416416
help='Comma-separated list of datasets, e.g., beataml,ccle')
417417
parser.add_argument('--local_dir', default='data',
418418
help='Directory containing all CSV/TSV files.')

0 commit comments

Comments
 (0)