Skip to content

Commit 71f624c

Browse files
committed
fix for #343
Here we separate out the omics data early on in the pipeline to ensure better assignment.
1 parent d982023 commit 71f624c

4 files changed

Lines changed: 26 additions & 15 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
## Cancer Omics Drug Experiment Response Dataset
22

3+
4+
35
There is a recent explosion of deep learning algorithms that tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.
46

57
This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,12 +566,16 @@ main<-function(){
566566
lapply(alltypes,function(dt){
567567
print(dt)
568568
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
569+
readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
569570
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
570-
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571+
readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
572+
573+
# readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571574
rm(tempd)
572575
rm(temps)
573576
})
574577

575578
}
576579

577580
main()
581+

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ def main():
4848

4949
full[['study']] = 'DepMap'
5050
full[['source']] = 'Broad'
51+
##now save to separate files
52+
full.dropna(axis=0)
53+
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5154

5255

5356
##now get sanger
@@ -69,9 +72,9 @@ def main():
6972
full2.loc[:,['study']] = 'Sanger'
7073
full2.loc[:,['source']] = 'Sanger'
7174

72-
full3 = pd.concat([full,full2])
73-
print(full3)
74-
full3.dropna(axis=0)
75-
full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
75+
#full3 = pd.concat([full,full2])
76+
#print(full3)
77+
full2.dropna(axis=0)
78+
full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
7679

7780
main()

build/broad_sanger/05b_separate_datasets.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ def main():
1313

1414

1515
dataset_sources = {
16-
"CCLE": ["Broad"],
17-
"CTRPv2": ["Broad"],
18-
"PRISM": ["Broad"],
19-
"GDSCv1": ["Sanger"],
20-
"GDSCv2": ["Sanger"],
21-
"FIMM": ["Broad"],
22-
"gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
23-
"NCI60": ["Broad"]
16+
"CCLE": ["broad"],
17+
"CTRPv2": ["broad"],
18+
"PRISM": ["broad"],
19+
"GDSCv1": ["sanger"],
20+
"GDSCv2": ["sanger"],
21+
"FIMM": ["broad"],
22+
"gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
23+
"NCI60": ["broad"]
2424
}
2525

2626
for dataset in datasets_to_process:
@@ -70,14 +70,16 @@ def main():
7070

7171
#One by one, filter other Omics files, write to file, delete from mem.
7272
for omics in omics_datatypes:
73-
omics_filename_in = f"broad_sanger_{omics}.csv"
73+
ds = dataset_sources[dataset][0]
74+
#print(ds)
75+
omics_filename_in = f"{ds}_{omics}.csv"
7476
if os.path.isfile(omics_filename_in + ".gz"):
7577
omics_filename_in += ".gz"
7678

7779
omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
7880
omics_df = pl.read_csv(omics_filename_in)
7981
omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
80-
omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
82+
# omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
8183
omics_df.write_csv(omics_filename_out) #csv
8284

8385
#Rewrite as gzipped if needed

0 commit comments

Comments
 (0)