Skip to content

Commit 2578dec

Browse files
committed
Revert "Reverted Sara's original changes, fixed Source issue in Sanger transcriptomics file"
This reverts commit 6faba50.
1 parent 6faba50 commit 2578dec

3 files changed

Lines changed: 28 additions & 20 deletions

File tree

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,7 @@ sanger_files<-function(fi,value){
206206
samps<-samps[-c(1:2),]|>as.data.frame()|>
207207
tibble::rownames_to_column('other_id')|>
208208
left_join(sanger_samples)|>
209-
dplyr::rename(source='data_source',study='dataset_name') %>%
210-
mutate(source = "Sanger")
209+
dplyr::rename(source='data_source',study='dataset_name')
211210

212211
missing<-subset(samps,is.na(improve_sample_id))|>
213212
dplyr::select(-c(other_id,improve_sample_id))|>
@@ -567,12 +566,16 @@ main<-function(){
567566
lapply(alltypes,function(dt){
568567
print(dt)
569568
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
569+
readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
570570
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
571-
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571+
readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
572+
573+
# readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
572574
rm(tempd)
573575
rm(temps)
574576
})
575577

576578
}
577579

578-
main()
580+
main()
581+

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ def main():
4848

4949
full[['study']] = 'DepMap'
5050
full[['source']] = 'Broad'
51+
##now save to separate files
52+
full.dropna(axis=0)
53+
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5154

5255

5356
##now get sanger
@@ -69,9 +72,9 @@ def main():
6972
full2.loc[:,['study']] = 'Sanger'
7073
full2.loc[:,['source']] = 'Sanger'
7174

72-
full3 = pd.concat([full,full2])
73-
print(full3)
74-
full3.dropna(axis=0)
75-
full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
75+
#full3 = pd.concat([full,full2])
76+
#print(full3)
77+
full2.dropna(axis=0)
78+
full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
7679

77-
main()
80+
main()

build/broad_sanger/05b_separate_datasets.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ def main():
1313

1414

1515
dataset_sources = {
16-
"CCLE": ["Broad"],
17-
"CTRPv2": ["Broad"],
18-
"PRISM": ["Broad"],
19-
"GDSCv1": ["Sanger"],
20-
"GDSCv2": ["Sanger"],
21-
"FIMM": ["Broad"],
22-
"gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrieve gCSI omics.
23-
"NCI60": ["Broad"]
16+
"CCLE": ["broad"],
17+
"CTRPv2": ["broad"],
18+
"PRISM": ["broad"],
19+
"GDSCv1": ["sanger"],
20+
"GDSCv2": ["sanger"],
21+
"FIMM": ["broad"],
22+
"gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrieve gCSI omics.
23+
"NCI60": ["broad"]
2424
}
2525

2626
for dataset in datasets_to_process:
@@ -70,14 +70,16 @@ def main():
7070

7171
#One by one, filter other Omics files, write to file, delete from mem.
7272
for omics in omics_datatypes:
73-
omics_filename_in = f"broad_sanger_{omics}.csv"
73+
ds = dataset_sources[dataset][0]
74+
#print(ds)
75+
omics_filename_in = f"{ds}_{omics}.csv"
7476
if os.path.isfile(omics_filename_in + ".gz"):
7577
omics_filename_in += ".gz"
7678

7779
omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
7880
omics_df = pl.read_csv(omics_filename_in)
7981
omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
80-
omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
82+
# omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
8183
omics_df.write_csv(omics_filename_out) #csv
8284

8385
#Rewrite as gzipped if needed
@@ -118,4 +120,4 @@ def main():
118120
gc.collect()
119121

120122
if __name__ == "__main__":
121-
main()
123+
main()

0 commit comments

Comments
 (0)