Skip to content

Commit 6faba50

Browse files
committed
Reverted Sara's original changes, fixed Source issue in Sanger transcriptomics file
1 parent 3bcae82 commit 6faba50

3 files changed

Lines changed: 20 additions & 28 deletions

File tree

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,8 @@ sanger_files<-function(fi,value){
206206
samps<-samps[-c(1:2),]|>as.data.frame()|>
207207
tibble::rownames_to_column('other_id')|>
208208
left_join(sanger_samples)|>
209-
dplyr::rename(source='data_source',study='dataset_name')
209+
dplyr::rename(source='data_source',study='dataset_name') %>%
210+
mutate(source = "Sanger")
210211

211212
missing<-subset(samps,is.na(improve_sample_id))|>
212213
dplyr::select(-c(other_id,improve_sample_id))|>
@@ -566,16 +567,12 @@ main<-function(){
566567
lapply(alltypes,function(dt){
567568
print(dt)
568569
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
569-
readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
570570
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
571-
readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
572-
573-
# readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571+
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
574572
rm(tempd)
575573
rm(temps)
576574
})
577575

578576
}
579577

580-
main()
581-
578+
main()

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,6 @@ def main():
4848

4949
full[['study']] = 'DepMap'
5050
full[['source']] = 'Broad'
51-
##now save to separate files
52-
full.dropna(axis=0)
53-
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5451

5552

5653
##now get sanger
@@ -72,9 +69,9 @@ def main():
7269
full2.loc[:,['study']] = 'Sanger'
7370
full2.loc[:,['source']] = 'Sanger'
7471

75-
#full3 = pd.concat([full,full2])
76-
#print(full3)
77-
full2.dropna(axis=0)
78-
full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
72+
full3 = pd.concat([full,full2])
73+
print(full3)
74+
full3.dropna(axis=0)
75+
full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
7976

80-
main()
77+
main()

build/broad_sanger/05b_separate_datasets.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ def main():
1313

1414

1515
dataset_sources = {
16-
"CCLE": ["broad"],
17-
"CTRPv2": ["broad"],
18-
"PRISM": ["broad"],
19-
"GDSCv1": ["sanger"],
20-
"GDSCv2": ["sanger"],
21-
"FIMM": ["broad"],
22-
"gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
23-
"NCI60": ["broad"]
16+
"CCLE": ["Broad"],
17+
"CTRPv2": ["Broad"],
18+
"PRISM": ["Broad"],
19+
"GDSCv1": ["Sanger"],
20+
"GDSCv2": ["Sanger"],
21+
"FIMM": ["Broad"],
22+
"gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
23+
"NCI60": ["Broad"]
2424
}
2525

2626
for dataset in datasets_to_process:
@@ -70,16 +70,14 @@ def main():
7070

7171
#One by one, filter other Omics files, write to file, delete from mem.
7272
for omics in omics_datatypes:
73-
ds = dataset_sources[dataset][0]
74-
#print(ds)
75-
omics_filename_in = f"{ds}_{omics}.csv"
73+
omics_filename_in = f"broad_sanger_{omics}.csv"
7674
if os.path.isfile(omics_filename_in + ".gz"):
7775
omics_filename_in += ".gz"
7876

7977
omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
8078
omics_df = pl.read_csv(omics_filename_in)
8179
omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
82-
# omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
80+
omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
8381
omics_df.write_csv(omics_filename_out) #csv
8482

8583
#Rewrite as gzipped if needed
@@ -120,4 +118,4 @@ def main():
120118
gc.collect()
121119

122120
if __name__ == "__main__":
123-
main()
121+
main()

0 commit comments

Comments
 (0)