Skip to content

Commit 743e911

Browse files
Merge remote-tracking branch 'origin/main' into cdc_organoids
2 parents 04821c1 + d61a6ca commit 743e911

48 files changed

Lines changed: 535793 additions & 391 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
## Cancer Omics Drug Experiment Response Dataset
22

3+
4+
35
There is a recent explosion of deep learning algorithms that tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.
46

57
This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the

build/beatAML/GetBeatAML.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
653653
# New Transcriptomics Data
654654
print("Starting Transcriptomics Data")
655655
##first run conversion tool
656-
os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
656+
os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file,'tpm_'+transcriptomics_file))
657657

658658

659659
t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
8+
def get_bladder_pdo_samples(synLoginObject, maxval):
    """Build a standardized samplesheet for the bladder PDO project.

    Downloads the clinical sample table from Synapse, keeps the four
    identifying columns, renames them to the schema shared by the other
    dataset builders, lower-cases the free-text fields, and assigns new
    improve_sample_id values continuing from maxval + 1.

    Parameters
    ----------
    synLoginObject : logged-in synapseclient.Synapse
    maxval : int
        Largest improve_sample_id already in use; new ids start above it.

    Returns
    -------
    pd.DataFrame with columns other_id, common_name, cancer_type,
    model_type, species, other_id_source, other_names, improve_sample_id.
    """
    # Fetch the raw sample table from Synapse and load it as TSV.
    raw_entity = synLoginObject.get('syn64765486')
    raw_table = pd.read_csv(raw_entity.path, sep="\t")

    column_map = {
        "Sample ID": 'other_id',
        'Patient ID': 'common_name',
        'Cancer Type Detailed': 'cancer_type',
        'Sample Class': 'model_type',
    }
    samples = raw_table[list(column_map)].rename(columns=column_map)

    # Constant annotations shared by every row.
    samples.loc[:, ['species']] = 'Homo sapiens(Human)'
    samples.loc[:, ['other_id_source']] = 'Synapse'
    samples.loc[:, ['other_names']] = ''
    # Normalize free-text fields to lower case.
    samples.loc[:, ['cancer_type']] = samples['cancer_type'].str.lower()
    samples.loc[:, ['model_type']] = samples['model_type'].str.lower()

    # Continue the improve_sample_id sequence from the previous maximum.
    samples['improve_sample_id'] = range(maxval + 1, maxval + 1 + samples.shape[0])

    return samples
27+
28+
29+
if __name__ == "__main__":
30+
31+
parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Sarcoma PDO project into a single samplesheet")
32+
33+
parser.add_argument('-t', '--token', type=str, help='Synapse Token')
34+
35+
parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")
36+
37+
args = parser.parse_args()
38+
39+
print("Logging into Synapse")
40+
PAT = args.token
41+
synObject = synapseclient.login(authToken=PAT)
42+
43+
if (args.prevSamples):
44+
prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
45+
else:
46+
prev_max_improve_id = 0
47+
48+
bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)
49+
50+
bladder_pdo_samples.to_csv("/tmp/bladderpdo_samples.csv", index=False)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
import wget
7+
import gzip
8+
import subprocess
9+
import math
10+
11+
def get_copy_call(a):
    """Helper function - determine the discrete copy-number call for a value.

    The value is log2-transformed (with a small pseudocount to avoid
    log(0)) and bucketed into fixed ranges:
    deep del / het loss / diploid / gain / amp.

    Parameters
    ----------
    a : float or None
        Absolute copy-number estimate.

    Returns
    -------
    str category, or NaN for missing input (None or NaN).
    """
    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    a_val = math.log2(float(a) + 0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'
    # Bug fix: removed an unreachable trailing
    # `return pd.Series([get_copy_call(a) for a in arr])` — it sat after an
    # exhaustive if/elif/else and referenced an undefined name `arr`.
35+
36+
def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
    """Download and harmonize transcriptomics for the bladder PDO project.

    Downloads normalized counts from GEO (GSE103990), rescales each sample
    column to counts-per-million, maps ensembl ids to entrez ids via the
    genes table, melts to long format, and joins improve_sample_id through
    the GSM link table and the samplesheet.

    NOTE(review): the GEO_id_link_table argument is currently unused — the
    GSM link table is regenerated by obtainGSMidLink.R and read from
    ./gsmlinkDf.csv. Confirm whether the argument should be read instead.

    Returns
    -------
    pd.DataFrame with columns
    [entrez_id, transcriptomics, improve_sample_id, source, study].
    """
    bladderpdo_url = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz'
    transcriptomic_txt = wget.download(bladderpdo_url)
    transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")
    # Generate the GSM-id -> sample-id link table with R, then read it.
    subprocess.call(["/usr/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])

    GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")
    # Rescale each of the 42 sample columns so it sums to 1e6 (CPM-like).
    fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
    transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)
    # First column holds "<ensembl>_<suffix>" ids; keep the ensembl part.
    transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
    mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
    # transform data to long format
    # Bug fix: drop() result was previously discarded (a no-op).
    mapped_df = mapped_df.drop('other_id', axis=1)
    value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
    melted_txomics = mapped_df.melt(id_vars="entrez_id", value_vars=value_variables, var_name='sample_name')
    # use info from GEO to get Sample IDS
    txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how='left', left_on="sample_name", right_on='RNAid')
    # use samplesheet to link sample_ids to improve ids
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
    txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")
    # .copy() so the constant columns below don't hit SettingWithCopyWarning.
    final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']].copy()
    final_transcriptomics['source'] = "Gene Expression Omnibus"
    final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
    # Bug fix: rename() was previously not assigned back (and lacked axis=1),
    # so the 'value' column was never renamed to 'transcriptomics'.
    final_transcriptomics = final_transcriptomics.rename({'value': 'transcriptomics'}, axis=1)
    # remove duplicates
    toreturn = final_transcriptomics.drop_duplicates()

    return toreturn
67+
68+
def get_bladder_pdo_mutations(synObject, samples, genes):
    """Download and format mutation calls for the bladder PDO project.

    Pulls the MAF-style table from Synapse, extracts the coding change from
    the HGVSc column, and joins improve_sample_id from the samplesheet.

    `genes` is accepted for signature consistency with the other omics
    getters but is not used here.

    Returns
    -------
    pd.DataFrame with columns
    [entrez_id, mutation, variant_classification, improve_sample_id, study].
    """
    mutations = synObject.get("syn64765525")
    mutations_df = pd.read_csv(mutations.path, sep='\t')
    # HGVSc looks like "<transcript>:<change>"; keep the change part.
    mutations_df['mutation'] = mutations_df['HGVSc'].str.split(":", expand=True)[1]
    selectioncols_mutations = mutations_df[['Entrez_Gene_Id', "Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
    merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
    merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id": 'entrez_id', 'Variant_Classification': "variant_classification"}, axis=1)
    # .copy() so adding 'study' below doesn't hit SettingWithCopyWarning.
    # (Also removed debug `print(df.head)` calls, which printed the bound
    # method object rather than the data.)
    final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]].copy()
    final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
    return final_mutations
82+
83+
def get_bladder_pdo_copynumber(synObject, samples, genes):
    """Download and format copy-number data for the bladder PDO project.

    Downloads the seg file from Synapse, annotates segments with entrez
    gene ids via CNV-segfile-annotation.R, converts segment scores to
    absolute copy number, derives discrete copy calls via get_copy_call(),
    and joins improve_sample_id from the samplesheet.

    `genes` is accepted for signature consistency with the other omics
    getters but is not used here.

    Returns
    -------
    pd.DataFrame with columns
    [entrez_id, improve_sample_id, copy_number, copy_call, source, study].
    """
    segfile = synObject.get("syn64765499")
    segfile_df = pd.read_csv(segfile.path, sep='\t')

    segfile_df.to_csv("bladder_segfile.csv")
    # Annotate genomic ranges with entrez ids in R (writes the CSV read below).
    subprocess.call(["/usr/bin/Rscript", "--vanilla", "CNV-segfile-annotation.R", "bladder_segfile.csv", "bladder_annotated_segfile.csv"])
    copynumber = pd.read_csv("bladder_annotated_segfile.csv")
    # Convert score to absolute copy number as 2 * 2^(score/2);
    # assumes 'score' is the aggregated seg.mean — TODO confirm scaling.
    copynumber['copy_number'] = np.exp2(copynumber['score'].div(2)) * 2
    copynumber['copy_call'] = [get_copy_call(a) for a in copynumber['copy_number']]
    copynumber_with_improveids = copynumber.merge(samples, left_on='ID', right_on='other_id', how='left')
    copynumber_with_correct_colnames = copynumber_with_improveids.rename({"ENTREZID": 'entrez_id'}, axis=1)
    # .copy() so the constant columns below don't hit SettingWithCopyWarning.
    final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']].copy()
    final_copynumber['source'] = "Synapse"
    final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"

    return final_copynumber
99+
100+
101+
102+
103+
if __name__ == "__main__":
104+
print('in main')
105+
parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
106+
parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
107+
parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
108+
parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
109+
parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
110+
parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
111+
parser.add_argument('-i', '--geolink', help=".csv file that is the output of 'CNV-segfile-anotation.R")
112+
parser.add_argument('-t', '--token', help='Synapse token')
113+
114+
args = parser.parse_args()
115+
print("Logging into Synapse")
116+
PAT = args.token
117+
synObject = synapseclient.login(authToken=PAT)
118+
print('gene file is:')
119+
print(args.genes)
120+
print('sample file is :')
121+
print(args.samples)
122+
genes = pd.read_csv(args.genes)
123+
samples = pd.read_csv(args.samples)
124+
125+
if args.expression:
126+
get_bladder_pdo_transcriptomics(args.geolink, samples, genes).to_csv("/tmp/bladderpdo_transcriptomics.csv", index=False)
127+
128+
if args.mutation:
129+
get_bladder_pdo_mutations(synObject, samples, genes).to_csv('/tmp/bladderpdo_mutations.csv', index=False)
130+
131+
if args.copy:
132+
get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copynumber.csv", index=False)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
# for testing locally
7+
#from utils.pubchem_retrieval import update_dataframe_and_write_tsv
8+
# for building in docker
9+
from pubchem_retrieval import update_dataframe_and_write_tsv
10+
11+
12+
def create_bladder_pdo_drugs_file(synObject, prevDrugFilepath, outputPath):
    """Build the drugs file for the bladder PDO project.

    Drug names are parsed from the dose-response file names in Synapse
    folder syn64765430. Names already present in prevDrugFilepath (a
    comma-separated list of previous drug TSVs) are reused and written to
    outputPath; anything unmatched is queried from PubChem via
    update_dataframe_and_write_tsv.

    Parameters
    ----------
    synObject : logged-in synapseclient.Synapse
    prevDrugFilepath : str or None
        Comma-separated previous drug file paths, or ''/None for none.
    outputPath : str
        Destination TSV for the drug table.
    """
    bladder_dir = synObject.get('syn64765430')
    filenames = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))
    # The last 4 files in this directory are not drug dose-response files.
    # File names look like "(<n>) <drug name> (<abbrev>).<ext>".
    # Bug fix: the name list was previously seeded with the `str` type object
    # as a placeholder row, which leaked into unique() when no drug files
    # were present.
    parsed_names = [
        entry['name'].split(")")[1].split("(")[0].split(".")[0].strip()
        for entry in filenames[:len(filenames) - 4]
    ]
    # get unique drugs
    newdrugnames = pd.Series(parsed_names, dtype=object).unique()

    # use helper functions in pubchem_retrieval.py
    alldrugs = []
    newdrugs = pd.DataFrame()
    # Bug fix: comparison used `is not ""` (identity test against a literal,
    # a SyntaxWarning) instead of `!= ""`.
    if prevDrugFilepath is not None and prevDrugFilepath != "":
        prevdrugs = [pd.read_csv(t, sep='\t') for t in prevDrugFilepath.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()

        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

        ##write drugs
        newdrugs.to_csv(outputPath, sep='\t', index=False)

    if len(alldrugs) == 0 or len(newdrugnames) > len(set(newdrugs.improve_drug_id)):  # we have more names we didn't match
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames, outputPath)
37+
38+
39+
if __name__ == "__main__":
40+
41+
parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Lee Bladder PDO project")
42+
parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for sarcpdo', nargs="?", default = None)
43+
parser.add_argument('-o', '--outputPath', help='Output path for updated sarcpdo drug file', default = "/tmp/sarcpdo_drugs.tsv")
44+
parser.add_argument('-t', '--token', help='Synapse token')
45+
46+
args = parser.parse_args()
47+
print("Logging into Synapse")
48+
PAT = args.token
49+
synObject = synapseclient.login(authToken=PAT)
50+
if args.prevDrugFilePath:
51+
previousDrugs = args.prevDrugFilePath
52+
else:
53+
previousDrugs = None
54+
create_bladder_pdo_drugs_file(synObject, previousDrugs, args.outputPath)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import synapseclient
2+
import pandas as pd
3+
import argparse
4+
5+
6+
def get_bladder_pdo_experiments(synObject, samples, drugs):
    """Assemble long-format dose-response data for the bladder PDO project.

    For each drug dose-response file in Synapse folder syn64765430, melts
    the dose-by-sample table, maps sample names to improve_sample_id via a
    conversion table and the samplesheet, attaches improve_drug_id from the
    drugs table, and concatenates everything into one dataframe with
    columns [DOSE, GROWTH, improve_sample_id, Drug, study, source, time,
    time_unit].

    NOTE(review): relies on positional assumptions about the folder
    contents — files[50] is taken to be the conversion-table Excel file and
    the last 4 files are non-drug files. Confirm against the Synapse folder.
    """
    # get list of syn id info
    files = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))
    # load sample sheet and format _ to .
    # load conversion table and remove trailing _T
    conversion_table = synObject.get(files[50]['id'])
    conversion_table_df = pd.read_excel(conversion_table.path, header=None)
    # Column 1 holds names like "<id>_T"; column 2 keeps the part before the
    # last underscore, column 3 swaps "." for "_" to match the samplesheet.
    conversion_table_df[2] = conversion_table_df[1].str.rsplit("_", expand=True)[0]#.replace(".", "_")
    conversion_table_df[3] = conversion_table_df[2].str.replace(".", "_")
    #print(conversion_table_df.head)

    # initiate empty pd.dat.frame
    drug_df = pd.DataFrame()
    # for each drug, (the last 4 files in the folder are not drug files)
    for i in range(len(files)-4):
        drug_table_syn =synObject.get(files[i]['id'])
        drug_table = pd.read_csv(drug_table_syn.path, sep="\t")
        # melt
        # link to conversion table
        # link to sample sheet
        # Rename, add columns
        # First column ('Unnamed: 0') is the dose; remaining columns are samples.
        melted_single_drug = drug_table.melt(id_vars = 'Unnamed: 0', value_vars = drug_table.columns[1:], var_name="sample")
        # Sample column headers look like "<linkID>.<suffix>"; keep the prefix.
        melted_single_drug['linkID'] = melted_single_drug['sample'].str.split(".", expand=True)[0]
        drugdata = melted_single_drug.merge(conversion_table_df, left_on = 'linkID', right_on = 0, how='left')[['Unnamed: 0', 'value', 3]]
        #print(drugdata.head)
        # Conversion-table column 3 matches the samplesheet's common_name.
        drugdata_with_improvesample = drugdata.merge(samples, left_on = 3, right_on='common_name')

        # print(drugdata_with_improvesample.head)
        drugdata_with_improvesample = drugdata_with_improvesample[['Unnamed: 0', 'value', 'improve_sample_id']]
        #print(drugdata_with_improvesample.columns)
        drugdata_with_improvesample = drugdata_with_improvesample.rename({"Unnamed: 0" : "DOSE", 'value' : 'GROWTH'}, axis=1)
        #print(drugdata_with_improvesample.columns)

        selected_drugdata = drugdata_with_improvesample
        # Parse the drug name out of the file name: "(<n>) <name> (<abbrev>).<ext>".
        selected_drugdata['chem_name'] = files[i]['name'].split(")")[1].split("(")[0].split(".")[0].strip().lower()
        #print(selected_drugdata.head)
        # Join improve_drug_id on chem_name (implicit merge key).
        drugdata_with_both_improveIds = selected_drugdata.merge(drugs[['improve_drug_id', 'chem_name']], how='left')
        final_drugdata = drugdata_with_both_improveIds[['DOSE', 'GROWTH', 'improve_sample_id', 'improve_drug_id']]
        final_drugdata = final_drugdata.rename({'improve_drug_id' : "Drug"}, axis=1)
        final_drugdata['study'] = "Lee etal 2018 Bladder PDOs"
        final_drugdata['source'] = "Synapse"
        # Fixed assay metadata; presumably a 6-day treatment — TODO confirm.
        final_drugdata['time'] = 6
        final_drugdata['time_unit'] = 'days'
        #print(final_drugdata.head)
        # append to dataframe
        drug_df = pd.concat([drug_df, final_drugdata])

    return drug_df
54+
55+
56+
if __name__ == "__main__":
57+
parser = argparse.ArgumentParser()
58+
parser.add_argument('-t', '--token', help='Synapse authentication token')
59+
parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples')
60+
parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples')
61+
parser.add_argument('-o', '--output', default = '/tmp/bladderpdo_doserep.tsv',help='Output file to be read into curve fitting code')
62+
63+
args = parser.parse_args()
64+
print("Logging into Synapse")
65+
PAT = args.token
66+
synObject = synapseclient.login(authToken=PAT)
67+
drug_df = pd.read_csv(args.drugfile, sep='\t')
68+
samples_df = pd.read_csv(args.curSampleFile)
69+
70+
doseresponse_data = get_bladder_pdo_experiments(synObject, samples_df, drug_df)
71+
doseresponse_data.to_csv(args.output, sep='\t')
72+
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env Rscript
2+
args = commandArgs(trailingOnly=TRUE)
3+
4+
# If BiocManager not installed, install it
5+
if (!require("BiocManager", quietly = TRUE))
6+
install.packages("BiocManager")
7+
BiocManager::install(version = "3.20")
8+
9+
library(GenomicRanges)
10+
library(Homo.sapiens)
11+
12+
13+
# function to return gene mappings for each coordinate range (row) in the segfile
14+
splitColumnByOverlap <-
15+
function(query, subject, column="ENTREZID", ...)
16+
{
17+
olaps <- findOverlaps(query, subject, ...)
18+
f1 <- factor(subjectHits(olaps),
19+
levels=seq_len(subjectLength(olaps)))
20+
splitAsList(mcols(query)[[column]][queryHits(olaps)], f1)
21+
}
22+
23+
segfile = read.csv(args[1])
24+
25+
# create genomic ranges object from segfile for use with findOverlaps()
26+
gr <- GRanges(seqnames = Rle(paste0('chr', segfile$chrom)),
27+
ranges = IRanges(segfile$loc.start, end = segfile$loc.end),
28+
strand = Rle(c("-", "0", "+")[sign(segfile$loc.start) +2]),
29+
score = segfile$seg.mean,
30+
ID = segfile$ID)
31+
32+
# create genes GRanges obj from database
33+
genes<- genes(Homo.sapiens, columns =c("ENTREZID"))
34+
35+
geneHitsByRow <- splitColumnByOverlap(genes, gr, "ENTREZID")
36+
37+
# create matrices of annotations and scores/patient IDs in segfile
38+
# with a catch if there are no gene hits found
39+
matrixresults <- list()
40+
noresults <- c()
41+
for (i in 1:length(geneHitsByRow)){
42+
if(length(geneHitsByRow[[i]])>0){
43+
44+
matrixresults[[i]] <- data.frame(ENTREZID = as.matrix(geneHitsByRow[[i]]), rep(mcols(gr[i,]), length(geneHitsByRow[[i]])))
45+
}else{
46+
noresults <- c(noresults, i)
47+
matrixresults[[i]] <- data.frame(ENTREZID = NA, mcols(gr)[i,])
48+
}
49+
}
50+
# concatenate annotated matrices
51+
allCNV <- do.call(rbind, matrixresults)
52+
53+
# drop NAs
54+
completeAllCNV <- allCNV[complete.cases(allCNV),]
55+
# aggregate scores for the same genes (that came from different regions) for the same patient ID
56+
aggregatedAllCNV <- aggregate(completeAllCNV$score, by = list(ENTREZID =completeAllCNV$ENTREZID, ID = completeAllCNV$ID), FUN=mean)
57+
names(aggregatedAllCNV)[names(aggregatedAllCNV)=='x'] <- "score"
58+
# write results to csv for further processing in Python
59+
write.csv(aggregatedAllCNV, args[2], row.names=F, quote =F)

build/bladderpdo/README.md

Whitespace-only changes.

0 commit comments

Comments
 (0)