Skip to content

Commit 0e134b1

Browse files
committed
updated with panc pdo build working
1 parent f981006 commit 0e134b1

20 files changed

Lines changed: 114 additions & 265 deletions

File tree

build/build_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def process_drugs(executor, dataset, use_prev_dataset, should_continue):
9898
'''
9999
Build the drugs file for the specified dataset.
100100
'''
101-
if dataset in ['cptac', 'hcmi','pancpdo']:
101+
if dataset in ['cptac', 'hcmi']:
102102
return # No drugs to process for these datasets
103103

104104
drugs_file = f'local/{dataset}_drugs.tsv'
@@ -166,7 +166,7 @@ def process_experiments(executor, dataset, should_continue):
166166
'''
167167
Build the experiments files for the specified dataset.
168168
'''
169-
if dataset in ['cptac', 'hcmi','pancpdo']:
169+
if dataset in ['cptac', 'hcmi']:
170170
return # No experiments to process for these datasets
171171

172172
experiments_file = f'local/{dataset}_experiments.tsv'

build/docker/Dockerfile.pancpdo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ ENV MPLCONFIGDIR=/app/tmp/matplotlib
1717
RUN mkdir -p /app/tmp/matplotlib
1818

1919
RUN pip install --no-cache-dir -r requirements.txt
20-
20+
# JSON-array form requires double quotes; with single quotes Docker parses the
# text as a plain string and declares a volume at the literal path "['/tmp']".
VOLUME ["/tmp"]

build/pancpdo/01-createPancPDOSamplesFile.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,28 @@
55
import numpy as np
66

77

8+
#this is table S1 - it has a mapping from patient number to organoid
9+
sample_mapping='https://aacr.silverchair-cdn.com/aacr/content_public/journal/cancerdiscovery/8/9/10.1158_2159-8290.cd-18-0349/5/21598290cd180349-sup-199398_2_supp_4775186_p95dln.xlsx?Expires=1738004990&Signature=yngaaKNaXfIPCr-xLS2bDjX49n9py8JC7NBwi3q7m7ARYnK573eZwavFYmJOZVanL555vUWAr5x5k9b7IKj4VWHtZ-dts7BDzHd14AZh15LbsorJh-r3gjPliF7v1PIoAcGnEXjma2~kosmoDmyK0EDWXQCOE48tAaG5hFtaWAMMAINRMeBNgtDYk937Npc3Wb0IcGAdlgD2TJd8KJW2jQmcRspY1hfYssiS3BcWzuJrP-DVJeb-1V7-BnVNL6cVCkr7zHhau50H6aVgMVzk33F0gjCphl4r90OIx9UwE59hyNHbN9rFeeW26kDQpgCQKCj98Ol6CNQfLDsb2Zc5dQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA'
10+
11+
12+
def get_organoid_samples(sample_tab):
    '''
    Append organoid identifiers from the paper's table S1 to a samples table.

    Reads the 'Patient-Derived Organoid Cohort' sheet from the module-level
    `sample_mapping` URL, joins its 'Patient number' column to
    `sample_tab.common_name` (both compared as strings) and appends, for every
    matched sample, a row whose `other_id` is the organoid name and whose
    `other_id_source` is 'experimentId'.

    Parameters
    ----------
    sample_tab : pandas.DataFrame
        Samples table. Must contain the columns common_name, other_names,
        model_type, cancer_type, improve_sample_id and species.

    Returns
    -------
    pandas.DataFrame
        The input rows (common_name converted to strings) followed by the
        newly created organoid rows. The caller's frame is left untouched.
    '''
    id_cols = ['common_name', 'other_names', 'model_type', 'cancer_type', 'improve_sample_id', 'species']

    organoid_map = pd.read_excel(sample_mapping, sheet_name='Patient-Derived Organoid Cohort', skiprows=1)
    organoid_map = organoid_map[['Patient number', 'Organoid']].rename(
        columns={'Patient number': 'common_name', 'Organoid': 'experimentId'}
    )

    # Work on a copy so the caller's DataFrame is not modified in place.
    samples = sample_tab.copy()

    # Patient numbers may be parsed as ints on one side and strings on the
    # other; compare them as strings so the join keys line up.
    samples['common_name'] = samples['common_name'].map(str)
    organoid_map['common_name'] = organoid_map['common_name'].map(str)

    # join with the sample table on the only shared column
    matched = samples[id_cols].merge(organoid_map, on='common_name')

    # then add in organoid number as an additional identifier row
    organoid_rows = matched.melt(
        id_vars=id_cols,
        value_vars='experimentId',
        var_name='other_id_source',
        value_name='other_id',
    ).drop_duplicates()

    return pd.concat([samples, organoid_rows])
830

931
def align_to_linkml_schema(input_df):
1032
"""
@@ -320,6 +342,7 @@ def main():
320342
output = filter_and_subset_data(df,maxval,args.map)
321343
aligned = align_to_linkml_schema(output)
322344
print(aligned)
345+
aligned = get_organoid_samples(aligned)
323346
aligned.to_csv("/tmp/pancpdo_samples.csv",index=False)
324347

325348
main()

build/pancpdo/03-getPancPDODrugs.py

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,48 +2,52 @@
22
import os
33
import argparse
44
import synapseclient as sc
5-
5+
from pubchem_retrieval import update_dataframe_and_write_tsv
66

77

88
###figshare link:
99

1010
filelink='https://aacr.figshare.com/ndownloader/files/39996295'
11-
synid = 'syn64333325'
11+
#synid = 'syn64333325'
1212
##get third tab; drugs are listed across the top
1313

14-
14+
##sup table drug list (in column names)
15+
tablink = 'https://aacr.silverchair-cdn.com/aacr/content_public/journal/cancerdiscovery/8/9/10.1158_2159-8290.cd-18-0349/5/21598290cd180349-sup-199398_2_supp_4775187_p95dln.xlsx?Expires=1738004990&Signature=av8XadTm9AmI20O2Y7J7aHDtPbpluKJIfI5ubsoiYJ15D0zh5p1ltF4a7-DCSWTSMs-qX5TD09shxHeqkQ2NkLWHZsXoCD5KyREGhEgcDAvWZ1V9kwXDm0bjpINipAPPtC20oeuw6c~hPooF3Mtgzp4MzMCCjcVwfn05u27a0kS0yifBi11wQj3nmHlR3ym-2fYkFuqQtnNPCzH8-yIw21y0kTvXrNodAzC5pGA8qUK4PLxBt52xUIvTEPsPiPjXwBnDCfVsLGGdDYIY25lEPKiA403q6kFYvrSQ3bsTvM4kuvltb7yS4AXjK0-tthMOKbqq8~uREmJCcueADUF91g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA'
1516

1617
def getDrugNames(token=""):
    '''
    Collect the drug names listed as column headers in the supplementary table.

    Reads the second and third sheets (index 1 = chemo drugs, index 2 =
    targeted drugs) of the workbook at the module-level `tablink`, skipping
    the first row, and returns the lower-cased column names with the
    non-drug headers 'sample id' and 'insensitive' removed.

    Parameters
    ----------
    token : str, optional
        Unused. Kept so callers that still pass a token keep working.

    Returns
    -------
    set of str
        Lower-cased drug names.
    '''
    # Passing a list of sheets fetches the workbook once and returns a dict
    # keyed by sheet index, instead of downloading the same URL twice.
    sheets = pd.read_excel(tablink, sheet_name=[1, 2], skiprows=1)
    #chemo drugs
    ctab = sheets[1]
    #targeted drugs
    ttab = sheets[2]

    # str() guards against headers that pandas parsed as non-string values.
    drugs = {str(a).lower() for a in ctab.columns} | {str(a).lower() for a in ttab.columns}
    return drugs - {'sample id', 'insensitive'}
2726

2827

2928
def main():
    '''
    Build the pancpdo drug table.

    Drug names come from getDrugNames(). When previous drug files are given
    via --prevDrugFile, every row sharing an improve_drug_id with a matching
    chem_name is written to --output. If no previous drugs are available, or
    fewer drug ids were matched than there are names, the remaining lookup is
    delegated to update_dataframe_and_write_tsv.
    '''
    parser = argparse.ArgumentParser(description='Download and match pancpdodrugs')
    parser.add_argument('-d', '--prevDrugFile', default=None, help='Comma-delimited list of previous drug files')
    parser.add_argument('-o', '--output', default='/tmp/pancpdo_drugs.tsv')

    args = parser.parse_args()
    newdrugnames = getDrugNames()

    have_previous = False
    matched_ids = set()
    # Truthiness covers both None and "" (the shell wrapper passes an empty
    # value when it is called without an argument); `is not ""` compared
    # identity with a literal and is not a reliable emptiness test.
    if args.prevDrugFile:
        prevdrugs = [pd.read_csv(t, sep='\t') for t in args.prevDrugFile.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()
        have_previous = len(alldrugs) > 0

        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]
        matched_ids = set(newdrugs.improve_drug_id)

        ##write drugs
        newdrugs.to_csv(args.output, sep='\t', index=False)

    # Without previous files there is nothing to match against, so the table
    # lookups above are skipped entirely rather than attempted on an empty list.
    if not have_previous or len(newdrugnames) > len(matched_ids):  # we have more names we didn't match
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames, args.output)
4751
##calculate drug descriptors
4852

4953

build/pancpdo/04-getPancPDOExperiments.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import argparse
66
import synapseclient as sc
77
import math
8-
8+
import re
99

1010
def main():
1111
##current AUC values are here: https://aacr.figshare.com/ndownloader/files/39996295 tabs 2 and 3
@@ -17,14 +17,17 @@ def main():
1717

1818
args = parser.parse_args()
1919
newdata = get_data(args.pat)
20-
newdata = newdata.rename(columns={'Organoid':'other_id','Drug':'chem_name','Dose':'DOSE','FracResponse':'GROWTH','Passage':'time'})
20+
newdata = newdata.rename(columns={'Organoid':'other_id','Drug':'chem_name','Dose':'DOSE','PercResponse':'GROWTH','Passage':'time'})
21+
# print(newdata)
2122
newdata = newdata[['other_id','chem_name','DOSE','GROWTH']]
2223
newdata[['time']]='120'
2324
newdata[['time_unit']]='hours'
2425
newdata[['study']]='pancpdo'
2526
newdata[['source']]='TiriacEtAl2018'
27+
print('collected doses and response for '+str(len(set(newdata.chem_name)))+' drugs and '+str(len(set(newdata.other_id)))+' samples')
2628
# 'source', 'improve_sample_id', 'Drug', 'study','time','time_unit'
27-
mappedresponse = map_drugs_to_samps(newddata,args.drugs,args,samples)
29+
mappedresponse = map_to_drugs_samps(newdata,args.drugs,args.samples)
30+
print('mapped doses and response for '+str(len(set(mappedresponse.Drug)))+' drugs and '+str(len(set(mappedresponse.improve_sample_id)))+' samples')
2831
mappedresponse.to_csv(args.output, sep='\t', index=False)
2932

3033
def map_to_drugs_samps(dose_rep,drugfile,sampfile):
@@ -35,8 +38,10 @@ def map_to_drugs_samps(dose_rep,drugfile,sampfile):
3538
samps = pd.read_csv(sampfile)
3639

3740
merged = dose_rep.merge(drugs).merge(samps)
38-
merged = merged[['improve_sample_id','improve_drug_id','DOSE','GROWTH','time','time_unit','study','source']]
39-
merged = merged.rename(columns={'improve_drug_id':'Drug'})
41+
42+
merged = merged.rename(columns={'improve_drug_id':'Drug'})
43+
merged = merged[['improve_sample_id','Drug','DOSE','GROWTH','time','time_unit','study','source']].drop_duplicates()
44+
print(merged)
4045
return merged
4146

4247
def get_data(token):
@@ -72,10 +77,13 @@ def get_data(token):
7277

7378

7479
##now melt the data into single columns
75-
rtab = responses.melt(id_vars = responses.columns[0:4],value_vars=responses.columns[4:10], var_name='Drug',value_name='Response')
76-
80+
rtab = responses.melt(id_vars = responses.columns[0:4],value_vars=responses.columns[4:20], var_name='Drug',value_name='Response')
81+
print('Collected results from '+str(len(set(rtab.Drug)))+' drugs and '+str(len(set(rtab.Organoid)))+' organoids')
82+
#print(set(rtab.Drug))
7783
##rename the drugs
7884
rtab[['Drug','Rep']]=rtab['Drug'].str.lower().str.split('.',expand=True)
85+
rtab.Drug=[re.sub('-','',a) for a in rtab.Drug]
86+
#print(set(rtab.Drug))
7987
newrep=[]
8088
for r in rtab.Rep:
8189
if r is None:
@@ -94,12 +102,16 @@ def get_data(token):
94102
##dosenum is a dummy value to use for merging since we need to repeat the concentrations over and over
95103
dosenum = [a for a in range(15)]
96104
rtab['Dosenum']=dosenum*int(rtab.shape[0]/15)
97-
105+
106+
#print(set(rtab.Drug))
98107
##merge the concentrations
99108
concs = concs.dropna().melt(value_vars=concs.columns,var_name='Drug',value_name='Dose')
109+
print(concs)
110+
concs.Dose = [d*10.0**6.0 for d in concs.Dose] ## convert M to uM here
111+
100112
concs.Drug=concs.Drug.str.lower()
101113
concs['Dosenum'] = dosenum*int(concs.shape[0]/15)##creating dosenum here to merge
102-
114+
#print(set(concs.Drug))
103115

104116
return rtab.merge(concs)
105117

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
'''
2+
This script pulls down pre-computed curves and compares our fits with theirs
3+
'''
4+
5+
import pandas as pd
6+
import math
7+
import argparse
8+
9+
tablink = 'https://aacr.silverchair-cdn.com/aacr/content_public/journal/cancerdiscovery/8/9/10.1158_2159-8290.cd-18-0349/5/21598290cd180349-sup-199398_2_supp_4775187_p95dln.xlsx?Expires=1738004990&Signature=av8XadTm9AmI20O2Y7J7aHDtPbpluKJIfI5ubsoiYJ15D0zh5p1ltF4a7-DCSWTSMs-qX5TD09shxHeqkQ2NkLWHZsXoCD5KyREGhEgcDAvWZ1V9kwXDm0bjpINipAPPtC20oeuw6c~hPooF3Mtgzp4MzMCCjcVwfn05u27a0kS0yifBi11wQj3nmHlR3ym-2fYkFuqQtnNPCzH8-yIw21y0kTvXrNodAzC5pGA8qUK4PLxBt52xUIvTEPsPiPjXwBnDCfVsLGGdDYIY25lEPKiA403q6kFYvrSQ3bsTvM4kuvltb7yS4AXjK0-tthMOKbqq8~uREmJCcueADUF91g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA'
10+
11+
12+
13+
def main():
    '''
    Load the published chemo and targeted tables from the supplementary workbook.

    Currently a stub: the two sheets (index 1 = chemo, index 2 = targeted) are
    read from the module-level `tablink`, but nothing is compared or written yet.
    '''
    ##so far we have data for 'chemo' tab. how about the targeted tab?

    # A list of sheets is fetched in one request and returned as a dict keyed
    # by sheet index, so the workbook is not downloaded twice.
    sheets = pd.read_excel(tablink, sheet_name=[1, 2])
    chemo = sheets[1]
    targeted = sheets[2]

    # TODO: add in these scores to the drug file
    # TODO: get drug file
22+
23+
24+
25+
if __name__=='__main__':
26+
main()

build/pancpdo/build_drugs.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11

2-
/opt/venv/bin/python3 03-getPancPDODrugs.py --pat $SYNAPSE_AUTH_TOKEN --prevDrugFile=$1 --output=/tmp/pancpdo_drugs.tsv.gz
3-
/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/pancpdo_drugs.tsv.gz --desctable /tmp/pancpdo_drug_descriptors.tsv.gz
2+
# Quote the argument so a file list containing spaces is passed as one value.
python 03-getPancPDODrugs.py --prevDrugFile="$1" --output=/tmp/pancpdo_drugs.tsv
python build_drug_desc.py --drugtable /tmp/pancpdo_drugs.tsv --desctable /tmp/pancpdo_drug_descriptors.tsv.gz
4+

build/pancpdo/build_exp.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,9 @@ set -euo pipefail
44
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
55

66
echo "Running 04-drug_dosage_and_curves.py with drugfile $2 and curSampleFile $1"
7-
/opt/venv/bin/python 04-getPancPDOExperiments.py --pat $SYNAPSE_AUTH_TOKEN --drugs $2 --samples $1 --output /tmp/pancpdo_doserep.tsv
8-
/opt/venv/bin/python fit_curv.py --input /tmp/panpdo_doserep.tsv --output /tmp/pancpdo_experiments.tsv.gz
7+
# Quote expansions so paths or tokens with spaces are not word-split.
python 04-getPancPDOExperiments.py --pat "$SYNAPSE_AUTH_TOKEN" --drugs "$2" --samples "$1" --output /tmp/pancpdo_doserep.tsv
# NOTE(review): fit_curve.py is given no --output here; confirm that the file
# moved below holds the fitted results rather than the raw dose-response input.
python fit_curve.py --input /tmp/pancpdo_doserep.tsv

##now move file and gzip
mv /tmp/pancpdo_doserep.tsv /tmp/pancpdo_experiments.tsv
# -f overwrites a stale .gz from an earlier run instead of aborting under `set -e`.
gzip -f /tmp/pancpdo_experiments.tsv

build/pancpdo/build_omics.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@ python 02-getPancPDOData.py -m full_manifest.txt -t transcriptomics -o /tmp/panc
99
#echo "Running 02-getPancPDOData.py for copy_number."
1010
#python 02-getPancPDOData.py -m full_manifest.txt -t copy_number -o /tmp/pancpdo_copy_number.csv.gz -g $1 -s $2
1111

12-
echo "Running 02-getPancPDOData.py for mutations."
13-
python 02-getPancPDOData.py -m full_manifest.txt -t mutations -o /tmp/pancpdo_mutations.csv.gz -g $1 -s $2
12+
#echo "Running 02-getPancPDOData.py for mutations."
13+
#python 02-getPancPDOData.py -m full_manifest.txt -t mutations -o /tmp/pancpdo_mutations.csv.gz -g $1 -s $2

0 commit comments

Comments
 (0)