Skip to content

Commit cf73f12

Browse files
committed
added drug processing script
Currently untested as it is missing the sample matching component.
1 parent 7dc8824 commit cf73f12

6 files changed

Lines changed: 139 additions & 160 deletions

File tree

build/docker/Dockerfile.pancpdo

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ WORKDIR /usr/src/app
44

55
COPY build/pancpdo/01-createPancPDOSamplesFile.py .
66
COPY build/pancpdo/02-getPancPDOData.py .
7+
COPY build/pancpdo/03-getPancPDODrugs.py .
8+
COPY build/pancpdo/04-getPancPDOExperiments.py .
79
COPY build/pancpdo/full_manifest.txt .
810
COPY build/pancpdo/requirements.txt .
911
COPY build/pancpdo/*sh ./
1012
COPY build/pancpdo/pancpdo_cancer_types.csv ./
11-
13+
COPY build/utils/* ./
1214

1315
# Set MPLCONFIGDIR to a writable directory
1416
ENV MPLCONFIGDIR=/app/tmp/matplotlib

build/pancpdo/03-getPancPDODrugs.py

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pandas as pd
22
import os
33
import argparse
4-
import synapseclient
4+
import synapseclient as sc
55

66

77

@@ -12,37 +12,39 @@
1212
##get the third tab; the drugs are listed across the top
1313

1414

15-
1615

17-
def retrieve_figshare_data(url):
18-
"""
19-
Download data from a given Figshare URL.
20-
21-
Parameters
22-
----------
23-
url : string
24-
The Figshare URL to download data from.
25-
26-
Returns
27-
-------
28-
string
29-
Name of the downloaded file.
30-
"""
31-
32-
files_0 = os.listdir()
33-
wget.download(url)
34-
files_1 = os.listdir()
35-
new_file = str(next(iter(set(files_1) - set(files_0))))
36-
return new_file
16+
def getDrugNames(token="", synid="syn64333325"):
    """
    Retrieve the drug names used in the pancreatic PDO screen.

    Logs in to Synapse, downloads the workbook stored under `synid` and
    reads the column headers of its 'concentrations' sheet.

    Parameters
    ----------
    token : string
        Synapse personal access token. When empty or None, the locally
        cached Synapse credentials are used instead.
    synid : string
        Synapse id of the Excel workbook to download. The default is the
        id named in the --pat help text of this script.

    Returns
    -------
    list of string
        Lower-cased column names of the 'concentrations' sheet.
    """
    # The token has to be passed by keyword: the first positional
    # parameter of login() is the account name, not the auth token.
    syn = sc.login(authToken=token) if token else sc.login()
    fpath = syn.get(synid).path
    print(fpath)
    tab = pd.read_excel(fpath, sheet_name='concentrations')
    # str() guards against headers that pandas parsed as numbers
    return [str(a).lower() for a in tab.columns]
26+
27+
3728

3829
def main():
    """
    Build the pancpdo drug table from previously generated drug tables.

    The drug names used in the screen are fetched with getDrugNames() and
    looked up by `chem_name` in the tab-delimited tables passed through
    --prevDrugFile. Every row that shares an `improve_drug_id` with a
    matched name is written, so all synonyms of a matched drug are kept.
    The result is written to --output as a gzip-compressed,
    tab-delimited file.
    """
    parser = argparse.ArgumentParser(description='Download and match pancpdodrugs')
    parser.add_argument('-p', '--pat', help='Synapse authentication token with permission to syn64333325')
    parser.add_argument('-d', '--prevDrugFile', help='Comma-delimited list of previous drug files')
    parser.add_argument('-o', '--output', default='/tmp/pancpdo_drugs.tsv.gz')

    args = parser.parse_args()
    # Without previous drug tables there is nothing to match against; fail
    # with a usage message instead of an AttributeError on None.
    if not args.prevDrugFile:
        parser.error('--prevDrugFile is required (comma-delimited list of previous drug files)')

    screen_names = getDrugNames(args.pat)

    prevdrugs = [pd.read_csv(t, sep='\t') for t in args.prevDrugFile.split(',')]
    alldrugs = pd.concat(prevdrugs).drop_duplicates()

    # getDrugNames() returns lower-cased names, so compare case-insensitively
    known_names = alldrugs.chem_name.astype(str).str.lower()
    imps = alldrugs[known_names.isin(screen_names)]
    # keep every synonym row of each matched drug id
    newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

    ##write drugs
    newdrugs.to_csv(args.output, sep='\t', compression='gzip', index=False)

    ##calculate drug descriptors
4648

4749

4850
if __name__=='__main__':

build/pancpdo/04-getPancPDOExperiments.py

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,105 @@
33
import pandas as pd
44
import wget
55
import argparse
6-
import synapseclient
6+
import synapseclient as sc
7+
import math
78

89

910
def main():
    '''
    Download the pancpdo dose-response data, map it to improve ids and
    write it as a tab-delimited file for the curve fitting code.
    '''
    ##current AUC values are here: https://aacr.figshare.com/ndownloader/files/39996295 tabs 2 and 3
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--pat', help='Synapse authentication token')
    parser.add_argument('-s', '--samples', help='Sample mapping file for panc pdo samples')
    parser.add_argument('-d', '--drugs', help='Drug mapping file for panc pdo samples')
    parser.add_argument('-o', '--output', default='/tmp/pancpdo_doserep.tsv', help='Output file to be read into curve fitting code')

    args = parser.parse_args()
    newdata = get_data(args.pat)
    # get_data() reports the normalized response in 'PercResponse'
    newdata = newdata.rename(columns={'Organoid': 'other_id', 'Drug': 'chem_name', 'Dose': 'DOSE', 'PercResponse': 'GROWTH'})
    # copy so the constant columns below are added to an independent frame
    newdata = newdata[['other_id', 'chem_name', 'DOSE', 'GROWTH']].copy()
    newdata['time'] = '120'
    newdata['time_unit'] = 'hours'
    newdata['study'] = 'pancpdo'
    newdata['source'] = 'TiriacEtAl2018'
    # 'source', 'improve_sample_id', 'Drug', 'study','time','time_unit'
    mappedresponse = map_to_drugs_samps(newdata, args.drugs, args.samples)
    mappedresponse.to_csv(args.output, sep='\t', index=False)
29+
30+
def map_to_drugs_samps(dose_rep, drugfile, sampfile):
    '''
    Collect dose response data frame, map drugs and organoids to improve drug and sample ids

    dose_rep: data frame with columns other_id, chem_name, DOSE, GROWTH,
              time, time_unit, study and source
    drugfile: tab-delimited file with at least chem_name and improve_drug_id
    sampfile: comma-delimited file with at least other_id and improve_sample_id

    Returns a data frame with columns improve_sample_id, Drug, DOSE, GROWTH,
    time, time_unit, study and source. Measurements whose drug or organoid
    has no mapping are dropped (inner joins).
    '''
    drugs = pd.read_csv(drugfile, sep='\t')
    samps = pd.read_csv(sampfile)

    # Reduce both tables to de-duplicated key/id pairs: repeated rows would
    # multiply the measurements, and extra shared columns (e.g. 'study')
    # must not take part in the join.
    drug_map = drugs[['chem_name', 'improve_drug_id']].drop_duplicates()
    samp_map = samps[['other_id', 'improve_sample_id']].drop_duplicates()

    merged = dose_rep.merge(drug_map, on='chem_name', how='inner')
    merged = merged.merge(samp_map, on='other_id', how='inner')
    merged = merged[['improve_sample_id', 'improve_drug_id', 'DOSE', 'GROWTH', 'time', 'time_unit', 'study', 'source']]
    return merged.rename(columns={'improve_drug_id': 'Drug'})
41+
42+
def get_data(token, synid='syn64333325', doses_per_curve=15):
    '''
    Download the dose-response workbook from Synapse and return it in long format.

    token: Synapse authentication token
    synid: Synapse id of the Excel workbook
    doses_per_curve: number of consecutive rows (doses) that make up one curve

    The 'Sheet1' tab holds the responses and the 'concentrations' tab the
    doses. Returns one row per organoid, drug, replicate and dose with the
    columns of the melted response table (including Drug, Rep, Response,
    MaxRep and PercResponse) plus Dosenum and Dose.

    Raises ValueError when either table does not consist of complete
    blocks of `doses_per_curve` rows.

    NOTE(review): assumes every organoid block in 'Sheet1' lists its doses
    in the same order as the 'concentrations' tab -- confirm against the
    workbook.
    '''
    syn = sc.login(authToken=token)
    fpath = syn.get(synid).path
    print(fpath)
    concs = pd.read_excel(fpath, sheet_name='concentrations')

    responses = pd.read_excel(fpath, sheet_name='Sheet1').dropna(axis=0, how='all')
    responses = responses.reset_index(drop=True)

    ##kludgy way of fixing rows so that all data is in each row
    # The sample information is only present on the first row of each
    # organoid block; carry it down to the rows below. Only these columns
    # are blanked, so missing measurements stay NaN instead of becoming ''
    # (which would break the numeric max() below).
    id_cols = ['Organoid', 'Passage', 'Date', 'pate']
    for col in id_cols:
        responses[col] = responses[col].fillna('')
    current = dict.fromkeys(id_cols, '')
    filled = []
    for _, row in responses[id_cols].iterrows():
        if row['Organoid'] != "":
            current = {col: row[col] for col in id_cols}
        filled.append(current)
    filled = pd.DataFrame(filled, index=responses.index)
    for col in id_cols:
        responses[col] = filled[col]

    ##now melt the data into single columns
    rtab = responses.melt(id_vars=responses.columns[0:4], value_vars=responses.columns[4:10], var_name='Drug', value_name='Response')

    ##rename the drugs
    # headers look like 'drug' or 'drug.N'; the part after the dot is
    # stored as the replicate number, headers without a suffix get 0
    rtab[['Drug', 'Rep']] = rtab['Drug'].str.lower().str.split('.', expand=True)
    rtab['Rep'] = pd.to_numeric(rtab['Rep']).fillna(0).astype(int)

    ##renormalize values to max
    ##IMPORTANT: this is how we normalize without DMSO. We need to consider how we're doing this for EACH ORGANOID
    ##currently we take the max value of each organoid/replicate.
    rtab["MaxRep"] = rtab.groupby(['Drug', 'Organoid', 'Rep']).Response.transform('max')
    rtab['PercResponse'] = (rtab.Response / rtab.MaxRep) * 100.00

    ##dosenum is a dummy value to use for merging since we need to repeat the concentrations over and over
    if rtab.shape[0] % doses_per_curve != 0:
        raise ValueError('Response table has %d rows, which is not a multiple of %d doses per curve' % (rtab.shape[0], doses_per_curve))
    dosenum = list(range(doses_per_curve))
    rtab['Dosenum'] = dosenum * (rtab.shape[0] // doses_per_curve)

    ##merge the concentrations
    concs = concs.dropna().melt(value_vars=concs.columns, var_name='Drug', value_name='Dose')
    concs.Drug = concs.Drug.str.lower()
    if concs.shape[0] % doses_per_curve != 0:
        raise ValueError('Concentration table has %d rows, which is not a multiple of %d doses per curve' % (concs.shape[0], doses_per_curve))
    concs['Dosenum'] = dosenum * (concs.shape[0] // doses_per_curve)  ##creating dosenum here to merge

    return rtab.merge(concs, on=['Drug', 'Dosenum'])
105+
106+
if __name__=='__main__':
107+
main()

build/pancpdo/build_drugs.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Build the pancpdo drug table and its descriptor table.
#   $1  comma-delimited list of previously generated drug files
set -euo pipefail

# An unset token is passed as an empty string so that the python script can
# fall back to cached Synapse credentials instead of swallowing the next option.
/opt/venv/bin/python3 03-getPancPDODrugs.py --pat "${SYNAPSE_AUTH_TOKEN:-}" --prevDrugFile="$1" --output=/tmp/pancpdo_drugs.tsv.gz
/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/pancpdo_drugs.tsv.gz --desctable /tmp/pancpdo_drug_descriptors.tsv.gz

build/pancpdo/build_exp.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
# Build the pancpdo dose-response table and fit curves to it.
#   $1  sample mapping file
#   $2  drug mapping file
set -euo pipefail

trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running 04-getPancPDOExperiments.py with drugfile $2 and curSampleFile $1"
/opt/venv/bin/python 04-getPancPDOExperiments.py --pat "$SYNAPSE_AUTH_TOKEN" --drugs "$2" --samples "$1" --output /tmp/pancpdo_doserep.tsv
# The input must be the file written by the previous step.
# NOTE(review): confirm the curve fitting helper copied from build/utils is really named fit_curv.py
/opt/venv/bin/python fit_curv.py --input /tmp/pancpdo_doserep.tsv --output /tmp/pancpdo_experiments.tsv.gz

mpnstpdx_ignore_chems.txt

Lines changed: 0 additions & 130 deletions
This file was deleted.

0 commit comments

Comments
 (0)