|
3 | 3 | import pandas as pd |
4 | 4 | import wget |
5 | 5 | import argparse |
6 | | -import synapseclient |
| 6 | +import synapseclient as sc |
| 7 | +import math |
7 | 8 |
|
8 | 9 |
|
def main():
    """Download pancpdo dose-response data, reshape it, map organoids/drugs to
    improve IDs, and write a tab-separated file for the curve-fitting code."""
    ##current AUC values are here: https://aacr.figshare.com/ndownloader/files/39996295 tabs 2 and 3
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--pat', help='Synapse authentication token')
    parser.add_argument('-s', '--samples', help='Sample mapping file for panc pdo samples')
    parser.add_argument('-d', '--drugs', help='Drug mapping file for panc pdo samples')
    parser.add_argument('-o', '--output', default='/tmp/pancpdo_doserep.tsv',
                        help='Output file to be read into curve fitting code')

    args = parser.parse_args()
    newdata = get_data(args.pat)
    # NOTE(review): get_data() returns 'Response'/'PercResponse' columns, not
    # 'FracResponse', so the GROWTH rename below may never apply -- confirm
    # which response column is intended as GROWTH.
    newdata = newdata.rename(columns={'Organoid': 'other_id', 'Drug': 'chem_name',
                                      'Dose': 'DOSE', 'FracResponse': 'GROWTH',
                                      'Passage': 'time'})
    newdata = newdata[['other_id', 'chem_name', 'DOSE', 'GROWTH']]
    # Constant metadata columns expected downstream.
    # BUG FIX: was newdata[['time']]='...' -- double-bracket assignment of a
    # brand-new column is not supported across pandas versions; use single brackets.
    newdata['time'] = '120'
    newdata['time_unit'] = 'hours'
    newdata['study'] = 'pancpdo'
    newdata['source'] = 'TiriacEtAl2018'
# 'source', 'improve_sample_id', 'Drug', 'study','time','time_unit'
    # BUG FIX: original line called a nonexistent function (map_drugs_to_samps),
    # used a misspelled variable (newddata), and wrote `args,samples` instead of
    # `args.samples` -- it could not run as written.
    mappedresponse = map_to_drugs_samps(newdata, args.drugs, args.samples)
    mappedresponse.to_csv(args.output, sep='\t', index=False)
def map_to_drugs_samps(dose_rep, drugfile, sampfile):
    '''
    Join a dose-response table against the drug and sample mapping files,
    translating organoid and drug names into improve sample/drug identifiers.

    dose_rep: DataFrame with other_id/chem_name/DOSE/GROWTH plus metadata columns.
    drugfile: tab-separated file providing chem_name -> improve_drug_id.
    sampfile: comma-separated file providing other_id -> improve_sample_id.
    Returns the merged table with improve_drug_id renamed to Drug.
    '''
    drug_map = pd.read_csv(drugfile, sep='\t')
    samp_map = pd.read_csv(sampfile)

    # Default inner merges join on the shared name columns automatically.
    joined = dose_rep.merge(drug_map).merge(samp_map)
    keep = ['improve_sample_id', 'improve_drug_id', 'DOSE', 'GROWTH',
            'time', 'time_unit', 'study', 'source']
    return joined[keep].rename(columns={'improve_drug_id': 'Drug'})
| 41 | + |
def get_data(token):
    """
    Download the pancpdo organoid drug-response spreadsheet from Synapse and
    return a long-format table: one row per organoid/drug/replicate/dose with
    the raw response, a max-normalized percent response, and the concentration.

    token: Synapse personal access token used to authenticate the download.
    """
    synid = 'syn64333325'  # Synapse ID of the raw excel workbook

    syn = sc.login(authToken=token)
    fpath = syn.get(synid).path
    print(fpath)
    # One column per drug; each column holds that drug's dose series.
    concs = pd.read_excel(fpath,sheet_name='concentrations')

    responses = pd.read_excel(fpath,sheet_name='Sheet1').dropna(axis=0,how='all')

    ##kludgy way of fixing rows so that all data is in each row
    # The sheet only writes Organoid/Passage/Date/pate on the first row of each
    # group, so forward-fill the last non-empty values onto every row.
    newrows=[]
    org=''
    passage=''
    date=''
    pate=''
    responses = responses.fillna('').reset_index(drop=True)
    for rownum, row in responses.iterrows():
        if row['Organoid']!="":
            org = row['Organoid']
            passage = row['Passage']
            date = row['Date']
            # NOTE(review): 'pate' looks like a typo for 'plate' but matches the
            # sheet's actual column name -- confirm before renaming.
            pate = row['pate']
        newrows.append({'Organoid':org,'Passage':passage,'Date':date,'pate':pate})

    releft = pd.DataFrame(newrows)
    responses.Organoid = releft.Organoid
    responses.Passage = releft.Passage
    responses.pate = releft.pate
    responses.Date = releft.Date


    ##now melt the data into single columns
    # Columns 0-3 are metadata; columns 4-9 are the per-drug response columns.
    rtab = responses.melt(id_vars = responses.columns[0:4],value_vars=responses.columns[4:10], var_name='Drug',value_name='Response')

    ##rename the drugs
    # Replicate columns arrive as e.g. 'drug.1'; split the suffix off into Rep.
    rtab[['Drug','Rep']]=rtab['Drug'].str.lower().str.split('.',expand=True)
    newrep=[]
    for r in rtab.Rep:
        if r is None:
            # The unsuffixed first column has no split remainder: replicate 0.
            newrep.append(0)
        else:
            newrep.append(r)
    rtab.Rep=newrep

    ##renormalize values to max
    ##IMPORTANT: this is how we normalize without DMSO. We need to consider how we're doing this for EACH ORGANOID
    ##currently we take the max value of each orgnaoid/replicate.
    rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
    rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00


    ##dosenum isa dummy value to use for merging since we need to repeat the concentrations over and over
    # Assumes each drug was tested at exactly 15 doses -- TODO confirm against the sheet.
    dosenum = [a for a in range(15)]
    rtab['Dosenum']=dosenum*int(rtab.shape[0]/15)

    ##merge the concentrations
    # Reshape concentrations to long form and tag each row with its dose index.
    concs = concs.dropna().melt(value_vars=concs.columns,var_name='Drug',value_name='Dose')
    concs.Drug=concs.Drug.str.lower()
    concs['Dosenum'] = dosenum*int(concs.shape[0]/15)##creating dosenum here to merge

    # Inner join on the shared Drug/Dosenum columns attaches a Dose to every response.
    return rtab.merge(concs)
| 105 | + |
# Script entry point: only run the pipeline when executed directly.
if __name__ == '__main__':
    main()
0 commit comments