Skip to content

Commit 08c2f2f

Browse files
committed
Addition of drugs and experiments, alterations to calc_pdx_metrics
Addition of drug data (~12 drugs could not be matched) and experiments from `calc_pdx_metrics.py`. `calc_pdx_metrics.py` was altered to skip AUC calculations with fewer than 2 points (the linear model would not converge).
1 parent 7c37952 commit 08c2f2f

7 files changed

Lines changed: 223 additions & 7 deletions

File tree

build/novartispdx/02-omics-novartispdx.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,33 @@
44
import math
55
import argparse
66

7+
8+
def get_copy_call(a):
    """
    Helper function - determine the copy call for a single value.

    The value is log2-transformed (after adding 0.000001 so that an input of
    zero does not raise) and binned against fixed thresholds.

    Parameters
    ----------
    a : float or None
        Copy-number value to classify. Anything accepted by ``float()``
        is allowed.

    Returns
    -------
    str or float
        One of 'deep del', 'het loss', 'diploid', 'gain' or 'amp'; NaN when
        the input is None or NaN.
    """
    if a is None:
        return float('nan')

    # Coerce once up front so the NaN check and the log use the same value.
    a = float(a)
    if math.isnan(a):
        return float('nan')

    a_val = math.log2(a + 0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'
32+
33+
734
def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None):
835
"""
936
Download omics data from Synapse at synapseID syn66364488. Requires a synapse token, which requires you to make a Synapse account
@@ -167,4 +194,21 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat
167194
return(sample_entrez_transcriptomics_df)
168195

169196

197+
if __name__ == "__main__":
    # Command-line entry point for the Novartis PDX omics build step.
    print('in main')
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Novartis PDX data")
    parser.add_argument('-s', '--samples', help='Path to sample file', default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default=None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token

    genes = pd.read_csv(args.genes)
    samples = pd.read_csv(args.samples)

    # The previous call (`syn id,savestring`) was not valid Python and used an
    # undefined name.
    # NOTE(review): the Synapse ID is taken from the docstring of
    # download_parse_omics_novPDX, and save_path is left at its default -
    # confirm both are what this step should use.
    data = download_parse_omics_novPDX('syn66364488', save_path=None, synToken=PAT)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
# for testing locally
7+
from utils.pubchem_retrieval import update_dataframe_and_write_tsv
8+
# for building in docker
9+
#from pubchem_retrieval import update_dataframe_and_write_tsv
10+
11+
12+
def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath):
    """
    Build the Novartis PDX drug table and write it to `outputPath`.

    Drug names are taken from the 'Treatment' column of Synapse entity
    syn66276102. Names that match `chem_name` in the previous drug file(s)
    are reused with their existing `improve_drug_id`; if there is no previous
    file, or not every name could be matched, PubChem is queried through
    `update_dataframe_and_write_tsv`.

    Parameters
    ----------
    synObject :
        Logged-in Synapse client; only its `.get()` method is used.
    prevDrugFilepath : str or None
        Comma-separated path(s) to existing tab-separated drug files.
        None or an empty string means there are no previous files.
    outputPath : str
        Destination of the tab-separated drug file.
    """
    file = synObject.get('syn66276102')
    # read raw drug data from synapse
    rawDrugData = pd.read_csv(file.path)
    # split on + operator - there are 2- and one 3- way drug combos in this dataset
    sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True)
    ### NEED TO ALSO remove drug names with different dose info

    # taking the drug names from the first and second column from the split - there is only one
    # drug name in the 3rd column (one 3-way combo) that is replicated in other treatments as well
    alldrugnames = (
        pd.concat([sepDrugNames[0], sepDrugNames[1]])
        .dropna()
        .str.split('"', expand=True)[0]   # keep the text before any double quote
        .str.split("-", expand=True)[0]   # drop dose info appended after '-'
    )
    finalDrugNames = pd.Series(alldrugnames.unique()).str.strip().unique()
    # get unique drugs
    newdrugnames = finalDrugNames[finalDrugNames != 'untreated']
    print(newdrugnames)

    # use helper functions in pubchem_retrieval.py
    alldrugs = []
    # Truthiness covers both None and "". The previous `is not ""` compared
    # object identity with a literal, which is not a reliable emptiness test.
    if prevDrugFilepath:
        prevdrugs = [pd.read_csv(t, sep='\t') for t in prevDrugFilepath.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()

        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

        ##write drugs
        newdrugs.to_csv(outputPath, sep='\t', index=False)

    # `newdrugs` only exists when previous files were read; the len(alldrugs)
    # check short-circuits before it is touched otherwise.
    if len(alldrugs) == 0 or len(newdrugnames) > len(set(newdrugs.improve_drug_id)):  # we have more names we didn't match
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames, outputPath)
51+
52+
53+
if __name__ == "__main__":
    # Command-line entry point: log into Synapse and build the drug file.
    parser = argparse.ArgumentParser(
        description="This script handles downloading, processing and formatting of drug data files for the Novartis PDX data"
    )
    parser.add_argument(
        '-d', '--prevDrugFilePath',
        help='Path to a previous drug file for bladderpdo',
        nargs="?",
        default=None,
    )
    parser.add_argument(
        '-o', '--outputPath',
        help='Output path for updated novartispdx drug file',
        default="/tmp/novartispdx_drugs.tsv",
    )
    parser.add_argument('-t', '--token', help='Synapse token')
    args = parser.parse_args()

    print("Logging into Synapse")
    PAT = args.token
    print("after PAT assignment")
    synObject = synapseclient.login(authToken=PAT)
    print('after creating synObject')

    # A missing or empty path is normalised to None.
    previousDrugs = args.prevDrugFilePath or None
    create_novartis_pdx_drugs_file(synObject, previousDrugs, args.outputPath)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
def get_novartis_pdx_experiments_file(synObject, samples_df=None, drug_df=None):
    """
    Build the growth-curve table used as input for the calc_pdx_metrics script.

    Reads Synapse entity syn66276102, renames its columns to
    model_id/time/volume/treatment, normalises treatment names, and drops
    experiments that cannot be compared against a control.

    Parameters
    ----------
    synObject :
        Logged-in Synapse client; only its `.get()` method is used.
    samples_df, drug_df :
        Sample and drug mapping tables. Accepted because the script entry
        point passes them, but not used yet (linking to improve ids is
        still to be done).

    Returns
    -------
    pandas.DataFrame
        Columns: model_id, time, volume, treatment, experiment, dose.
    """
    file1 = synObject.get('syn66276102')
    rawDrugData = pd.read_csv(file1.path)
    # STILL NEED TO : link to improve ids.
    # update a few drug ids for greater inclusion
    curve = rawDrugData[['Model', 'Days Post T0', 'Volume (mm3)', 'Treatment']].rename(
        {'Model': 'model_id', 'Days Post T0': 'time', 'Volume (mm3)': 'volume', 'Treatment': 'treatment'},
        axis=1,
    )
    curve['treatment'] = (
        curve['treatment']
        .str.lower()
        .str.replace('"', '', regex=False)
        .str.replace('untreated', 'control', regex=False)
    )
    # one experiment per PDX model, numbered from 1
    curve['experiment'] = curve.groupby(['model_id']).ngroup() + 1

    # remove triple combination(s); copy so the column assignments below act
    # on an independent frame rather than a slice of the previous one
    curve = curve[~curve['treatment'].str.contains(r'\+.*\+')].copy()

    # remove dose information appended to some drugs in the treatment column
    # and include it in the dose column
    druganddose = curve['treatment'].str.split('-', expand=True)
    if 0 in druganddose.columns:
        curve['treatment'] = druganddose[0]
    # column 1 only exists when at least one treatment contains '-'
    curve['dose'] = druganddose[1] if 1 in druganddose.columns else np.nan

    # remove pdxs with only one drug treatment (no control)
    treatments_per_experiment = curve.groupby('experiment')['treatment'].nunique()
    todiscard = treatments_per_experiment[treatments_per_experiment == 1].index
    curve = curve[~curve['experiment'].isin(todiscard)]

    # remove groups with no 'control' treatment
    has_control = (
        curve.groupby('experiment')['treatment']
        .apply(lambda x: x.str.contains('control').any())
        .astype(bool)
    )
    missingcontrols = has_control[~has_control].index
    finalcurvefile = curve[~curve['experiment'].isin(missingcontrols)]
    return finalcurvefile
40+
#finalcurvefile.to_csv('/tmp/novartispdx_doserep.tsv', sep="\t")
41+
42+
43+
if __name__ == "__main__":
    # Command-line entry point: log into Synapse, read the mapping files and
    # write the curve table produced by get_novartis_pdx_experiments_file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--token',
        help='Synapse authentication token',
    )
    parser.add_argument(
        '-s', '--curSampleFile',
        help='Sample mapping file for bladder pdo samples',
    )
    parser.add_argument(
        '-d', '--drugfile',
        help='Drug mapping file for bladder pdo samples',
    )
    parser.add_argument(
        '-o', '--output',
        default='/tmp/novartispdx_doserep.tsv',
        help='Output file to be read into curve fitting code',
    )
    args = parser.parse_args()

    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)

    # The drug file is tab-separated; the sample file uses read_csv defaults.
    drug_df = pd.read_csv(args.drugfile, sep='\t')
    samples_df = pd.read_csv(args.curSampleFile)

    doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df, drug_df)
    doseresponse_data.to_csv(args.output, sep='\t')
59+

build/novartispdx/build_drugs.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# Build the Novartis PDX drug table and its descriptor table.
# Usage: build_drugs.sh <previous drug file(s)>
# Requires SYNAPSE_AUTH_TOKEN in the environment (set -u aborts if it or $1 is unset).
set -euo pipefail

trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running script with token and drugFile $1"
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as a single argument.
# for running locally (from build directory):
python3 -m novartispdx.03-drugs-novartispdx --token "$SYNAPSE_AUTH_TOKEN" -d "$1" -o /tmp/novartispdx_drugs.tsv
# for building in docker:
#python3 03-drugs-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -d "$1" -o /tmp/novartispdx_drugs.tsv

echo "Running build_drug_desc.py..."
#for running locally:
python3 utils/build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz
# for building in docker:
#python3 build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz

build/novartispdx/build_experiments.sh

Whitespace-only changes.

build/novartispdx/build_omics.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
# Build the Novartis PDX omics tables (mutation, expression, copy number).
# Usage: build_omics.sh <genes file> <current samples file>
# Requires SYNAPSE_AUTH_TOKEN in the environment (set -u aborts if any input is unset).
set -euo pipefail

trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running script with token, curSamples $2, and genes $1."
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as a single argument.
# for mutation data (-m)
python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -m
# for expression data (-e)
python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -e
# for copy number (-c)
python3 02-omics-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -s "$2" -g "$1" -c

build/utils/calc_pdx_metrics.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True):
184184
dict: Dictionary containing the AUC value.
185185
"""
186186
auc = trapz_auc(time, volume)
187-
#print(time)
187+
print('at line 187')
188+
print(time.shape)
189+
print(time.dtype)
190+
print(np.max(time.astype(int)))
191+
print('auc is : ')
192+
print(auc)
188193
if time_normalize:
189194
auc = auc/np.max(time)
190195
return {"metric": "auc", "value": auc, 'time':np.max(time)}
@@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name):
270275
raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'")
271276

272277
data['log_volume'] = np.log(data['volume'])
273-
278+
print('drug name is ' + drug_name)
279+
data['exp_type'] = data['exp_type'].astype('category')
280+
data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True)
281+
print(data)
282+
print(data['exp_type'].cat.categories)
274283
# Define the formula for mixed linear model
275284
formula = 'log_volume ~ time*exp_type'
276285

286+
#print(data['exp_type'].cat.categories)
277287
# Fit the model
278288
model = mixedlm(formula, data, groups=data['model_id'])
279289
fit = model.fit()
@@ -282,8 +292,9 @@ def lmm(time, volume, treatment, drug_name):
282292
#interaction_term = 'time:exp_type'
283293
# if interaction_term in fit.params:
284294
# time_coef_value = fit.params['time']
285-
#print(fit.params)
295+
print(fit.params)
286296
i_coef_value = fit.params['time:exp_type[T.'+drug_name+']']
297+
#i_coef_value = fit.params['time:exp_type['+drug_name+']']
287298
# else:
288299
# coef_value = None # Handle the case when the interaction term is not present
289300

@@ -341,21 +352,25 @@ def get_drug_stats(df, control='control'):
341352
for name, group in tqdm(groups):
342353
# Each group contains multiple treatments and a control
343354
drugs = set(group.treatment) - set([control])
355+
print('line 355')
344356
print(name[0])
345357
print(drugs)
346358
mod = list(set(group.model_id))[0]
347359

348360
ctl_data = group[group.treatment == control]
349361
ctl_time = np.array(ctl_data.time)
350362
ctl_volume = np.array(ctl_data.volume)
351-
363+
if (ctl_volume.shape[0] < 2):
364+
continue
352365
ctl_auc = AUC(ctl_time, ctl_volume)
353366
for d in drugs:
354-
print(d)
355-
d_data = group[group.treatment == d]
367+
print('is our drug a string or dict?')
368+
print(str(d))
369+
d_data = group[group.treatment == str(d)]
356370
treat_time = np.array(d_data.time)
357371
treat_volume = np.array(d_data.volume)
358-
372+
if (treat_volume.shape[0] < 2):
373+
continue
359374
# Get ABC for group
360375
treat_auc = AUC(treat_time, treat_volume)
361376
treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume)
@@ -368,6 +383,7 @@ def get_drug_stats(df, control='control'):
368383

369384
#llm
370385
comb = pd.concat([ctl_data, d_data])
386+
#print(comb)
371387
lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
372388
lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
373389
if '+' in d:

0 commit comments

Comments
 (0)