Skip to content

Commit 8470ca0

Browse files
committed
merge
2 parents 3c58dba + bd584cc commit 8470ca0

25 files changed

Lines changed: 1015 additions & 533 deletions

.github/workflows/main.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ on:
44
push:
55
tags:
66
- '*' # Triggers the workflow only on version tags
7+
workflow_dispatch: # Allows manual triggering of the workflow
78

89
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
910
permissions:
@@ -44,4 +45,4 @@ jobs:
4445
steps:
4546
- name: Deploy to GitHub Pages
4647
id: deployment
47-
uses: actions/deploy-pages@v4
48+
uses: actions/deploy-pages@v4

build/README.md

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,10 @@ are added.
1010

1111
## build_all.py script
1212

13-
This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare and pypi.
13+
This script initializes all docker containers, builds all datasets, validates them, and uploads them to Figshare.
1414

1515
It requires the following authorization tokens to be set in the local environment depending on the use case:
1616
`SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Join the [CoderData team](https://www.synapse.org/#!Team:3503472) on Synapse and generate an access token.
17-
`PYPI_TOKEN`: This token is required to upload to PyPI.
1817
`FIGSHARE_TOKEN`: This token is required to upload to Figshare.
1918
`GITHUB_TOKEN`: This token is required to upload to GitHub.
2019

@@ -25,21 +24,20 @@ It requires the following authorization tokens to be set in the local environmen
2524
- `--omics`: Processes and builds the omics data files.
2625
- `--drugs`: Processes and builds the drug data files.
2726
- `--exp`: Processes and builds the experiment data files.
28-
- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate, figshare, or pypi commands.
27+
- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate or figshare commands.
2928
- `--validate`: Validates the generated datasets using the schema check scripts. This is automatically included if data upload occurs.
3029
- `--figshare`: Uploads the datasets to Figshare. FIGSHARE_TOKEN must be set in local environment.
31-
- `--pypi`: Uploads the package to PyPI. PYPI_TOKEN must be set in local environment.
3230
- `--high_mem`: Utilizes high memory mode for concurrent data processing. This has been successfully tested using 32 or more vCPUs.
3331
- `--dataset`: Specifies the datasets to process (default='broad_sanger,hcmi,beataml,mpnst,cptac').
34-
- `--version`: Specifies the version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for figshare and PyPI upload steps. This must be a higher version than previously published versions.
32+
- `--version`: Specifies the version number for the Figshare upload title (e.g., "0.1.29"). This must be a higher version than previously published versions.
3533
- `--github-username`: GitHub username matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
3634
- `--github-email`: GitHub email matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
3735

3836
**Example usage**:
39-
- Build all datasets and upload to Figshare and PyPI and GitHub.
40-
Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `PYPI_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
37+
- Build all datasets and upload to Figshare and GitHub.
38+
Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
4139
```bash
42-
python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
40+
python build/build_all.py --all --high_mem --validate --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
4341
```
4442

4543
- Build only the experiment files.

build/beatAML/GetBeatAML.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -467,10 +467,10 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
467467
how='left')
468468
mapped_df.insert(0, 'improve_sample_id', mapped_df.pop('improve_sample_id'))
469469

470-
print(mapped_df.to_string())
471-
mapped_df['improve_sample_id'] = mapped_df['improve_sample_id'].astype(int)
472-
mapped_df['entrez_id'] = mapped_df['entrez_id'].fillna(0)
473-
mapped_df['entrez_id'] = mapped_df['entrez_id'].astype(int)
470+
# Replace NaNs, round values, and convert to integers for specified columns
471+
columns_to_convert = ['improve_sample_id', 'entrez_id']
472+
mapped_df[columns_to_convert] = mapped_df[columns_to_convert].fillna(0).round().astype('int32')
473+
474474
mapped_df['source'] = 'synapse'
475475
mapped_df['study'] = 'BeatAML'
476476
mapped_df =mapped_df.drop_duplicates()

build/hcmi/02-getHCMIData.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -474,15 +474,15 @@ def get_copy_call(a):
474474

475475
if math.isnan(a):
476476
return float('nan')
477-
478-
a_val = math.log2(float(a)+0.000001) ###this should not be exponent, should be log!!! 2**float(a)
479-
if a_val < 0.0: #0.5210507:
477+
478+
a_val = math.log2(float(a)+0.000001)
479+
if a_val < 0.5210507:
480480
return 'deep del'
481481
elif a_val < 0.7311832:
482482
return 'het loss'
483483
elif a_val < 1.214125:
484484
return 'diploid'
485-
elif a_val < 1.731183:
485+
elif a_val < 1.422233:
486486
return 'gain'
487487
else:
488488
return 'amp'
@@ -697,4 +697,4 @@ def main():
697697

698698
if __name__ == "__main__":
699699
main()
700-
700+

build/utils/build_drug_desc.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def smiles_to_mordred(smiles,nproc=2):
6464
##reformat here
6565
longtab = pd.melt(dd,id_vars='smile',value_vars=values)
6666
longtab = longtab.rename({'variable':'structural_descriptor','value':'descriptor_value'},axis=1)
67+
6768
return longtab
6869

6970
def main():
@@ -82,16 +83,24 @@ def main():
8283
cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan']
8384
# isosmiles = list(set(tab.isoSMILES))
8485
morgs = smiles_to_fingerprint(cansmiles)
85-
# print(morgs)
86+
8687
ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates()
87-
# print(ids)
88+
8889
id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']]
8990

9091
mords = smiles_to_mordred(cansmiles,nproc=ncors)
9192

9293
id_mord = ids.rename({'canSMILES':'smile'},axis=1).merge(mords)[['improve_drug_id','structural_descriptor','descriptor_value']]
9394

94-
full = pd.concat([id_morg,id_mord],axis=0)
95+
full = pd.concat([id_morg,id_mord],axis=0)
96+
97+
# Convert any descriptor values that contain the following strings to NA. This list is believed to cover all cases; add to it if more are found.
98+
strings_to_replace = ["min", "max", "invalid", "multiple", "missing"]
99+
pattern = '|'.join(strings_to_replace)
100+
full['descriptor_value'] = full['descriptor_value'].astype(str)
101+
full.loc[full['descriptor_value'].str.contains(pattern, case=False, na=False), 'descriptor_value'] = "NaN"
102+
103+
95104
full.to_csv(args.outtable,sep='\t',index=False,compression='gzip')
96105

97106
if __name__=='__main__':

build/utils/calc_pdx_metrics.py

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -310,91 +310,90 @@ def main():
310310
singles, combos = get_drug_stats(tab)
311311

312312
##join with drug ids
313-
expsing = singles.rename({'drug':'chem_name','metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','drug_combination_metric','drug_combination_value']]
313+
expsing = singles.rename({'drug':'chem_name','metric':'dose_response_metric','value':'dose_response_value','sample':'improve_sample_id'},axis=1).merge(drugs,on='chem_name',how='left')[['improve_drug_id','improve_sample_id','time_unit','time','dose_response_metric','dose_response_value']]
314314
expsing = expsing.dropna()
315+
316+
# source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value
315317

316318
combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True)
317319
combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna()
318320

319-
expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','drug_combination_metric','drug_combination_value']]
320-
expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','drug_combination_metric','drug_combination_value']]
321+
expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
322+
expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
321323

322324
expcomb[['source']]='Synapse'
323325
expcomb[['study']]='MPNST PDX in vivo'
324326

325327
expsing[['source']]='Synapse'
326328
expsing[['study']]='MPNST PDX in vivo'
327-
expsing.to_csv(args.outprefix+'_experiments.csv',index=False)
328-
expcomb.to_csv(args.outprefix+'_combinations.csv',index=False)
329+
expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t")
330+
expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t")
329331

330332

331333

332-
def get_drug_stats(df,control='control'):
334+
def get_drug_stats(df, control='control'):
333335
##for each experiment, call group
334-
cols = ['experiment','model_id']
336+
cols = ['experiment', 'model_id']
335337
groups = df.groupby(cols)
336338
singleres = []
337339
combores = []
338340

339-
for name,group in tqdm(groups):
340-
#each group contains multiple treatments anda control
341-
drugs = set(group.treatment)-set([control])
341+
for name, group in tqdm(groups):
342+
# Each group contains multiple treatments and a control
343+
drugs = set(group.treatment) - set([control])
342344
print(name[0])
343345
print(drugs)
344346
mod = list(set(group.model_id))[0]
345-
# print(set(group.model_id))
346-
ctl_data = group[group.treatment==control]
347+
348+
ctl_data = group[group.treatment == control]
347349
ctl_time = np.array(ctl_data.time)
348350
ctl_volume = np.array(ctl_data.volume)
349351

350-
ctl_auc = AUC(ctl_time,ctl_volume)
352+
ctl_auc = AUC(ctl_time, ctl_volume)
351353
for d in drugs:
352354
print(d)
353-
d_data = group[group.treatment==d]
355+
d_data = group[group.treatment == d]
354356
treat_time = np.array(d_data.time)
355357
treat_volume = np.array(d_data.volume)
356358

357-
#get abc for group
358-
treat_auc = AUC(treat_time,treat_volume)
359-
treat_abc = ABC(ctl_time,ctl_volume,treat_time,treat_volume)
360-
#print(f"AUC: {treat_auc}")
361-
#print(f"ABC: {treat_abc}")
362-
treat_abc.update({'sample':mod,'drug':d,'time_unit':'days'})
359+
# Get ABC for group
360+
treat_auc = AUC(treat_time, treat_volume)
361+
treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume)
362+
print(f"treat_time:, {treat_time}")
363+
treat_abc.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
363364
if '+' in d:
364365
combores.append(treat_abc)
365366
else:
366367
singleres.append(treat_abc)
367-
#lmm
368-
comb = pd.concat([ctl_data,d_data])
369-
lmm_res = lmm(comb.time, comb.volume, comb.treatment,d)
370-
lmm_res.update({'sample':mod,'drug':d,'time_unit':'days'})
371-
#print(f"LMM: {lmm_res}")
368+
369+
#lmm
370+
comb = pd.concat([ctl_data, d_data])
371+
lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
372+
lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
372373
if '+' in d:
373374
combores.append(lmm_res)
374375
else:
375376
singleres.append(lmm_res)
376377

377-
#get tgi for group
378-
tg = TGI(ctl_volume,treat_volume,treat_time)
379-
tg.update({'sample':mod,'drug':d,'time_unit':'days'})
380-
#print(tg)
378+
# Get TGI for group
379+
tg = TGI(ctl_volume, treat_volume, treat_time)
380+
tg.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
381381
if '+' in d:
382382
combores.append(tg)
383383
else:
384384
singleres.append(tg)
385385

386-
387-
#get mRECIST for group
388-
mr = mrecist(treat_time,treat_volume)
389-
mr.update({'sample':mod,'drug':d,'time_unit':'days'})
386+
# Get mRECIST for group
387+
mr = mrecist(treat_time, treat_volume)
388+
mr.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
390389
if '+' in d:
391390
combores.append(mr)
392391
else:
393392
singleres.append(mr)
394393

395394
sing = pd.DataFrame.from_records(singleres)
396395
comb = pd.DataFrame.from_records(combores)
397-
return sing,comb
396+
return sing, comb
398397

399398
if __name__=='__main__':
400399
main()

0 commit comments

Comments
 (0)