Skip to content

Commit 729db17

Browse files
committed
Merge remote-tracking branch 'refs/remotes/origin/mpnst_dataset_join' into mpnst_dataset_join
2 parents 91c9079 + e811969 commit 729db17

10 files changed

Lines changed: 473 additions & 4 deletions

build/mpnst/03_get_experiments.R

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,13 +239,15 @@ fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), s
239239

240240
message("Wrote PDX curve data")
241241

242+
242243
system(sprintf(
243-
"/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s",
244-
paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"),
245-
drugfile,
246-
paste0("/tmp/", out_prefix, "_pdx")
244+
"/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s --source 'NF Data Portal' --study 'MPNST PDX'",
245+
paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"),
246+
drugfile,
247+
paste0("/tmp/", out_prefix, "_pdx")
247248
))
248249

250+
249251
message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations")
250252

251253

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import pandas as pd
2+
import synapseclient
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
def get_complete_novartispdx_sample_sheet(synObject):
    """
    Build the combined Novartis PDX sample sheet from the per-file sample
    tables stored under Synapse folder syn66275995.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        A logged-in Synapse client.

    Returns
    -------
    pd.DataFrame
        One row per sample with columns: other_id, common_name,
        other_id_source, other_names, cancer_type, species, model_type.
    """
    files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file']))

    synIDs = [item['id'] for item in files]
    # leave off synIDs for drug info
    synIDs.remove('syn66276102')
    synIDs.remove('syn66276098')
    synIDs.remove("syn66477971")

    # iterate through IDs, download each file, and concatenate into one frame
    allsamplesheet = pd.DataFrame()
    for id in synIDs:
        curr = synObject.get(id)
        currdf = pd.read_csv(curr.path)
        allsamplesheet = pd.concat([allsamplesheet, currdf], ignore_index=True)

    # rename columns and reformat cancer type from CANCER_HISTOLOGY column
    allsamplesheet['other_id'] = allsamplesheet['Sample ID']
    allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID']
    # BUG FIX: raw string for the regex — "\s" inside a plain string literal is
    # an invalid escape sequence (SyntaxWarning on Python 3.12+). The pattern
    # drops the first whitespace-terminated token of CANCER_HISTOLOGY and keeps
    # the remainder as the cancer type.
    allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat=r"^[^\s]*\s", expand=True)[1]
    allsamplesheet['species'] = "Homo Sapiens(human)"
    allsamplesheet['model_type'] = 'patient derived xenograft'
    allsamplesheet['other_id_source'] = 'Synapse'
    allsamplesheet['other_names'] = ''

    # .copy() so callers can add columns (e.g. improve_sample_id) without a
    # SettingWithCopyWarning on the sliced view
    finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']].copy()
    return finalsamplesheet
33+
34+
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet")

    parser.add_argument('-t', '--token', type=str, help='Synapse Token')

    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")

    args = parser.parse_args()

    # authenticate with a Synapse Personal Access Token and build the sheet
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    samplesheet = get_complete_novartispdx_sample_sheet(synObject)

    # continue improve_sample_id numbering from the previous release so ids
    # stay unique across builds; start at 1 when no previous file is given
    if (args.prevSamples):
        # NOTE(review): assumes the previous file has an improve_sample_id
        # column of integers — a float-typed column would propagate here
        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
    else:
        prev_max_improve_id = 0

    # assign consecutive ids, one per row of the new sheet
    samplesheet['improve_sample_id'] = range(prev_max_improve_id+1, prev_max_improve_id+samplesheet.shape[0]+1)

    samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False)
57+
58+
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
import pandas as pd
2+
import numpy as np
3+
import os
4+
import math
5+
import argparse
6+
7+
8+
def get_copy_call(a):
    """
    Helper function - determine the discrete copy-call category for a raw
    copy-number value.

    Parameters
    ----------
    a : float or None
        Raw copy-number value.

    Returns
    -------
    str or float
        One of 'deep del', 'het loss', 'diploid', 'gain', 'amp', or NaN when
        the input is None/NaN.
    """

    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    # log2-transform with a small pseudocount to avoid log2(0)
    a_val = math.log2(float(a)+0.000001)

    # thresholds are on the log2 scale
    # NOTE(review): cutoff values look dataset-specific — confirm provenance
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'
    # BUG FIX: removed unreachable trailing line
    # `return pd.Series([get_copy_call(a) for a in arr])` — dead code after the
    # if/elif chain (every path already returns) referencing an undefined `arr`.
32+
33+
34+
def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None):
    """
    Download omics data from Synapse at synapseID syn66364488. Requires a synapse token, which requires you to make a Synapse account
    and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens
    Omics data is an excel file. The excel file is then parsed for the RNAseq, copy number, and mutations data.

    Parameters
    ----------
    synID : string
        SynapseID of dataset to download. Default is synapseID of the sequencing dataset.

    save_path : string
        Local path where the downloaded file will be saved.

    synToken : string
        Synapse Personal Access Token of user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Returns
    -------
    rnaseq_data : pd.DataFrame
        A DataFrame containing RNAseq data.

    copy_number_data : pd.DataFrame
        A DataFrame containing copy number data.

    mutations_data : pd.DataFrame
        A DataFrame containing mutations data.
    """
    # BUG FIX: this module never imports synapseclient at the top level, so the
    # original body raised NameError; import locally where it is needed.
    import synapseclient

    syn = synapseclient.Synapse()
    syn.login(authToken=synToken)

    # Obtain a pointer and download the data
    omics_entity = syn.get(entity=synID, downloadLocation = save_path)

    # Get the path to the local copy of the data file
    sequencing_filepath = omics_entity.path

    # BUG FIX: use ExcelFile as a context manager instead of a bare open()
    # whose handle was never closed (resource leak).
    with pd.ExcelFile(sequencing_filepath) as all_omics_excel:
        mutations_data = pd.read_excel(all_omics_excel, 'pdxe_mut_and_cn2') # table with somatic mutation information
        copy_number_data = pd.read_excel(all_omics_excel, 'copy number') # table with copy number information
        rnaseq_data = pd.read_excel(all_omics_excel, 'RNAseq_fpkm')

    return(rnaseq_data, copy_number_data, mutations_data)
78+
79+
80+
def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data):
    """
    Maps copy number data to improved sample id's and entrez gene data. Also does some data formatting.

    Parameters
    ----------
    copy_number_data : pd.Dataframe OR string
        Pandas dataframe object with copy number data OR path to csv with copy number data

    improve_id_data : pd.Dataframe OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2()

    entrez_data : pd.Dataframe OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes

    Returns
    -------
    sample_entrez_cn_df : pd.DataFrame
        A DataFrame containing the mapped copy number data with columns: entrez_id, copy_number, copy_call, study, source ,improve_sample_id

    """
    # read in data: each argument may be a DataFrame or a CSV path
    if isinstance(copy_number_data, pd.DataFrame) == False:
        copy_number_data = pd.read_csv(copy_number_data)

    if isinstance(improve_id_data, pd.DataFrame) == False:
        improve_id_data = pd.read_csv(improve_id_data)

    if isinstance(entrez_data, pd.DataFrame) == False:
        entrez_data = pd.read_csv(entrez_data)

    # melt to long form: 'Sample' is kept as the id column, every other column
    # becomes a (variable, value) pair
    # NOTE(review): assumes 'Sample' holds gene symbols and the remaining
    # columns are per-sample measurements — confirm against the workbook
    long_cn_df = pd.melt(copy_number_data, id_vars=['Sample'], value_vars=copy_number_data.columns[copy_number_data.columns != 'Sample'])

    # left-merge entrez ids onto the gene symbol in 'Sample'; unmatched genes
    # keep NaN entrez_id here (the astype('int') below would fail on those —
    # NOTE(review): verify all genes resolve, or this raises)
    entrez_cn_df = pd.merge(long_cn_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'left', left_on= "Sample", right_on= "other_id")

    # derive the discrete copy call from the raw copy-number value
    entrez_cn_df['copy_call'] = [get_copy_call(a) for a in entrez_cn_df['value']]

    # map melted sample names ('variable') to improve_sample_id by stripping
    # the "NIBR" prefix from common_name
    # NOTE(review): this mutates the caller's improve_id_data frame by adding
    # a 'to_merge' column — side effect to confirm is acceptable
    improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","")
    sample_entrez_cn_df = pd.merge(entrez_cn_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'left', left_on= "variable", right_on= "to_merge")

    # clean up columns and data types; add provenance columns
    sample_entrez_cn_df = sample_entrez_cn_df.drop(columns=['Sample','variable','other_id','to_merge'])
    sample_entrez_cn_df['source'] = "CPDM"
    sample_entrez_cn_df['study'] = "novartispdx"
    sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value':'copy_number'})
    sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id':'int','improve_sample_id':'int'})
    sample_entrez_cn_df = sample_entrez_cn_df[['entrez_id','copy_number','copy_call','study','source','improve_sample_id']]
    sample_entrez_cn_df = sample_entrez_cn_df.drop_duplicates()


    return(sample_entrez_cn_df)
135+
136+
137+
def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_data):
    """
    Maps transcriptomics data to improved sample id's and entrez gene data. Also does some data formatting.

    Parameters
    ----------
    transcriptomics_data : pd.Dataframe OR string
        Pandas dataframe object with transcriptomics data OR path to csv with transcriptomics data

    improve_id_data : pd.Dataframe OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2()

    entrez_data : pd.Dataframe OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes

    Returns
    -------
    sample_entrez_transcriptomics_df : pd.DataFrame
        A DataFrame containing the mapped transcriptomics data with columns: entrez_id, transcriptomics, improve_sample_id, source, study

    """
    # read in data: each argument may be a DataFrame or a CSV path
    if isinstance(transcriptomics_data, pd.DataFrame) == False:
        transcriptomics_data = pd.read_csv(transcriptomics_data)

    if isinstance(improve_id_data, pd.DataFrame) == False:
        improve_id_data = pd.read_csv(improve_id_data)

    if isinstance(entrez_data, pd.DataFrame) == False:
        entrez_data = pd.read_csv(entrez_data)

    # BUG FIX: the original referenced an undefined name `rnaseq_df` here
    # (NameError on first use); operate on the transcriptomics_data argument.
    rnaseq_df = transcriptomics_data.rename(columns={'Sample':'stable_id'})
    rnaseq_df.to_csv("/tmp/counts_for_tpm_conversion.tsv", sep='\t')

    # run tpmFromCounts.py to convert counts to tpm
    os.system("python3 tpmFromCounts.py --counts /tmp/counts_for_tpm_conversion.tsv --genome_build https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz --gene_col stable_id --exclude_col stable_id --out_file /tmp/transcriptomics_tpm.tsv")

    # read in and melt dataframe so that there is an entrez and sample id per row
    tpm_transciptomics_data = pd.read_csv("/tmp/transcriptomics_tpm.tsv", sep="\t")
    long_rnaseq = pd.melt(tpm_transciptomics_data, id_vars=['stable_id'], value_vars=tpm_transciptomics_data.columns[tpm_transciptomics_data.columns != 'stable_id'])

    # inner-merge entrez id's on the gene symbol; rows with no entrez match are dropped
    entrez_transcriptomics_df = pd.merge(long_rnaseq.drop_duplicates(), entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "stable_id", right_on= "other_id")

    # map melted sample names ('variable') to improve_sample_id by stripping the "NIBR" prefix
    # NOTE(review): this mutates the caller's improve_id_data frame by adding a 'to_merge' column
    improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","")
    sample_entrez_transcriptomics_df = pd.merge(entrez_transcriptomics_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "variable", right_on= "to_merge")

    # clean up columns and data types; add provenance columns
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.drop(columns=['stable_id','variable','other_id','to_merge'])
    sample_entrez_transcriptomics_df['source'] = "CPDM"
    sample_entrez_transcriptomics_df['study'] = "novartispdx"
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.rename(columns={'value':'transcriptomics'})
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.astype({'entrez_id':'int','improve_sample_id':'int'})
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df[['entrez_id','transcriptomics','improve_sample_id','source','study']]

    return(sample_entrez_transcriptomics_df)
195+
196+
197+
if __name__ == "__main__":
    print('in main')
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
    parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token

    # reference tables used by the mapping functions
    genes=pd.read_csv(args.genes)
    samples = pd.read_csv(args.samples)

    # BUG FIX: the original call was a syntax error with undefined names
    # (`download_parse_omics_novPDX(syn id,savestring, PAT)`). Download the
    # omics workbook (syn66364488, per the download function's docstring) to
    # /tmp and unpack in the function's documented return order.
    rnaseq_data, copy_number_data, mutations_data = download_parse_omics_novPDX('syn66364488', '/tmp', PAT)
    # NOTE(review): the -c/-m/-e flags are parsed but not yet wired to the
    # map_* functions — confirm intended outputs before adding that step
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
# for testing locally
7+
from utils.pubchem_retrieval import update_dataframe_and_write_tsv
8+
# for building in docker
9+
#from pubchem_retrieval import update_dataframe_and_write_tsv
10+
11+
12+
def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath):
    """
    Build or refresh the Novartis PDX drug file.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        A logged-in Synapse client.
    prevDrugFilepath : str or None
        Comma-separated path(s) to previous drug TSV file(s); None or "" means
        no previous file and all drug names are queried from PubChem.
    outputPath : str
        Path the resulting drug TSV is written to.
    """
    file = synObject.get('syn66276102')
    # read raw drug data from synapse
    rawDrugData = pd.read_csv(file.path)
    # split on + operator - there are 2- and one 3- way drug combos in this dataset
    sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True)

    # taking the drug names from the first and second column from the split - there is only one
    # drug name in the 3rd column (one 3-way combo) that is replicated in other treatments as well;
    # also strip quote characters and trailing dose info after "-"
    alldrugnames = pd.Series(pd.concat([sepDrugNames[0], sepDrugNames[1]]).dropna()).str.split('"', expand=True)[0].str.split("-", expand=True)[0]
    finalDrugNames = pd.Series(alldrugnames.unique()).str.strip().unique()
    # drop the no-treatment control
    newdrugnames = finalDrugNames[finalDrugNames != 'untreated']
    print(newdrugnames)

    # use helper functions in pubchem_retrieval.py
    # BUG FIX: the original compared `prevDrugFilepath is not ""` (identity
    # comparison with a literal — a SyntaxWarning and not a reliable check) and
    # then unconditionally indexed `alldrugs`, which is a plain list when no
    # previous file is supplied (AttributeError). Reuse of previous ids now
    # happens only inside the previous-file branch.
    alldrugs = pd.DataFrame()
    newdrugs = pd.DataFrame()
    if prevDrugFilepath:
        prevdrugs = [pd.read_csv(t,sep='\t') for t in prevDrugFilepath.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()

        # rows of the previous file whose chem_name matches a new drug name,
        # expanded to every row sharing those improve_drug_ids
        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

        ##write drugs
        newdrugs.to_csv(outputPath, sep='\t', index=False)

    if len(alldrugs)==0 or len(newdrugnames)>len(set(newdrugs.improve_drug_id)): #we have more names we didn't match
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames,outputPath)
51+
52+
53+
if __name__ == "__main__":

    # command-line interface: previous drug file (optional), output path, token
    cli = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Novartis PDX data")
    cli.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default = None)
    cli.add_argument('-o', '--outputPath', help='Output path for updated novartispdx drug file', default = "/tmp/novartispdx_drugs.tsv")
    cli.add_argument('-t', '--token', help='Synapse token')
    parsed = cli.parse_args()

    # authenticate with the Synapse Personal Access Token
    print("Logging into Synapse")
    PAT = parsed.token
    print("after PAT assignment")
    synObject = synapseclient.login(authToken=PAT)
    print('after creating synObject')

    # normalize an empty/missing previous-file argument to None
    previousDrugs = parsed.prevDrugFilePath if parsed.prevDrugFilePath else None

    create_novartis_pdx_drugs_file(synObject, previousDrugs, parsed.outputPath)

0 commit comments

Comments
 (0)