Skip to content

Commit 4b824d6

Browse files
added final mutations func
1 parent 73511c9 commit 4b824d6

1 file changed

Lines changed: 61 additions & 1 deletion

File tree

build/novartispdx/02-omics-novartispdx.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import math
55
import argparse
6+
import synapseclient
67

78

89
def get_copy_call(a):
@@ -194,6 +195,65 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat
194195
return(sample_entrez_transcriptomics_df)
195196

196197

198+
199+
def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
    """
    Maps mutations data to improve sample id's. Also does some data formatting.

    Parameters
    ----------
    mutation_data : pd.Dataframe OR string
        Pandas dataframe object with mutations data OR path to csv with mutations data.
        Must contain the columns Sample, Gene, Entrez, Category and Details.

    improve_id_data : pd.Dataframe OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data.
        Must contain the columns common_name and improve_sample_id. This frame is not modified.

    entrez_data : pd.Dataframe OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data.
        It is loaded for parity with the other map_* functions but is not otherwise used
        here: rows are dropped based on a missing 'Entrez' value in mutation_data only.

    Returns
    -------
    mutations_final : pd.DataFrame
        A DataFrame containing the mapped mutations data with columns: entrez_id, mutation,
        variant_classification, improve_sample_id, source, study (plus any extra columns
        that were present in mutation_data).
    """
    # read in data
    if not isinstance(mutation_data, pd.DataFrame):
        mutation_data = pd.read_csv(mutation_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # include only rows that are mutations (data had both cn and mutations).
    # .copy() so the column assignments below act on an independent frame
    # instead of a view of the caller's data.
    mutation_categories = ["MutNovel", "MutKnownFunctional", "MutLikelyFunctional"]
    mutations_only_df = mutation_data[mutation_data['Category'].isin(mutation_categories)].copy()

    # turn details column into mutation column (text before the first comma).
    # .str[0] (rather than expand=True + iloc) also works when no rows survive the filter.
    mutations_only_df['mutation'] = mutations_only_df['Details'].str.split(",").str[0]

    # create variant classifications with information that we have.
    # Later rules override earlier ones, so a mutation that contains "-" but
    # also ends like a substitution (e.g. "A12B") is reported as missense.
    # na=False keeps rows with a missing mutation as "Undetermined" instead of
    # producing a NaN mask that .loc cannot index with.
    mutations_only_df['variant_classification'] = "Undetermined"
    has_dash = mutations_only_df['mutation'].str.contains("-", regex=False, na=False)
    mutations_only_df.loc[has_dash, 'variant_classification'] = "Frame_Shift_Del"
    is_substitution = mutations_only_df['mutation'].str.contains(r'[A-Za-z]\d+[A-Za-z]$', regex=True, na=False)
    mutations_only_df.loc[is_substitution, 'variant_classification'] = "Missense_Mutation"

    # missing entrez id's are not in genes.csv, so get rid of those rows
    mutations_only_df = mutations_only_df[mutations_only_df['Entrez'].notna()]

    # merge improve sample names; the merge key is built on a separate lookup
    # frame so the caller's improve_id_data does not gain a 'to_merge' column.
    sample_lookup = improve_id_data[['common_name', 'improve_sample_id']].assign(
        to_merge=improve_id_data['common_name'].str.replace("NIBR", "", regex=False)
    )[['to_merge', 'improve_sample_id']]
    mutations_final = pd.merge(mutations_only_df, sample_lookup, how='inner', left_on='Sample', right_on='to_merge')

    # clean up column names and data types
    mutations_final = mutations_final.rename(columns={'Entrez': 'entrez_id'})
    mutations_final = mutations_final.drop(columns=['Sample', 'Gene', 'Category', 'Details', 'to_merge'])
    mutations_final['source'] = "CPDM"
    mutations_final['study'] = "novartispdx"
    mutations_final = mutations_final.astype({'entrez_id': 'int'})

    return mutations_final
256+
197257
if __name__ == "__main__":
198258
print('in main')
199259
parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
@@ -243,7 +303,7 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat
243303
exit()
244304
else:
245305
print("Starting mutations data.")
246-
mutation_df = map_mutations(mutation_data = "/tmp/mutation_data.csv", improve_id_data = "/tmp/novartispdx_samples.csv", entrez_data = "/tmp/genes.csv")
306+
mutation_df = map_mutations_novPDX(mutation_data = "/tmp/mutation_data.csv", improve_id_data = "/tmp/novartispdx_samples.csv", entrez_data = "/tmp/genes.csv")
247307
mutation_df.to_csv("/tmp/crcpdo_mutations.csv", index=False)
248308

249309
if args.copy_number:

0 commit comments

Comments
 (0)