Skip to content

Commit 9b664bf

Browse files
added copy number function
1 parent 8cd287c commit 9b664bf

1 file changed

Lines changed: 56 additions & 1 deletion

File tree

build/novartispdx/02-omics-novartispdx.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,59 @@ def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str =
4747
rnaseq_data = pd.read_excel(all_omics_excel, 'RNAseq_fpkm')
4848

4949

50-
return(rnaseq_data, copy_number_data, mutations_data)
50+
return(rnaseq_data, copy_number_data, mutations_data)
51+
52+
53+
def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data):
    """
    Map copy number data to improve sample ids and entrez gene ids, and format the result.

    The wide copy number table (one row per gene in the 'Sample' column, one
    column per sample) is melted to long format, joined to entrez ids on the
    gene name, annotated with a copy call, and joined to improve sample ids on
    the sample name (the 'common_name' with the "NIBR" text removed).

    Rows whose gene has no entrez id, or whose sample has no improve sample
    id, are dropped: both id columns are returned as integers and cannot
    represent missing values.

    Parameters
    ----------
    copy_number_data : pd.DataFrame OR string
        Pandas dataframe object with copy number data OR path to csv with copy number data

    improve_id_data : pd.DataFrame OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2()
        This dataframe is not modified by this function.

    entrez_data : pd.DataFrame OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes

    Returns
    -------
    sample_entrez_cn_df : pd.DataFrame
        A DataFrame containing the mapped copy number data with columns: entrez_id, copy_number, copy_call, study, source, improve_sample_id

    """
    # read in data (each argument may be a dataframe or a path to a csv)
    if not isinstance(copy_number_data, pd.DataFrame):
        copy_number_data = pd.read_csv(copy_number_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # melt dataframe so that there is gene name and sample name per row
    sample_columns = copy_number_data.columns[copy_number_data.columns != 'Sample']
    long_cn_df = pd.melt(copy_number_data, id_vars=['Sample'], value_vars=sample_columns)

    # get entrez id's from Sample (the 'Sample' column holds the gene identifier)
    gene_lookup = entrez_data[['other_id', 'entrez_id']].drop_duplicates()
    entrez_cn_df = pd.merge(long_cn_df, gene_lookup, how='left', left_on="Sample", right_on="other_id")

    # get copy call from value column (aka copy number);
    # get_copy_call is expected to be defined elsewhere in this module
    entrez_cn_df['copy_call'] = [get_copy_call(a) for a in entrez_cn_df['value']]

    # get improve sample id; build the merge key on a separate frame so the
    # caller's improve_id_data does not gain a 'to_merge' column as a side effect
    sample_lookup = improve_id_data[['common_name', 'improve_sample_id']].copy()
    sample_lookup['to_merge'] = sample_lookup['common_name'].str.replace("NIBR", "", regex=False)
    sample_lookup = sample_lookup[['to_merge', 'improve_sample_id']].drop_duplicates()
    sample_entrez_cn_df = pd.merge(entrez_cn_df.drop_duplicates(), sample_lookup, how='left', left_on="variable", right_on="to_merge")

    # the left merges leave NaN for unmapped genes/samples; drop those rows
    # before the integer cast, which would otherwise raise on missing values
    sample_entrez_cn_df = sample_entrez_cn_df.dropna(subset=['entrez_id', 'improve_sample_id'])

    # clean up columns and data types
    sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value': 'copy_number'})
    sample_entrez_cn_df['source'] = "CPDM"
    sample_entrez_cn_df['study'] = "novartispdx"
    sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id': 'int', 'improve_sample_id': 'int'})
    sample_entrez_cn_df = sample_entrez_cn_df[['entrez_id', 'copy_number', 'copy_call', 'study', 'source', 'improve_sample_id']]

    return sample_entrez_cn_df

0 commit comments

Comments
 (0)