Skip to content

Commit 7c37952

Browse files
added transcriptomics function
1 parent 9b664bf commit 7c37952

1 file changed

Lines changed: 66 additions & 1 deletion

File tree

build/novartispdx/02-omics-novartispdx.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,5 +101,70 @@ def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data):
101101
sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value':'copy_number'})
102102
sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id':'int','improve_sample_id':'int'})
103103
sample_entrez_cn_df = sample_entrez_cn_df[['entrez_id','copy_number','copy_call','study','source','improve_sample_id']]
104+
sample_entrez_cn_df = sample_entrez_cn_df.drop_duplicates()
105+
106+
107+
return(sample_entrez_cn_df)
108+
109+
110+
def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_data):
    """
    Maps transcriptomics data to improve sample id's and entrez gene data. Also does some data formatting.

    The counts table is written to /tmp, converted to TPM by the external
    script tpmFromCounts.py (looked up relative to the current working
    directory), read back in, reshaped to long format and joined to the
    entrez and improve sample id tables.

    Parameters
    ----------
    transcriptomics_data : pd.Dataframe OR string
        Pandas dataframe object with transcriptomics data OR path to csv with transcriptomics data.
        The gene identifier column is expected to be named 'Sample'; it is renamed to 'stable_id'.

    improve_id_data : pd.Dataframe OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2().
        Must contain the columns 'common_name' and 'improve_sample_id'. The object passed in is not modified.

    entrez_data : pd.Dataframe OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes
        Must contain the columns 'other_id' and 'entrez_id'.

    Returns
    -------
    sample_entrez_transcriptomics_df : pd.DataFrame
        A DataFrame containing the mapped transcriptomics data with columns: entrez_id, transcriptomics, improve_sample_id, source, study

    Raises
    ------
    subprocess.CalledProcessError
        If tpmFromCounts.py exits with a non-zero status.
    """
    # imported locally so this function does not depend on the module-level imports
    import subprocess

    counts_path = "/tmp/counts_for_tpm_conversion.tsv"
    tpm_path = "/tmp/transcriptomics_tpm.tsv"
    genome_build = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz"

    # read in data
    if not isinstance(transcriptomics_data, pd.DataFrame):
        transcriptomics_data = pd.read_csv(transcriptomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # rename the gene column and write the counts out for the TPM conversion script
    # (rename returns a new frame, so the caller's dataframe is left untouched;
    # the row index is written as well, which is the pandas default)
    rnaseq_df = transcriptomics_data.rename(columns={'Sample': 'stable_id'})
    rnaseq_df.to_csv(counts_path, sep='\t')

    # run tpmFromCounts.py to convert counts to tpm
    # check=True stops here if the conversion fails, instead of silently reading
    # a missing or stale output file below
    subprocess.run(
        [
            "python3", "tpmFromCounts.py",
            "--counts", counts_path,
            "--genome_build", genome_build,
            "--gene_col", "stable_id",
            "--exclude_col", "stable_id",
            "--out_file", tpm_path,
        ],
        check=True,
    )

    # read in and melt dataframe so that there is a gene id and sample name per row
    # NOTE(review): assumes the script output keeps a 'stable_id' column plus one column per sample - confirm against tpmFromCounts.py
    tpm_transcriptomics_data = pd.read_csv(tpm_path, sep="\t")
    long_rnaseq = pd.melt(tpm_transcriptomics_data, id_vars=['stable_id'])

    # merge entrez id's
    entrez_transcriptomics_df = pd.merge(
        long_rnaseq.drop_duplicates(),
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="stable_id",
        right_on="other_id",
    )

    # get improve sample id
    # the merge key is common_name with "NIBR" removed; it is built on a copy so
    # that improve_id_data does not gain a 'to_merge' column as a side effect
    sample_lookup = improve_id_data[['common_name', 'improve_sample_id']].copy()
    sample_lookup['to_merge'] = sample_lookup['common_name'].str.replace("NIBR", "", regex=False)
    sample_entrez_transcriptomics_df = pd.merge(
        entrez_transcriptomics_df.drop_duplicates(),
        sample_lookup[['to_merge', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="variable",
        right_on="to_merge",
    )

    # clean up columns and data types
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.drop(columns=['stable_id', 'variable', 'other_id', 'to_merge'])
    sample_entrez_transcriptomics_df['source'] = "CPDM"
    sample_entrez_transcriptomics_df['study'] = "novartispdx"
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.rename(columns={'value': 'transcriptomics'})
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.astype({'entrez_id': 'int', 'improve_sample_id': 'int'})
    sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df[['entrez_id', 'transcriptomics', 'improve_sample_id', 'source', 'study']]

    return sample_entrez_transcriptomics_df
168+
169+
170+

0 commit comments

Comments
 (0)