Skip to content

Commit 9c1b85d

Browse files
more corrections to omics py
1 parent 9622410 commit 9c1b85d

1 file changed

Lines changed: 25 additions & 23 deletions

File tree

build/liverpdo/02-omics-liverpdo.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -128,34 +128,34 @@ def map_mutations(mutation_data, improve_id_data, entrez_data):
128128
entrez_data = pd.read_csv(entrez_data)
129129

130130
# create mutation names using chr, position, etc
131-
mutations_df.columns = mutations_df.iloc[0]
132-
mutations_df = mutations_df.drop([0], axis=0)
133-
mutations_df['mutation'] = "g."+ mutations_df['Chromosome'] + ":" + np.where(mutations_df['Start_Position'] == mutations_df['End_Position'], mutations_df['Start_Position'].astype(str), mutations_df['Start_Position'].astype(str) + "_" + mutations_df['End_Position'].astype(str))
134-
for index, row in mutations_df.iterrows():
135-
if mutations_df.at[index,'Variant_Classification'].__contains__("Ins"):
136-
mutations_df.at[index,'mutation'] = mutations_df.at[index,'mutation'] + "ins" + mutations_df.at[index,'Tumor_Seq_Allele2']
137-
elif mutations_df.at[index,'Variant_Classification'].__contains__("Del"):
138-
mutations_df.at[index,'mutation'] = mutations_df.at[index,'mutation'] + "del" + mutations_df.at[index,'Tumor_Seq_Allele1']
131+
mutation_data.columns = mutation_data.iloc[0]
132+
mutation_data = mutation_data.drop([0], axis=0)
133+
mutation_data['mutation'] = "g."+ mutation_data['Chromosome'] + ":" + np.where(mutation_data['Start_Position'] == mutation_data['End_Position'], mutation_data['Start_Position'].astype(str), mutation_data['Start_Position'].astype(str) + "_" + mutation_data['End_Position'].astype(str))
134+
for index, row in mutation_data.iterrows():
135+
if mutation_data.at[index,'Variant_Classification'].__contains__("Ins"):
136+
mutation_data.at[index,'mutation'] = mutation_data.at[index,'mutation'] + "ins" + mutation_data.at[index,'Tumor_Seq_Allele2']
137+
elif mutation_data.at[index,'Variant_Classification'].__contains__("Del"):
138+
mutation_data.at[index,'mutation'] = mutation_data.at[index,'mutation'] + "del" + mutation_data.at[index,'Tumor_Seq_Allele1']
139139
else:
140-
mutations_df.at[index,'mutation'] = mutations_df.at[index,'mutation'] + mutations_df.at[index,'Tumor_Seq_Allele1'] + ">" + mutations_df.at[index,'Tumor_Seq_Allele2']
140+
mutation_data.at[index,'mutation'] = mutation_data.at[index,'mutation'] + mutation_data.at[index,'Tumor_Seq_Allele1'] + ">" + mutation_data.at[index,'Tumor_Seq_Allele2']
141141

142142
# map samples in the mutation data (Tumor_Sample_Barcode) to their improve_sample_id
143-
sample_mutations_df = pd.merge(mutations_df, samples_df[['other_id','improve_sample_id']], how='inner', left_on="Tumor_Sample_Barcode", right_on="other_id")
143+
sample_mutation_data = pd.merge(mutation_data, improve_id_data[['other_id','improve_sample_id']], how='inner', left_on="Tumor_Sample_Barcode", right_on="other_id")
144144

145145
# the data's variant classification matches scheme well, except "Non-coding_Transcript". let's change those to RNA
146-
sample_entrez_mutations_df = pd.merge(sample_mutations_df, entrez_df[['entrez_id','other_id']], how='left', left_on="Hugo_Symbol", right_on="other_id") # merge with our entrez database to see if we have additional matches
146+
sample_entrez_mutation_data = pd.merge(sample_mutation_data, entrez_data[['entrez_id','other_id']], how='left', left_on="Hugo_Symbol", right_on="other_id") # merge with our entrez database to see if we have additional matches
147147

148148
# clean up column names and data types
149-
columns_to_drop = set(sample_entrez_mutations_df.columns) - set(['entrez_id','mutation','Variant_Classification','improve_sample_id'])
150-
mapped_mutations_df = sample_entrez_mutations_df.drop(columns=columns_to_drop)
151-
mapped_mutations_df = mapped_mutations_df.rename(columns={'Variant_Classification':'variant_classification'})
152-
mapped_mutations_df['source'] = "Synapse"
153-
mapped_mutations_df['study'] = "liverpdo"
154-
mapped_mutations_df = mapped_mutations_df.astype({'entrez_id':'int','improve_sample_id':'int'})
155-
mapped_mutations_df = mapped_mutations_df.drop_duplicates()
156-
mapped_mutations_df = mapped_mutations_df[['entrez_id','mutation','variant_classification','improve_sample_id','study','source']]
149+
columns_to_drop = set(sample_entrez_mutation_data.columns) - set(['entrez_id','mutation','Variant_Classification','improve_sample_id'])
150+
mapped_mutation_data = sample_entrez_mutation_data.drop(columns=columns_to_drop)
151+
mapped_mutation_data = mapped_mutation_data.rename(columns={'Variant_Classification':'variant_classification'})
152+
mapped_mutation_data['source'] = "Synapse"
153+
mapped_mutation_data['study'] = "liverpdo"
154+
mapped_mutation_data = mapped_mutation_data.astype({'entrez_id':'int','improve_sample_id':'int'})
155+
mapped_mutation_data = mapped_mutation_data.drop_duplicates()
156+
mapped_mutation_data = mapped_mutation_data[['entrez_id','mutation','variant_classification','improve_sample_id','study','source']]
157157

158-
return(mapped_mutations_df)
158+
return(mapped_mutation_data)
159159

160160

161161
def get_copy_call(a):
@@ -235,8 +235,9 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
235235
entrez_data = pd.read_csv(entrez_data)
236236

237237
# first, convert genes, which are given as Ensembl IDs, to gene names
238+
transciptomics_data = transciptomics_data.rename(columns={'Unnamed: 0': 'stable_id'})
238239
mg = mygene.MyGeneInfo()
239-
ensembl_ids = transciptomics_data.iloc[:,0].values
240+
ensembl_ids = transciptomics_data['stable_id'].values
240241
gene_info_list = mg.getgenes(ensembl_ids, fields='symbol')
241242
gene_df = pd.DataFrame.from_dict(gene_info_list)
242243
for_tpm = pd.merge(transciptomics_data, gene_df[['query','symbol']], how = 'inner', left_on= "stable_id", right_on= "query")
@@ -266,6 +267,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
266267
mapped_transcriptomics_df = mapped_transcriptomics_df.drop(columns=['stable_id','variable','other_id_x','other_id_y'])
267268
mapped_transcriptomics_df['source'] = "Synapse"
268269
mapped_transcriptomics_df['study'] = "liverpdo"
270+
mapped_transcriptomics_df = mapped_transcriptomics_df.dropna()
269271
mapped_transcriptomics_df = mapped_transcriptomics_df.astype({'entrez_id':'int','improve_sample_id':'int'})
270272
mapped_transcriptomics_df = mapped_transcriptomics_df[['entrez_id','transcriptomics','improve_sample_id','source','study']]
271273

@@ -297,10 +299,10 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
297299
# Download and parse rnaseq data
298300
rnaseq_df = download_parse_rna_data(synID="syn68327513", synToken = args.token, save_path="/tmp/")
299301
# Download rest of omics data
300-
mutations_df, copynum_df, proteomics_df= download_parse_omics_data(synID="syn66401303", synToken = args.token, save_path="/tmp/")
302+
mutation_data, copynum_df, proteomics_df= download_parse_omics_data(synID="syn66401303", synToken = args.token, save_path="/tmp/")
301303
# Save raw RNA-seq, mutation, copy number, and proteomics data in CSV format
302304
rnaseq_df.to_csv("/tmp/raw_rnaseq_data.csv")
303-
mutations_df.to_csv("/tmp/raw_mutation_data.csv")
305+
mutation_data.to_csv("/tmp/raw_mutation_data.csv")
304306
copynum_df.to_csv("/tmp/raw_copynum_data.csv")
305307
proteomics_df.to_csv("/tmp/raw_proteomics_data.csv")
306308

0 commit comments

Comments
 (0)