Skip to content

Commit 7b2b074

Browse files
changes
1 parent 41f9015 commit 7b2b074

1 file changed

Lines changed: 5 additions & 3 deletions

File tree

build/liverpdo/02-omics-liverpdo.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ def map_mutations(mutation_data, improve_id_data, entrez_data):
143143
sample_mutation_data = pd.merge(mutation_data, improve_id_data[['other_id','improve_sample_id']], how='inner', left_on="Tumor_Sample_Barcode", right_on="other_id")
144144

145145
# the data's variant classification matches scheme well, except "Non-coding_Transcript". let's change those to RNA
146-
sample_entrez_mutation_data = pd.merge(sample_mutation_data, entrez_data[['entrez_id','other_id']], how='left', left_on="Hugo_Symbol", right_on="other_id") # merge with our entrez database to see if we have additional matches
146+
sample_entrez_mutation_data = pd.merge(sample_mutation_data, entrez_data[['entrez_id','other_id']], how='inner', left_on="Hugo_Symbol", right_on="other_id") # merge with our entrez database to see if we have additional matches
147147

148148
# clean up column names and data types
149149
columns_to_drop = set(sample_entrez_mutation_data.columns) - set(['entrez_id','mutation','Variant_Classification','improve_sample_id'])
@@ -196,6 +196,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
196196
entrez_data = pd.read_csv(entrez_data)
197197

198198
# get data ready
199+
copy_number_data = copy_number_data.iloc[:,1:]
199200
copy_number_data.columns = copy_number_data.iloc[0]
200201
copy_number_data = copy_number_data.drop([0], axis=0)
201202
copynum_df = copy_number_data.drop(columns=['Hugo_Symbol','Cytoband'])
@@ -206,6 +207,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
206207

207208
# do copy_number calculation from score and get copy call column
208209
long_cn_df = long_cn_df.rename(columns={0:'other_id'})
210+
long_cn_df = long_cn_df.astype({'value':'float'})
209211
long_cn_df['copy_number'] = pow(2,long_cn_df['value'])*2
210212
long_cn_df['copy_call'] = [get_copy_call(a) for a in long_cn_df['copy_number']]
211213

@@ -259,11 +261,11 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
259261

260262

261263
# map gene names to entrez id's
262-
mapped_transcriptomics_df = pd.merge(long_transcriptomics_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'left', left_on= "stable_id", right_on= "other_id")
264+
mapped_transcriptomics_df = pd.merge(long_transcriptomics_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "stable_id", right_on= "other_id")
263265
mapped_transcriptomics_df = mapped_transcriptomics_df.dropna(subset=['entrez_id'])
264266

265267
# map sample identifiers (the "variable" column) to improve_sample_id
266-
mapped_transcriptomics_df = pd.merge(mapped_transcriptomics_df, improve_id_data[['other_id','improve_sample_id']].drop_duplicates(), how = 'left', left_on= "variable", right_on= "other_id")
268+
mapped_transcriptomics_df = pd.merge(mapped_transcriptomics_df, improve_id_data[['other_id','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "variable", right_on= "other_id")
267269

268270
# clean up column names and data types
269271
mapped_transcriptomics_df = mapped_transcriptomics_df.drop(columns=['stable_id','variable','other_id_x','other_id_y'])

0 commit comments

Comments
 (0)