Skip to content

Commit 96c0140

Browse files
committed
fix to use updated genes.tsv in coderdata
1 parent 0c724de commit 96c0140

1 file changed

Lines changed: 25 additions & 27 deletions

File tree

scripts/prepare_data_for_improve.py

Lines changed: 25 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -74,11 +74,6 @@ def main():
7474
type=int,
7575
default=10
7676
)
77-
p_process_datasets.add_argument(
78-
'-g', '--gene_table', dest='GENE_TABLE',
79-
type=str,
80-
required=True
81-
)
8277

8378
p_all = command_parsers.add_parser(
8479
"all",
@@ -208,18 +203,21 @@ def process_datasets(args):
208203
# expression / transcriptome is not in HGNC. Those currently result
209204
# in NaNs for the gene symbol
210205

211-
data_gene_names = pd.read_table(
212-
filepath_or_buffer=args.GENE_TABLE,
213-
)
206+
data_gene_names = list(data_sets.values())[0].genes
207+
data_gene_names = (
208+
data_gene_names[data_gene_names['other_id_source'] == 'ensembl_gene']
209+
.drop_duplicates(
210+
subset=['entrez_id', 'gene_symbol'],
211+
keep='first')
212+
)
214213
data_gene_names.rename(
215-
columns={
216-
'NCBI Gene ID': 'entrez_id',
217-
'Ensembl gene ID': 'ensemble_gene_id',
218-
'Approved symbol': 'gene_symbol'
219-
},
220-
inplace=True,
221-
)
222-
data_gene_names.dropna(axis=0, subset='entrez_id', inplace=True)
214+
columns={'other_id' : 'ensembl_gene_id'},
215+
inplace=True
216+
)
217+
data_gene_names.drop(
218+
columns=['other_id_source'], inplace=True
219+
)
220+
223221
data_gene_names['entrez_id'] = data_gene_names['entrez_id'].astype(int)
224222

225223
#-------------------------------------------------------------------
@@ -240,7 +238,7 @@ def process_datasets(args):
240238
merged_transcriptomics,
241239
data_gene_names[[
242240
'entrez_id',
243-
'ensemble_gene_id',
241+
'ensembl_gene_id',
244242
'gene_symbol'
245243
]],
246244
how='left',
@@ -252,8 +250,8 @@ def process_datasets(args):
252250
# respectively
253251
merged_transcriptomics.insert(
254252
1,
255-
'ensemble_gene_id',
256-
merged_transcriptomics.pop('ensemble_gene_id')
253+
'ensembl_gene_id',
254+
merged_transcriptomics.pop('ensembl_gene_id')
257255
)
258256
merged_transcriptomics.insert(
259257
1,
@@ -299,7 +297,7 @@ def process_datasets(args):
299297
merged_copy_number,
300298
data_gene_names[[
301299
'entrez_id',
302-
'ensemble_gene_id',
300+
'ensembl_gene_id',
303301
'gene_symbol'
304302
]],
305303
how='left',
@@ -308,8 +306,8 @@ def process_datasets(args):
308306

309307
merged_copy_number.insert(
310308
1,
311-
'ensemble_gene_id',
312-
merged_copy_number.pop('ensemble_gene_id')
309+
'ensembl_gene_id',
310+
merged_copy_number.pop('ensembl_gene_id')
313311
)
314312
merged_copy_number.insert(
315313
1,
@@ -336,7 +334,7 @@ def process_datasets(args):
336334
discretized_copy_number,
337335
data_gene_names[[
338336
'entrez_id',
339-
'ensemble_gene_id',
337+
'ensembl_gene_id',
340338
'gene_symbol'
341339
]],
342340
how='left',
@@ -345,8 +343,8 @@ def process_datasets(args):
345343

346344
discretized_copy_number.insert(
347345
1,
348-
'ensemble_gene_id',
349-
discretized_copy_number.pop('ensemble_gene_id')
346+
'ensembl_gene_id',
347+
discretized_copy_number.pop('ensembl_gene_id')
350348
)
351349
discretized_copy_number.insert(
352350
1,
@@ -421,8 +419,8 @@ def split_data_sets(
421419

422420
splits = {}
423421
for i in range(0, args.NUM_SPLITS):
424-
logger.debug(
425-
f"split #{i} of {args.NUM_SPLITS} for {data_set} ..."
422+
logger.info(
423+
f"split #{i+1} of {args.NUM_SPLITS} for {data_set} ..."
426424
)
427425
splits[i] = data_sets[data_set].train_test_validate(
428426
split_type=split_type,

0 commit comments

Comments
 (0)