@@ -74,11 +74,6 @@ def main():
7474 type = int ,
7575 default = 10
7676 )
77- p_process_datasets .add_argument (
78- '-g' , '--gene_table' , dest = 'GENE_TABLE' ,
79- type = str ,
80- required = True
81- )
8277
8378 p_all = command_parsers .add_parser (
8479 "all" ,
@@ -208,18 +203,21 @@ def process_datasets(args):
208203 # expression / transcriptome is not in HGNC. Those currently result
209204 # in NaNs for the gene symbol
210205
211- data_gene_names = pd .read_table (
212- filepath_or_buffer = args .GENE_TABLE ,
213- )
206+ data_gene_names = list (data_sets .values ())[0 ].genes
207+ data_gene_names = (
208+ data_gene_names [data_gene_names ['other_id_source' ] == 'ensembl_gene' ]
209+ .drop_duplicates (
210+ subset = ['entrez_id' , 'gene_symbol' ],
211+ keep = 'first' )
212+ )
214213 data_gene_names .rename (
215- columns = {
216- 'NCBI Gene ID' : 'entrez_id' ,
217- 'Ensembl gene ID' : 'ensemble_gene_id' ,
218- 'Approved symbol' : 'gene_symbol'
219- },
220- inplace = True ,
221- )
222- data_gene_names .dropna (axis = 0 , subset = 'entrez_id' , inplace = True )
214+ columns = {'other_id' : 'ensembl_gene_id' },
215+ inplace = True
216+ )
217+ data_gene_names .drop (
218+ columns = ['other_id_source' ], inplace = True
219+ )
220+
223221 data_gene_names ['entrez_id' ] = data_gene_names ['entrez_id' ].astype (int )
224222
225223 #-------------------------------------------------------------------
@@ -240,7 +238,7 @@ def process_datasets(args):
240238 merged_transcriptomics ,
241239 data_gene_names [[
242240 'entrez_id' ,
243- 'ensemble_gene_id ' ,
241+ 'ensembl_gene_id ' ,
244242 'gene_symbol'
245243 ]],
246244 how = 'left' ,
@@ -252,8 +250,8 @@ def process_datasets(args):
252250 # respectively
253251 merged_transcriptomics .insert (
254252 1 ,
255- 'ensemble_gene_id ' ,
256- merged_transcriptomics .pop ('ensemble_gene_id ' )
253+ 'ensembl_gene_id ' ,
254+ merged_transcriptomics .pop ('ensembl_gene_id ' )
257255 )
258256 merged_transcriptomics .insert (
259257 1 ,
@@ -299,7 +297,7 @@ def process_datasets(args):
299297 merged_copy_number ,
300298 data_gene_names [[
301299 'entrez_id' ,
302- 'ensemble_gene_id ' ,
300+ 'ensembl_gene_id ' ,
303301 'gene_symbol'
304302 ]],
305303 how = 'left' ,
@@ -308,8 +306,8 @@ def process_datasets(args):
308306
309307 merged_copy_number .insert (
310308 1 ,
311- 'ensemble_gene_id ' ,
312- merged_copy_number .pop ('ensemble_gene_id ' )
309+ 'ensembl_gene_id ' ,
310+ merged_copy_number .pop ('ensembl_gene_id ' )
313311 )
314312 merged_copy_number .insert (
315313 1 ,
@@ -336,7 +334,7 @@ def process_datasets(args):
336334 discretized_copy_number ,
337335 data_gene_names [[
338336 'entrez_id' ,
339- 'ensemble_gene_id ' ,
337+ 'ensembl_gene_id ' ,
340338 'gene_symbol'
341339 ]],
342340 how = 'left' ,
@@ -345,8 +343,8 @@ def process_datasets(args):
345343
346344 discretized_copy_number .insert (
347345 1 ,
348- 'ensemble_gene_id ' ,
349- discretized_copy_number .pop ('ensemble_gene_id ' )
346+ 'ensembl_gene_id ' ,
347+ discretized_copy_number .pop ('ensembl_gene_id ' )
350348 )
351349 discretized_copy_number .insert (
352350 1 ,
@@ -421,8 +419,8 @@ def split_data_sets(
421419
422420 splits = {}
423421 for i in range (0 , args .NUM_SPLITS ):
424- logger .debug (
425- f"split #{ i } of { args .NUM_SPLITS } for { data_set } ..."
422+ logger .info (
423+ f"split #{ i + 1 } of { args .NUM_SPLITS } for { data_set } ..."
426424 )
427425 splits [i ] = data_sets [data_set ].train_test_validate (
428426 split_type = split_type ,
0 commit comments