Skip to content

Commit 6ab6a59

Browse files
committed
added gene expression master table generation
1 parent 5f0a3fd commit 6ab6a59

1 file changed

Lines changed: 144 additions & 1 deletion

File tree

scripts/prepare_data_for_improve.py

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11

22
import argparse
3+
import functools as ft
34
from os import PathLike
45
from pathlib import Path
56
from pathlib import PurePath
@@ -63,6 +64,11 @@ def main():
6364
type=int,
6465
default=10
6566
)
67+
p_process_datasets.add_argument(
68+
'-g', '--gene_table', dest='GENE_TABLE',
69+
type=str,
70+
required=True
71+
)
6672

6773
p_all = command_parsers.add_parser(
6874
"all",
@@ -189,6 +195,8 @@ def process_datasets(args):
189195

190196
# generation of the actual splits
191197

198+
# TODO: potentially clean this up with a function that is
199+
192200
splits = {}
193201
for i in range(0, args.NUM_SPLITS):
194202
splits[i] = data_sets[data_set].train_test_validate(
@@ -283,12 +291,147 @@ def process_datasets(args):
283291
index=False,
284292
header=False
285293
)
286-
294+
295+
#-------------------------------------------------------------------
296+
# getting common / reference gene symbols
297+
#-------------------------------------------------------------------
298+
299+
# TODO: potentially add mapping to the genes table in coderdata
300+
# currently we do not make use of the 'genes' DataFrame in a Dataset
301+
# object. The gene symbol information comes directly from HGNC.
302+
# There are instances where the entrez_id that is recorded in the
303+
# expression / transcriptome is not in HGNC. Those currently result
304+
# in NaNs for the gene symbol
305+
306+
data_gene_names = pd.read_table(
307+
filepath_or_buffer=args.GENE_TABLE,
308+
)
309+
data_gene_names.rename(
310+
columns={
311+
'NCBI Gene ID': 'entrez_id',
312+
'Ensembl gene ID': 'ensemble_gene_id',
313+
'Approved symbol': 'gene_symbol'
314+
},
315+
inplace=True,
316+
)
317+
data_gene_names.dropna(axis=0, subset='entrez_id', inplace=True)
318+
data_gene_names['entrez_id'] = data_gene_names['entrez_id'].astype(int)
319+
320+
#-------------------------------------------------------------------
321+
# create gene expression master table
322+
#-------------------------------------------------------------------
323+
324+
merged_transcriptomics = merge_master_tables(
325+
args=args,
326+
data_type='transcriptomics'
327+
)
328+
329+
# TODO: Potentially cast 'NaN's to 0
330+
331+
# merging ensemble gene id & gene symbol into the transcriptomics
332+
# data
333+
merged_transcriptomics = pd.merge(
334+
merged_transcriptomics,
335+
data_gene_names[[
336+
'entrez_id',
337+
'ensemble_gene_id',
338+
'gene_symbol'
339+
]],
340+
how='left',
341+
on='entrez_id',
342+
)
343+
344+
# moving ensemble_id & gene_symbol columns to the front of the table
345+
# such that when transposing the DataFrame they are row 3 and 2
346+
# respectively
347+
merged_transcriptomics.insert(
348+
1,
349+
'ensemble_gene_id',
350+
merged_transcriptomics.pop('ensemble_gene_id')
351+
)
352+
merged_transcriptomics.insert(
353+
1,
354+
'gene_symbol',
355+
merged_transcriptomics.pop('gene_symbol')
356+
)
357+
358+
# writing the expression datatable (the path below targets 'y_data/cancer_gene_expression.tsv'; '/x_data/*_expression.tsv' may be intended -- TODO confirm)
359+
outfile_path = args.WORKDIR.joinpath(
360+
"data_out",
361+
"y_data",
362+
"cancer_gene_expression.tsv"
363+
)
364+
merged_transcriptomics.transpose().to_csv(
365+
path_or_buf=outfile_path,
366+
sep='\t',
367+
header=False
368+
)
369+
370+
371+
#-------------------------------------------------------------------
372+
# create copynumber master table
373+
#-------------------------------------------------------------------
374+
287375

288376

289377
# join the "meta data tables" like copynumber etc.
290378

291379

380+
def merge_master_tables(args, data_type: str='transcriptomics'):
    """
    Helper function to merge several DataTables into one master table

    Every dataset listed by ``cd.list_datasets(raw=True)`` is loaded
    from ``<args.WORKDIR>/data_in_tmp``. Each loaded dataset whose
    ``experiments`` attribute is not ``None`` is formatted with
    ``format(data_type=data_type)`` and the resulting tables are
    combined pairwise with an outer merge on 'entrez_id'.

    Parameters
    ----------
    args : object
        Object exposing a ``WORKDIR`` attribute that supports
        ``joinpath`` (presumably the parsed ``argparse`` namespace with
        a ``pathlib.Path`` -- confirm against the caller).
    data_type : str, optional
        Which table to merge; must be 'transcriptomics' or
        'copy_number', by default 'transcriptomics'

    Returns
    -------
    pandas.DataFrame
        The outer merge (on 'entrez_id') of all formatted tables.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported, or if no loaded dataset
        provides experiments (i.e. there is nothing to merge).
    """

    # Validate before any dataset is loaded: previously an unsupported
    # data_type only surfaced after loading everything, as an opaque
    # TypeError raised by reduce() on an empty list.
    supported_data_types = ('transcriptomics', 'copy_number')
    if data_type not in supported_data_types:
        raise ValueError(
            f"Unsupported data_type '{data_type}'; "
            f"expected one of {supported_data_types}."
        )

    local_path = args.WORKDIR.joinpath('data_in_tmp')

    # getting the info which datasets are available
    data_sets_info = cd.list_datasets(raw=True)

    # loading all available datasets into a dict where the dataset name
    # is the key
    data_sets = {
        data_set: cd.load(name=data_set, local_path=local_path)
        for data_set in data_sets_info.keys()
    }

    # creating a list that contains all DataFrames to be merged; only
    # datasets that carry experiments are considered
    dfs_to_merge = [
        loaded.format(data_type=data_type)
        for loaded in data_sets.values()
        if loaded.experiments is not None
    ]

    if not dfs_to_merge:
        raise ValueError(
            f"No dataset with experiments available to merge for "
            f"data_type '{data_type}'."
        )

    merged_data = ft.reduce(
        lambda left_df, right_df: pd.merge(
            left_df,
            right_df,
            on='entrez_id',
            how='outer',
        ),
        dfs_to_merge,
    )

    # temporary fix to values that should be int but currently aren't
    # in the coderdata dataset storage
    if not merged_data.index.dtype == int:
        merged_data.index = merged_data.index.astype(int)

    return merged_data
434+
292435
def download_datasets(args):
293436
local_path = args.WORKDIR.joinpath('data_in_tmp')
294437
exist_ok = args.OVERWRITE

0 commit comments

Comments
 (0)