|
1 | 1 |
|
2 | 2 | import argparse |
| 3 | +import functools as ft |
3 | 4 | from os import PathLike |
4 | 5 | from pathlib import Path |
5 | 6 | from pathlib import PurePath |
@@ -63,6 +64,11 @@ def main(): |
63 | 64 | type=int, |
64 | 65 | default=10 |
65 | 66 | ) |
| 67 | + p_process_datasets.add_argument( |
| 68 | + '-g', '--gene_table', dest='GENE_TABLE', |
| 69 | + type=str, |
| 70 | + required=True |
| 71 | + ) |
66 | 72 |
|
67 | 73 | p_all = command_parsers.add_parser( |
68 | 74 | "all", |
@@ -189,6 +195,8 @@ def process_datasets(args): |
189 | 195 |
|
190 | 196 | # generation of the actual splits |
191 | 197 |
|
|     | 198 | +    # TODO: potentially clean this up with a helper function |
| 199 | + |
192 | 200 | splits = {} |
193 | 201 | for i in range(0, args.NUM_SPLITS): |
194 | 202 | splits[i] = data_sets[data_set].train_test_validate( |
@@ -283,12 +291,147 @@ def process_datasets(args): |
283 | 291 | index=False, |
284 | 292 | header=False |
285 | 293 | ) |
286 | | - |
| 294 | + |
| 295 | + #------------------------------------------------------------------- |
| 296 | + # getting common / reference gene symbols |
| 297 | + #------------------------------------------------------------------- |
| 298 | + |
| 299 | + # TODO: potentially add mapping to the genes table in coderdata |
| 300 | + # currently we do not make use of the 'genes' DataFrame in a Dataset |
| 301 | + # object. The gene symbol information comes directly from HGNC. |
|     | 302 | +    # There are instances where the entrez_id that is recorded in the |
|     | 303 | +    # expression / transcriptome is not in HGNC. Those currently result |
| 304 | + # in NaNs for the gene symbol |
| 305 | + |
| 306 | + data_gene_names = pd.read_table( |
| 307 | + filepath_or_buffer=args.GENE_TABLE, |
| 308 | + ) |
| 309 | + data_gene_names.rename( |
| 310 | + columns={ |
| 311 | + 'NCBI Gene ID': 'entrez_id', |
| 312 | + 'Ensembl gene ID': 'ensemble_gene_id', |
| 313 | + 'Approved symbol': 'gene_symbol' |
| 314 | + }, |
| 315 | + inplace=True, |
| 316 | + ) |
| 317 | + data_gene_names.dropna(axis=0, subset='entrez_id', inplace=True) |
| 318 | + data_gene_names['entrez_id'] = data_gene_names['entrez_id'].astype(int) |
| 319 | + |
| 320 | + #------------------------------------------------------------------- |
| 321 | + # create gene expression master table |
| 322 | + #------------------------------------------------------------------- |
| 323 | + |
| 324 | + merged_transcriptomics = merge_master_tables( |
| 325 | + args=args, |
| 326 | + data_type='transcriptomics' |
| 327 | + ) |
| 328 | + |
| 329 | + # TODO: Potentially cast 'NaN's to 0 |
| 330 | + |
| 331 | + # merging ensemble gene id & gene symbol into the transcriptomics |
| 332 | + # data |
| 333 | + merged_transcriptomics = pd.merge( |
| 334 | + merged_transcriptomics, |
| 335 | + data_gene_names[[ |
| 336 | + 'entrez_id', |
| 337 | + 'ensemble_gene_id', |
| 338 | + 'gene_symbol' |
| 339 | + ]], |
| 340 | + how='left', |
| 341 | + on='entrez_id', |
| 342 | + ) |
| 343 | + |
| 344 | + # moving ensemble_id & gene_symbol columns to the front of the table |
| 345 | + # such that when transposing the DataFrame they are row 3 and 2 |
| 346 | + # respectively |
| 347 | + merged_transcriptomics.insert( |
| 348 | + 1, |
| 349 | + 'ensemble_gene_id', |
| 350 | + merged_transcriptomics.pop('ensemble_gene_id') |
| 351 | + ) |
| 352 | + merged_transcriptomics.insert( |
| 353 | + 1, |
| 354 | + 'gene_symbol', |
| 355 | + merged_transcriptomics.pop('gene_symbol') |
| 356 | + ) |
| 357 | + |
| 358 | + # writing the expression datatable to '/x_data/*_expression.tsv' |
| 359 | + outfile_path = args.WORKDIR.joinpath( |
| 360 | + "data_out", |
| 361 | + "y_data", |
| 362 | + "cancer_gene_expression.tsv" |
| 363 | + ) |
| 364 | + merged_transcriptomics.transpose().to_csv( |
| 365 | + path_or_buf=outfile_path, |
| 366 | + sep='\t', |
| 367 | + header=False |
| 368 | + ) |
| 369 | + |
| 370 | + |
| 371 | + #------------------------------------------------------------------- |
| 372 | + # create copynumber master table |
| 373 | + #------------------------------------------------------------------- |
| 374 | + |
287 | 375 |
|
288 | 376 |
|
289 | 377 | # join the "meta data tables" like copynumber etc. |
290 | 378 |
|
291 | 379 |
|
def merge_master_tables(args, data_type: str = 'transcriptomics'):
    """
    Merge the per-dataset tables of one data type into a single master
    table keyed on ``entrez_id``.

    Every dataset known to ``coderdata`` is loaded from (or cached to)
    ``args.WORKDIR/data_in_tmp``; datasets that provide experiment data
    contribute their ``data_type`` table, and all contributed tables are
    successively outer-merged on ``'entrez_id'``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments; only ``args.WORKDIR`` (a
        ``pathlib.Path``) is read, as the parent of the local dataset
        cache directory.
    data_type : str, optional
        Which data type to merge, by default ``'transcriptomics'``.
        Only ``'transcriptomics'`` and ``'copy_number'`` are supported.

    Returns
    -------
    pandas.DataFrame
        The outer merge (on ``'entrez_id'``) of the requested data type
        across all datasets that have experiment data.

    Raises
    ------
    ValueError
        If ``data_type`` is unsupported, or no dataset contributed any
        data to merge (previously these surfaced as an opaque
        ``TypeError`` from ``functools.reduce`` on an empty iterable).
    """
    # Validate once up front: this check is loop-invariant, and an
    # unsupported data_type would otherwise silently produce an empty
    # merge list and crash inside reduce().
    if data_type not in ('transcriptomics', 'copy_number'):
        raise ValueError(
            f"unsupported data_type: {data_type!r}; expected "
            "'transcriptomics' or 'copy_number'"
        )

    local_path = args.WORKDIR.joinpath('data_in_tmp')

    # getting the info which datasets are available
    data_sets_info = cd.list_datasets(raw=True)

    # Load each dataset and collect the DataFrames to be merged.
    # Datasets without experiment data are skipped. Loading lazily in
    # one pass avoids holding a dict of every dataset object just to
    # iterate it once.
    dfs_to_merge = []
    for data_set_name in data_sets_info:
        data_set = cd.load(name=data_set_name, local_path=local_path)
        if data_set.experiments is not None:
            dfs_to_merge.append(data_set.format(data_type=data_type))

    if not dfs_to_merge:
        raise ValueError(
            f"no dataset contained mergeable {data_type!r} data"
        )

    # Successively outer-merge all tables on 'entrez_id' so every gene
    # present in at least one dataset is retained (missing values
    # become NaN).
    merged_data = ft.reduce(
        lambda left_df, right_df: pd.merge(
            left_df,
            right_df,
            on='entrez_id',
            how='outer',
        ),
        dfs_to_merge,
    )

    # temporary fix to values that should be int but currently aren't
    # in the coderdata dataset storage
    if not merged_data.index.dtype == int:
        merged_data.index = merged_data.index.astype(int)

    return merged_data
| 433 | + |
| 434 | + |
292 | 435 | def download_datasets(args): |
293 | 436 | local_path = args.WORKDIR.joinpath('data_in_tmp') |
294 | 437 | exist_ok = args.OVERWRITE |
|
0 commit comments