Skip to content

Commit 113bba2

Browse files
authored
Merge pull request #381 from PNNL-CompBio/304-datasetgenes-not-dataset-specific
304 datasetgenes not dataset specific
2 parents 781891a + d6532e0 commit 113bba2

1 file changed

Lines changed: 63 additions & 15 deletions

File tree

coderdata/dataset/dataset.py

Lines changed: 63 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,21 @@ def load(
470470
_description_
471471
"""
472472

473+
data_types_to_load = (
474+
'transcriptomics',
475+
'proteomics',
476+
'mutations',
477+
'copy_number',
478+
'samples',
479+
'drugs',
480+
'drug_descriptors',
481+
'mirna',
482+
'experiments',
483+
'methylation',
484+
'metabolomics',
485+
'genes',
486+
)
487+
473488
if type(local_path) is not Path:
474489
try:
475490
local_path = Path(local_path)
@@ -487,30 +502,63 @@ def load(
487502
dataset = Dataset(name)
488503
accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
489504
print(f"Importing raw data ...", file=sys.stderr)
490-
for child in local_path.iterdir():
491-
if child.name in ["genes.csv", "genes.csv.gz"]:
505+
506+
# generating the file list that contains all files that need to
507+
# be imported based on the Dataset name
508+
files = {}
509+
for p in local_path.glob(f'{name}_*'):
510+
if p.name.endswith(accepted_file_endings) and p.is_file():
511+
dataset_type = p.name[len(name)+1:].split('.')[0]
512+
files[dataset_type] = p
513+
for p in local_path.glob(f'genes*'):
514+
if p.name.endswith(accepted_file_endings) and p.is_file():
515+
files['genes'] = p
516+
517+
for dataset_type in data_types_to_load:
518+
if dataset_type not in files:
492519
print(
493-
f"Importing 'genes' from {child} ...",
494-
end=' ',
520+
f"'{dataset_type}' not available for {name}",
521+
end='\n',
495522
file=sys.stderr
496523
)
497-
dataset.genes = _load_file(child)
498-
print("DONE", file=sys.stderr)
499-
500-
if (
501-
child.name.startswith(name)
502-
and child.name.endswith(accepted_file_endings)
503-
):
504-
505-
dataset_type = child.name[len(name)+1:].split('.')[0]
524+
continue
525+
file = files[dataset_type]
526+
if dataset_type != 'genes':
506527
print(
507-
f"Importing '{dataset_type}' from {child} ...",
528+
f"Importing '{dataset_type}' from {file} ...",
508529
end=' ',
509530
file=sys.stderr
510531
)
511532
if hasattr(dataset, dataset_type):
512-
setattr(dataset, dataset_type, _load_file(child))
533+
setattr(dataset, dataset_type, _load_file(file))
513534
print("DONE", file=sys.stderr)
535+
else:
536+
'''
537+
The genes dataset available in the online repository is
538+
universal and contains information on genes of all
539+
datasets. To that end it needs to be subsetted to only
540+
those genes that are associated with a specific cancer
541+
dataset.
542+
'''
543+
print(
544+
f"Importing 'genes' from {file} ...",
545+
end=' ',
546+
file=sys.stderr
547+
)
548+
dataset.genes = _load_file(file)
549+
550+
entrez_ids = set()
551+
for dataset_type in ('transcriptomics', 'proteomics',
552+
'mutations', 'copy_number'):
553+
if getattr(dataset, dataset_type) is not None:
554+
entrez_ids.update(list(
555+
getattr(dataset, dataset_type)['entrez_id'].unique()
556+
))
557+
dataset.genes = dataset.genes[
558+
dataset.genes['entrez_id'].isin(entrez_ids)
559+
]
560+
print("DONE", file=sys.stderr)
561+
514562
print(f"Importing raw data ... DONE", file=sys.stderr)
515563
return dataset
516564

0 commit comments

Comments
 (0)