Skip to content

Commit d6532e0

Browse files
committed
Added logic to subset the genes table so that it only contains genes associated with the cancer dataset of interest
1 parent 286ce3a commit d6532e0

1 file changed

Lines changed: 18 additions & 0 deletions

File tree

coderdata/dataset/dataset.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,12 +533,30 @@ def load(
533533
setattr(dataset, dataset_type, _load_file(file))
534534
print("DONE", file=sys.stderr)
535535
else:
536+
'''
537+
The genes dataset available in the online repository is
538+
universal and contains information on genes of all
539+
datasets. To that end it needs to be subsetted to only
540+
those genes that are associated with a specific cancer
541+
dataset.
542+
'''
536543
print(
537544
f"Importing 'genes' from {file} ...",
538545
end=' ',
539546
file=sys.stderr
540547
)
541548
dataset.genes = _load_file(file)
549+
550+
entrez_ids = set()
551+
for dataset_type in ('transcriptomics', 'proteomics',
552+
'mutations', 'copy_number'):
553+
if getattr(dataset, dataset_type) is not None:
554+
entrez_ids.update(list(
555+
getattr(dataset, dataset_type)['entrez_id'].unique()
556+
))
557+
dataset.genes = dataset.genes[
558+
dataset.genes['entrez_id'].isin(entrez_ids)
559+
]
542560
print("DONE", file=sys.stderr)
543561

544562
print(f"Importing raw data ... DONE", file=sys.stderr)

0 commit comments

Comments
 (0)