@@ -470,6 +470,21 @@ def load(
470470 _description_
471471 """
472472
473+ data_types_to_load = (
474+ 'transcriptomics' ,
475+ 'proteomics' ,
476+ 'mutations' ,
477+ 'copy_number' ,
478+ 'samples' ,
479+ 'drugs' ,
480+ 'drug_descriptors' ,
481+ 'mirna' ,
482+ 'experiments' ,
483+ 'methylation' ,
484+ 'metabolomics' ,
485+ 'genes' ,
486+ )
487+
473488 if type (local_path ) is not Path :
474489 try :
475490 local_path = Path (local_path )
@@ -487,30 +502,63 @@ def load(
487502 dataset = Dataset (name )
488503 accepted_file_endings = ('.csv' , '.tsv' , '.csv.gz' , '.tsv.gz' )
489504 print (f"Importing raw data ..." , file = sys .stderr )
490- for child in local_path .iterdir ():
491- if child .name in ["genes.csv" , "genes.csv.gz" ]:
505+
506+ # generating the file list that contains all files that need to
507+ # be imported based on the Dataset name
508+ files = {}
509+ for p in local_path .glob (f'{ name } _*' ):
510+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
511+ dataset_type = p .name [len (name )+ 1 :].split ('.' )[0 ]
512+ files [dataset_type ] = p
513+ for p in local_path .glob (f'genes*' ):
514+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
515+ files ['genes' ] = p
516+
517+ for dataset_type in data_types_to_load :
518+ if dataset_type not in files :
492519 print (
493- f"Importing 'genes' from { child } ... " ,
494- end = ' ' ,
520+ f"' { dataset_type } ' not available for { name } " ,
521+ end = '\n ' ,
495522 file = sys .stderr
496523 )
497- dataset .genes = _load_file (child )
498- print ("DONE" , file = sys .stderr )
499-
500- if (
501- child .name .startswith (name )
502- and child .name .endswith (accepted_file_endings )
503- ):
504-
505- dataset_type = child .name [len (name )+ 1 :].split ('.' )[0 ]
524+ continue
525+ file = files [dataset_type ]
526+ if dataset_type != 'genes' :
506527 print (
507- f"Importing '{ dataset_type } ' from { child } ..." ,
528+ f"Importing '{ dataset_type } ' from { file } ..." ,
508529 end = ' ' ,
509530 file = sys .stderr
510531 )
511532 if hasattr (dataset , dataset_type ):
512- setattr (dataset , dataset_type , _load_file (child ))
533+ setattr (dataset , dataset_type , _load_file (file ))
513534 print ("DONE" , file = sys .stderr )
535+ else :
536+ '''
537+ The genes dataset available in the online repository is
538+ universal and contains information on genes of all
539+ datasets. To that end it needs to be subsetted to only
540+ those genes that are associated with a specific cancer
541+ dataset.
542+ '''
543+ print (
544+ f"Importing 'genes' from { file } ..." ,
545+ end = ' ' ,
546+ file = sys .stderr
547+ )
548+ dataset .genes = _load_file (file )
549+
550+ entrez_ids = set ()
551+ for dataset_type in ('transcriptomics' , 'proteomics' ,
552+ 'mutations' , 'copy_number' ):
553+ if getattr (dataset , dataset_type ) is not None :
554+ entrez_ids .update (list (
555+ getattr (dataset , dataset_type )['entrez_id' ].unique ()
556+ ))
557+ dataset .genes = dataset .genes [
558+ dataset .genes ['entrez_id' ].isin (entrez_ids )
559+ ]
560+ print ("DONE" , file = sys .stderr )
561+
514562 print (f"Importing raw data ... DONE" , file = sys .stderr )
515563 return dataset
516564
0 commit comments