1- import coderdata . load . DatasetLoader as cd
1+ import coderdata as cd
22import yaml
33
class DatasetStatistics:
    """Compute simple counts over the tables of one loaded dataset.

    The loaded object is expected to expose each table (e.g.
    ``transcriptomics``) as an attribute that is either ``None`` or a
    pandas-style table with ``.columns`` and column indexing.
    """

    # Tables scanned for gene identifiers by ``count_unique_genes``.
    _GENE_TABLES = ('transcriptomics', 'proteomics', 'mutations', 'copy_number')

    def __init__(self, dataset_type):
        """Load the dataset identified by *dataset_type* via ``cd.load``."""
        self.data = cd.load(dataset_type)

    def _get_table(self, name):
        """Return the table stored under *name*, or ``None`` if unavailable.

        A default is passed to ``getattr`` so that a dataset lacking the
        attribute entirely is treated the same as one where it is ``None``,
        instead of raising ``AttributeError``.
        """
        return getattr(self.data, name, None)

    def count_unique(self, attribute, unique_field):
        """Return the number of distinct values of a column in one table.

        Args:
            attribute: Name of the table attribute on the loaded dataset.
            unique_field: Column whose distinct values are counted.

        Returns:
            The count of unique values, or 0 when the table is missing,
            is ``None``, or does not contain *unique_field*.
        """
        table = self._get_table(attribute)
        if table is not None and unique_field in table.columns:
            return len(table[unique_field].unique())
        return 0

    def count_unique_genes(self):
        """Return the number of distinct ``entrez_id`` values across tables.

        Only the tables listed in ``_GENE_TABLES`` are scanned; tables that
        are missing, ``None``, or lack an ``entrez_id`` column are skipped.
        """
        gene_ids = set()
        for data_type in self._GENE_TABLES:
            table = self._get_table(data_type)
            if table is not None and 'entrez_id' in table.columns:
                gene_ids.update(table['entrez_id'].unique().tolist())
        return len(gene_ids)
@@ -41,5 +41,5 @@ def calculate_stats_for_datasets(dataset_types):
4141 yaml .dump (stats , file )
4242
# Dataset types: take the names from the package's dataset listing rather
# than a hard-coded list. The keys view is materialized into a list so the
# value passed on is a plain, indexable, serializable sequence, as the
# previous literal list was.
dataset_types = list(cd.list_datasets(raw=True).keys())
calculate_stats_for_datasets(dataset_types)
0 commit comments