@@ -39,41 +39,22 @@ class Split:
3939
4040class Dataset :
4141
42- data_format_params = {
43- "samples" : (
44- "improve_sample_id" , "cancer_type" , "model_type" , "common_name" ,
45- "other_id" , "other_names" , "id_source" , "species"
46- ),
47- "transcriptomics" : (
48- "improve_sample_id" , "entrez_id" , "transcriptomics"
49- ),
50- "proteomics" : ("improve_sample_id" , "entrez_id" , "proteomics" ),
51- "mutations" : ("improve_sample_id" , "entrez_id" , "mutation" ),
52- "copy_number" : ("improve_sample_id" , "entrez_id" , "copy_number" ),
53- "methylation" : ("improve_sample_id" , "entrez_id" , "methylation" ),
54- "experiments" : (
55- "improve_sample_id" , "improve_drug_id" , "dose_response_value"
56- ),
57- "drugs" : ("improve_drug_id" , "chem_name" , "isoSMILES" ),
58- "genes" : ("entrez_id" , "gene_symbol" , "other_id" )
59- }
60-
6142 def __init__ (
6243 self ,
63- name : str = None ,
64- transcriptomics : pd .DataFrame = None ,
65- proteomics : pd .DataFrame = None ,
66- mutations : pd .DataFrame = None ,
67- copy_number : pd .DataFrame = None ,
68- samples : pd .DataFrame = None ,
69- drugs : pd .DataFrame = None ,
70- drug_descriptors : pd .DataFrame = None ,
71- mirna : pd .DataFrame = None ,
72- experiments : pd .DataFrame = None ,
73- methylation : pd .DataFrame = None ,
74- metabolomics : pd .DataFrame = None ,
75- genes : pd .DataFrame = None ,
76- combinations : pd .DataFrame = None ,
44+ name : Optional [ str ] = None ,
45+ transcriptomics : Optional [ pd .DataFrame ] = None ,
46+ proteomics : Optional [ pd .DataFrame ] = None ,
47+ mutations : Optional [ pd .DataFrame ] = None ,
48+ copy_number : Optional [ pd .DataFrame ] = None ,
49+ samples : Optional [ pd .DataFrame ] = None ,
50+ drugs : Optional [ pd .DataFrame ] = None ,
51+ drug_descriptors : Optional [ pd .DataFrame ] = None ,
52+ mirna : Optional [ pd .DataFrame ] = None ,
53+ experiments : Optional [ pd .DataFrame ] = None ,
54+ methylation : Optional [ pd .DataFrame ] = None ,
55+ metabolomics : Optional [ pd .DataFrame ] = None ,
56+ genes : Optional [ pd .DataFrame ] = None ,
57+ combinations : Optional [ pd .DataFrame ] = None ,
7758 ):
7859 """
7960 Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
131112 # getters / setters & deleters
132113 # ----------------------------
133114
134-
135- @property
136- def data_format_params (self ):
137- return self ._data_format_params
138-
139-
140115 @property
141116 def name (self ):
142117 return self ._name
@@ -330,10 +305,10 @@ def format(
330305 'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
331306 'genes' , 'samples' ,
332307 ],
333- use_polars : bool = False ,
308+ remove_na : bool = False ,
334309 ** kwargs : dict ,
335310 ):
336- return format (self , data_type = data_type , use_polars = use_polars , ** kwargs )
311+ return format (self , data_type = data_type , remove_na = remove_na , ** kwargs )
337312
338313
339314 def split_train_other (
@@ -574,6 +549,7 @@ def load(
574549 dataset = pickle .load (file = file )
575550 print ("DONE" , file = sys .stderr )
576551 return dataset
552+ raise FileNotFoundError ("No suitable pickle file found." )
577553
578554
579555
@@ -584,7 +560,7 @@ def format(
584560 'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
585561 'genes' , 'samples' ,
586562 ],
587- use_polars : bool = False ,
563+ remove_na : bool = False ,
588564 ** kwargs : dict ,
589565 ):
590566
@@ -690,6 +666,8 @@ def format(
690666 columns = 'dose_response_metric' ,
691667 values = 'dose_response_value'
692668 ).reset_index ().rename_axis (None , axis = 1 )
669+ if remove_na :
670+ ret .dropna (axis = 'index' , inplace = True )
693671 elif shape == 'matrix' :
694672 if len (metrics ) > 1 :
695673 raise ValueError (
@@ -702,7 +680,6 @@ def format(
702680 index = 'improve_drug_id' ,
703681 columns = 'improve_sample_id'
704682 )
705- return ret
706683
707684 elif data_type == "combinations" :
708685 raise NotImplementedError (
@@ -819,7 +796,7 @@ def split_train_test_validate(
819796 train , other = _split_two_way (
820797 data = data ,
821798 split_type = split_type ,
822- ratio = [ ratio [0 ], ratio [1 ] + ratio [2 ]] ,
799+ ratio = ( ratio [0 ], ratio [1 ] + ratio [2 ]) ,
823800 stratify_by = stratify_by ,
824801 balance = balance ,
825802 random_state = random_state ,
@@ -829,7 +806,7 @@ def split_train_test_validate(
829806 test , val = _split_two_way (
830807 data = other ,
831808 split_type = split_type ,
832- ratio = [ ratio [1 ], ratio [2 ]] ,
809+ ratio = ( ratio [1 ], ratio [2 ]) ,
833810 stratify_by = stratify_by ,
834811 balance = balance ,
835812 random_state = random_state ,
@@ -1041,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
10411018 return data_ret
10421019
10431020def _balance_data (
1044- data : pd .Dataframe ,
1021+ data : pd .DataFrame ,
10451022 random_state : Optional [Union [int ,RandomState ]]= None ,
10461023 # oversample: bool=False,
1047- ) -> pd .Dataframe :
1024+ ) -> pd .DataFrame :
10481025 tmp = deepcopy (data )
10491026 counts = tmp .value_counts ('split_class' )
10501027 ret_df = (
@@ -1060,7 +1037,7 @@ def _create_classes(
10601037 metric : str ,
10611038 num_classes : int = 2 ,
10621039 quantiles : bool = True ,
1063- thresh : float = None ,
1040+ thresh : Optional [ float ] = None ,
10641041 ) -> pd .DataFrame :
10651042 """
10661043 Helper function that bins experiment data into a number of defined
@@ -1149,7 +1126,7 @@ def _split_two_way(
11491126 split_type : Literal [
11501127 'mixed-set' , 'drug-blind' , 'cancer-blind'
11511128 ]= 'mixed-set' ,
1152- ratio : tuple [int , int , int ]= (8 ,2 ),
1129+ ratio : tuple [int , int ]= (8 ,2 ),
11531130 balance : bool = False ,
11541131 stratify_by : Optional [str ]= None ,
11551132 random_state : Optional [Union [int ,RandomState ]]= None ,
@@ -1255,7 +1232,8 @@ def _split_two_way(
12551232 columns = 'dose_response_metric' ,
12561233 values = 'dose_response_value'
12571234 ).reset_index ()
1258-
1235+ if stratify_by is not None :
1236+ df_full .dropna (axis = 'index' , subset = [stratify_by ], inplace = True )
12591237 # Defining the split sizes.
12601238 train_size = float (ratio [0 ]) / sum (ratio )
12611239 test_val_size = float (ratio [1 ]) / sum (ratio )
0 commit comments