Skip to content

Commit 93a1d58

Browse files
authored
Merge pull request #404 from PNNL-CompBio/340-implement-function-arguments-to-datasetformat-that-triggers-removal-of-na-values-in-the-returned-dfs
340: Implement function arguments to `Dataset.format` that trigger removal of NA values in the returned DataFrames
2 parents 113bba2 + 6a74456 commit 93a1d58

1 file changed

Lines changed: 28 additions & 50 deletions

File tree

coderdata/dataset/dataset.py

Lines changed: 28 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -39,41 +39,22 @@ class Split:
3939

4040
class Dataset:
4141

42-
data_format_params = {
43-
"samples": (
44-
"improve_sample_id", "cancer_type", "model_type", "common_name",
45-
"other_id", "other_names", "id_source", "species"
46-
),
47-
"transcriptomics": (
48-
"improve_sample_id", "entrez_id", "transcriptomics"
49-
),
50-
"proteomics": ("improve_sample_id", "entrez_id", "proteomics"),
51-
"mutations": ("improve_sample_id", "entrez_id", "mutation"),
52-
"copy_number": ("improve_sample_id", "entrez_id", "copy_number"),
53-
"methylation": ("improve_sample_id", "entrez_id", "methylation"),
54-
"experiments": (
55-
"improve_sample_id", "improve_drug_id", "dose_response_value"
56-
),
57-
"drugs": ("improve_drug_id", "chem_name", "isoSMILES"),
58-
"genes": ("entrez_id", "gene_symbol", "other_id")
59-
}
60-
6142
def __init__(
6243
self,
63-
name: str=None,
64-
transcriptomics: pd.DataFrame=None,
65-
proteomics: pd.DataFrame=None,
66-
mutations: pd.DataFrame=None,
67-
copy_number: pd.DataFrame=None,
68-
samples: pd.DataFrame=None,
69-
drugs: pd.DataFrame=None,
70-
drug_descriptors: pd.DataFrame=None,
71-
mirna: pd.DataFrame=None,
72-
experiments: pd.DataFrame=None,
73-
methylation: pd.DataFrame=None,
74-
metabolomics: pd.DataFrame=None,
75-
genes: pd.DataFrame=None,
76-
combinations: pd.DataFrame=None,
44+
name: Optional[str]=None,
45+
transcriptomics: Optional[pd.DataFrame]=None,
46+
proteomics: Optional[pd.DataFrame]=None,
47+
mutations: Optional[pd.DataFrame]=None,
48+
copy_number: Optional[pd.DataFrame]=None,
49+
samples: Optional[pd.DataFrame]=None,
50+
drugs: Optional[pd.DataFrame]=None,
51+
drug_descriptors: Optional[pd.DataFrame]=None,
52+
mirna: Optional[pd.DataFrame]=None,
53+
experiments: Optional[pd.DataFrame]=None,
54+
methylation: Optional[pd.DataFrame]=None,
55+
metabolomics: Optional[pd.DataFrame]=None,
56+
genes: Optional[pd.DataFrame]=None,
57+
combinations: Optional[pd.DataFrame]=None,
7758
):
7859
"""
7960
Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
131112
# getters / setters & deleters
132113
# ----------------------------
133114

134-
135-
@property
136-
def data_format_params(self):
137-
return self._data_format_params
138-
139-
140115
@property
141116
def name(self):
142117
return self._name
@@ -330,10 +305,10 @@ def format(
330305
'experiments', 'combinations', 'drug_descriptor', 'drugs',
331306
'genes', 'samples',
332307
],
333-
use_polars: bool=False,
308+
remove_na: bool=False,
334309
**kwargs: dict,
335310
):
336-
return format(self, data_type=data_type, use_polars=use_polars, **kwargs)
311+
return format(self, data_type=data_type, remove_na=False, **kwargs)
337312

338313

339314
def split_train_other(
@@ -574,6 +549,7 @@ def load(
574549
dataset = pickle.load(file=file)
575550
print("DONE", file=sys.stderr)
576551
return dataset
552+
raise FileNotFoundError("No suitable pickle file found.")
577553

578554

579555

@@ -584,7 +560,7 @@ def format(
584560
'experiments', 'combinations', 'drug_descriptor', 'drugs',
585561
'genes', 'samples',
586562
],
587-
use_polars: bool=False,
563+
remove_na: bool=False,
588564
**kwargs: dict,
589565
):
590566

@@ -690,6 +666,8 @@ def format(
690666
columns = 'dose_response_metric',
691667
values = 'dose_response_value'
692668
).reset_index().rename_axis(None, axis=1)
669+
if remove_na:
670+
ret.dropna(axis='index', inplace=True)
693671
elif shape == 'matrix':
694672
if len(metrics) > 1:
695673
raise ValueError(
@@ -702,7 +680,6 @@ def format(
702680
index='improve_drug_id',
703681
columns='improve_sample_id'
704682
)
705-
return ret
706683

707684
elif data_type == "combinations":
708685
raise NotImplementedError(
@@ -819,7 +796,7 @@ def split_train_test_validate(
819796
train, other = _split_two_way(
820797
data=data,
821798
split_type=split_type,
822-
ratio=[ratio[0], ratio[1] + ratio[2]],
799+
ratio=(ratio[0], ratio[1] + ratio[2]),
823800
stratify_by=stratify_by,
824801
balance=balance,
825802
random_state=random_state,
@@ -829,7 +806,7 @@ def split_train_test_validate(
829806
test, val = _split_two_way(
830807
data=other,
831808
split_type=split_type,
832-
ratio=[ratio[1], ratio[2]],
809+
ratio=(ratio[1], ratio[2]),
833810
stratify_by=stratify_by,
834811
balance=balance,
835812
random_state=random_state,
@@ -1041,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
10411018
return data_ret
10421019

10431020
def _balance_data(
1044-
data: pd.Dataframe,
1021+
data: pd.DataFrame,
10451022
random_state: Optional[Union[int,RandomState]]=None,
10461023
# oversample: bool=False,
1047-
) -> pd.Dataframe:
1024+
) -> pd.DataFrame:
10481025
tmp = deepcopy(data)
10491026
counts = tmp.value_counts('split_class')
10501027
ret_df = (
@@ -1060,7 +1037,7 @@ def _create_classes(
10601037
metric: str,
10611038
num_classes: int=2,
10621039
quantiles: bool=True,
1063-
thresh: float=None,
1040+
thresh: Optional[float]=None,
10641041
) -> pd.DataFrame:
10651042
"""
10661043
Helper function that bins experiment data into a number of defined
@@ -1149,7 +1126,7 @@ def _split_two_way(
11491126
split_type: Literal[
11501127
'mixed-set', 'drug-blind', 'cancer-blind'
11511128
]='mixed-set',
1152-
ratio: tuple[int, int, int]=(8,2),
1129+
ratio: tuple[int, int]=(8,2),
11531130
balance: bool=False,
11541131
stratify_by: Optional[str]=None,
11551132
random_state: Optional[Union[int,RandomState]]=None,
@@ -1255,7 +1232,8 @@ def _split_two_way(
12551232
columns = 'dose_response_metric',
12561233
values = 'dose_response_value'
12571234
).reset_index()
1258-
1235+
if stratify_by is not None:
1236+
df_full.dropna(axis='index', subset=[stratify_by], inplace=True)
12591237
# Defining the split sizes.
12601238
train_size = float(ratio[0]) / sum(ratio)
12611239
test_val_size = float(ratio[1]) / sum(ratio)

0 commit comments

Comments
 (0)