Skip to content

Commit 3e4ff22

Browse files
committed
Implementing scaling.py functions in plugins
1 parent bcf7593 commit 3e4ff22

4 files changed

Lines changed: 117 additions & 52 deletions

File tree

foqus_lib/framework/surrogate/keras_nn.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from pathlib import Path
4242
from tokenize import String
4343

44+
from typing import Tuple
4445
import numpy as np
4546
import pandas as pd
4647
import tensorflow as tf # pylint: disable=import-error
@@ -52,6 +53,19 @@
5253
from foqus_lib.framework.surrogate.surrogate import surrogate
5354
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
5455

56+
from foqus_lib.framework.surrogate.scaling import (
57+
BaseScaler,
58+
LinearScaler,
59+
LogScaler,
60+
LogScaler2,
61+
PowerScaler,
62+
PowerScaler2,
63+
map_name_to_scaler,
64+
scale_dataframe
65+
)
66+
67+
# mapping between the human-readable name for the scaling variant
68+
# and an instance of the corresponding scaler class
5569

5670
# custom class to define Keras NN layers
5771
@tf.keras.utils.register_keras_serializable()
@@ -293,6 +307,14 @@ def __init__(self, dat=None):
293307
desc="Name of output file for model, should have file extension: .keras",
294308
hint="Enter a custom file name if desired",
295309
)
310+
# add option for normalization_form, make dropdown option
311+
self.options.add(
312+
name="scaling_function",
313+
default="Linear",
314+
dtype=str,
315+
desc="Scaling/normalization function for input data",
316+
validValues=list(map_name_to_scaler.keys()),
317+
)
296318

297319
def run(self):
298320
"""
@@ -316,6 +338,9 @@ def run(self):
316338
self.msgQueue.put(f"input data columns: {input_data.columns}")
317339
self.msgQueue.put(f"output data columns: {output_data.columns}")
318340

341+
# extract scaling function option, apply it to the input data
342+
# get scaler object
343+
319344
# np.random.seed(46)
320345
# rn.seed(1342)
321346
# tf.random.set_seed(62)
@@ -341,22 +366,13 @@ def run(self):
341366
xdata = input_data
342367
zdata = output_data
343368

344-
xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
345-
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds
346-
347-
# normalize data using Linear form
348-
# users can normalize with any allowed form # manually, and then pass the
349-
# appropriate flag to FOQUS from the allowed list:
350-
# ["Linear", "Log", "Power", "Log 2", "Power 2"] - see the documentation for
351-
# details on the scaling formulations
352-
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
353-
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
354-
xdata, zdata = np.array(xdata), np.array(zdata)
355-
for i in range(len(xdata)):
356-
for j in range(len(xlabels)):
357-
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
358-
for j in range(len(zlabels)):
359-
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
369+
scaling_func_option = self.options["scaling_function"].value
370+
371+
scaler_instance = map_name_to_scaler[scaling_func_option]
372+
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
373+
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)
374+
375+
print(f"using scaling function: {scaling_func_option}")
360376

361377
# method to create model
362378
def create_model():
@@ -370,7 +386,7 @@ def create_model():
370386
input_bounds=xdata_bounds,
371387
output_bounds=zdata_bounds,
372388
normalized=True,
373-
normalization_form="Linear",
389+
normalization_form=scaling_func_option,
374390
)
375391

376392
outputs = layers(inputs) # use network as function outputs = f(inputs)

foqus_lib/framework/surrogate/pytorch_nn.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,16 @@
5050
# from foqus_lib.framework.graph.graph import Graph
5151
from foqus_lib.framework.surrogate.surrogate import surrogate
5252
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
53+
from foqus_lib.framework.surrogate.scaling import (
54+
BaseScaler,
55+
LinearScaler,
56+
LogScaler,
57+
LogScaler2,
58+
PowerScaler,
59+
PowerScaler2,
60+
map_name_to_scaler,
61+
scale_dataframe
62+
)
5363

5464
# custom class to define Keras NN layers
5565
np.random.seed(46)
@@ -284,6 +294,13 @@ def __init__(self, dat=None):
284294
desc="Name of output file for model, should have file extension: .pt",
285295
hint="Enter a custom file name if desired",
286296
)
297+
self.options.add(
298+
name="scaling_function",
299+
default="Linear",
300+
dtype=str,
301+
desc="Scaling/normalization function for input data",
302+
validValues=["Linear", "Log", "Log2", "Power", "Power2"],
303+
)
287304

288305
def run(self):
289306
"""
@@ -326,22 +343,16 @@ def run(self):
326343
zlabels = list(output_data.columns)
327344
xdata = input_data
328345
zdata = output_data
329-
xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
330-
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds
331-
332-
# normalize data using Linear form, pass as custom string and parse with SymPy
333-
# users can normalize with any allowed form # manually, and then pass the
334-
# appropriate flag to FOQUS from the allowed list:
335-
# ["Linear", "Log", "Power", "Log 2", "Power 2", "Custom] - see the
336-
# documentation for details on the scaling formulations
337-
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
338-
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
339-
xdata, zdata = np.array(xdata), np.array(zdata)
340-
for i in range(len(xdata)):
341-
for j in range(len(xlabels)):
342-
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
343-
for j in range(len(zlabels)):
344-
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
346+
# xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
347+
# zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds
348+
349+
scaling_func_option = self.options["scaling_function"].value
350+
351+
scaler_instance = map_name_to_scaler[scaling_func_option]
352+
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
353+
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)
354+
355+
print(f"using scaling function: {scaling_func_option}")
345356

346357
model_data = np.concatenate(
347358
(xdata, zdata), axis=1
@@ -353,8 +364,11 @@ def run(self):
353364

354365
# raise exception here after BPC position
355366
# create model
356-
x_train = torch.from_numpy(xdata).float().to(device)
357-
z_train = torch.from_numpy(zdata).float().to(device)
367+
368+
# need to convert xdata to a numpy array for the below to work
369+
# otherwise causes TypeError: expected np.ndarray (got DataFrame)
370+
x_train = torch.from_numpy(xdata.to_numpy()).float().to(device)
371+
z_train = torch.from_numpy(zdata.to_numpy()).float().to(device)
358372

359373
# print type at this point
360374
# can also print inside create_model

foqus_lib/framework/surrogate/scaling.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from collections import OrderedDict
66

77
import numpy as np
8+
import pandas as pd
9+
from typing import Tuple
810

911

1012
def validate_for_scaling(array_in, lo, hi) -> None:
@@ -130,6 +132,10 @@ def unscale_power2(array_in, lo, hi):
130132
return result
131133

132134
class BaseScaler:
135+
# def __init__(self, data_array: np.ndarray):
136+
# self.data = data_array
137+
# self.lo_ = np.min(data_array)
138+
# self.hi_ = np.max(data_array)
133139

134140
def fit(self, X: np.ndarray):
135141
self.lo_ = np.min(X)
@@ -184,4 +190,24 @@ def transform(self, X: np.ndarray) -> np.ndarray:
184190

185191
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
186192
return unscale_power2(X, self.lo_, self.hi_)
187-
193+
194+
# Mapping from the human-readable scaling-variant name (offered to users
# as the "scaling_function" dropdown option in the surrogate plugins) to a
# shared instance of the corresponding scaler class.
# NOTE(review): these are module-level singletons — fitting one mutates its
# lo_/hi_ attributes in place, so callers must read the fitted bounds
# immediately after each fit (as scale_dataframe below does).
map_name_to_scaler = {
    "Linear": LinearScaler(),
    "Log": LogScaler(),
    "Log2": LogScaler2(),
    "Power": PowerScaler(),
    "Power2": PowerScaler2(),
}
def scale_dataframe(df: pd.DataFrame, scaler: "BaseScaler") -> Tuple[pd.DataFrame, dict]:
    """Scale each column of *df* independently with *scaler*.

    The scaler is re-fit on every column in turn, so a single shared
    instance (such as the singletons in ``map_name_to_scaler``) may be
    passed; the fitted ``lo_``/``hi_`` attributes are captured per column
    immediately after each fit.

    Args:
        df: input data; one scaling is computed per column.
        scaler: object exposing ``fit_transform(X)`` that sets the fitted
            bounds as ``lo_`` and ``hi_`` attributes.

    Returns:
        A tuple ``(scaled_df, bounds)`` where ``scaled_df`` has the same
        columns and index as *df*, and ``bounds`` maps each column name to
        its ``(lo, hi)`` tuple used for scaling.
    """
    scaled_columns = {}
    bounds = {}

    for col_name in df:
        # fit_transform() overwrites scaler.lo_/hi_ for each column, so
        # record the bounds right away, before the next column clobbers them.
        scaled_columns[col_name] = scaler.fit_transform(df[col_name])
        bounds[col_name] = scaler.lo_, scaler.hi_

    # Build the result in one shot rather than filling a NaN-initialized
    # frame column by column; passing columns/index explicitly preserves
    # them even when df has no columns or no rows.
    scaled_df = pd.DataFrame(scaled_columns, columns=df.columns, index=df.index)
    return scaled_df, bounds

foqus_lib/framework/surrogate/scikit_nn.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@
5252
from foqus_lib.framework.surrogate.surrogate import surrogate
5353
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
5454

55+
from foqus_lib.framework.surrogate.scaling import (
56+
BaseScaler,
57+
LinearScaler,
58+
LogScaler,
59+
LogScaler2,
60+
PowerScaler,
61+
PowerScaler2,
62+
map_name_to_scaler,
63+
scale_dataframe
64+
)
5565

5666
def validate_training_data(xdata: np.ndarray, zdata: np.ndarray):
5767
number_columns_in_xdata = xdata.shape[1]
@@ -250,6 +260,14 @@ def __init__(self, dat=None):
250260
hint="Enter a custom file name if desired",
251261
)
252262

263+
self.options.add(
264+
name="scaling_function",
265+
default="Linear",
266+
dtype=str,
267+
desc="Scaling/normalization function for input data",
268+
validValues=["Linear", "Log", "Log2", "Power", "Power2"],
269+
)
270+
253271
def run(self):
254272
"""
255273
This function overloads the Thread class function,
@@ -300,22 +318,13 @@ def run(self):
300318
xdata = input_data
301319
zdata = output_data
302320

303-
xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
304-
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds
305-
306-
# normalize data using Linear form, pass as custom string and parse with SymPy
307-
# users can normalize with any allowed form # manually, and then pass the
308-
# appropriate flag to FOQUS from the allowed list:
309-
# ["Linear", "Log", "Power", "Log 2", "Power 2", "Custom] - see the
310-
# documentation for details on the scaling formulations
311-
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
312-
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
313-
xdata, zdata = np.array(xdata), np.array(zdata)
314-
for i in range(len(xdata)):
315-
for j in range(len(xlabels)):
316-
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
317-
for j in range(len(zlabels)):
318-
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
321+
scaling_func_option = self.options["scaling_function"].value
322+
323+
scaler_instance = map_name_to_scaler[scaling_func_option]
324+
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
325+
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)
326+
327+
print(f"using scaling function: {scaling_func_option}")
319328

320329
model_data = np.concatenate(
321330
(xdata, zdata), axis=1

0 commit comments

Comments
 (0)