|
27 | 27 |
|
28 | 28 | ROOT_DIRECTORY = Path(__file__).resolve().parent.parent |
29 | 29 |
|
| 30 | +# Common sentence transformer settings for reuse |
| 31 | +SENTENCE_TRANSFORMER_BASE = { |
| 32 | + "type": "sentence_transformer", |
| 33 | + "model_name": "all-MiniLM-L6-v2", # Default lightweight model |
| 34 | + "num_candidates": 10, |
| 35 | +    "similarity_threshold": 0.5,  # Renamed from cos_sim_lower_bound for clarity (NOTE(review): breaking change — existing configs using the old key must be migrated or aliased) |
| 36 | + "device": None, # Auto-detect CUDA/CPU |
| 37 | + "batch_size": None, # Auto-detect based on available memory |
| 38 | + "input_col": "preprocessed", |
| 39 | + # Support for model-specific parameters as shown in mixedbread example |
| 40 | + "model_kwargs": { |
| 41 | + "normalize_embeddings": True, |
| 42 | + # Other model-specific params like truncate_dim can be added here |
| 43 | + }, |
| 44 | + "encode_kwargs": { |
| 45 | + "normalize_embeddings": True, |
| 46 | + }, |
| 47 | +} |
| 48 | + |
30 | 49 | # default model parameters picked up in PandasEntityMatching and SparkEntityMatching |
31 | 50 | MODEL_PARAMS = { |
32 | 51 | # type of name preprocessor defined in name_preprocessing.py |
|
44 | 63 | "type": "sni", # Sorted Neighbourhood Indexing, |
45 | 64 | "window_length": 3, |
46 | 65 | }, |
47 | | - # Sentence transformer indexer |
48 | | - { |
49 | | - "type": "sentence_transformer", |
50 | | - "model_name": "all-MiniLM-L6-v2", |
51 | | - "num_candidates": 10, |
52 | | - "cos_sim_lower_bound": 0.5, |
53 | | - "device": None, |
54 | | - "batch_size": None, |
55 | | - "model_kwargs": None, |
56 | | - "encode_kwargs": None, |
57 | | - }, |
| 66 | + # Sentence transformer indexer with base settings (NOTE(review): inserted by reference, not copied — mutating this list entry would mutate SENTENCE_TRANSFORMER_BASE; consider dict(SENTENCE_TRANSFORMER_BASE) or {**SENTENCE_TRANSFORMER_BASE}) |
| 67 | + SENTENCE_TRANSFORMER_BASE, |
58 | 68 | ], |
59 | 69 | "partition_size": 5000, # Number of names in ground_truth and names_to_match per Spark partition: across-worker division. (Set to None for no automatic repartitioning) |
60 | 70 | # input columns: |
|
88 | 98 | "cosine_similarity": { |
89 | 99 | "tokenizer": "words", # "words" or "characters" |
90 | 100 | "ngram": 1, # number of token per n-gram |
91 | | - "cos_sim_lower_bound": 0.0, |
92 | | - "num_candidates": 10, # Number of candidates returned by indexer. |
93 | | - "binary_countvectorizer": True, # use binary countVectorizer or not |
94 | | - # the same value as is used in Spark pipeline in CountVectorizer(vocabSize) 2**25=33554432, 2**24=16777216 |
| 101 | + "similarity_threshold": 0.0, # Renamed from cos_sim_lower_bound for consistency (NOTE(review): breaking change — existing configs using the old key must be migrated or aliased) |
| 102 | + "num_candidates": 10, |
| 103 | + "binary_countvectorizer": True, |
95 | 104 | "max_features": 2**25, |
96 | | - # Python function to be used in blocking ground_truth & names_to_match (only pairs within the same block will be considered in cosine similarity) |
97 | | - # - None # No Blocking |
98 | | - # - blocking_functions.first() # block using first character |
99 | 105 | "blocking_func": None, |
100 | 106 | }, |
101 | 107 | "sni": { |
102 | | - "window_length": 3, # window size for SNI |
103 | | - "mapping_func": None, # custom mapping function applied in SNI step |
| 108 | + "window_length": 3, |
| 109 | + "mapping_func": None, |
104 | 110 | }, |
105 | 111 | "naive": {}, |
106 | 112 | "sentence_transformer": { |
107 | | - "model_name": "all-MiniLM-L6-v2", # Default lightweight model or path to fine-tuned model |
108 | | - "num_candidates": 10, # Number of candidates returned by indexer |
109 | | - "cos_sim_lower_bound": 0.5, # Minimum similarity threshold |
110 | | - "batch_size": None, # Will use auto-detection |
111 | | - "device": None, # Auto-detect device |
112 | | - "blocking_func": None, # Optional blocking function |
113 | | - "input_col": "preprocessed", # Input column name |
114 | | - "model_kwargs": None, # Optional kwargs for model initialization |
115 | | - "encode_kwargs": None, # Optional kwargs for encoding |
| 113 | + **SENTENCE_TRANSFORMER_BASE, |
| 114 | + "blocking_func": None, # Additional parameter specific to this indexer, not part of SENTENCE_TRANSFORMER_BASE |
116 | 115 | }, |
117 | 116 | } |
118 | 117 |
|
|
0 commit comments