|
27 | 27 |
|
28 | 28 | ROOT_DIRECTORY = Path(__file__).resolve().parent.parent |
29 | 29 |
|
| 30 | +# Common sentence transformer settings for reuse |
| 31 | +SENTENCE_TRANSFORMER_BASE = { |
| 32 | + "type": "sentence_transformer", |
| 33 | + "model_name": "all-MiniLM-L6-v2", # Default lightweight model |
| 34 | + "num_candidates": 10, |
| 35 | +    "similarity_threshold": 0.5,  # Renamed from cos_sim_lower_bound for clarity (NOTE(review): breaking change — existing configs using the old key must be migrated or aliased) |
| 36 | + "device": None, # Auto-detect CUDA/CPU |
| 37 | + "batch_size": None, # Auto-detect based on available memory |
| 38 | + "input_col": "preprocessed", |
| 39 | + # Support for model-specific parameters as shown in mixedbread example |
| 40 | + "model_kwargs": { |
| 41 | + "normalize_embeddings": True, |
| 42 | + # Other model-specific params like truncate_dim can be added here |
| 43 | + }, |
| 44 | + "encode_kwargs": { |
| 45 | + "normalize_embeddings": True, |
| 46 | + }, |
| 47 | +} |
| 48 | + |
30 | 49 | # default model parameters picked up in PandasEntityMatching and SparkEntityMatching |
31 | 50 | MODEL_PARAMS = { |
32 | 51 | # type of name preprocessor defined in name_preprocessing.py |
|
44 | 63 | "type": "sni", # Sorted Neighbourhood Indexing, |
45 | 64 | "window_length": 3, |
46 | 65 | }, |
47 | | - # Sentence transformer indexer |
48 | | - { |
49 | | - "type": "sentence_transformer", |
50 | | - "model_name": "all-MiniLM-L6-v2", |
51 | | - "num_candidates": 10, |
52 | | - "cos_sim_lower_bound": 0.5, |
53 | | - "device": None, |
54 | | - "batch_size": None, |
55 | | - "model_kwargs": None, |
56 | | - "encode_kwargs": None, |
57 | | - }, |
| 66 | + # Sentence transformer indexer with base settings (NOTE(review): inserted by reference, not copied — mutating this list entry would mutate SENTENCE_TRANSFORMER_BASE; consider dict(SENTENCE_TRANSFORMER_BASE) or {**SENTENCE_TRANSFORMER_BASE}) |
| 67 | + SENTENCE_TRANSFORMER_BASE, |
58 | 68 | ], |
59 | 69 | "partition_size": 5000, # Number of names in ground_truth and names_to_match per Spark partition: across-worker division. (Set to None for no automatic repartitioning) |
60 | 70 | # input columns: |
|
88 | 98 | "cosine_similarity": { |
89 | 99 | "tokenizer": "words", # "words" or "characters" |
90 | 100 | "ngram": 1, # number of token per n-gram |
91 | | - "cos_sim_lower_bound": 0.0, |
92 | | - "num_candidates": 10, # Number of candidates returned by indexer. |
93 | | - "binary_countvectorizer": True, # use binary countVectorizer or not |
94 | | - # the same value as is used in Spark pipeline in CountVectorizer(vocabSize) 2**25=33554432, 2**24=16777216 |
| 101 | + "similarity_threshold": 0.0, # Renamed from cos_sim_lower_bound for consistency (NOTE(review): breaking change — existing configs using the old key must be migrated or aliased) |
| 102 | + "num_candidates": 10, |
| 103 | + "binary_countvectorizer": True, |
95 | 104 | "max_features": 2**25, |
96 | | - # Python function to be used in blocking ground_truth & names_to_match (only pairs within the same block will be considered in cosine similarity) |
97 | | - # - None # No Blocking |
98 | | - # - blocking_functions.first() # block using first character |
99 | 105 | "blocking_func": None, |
100 | 106 | }, |
101 | 107 | "sni": { |
102 | | - "window_length": 3, # window size for SNI |
103 | | - "mapping_func": None, # custom mapping function applied in SNI step |
| 108 | + "window_length": 3, |
| 109 | + "mapping_func": None, |
104 | 110 | }, |
105 | 111 | "naive": {}, |
106 | 112 | "sentence_transformer": { |
107 | | - "model_name": "all-MiniLM-L6-v2", # Default lightweight model or path to fine-tuned model |
108 | | - "num_candidates": 10, # Number of candidates returned by indexer |
109 | | - "cos_sim_lower_bound": 0.5, # Minimum similarity threshold |
110 | | - "batch_size": None, # Will use auto-detection |
111 | | - "device": None, # Auto-detect device |
112 | | - "blocking_func": None, # Optional blocking function |
113 | | - "input_col": "preprocessed", # Input column name |
114 | | - "model_kwargs": None, # Optional kwargs for model initialization |
115 | | - "encode_kwargs": None, # Optional kwargs for encoding |
| 113 | + **SENTENCE_TRANSFORMER_BASE, |
| 114 | + "blocking_func": None, # Additional parameter specific to this indexer, not part of SENTENCE_TRANSFORMER_BASE |
116 | 115 | }, |
117 | 116 | } |
118 | 117 |
|
|
0 commit comments