Skip to content

Commit ee4d97a

Browse files
edg956, claude, and pmbrull
committed
Use context to enhance recognition by default (#25856)
* Implement custom context enhancement for Presidio recognizers

  Presidio's default context enhancement relies heavily on NLP and often fails when analyzing individual values rather than full text. This implements a custom context enhancement that:
  - Boosts recognizer scores to MAX when context keywords match
  - Applies a minimum score threshold (0.3) before enhancement
  - Skips already-enhanced results to prevent double-boosting
  - Introduces a decorator pattern for composing recognizer enhancements
  - Adds eager_us_bank_recognizer with higher base scores for better results

  The enhancement works by checking if any context words from the recognizer match the provided context list, then boosting the confidence score to maximum and setting the IS_SCORE_ENHANCED_BY_CONTEXT_KEY metadata flag.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <noreply@anthropic.com>

* Integrate context enhancement into recognizer factory

  Updates PresidioRecognizerFactory to apply the new decorator pattern:
  - All recognizers now use enhance_using_context decorator
  - Confidence threshold filtering applied via filter_enhanced_results_below_threshold
  - Decorators composed using decorate_recognizer for clean application
  - Context passed to PatternRecognizer during creation

  This ensures all enabled recognizers benefit from custom context enhancement while maintaining backward compatibility with confidence thresholds.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <noreply@anthropic.com>

* Add migration to update PII tag recognizers with enhanced configs

  Adds a v1.1.22 migration that updates existing PII tags with improved recognizer configurations featuring context keywords and optimized patterns.

  Changes:
  - Add patchRecognizers method in CollectionDAO for updating tag recognizers
  - Implement setRecognizersForSensitiveTags in MigrationUtil to load and apply recognizer configs from piiTagsWithRecognizers.json
  - Update piiTagsWithRecognizers.json with context keywords for better classification accuracy
  - Execute migration as post-DDL script for both MySQL and PostgreSQL

  This migration ensures existing deployments benefit from the improved context enhancement logic without manual reconfiguration.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <noreply@anthropic.com>

* Fix potential NPE and type mismatch in migration utility

  Addresses code review feedback:
  1. Fix potential NullPointerException in setRecognizersForSensitiveTags
     - Use Boolean.TRUE.equals() instead of auto-unboxing for nullable Boolean
     - Prevents NPE when autoClassificationEnabled is absent from JSON
     - Follows existing pattern from v1120/MigrationUtil.java
  2. Fix Boolean vs boolean type mismatch in updateTagRecognizers
     - Change isForceMigration parameter from boxed Boolean to primitive boolean
     - Matches caller signature and eliminates latent NPE risk
     - Maintains consistency across method signatures

  🤖 Generated with [Claude Code](https://claude.com/claude-code)
  Co-Authored-By: Claude <noreply@anthropic.com>

* Add context to NHSRecognizer
* Fix typing
* Fix broken unit tests
* Fix broken integration test

---------

Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
1 parent 36ec306 commit ee4d97a

10 files changed

Lines changed: 807 additions & 19 deletions

File tree

ingestion/src/metadata/pii/algorithms/presidio_recognizer_factory.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,9 @@
2828
from metadata.generated.schema.type.recognizer import Recognizer
2929
from metadata.generated.schema.type.recognizers.regexFlags import RegexFlags
3030
from metadata.pii.algorithms.presidio_utils import (
31-
apply_confidence_threshold,
31+
decorate_recognizer,
32+
enhance_using_context,
33+
filter_enhanced_results_below_threshold,
3234
recognizer_factories,
3335
)
3436
from metadata.utils.logger import pii_logger
@@ -84,9 +86,15 @@ def create_recognizer(
8486
logger.warning(f"Unknown recognizer type for {recognizer_config.name}")
8587
return None
8688

87-
if recognizer and (threshold := recognizer_config.confidenceThreshold):
88-
patch_analyze = apply_confidence_threshold(threshold)
89-
recognizer = patch_analyze(recognizer)
89+
decorators: List[Callable[[EntityRecognizer], EntityRecognizer]] = [
90+
enhance_using_context,
91+
]
92+
93+
if threshold := recognizer_config.confidenceThreshold:
94+
decorators.append(filter_enhanced_results_below_threshold(threshold))
95+
96+
if recognizer:
97+
recognizer = decorate_recognizer(*decorators)(recognizer)
9098

9199
return recognizer
92100

@@ -127,6 +135,7 @@ def _create_pattern_recognizer(
127135
patterns=patterns,
128136
name=recognizer_config.name.root,
129137
supported_language=config.supportedLanguage.value,
138+
context=config.context or [],
130139
global_regex_flags=PresidioRecognizerFactory._get_regex_flags(
131140
config.regexFlags
132141
),

ingestion/src/metadata/pii/algorithms/presidio_utils.py

Lines changed: 161 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,8 @@
1313
"""
1414
import inspect
1515
import logging
16-
from functools import cache
16+
import types
17+
from functools import cache, wraps
1718
from itertools import groupby
1819
from typing import (
1920
Any,
@@ -45,6 +46,7 @@
4546
CreditCardRecognizer,
4647
DateRecognizer,
4748
NhsRecognizer,
49+
UsBankRecognizer,
4850
UsLicenseRecognizer,
4951
)
5052
from spacy.cli.download import download # pyright: ignore[reportUnknownVariableType]
@@ -288,6 +290,63 @@ def date_recognizer(**kwargs: Any) -> ValidatedDateRecognizer:
288290
return ValidatedDateRecognizer(**kwargs)
289291

290292

293+
class ContextAwareUsBankRecognizer(UsBankRecognizer):
    """``UsBankRecognizer`` variant with a keyword-based context boost.

    Overrides ``enhance_using_context`` so that a result is raised to
    ``MAX_SCORE`` whenever one of the recognizer's own context words appears
    in the caller-supplied context list, regardless of how low the pattern's
    original score was.
    """

    def enhance_using_context(
        self,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        """Enhance confidence score using context of the entity.

        Boosts the very low scores of the patterns

        :param text: The actual text that was analyzed (not used here)
        :param raw_recognizer_results: This recognizer's results, to be updated
                                       in place based on recognizer specific context.
        :param other_raw_recognizer_results: Other recognizer results matched in
                                             the given text (not used here)
        :param nlp_artifacts: The nlp artifacts (not used here)
        :param context: list of context words supplied by the caller; when
                        ``None`` the results are returned untouched
        :return: the same ``raw_recognizer_results`` list, with matching
                 results boosted to ``MAX_SCORE`` and flagged as enhanced
        """
        if context is None:
            return raw_recognizer_results

        context_lower = " ".join(context).lower()

        # The keyword check does not depend on the individual result, so it is
        # evaluated once up front instead of once per result.
        if not any(
            ctx_word.lower() in context_lower for ctx_word in (self.context or [])
        ):
            return raw_recognizer_results

        for result in raw_recognizer_results:
            # if previously enhanced, then ignore
            if result.recognition_metadata.get(  # pyright: ignore[reportUnknownMemberType]
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ):
                continue

            original_score = result.score
            result.score = self.MAX_SCORE

            result.recognition_metadata[  # pyright: ignore[reportUnknownMemberType]
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ] = True

            # The old and new scores are separated explicitly; previously they
            # were printed back to back (e.g. "0.051.00") and were unreadable.
            logger.debug(
                "Enhanced %s score: %.2f -> %.2f (context: %s)",
                result.entity_type,
                original_score,
                result.score,
                self.context,
            )

        return raw_recognizer_results
342+
343+
@recognizer_factories.add(  # pyright: ignore[reportUnknownMemberType, reportUntypedFunctionDecorator]
    UsBankRecognizer
)
def eager_us_bank_recognizer(**kwargs: Any) -> ContextAwareUsBankRecognizer:
    """Build a ``ContextAwareUsBankRecognizer`` from the given keyword arguments.

    Registered under ``UsBankRecognizer`` in ``recognizer_factories``
    (presumably so the registry builds this subclass instead of the stock
    recognizer; confirm against the registry's ``add`` semantics).

    :param kwargs: forwarded unchanged to the recognizer constructor
    :return: the newly constructed recognizer
    """
    recognizer = ContextAwareUsBankRecognizer(**kwargs)
    return recognizer
349+
291350
def _get_all_pattern_recognizers() -> Iterable[EntityRecognizer]:
292351
for cls in _get_all_entity_recognizer_classes():
293352
if issubclass(cls, PatternRecognizer):
@@ -330,6 +389,107 @@ def analyze(
330389
return decorate_entity_recognizer
331390

332391

392+
def enhance_using_context(recognizer: EntityRecognizer) -> EntityRecognizer:
    """Wrap a recognizer's ``enhance_using_context`` with a keyword-based boost.

    The recognizer's existing ``enhance_using_context`` is still called first.
    Afterwards, if any of the recognizer's context words occurs (case
    insensitively, as a substring) in the caller-supplied context, every
    result that was not already enhanced and scores at least
    ``MIN_SCORE_FOR_ENHANCEMENT`` is raised to the recognizer's ``MAX_SCORE``
    and flagged with ``IS_SCORE_ENHANCED_BY_CONTEXT_KEY``.

    :param recognizer: the recognizer to patch; it is modified in place
    :return: the same recognizer instance, with the method replaced
    """
    MIN_SCORE_FOR_ENHANCEMENT = 0.3
    old_enhancing_function = recognizer.enhance_using_context

    @wraps(old_enhancing_function)
    def wrapped(
        rec: EntityRecognizer,
        text: str,
        raw_recognizer_results: List[RecognizerResult],
        other_raw_recognizer_results: List[RecognizerResult],
        nlp_artifacts: NlpArtifacts,
        context: Optional[List[str]] = None,
    ) -> List[RecognizerResult]:
        results = old_enhancing_function(
            text,
            raw_recognizer_results,
            other_raw_recognizer_results,
            nlp_artifacts,
            context,
        )

        if not rec.context or not context:
            # If no context is given or the recognizer does not support it,
            # then ignore this
            return results

        context_lower = " ".join(context).lower()

        # The keyword check is the same for every result, so evaluate it once
        # and bail out early when nothing can be boosted.
        if not any(ctx_word.lower() in context_lower for ctx_word in rec.context):
            return results

        for result in results:
            # if previously enhanced, then ignore
            if result.recognition_metadata.get(  # pyright: ignore[reportUnknownMemberType]
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ):
                continue

            # Skip boosting scores that are too low
            if result.score < MIN_SCORE_FOR_ENHANCEMENT:
                continue

            original_score = result.score
            result.score = rec.MAX_SCORE

            result.recognition_metadata[  # pyright: ignore[reportUnknownMemberType]
                RecognizerResult.IS_SCORE_ENHANCED_BY_CONTEXT_KEY
            ] = True

            # The old and new scores are separated explicitly; previously they
            # were printed back to back (e.g. "0.401.00") and were unreadable.
            logger.debug(
                "Enhanced %s score: %.2f -> %.2f (context: %s)",
                result.entity_type,
                original_score,
                result.score,
                rec.context,
            )

        return results

    # Bind to the instance so ``rec`` receives this recognizer when called.
    recognizer.enhance_using_context = types.MethodType(wrapped, recognizer)

    return recognizer
448+
449+
450+
def filter_enhanced_results_below_threshold(
    threshold: float,
) -> Callable[[EntityRecognizer], EntityRecognizer]:
    """Create a decorator that drops context-enhanced results under ``threshold``.

    The returned decorator patches the recognizer's ``enhance_using_context``
    in place: the previous implementation runs first, then only results whose
    score is greater than or equal to ``threshold`` are returned.

    :param threshold: minimum score a result needs to be kept
    :return: a decorator taking and returning the same recognizer instance
    """

    def decorate_entity_recognizer(recognizer: EntityRecognizer) -> EntityRecognizer:
        # Capture the current method before it is replaced below.
        inner_enhance = recognizer.enhance_using_context

        @wraps(inner_enhance)
        def wrapped(
            rec: EntityRecognizer,  # pyright: ignore[reportUnusedParameter]
            text: str,
            raw_recognizer_results: List[RecognizerResult],
            other_raw_recognizer_results: List[RecognizerResult],
            nlp_artifacts: NlpArtifacts,
            context: Optional[List[str]] = None,
        ) -> List[RecognizerResult]:
            enhanced = inner_enhance(
                text,
                raw_recognizer_results,
                other_raw_recognizer_results,
                nlp_artifacts,
                context,
            )
            kept = [candidate for candidate in enhanced if candidate.score >= threshold]
            return kept

        bound = types.MethodType(wrapped, recognizer)
        recognizer.enhance_using_context = bound
        return recognizer

    return decorate_entity_recognizer
480+
481+
def decorate_recognizer(
    *decorators: Callable[[EntityRecognizer], EntityRecognizer]
) -> Callable[[EntityRecognizer], EntityRecognizer]:
    """Compose several recognizer decorators into a single one.

    The decorators are applied in the order given: the first one receives the
    original recognizer and each later one receives the previous result.
    With no decorators the recognizer is returned unchanged.

    :param decorators: callables that take a recognizer and return a recognizer
    :return: a decorator applying all of ``decorators`` in sequence
    """

    def decorator(recognizer: EntityRecognizer) -> EntityRecognizer:
        for apply_step in decorators:
            recognizer = apply_step(recognizer)
        return recognizer

    return decorator
491+
492+
333493
def explain_recognition_results(results: List[RecognizerResult]) -> str:
334494
"""Builds a verbose explanation of the recognition results taking into account multiple values"""
335495

ingestion/tests/integration/auto_classification/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from metadata.generated.schema.api.classification.createTag import CreateTagRequest
2020
from metadata.generated.schema.entity.classification.classification import (
2121
Classification,
22+
ConflictResolution,
2223
)
2324
from metadata.generated.schema.entity.classification.tag import Tag
2425
from metadata.generated.schema.type.piiEntity import PIIEntity
@@ -413,7 +414,10 @@ def person_column_name_recognizer() -> Recognizer:
413414
def pii_classification(
414415
metadata: OpenMetadata[Classification, CreateClassificationRequest]
415416
) -> Classification:
416-
create_classification_request = CreateClassificationRequestFactory.create(fqn="PII")
417+
create_classification_request = CreateClassificationRequestFactory.create(
418+
fqn="PII",
419+
autoClassificationConfig__conflictResolution=ConflictResolution.highest_priority.value,
420+
)
417421
entity = metadata.create_or_update(create_classification_request)
418422

419423
return entity
@@ -461,6 +465,7 @@ def sensitive_pii_tag(
461465
create_tag_request: CreateTagRequest = CreateTagRequestFactory.create(
462466
tag_name="Sensitive",
463467
tag_classification=pii_classification.fullyQualifiedName.root,
468+
autoClassificationPriority=100,
464469
recognizers=[
465470
credit_card_recognizer,
466471
aba_routing_recognizer,
@@ -512,6 +517,7 @@ def non_sensitive_pii_tag(
512517
create_tag_request: CreateTagRequest = CreateTagRequestFactory.create(
513518
tag_name="NonSensitive",
514519
tag_classification=pii_classification.fullyQualifiedName.root,
520+
autoClassificationPriority=80,
515521
recognizers=[
516522
date_recognizer,
517523
phone_recognizer,

0 commit comments

Comments
 (0)