Skip to content
161 changes: 161 additions & 0 deletions core/common/lexical_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""
Lexical Variant Dictionary lookup.

Loads a dictionary Source (one Concept per equivalence class, with each variant
as a Name on that Concept) and provides token-level variant lookup for query
expansion in concept search and matching.

The dictionary lives as a normal OCL Source (e.g. ocl/lexical-variants-en),
giving it versioning, release management, locale handling, and editability
through OCL's existing infrastructure.
"""
from dataclasses import dataclass

from django.conf import settings
from django.core.cache import cache


@dataclass(frozen=True)
class LexicalVariant:
    """One variant Name for a looked-up token, read from the dictionary Source.

    Frozen (hashable/immutable) so instances can be safely deduplicated and
    shared; round-trips through plain dicts for cache storage.
    """
    term: str                # the variant text itself, e.g. 'leukemia'
    name_type: str           # OCL name type of the variant ('' when the Name has none)
    locale: str              # locale code of the variant ('' when the Name has none)
    source_concept_uri: str  # URI of the dictionary Concept that groups these variants


class LexicalVariantDictionary:
    """
    Token-level lookup of lexical variants backed by an OCL dictionary Source.

    The dictionary Source groups equivalent spellings as Names on a single
    Concept; looking a token up returns the sibling Names of every matching
    Name. Loaded dictionaries are cached per (source URI, source version).
    """
    CACHE_KEY_PREFIX = 'lexical_variants'
    # Evaluated at import time: settings.LEXICAL_VARIANTS_CACHE_TIMEOUT must be defined.
    CACHE_TIMEOUT = settings.LEXICAL_VARIANTS_CACHE_TIMEOUT

    @classmethod
    def get_lexical_variants(cls, text, source_uri=None):
        """
        Return lexical variants for `text` looked up in the dictionary at
        `source_uri` (defaults to settings.DEFAULT_LEXICAL_VARIANTS_REPO).

        Tokenizes input, looks each token up in the dictionary's Names, and returns
        the sibling Names on each matching Concept. Returns [] if the dictionary
        Source can't be resolved or the token has no entry — never raises.
        """
        if not text:
            return []
        try:
            # Resolution sits INSIDE the try: a malformed repo URI or a DB error
            # raised by resolve_reference_expression must degrade to "no variants",
            # not break the caller's request (the "never raises" contract above).
            source = cls._resolve_source(source_uri or settings.DEFAULT_LEXICAL_VARIANTS_REPO)
            if source is None:
                return []
            index = cls._get_index(source)
        except Exception:  # pylint: disable=broad-except
            return []

        seen = set()  # (term, locale) pairs already emitted
        out = []
        for token in cls._tokenize(text):
            for variant in index.get(token, []):
                dedup_key = (variant.term, variant.locale)
                if dedup_key in seen:
                    continue
                seen.add(dedup_key)
                out.append(variant)
        return out

    @classmethod
    def get_variant_terms(cls, text, source_uri=None):
        """Convenience wrapper returning just the variant strings, deduplicated."""
        seen = set()
        out = []
        for variant in cls.get_lexical_variants(text, source_uri=source_uri):
            if variant.term not in seen:
                seen.add(variant.term)
                out.append(variant.term)
        return out

    @classmethod
    def _cache_key(cls, source):
        """Cache key qualified by source URI and version.

        Version-qualification means releasing a new Source version naturally
        misses the stale entry; 'HEAD' stands in for unversioned repos.
        """
        version = getattr(source, 'version', 'HEAD') or 'HEAD'
        return f'{cls.CACHE_KEY_PREFIX}|{source.uri}|{version}'

    @classmethod
    def invalidate_cache(cls, source_uri=None):
        """Clear cached dictionary contents. Call after a Source version changes."""
        pattern = f'{cls.CACHE_KEY_PREFIX}|'
        pattern += '*' if source_uri is None else f'{source_uri}|*'
        # delete_pattern is a django-redis extension, not part of Django's
        # BaseCache API; non-redis backends need a shim (tests provide one).
        cache.delete_pattern(pattern)

    @classmethod
    def _get_index(cls, source):
        """Return {token: [LexicalVariant]} for `source`, loading + caching on miss."""
        key = cls._cache_key(source)
        raw = cache.get(key)
        if raw is None:
            index = cls._load_dictionary(source)
            # Store plain dicts, not dataclass instances: every cache backend can
            # serialize builtins, and it decouples cached data from class changes.
            cache.set(key, cls._serialize_index(index), timeout=cls.CACHE_TIMEOUT)
            return index
        return cls._deserialize_index(raw)

    @staticmethod
    def _resolve_source(source_uri):
        """Resolve `source_uri` to a persisted Source, or None if unresolvable."""
        from core.sources.models import Source  # local import: avoids circular dependency
        if not source_uri:
            return None
        repo, _ = Source.resolve_reference_expression(source_uri)
        # resolve_reference_expression may return an unsaved placeholder; require a PK.
        return repo if repo and repo.id else None

    @staticmethod
    def _load_dictionary(source):
        """
        Build {normalized name: [LexicalVariant of sibling names]} from the
        active, non-retired HEAD concepts of `source`.
        """
        from django.db.models import F
        from core.concepts.models import ConceptName

        names = ConceptName.objects.filter(
            concept__parent_id=source.id,
            # id == versioned_object_id selects only HEAD concept rows, so each
            # equivalence class is seen exactly once regardless of version history.
            concept__id=F('concept__versioned_object_id'),
            concept__retired=False,
            concept__is_active=True,
        ).select_related('concept')

        # Group all Names by their owning concept: one concept == one equivalence class.
        by_concept = {}
        for cn in names:
            by_concept.setdefault(cn.concept_id, []).append(cn)

        index = {}
        for group in by_concept.values():
            for source_name in group:
                # A name maps to its siblings only — a name is never its own variant.
                siblings = [n for n in group if n.id != source_name.id]
                if not siblings:
                    continue
                # Normalization must mirror _tokenize's lowercasing so lookups hit;
                # multi-word names can only match if the query reproduces them verbatim.
                key = source_name.name.strip().lower()
                if not key:
                    continue
                variants = [
                    LexicalVariant(
                        term=sib.name,
                        name_type=sib.type or '',
                        locale=sib.locale or '',
                        source_concept_uri=sib.concept.uri,
                    )
                    for sib in siblings
                ]
                # Distinct concepts may share a key; accumulate rather than overwrite.
                index.setdefault(key, []).extend(variants)
        return index

    @staticmethod
    def _serialize_index(index):
        """Convert LexicalVariant objects into cache-safe plain dicts."""
        return {
            token: [
                {'term': v.term, 'name_type': v.name_type, 'locale': v.locale,
                 'source_concept_uri': v.source_concept_uri}
                for v in variants
            ]
            for token, variants in index.items()
        }

    @staticmethod
    def _deserialize_index(raw):
        """Inverse of _serialize_index: rebuild LexicalVariant objects from dicts."""
        return {
            token: [LexicalVariant(**d) for d in variants]
            for token, variants in raw.items()
        }

    @staticmethod
    def _tokenize(text):
        """Lowercase `text`, map punctuation to spaces, split on whitespace.

        None or '' yields []. E.g. 'Anti-HCV IgG' -> ['anti', 'hcv', 'igg'].
        """
        if not text:
            return []
        cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text.lower())
        return [tok for tok in cleaned.split() if tok]
106 changes: 106 additions & 0 deletions core/common/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,3 +1515,109 @@ def test_core_user_gets_core_throttle_not_standard(self):
self.assertIsInstance(throttles[1], CoreDayThrottle)
self.assertIsInstance(match_throttles[0], MatchCoreMinuteThrottle)
self.assertIsInstance(match_throttles[1], MatchCoreDayThrottle)


@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
class LexicalVariantsTest(OCLTestCase):
    """Unit tests for LexicalVariantDictionary with source resolution and
    dictionary loading mocked out; uses an isolated in-memory cache."""

    def setUp(self):
        super().setUp()
        from django.core.cache import cache
        # locmem has no delete_pattern; add a shim so invalidate_cache() works in tests
        if not hasattr(cache, 'delete_pattern'):
            cache.delete_pattern = lambda pattern: cache.clear()
        cache.clear()

    @staticmethod
    def _mock_source(version):
        return MagicMock(uri='/orgs/OCL/sources/lexical-variants-en/', version=version)

    @staticmethod
    def _variant(term, locale, concept_id):
        from core.common.lexical_variants import LexicalVariant
        return LexicalVariant(
            term=term, name_type='Fully Specified', locale=locale,
            source_concept_uri=f'/orgs/OCL/sources/lexical-variants-en/concepts/{concept_id}/',
        )

    def test_tokenize_lowercases_and_splits(self):
        from core.common.lexical_variants import LexicalVariantDictionary
        cases = [
            ("Leukaemia", ["leukaemia"]),
            ("Anti-HCV IgG", ["anti", "hcv", "igg"]),
            ("  spaced   out  ", ["spaced", "out"]),
            ("", []),
            (None, []),
        ]
        for raw, expected in cases:
            self.assertEqual(LexicalVariantDictionary._tokenize(raw), expected)

    @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source')
    @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary')
    def test_returns_variants_for_known_token(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._mock_source('HEAD')
        mock_load.return_value = {
            'leukaemia': [self._variant('leukemia', 'en-US', 'leukemia')],
            'leukemia': [self._variant('leukaemia', 'en-GB', 'leukemia')],
        }

        variants = LexicalVariantDictionary.get_lexical_variants('leukaemia')
        self.assertEqual(len(variants), 1)
        self.assertEqual(variants[0].term, 'leukemia')
        self.assertEqual(variants[0].locale, 'en-US')

        self.assertEqual(LexicalVariantDictionary.get_variant_terms('leukemia'), ['leukaemia'])

    @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source')
    @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary')
    def test_returns_empty_for_unknown_token(self, mock_load, mock_resolve):
        """Regression: words containing 'hem'/'haem' as a substring must NOT match."""
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._mock_source('HEAD')
        mock_load.return_value = {
            'hemorrhage': [self._variant('haemorrhage', 'en-GB', 'hemorrhage')],
        }

        for false_positive in ('themselves', 'anthem', 'hemisphere', 'hemp', 'hemlock', 'remember'):
            with self.subTest(token=false_positive):
                self.assertEqual(LexicalVariantDictionary.get_lexical_variants(false_positive), [])

    @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source')
    def test_returns_empty_when_source_missing(self, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = None
        self.assertEqual(LexicalVariantDictionary.get_lexical_variants('leukaemia'), [])

    def test_returns_empty_for_empty_input(self):
        from core.common.lexical_variants import LexicalVariantDictionary
        for empty in ('', None):
            self.assertEqual(LexicalVariantDictionary.get_lexical_variants(empty), [])

    @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source')
    @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary')
    def test_caches_dictionary_per_source_version(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._mock_source('v1.0')
        mock_load.return_value = {}

        # Three lookups, one load: the dictionary is served from cache afterwards.
        for query in ('leukaemia', 'color', 'anything'):
            LexicalVariantDictionary.get_lexical_variants(query)
        self.assertEqual(mock_load.call_count, 1)

        # Invalidation forces the next lookup to reload.
        LexicalVariantDictionary.invalidate_cache()
        LexicalVariantDictionary.get_lexical_variants('leukaemia')
        self.assertEqual(mock_load.call_count, 2)

    @patch('core.common.lexical_variants.LexicalVariantDictionary._resolve_source')
    @patch('core.common.lexical_variants.LexicalVariantDictionary._load_dictionary')
    def test_multi_token_input_expands_each_known_token(self, mock_load, mock_resolve):
        from core.common.lexical_variants import LexicalVariantDictionary
        mock_resolve.return_value = self._mock_source('HEAD')
        mock_load.return_value = {
            'leukaemia': [self._variant('leukemia', 'en-US', 'leukemia')],
            'colour': [self._variant('color', 'en-US', 'color')],
        }

        terms = LexicalVariantDictionary.get_variant_terms('childhood leukaemia colour')
        self.assertEqual(set(terms), {'leukemia', 'color'})
Loading