Skip to content

Commit 57bb9e8

Browse files
committed
Add Solr cache failover/backoff behavior to avoid repeated timeouts
1 parent 1649393 commit 57bb9e8

3 files changed

Lines changed: 128 additions & 2 deletions

File tree

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
here = path.abspath(path.dirname(__file__))
55

6-
__version__ = "1.6.11"
6+
__version__ = "1.6.12"
77

88
# Get the long description from the README file
99
with open(path.join(here, 'README.md')) as f:
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Tests for the Solr result-cache failover/backoff behaviour."""

import unittest
from unittest.mock import MagicMock, patch

import requests

from vfbquery.solr_result_cache import SolrResultCache


class SolrResultCacheFailoverTest(unittest.TestCase):
    """Exercise the disable-on-failure / re-enable-on-recovery cycle."""

    def test_disable_and_reenable_on_solr_failure(self):
        result_cache = SolrResultCache()
        result_cache._solr_backoff_seconds = 10

        # Simulate a Solr write timing out: the cache must report failure
        # and enter its backoff state rather than raising.
        with patch("vfbquery.solr_result_cache.time.time", return_value=1000):
            with patch("vfbquery.solr_result_cache.requests.post") as mock_post:
                mock_post.side_effect = requests.exceptions.ReadTimeout()
                first_attempt = result_cache.cache_result(
                    "term_info", "FBbt_00000000", {"foo": "bar"}
                )
                self.assertFalse(first_attempt)
                self.assertTrue(result_cache._solr_disabled)
                self.assertGreater(result_cache._solr_disabled_until, 1000)
                self.assertEqual(mock_post.call_count, 1)

                # While the backoff window is open, further writes must be
                # skipped entirely — no additional Solr traffic.
                second_attempt = result_cache.cache_result(
                    "term_info", "FBbt_00000000", {"foo": "bar"}
                )
                self.assertFalse(second_attempt)
                self.assertEqual(mock_post.call_count, 1)

        # Once the backoff window has elapsed and Solr answers the health
        # probe, the cache should switch itself back on.
        with patch(
            "vfbquery.solr_result_cache.time.time",
            return_value=result_cache._solr_disabled_until + 1,
        ):
            with patch("vfbquery.solr_result_cache.requests.get") as mock_get:
                mock_get.return_value = MagicMock(
                    status_code=200, json=lambda: {"response": {"docs": []}}
                )

                # Reading from the cache triggers the health probe, which
                # should re-enable caching on success.
                fetched = result_cache.get_cached_result(
                    "term_info", "FBbt_00000000"
                )
                self.assertIsNone(fetched)
                self.assertFalse(result_cache._solr_disabled)
                # At minimum the health-probe GET must have happened.
                self.assertGreaterEqual(mock_get.call_count, 1)


if __name__ == "__main__":
    unittest.main(verbosity=2)

src/vfbquery/solr_result_cache.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"""
1212

1313
import json
14+
import os
1415
import requests
1516
import hashlib
1617
import time
@@ -42,6 +43,10 @@ class SolrResultCache:
4243
4344
Stores computed query results in a dedicated SOLR collection to enable
4445
instant retrieval without expensive computation on cold starts.
46+
47+
This cache layer is "best-effort"; if the Solr cache becomes unavailable,
48+
it will temporarily disable itself and continue serving live results. It
49+
will periodically probe Solr and re-enable itself when the service recovers.
4550
"""
4651

4752
def __init__(self,
@@ -60,6 +65,14 @@ def __init__(self,
6065
self.ttl_hours = ttl_hours
6166
self.max_result_size_mb = max_result_size_mb
6267
self.max_result_size_bytes = max_result_size_mb * 1024 * 1024
68+
69+
# When Solr is unreachable, disable caching for a period (backoff).
70+
# This prevents the app from logging repeated timeout errors and
71+
# allows the query path to continue working.
72+
self._solr_disabled = False
73+
self._solr_disabled_until = 0.0 # epoch timestamp
74+
self._solr_backoff_seconds = int(os.getenv('VFBQUERY_SOLR_BACKOFF_SECONDS', '60'))
75+
self._solr_last_error = None
6376

6477
def _create_cache_metadata(self, result: Any, **params) -> Optional[Dict[str, Any]]:
6578
"""Create metadata for cached result with 3-month expiration"""
@@ -85,6 +98,47 @@ def _create_cache_metadata(self, result: Any, **params) -> Optional[Dict[str, An
8598
"ttl_hours": self.ttl_hours # Store TTL for debugging
8699
}
87100

101+
def _solr_available(self) -> bool:
    """Return True if the Solr result cache looks operational.

    While Solr is marked unavailable, cache operations are skipped until
    the backoff window elapses. When it does, a lightweight zero-row
    ``/select`` probe is issued: on HTTP 200 caching is re-enabled,
    otherwise the backoff window is extended.

    Returns:
        True if cache operations may proceed, False if they should be
        skipped for now.
    """
    now = time.time()
    if not self._solr_disabled:
        return True
    if now < self._solr_disabled_until:
        # Still inside the backoff window: avoid touching Solr at all.
        return False

    # Backoff period elapsed: try a small health check.
    err = None
    try:
        resp = requests.get(
            f"{self.cache_url}/select",
            params={"q": "*:*", "rows": 0, "wt": "json"},
            timeout=2,
        )
        if resp.status_code == 200:
            if self._solr_last_error is not None:
                logger.info("Solr cache re-enabled")
            self._solr_disabled = False
            self._solr_last_error = None
            return True
        # Solr responded but is unhealthy; record the status so the
        # failure is visible in logs (previously this case was silent).
        err = f"health check returned HTTP {resp.status_code}"
    except Exception as e:
        err = str(e)

    # Log each distinct failure once per change to avoid log spam while
    # Solr stays down.
    if err != self._solr_last_error:
        logger.warning(
            "Solr cache unavailable, retrying in %ds: %s",
            self._solr_backoff_seconds,
            err,
        )
        self._solr_last_error = err

    # Probe failed (exception or non-200): stay disabled for another window.
    self._solr_disabled = True
    self._solr_disabled_until = now + self._solr_backoff_seconds
    return False
141+
88142
def get_cached_result(self, query_type: str, term_id: str, **params) -> Optional[Any]:
89143
"""
90144
Retrieve cached result from separate cache document
@@ -97,6 +151,9 @@ def get_cached_result(self, query_type: str, term_id: str, **params) -> Optional
97151
Returns:
98152
Cached result or None if not found/expired
99153
"""
154+
if not self._solr_available():
155+
return None
156+
100157
try:
101158
# Query for cache document with prefixed ID including query type
102159
# This ensures different query types for the same term have separate cache entries
@@ -226,6 +283,10 @@ def cache_result(self, query_type: str, term_id: str, result: Any, **params) ->
226283
if not result:
227284
logger.debug("Empty result, not caching")
228285
return False
286+
287+
if not self._solr_available():
288+
# Solr is temporarily unavailable; skip caching and continue serving.
289+
return False
229290

230291
try:
231292
# Create cached metadata and result
@@ -263,12 +324,24 @@ def cache_result(self, query_type: str, term_id: str, result: Any, **params) ->
263324
return False
264325

265326
except Exception as e:
266-
logger.error(f"Error caching result: {e}")
327+
# Mark Solr as temporarily unavailable to avoid repeated errors
328+
self._solr_disabled = True
329+
self._solr_disabled_until = time.time() + self._solr_backoff_seconds
330+
err = str(e)
331+
if err != self._solr_last_error:
332+
logger.warning(
333+
"Solr cache write failed; disabling cache for %ds: %s",
334+
self._solr_backoff_seconds,
335+
err,
336+
)
337+
self._solr_last_error = err
267338
return False
268339

269340

270341
def _clear_expired_cache_document(self, cache_doc_id: str):
271342
"""Delete expired cache document from SOLR"""
343+
if not self._solr_available():
344+
return
272345
try:
273346
requests.post(
274347
f"{self.cache_url}/update",
@@ -291,6 +364,8 @@ def clear_cache_entry(self, query_type: str, term_id: str) -> bool:
291364
Returns:
292365
True if successfully cleared, False otherwise
293366
"""
367+
if not self._solr_available():
368+
return False
294369
try:
295370
# Include query_type in cache document ID to match storage format
296371
cache_doc_id = f"vfb_query_{query_type}_{term_id}"
@@ -313,6 +388,8 @@ def clear_cache_entry(self, query_type: str, term_id: str) -> bool:
313388

314389
def _increment_cache_hit_count(self, cache_doc_id: str, current_count: int):
315390
"""Increment hit count for cache document (background operation)"""
391+
if not self._solr_available():
392+
return
316393
try:
317394
# Update hit count in cache document
318395
new_count = current_count + 1
@@ -339,6 +416,9 @@ def get_cache_age(self, query_type: str, term_id: str, **params) -> Optional[Dic
339416
Returns:
340417
Dictionary with cache age info or None if not cached
341418
"""
419+
if not self._solr_available():
420+
return None
421+
342422
try:
343423
# Include query_type in cache document ID to match storage format
344424
cache_doc_id = f"vfb_query_{query_type}_{term_id}"
@@ -396,6 +476,9 @@ def cleanup_expired_entries(self) -> int:
396476
Returns:
397477
Number of expired cache documents cleaned up
398478
"""
479+
if not self._solr_available():
480+
return 0
481+
399482
try:
400483
now = datetime.now().astimezone()
401484
cleaned_count = 0

0 commit comments

Comments
 (0)