Skip to content

Commit 7d7f51e

Browse files
authored
fix(490): Add question mark to escaped chars in TokenEscaper (#519)
This PR adds the question mark `?` to the characters escaped by `TokenEscaper`. The ["Expand TokenEscaper to escape ? and | characters" issue](#490) also asks for the pipe character `|` to be escaped, but doing so would break `TestMultiPrefixTextQuery.test_text_query_returns_both_prefixes`, which relies on an unescaped pipe as a logical OR in its text search, so `|` is intentionally left out of this change.
1 parent 817a255 commit 7d7f51e

2 files changed

Lines changed: 7 additions & 6 deletions

File tree

redisvl/utils/token_escaper.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ class TokenEscaper:
99
"""
1010

1111
# Characters that RediSearch requires us to escape during queries.
12-
# Source: https://redis.io/docs/stack/search/reference/escaping/#the-rules-of-text-field-tokenization
13-
DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]"
12+
# Source: https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/escaping/#tokenization-rules-for-text-fields
13+
DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ \?]"
1414

15-
# Same as above but excludes * to allow wildcard patterns
15+
# Same as above but excludes * and ? to allow wildcard patterns
1616
ESCAPED_CHARS_NO_WILDCARD = r"[,.<>{}\[\]\\\"\':;!@#$%^&()\-+=~\/ ]"
1717

1818
def __init__(self, escape_chars_re: Optional[Pattern] = None):

tests/unit/test_token_escaper.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ def escaper():
1919
),
2020
(
2121
r"& symbols, like * and ?",
22-
r"\&\ symbols\,\ like\ \*\ and\ ?",
23-
), # TODO: question marks are not caught?
22+
r"\&\ symbols\,\ like\ \*\ and\ \?",
23+
),
2424
# underscores are ignored
2525
(r"-dashes_and_underscores-", r"\-dashes_and_underscores\-"),
2626
],
@@ -57,7 +57,7 @@ def test_escape_text_chars(escaper, test_input, expected):
5757
("(parentheses)", r"\(parentheses\)"),
5858
("[brackets]", r"\[brackets\]"),
5959
("{braces}", r"\{braces\}"),
60-
# ("question?mark", r"question\?mark"), #TODO - question marks are not caught?
60+
("question?mark", r"question\?mark"),
6161
# Unicode characters in tags
6262
("你好", r"你好"), # Assuming non-Latin characters don't need escaping
6363
("emoji:😊", r"emoji\:😊"),
@@ -81,6 +81,7 @@ def test_escape_text_chars(escaper, test_input, expected):
8181
"parentheses",
8282
"brackets",
8383
"braces",
84+
"question",
8485
"non-latin",
8586
"emoji",
8687
],

0 commit comments

Comments
 (0)