Skip to content

Commit 8aa7465

Browse files
committed
Merge branch '47-relevance-score-and-c-api-renames' into 'dev'
Resolve "Bubble up relevance_score / rename to OBXVectorDistanceType" Closes #47 See merge request objectbox/objectbox-python!32
2 parents dc6fe8a + ca083c2 commit 8aa7465

8 files changed

Lines changed: 109 additions & 48 deletions

File tree

example/vectorsearch-cities/model.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@ class City:
1111
location = Property(np.ndarray, type=PropertyType.floatVector, id=3, uid=1003, index=HnswIndex(
1212
id=3, uid=10001,
1313
dimensions=2,
14-
distance_type=HnswDistanceType.EUCLIDEAN
14+
distance_type=VectorDistanceType.EUCLIDEAN
1515
))
1616

17+
1718
def get_objectbox_model():
1819
m = Model()
1920
m.entity(City, last_property_id=IdUid(3, 1003))
2021
m.last_entity_id = IdUid(1, 1)
21-
m.last_index_id = IdUid(3,10001)
22+
m.last_index_id = IdUid(3, 10001)
2223
return m

objectbox/c.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def shlib_name(library: str) -> str:
7979
OBXPutPaddingMode = ctypes.c_int
8080
OBXOrderFlags = ctypes.c_int
8181
OBXHnswFlags = ctypes.c_int
82-
OBXHnswDistanceType = ctypes.c_int
82+
OBXVectorDistanceType = ctypes.c_int
8383
OBXValidateOnOpenPagesFlags = ctypes.c_int
8484
OBXValidateOnOpenKvFlags = ctypes.c_int
8585
OBXBackupRestoreFlags = ctypes.c_int
@@ -360,6 +360,12 @@ def c_array_pointer(py_list: Union[List[Any], np.ndarray], c_type):
360360
return ctypes.cast(c_array(py_list, c_type), ctypes.POINTER(c_type))
361361

362362

363+
# OBX_C_API float obx_vector_distance_float32(OBXVectorDistanceType type, const float* vector1, const float* vector2, size_t dimension);
364+
obx_vector_distance_float32 = c_fn("obx_vector_distance_float32", ctypes.c_float, [OBXVectorDistanceType, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float), ctypes.c_size_t])
365+
366+
# OBX_C_API float obx_vector_distance_to_relevance(OBXVectorDistanceType type, float distance);
367+
obx_vector_distance_to_relevance = c_fn("obx_vector_distance_to_relevance", ctypes.c_float, [OBXVectorDistanceType, ctypes.c_float])
368+
363369
# OBX_model* (void);
364370
obx_model = c_fn('obx_model', OBX_model_p, [])
365371

@@ -393,9 +399,8 @@ def c_array_pointer(py_list: Union[List[Any], np.ndarray], c_type):
393399
obx_model_property_index_hnsw_flags = \
394400
c_fn_rc('obx_model_property_index_hnsw_flags', [OBX_model_p, OBXHnswFlags])
395401

396-
# obx_err obx_model_property_index_hnsw_distance_type(OBX_model* model, OBXHnswDistanceType value)
397-
obx_model_property_index_hnsw_distance_type = \
398-
c_fn_rc('obx_model_property_index_hnsw_distance_type', [OBX_model_p, OBXHnswDistanceType])
402+
# obx_err obx_model_property_index_hnsw_distance_type(OBX_model* model, OBXVectorDistanceType value)
403+
obx_model_property_index_hnsw_distance_type = c_fn_rc('obx_model_property_index_hnsw_distance_type', [OBX_model_p, OBXVectorDistanceType])
399404

400405
# obx_err obx_model_property_index_hnsw_reparation_backlink_probability(OBX_model* model, float value)
401406
obx_model_property_index_hnsw_reparation_backlink_probability = \
@@ -980,11 +985,11 @@ def c_array_pointer(py_list: Union[List[Any], np.ndarray], c_type):
980985
OBXHnswFlags_VECTOR_CACHE_SIMD_PADDING_OFF = 4
981986
OBXHnswFlags_REPARATION_LIMIT_CANDIDATES = 8
982987

983-
OBXHnswDistanceType_UNKNOWN = 0
984-
OBXHnswDistanceType_EUCLIDEAN = 1
985-
OBXHnswDistanceType_COSINE = 2
986-
OBXHnswDistanceType_DOT_PRODUCT = 3
987-
OBXHnswDistanceType_DOT_PRODUCT_NON_NORMALIZED = 10
988+
OBXVectorDistanceType_UNKNOWN = 0
989+
OBXVectorDistanceType_EUCLIDEAN = 1
990+
OBXVectorDistanceType_COSINE = 2
991+
OBXVectorDistanceType_DOT_PRODUCT = 3
992+
OBXVectorDistanceType_DOT_PRODUCT_NON_NORMALIZED = 10
988993

989994
OBXPutPaddingMode_PaddingAutomatic = 1
990995
OBXPutPaddingMode_PaddingAllowedByBuffer = 2

objectbox/model/properties.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,26 +94,26 @@ class HnswFlags(IntEnum):
9494
REPARATION_LIMIT_CANDIDATES = 8
9595

9696

97-
class HnswDistanceType(IntEnum):
98-
UNKNOWN = OBXHnswDistanceType_UNKNOWN
99-
EUCLIDEAN = OBXHnswDistanceType_EUCLIDEAN
100-
COSINE = OBXHnswDistanceType_COSINE
101-
DOT_PRODUCT = OBXHnswDistanceType_DOT_PRODUCT
102-
DOT_PRODUCT_NON_NORMALIZED = OBXHnswDistanceType_DOT_PRODUCT_NON_NORMALIZED
103-
104-
HnswDistanceType.UNKNOWN.__doc__ = "Not a real type, just best practice (e.g. forward compatibility)"
105-
HnswDistanceType.EUCLIDEAN.__doc__ = "The default; typically 'euclidean squared' internally."
106-
HnswDistanceType.COSINE.__doc__ = """
97+
class VectorDistanceType(IntEnum):
98+
UNKNOWN = OBXVectorDistanceType_UNKNOWN
99+
EUCLIDEAN = OBXVectorDistanceType_EUCLIDEAN
100+
COSINE = OBXVectorDistanceType_COSINE
101+
DOT_PRODUCT = OBXVectorDistanceType_DOT_PRODUCT
102+
DOT_PRODUCT_NON_NORMALIZED = OBXVectorDistanceType_DOT_PRODUCT_NON_NORMALIZED
103+
104+
VectorDistanceType.UNKNOWN.__doc__ = "Not a real type, just best practice (e.g. forward compatibility)"
105+
VectorDistanceType.EUCLIDEAN.__doc__ = "The default; typically 'euclidean squared' internally."
106+
VectorDistanceType.COSINE.__doc__ = """
107107
Cosine similarity compares two vectors irrespective of their magnitude (compares the angle of two vectors).
108108
Often used for document or semantic similarity.
109109
Value range: 0.0 - 2.0 (0.0: same direction, 1.0: orthogonal, 2.0: opposite direction)
110110
"""
111-
HnswDistanceType.DOT_PRODUCT.__doc__ = """
111+
VectorDistanceType.DOT_PRODUCT.__doc__ = """
112112
For normalized vectors (vector length == 1.0), the dot product is equivalent to the cosine similarity.
113113
Because of this, the dot product is often preferred as it performs better.
114114
Value range (normalized vectors): 0.0 - 2.0 (0.0: same direction, 1.0: orthogonal, 2.0: opposite direction)
115115
"""
116-
HnswDistanceType.DOT_PRODUCT_NON_NORMALIZED.__doc__ = """
116+
VectorDistanceType.DOT_PRODUCT_NON_NORMALIZED.__doc__ = """
117117
A custom dot product similarity measure that does not require the vectors to be normalized.
118118
Note: this is no replacement for cosine similarity (like DotProduct for normalized vectors is).
119119
The non-linear conversion provides a high precision over the entire float range (for the raw dot product).
@@ -130,7 +130,7 @@ class HnswIndex:
130130
neighbors_per_node: Optional[int] = None
131131
indexing_search_count: Optional[int] = None
132132
flags: HnswFlags = HnswFlags.NONE
133-
distance_type: HnswDistanceType = HnswDistanceType.EUCLIDEAN
133+
distance_type: VectorDistanceType = VectorDistanceType.EUCLIDEAN
134134
reparation_backlink_probability: Optional[float] = None
135135
vector_cache_hint_size_kb: Optional[float] = None
136136

objectbox/query_builder.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import numpy as np
33
from typing import *
44

5+
from objectbox.c import *
56
from objectbox.model.properties import Property
67
from objectbox.objectbox import ObjectBox
78
from objectbox.query import Query
8-
from objectbox.c import *
9+
from objectbox.utils import check_float_vector
910

1011

1112
class QueryBuilder:
@@ -108,10 +109,11 @@ def between_2ints(self, prop: Union[int, str, Property], value_a: int, value_b:
108109
cond = obx_qb_between_2ints(self._c_builder, prop_id, value_a, value_b)
109110
return cond
110111

111-
def nearest_neighbors_f32(self, prop: Union[int, str, Property], query_vector: Union[np.ndarray, List[float]],
112+
def nearest_neighbors_f32(self,
113+
prop: Union[int, str, Property],
114+
query_vector: Union[np.ndarray, List[float]],
112115
element_count: int):
113-
if isinstance(query_vector, np.ndarray) and query_vector.dtype != np.float32:
114-
raise Exception(f"query_vector dtype is expected to be np.float32, got: {query_vector.dtype}")
116+
check_float_vector(query_vector, "query_vector")
115117
prop_id = self._entity.get_property_id(prop)
116118
c_query_vector = c_array(query_vector, ctypes.c_float)
117119
cond = obx_qb_nearest_neighbors_f32(self._c_builder, prop_id, c_query_vector, element_count)

objectbox/utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import numpy as np
2+
3+
from objectbox.c import *
4+
from objectbox.model.properties import VectorDistanceType
5+
6+
7+
def check_float_vector(vector: Union[np.ndarray, List[float]], vector_name: str) -> None:
    """Validate that ``vector`` holds float data.

    A NumPy array must have dtype ``np.float32``. A Python list must contain
    only float values (``float``, a subclass of it, or a NumPy floating
    scalar); every element is inspected, not just the first. An empty list
    passes. Values that are neither an ``np.ndarray`` nor a ``list`` are not
    checked here.

    :param vector: the vector to validate.
    :param vector_name: name of the argument, used in the error message.
    :raises TypeError: if the array dtype or any list element is not a float type.
    """
    if isinstance(vector, np.ndarray):
        if vector.dtype != np.float32:
            raise TypeError(f"{vector_name} dtype is expected to be np.float32, got: {vector.dtype}")
        return

    if isinstance(vector, list):
        for index, element in enumerate(vector):
            # isinstance (instead of an exact type match) also admits float
            # subclasses such as np.float64 and NumPy scalars such as np.float32.
            if not isinstance(element, (float, np.floating)):
                raise TypeError(
                    f"{vector_name} is expected to be a float list, got vector[{index}]: {type(element)}")
13+
14+
15+
def vector_distance_f32(distance_type: VectorDistanceType,
                        vector1: Union[np.ndarray, List[float]],
                        vector2: Union[np.ndarray, List[float]],
                        dimension: int) -> float:
    """Calculate the distance of two float vectors via the native library.

    :param distance_type: the distance metric to apply.
    :param vector1: first vector (float32 ``np.ndarray`` or list of floats).
    :param vector2: second vector (float32 ``np.ndarray`` or list of floats).
    :param dimension: element count handed to the native call for both vectors.
    :return: the value returned by ``obx_vector_distance_float32``.
    :raises ValueError: if ``dimension`` is negative or larger than the number
        of elements in either vector.
    """
    check_float_vector(vector1, "vector1")
    check_float_vector(vector2, "vector2")

    # The native function is bound with plain float pointers plus `dimension`,
    # so it cannot notice a buffer that is too short. Validate here instead of
    # letting it read past the end of the data.
    if dimension < 0:
        raise ValueError(f"dimension must not be negative, got: {dimension}")
    for name, vector in (("vector1", vector1), ("vector2", vector2)):
        element_count = vector.size if isinstance(vector, np.ndarray) else len(vector)
        if element_count < dimension:
            raise ValueError(f"{name} has {element_count} elements, but dimension is {dimension}")

    return obx_vector_distance_float32(distance_type,
                                       c_array(vector1, ctypes.c_float),
                                       c_array(vector2, ctypes.c_float),
                                       dimension)
26+
27+
28+
def vector_distance_to_relevance(distance_type: VectorDistanceType, distance: float) -> float:
    """ Maps a distance to a relevance score in range [0.0, 1.0]; the mapping depends on the distance type.

    Delegates to the native ``obx_vector_distance_to_relevance`` function.
    """
    relevance = obx_vector_distance_to_relevance(distance_type, distance)
    return relevance

tests/model.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,23 +58,22 @@ class VectorEntity:
5858
id = Id(id=1, uid=4001)
5959
name = Property(str, type=PropertyType.string, id=2, uid=4002)
6060
vector_euclidean = Property(np.ndarray, type=PropertyType.floatVector, id=3, uid=4003,
61-
index=HnswIndex(
62-
id=3, uid=40001,
63-
dimensions=2, distance_type=HnswDistanceType.EUCLIDEAN)
64-
)
61+
index=HnswIndex(
62+
id=3, uid=40001,
63+
dimensions=2, distance_type=VectorDistanceType.EUCLIDEAN)
64+
)
6565
vector_cosine = Property(np.ndarray, type=PropertyType.floatVector, id=4, uid=4004,
66-
index=HnswIndex(
67-
id=4, uid=40002,
68-
dimensions=2, distance_type=HnswDistanceType.COSINE)
69-
)
66+
index=HnswIndex(
67+
id=4, uid=40002,
68+
dimensions=2, distance_type=VectorDistanceType.COSINE)
69+
)
7070
vector_dot_product = Property(np.ndarray, type=PropertyType.floatVector, id=5, uid=4005,
71-
index=HnswIndex(
72-
id=5, uid=40003,
73-
dimensions=2, distance_type=HnswDistanceType.DOT_PRODUCT)
74-
)
75-
#vector_dot_product_non_normalized = Property(np.ndarray, type=PropertyType.floatVector, id=6, uid=4006,
71+
index=HnswIndex(
72+
id=5, uid=40003,
73+
dimensions=2, distance_type=VectorDistanceType.DOT_PRODUCT)
74+
)
75+
# vector_dot_product_non_normalized = Property(np.ndarray, type=PropertyType.floatVector, id=6, uid=4006,
7676
# index=HnswIndex(
7777
# id=6, uid=40004,
78-
# dimensions=2, distance_type=HnswDistanceType.DOT_PRODUCT_NON_NORMALIZED)
78+
# dimensions=2, distance_type=VectorDistanceType.DOT_PRODUCT_NON_NORMALIZED)
7979
# )
80-

tests/test_hnsw.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@ def _find_expected_nn(points: np.ndarray, query: np.ndarray, n: int):
1515
return np.argsort(d)[:n]
1616

1717

18-
def _test_random_points(num_points: int, num_query_points: int, seed: Optional[int] = None, distance_type: HnswDistanceType = HnswDistanceType.EUCLIDEAN, min_score: float = 0.5):
18+
def _test_random_points(
19+
num_points: int,
20+
num_query_points: int,
21+
seed: Optional[int] = None,
22+
distance_type: VectorDistanceType = VectorDistanceType.EUCLIDEAN,
23+
min_score: float = 0.5):
1924
""" Generates random points in a 2d plane; checks the queried NN against the expected. """
2025

2126
vector_field_name = "vector_"+distance_type.name.lower()
@@ -76,7 +81,7 @@ def _test_random_points(num_points: int, num_query_points: int, seed: Optional[i
7681
def test_random_points():
7782

7883
min_score = 0.5
79-
distance_type = HnswDistanceType.EUCLIDEAN
84+
distance_type = VectorDistanceType.EUCLIDEAN
8085
_test_random_points(num_points=100, num_query_points=10, seed=10, distance_type=distance_type, min_score=min_score)
8186
_test_random_points(num_points=100, num_query_points=10, seed=11, distance_type=distance_type, min_score=min_score)
8287
_test_random_points(num_points=100, num_query_points=10, seed=12, distance_type=distance_type, min_score=min_score)
@@ -86,8 +91,9 @@ def test_random_points():
8691

8792
# TODO: Cosine and Dot Product may result in 0 score
8893

89-
def _test_combined_nn_search(distance_type: HnswDistanceType = HnswDistanceType.EUCLIDEAN):
90-
94+
95+
def _test_combined_nn_search(distance_type: VectorDistanceType = VectorDistanceType.EUCLIDEAN):
96+
9197
db = create_test_objectbox()
9298

9399
box = objectbox.Box(db, VectorEntity)
@@ -175,6 +181,6 @@ def _test_combined_nn_search(distance_type: HnswDistanceType = HnswDistanceType.
175181

176182
def test_combined_nn_search():
177183
""" Tests NN search combined with regular query conditions, offset and limit. """
178-
distance_type = HnswDistanceType.EUCLIDEAN
184+
distance_type = VectorDistanceType.EUCLIDEAN
179185
_test_combined_nn_search(distance_type)
180186
# TODO: Cosine, DotProduct diverges see below

tests/test_utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pytest
2+
3+
from objectbox.utils import *
4+
5+
6+
def test_vector_distance_f32():
    """ Checks vector_distance_f32 against an expected value for every distance type. """

    a = np.array([3.4, 2.9, -10, 1.0], dtype=np.float32)
    b = np.array([56., -1.2, 22, 2.0], dtype=np.float32)

    # Unit-length copies, used for the plain dot product case.
    a_norm = a / np.linalg.norm(a)
    b_norm = b / np.linalg.norm(b)

    # The Euclidean expectation is the squared distance (no square root taken).
    delta = b - a
    euclidean_expected = np.dot(delta, delta)

    # (distance type, first vector, second vector, expected distance)
    cases = [
        (VectorDistanceType.EUCLIDEAN, a, b, euclidean_expected),
        (VectorDistanceType.COSINE, a, b, 1.0469311),
        (VectorDistanceType.DOT_PRODUCT, a_norm, b_norm, 1.0469311),
        (VectorDistanceType.DOT_PRODUCT_NON_NORMALIZED, a, b, 1.519307),
    ]
    for distance_type, first, second, expected in cases:
        assert vector_distance_f32(distance_type, first, second, 4) == pytest.approx(expected)

0 commit comments

Comments
 (0)