|
| 1 | +import pytest |
| 2 | +from rrf import reciprocal_rank_fusion, extract_doc_id |
| 3 | + |
def test_extract_doc_id():
    """Check which identifier ``extract_doc_id`` returns for each key layout."""
    expectations = [
        ({"id": "123"}, "123"),
        ({"_id": "456"}, "456"),
        # When both keys are present, "id" wins over "_id".
        ({"id": "123", "_id": "456"}, "123"),
        # A document without either key yields an empty string.
        ({}, ""),
    ]
    for document, expected_id in expectations:
        assert extract_doc_id(document) == expected_id
| 9 | + |
def test_rrf_single_list():
    """Fusing a single list keeps its order and scores rank r as 1 / (k + r)."""
    list1 = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    fused = reciprocal_rank_fusion([list1], k=60, top_k=10)

    # Order (and therefore length) must match the input ranking.
    assert [doc["id"] for doc in fused] == ["A", "B", "C"]

    # Score math with k=60 and 1-based ranks: A=1/61, B=1/62, C=1/63.
    # Compared with a tolerance rather than exact ``==`` so the test does not
    # break if the implementation computes the same value through different
    # (but algebraically equivalent) floating-point operations.
    expected_scores = [1 / 61, 1 / 62, 1 / 63]
    assert [doc["rrf_score"] for doc in fused] == pytest.approx(expected_scores)
| 23 | + |
def test_rrf_two_lists_same_order():
    """Two identically ordered lists keep that order and sum their rank scores."""
    list1 = [{"id": "A"}, {"id": "B"}]
    list2 = [{"_id": "A"}, {"_id": "B"}]  # Note: list2 identifies documents by "_id"
    fused = reciprocal_rank_fusion([list1, list2], k=60, top_k=10)

    assert len(fused) == 2
    # The fused entries expose "id", i.e. the source dict is taken from list1,
    # where each document was seen first.
    assert fused[0]["id"] == "A"
    assert fused[1]["id"] == "B"

    # A is rank 1 in both lists (1/61 + 1/61); B is rank 2 in both (1/62 + 1/62).
    # Summed floats are compared with a tolerance instead of exact equality so
    # the test does not depend on the implementation's accumulation order.
    assert fused[0]["rrf_score"] == pytest.approx((1 / 61) + (1 / 61))
    assert fused[1]["rrf_score"] == pytest.approx((1 / 62) + (1 / 62))
| 35 | + |
def test_rrf_boosts_overlap():
    """A document present in both lists outranks documents that top only one list.

    Expected scores with k=60:
        A: rank 2 in both lists  -> 1/62 + 1/62 ~= 0.032258
        B: rank 1 in list 1 only -> 1/61        ~= 0.016393
        C: rank 1 in list 2 only -> 1/61        ~= 0.016393
    """
    first_ranking = [{"id": "B"}, {"id": "A"}, {"id": "X"}]
    second_ranking = [{"id": "C"}, {"id": "A"}, {"id": "Y"}]

    fused = reciprocal_rank_fusion([first_ranking, second_ranking], k=60, top_k=10)

    # Index the fused scores by document id for direct comparison.
    score_by_id = {}
    for entry in fused:
        score_by_id[entry["id"]] = entry["rrf_score"]

    assert fused[0]["id"] == "A"
    for single_list_winner in ("B", "C"):
        assert score_by_id["A"] > score_by_id[single_list_winner]
| 52 | + |
def test_rrf_empty_lists():
    """Empty input yields an empty result; empty lists mixed with real ones are ignored."""
    # No lists at all, then only empty lists: both must fuse to nothing.
    for empty_input in ([], [[], []]):
        assert reciprocal_rank_fusion(empty_input, k=60) == []

    # One populated list fused with one empty list keeps the populated entry.
    populated = [{"id": "A"}]
    fused = reciprocal_rank_fusion([populated, []], k=60)
    assert len(fused) == 1
    assert fused[0]["id"] == "A"
| 62 | + |
def test_rrf_top_k_truncates():
    """``top_k`` limits the fused output to the best-ranked entries."""
    requested = 5
    candidates = [{"id": str(position)} for position in range(100)]

    fused = reciprocal_rank_fusion([candidates], k=60, top_k=requested)

    assert len(fused) == 5
    # The survivors are ids "0".."4", so the last one kept is "4".
    last_kept = fused[-1]
    assert last_kept["id"] == "4"
| 68 | + |
def test_rrf_id_fallback():
    """Documents lacking both "id" and "_id" are still fused without crashing.

    NOTE(review): the original author's comment says the function uses a hash
    fallback here and that relying on ``title_guess`` is weak; the fallback
    itself is not visible from this test -- confirm against ``rrf``.
    """
    untitled_docs = [{"title_guess": "Unique Title"}, {"title_guess": "Another Title"}]

    # Called with default ``k`` / ``top_k`` on purpose, as in the original test.
    fused = reciprocal_rank_fusion([untitled_docs])

    assert len(fused) == 2
    top_score = fused[0].get("rrf_score")
    assert top_score is not None
0 commit comments