Skip to content

Commit 3e2e025

Browse files
committed
fix duplicate snippets and newline counts
1 parent 3c1b1e4 commit 3e2e025

3 files changed

Lines changed: 210 additions & 10 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
-- Recompute every chunk's line count from its text and store it on the
-- matching content_blob_chunks rows, touching only rows whose stored value
-- differs from the recomputed one.
--
-- Counting rule: one line per newline character, plus one more when the text
-- does not end in a newline (an unterminated final line). Empty text has
-- zero lines.
WITH chunk_lines AS (
    SELECT
        c.chunk_hash,
        CASE
            WHEN c.text_content = '' THEN 0
            ELSE (
                -- Number of newline characters in the text.
                length(c.text_content) - length(replace(c.text_content, E'\n', ''))
                + CASE
                    WHEN right(c.text_content, 1) = E'\n' THEN 0
                    ELSE 1
                END
            )
        END AS line_total
    FROM chunks c
)
UPDATE content_blob_chunks cbc
SET chunk_line_count = chunk_lines.line_total
FROM chunk_lines
WHERE chunk_lines.chunk_hash = cbc.chunk_hash
    -- IS DISTINCT FROM compares NULLs as ordinary values, so rows whose
    -- stored count is NULL are corrected as well.
    AND cbc.chunk_line_count IS DISTINCT FROM chunk_lines.line_total;
24+
25+
-- Set each blob's line_count to the sum of the line counts recorded for its
-- chunks. Blobs that already hold the right total are left untouched, and
-- blobs with no rows in content_blob_chunks are not matched at all.
UPDATE content_blobs cb
SET line_count = totals.corrected_line_count
FROM (
    SELECT
        cbc.content_hash AS hash,
        -- SUM yields NULL when every chunk_line_count in the group is NULL;
        -- treat that as zero lines.
        COALESCE(SUM(cbc.chunk_line_count), 0) AS corrected_line_count
    FROM content_blob_chunks cbc
    GROUP BY cbc.content_hash
) AS totals
WHERE totals.hash = cb.hash
    AND cb.line_count IS DISTINCT FROM totals.corrected_line_count;

indexer/src/utils.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,11 @@ pub fn line_count(bytes: &[u8]) -> i32 {
120120
}
121121

122122
let line_breaks = bytes.iter().filter(|b| **b == b'\n').count();
123-
(line_breaks + 1) as i32
123+
if bytes.last() == Some(&b'\n') {
124+
line_breaks as i32
125+
} else {
126+
(line_breaks + 1) as i32
127+
}
124128
}
125129

126130
pub fn normalize_relative_path(path: &Path) -> String {
@@ -216,3 +220,18 @@ pub fn ensure_relative(path: &Path, root: &Path) -> Result<PathBuf> {
216220
.map(|p| p.to_path_buf())
217221
.with_context(|| format!("{} is not inside {}", path.display(), root.display()))
218222
}
223+
224+
#[cfg(test)]
mod tests {
    use super::line_count;

    /// A single terminating newline ends the last line; it does not open a new one.
    #[test]
    fn line_count_ignores_single_trailing_newline() {
        let two_terminated_lines: &[u8] = b"alpha\nbeta\n";
        assert_eq!(line_count(two_terminated_lines), 2);
    }

    /// An empty line that is itself newline-terminated still counts as a line.
    #[test]
    fn line_count_preserves_real_blank_lines() {
        let content_then_blank_line: &[u8] = b"alpha\n\n";
        assert_eq!(line_count(content_then_blank_line), 2);
    }
}

src/db/postgres.rs

Lines changed: 154 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3304,8 +3304,12 @@ fn is_identifier_byte(byte: u8) -> bool {
33043304
}
33053305

33063306
/// Returns the line number of the last line of `content_text`, given that its
/// first line sits at `start_line`.
///
/// Lines are counted with [`str::lines`], so a single trailing newline does not
/// open an extra line. Empty text is treated as occupying `start_line` alone.
/// The result saturates at `i32::MAX` instead of overflowing.
fn snippet_end_line(content_text: &str, start_line: i32) -> i32 {
    // Lines after the first one; zero for both empty and single-line text, which
    // removes the need for a separate "no lines" branch.
    let extra_lines = content_text.lines().count().saturating_sub(1);
    // Clamp before narrowing: a bare `as i32` would silently wrap on huge counts.
    let offset = extra_lines.min(i32::MAX as usize) as i32;
    start_line.saturating_add(offset)
}
33103314

33113315
fn merge_overlapping_snippets(mut snippets: Vec<SearchSnippet>) -> Vec<SearchSnippet> {
@@ -3327,8 +3331,9 @@ fn merge_overlapping_snippets(mut snippets: Vec<SearchSnippet>) -> Vec<SearchSni
33273331

33283332
for snippet in snippets.into_iter().skip(1) {
33293333
if snippet.start_line <= current_end.saturating_add(1) {
3330-
current_end = current_end.max(snippet.end_line);
3331-
merge_snippet_line_map(&mut line_map, &snippet);
3334+
let (merged_start, merged_end) = merge_snippet_line_map(&mut line_map, &snippet);
3335+
current_start = current_start.min(merged_start);
3336+
current_end = current_end.max(merged_end);
33323337
} else {
33333338
merged.push(build_snippet_from_map(
33343339
current_start,
@@ -3357,8 +3362,7 @@ fn build_snippet_line_map(
33573362
snippet: &SearchSnippet,
33583363
) -> BTreeMap<i32, (String, Vec<SearchMatchSpan>)> {
33593364
let mut map = BTreeMap::new();
3360-
for (idx, (line, spans)) in split_snippet_lines(snippet).into_iter().enumerate() {
3361-
let line_number = snippet.start_line.saturating_add(idx as i32);
3365+
for (line_number, line, spans) in aligned_snippet_lines(&map, snippet) {
33623366
insert_snippet_line(&mut map, line_number, line, spans);
33633367
}
33643368
map
@@ -3367,11 +3371,86 @@ fn build_snippet_line_map(
33673371
fn merge_snippet_line_map(
33683372
map: &mut BTreeMap<i32, (String, Vec<SearchMatchSpan>)>,
33693373
snippet: &SearchSnippet,
3370-
) {
3371-
for (idx, (line, spans)) in split_snippet_lines(snippet).into_iter().enumerate() {
3372-
let line_number = snippet.start_line.saturating_add(idx as i32);
3374+
) -> (i32, i32) {
3375+
let mut min_line = i32::MAX;
3376+
let mut max_line = i32::MIN;
3377+
for (line_number, line, spans) in aligned_snippet_lines(map, snippet) {
3378+
min_line = min_line.min(line_number);
3379+
max_line = max_line.max(line_number);
33733380
insert_snippet_line(map, line_number, line, spans);
33743381
}
3382+
if min_line == i32::MAX {
3383+
(snippet.start_line, snippet.end_line)
3384+
} else {
3385+
(min_line, max_line)
3386+
}
3387+
}
3388+
3389+
/// Splits `snippet` into lines and pairs each one with the line number it
/// should occupy in `map`.
///
/// The number for the line at index `i` is `snippet.start_line + i + shift`
/// (saturating), where `shift` is whatever `best_snippet_line_shift` picks for
/// this snippet against the current contents of `map`.
fn aligned_snippet_lines(
    map: &BTreeMap<i32, (String, Vec<SearchMatchSpan>)>,
    snippet: &SearchSnippet,
) -> Vec<(i32, String, Vec<SearchMatchSpan>)> {
    let split_lines = split_snippet_lines(snippet);
    let shift = best_snippet_line_shift(map, snippet.start_line, &split_lines);

    let mut placed = Vec::with_capacity(split_lines.len());
    for (offset, (text, spans)) in split_lines.into_iter().enumerate() {
        let line_number = snippet
            .start_line
            .saturating_add(offset as i32)
            .saturating_add(shift);
        placed.push((line_number, text, spans));
    }
    placed
}
3410+
3411+
/// Picks the line offset, within `-MAX_SHIFT..=MAX_SHIFT`, at which `lines`
/// agrees best with the text already stored in `map`.
///
/// For each candidate shift, the line at index `i` is compared with the map
/// entry at `start_line + i + shift`. Positions with no map entry, or where
/// either side is an empty string, are ignored. Each identical pair scores +3
/// and each differing pair scores -4.
///
/// The baseline is shift 0 with score 0: another shift is chosen only when its
/// score is strictly higher, or equal with a smaller absolute shift. Returns 0
/// when `map` or `lines` is empty.
fn best_snippet_line_shift(
    map: &BTreeMap<i32, (String, Vec<SearchMatchSpan>)>,
    start_line: i32,
    lines: &[(String, Vec<SearchMatchSpan>)],
) -> i32 {
    const MAX_SHIFT: i32 = 3;

    if map.is_empty() || lines.is_empty() {
        return 0;
    }

    // Agreement score of placing `lines` at `start_line + shift`.
    let score_for = |shift: i32| -> i32 {
        let mut agreeing = 0i32;
        let mut clashing = 0i32;
        for (offset, (text, _)) in lines.iter().enumerate() {
            let target = start_line
                .saturating_add(offset as i32)
                .saturating_add(shift);
            match map.get(&target) {
                // Empty lines carry no alignment signal, so they are not scored.
                Some((existing, _)) if !existing.is_empty() && !text.is_empty() => {
                    if existing == text {
                        agreeing += 1;
                    } else {
                        clashing += 1;
                    }
                }
                _ => {}
            }
        }
        agreeing * 3 - clashing * 4
    };

    // (shift, score) of the best candidate so far; starts at the unshifted baseline.
    let mut winner = (0i32, 0i32);
    for candidate in -MAX_SHIFT..=MAX_SHIFT {
        let score = score_for(candidate);
        let (held_shift, held_score) = winner;
        let strictly_better = score > held_score;
        let closer_tie = score == held_score && candidate.abs() < held_shift.abs();
        if strictly_better || closer_tie {
            winner = (candidate, score);
        }
    }

    winner.0
}
33763455

33773456
fn insert_snippet_line(
@@ -3663,6 +3742,13 @@ mod tests {
36633742
assert_eq!(end, 101);
36643743
}
36653744

3745+
#[test]
3746+
fn snippet_end_line_ignores_trailing_newline() {
3747+
let text = "alpha\nip_rcv\n";
3748+
let end = snippet_end_line(text, 99);
3749+
assert_eq!(end, 100);
3750+
}
3751+
36663752
#[test]
36673753
fn merge_overlapping_snippets_merges_adjacent_and_preserves_spans() {
36683754
let snippet_a = SearchSnippet {
@@ -3732,6 +3818,65 @@ mod tests {
37323818
);
37333819
}
37343820

3821+
#[test]
3822+
fn merge_overlapping_snippets_realigns_conflicting_overlap_by_text() {
3823+
let snippet_a = SearchSnippet {
3824+
start_line: 100,
3825+
end_line: 105,
3826+
match_line: 102,
3827+
content_text: concat!(
3828+
"func validateCidrInFilter(...) bool {\n",
3829+
"\tuuidStr, _ := global_config.ProtoUuidToStringWithDash(adUUID)\n",
3830+
"\tlogging.L(ctx).Debug(\"Target filter\", zap.String(\"uuid\", uuidStr))\n",
3831+
"\tfor _, filter := range filters {\n",
3832+
"\t\tuuidStr, _ := global_config.ProtoUuidToStringWithDash(filter.Id)\n",
3833+
"\t\tlogging.L(ctx).Debug(\"Check filter\", zap.String(\"uuid\", uuidStr))"
3834+
)
3835+
.to_string(),
3836+
match_spans: vec![SearchMatchSpan {
3837+
start: 95,
3838+
end: 108,
3839+
}],
3840+
};
3841+
let snippet_b = SearchSnippet {
3842+
start_line: 104,
3843+
end_line: 109,
3844+
match_line: 107,
3845+
content_text: concat!(
3846+
"\tlogging.L(ctx).Debug(\"Target filter\", zap.String(\"uuid\", uuidStr))\n",
3847+
"\tfor _, filter := range filters {\n",
3848+
"\t\tuuidStr, _ := global_config.ProtoUuidToStringWithDash(filter.Id)\n",
3849+
"\t\tlogging.L(ctx).Debug(\"Check filter\", zap.String(\"uuid\", uuidStr))\n",
3850+
"\t\tif proto.Equal(adUUID, filter.Id) {\n",
3851+
"\t\t\tlogging.L(ctx).Debug(\"Found filter\", zap.Any(\"uuid\", adUUID))"
3852+
)
3853+
.to_string(),
3854+
match_spans: vec![SearchMatchSpan {
3855+
start: 185,
3856+
end: 197,
3857+
}],
3858+
};
3859+
3860+
let merged = merge_overlapping_snippets(vec![snippet_a, snippet_b]);
3861+
let merged_snippet = &merged[0];
3862+
let lines: Vec<&str> = merged_snippet.content_text.lines().collect();
3863+
3864+
assert_eq!(merged_snippet.start_line, 100);
3865+
assert_eq!(
3866+
lines,
3867+
vec![
3868+
"func validateCidrInFilter(...) bool {",
3869+
"\tuuidStr, _ := global_config.ProtoUuidToStringWithDash(adUUID)",
3870+
"\tlogging.L(ctx).Debug(\"Target filter\", zap.String(\"uuid\", uuidStr))",
3871+
"\tfor _, filter := range filters {",
3872+
"\t\tuuidStr, _ := global_config.ProtoUuidToStringWithDash(filter.Id)",
3873+
"\t\tlogging.L(ctx).Debug(\"Check filter\", zap.String(\"uuid\", uuidStr))",
3874+
"\t\tif proto.Equal(adUUID, filter.Id) {",
3875+
"\t\t\tlogging.L(ctx).Debug(\"Found filter\", zap.Any(\"uuid\", adUUID))",
3876+
]
3877+
);
3878+
}
3879+
37353880
#[test]
37363881
fn merged_snippets_preserve_zero_based_end_exclusive_phrase_spans() {
37373882
let snippet_a = SearchSnippet {

0 commit comments

Comments
 (0)