Skip to content

Commit ae246c7

Browse files
committed
Fix retry queue to re-run failed tests on Buildkite rebuild
When Buildkite rebuilds a job (new BUILDKITE_BUILD_ID), workers start with an empty per-worker log in the new Redis namespace. Previously, retry_queue would find no matching failures and fall through to "The retry queue does not contain any failure" — leaving error-reports populated and the summary step failing indefinitely. Fix: when the per-worker log intersection yields no failures, fall back to redis.hkeys(key('error-reports')) so any worker can retry failures from any other worker. Also handles automatic LSO retries (BUILDKITE_RETRY_TYPE=automatic) via the existing elsif retry? branch. Additional fixes: - Retry#acknowledge queues a Redis SADD so BuildRecord#record_success gets correct multi-exec result indices for stats delta correction - reset_stats now purges error-report-deltas for the resetting worker to prevent stats underflow on same-worker retries - QueueEntry.format raises ArgumentError when file_path is nil/empty and canonicalizes paths via File.expand_path - SingleExample#queue_entry resolves file_path via source_location - RSpec SingleExample and BuildStatusRecorder use example.file_path
1 parent b2dd54a commit ae246c7

13 files changed

Lines changed: 293 additions & 47 deletions

File tree

ruby/lib/ci/queue/queue_entry.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ def self.parse(entry)
1717
end
1818

1919
def self.format(test_id, file_path)
20-
JSON.dump({ test_id: test_id, file_path: file_path })
20+
raise ArgumentError, "file_path is required for '#{test_id}' — the test file path must be resolvable" if file_path.nil? || file_path.empty?
21+
canonical = load_error_payload?(file_path) ? file_path : ::File.expand_path(file_path)
22+
JSON.dump({ test_id: test_id, file_path: canonical })
2123
end
2224

2325
def self.load_error_payload?(file_path)

ruby/lib/ci/queue/redis/build_record.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,10 @@ def error_reports
145145
redis.hgetall(key('error-reports')).transform_keys { |entry| CI::Queue::QueueEntry.test_id(entry) }
146146
end
147147

148+
def failed_test_entries
149+
redis.hkeys(key('error-reports'))
150+
end
151+
148152
def flaky_reports
149153
redis.smembers(key('flaky-reports')).map { |entry| CI::Queue::QueueEntry.test_id(entry) }
150154
end
@@ -177,6 +181,16 @@ def reset_stats(stat_names)
177181
pipeline.hdel(key(stat_name), config.worker_id)
178182
end
179183
end
184+
# Purge any error-report-deltas that reference this worker so that
185+
# apply_error_report_delta_correction cannot double-subtract from
186+
# the now-zeroed counters on a subsequent successful retry.
187+
deltas = redis.hgetall(key('error-report-deltas'))
188+
to_delete = deltas.filter_map do |entry, delta_json|
189+
entry if JSON.parse(delta_json)['worker_id'].to_s == config.worker_id.to_s
190+
rescue JSON::ParserError
191+
nil
192+
end
193+
redis.hdel(key('error-report-deltas'), *to_delete) unless to_delete.empty?
180194
end
181195

182196
private

ruby/lib/ci/queue/redis/retry.rb

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,24 @@ def stream_populate(tests, random: nil, batch_size: nil)
2828
self
2929
end
3030

31+
# Queue a Redis SADD so that BuildRecord#record_success can include this
32+
# in its multi-exec transaction. Without this, Static#acknowledge returns
33+
# a Ruby value (not a Redis future), shifting the result indices and
34+
# breaking the stats delta correction.
35+
def acknowledge(entry, error: nil, pipeline: redis)
36+
@progress += 1
37+
return @progress unless pipeline
38+
test_id = CI::Queue::QueueEntry.test_id(entry)
39+
pipeline.sadd(key('processed'), test_id)
40+
end
41+
3142
private
3243

3344
attr_reader :redis
45+
46+
def key(*args)
47+
['build', config.build_id, *args].join(':')
48+
end
3449
end
3550
end
3651
end

ruby/lib/ci/queue/redis/worker.rb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,15 @@ def retry_queue
158158
log.select! { |entry| failures.include?(CI::Queue::QueueEntry.test_id(entry)) }
159159
log.uniq! { |entry| CI::Queue::QueueEntry.test_id(entry) }
160160
log.reverse!
161+
162+
if log.empty?
163+
# Per-worker log has no matching failures — this worker didn't run
164+
# the failing tests (e.g. Buildkite rebuild with new worker IDs,
165+
# or a different parallel slot). Fall back to ALL unresolved
166+
# failures from error-reports so any worker can retry them.
167+
log = redis.hkeys(key('error-reports'))
168+
end
169+
161170
Retry.new(log, config, redis: redis)
162171
end
163172

ruby/lib/minitest/queue.rb

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,16 @@ def id
330330
end
331331

332332
def queue_entry
333-
@queue_entry ||= CI::Queue::QueueEntry.format(id, nil)
333+
@queue_entry ||= begin
334+
unless runnable.is_a?(Module)
335+
raise ArgumentError, "runnable must be a Module (got #{runnable.class}). " \
336+
"Do not create SingleExample with string class names."
337+
end
338+
file_path = runnable.instance_method(method_name).source_location&.first
339+
raise ArgumentError, "Cannot resolve source file for #{id} — " \
340+
"ensure the test method is defined in a Ruby source file" if file_path.nil?
341+
CI::Queue::QueueEntry.format(id, file_path)
342+
end
334343
end
335344

336345
def <=>(other)

ruby/lib/rspec/queue.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,10 @@ def id
254254
end
255255

256256
def queue_entry
257-
@queue_entry ||= CI::Queue::QueueEntry.format(id, nil)
257+
@queue_entry ||= begin
258+
file_path = example.metadata[:absolute_file_path] || example.file_path
259+
CI::Queue::QueueEntry.format(id, file_path)
260+
end
258261
end
259262

260263
def <=>(other)

ruby/lib/rspec/queue/build_status_recorder.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ def initialize(*)
1818

1919
def example_passed(notification)
2020
example = notification.example
21-
entry = CI::Queue::QueueEntry.format(example.id, nil)
21+
entry = CI::Queue::QueueEntry.format(example.id, example.file_path)
2222
build.record_success(entry)
2323
end
2424

2525
def example_failed(notification)
2626
example = notification.example
27-
entry = CI::Queue::QueueEntry.format(example.id, nil)
27+
entry = CI::Queue::QueueEntry.format(example.id, example.file_path)
2828
build.record_error(entry, dump(notification))
2929
end
3030

ruby/test/ci/queue/queue_entry_test.rb

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,9 @@
22
require 'test_helper'
33

44
class CI::Queue::QueueEntryTest < Minitest::Test
5-
def test_parse_without_file_path
6-
entry = CI::Queue::QueueEntry.format("FooTest#test_bar", nil)
7-
parsed = CI::Queue::QueueEntry.parse(entry)
8-
assert_equal "FooTest#test_bar", parsed[:test_id]
9-
assert_nil parsed[:file_path]
5+
def test_format_raises_without_file_path
6+
assert_raises(ArgumentError) { CI::Queue::QueueEntry.format("FooTest#test_bar", nil) }
7+
assert_raises(ArgumentError) { CI::Queue::QueueEntry.format("FooTest#test_bar", "") }
108
end
119

1210
def test_parse_with_file_path
@@ -16,18 +14,6 @@ def test_parse_with_file_path
1614
assert_equal "/tmp/foo_test.rb", parsed[:file_path]
1715
end
1816

19-
def test_format_without_file_path
20-
entry_nil = CI::Queue::QueueEntry.format("FooTest#test_bar", nil)
21-
parsed_nil = JSON.parse(entry_nil, symbolize_names: true)
22-
assert_equal "FooTest#test_bar", parsed_nil[:test_id]
23-
assert_nil parsed_nil[:file_path]
24-
25-
entry_empty = CI::Queue::QueueEntry.format("FooTest#test_bar", "")
26-
parsed_empty = JSON.parse(entry_empty, symbolize_names: true)
27-
assert_equal "FooTest#test_bar", parsed_empty[:test_id]
28-
assert_equal "", parsed_empty[:file_path]
29-
end
30-
3117
def test_format_with_file_path
3218
entry = CI::Queue::QueueEntry.format("FooTest#test_bar", "/tmp/foo_test.rb")
3319
parsed = JSON.parse(entry, symbolize_names: true)
@@ -60,11 +46,6 @@ def test_round_trip_preserves_test_id
6046
assert_equal file_path, parsed[:file_path]
6147
end
6248

63-
def test_test_id_without_file_path
64-
entry = CI::Queue::QueueEntry.format("FooTest#test_bar", nil)
65-
assert_equal "FooTest#test_bar", CI::Queue::QueueEntry.test_id(entry)
66-
end
67-
6849
def test_test_id_with_file_path
6950
entry = CI::Queue::QueueEntry.format("FooTest#test_bar", "/tmp/foo_test.rb")
7051
assert_equal "FooTest#test_bar", CI::Queue::QueueEntry.test_id(entry)

0 commit comments

Comments
 (0)