Skip to content

Commit e472ef8

Browse files
Merge pull request #101 from RodrigoMNardi/feature/github/cancel_by_timer
Execution Hanging Detection
2 parents b329cf1 + caeaf16 commit e472ef8

9 files changed

Lines changed: 189 additions & 2 deletions

File tree

lib/github/plan_execution/finished.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Finished
2020
# Prepares the handler for a plan execution that has finished.
#
# @param payload [Hash] incoming data. 'bamboo_ref' is used to look up the
#   CheckSuite; 'hanged' (String or Symbol key) is an optional flag that
#   defaults to false.
def initialize(payload)
  @check_suite = CheckSuite.find_by(bamboo_ci_ref: payload['bamboo_ref'])
  @logger = GithubLogger.instance.create('github_plan_execution_finished.log', Logger::INFO)
  # The timeout worker builds its payload with a Symbol key (`hanged: true`),
  # so accept both spellings instead of silently dropping the flag.
  @hanged = payload['hanged'] || payload[:hanged] || false
end
2425

2526
def finished

lib/github/update_status.rb

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def update_status
6060
case @status
6161
when 'in_progress'
6262
@job.in_progress(@github_check)
63+
create_timeout_worker
6364
when 'success'
6465
@job.success(@github_check)
6566
@job.update_execution_time
@@ -81,6 +82,16 @@ def update_status
8182
[500, 'Internal Server Error']
8283
end
8384

85+
# Queues the hang-detection check for the current check suite.
#
# Any TimeoutExecution job already queued for this suite is deleted first, then
# a new one is scheduled two hours from now on the 'timeout_execution' queue.
#
# NOTE(review): the LIKE pattern matches the suite id as a substring of the
# serialized handler, so id 1 also matches handlers for ids such as 10 or 21.
def create_timeout_worker
  suite_id = @check_suite.id
  queued_timers = Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{suite_id}%")
  queued_timers&.delete_all

  logger(Logger::INFO, "CiJobStatus::Update: TimeoutExecution for '#{suite_id}'")

  delayed_worker = TimeoutExecution.delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
  delayed_worker.timeout(suite_id)
end
94+
8495
def insert_new_delayed_job
8596
queue = @job.check_suite.pull_request.github_pr_id % 10
8697

@@ -91,7 +102,7 @@ def delete_and_create_delayed_job(queue)
91102
fetch_delayed_job&.destroy_all
92103

93104
CiJobStatus
94-
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now, queue: queue)
105+
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now.utc, queue: queue)
95106
.update(@job.check_suite.id, @job.id)
96107
end
97108

@@ -117,7 +128,7 @@ def failure
117128
return failures_stats if @failures.is_a? Array and !@failures.empty?
118129

119130
CiJobFetchTopotestFailures
120-
.delay(run_at: 5.minutes.from_now, queue: 'fetch_topotest_failures')
131+
.delay(run_at: 5.minutes.from_now.utc, queue: 'fetch_topotest_failures')
121132
.update(@job.id, 1)
122133
end
123134

lib/github_ci_app.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040

4141
# Workers
4242
require_relative '../workers/ci_job_status'
43+
require_relative '../workers/timeout_execution'
4344
require_relative '../workers/ci_job_fetch_topotest_failures'
4445

4546
# Slack libs

lib/models/check_suite.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,8 @@ def in_progress?
4848
# Tells whether fewer than two of this suite's CI jobs are currently in the
# `in_progress` status.
#
# @return [Boolean]
def execution_started?
  running_jobs = ci_jobs.where(status: :in_progress)
  running_jobs.size < 2
end
51+
52+
# Timestamp of the most recent update among this suite's CI jobs.
#
# Uses a SQL MAX aggregate instead of instantiating every job, which also
# ignores rows whose updated_at is NULL rather than failing on the comparison.
#
# @return [Time, nil] nil when the suite has no CI jobs
def last_job_updated_at_timer
  ci_jobs.maximum(:updated_at)
end
5155
end

spec/lib/github/update_status_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
before do
1717
allow(Github::PlanExecution::Finished).to receive(:new).and_return(fake_finish_plan)
1818
allow(fake_finish_plan).to receive(:fetch_build_status)
19+
allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
1920
end
2021

2122
describe 'Validates different Ci Job status' do
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-License-Identifier: BSD-2-Clause
2+
#
3+
# check_suite_spec.rb
4+
# Part of NetDEF CI System
5+
#
6+
# Copyright (c) 2023 by
7+
# Network Device Education Foundation, Inc. ("NetDEF")
8+
#
9+
# frozen_string_literal: true
10+
11+
# Specs for the CheckSuite helpers used by the execution timeout worker.
describe CheckSuite do
  context '#execution_started?' do
    let(:check_suite) { create(:check_suite) }
    let(:check_suite_running) { create(:check_suite, :with_in_progress) }

    it 'returns true when there are less than 2 jobs in progress' do
      expect(check_suite.execution_started?).to be_truthy
    end

    it 'returns false when the suite already has jobs in progress' do
      expect(check_suite_running.execution_started?).to be_falsey
    end
  end

  context '#last_job_updated_at_timer -> with CI jobs' do
    let(:ci_job) { create(:ci_job) }
    let(:check_suite) { create(:check_suite, ci_jobs: [ci_job]) }

    it 'returns a timestamp' do
      expect(check_suite.last_job_updated_at_timer).not_to be_nil
    end
  end

  context '#last_job_updated_at_timer -> without CI jobs' do
    let(:check_suite) { create(:check_suite, ci_jobs: []) }

    it 'returns nil' do
      expect(check_suite.last_job_updated_at_timer).to be_nil
    end
  end
end

spec/spec_helper.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def app
3939
config.include FactoryBot::Syntax::Methods
4040
config.include WebMock::API
4141

42+
config.add_formatter('json', 'tmp/rspec_results.json')
43+
4244
pid = nil
4345

4446
config.before(:suite) do
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# SPDX-License-Identifier: BSD-2-Clause
2+
#
3+
# timeout_execution_spec.rb
4+
# Part of NetDEF CI System
5+
#
6+
# Copyright (c) 2023 by
7+
# Network Device Education Foundation, Inc. ("NetDEF")
8+
#
9+
# frozen_string_literal: true
10+
11+
# Specs for the hang-detection worker. Each context drives
# TimeoutExecution.timeout through one of its outcomes.
describe TimeoutExecution do
  let(:check_suite) { create(:check_suite) }
  let(:finished_instance) { Github::PlanExecution::Finished.new({}) }

  before do
    allow(Github::PlanExecution::Finished).to receive(:new).and_return(finished_instance)
    allow(CheckSuite).to receive(:find).and_return(check_suite)
  end

  context 'when the check suite has already finished' do
    before do
      allow(check_suite).to receive(:finished?).and_return(true)
    end

    it 'returns false without rescheduling' do
      expect(described_class.timeout(check_suite.id)).to be_falsey
    end
  end

  context 'when the check suite is hanged and gets finished' do
    before do
      allow(check_suite).to receive(:finished?).and_return(false)
      allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
      allow(finished_instance).to receive(:finished).and_return([200, 'Finished'])
    end

    it 'returns true' do
      expect(described_class.timeout(check_suite.id)).to be_truthy
    end
  end

  context 'when a job was updated within the timeout window' do
    before do
      allow(check_suite).to receive(:finished?).and_return(false)
      allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc + 3.hours)
      allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
    end

    it 'reschedules the check and returns false' do
      expect(described_class.timeout(check_suite.id)).to be_falsey
    end
  end

  context 'when the check suite is hanged but finishing it does not succeed' do
    before do
      allow(check_suite).to receive(:finished?).and_return(false)
      allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
      # Any response other than [200, 'Finished'] must lead to a reschedule.
      allow(finished_instance).to receive(:finished).and_return([500, 'Internal Server Error'])
      # Stub only the delayed proxy; mocking TimeoutExecution.timeout itself
      # would replace the method under test and make the assertion vacuous.
      allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
    end

    it 'reschedules the check and returns false' do
      expect(described_class.timeout(check_suite.id)).to be_falsey
    end
  end
end

workers/timeout_execution.rb

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# SPDX-License-Identifier: BSD-2-Clause
2+
#
3+
# timeout_execution.rb
4+
# Part of NetDEF CI System
5+
#
6+
# Copyright (c) 2024 by
7+
# Network Device Education Foundation, Inc. ("NetDEF")
8+
#
9+
# frozen_string_literal: true
10+
11+
require_relative '../config/setup'
12+
13+
# Delayed worker that detects check suites whose execution stopped reporting
# progress ("hanged") and forces them through the finished-plan flow.
class TimeoutExecution
  class << self
    # Entry point invoked by the delayed job.
    #
    # @param check_suite_id [Integer] id of the CheckSuite to inspect
    # @return [Boolean] true when the suite was finished by this call; false
    #   when it had already finished or the check was rescheduled
    def timeout(check_suite_id)
      check_suite = CheckSuite.find(check_suite_id)
      already_finished = check_suite.finished?

      logger.info("Timeout execution for check_suite_id: #{check_suite_id} -> finished? #{already_finished}")

      return false if already_finished
      return rescheduling([], check_suite_id) if recently_updated?(check_suite)

      logger.info("Calling Github::PlanExecution::Finished.new(#{check_suite.bamboo_ci_ref}).finished")

      rescheduling(finished(check_suite), check_suite_id)
    end

    # Runs the finished-plan flow for a suite considered hanged.
    #
    # @param check_suite [CheckSuite]
    # @return [Object] whatever Github::PlanExecution::Finished#finished returns
    def finished(check_suite)
      # String keys on purpose: the consumer reads payload['hanged'], so a
      # Symbol key would leave the flag unset.
      Github::PlanExecution::Finished
        .new({ 'bamboo_ref' => check_suite.bamboo_ci_ref, 'hanged' => true })
        .finished
    end

    # Queues another timeout check unless the suite was successfully finished.
    #
    # @param resp [Array] response of the finished-plan flow ([] when skipped)
    # @param check_suite_id [Integer]
    # @return [Boolean] true when resp is [200, 'Finished'], false after rescheduling
    def rescheduling(resp, check_suite_id)
      return true if resp == [200, 'Finished']

      logger.info("Rescheduling check_suite_id: #{check_suite_id}")

      # Drop any timer already queued for this suite so only one stays pending.
      Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{check_suite_id}%").delete_all

      TimeoutExecution
        .delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
        .timeout(check_suite_id)

      false
    end

    private

    # True when some CI job of the suite was updated in the last two hours.
    # A suite without any job timestamp has shown no activity, so it is treated
    # as not recently updated instead of raising on the nil comparison.
    def recently_updated?(check_suite)
      last_update = check_suite.last_job_updated_at_timer
      !last_update.nil? && last_update > 2.hours.ago.utc
    end

    # Lazily created so every public method can log, whichever is called first.
    def logger
      @logger ||= GithubLogger.instance.create('timeout_execution_worker.log', Logger::INFO)
    end
  end
end

0 commit comments

Comments
 (0)