Skip to content

Commit 901208a

Browse files
bghgaryCopilot
andauthored
Clean up deadlock regression test (#151)
Clean up the deadlock regression test from #147. ## Changes - Run entire test on a separate thread so gtest can detect deadlock via timeout - Install hook after initialization (no hookEnabled flag needed) - Hook only fires once (early return after first invocation) to prevent secondary deadlock from the hook sleeping on subsequent `wait()` calls - Use plain `bool` for hookSignaled (only accessed by worker thread) - Expanded comment on the sleep explaining non-determinism and its purpose - Update comments and flow diagram to match code ## Dependency Updates - Bump arcana.cpp GIT_TAG to b9bf9d8 (renamed `ARCANA_TESTING_HOOKS` to `ARCANA_TEST_HOOKS`, hooks moved to `arcana::test_hooks::blocking_concurrent_queue` namespace) - Includes merge from main (#149 WorkQueue merge, #152 perf test flake fix) ## Verification - **Fixed code (#151)**: 19/19 CI green — test passes - **Broken code (#146)**: 12 deadlock failures as expected — test catches the bug --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d0926d4 commit 901208a

3 files changed

Lines changed: 80 additions & 63 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ FetchContent_Declare(AndroidExtensions
1515
EXCLUDE_FROM_ALL)
1616
FetchContent_Declare(arcana.cpp
1717
GIT_REPOSITORY https://github.com/microsoft/arcana.cpp.git
18-
GIT_TAG d5dd03cc6dd138fc17c277f61abbe2bc388444af
18+
GIT_TAG b9bf9d85fce37d5fc9dbfc4a4dc5e1531bee215a
1919
EXCLUDE_FROM_ALL)
2020
FetchContent_Declare(asio
2121
GIT_REPOSITORY https://github.com/chriskohlhoff/asio.git
@@ -114,7 +114,7 @@ endif()
114114
# --------------------------------------------------
115115

116116
if(JSRUNTIMEHOST_TESTS)
117-
add_compile_definitions(ARCANA_TESTING_HOOKS)
117+
add_compile_definitions(ARCANA_TEST_HOOKS)
118118
endif()
119119

120120
FetchContent_MakeAvailable_With_Message(arcana.cpp)

Tests/UnitTests/Android/app/src/main/cpp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ add_library(UnitTestsJNI SHARED
2222
${UNIT_TESTS_DIR}/Shared/Shared.cpp)
2323

2424
target_compile_definitions(UnitTestsJNI PRIVATE JSRUNTIMEHOST_PLATFORM="${JSRUNTIMEHOST_PLATFORM}")
25-
target_compile_definitions(UnitTestsJNI PRIVATE ARCANA_TESTING_HOOKS)
25+
target_compile_definitions(UnitTestsJNI PRIVATE ARCANA_TEST_HOOKS)
2626

2727
target_include_directories(UnitTestsJNI
2828
PRIVATE ${UNIT_TESTS_DIR})

Tests/UnitTests/Shared/Shared.cpp

Lines changed: 77 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -125,82 +125,99 @@ TEST(Console, Log)
125125

126126
TEST(AppRuntime, DestroyDoesNotDeadlock)
127127
{
128-
// Deterministic test for the race condition in the AppRuntime destructor.
128+
// Regression test verifying AppRuntime destruction doesn't deadlock.
129+
// Uses a global arcana hook to sleep while holding the queue mutex
130+
// before wait(), ensuring the worker is in the vulnerable window
131+
// when the destructor fires. See #147 for details on the bug and fix.
129132
//
130-
// A global hook sleeps WHILE HOLDING the queue mutex, right before
131-
// condition_variable::wait(). We synchronize so the worker is definitely
132-
// in the hook before triggering destruction.
133+
// The entire test runs on a separate thread so the gtest thread can
134+
// detect a deadlock via timeout without hanging the process.
133135
//
134-
// Old (broken) code: cancel() + notify_all() fire without the mutex,
135-
// so the notification is lost while the worker sleeps → deadlock.
136-
// Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex,
137-
// so it blocks until the worker enters wait() → notification delivered.
138-
139-
// Shared state for hook synchronization
140-
std::atomic<bool> hookEnabled{false};
141-
std::atomic<bool> hookSignaled{false};
136+
// Test flow:
137+
//
138+
// Test Thread Worker Thread
139+
// ----------- -------------
140+
// 1. Create AppRuntime Worker starts, enters blocking_tick
141+
// Wait for init to complete
142+
// 2. Install hook
143+
// Dispatch(no-op) Worker wakes, runs no-op,
144+
// returns to blocking_tick
145+
// Hook fires:
146+
// signal workerInHook
147+
// sleep 200ms (holding mutex!)
148+
// 3. workerInHook.wait()
149+
// Worker is sleeping in hook
150+
// 4. ~AppRuntime():
151+
// cancel()
152+
// Append(no-op):
153+
// push() blocks ------> (worker holds mutex)
154+
// 200ms sleep ends
155+
// wait(lock) releases mutex
156+
// push() acquires mutex
157+
// pushes, notifies ---> wakes up!
158+
// join() waits drains no-op, cancelled -> exit
159+
// join() returns <----- thread exits
160+
// 5. destroy completes -> PASS
161+
162+
bool hookSignaled{false};
142163
std::promise<void> workerInHook;
164+
std::promise<void> testDone;
165+
166+
// Run the full lifecycle on a separate thread so the gtest thread
167+
// can detect a deadlock via timeout.
168+
std::thread testThread([&]() {
169+
auto runtime = std::make_unique<Babylon::AppRuntime>();
170+
171+
// Wait for the runtime to fully initialize. The constructor dispatches
172+
// CreateForJavaScript which must complete before we install the hook
173+
// so the worker is idle and ready to enter the hook on the next wait.
174+
std::promise<void> ready;
175+
runtime->Dispatch([&ready](Napi::Env) {
176+
ready.set_value();
177+
});
178+
ready.get_future().wait();
143179

144-
// Set the callback. It checks hookEnabled so we control
145-
// when it actually sleeps.
146-
arcana::set_before_wait_callback([&]() {
147-
if (hookEnabled.load() && !hookSignaled.exchange(true))
148-
{
180+
// Install the hook and dispatch a no-op to wake the worker,
181+
// ensuring it cycles through the hook on its way back to idle.
182+
arcana::test_hooks::blocking_concurrent_queue::set_before_wait_callback([&]() {
183+
if (hookSignaled)
184+
{
185+
return;
186+
}
187+
hookSignaled = true;
149188
workerInHook.set_value();
150-
}
151-
if (hookEnabled.load())
152-
{
189+
// This sleep is not truly deterministic. Its purpose is to hold the
190+
// mutex long enough for runtime.reset() (called by the test thread
191+
// after workerInHook signals) to reach push() while the mutex is
192+
// still held. When the sleep ends, the worker enters wait() which
193+
// releases the mutex, allowing push() to acquire it and deliver the
194+
// wake-up notification. If runtime.reset() hasn't reached push()
195+
// by the time the sleep ends, the test still passes but doesn't
196+
// exercise the intended contention window.
153197
std::this_thread::sleep_for(std::chrono::milliseconds(200));
154-
}
155-
});
198+
});
199+
runtime->Dispatch([](Napi::Env) {});
156200

157-
auto runtime = std::make_unique<Babylon::AppRuntime>();
201+
// Wait for the worker to be in the hook (holding mutex, sleeping)
202+
workerInHook.get_future().wait();
158203

159-
// Dispatch work and wait for completion
160-
std::promise<void> ready;
161-
runtime->Dispatch([&ready](Napi::Env) {
162-
ready.set_value();
204+
// Destroy — if the fix works, the destructor completes.
205+
// If broken, it deadlocks and the timeout detects it.
206+
runtime.reset();
207+
testDone.set_value();
163208
});
164-
ready.get_future().wait();
165-
166-
// Enable the hook and dispatch a no-op to wake the worker,
167-
// ensuring it cycles through the hook on its way back to idle
168-
hookEnabled.store(true);
169-
runtime->Dispatch([](Napi::Env) {});
170209

171-
// Wait for the worker to be in the hook (holding mutex, sleeping)
172-
auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5));
173-
if (hookStatus == std::future_status::timeout)
174-
{
175-
// Hook didn't fire — no deadlock risk, clean up normally
176-
arcana::set_before_wait_callback([]() {});
177-
FAIL() << "Worker thread did not enter before-wait hook";
178-
}
210+
auto status = testDone.get_future().wait_for(std::chrono::seconds(5));
179211

180-
// Worker is in the hook (holding mutex, sleeping). Destroy on a
181-
// detachable thread so the test doesn't hang if the destructor deadlocks.
182-
auto runtimePtr = std::make_shared<std::unique_ptr<Babylon::AppRuntime>>(std::move(runtime));
183-
std::promise<void> destroyDone;
184-
auto destroyFuture = destroyDone.get_future();
185-
std::thread destroyThread([runtimePtr, &destroyDone]() {
186-
runtimePtr->reset();
187-
destroyDone.set_value();
188-
});
212+
arcana::test_hooks::blocking_concurrent_queue::set_before_wait_callback([]() {});
189213

190-
auto status = destroyFuture.wait_for(std::chrono::seconds(5));
191214
if (status == std::future_status::timeout)
192215
{
193-
destroyThread.detach();
194-
}
195-
else
196-
{
197-
destroyThread.join();
216+
testThread.detach();
217+
FAIL() << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
198218
}
199219

200-
arcana::set_before_wait_callback([]() {});
201-
202-
ASSERT_NE(status, std::future_status::timeout)
203-
<< "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
220+
testThread.join();
204221
}
205222

206223
int RunTests()

0 commit comments

Comments
 (0)