From 63ffd5564ce0da8cfed06822b1ebd7a53e2bc2b0 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Tue, 9 Jun 2026 16:56:34 -0500 Subject: [PATCH 1/7] fix(python): avoid async callback GIL deadlock --- crates/bashkit-python/src/lib.rs | 11 +++--- .../tests/test_async_callbacks.py | 35 +++++++++++++++++++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/crates/bashkit-python/src/lib.rs b/crates/bashkit-python/src/lib.rs index 75e70564..a3dfdc35 100644 --- a/crates/bashkit-python/src/lib.rs +++ b/crates/bashkit-python/src/lib.rs @@ -2194,7 +2194,7 @@ struct PyPrivateAsyncLoop { // all run_until_complete calls happen on the thread that created the loop, // satisfying asyncio's thread-affinity requirement (review comment on // spawn_blocking scheduling successive callbacks on different OS threads). - worker_tx: StdMutex>>, + worker_tx: StdMutex>>, // Cached Python helper for background-thread fallback (Jupyter/IPython compatibility). bg_thread_runner: StdMutex>>, } @@ -2212,12 +2212,12 @@ impl PyPrivateAsyncLoop { // keeps it there for the lifetime of the PyPrivateAsyncLoop. This pins all // run_until_complete calls to a single thread, matching asyncio's thread- // affinity contract. - fn ensure_worker_tx(&self) -> PyResult> { + fn ensure_worker_tx(&self) -> PyResult> { let mut guard = self.worker_tx.lock().expect("private loop worker tx lock"); if let Some(ref tx) = *guard { return Ok(tx.clone()); } - let (tx, rx) = std::sync::mpsc::sync_channel::(0); + let (tx, rx) = std::sync::mpsc::channel::(); std::thread::Builder::new() .name("bashkit-py-loop".into()) .spawn(move || { @@ -2329,8 +2329,9 @@ def _run(coro, ctx): // Dispatch to the dedicated worker thread that owns the asyncio event loop. // This ensures all run_until_complete calls happen on the same OS thread, - // preserving asyncio thread-affinity. The GIL is released while waiting so - // the worker thread can acquire it to run Python. + // preserving asyncio thread-affinity. The work channel is unbounded so + // sending cannot rendezvous with worker startup while the caller holds + // the GIL; the result wait below releases the GIL for Python execution. let tx = self.ensure_worker_tx()?; let (result_tx, result_rx) = std::sync::mpsc::sync_channel(0); let item = PrivateLoopWorkItem { diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index 430d3067..61900df7 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -11,6 +11,9 @@ import asyncio import contextvars import gc +import subprocess +import sys +import textwrap import threading import time import weakref @@ -39,6 +42,38 @@ def _collect_between_tests(): # =========================================================================== +def test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock(): + """First private-loop async callback returns instead of deadlocking the GIL.""" + + code = textwrap.dedent( + r""" + import asyncio + from bashkit import ScriptedTool + + async def greet(params, stdin=None): + await asyncio.sleep(0) + return "ok\n" + + tool = ScriptedTool("api") + tool.add_tool("greet", "Greet", callback=greet) + result = tool.execute_sync("greet") + print(result.exit_code) + print(result.stdout.strip()) + """ + ) + + completed = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + timeout=3.0, + check=False, + ) + + assert completed.returncode == 0, completed.stderr + assert completed.stdout.splitlines() == ["0", "ok"] + + def test_async_callback_execute_sync_honors_timeout(): """execute_sync() timeout preempts slow async callbacks on private loop.""" From 13fc9d39fb7e95d441439b339259b973e966e90c Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:11:32 +0000 Subject: [PATCH 2/7] fix(python): update TM-PY-030 comment for unbounded work channel The previous comment described a capacity-0 rendezvous channel and explained why GIL-detach was needed for the send. With the unbounded channel the send is non-blocking; GIL release is only required for the result rendezvous. Update the threat-model comment to match. --- crates/bashkit-python/src/lib.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/bashkit-python/src/lib.rs b/crates/bashkit-python/src/lib.rs index a3dfdc35..d73b03fc 100644 --- a/crates/bashkit-python/src/lib.rs +++ b/crates/bashkit-python/src/lib.rs @@ -2339,12 +2339,10 @@ def _run(coro, ctx): context: context.clone_ref(py), result_tx, }; - // THREAT[TM-PY-030]: detach (release the GIL) around BOTH the send and - // the recv. The channel is a rendezvous (capacity 0), so send blocks - // until the worker receives — and on first use the worker must acquire - // the GIL to create its event loop before it ever reaches recv(). - // Sending while attached therefore deadlocks the whole process: - // dispatcher holds the GIL waiting on send, worker waits on the GIL. + // THREAT[TM-PY-030]: detach (release the GIL) for the result rendezvous. + // The work channel is unbounded so send() returns immediately without + // blocking; only result_rx.recv() is a true blocking wait. GIL is still + // released across the entire block so the worker can attach at any point. // `move` ensures result_rx (Send, not Sync) is owned by the closure. py.detach(move || { tx.send(item) From 7fae4ed8e60a8289df97c07a2994fc165062d95f Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:13:12 +0000 Subject: [PATCH 3/7] chore(specs): update TM-PY-030 mitigation for unbounded work channel Variant (1) is now mitigated by using an unbounded channel() so send() cannot block, rather than solely relying on GIL detach around a rendezvous. Add regression test reference for the first-call deadlock. --- specs/threat-model.md | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/specs/threat-model.md b/specs/threat-model.md index 4ac136ea..fe6f531a 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -1745,7 +1745,7 @@ caller's GIL hold. | TM-PY-026 | reset() discards security config | `BashTool.reset()` creates new `Bash` with bare builder, dropping all configured limits | `PyBash::reset` and `BashTool::reset` rebuild via `replace_live_bash_with_builder` + `build_live_builder`, which preserves the original limits, env, and registered builtins | **MITIGATED** | | TM-PY-027 | Unbounded recursion in JSON conversion | `py_to_json`/`json_to_py` recurse without depth limit on nested dicts/lists | `json_to_py_inner`, `py_to_json_inner`, and the MontyObject converters all carry a `depth` arg; depth > `MAX_NESTING_DEPTH = 64` raises `ValueError("… nesting depth exceeds maximum of 64")` | **MITIGATED** | -| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Dispatch detaches around both the send and the receive; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** | +| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Work dispatch uses an unbounded `channel()` so send() never blocks; result rendezvous runs inside `py.detach(...)` so GIL is released for the blocking wait; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** | **TM-PY-026** (mitigated): `PyBash::reset` and `BashTool::reset` (`crates/bashkit-python/src/lib.rs`) rebuild the inner `Bash` via `replace_live_bash_with_builder` + `build_live_builder`, which @@ -1758,25 +1758,27 @@ MontyObject converters in `crates/bashkit-python/src/lib.rs` carry a `depth: usi At `depth > MAX_NESTING_DEPTH = 64`, conversion raises a Python `ValueError` instead of recursing. Coverage: `tests/_security_advanced.py::JsonConversionBoundariesTests`. -**TM-PY-030** (mitigated): two deadlock variants in the async-callback private-loop +**TM-PY-030** (mitigated): three deadlock/crash variants in the async-callback private-loop machinery (`crates/bashkit-python/src/lib.rs`). (1) `PyPrivateAsyncLoop::run_awaitable` -sent work to the dedicated worker thread over a `sync_channel(0)` rendezvous while -attached; on first use the worker must attach (acquire the GIL) to create its asyncio -loop before it can `recv()`, so dispatcher and worker waited on each other. The send -and receive now both run inside `py.detach(...)`. (2) Pyclass dealloc runs attached -and dropped the last `Arc`; tokio's default `Runtime::drop` joins in-flight -blocking tasks, and an abandoned (timed-out) callback task must re-attach to finish — -freezing the entire interpreter. The `PyRuntime` handle now shuts the runtime down -with `shutdown_background()` on last drop. (3) The private-loop worker thread called -`Python::attach` on its exit path to close its asyncio loop; the worker usually wakes -because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` — -attaching during finalization fatals CPython (`PyGILState_Release`, SIGABRT at -interpreter exit; `Python::try_attach` cannot detect finalization before 3.13). The -worker exit path no longer touches Python: the loop's `Py` ref is dropped unattached -(deferred decref) and the loop is closed by `BaseEventLoop.__del__`. Regression tests: -`tests/test_async_callbacks.py::test_async_callback_execute_sync_honors_timeout`, -`…::test_dealloc_during_inflight_callback_does_not_deadlock`; variant (3) is covered -by the `langgraph_async_tool.py` example run in the Python CI Examples job. +originally sent work over a `sync_channel(0)` rendezvous while attached; on first use the +worker must attach (acquire the GIL) to create its asyncio loop before it can `recv()`, so +dispatcher and worker waited on each other. The work dispatch now uses an unbounded +`channel()` so `send()` never blocks regardless of worker state; the result rendezvous +(`result_rx.recv()`) still runs inside `py.detach(...)` to release the GIL for the blocking +wait. Regression test: `test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock`. +(2) Pyclass dealloc runs attached and dropped the last `Arc`; tokio's default +`Runtime::drop` joins in-flight blocking tasks, and an abandoned (timed-out) callback task +must re-attach to finish — freezing the entire interpreter. The `PyRuntime` handle now shuts +the runtime down with `shutdown_background()` on last drop. Regression test: +`test_dealloc_during_inflight_callback_does_not_deadlock`. (3) The private-loop worker thread +called `Python::attach` on its exit path to close its asyncio loop; the worker usually wakes +because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` — attaching during +finalization fatals CPython (`PyGILState_Release`, SIGABRT at interpreter exit; +`Python::try_attach` cannot detect finalization before 3.13). The worker exit path no longer +touches Python: the loop's `Py` ref is dropped unattached (deferred decref) and the loop is +closed by `BaseEventLoop.__del__`. Variant (3) is covered by the `langgraph_async_tool.py` +example run in the Python CI Examples job. Regression test also: +`test_async_callback_execute_sync_honors_timeout`. | TM-PY-029 | Host clock information disclosure | `datetime.date.today()` / `datetime.datetime.now()` expose host system time and timezone | Intentional — required for correct datetime semantics | **ACCEPTED** | From 7f86446c92a2cfc1d5db663e28ede710d62b8e0c Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:18:55 +0000 Subject: [PATCH 4/7] fix(python): fix subprocess test env and address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Propagate sys.path via PYTHONPATH to subprocess so it finds the compiled native extension in CI (maturin PYTHONPATH-only setup) - Increase subprocess timeout from 3s to 10s to avoid flakiness on loaded CI runners during extension startup - Add comment to unbounded channel explaining natural queue-depth bound (each caller blocks on result_rx.recv(), so queue depth ≤ concurrent calls) --- crates/bashkit-python/src/lib.rs | 5 +++++ crates/bashkit-python/tests/test_async_callbacks.py | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/bashkit-python/src/lib.rs b/crates/bashkit-python/src/lib.rs index d73b03fc..c0649735 100644 --- a/crates/bashkit-python/src/lib.rs +++ b/crates/bashkit-python/src/lib.rs @@ -2217,6 +2217,11 @@ impl PyPrivateAsyncLoop { if let Some(ref tx) = *guard { return Ok(tx.clone()); } + // Unbounded channel: each caller enqueues exactly one item and then + // blocks on result_rx.recv(), so the queue depth equals the number of + // concurrent run_awaitable calls — naturally bounded by the execution + // model. Using unbounded (vs sync_channel(0)) ensures send() never + // blocks while the caller holds the GIL (TM-PY-030 variant 1). let (tx, rx) = std::sync::mpsc::channel::(); std::thread::Builder::new() .name("bashkit-py-loop".into()) diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index 61900df7..5c3eacc1 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -11,6 +11,7 @@ import asyncio import contextvars import gc +import os import subprocess import sys import textwrap @@ -62,12 +63,17 @@ async def greet(params, stdin=None): """ ) + # Propagate sys.path via PYTHONPATH so the subprocess finds the compiled + # native extension (bashkit._bashkit) even when it isn't installed + # system-wide (e.g. maturin develop in a PYTHONPATH-only CI setup). + env = {**os.environ, "PYTHONPATH": os.pathsep.join(p for p in sys.path if p)} completed = subprocess.run( [sys.executable, "-c", code], capture_output=True, text=True, - timeout=3.0, + timeout=10.0, check=False, + env=env, ) assert completed.returncode == 0, completed.stderr From 7b6fc5625f73c3b222e232fdad4b863522661910 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:28:23 +0000 Subject: [PATCH 5/7] fix(python): use find_spec to locate bashkit extension for subprocess test The previous PYTHONPATH propagation (sys.path join) injected source-tree paths before site-packages, causing the subprocess to find bashkit/__init__.py from the source directory but fail to find _bashkit.so (which is only in site-packages after `pip install`). Use importlib.util.find_spec('bashkit._bashkit') to locate the .so in the same installation the test process uses, then prepend its parent directory to PYTHONPATH. Works for both pip-installed (site-packages) and maturin- develop (source tree) setups. --- .../tests/test_async_callbacks.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index 5c3eacc1..1536c81c 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -11,6 +11,7 @@ import asyncio import contextvars import gc +import importlib.util import os import subprocess import sys @@ -63,10 +64,19 @@ async def greet(params, stdin=None): """ ) - # Propagate sys.path via PYTHONPATH so the subprocess finds the compiled - # native extension (bashkit._bashkit) even when it isn't installed - # system-wide (e.g. maturin develop in a PYTHONPATH-only CI setup). - env = {**os.environ, "PYTHONPATH": os.pathsep.join(p for p in sys.path if p)} + # Build a PYTHONPATH that points directly at the directory that contains + # both bashkit/__init__.py AND the compiled bashkit._bashkit extension. + # We resolve this via find_spec on the native module so we always use the + # same installation as the running test process, regardless of whether + # bashkit was installed via `pip install` (site-packages) or + # `maturin develop` (source tree). A naive sys.path propagation would + # put source-tree paths first, shadowing the installed .so. + spec = importlib.util.find_spec("bashkit._bashkit") + pkg_root = os.path.dirname(os.path.dirname(spec.origin)) if spec and spec.origin else None + env = { + **os.environ, + "PYTHONPATH": (pkg_root + os.pathsep if pkg_root else "") + os.environ.get("PYTHONPATH", ""), + } completed = subprocess.run( [sys.executable, "-c", code], capture_output=True, From ebd7d579b5d73c013cd0de5f218847edebc6cef3 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:35:46 +0000 Subject: [PATCH 6/7] test(python): fix subprocess PYTHONPATH using glob scan instead of find_spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit find_spec("bashkit._bashkit") returns None in CI because pytest already imports bashkit from the source tree; the submodule lookup resolves in that context, finds no .so, and returns None — leaving PYTHONPATH unset and the subprocess still hitting the source tree. Switch to scanning sys.path with glob.glob for _bashkit*.so / *.pyd: this is independent of Python import state and correctly finds site-packages (pip install) or the source tree (maturin develop) in all environments. --- .../tests/test_async_callbacks.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index 1536c81c..d810689e 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -11,7 +11,7 @@ import asyncio import contextvars import gc -import importlib.util +import glob as _glob import os import subprocess import sys @@ -66,13 +66,22 @@ async def greet(params, stdin=None): # Build a PYTHONPATH that points directly at the directory that contains # both bashkit/__init__.py AND the compiled bashkit._bashkit extension. - # We resolve this via find_spec on the native module so we always use the - # same installation as the running test process, regardless of whether - # bashkit was installed via `pip install` (site-packages) or - # `maturin develop` (source tree). A naive sys.path propagation would - # put source-tree paths first, shadowing the installed .so. - spec = importlib.util.find_spec("bashkit._bashkit") - pkg_root = os.path.dirname(os.path.dirname(spec.origin)) if spec and spec.origin else None + # We scan sys.path with glob rather than using find_spec: pytest imports + # bashkit from the source tree before this test runs, so find_spec looks + # for _bashkit inside that source package and returns None — leaving + # pkg_root unset and the subprocess still pointing at the source tree. + pkg_root = next( + ( + p + for p in sys.path + if p + and ( + _glob.glob(os.path.join(p, "bashkit", "_bashkit*.so")) + or _glob.glob(os.path.join(p, "bashkit", "_bashkit*.pyd")) + ) + ), + None, + ) env = { **os.environ, "PYTHONPATH": (pkg_root + os.pathsep if pkg_root else "") + os.environ.get("PYTHONPATH", ""), From 41993138a149894d7d5d5c967f7c030be68b5a00 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 10 Jun 2026 09:44:12 +0000 Subject: [PATCH 7/7] test(python): fix subprocess CWD shadowing installed package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pytest runs from crates/bashkit-python/, so the subprocess inherits that CWD. Python always puts '' (= CWD) as sys.path[0], placing the source tree before any PYTHONPATH entries — including the site-packages path we derive from the glob scan. Result: subprocess finds bashkit/__init__.py from source but not _bashkit.so, causing ModuleNotFoundError. Pass cwd='/' so '' resolves to a neutral directory with no bashkit/ subdir. The PYTHONPATH (site-packages or maturin-develop source tree, found via glob) is then the first real bashkit candidate and the .so is always found. --- crates/bashkit-python/tests/test_async_callbacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index d810689e..883a1f47 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -92,6 +92,7 @@ async def greet(params, stdin=None): text=True, timeout=10.0, check=False, + cwd="/", env=env, )