Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions crates/bashkit-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2194,7 +2194,7 @@ struct PyPrivateAsyncLoop {
// all run_until_complete calls happen on the thread that created the loop,
// satisfying asyncio's thread-affinity requirement (review comment on
// spawn_blocking scheduling successive callbacks on different OS threads).
worker_tx: StdMutex<Option<std::sync::mpsc::SyncSender<PrivateLoopWorkItem>>>,
worker_tx: StdMutex<Option<std::sync::mpsc::Sender<PrivateLoopWorkItem>>>,
// Cached Python helper for background-thread fallback (Jupyter/IPython compatibility).
bg_thread_runner: StdMutex<Option<Py<PyAny>>>,
}
Expand All @@ -2212,12 +2212,17 @@ impl PyPrivateAsyncLoop {
// keeps it there for the lifetime of the PyPrivateAsyncLoop. This pins all
// run_until_complete calls to a single thread, matching asyncio's thread-
// affinity contract.
fn ensure_worker_tx(&self) -> PyResult<std::sync::mpsc::SyncSender<PrivateLoopWorkItem>> {
fn ensure_worker_tx(&self) -> PyResult<std::sync::mpsc::Sender<PrivateLoopWorkItem>> {
let mut guard = self.worker_tx.lock().expect("private loop worker tx lock");
if let Some(ref tx) = *guard {
return Ok(tx.clone());
}
let (tx, rx) = std::sync::mpsc::sync_channel::<PrivateLoopWorkItem>(0);
// Unbounded channel: each caller enqueues exactly one item and then
// blocks on result_rx.recv(), so the queue depth equals the number of
// concurrent run_awaitable calls — naturally bounded by the execution
// model. Using unbounded (vs sync_channel(0)) ensures send() never
// blocks while the caller holds the GIL (TM-PY-030 variant 1).
let (tx, rx) = std::sync::mpsc::channel::<PrivateLoopWorkItem>();
Comment thread
chaliy marked this conversation as resolved.
std::thread::Builder::new()
.name("bashkit-py-loop".into())
.spawn(move || {
Expand Down Expand Up @@ -2329,21 +2334,20 @@ def _run(coro, ctx):

// Dispatch to the dedicated worker thread that owns the asyncio event loop.
// This ensures all run_until_complete calls happen on the same OS thread,
// preserving asyncio thread-affinity. The GIL is released while waiting so
// the worker thread can acquire it to run Python.
// preserving asyncio thread-affinity. The work channel is unbounded so
// sending cannot rendezvous with worker startup while the caller holds
// the GIL; the result wait below releases the GIL for Python execution.
let tx = self.ensure_worker_tx()?;
let (result_tx, result_rx) = std::sync::mpsc::sync_channel(0);
let item = PrivateLoopWorkItem {
awaitable: awaitable.clone_ref(py),
context: context.clone_ref(py),
result_tx,
};
// THREAT[TM-PY-030]: detach (release the GIL) around BOTH the send and
// the recv. The channel is a rendezvous (capacity 0), so send blocks
// until the worker receives — and on first use the worker must acquire
// the GIL to create its event loop before it ever reaches recv().
// Sending while attached therefore deadlocks the whole process:
// dispatcher holds the GIL waiting on send, worker waits on the GIL.
// THREAT[TM-PY-030]: detach (release the GIL) for the result rendezvous.
// The work channel is unbounded so send() returns immediately without
// blocking; only result_rx.recv() is a true blocking wait. GIL is still
// released across the entire block so the worker can attach at any point.
// `move` ensures result_rx (Send, not Sync) is owned by the closure.
py.detach(move || {
tx.send(item)
Expand Down
61 changes: 61 additions & 0 deletions crates/bashkit-python/tests/test_async_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
import asyncio
import contextvars
import gc
import glob as _glob
import os
import subprocess
import sys
import textwrap
import threading
import time
import weakref
Expand Down Expand Up @@ -39,6 +44,62 @@ def _collect_between_tests():
# ===========================================================================


def test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock():
"""First private-loop async callback returns instead of deadlocking the GIL."""

code = textwrap.dedent(
r"""
import asyncio
from bashkit import ScriptedTool

async def greet(params, stdin=None):
await asyncio.sleep(0)
return "ok\n"

tool = ScriptedTool("api")
tool.add_tool("greet", "Greet", callback=greet)
result = tool.execute_sync("greet")
print(result.exit_code)
print(result.stdout.strip())
"""
)

# Build a PYTHONPATH that points directly at the directory that contains
# both bashkit/__init__.py AND the compiled bashkit._bashkit extension.
# We scan sys.path with glob rather than using find_spec: pytest imports
# bashkit from the source tree before this test runs, so find_spec looks
# for _bashkit inside that source package and returns None — leaving
# pkg_root unset and the subprocess still pointing at the source tree.
pkg_root = next(
(
p
for p in sys.path
if p
and (
_glob.glob(os.path.join(p, "bashkit", "_bashkit*.so"))
or _glob.glob(os.path.join(p, "bashkit", "_bashkit*.pyd"))
)
),
None,
)
env = {
**os.environ,
"PYTHONPATH": (pkg_root + os.pathsep if pkg_root else "") + os.environ.get("PYTHONPATH", ""),
}
completed = subprocess.run(
[sys.executable, "-c", code],
capture_output=True,
text=True,
timeout=10.0,
check=False,
cwd="/",
env=env,
)

assert completed.returncode == 0, completed.stderr
assert completed.stdout.splitlines() == ["0", "ok"]


def test_async_callback_execute_sync_honors_timeout():
"""execute_sync() timeout preempts slow async callbacks on private loop."""

Expand Down
40 changes: 21 additions & 19 deletions specs/threat-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,7 @@ caller's GIL hold.

| TM-PY-026 | reset() discards security config | `BashTool.reset()` creates new `Bash` with bare builder, dropping all configured limits | `PyBash::reset` and `BashTool::reset` rebuild via `replace_live_bash_with_builder` + `build_live_builder`, which preserves the original limits, env, and registered builtins | **MITIGATED** |
| TM-PY-027 | Unbounded recursion in JSON conversion | `py_to_json`/`json_to_py` recurse without depth limit on nested dicts/lists | `json_to_py_inner`, `py_to_json_inner`, and the MontyObject converters all carry a `depth` arg; depth > `MAX_NESTING_DEPTH = 64` raises `ValueError("… nesting depth exceeds maximum of 64")` | **MITIGATED** |
| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Dispatch detaches around both the send and the receive; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** |
| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Work dispatch uses an unbounded `channel()` so send() never blocks; result rendezvous runs inside `py.detach(...)` so GIL is released for the blocking wait; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** |

**TM-PY-026** (mitigated): `PyBash::reset` and `BashTool::reset` (`crates/bashkit-python/src/lib.rs`)
rebuild the inner `Bash` via `replace_live_bash_with_builder` + `build_live_builder`, which
Expand All @@ -1758,25 +1758,27 @@ MontyObject converters in `crates/bashkit-python/src/lib.rs` carry a `depth: usi
At `depth > MAX_NESTING_DEPTH = 64`, conversion raises a Python `ValueError` instead of
recursing. Coverage: `tests/_security_advanced.py::JsonConversionBoundariesTests`.

**TM-PY-030** (mitigated): two deadlock variants in the async-callback private-loop
**TM-PY-030** (mitigated): three deadlock/crash variants in the async-callback private-loop
machinery (`crates/bashkit-python/src/lib.rs`). (1) `PyPrivateAsyncLoop::run_awaitable`
sent work to the dedicated worker thread over a `sync_channel(0)` rendezvous while
attached; on first use the worker must attach (acquire the GIL) to create its asyncio
loop before it can `recv()`, so dispatcher and worker waited on each other. The send
and receive now both run inside `py.detach(...)`. (2) Pyclass dealloc runs attached
and dropped the last `Arc<Runtime>`; tokio's default `Runtime::drop` joins in-flight
blocking tasks, and an abandoned (timed-out) callback task must re-attach to finish —
freezing the entire interpreter. The `PyRuntime` handle now shuts the runtime down
with `shutdown_background()` on last drop. (3) The private-loop worker thread called
`Python::attach` on its exit path to close its asyncio loop; the worker usually wakes
because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` —
attaching during finalization fatals CPython (`PyGILState_Release`, SIGABRT at
interpreter exit; `Python::try_attach` cannot detect finalization before 3.13). The
worker exit path no longer touches Python: the loop's `Py` ref is dropped unattached
(deferred decref) and the loop is closed by `BaseEventLoop.__del__`. Regression tests:
`tests/test_async_callbacks.py::test_async_callback_execute_sync_honors_timeout`,
`…::test_dealloc_during_inflight_callback_does_not_deadlock`; variant (3) is covered
by the `langgraph_async_tool.py` example run in the Python CI Examples job.
originally sent work over a `sync_channel(0)` rendezvous while attached; on first use the
worker must attach (acquire the GIL) to create its asyncio loop before it can `recv()`, so
dispatcher and worker waited on each other. The work dispatch now uses an unbounded
`channel()` so `send()` never blocks regardless of worker state; the result rendezvous
(`result_rx.recv()`) still runs inside `py.detach(...)` to release the GIL for the blocking
wait. Regression test: `test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock`.
(2) Pyclass dealloc runs attached and dropped the last `Arc<Runtime>`; tokio's default
`Runtime::drop` joins in-flight blocking tasks, and an abandoned (timed-out) callback task
must re-attach to finish — freezing the entire interpreter. The `PyRuntime` handle now shuts
the runtime down with `shutdown_background()` on last drop. Regression test:
`test_dealloc_during_inflight_callback_does_not_deadlock`. (3) The private-loop worker thread
called `Python::attach` on its exit path to close its asyncio loop; the worker usually wakes
because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` — attaching during
finalization fatals CPython (`PyGILState_Release`, SIGABRT at interpreter exit;
`Python::try_attach` cannot detect finalization before 3.13). The worker exit path no longer
touches Python: the loop's `Py` ref is dropped unattached (deferred decref) and the loop is
closed by `BaseEventLoop.__del__`. Variant (3) is covered by the `langgraph_async_tool.py`
example run in the Python CI Examples job. Regression test also:
`test_async_callback_execute_sync_honors_timeout`.

| TM-PY-029 | Host clock information disclosure | `datetime.date.today()` / `datetime.datetime.now()` expose host system time and timezone | Intentional — required for correct datetime semantics | **ACCEPTED** |

Expand Down
Loading