@@ -52,6 +52,12 @@ def __init__(self) -> None:
5252 self .unsubscribe_called = threading .Event ()
5353 self .subscription = BlockingSubscription ()
5454
def fileno(self) -> int:
    """Return a placeholder file descriptor for the fake LCM.

    LCMService.start() calls this to pre-warm LCM receive setup. A real LCM
    sets up sockets and runs a self-test at this point; the fake has nothing
    to prepare, and nothing will select() on the returned value, so a
    sentinel is sufficient.
    """
    sentinel_fd = -1
    return sentinel_fd
60+
5561 def handle_timeout (self , _timeout : int ) -> None :
5662 self .handle_entered .set ()
5763 self .release_handle .wait (timeout = 1.0 )
@@ -323,7 +329,7 @@ def fake_pub_sub(fake_lcm):
323329 pubsub .stop ()
324330
325331
326- def test_publish_proceeds_during_handle_loop (fake_lcm , fake_pub_sub ):
332+ def test_publish_waits_for_handle_loop (fake_lcm , fake_pub_sub ):
327333 assert fake_lcm .handle_entered .wait (timeout = 0.5 )
328334
329335 publisher = threading .Thread (
@@ -332,14 +338,20 @@ def test_publish_proceeds_during_handle_loop(fake_lcm, fake_pub_sub):
332338 )
333339 publisher .start ()
334340
335- assert fake_lcm . publish_called . wait ( timeout = 0.5 )
336- publisher . join (timeout = 1.0 )
337- assert not publisher .is_alive ()
341+ # Loop holds _l_lock inside handle_timeout; publish waits for the lock.
342+ assert not fake_lcm . publish_called . wait (timeout = 0.1 )
343+ assert publisher .is_alive ()
338344
345+ # Releasing handle_timeout drops _l_lock; publish now proceeds.
339346 fake_lcm .release_handle .set ()
340347
348+ assert fake_lcm .publish_called .wait (timeout = 1.0 )
349+ publisher .join (timeout = 1.0 )
350+ assert not publisher .is_alive ()
341351
342- def test_subscribe_proceeds_during_handle_loop (fake_lcm , fake_pub_sub ):
352+
353+ def test_subscribe_waits_for_handle_loop (fake_lcm , fake_pub_sub ):
354+ """subscribe() must block while the loop thread is inside handle_timeout()."""
343355 assert fake_lcm .handle_entered .wait (timeout = 0.5 )
344356
345357 subscriber = threading .Thread (
@@ -348,16 +360,27 @@ def test_subscribe_proceeds_during_handle_loop(fake_lcm, fake_pub_sub):
348360 )
349361 subscriber .start ()
350362
351- assert fake_lcm .subscribe_called .wait (timeout = 0.5 )
363+ assert not fake_lcm .subscribe_called .wait (timeout = 0.1 )
364+ assert subscriber .is_alive ()
365+
366+ fake_lcm .release_handle .set ()
367+
368+ assert fake_lcm .subscribe_called .wait (timeout = 1.0 )
352369 subscriber .join (timeout = 1.0 )
353370 assert not subscriber .is_alive ()
354371 assert fake_lcm .subscription .queue_capacity == 10000
355372
356- fake_lcm .release_handle .set ()
357373
374+ def test_unsubscribe_waits_for_handle_loop (fake_lcm , fake_pub_sub ):
375+ """unsubscribe() must block while the loop thread is inside handle_timeout().
358376
359- def test_unsubscribe_proceeds_during_handle_loop (fake_lcm , fake_pub_sub ):
377+ This is the specific race whose resolution fixes the segfault in
378+ pylcm.c. Unsubscribing from another thread while dispatch is running
379+ would set subs_obj->lcm_obj = NULL under the nose of pylcm_msg_handler.
380+ """
381+ # Let the first handle_timeout complete so we can subscribe cleanly.
360382 assert fake_lcm .handle_entered .wait (timeout = 0.5 )
383+ fake_lcm .release_handle .set ()
361384
362385 unsubscribe_holder : dict [str , object ] = {}
363386
@@ -366,20 +389,29 @@ def do_subscribe() -> None:
366389
367390 subscriber = threading .Thread (target = do_subscribe , daemon = True )
368391 subscriber .start ()
369- assert fake_lcm .subscribe_called .wait (timeout = 0.5 )
392+ assert fake_lcm .subscribe_called .wait (timeout = 1.0 )
370393 subscriber .join (timeout = 1.0 )
371394 assert not subscriber .is_alive ()
372395
396+ # Reset gates so the next handle_timeout iteration blocks again.
397+ fake_lcm .handle_entered .clear ()
398+ fake_lcm .release_handle .clear ()
399+ assert fake_lcm .handle_entered .wait (timeout = 1.0 )
400+
373401 unsubscribe = unsubscribe_holder ["fn" ]
374402 unsub_thread = threading .Thread (target = unsubscribe , daemon = True ) # type: ignore[arg-type]
375403 unsub_thread .start ()
376404
377- assert fake_lcm . unsubscribe_called . wait ( timeout = 0.5 )
378- unsub_thread . join (timeout = 1.0 )
379- assert not unsub_thread .is_alive ()
405+ # Loop holds _l_lock; unsubscribe waits for the lock.
406+ assert not fake_lcm . unsubscribe_called . wait (timeout = 0.1 )
407+ assert unsub_thread .is_alive ()
380408
381409 fake_lcm .release_handle .set ()
382410
411+ assert fake_lcm .unsubscribe_called .wait (timeout = 1.0 )
412+ unsub_thread .join (timeout = 1.0 )
413+ assert not unsub_thread .is_alive ()
414+
383415
384416def test_stop_from_within_lcm_thread (mocker ):
385417 """stop() called from inside handle_timeout must not deadlock and must
@@ -391,6 +423,9 @@ class SelfStoppingLCM:
391423 def __init__ (self ) -> None :
392424 self .done = threading .Event ()
393425
def fileno(self) -> int:
    """Return -1 as a stand-in descriptor; this fake owns no real socket."""
    return -1
428+
394429 def handle_timeout (self , _timeout : int ) -> None :
395430 if not self .done .is_set ():
396431 captured ["thread" ] = threading .current_thread ()
@@ -420,3 +455,57 @@ def unsubscribe(self, *_args: object) -> None:
420455 assert not thread .is_alive ()
421456 assert service .l is None
422457 assert service ._thread is None
458+
459+
def test_handler_can_publish_via_rlock_reentry(mocker):
    """Check that a handler dispatched by the loop thread can publish.

    A message handler dispatched from handle_timeout runs on the loop
    thread while it already holds _l_lock. Reentry must work so the handler
    can call self.publish/subscribe/unsubscribe. This is why _l_lock is an
    RLock rather than a plain Lock.

    Args:
        mocker: pytest-mock fixture, used to replace the LCM constructor with
            a fake that dispatches one message to the registered handler.
    """
    publish_calls: list[tuple[str, bytes]] = []
    handler_done = threading.Event()

    class ReentrantLCM:
        """Fake LCM that delivers exactly one message to its handler."""

        def __init__(self) -> None:
            self._handler = None
            self._dispatched = False
            self._subscription = BlockingSubscription()

        def fileno(self) -> int:
            # Placeholder descriptor value; this fake owns no real socket.
            return -1

        def handle_timeout(self, _timeout: int) -> None:
            # Dispatch one fake message on the first call after a handler
            # has been registered. The handler will call self.publish, which
            # must reenter _l_lock recursively on the loop thread.
            if self._handler is not None and not self._dispatched:
                self._dispatched = True
                self._handler("/req", b"req-payload")
                handler_done.set()

        def publish(self, channel: str, message: bytes) -> None:
            # Record the call so the test can assert on it afterwards.
            publish_calls.append((channel, message))

        def subscribe(self, _channel, handler) -> BlockingSubscription:
            # Remember the handler so handle_timeout can dispatch to it.
            self._handler = handler
            return self._subscription

        def unsubscribe(self, _subscription) -> None:
            pass

    fake = ReentrantLCM()
    mocker.patch("dimos.protocol.service.lcmservice.lcm_mod.LCM", return_value=fake)

    pubsub = LCMPubSubBase()

    def reentrant_callback(_msg: bytes, _topic: Topic) -> None:
        # Runs inside the fake's handle_timeout, i.e. on the loop thread.
        pubsub.publish(Topic("/res"), b"res-payload")

    pubsub.start()
    try:
        pubsub.subscribe(Topic("/req"), reentrant_callback)
        assert handler_done.wait(timeout=2.0), (
            "handler was not dispatched or did not return within 2s"
        )
    finally:
        # Always stop the service so a failed assertion above does not leave
        # the started pubsub running into later tests.
        # NOTE(review): assumes stop() returns even when the loop thread is
        # stuck inside handle_timeout -- confirm against LCMService.stop().
        pubsub.stop()

    assert ("/res", b"res-payload") in publish_calls, (
        "handler's reentrant publish never reached the fake LCM"
    )
0 commit comments