Skip to content

Commit 5dee961

Browse files
authored
Fix config entry FAILED_UNLOAD on hung BLE disconnect, add disconnect tracing (#359)
`BleakClient.disconnect()` can block until the connect timeout (default 20s) when a write-with-response is still pending on the transport after a mid-auth BLE drop, which is common through an ESPHome proxy. `Connection.disconnect()` awaited it with no deadline, so `async_unload_entry` could outrun HA's unload window and leave the entry in `ConfigEntryState.FAILED_UNLOAD` - which only a full restart recovers from, re-triggering the same flap. Every client teardown now goes through a bounded `_disconnect_client()` helper (`asyncio.timeout`, `DISCONNECT_TIMEOUT = 5s`) that swallows the usual "already down" errors and, on timeout, gives up waiting and lets the caller finish local cleanup - the transport drains on its own once the drop is detected. This enforces the deadline in the library where the hang actually happens, so all five teardown paths benefit and no `async_unload_entry` guard is needed. While here, make disconnects traceable from a diagnostics download alone (no debug logging required), since the original report was hard to pin down: - `disconnect()` and the bleak `disconnected()` callback record the caller chain as the `reason` on the `DISCONNECTING` / `DISCONNECTED` entries in `connection_state_history`, so a requested disconnect (unload, reload) is distinguishable from an unsolicited bleak drop. - `_disconnect_client()` records each outcome (`ok` / `timeout` / `already_down`) with its trigger into a bounded `disconnect_log` that is included in the diagnostics dump, so timeouts and which path caused them are visible across every teardown path. Resolves #338. Reimplements #339 with the fix moved into the library and the root cause framed correctly. Should also address #366
1 parent 2828ce8 commit 5dee961

4 files changed

Lines changed: 95 additions & 29 deletions

File tree

custom_components/ef_ble/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,10 @@ async def async_unload_entry(hass: HomeAssistant, entry: DeviceConfigEntry) -> b
192192
"""Unload a config entry."""
193193
_cancel_reappear_callback(hass, entry)
194194
device = entry.runtime_data
195-
await device.disconnect()
195+
try:
196+
await device.disconnect()
197+
except Exception:
198+
_LOGGER.exception("Error disconnecting device during unload, continuing")
196199
device.with_logging_options(LogOptions.no_options())
197200
return await hass.config_entries.async_unload_platforms(entry, PLATFORMS)
198201

custom_components/ef_ble/eflib/connection.py

Lines changed: 71 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from dataclasses import dataclass
1212
from enum import StrEnum, auto
1313
from functools import cached_property
14-
from typing import Literal, Self
14+
from typing import Any, Literal, Self
1515

1616
import ecdsa
1717
from bleak import BleakClient
@@ -43,13 +43,19 @@
4343
SimplePacketAssembler,
4444
)
4545
from .listeners import ListenerGroup, ListenerRegistry
46-
from .logging_util import ConnectionLogger, LogOptions
46+
from .logging_util import ConnectionLogger, LogOptions, caller_chain
4747
from .packet import Packet
4848
from .props.utils import classproperty
4949

5050
MAX_RECONNECT_ATTEMPTS = 2
5151
MAX_CONNECTION_ATTEMPTS = 10
5252

53+
# `BleakClient.disconnect()` can block until the connect timeout (default 20s) when a
54+
# write-with-response is still pending on the transport after a mid-auth BLE drop
55+
# (notably through an ESPHome proxy). Left unbounded it stalls `async_unload_entry`
56+
# long enough for HA to mark the entry `FAILED_UNLOAD`, so cap every disconnect.
57+
DISCONNECT_TIMEOUT = 5.0
58+
5359

5460
_BT_PROTOCOL_UUIDS = {
5561
"rfcomm": {
@@ -236,6 +242,7 @@ def __init__(
236242

237243
self._errors = 0
238244
self._last_errors = deque(maxlen=10)
245+
self._disconnect_log: deque[dict[str, Any]] = deque(maxlen=10)
239246
self._client = None
240247
self._connected = asyncio.Event()
241248
self._disconnected = asyncio.Event()
@@ -257,6 +264,7 @@ def __init__(
257264
self._reconnect = True
258265

259266
self._connection_state: ConnectionState = None # pyright: ignore[reportAttributeAccessIssue]
267+
self._state_reason: str | None = None
260268
self._set_state(ConnectionState.CREATED)
261269

262270
@property
@@ -387,8 +395,7 @@ async def connect(
387395
self._set_state(ConnectionState.ERROR_BLEAK, e)
388396

389397
if error is not None:
390-
if self._client is not None and self._client.is_connected:
391-
await self._client.disconnect()
398+
await self._disconnect_client()
392399

393400
self._logger.error("Failed to connect to the device: %s", error)
394401
self._last_errors.append(f"Failed to connect to the device: {error}")
@@ -405,7 +412,10 @@ async def connect(
405412
await self.initBleSessionKey()
406413

407414
def disconnected(self, *args, **kwargs) -> None:
408-
self._logger.warning("Disconnected from device")
415+
# Traces the trigger: an unsolicited bleak drop shows bleak/asyncio frames here,
416+
# whereas a drop we requested shows our own `disconnect` chain.
417+
trigger = caller_chain()
418+
self._logger.warning("Disconnected from device (%s)", trigger)
409419
self._client = None
410420

411421
# NOTE(gnox): don't trigger disconnect/reconnect logic while
@@ -422,7 +432,7 @@ def disconnected(self, *args, **kwargs) -> None:
422432
self._disconnected.set()
423433
if self._state is not ConnectionState.DISCONNECTING:
424434
self._notify_disconnect()
425-
self._set_state(ConnectionState.DISCONNECTED)
435+
self._set_state(ConnectionState.DISCONNECTED, reason=trigger)
426436
return
427437

428438
if self._reconnect_task is not None:
@@ -476,31 +486,62 @@ async def reconnect(self) -> None:
476486
self._set_state(ConnectionState.RECONNECTING)
477487
await self.connect()
478488

479-
async def disconnect(self) -> None:
480-
self._logger.info(msg="Disconnecting from device")
489+
async def _disconnect_client(self) -> None:
490+
if self._client is None or not self._client.is_connected:
491+
return
492+
trigger = caller_chain()
493+
self._logger.debug("Disconnecting BLE client (%s)", trigger)
494+
outcome = "ok"
495+
try:
496+
async with asyncio.timeout(DISCONNECT_TIMEOUT):
497+
await self._client.disconnect()
498+
except (EOFError, BleakError) as e:
499+
outcome = f"already_down: {e}"
500+
self._logger.warning("Disconnect failed (already down): %s", e)
501+
except TimeoutError:
502+
outcome = "timeout"
503+
self._logger.warning(
504+
"BleakClient.disconnect() did not return within %ss (%s); continuing "
505+
"with local cleanup (write-with-response likely still pending after a "
506+
"mid-auth BLE drop)",
507+
DISCONNECT_TIMEOUT,
508+
trigger,
509+
)
510+
except (OSError, RuntimeError) as e:
511+
outcome = f"transport_broken: {e}"
512+
self._logger.warning(
513+
"BleakClient.disconnect() raised %s (%s); the BLE transport is broken, "
514+
"continuing with local cleanup",
515+
type(e).__name__,
516+
trigger,
517+
)
518+
self._disconnect_log.append(
519+
{"time": time.time(), "trigger": trigger, "outcome": outcome}
520+
)
521+
522+
@property
523+
def disconnect_log(self) -> list[dict[str, Any]]:
524+
"""Recent BLE client disconnect outcomes, for diagnostics"""
525+
return list(self._disconnect_log)
526+
527+
async def disconnect(self, reason: str | None = None) -> None:
528+
self._logger.info("Disconnecting from device (%s)", reason or "no reason given")
481529
self._retry_on_disconnect = False
482530

483531
self._reconnect_attempt = 0
484532
self._cancel_tasks()
485533

486534
if self._client is not None and self._client.is_connected:
487-
self._set_state(ConnectionState.DISCONNECTING)
488-
try:
489-
await self._client.disconnect()
490-
except (EOFError, BleakError) as e:
491-
self._logger.debug("Disconnect failed (already down): %s", e)
535+
self._set_state(ConnectionState.DISCONNECTING, reason=reason)
536+
await self._disconnect_client()
492537

493538
self._client = None
494539
if self._state == ConnectionState.DISCONNECTING:
495-
self._set_state(ConnectionState.DISCONNECTED)
540+
self._set_state(ConnectionState.DISCONNECTED, reason=reason)
496541

497542
async def _disconnect_error(self, state: ConnectionState, exc: Exception):
498543
self._set_state(state, exc)
499-
if self._client is not None and self._client.is_connected:
500-
try:
501-
await self._client.disconnect()
502-
except (EOFError, BleakError) as e:
503-
self._logger.debug("Disconnect failed (already down): %s", e)
544+
await self._disconnect_client()
504545
raise exc
505546

506547
@staticmethod
@@ -601,10 +642,7 @@ async def add_error(self, exception: Exception):
601642
self._set_state(ConnectionState.ERROR_TOO_MANY_ERRORS, exception)
602643
if self._client is not None and self._client.is_connected:
603644
self._logger.warning("Client disconnected after encountering 5 errors")
604-
try:
605-
await self._client.disconnect()
606-
except (EOFError, BleakError) as e:
607-
self._logger.debug("Disconnect failed (already down): %s", e)
645+
await self._disconnect_client()
608646

609647
def _reset_error_counter(self):
610648
self._errors = 0
@@ -621,13 +659,21 @@ def _state(self, value: ConnectionState):
621659
self._state_changed.clear()
622660
self._listeners.on_connection_state_change(value)
623661

662+
@property
663+
def state_reason(self) -> str | None:
664+
return self._state_reason
665+
624666
def _set_state(
625-
self, state: ConnectionState, exc: Exception | type[Exception] | None = None
667+
self,
668+
state: ConnectionState,
669+
exc: Exception | type[Exception] | None = None,
670+
reason: str | None = None,
626671
):
627672
self._state_exception = exc
628673
if exc is not None:
629674
self._last_exception = exc
630675

676+
self._state_reason = reason
631677
self._state = state
632678

633679
if state.is_error:
@@ -1041,8 +1087,7 @@ async def _check_auth(self, packet: Packet):
10411087
self._logger.error("Authentication failed, packet: %s", packet, exc_info=exc)
10421088
self._set_state(ConnectionState.ERROR_AUTH_FAILED, exc)
10431089

1044-
if self._client is not None and self._client.is_connected:
1045-
await self._client.disconnect()
1090+
await self._disconnect_client()
10461091
raise exc
10471092

10481093
async def send_auth_status_packet(self):

custom_components/ef_ble/eflib/devicebase.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
DeviceDiagnosticsCollector,
2727
DeviceLogger,
2828
LogOptions,
29+
caller_chain,
2930
)
3031
from .packet import Packet
3132
from .props.raw_data_props import Literal
@@ -291,7 +292,7 @@ async def connect(
291292
self._conn.on_packet_data_received(self._listeners.on_packet_received)
292293
self._conn.on_packet_parsed(self._listeners.on_packet_parsed)
293294
self._conn.on_state_change(self._listeners.on_connection_state_change)
294-
self._conn.on_state_change(self.connection_log.append)
295+
self._conn.on_state_change(self._append_state_to_log)
295296
self._conn.on_data_received(self._listeners.on_data_received)
296297
self._conn.on_data_send(self._listeners.on_data_send)
297298

@@ -300,12 +301,16 @@ async def connect(
300301

301302
await self._conn.connect(max_attempts=max_attempts)
302303

304+
def _append_state_to_log(self, state: ConnectionState) -> None:
305+
reason = self._conn.state_reason if self._conn is not None else None
306+
self.connection_log.append(state, reason)
307+
303308
async def disconnect(self):
304309
if self._conn is None:
305310
self._logger.error("Device has no connection")
306311
return
307312

308-
await self._conn.disconnect()
313+
await self._conn.disconnect(reason=caller_chain())
309314
self._connection_event.clear()
310315
self._conn = None
311316

custom_components/ef_ble/eflib/logging_util.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ def build_diagnostics_dict(self, session: Session | None = None) -> dict:
352352
}
353353
if session is not None:
354354
result["session"] = session.header.hex()
355+
if (conn := device._conn) is not None and conn.disconnect_log:
356+
result["disconnect_log"] = conn.disconnect_log
355357
if self.is_enabled:
356358
result |= self.as_dict(session)
357359
return result
@@ -592,3 +594,14 @@ def __str__(self) -> str:
592594

593595
def __repr__(self) -> str:
594596
return bytes(self._data).hex()
597+
598+
599+
def caller_chain(depth: int = 4) -> str:
600+
"""
601+
Compact `->` chain of the innermost `depth` call frames, for tracing triggers
602+
603+
Drops this helper's own frame; awaited coroutine frames stay on the stack, so the
604+
chain reaches across `await` boundaries to whatever ultimately triggered the call.
605+
"""
606+
frames = traceback.extract_stack()[:-1]
607+
return " -> ".join(frame.name for frame in frames[-depth:])

0 commit comments

Comments
 (0)