Skip to content

Commit ac95f0d

Browse files
committed
Bound BLE client disconnect and trace disconnect triggers
1 parent 860073a commit ac95f0d

3 files changed

Lines changed: 91 additions & 28 deletions

File tree

custom_components/ef_ble/eflib/connection.py

Lines changed: 71 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from dataclasses import dataclass
1212
from enum import StrEnum, auto
1313
from functools import cached_property
14-
from typing import Literal, Self
14+
from typing import Any, Literal, Self
1515

1616
import ecdsa
1717
from bleak import BleakClient
@@ -43,13 +43,19 @@
4343
SimplePacketAssembler,
4444
)
4545
from .listeners import ListenerGroup, ListenerRegistry
46-
from .logging_util import ConnectionLogger, LogOptions
46+
from .logging_util import ConnectionLogger, LogOptions, caller_chain
4747
from .packet import Packet
4848
from .props.utils import classproperty
4949

5050
MAX_RECONNECT_ATTEMPTS = 2
5151
MAX_CONNECTION_ATTEMPTS = 10
5252

53+
# `BleakClient.disconnect()` can block until the connect timeout (default 20s) when a
54+
# write-with-response is still pending on the transport after a mid-auth BLE drop
55+
# (notably through an ESPHome proxy). Left unbounded it stalls `async_unload_entry`
56+
# long enough for HA to mark the entry `FAILED_UNLOAD`, so cap every disconnect.
57+
DISCONNECT_TIMEOUT = 5.0
58+
5359

5460
_BT_PROTOCOL_UUIDS = {
5561
"rfcomm": {
@@ -236,6 +242,7 @@ def __init__(
236242

237243
self._errors = 0
238244
self._last_errors = deque(maxlen=10)
245+
self._disconnect_log: deque[dict[str, Any]] = deque(maxlen=10)
239246
self._client = None
240247
self._connected = asyncio.Event()
241248
self._disconnected = asyncio.Event()
@@ -257,6 +264,7 @@ def __init__(
257264
self._reconnect = True
258265

259266
self._connection_state: ConnectionState = None # pyright: ignore[reportAttributeAccessIssue]
267+
self._state_reason: str | None = None
260268
self._set_state(ConnectionState.CREATED)
261269

262270
@property
@@ -387,8 +395,7 @@ async def connect(
387395
self._set_state(ConnectionState.ERROR_BLEAK, e)
388396

389397
if error is not None:
390-
if self._client is not None and self._client.is_connected:
391-
await self._client.disconnect()
398+
await self._disconnect_client()
392399

393400
self._logger.error("Failed to connect to the device: %s", error)
394401
self._last_errors.append(f"Failed to connect to the device: {error}")
@@ -405,7 +412,10 @@ async def connect(
405412
await self.initBleSessionKey()
406413

407414
def disconnected(self, *args, **kwargs) -> None:
408-
self._logger.warning("Disconnected from device")
415+
# Traces the trigger: an unsolicited bleak drop shows bleak/asyncio frames here,
416+
# whereas a drop we requested shows our own `disconnect` chain.
417+
trigger = caller_chain()
418+
self._logger.warning("Disconnected from device (%s)", trigger)
409419
self._client = None
410420

411421
# NOTE(gnox): don't trigger disconnect/reconnect logic while
@@ -422,7 +432,7 @@ def disconnected(self, *args, **kwargs) -> None:
422432
self._disconnected.set()
423433
if self._state is not ConnectionState.DISCONNECTING:
424434
self._notify_disconnect()
425-
self._set_state(ConnectionState.DISCONNECTED)
435+
self._set_state(ConnectionState.DISCONNECTED, reason=trigger)
426436
return
427437

428438
if self._reconnect_task is not None:
@@ -476,31 +486,62 @@ async def reconnect(self) -> None:
476486
self._set_state(ConnectionState.RECONNECTING)
477487
await self.connect()
478488

479-
async def disconnect(self) -> None:
480-
self._logger.info(msg="Disconnecting from device")
489+
async def _disconnect_client(self) -> None:
490+
"""
491+
Disconnect the underlying BLE client, bounded by `DISCONNECT_TIMEOUT`
492+
493+
Swallows the usual "already down" errors and, on timeout, gives up waiting and
494+
lets the caller proceed with local cleanup; the transport drains on its own once
495+
the drop is detected. Used everywhere we tear the client down so no path can
496+
block on a hung `BleakClient.disconnect()`.
497+
"""
498+
if self._client is None or not self._client.is_connected:
499+
return
500+
trigger = caller_chain()
501+
self._logger.debug("Disconnecting BLE client (%s)", trigger)
502+
outcome = "ok"
503+
try:
504+
async with asyncio.timeout(DISCONNECT_TIMEOUT):
505+
await self._client.disconnect()
506+
except (EOFError, BleakError) as e:
507+
outcome = f"already_down: {e}"
508+
self._logger.warning("Disconnect failed (already down): %s", e)
509+
except TimeoutError:
510+
outcome = "timeout"
511+
self._logger.warning(
512+
"BleakClient.disconnect() did not return within %ss (%s); continuing "
513+
"with local cleanup (write-with-response likely still pending after a "
514+
"mid-auth BLE drop)",
515+
DISCONNECT_TIMEOUT,
516+
trigger,
517+
)
518+
self._disconnect_log.append(
519+
{"time": time.time(), "trigger": trigger, "outcome": outcome}
520+
)
521+
522+
@property
523+
def disconnect_log(self) -> list[dict[str, Any]]:
524+
"""Recent BLE client disconnect outcomes, for diagnostics"""
525+
return list(self._disconnect_log)
526+
527+
async def disconnect(self, reason: str | None = None) -> None:
528+
self._logger.info("Disconnecting from device (%s)", reason or "no reason given")
481529
self._retry_on_disconnect = False
482530

483531
self._reconnect_attempt = 0
484532
self._cancel_tasks()
485533

486534
if self._client is not None and self._client.is_connected:
487-
self._set_state(ConnectionState.DISCONNECTING)
488-
try:
489-
await self._client.disconnect()
490-
except (EOFError, BleakError) as e:
491-
self._logger.debug("Disconnect failed (already down): %s", e)
535+
self._set_state(ConnectionState.DISCONNECTING, reason=reason)
536+
await self._disconnect_client()
492537

493538
self._client = None
494539
if self._state == ConnectionState.DISCONNECTING:
495-
self._set_state(ConnectionState.DISCONNECTED)
540+
self._set_state(ConnectionState.DISCONNECTED, reason=reason)
496541

497542
async def _disconnect_error(self, state: ConnectionState, exc: Exception):
498543
self._set_state(state, exc)
499-
if self._client is not None and self._client.is_connected:
500-
try:
501-
await self._client.disconnect()
502-
except (EOFError, BleakError) as e:
503-
self._logger.debug("Disconnect failed (already down): %s", e)
544+
await self._disconnect_client()
504545
raise exc
505546

506547
@staticmethod
@@ -601,10 +642,7 @@ async def add_error(self, exception: Exception):
601642
self._set_state(ConnectionState.ERROR_TOO_MANY_ERRORS, exception)
602643
if self._client is not None and self._client.is_connected:
603644
self._logger.warning("Client disconnected after encountering 5 errors")
604-
try:
605-
await self._client.disconnect()
606-
except (EOFError, BleakError) as e:
607-
self._logger.debug("Disconnect failed (already down): %s", e)
645+
await self._disconnect_client()
608646

609647
def _reset_error_counter(self):
610648
self._errors = 0
@@ -621,13 +659,21 @@ def _state(self, value: ConnectionState):
621659
self._state_changed.clear()
622660
self._listeners.on_connection_state_change(value)
623661

662+
@property
663+
def state_reason(self) -> str | None:
664+
return self._state_reason
665+
624666
def _set_state(
625-
self, state: ConnectionState, exc: Exception | type[Exception] | None = None
667+
self,
668+
state: ConnectionState,
669+
exc: Exception | type[Exception] | None = None,
670+
reason: str | None = None,
626671
):
627672
self._state_exception = exc
628673
if exc is not None:
629674
self._last_exception = exc
630675

676+
self._state_reason = reason
631677
self._state = state
632678

633679
if state.is_error:
@@ -1041,8 +1087,7 @@ async def _check_auth(self, packet: Packet):
10411087
self._logger.error("Authentication failed, packet: %s", packet, exc_info=exc)
10421088
self._set_state(ConnectionState.ERROR_AUTH_FAILED, exc)
10431089

1044-
if self._client is not None and self._client.is_connected:
1045-
await self._client.disconnect()
1090+
await self._disconnect_client()
10461091
raise exc
10471092

10481093
async def send_auth_status_packet(self):

custom_components/ef_ble/eflib/devicebase.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
DeviceDiagnosticsCollector,
2727
DeviceLogger,
2828
LogOptions,
29+
caller_chain,
2930
)
3031
from .packet import Packet
3132
from .props.raw_data_props import Literal
@@ -283,7 +284,7 @@ async def connect(
283284
self._conn.on_packet_data_received(self._listeners.on_packet_received)
284285
self._conn.on_packet_parsed(self._listeners.on_packet_parsed)
285286
self._conn.on_state_change(self._listeners.on_connection_state_change)
286-
self._conn.on_state_change(self.connection_log.append)
287+
self._conn.on_state_change(self._append_state_to_log)
287288
self._conn.on_data_received(self._listeners.on_data_received)
288289
self._conn.on_data_send(self._listeners.on_data_send)
289290

@@ -292,12 +293,16 @@ async def connect(
292293

293294
await self._conn.connect(max_attempts=max_attempts)
294295

296+
def _append_state_to_log(self, state: ConnectionState) -> None:
297+
reason = self._conn.state_reason if self._conn is not None else None
298+
self.connection_log.append(state, reason)
299+
295300
async def disconnect(self):
296301
if self._conn is None:
297302
self._logger.error("Device has no connection")
298303
return
299304

300-
await self._conn.disconnect()
305+
await self._conn.disconnect(reason=caller_chain())
301306
self._connection_event.clear()
302307
self._conn = None
303308

custom_components/ef_ble/eflib/logging_util.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,8 @@ def build_diagnostics_dict(self, session: Session | None = None) -> dict:
352352
}
353353
if session is not None:
354354
result["session"] = session.header.hex()
355+
if (conn := device._conn) is not None and conn.disconnect_log:
356+
result["disconnect_log"] = conn.disconnect_log
355357
if self.is_enabled:
356358
result |= self.as_dict(session)
357359
return result
@@ -592,3 +594,14 @@ def __str__(self) -> str:
592594

593595
def __repr__(self) -> str:
594596
return bytes(self._data).hex()
597+
598+
599+
def caller_chain(depth: int = 4) -> str:
600+
"""
601+
Compact `->` chain of the innermost `depth` call frames, for tracing triggers
602+
603+
Drops this helper's own frame; awaited coroutine frames stay on the stack, so the
604+
chain reaches across `await` boundaries to whatever ultimately triggered the call.
605+
"""
606+
frames = traceback.extract_stack()[:-1]
607+
return " -> ".join(frame.name for frame in frames[-depth:])

0 commit comments

Comments
 (0)