Skip to content

Commit 55a27a3

Browse files
authored
Merge pull request NousResearch#2517 from NousResearch/hermes/hermes-31d7db3b
fix(telegram): auto-reconnect polling after network interruption
2 parents 8587cdd + 2bd8e5c commit 55a27a3

File tree

1 file changed

+82
-4
lines changed

1 file changed

+82
-4
lines changed

gateway/platforms/telegram.py

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def __init__(self, config: PlatformConfig):
130130
self._token_lock_identity: Optional[str] = None
131131
self._polling_error_task: Optional[asyncio.Task] = None
132132
self._polling_conflict_count: int = 0
133+
self._polling_network_error_count: int = 0
133134
self._polling_error_callback_ref = None
134135

135136
@staticmethod
@@ -141,6 +142,80 @@ def _looks_like_polling_conflict(error: Exception) -> bool:
141142
or "another bot instance is running" in text
142143
)
143144

145+
@staticmethod
146+
def _looks_like_network_error(error: Exception) -> bool:
147+
"""Return True for transient network errors that warrant a reconnect attempt."""
148+
name = error.__class__.__name__.lower()
149+
if name in ("networkerror", "timedout", "connectionerror"):
150+
return True
151+
try:
152+
from telegram.error import NetworkError, TimedOut
153+
if isinstance(error, (NetworkError, TimedOut)):
154+
return True
155+
except ImportError:
156+
pass
157+
return isinstance(error, OSError)
158+
159+
async def _handle_polling_network_error(self, error: Exception) -> None:
    """Reconnect polling after a transient network interruption.

    Triggered by NetworkError/TimedOut in the polling error callback, which
    happen when the host loses connectivity (Mac sleep, WiFi switch, VPN
    reconnect, etc.). The gateway process stays alive but the long-poll
    connection silently dies; without this handler the bot never recovers.

    Strategy: exponential back-off (5s, 10s, 20s, 40s, 60s cap) up to
    MAX_NETWORK_RETRIES attempts, then mark the adapter retryable-fatal so
    the supervisor restarts the gateway process.

    The retries run in a loop inside this coroutine.  Once the updater has
    been stopped and ``start_polling`` fails, no polling loop is left to
    invoke the error callback, so waiting for "the next network error"
    would leave the bot permanently disconnected.

    Args:
        error: The network error that triggered the reconnect; used for
            logging only.
    """
    MAX_NETWORK_RETRIES = 10
    BASE_DELAY = 5
    MAX_DELAY = 60

    last_error = error
    while not self.has_fatal_error:
        self._polling_network_error_count += 1
        attempt = self._polling_network_error_count

        if attempt > MAX_NETWORK_RETRIES:
            message = (
                "Telegram polling could not reconnect after %d network error retries. "
                "Restarting gateway." % MAX_NETWORK_RETRIES
            )
            logger.error("[%s] %s Last error: %s", self.name, message, last_error)
            self._set_fatal_error("telegram_network_error", message, retryable=True)
            await self._notify_fatal_error()
            return

        delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
        logger.warning(
            "[%s] Telegram network error (attempt %d/%d), reconnecting in %ds. Error: %s",
            self.name, attempt, MAX_NETWORK_RETRIES, delay, last_error,
        )
        await asyncio.sleep(delay)

        # The adapter state may have changed while we were sleeping.
        if self.has_fatal_error:
            return
        app = self._app
        updater = app.updater if app else None
        if updater is None:
            logger.warning(
                "[%s] Telegram polling reconnect aborted: application is no longer available",
                self.name,
            )
            return

        # Best-effort stop of the stale polling loop; a failure here must not
        # prevent the restart below, but it should still be visible in logs.
        try:
            if updater.running:
                await updater.stop()
        except Exception as stop_err:
            logger.debug(
                "[%s] Ignoring error while stopping Telegram updater: %s",
                self.name, stop_err,
            )

        try:
            await updater.start_polling(
                allowed_updates=Update.ALL_TYPES,
                drop_pending_updates=False,
                error_callback=self._polling_error_callback_ref,
            )
        except Exception as retry_err:
            logger.warning("[%s] Telegram polling reconnect failed: %s", self.name, retry_err)
            # Polling is not running, so no callback will fire again:
            # loop around and make the next attempt ourselves.
            last_error = retry_err
            continue

        logger.info(
            "[%s] Telegram polling resumed after network error (attempt %d)",
            self.name, attempt,
        )
        self._polling_network_error_count = 0
        return
218+
144219
async def _handle_polling_conflict(self, error: Exception) -> None:
145220
if self.has_fatal_error and self.fatal_error_code == "telegram_polling_conflict":
146221
return
@@ -276,12 +351,15 @@ async def connect(self) -> bool:
276351
loop = asyncio.get_running_loop()
277352

278353
def _polling_error_callback(error: Exception) -> None:
    """Route a polling error to the matching recovery handler.

    Polling conflicts and transient network errors each get a recovery
    task; at most one recovery task runs at a time, so further recoverable
    errors are ignored while one is in flight.  Any other error is always
    logged, even while a recovery task is pending, so unrelated failures
    are never silently dropped.
    """
    is_conflict = self._looks_like_polling_conflict(error)
    is_network = not is_conflict and self._looks_like_network_error(error)

    if not (is_conflict or is_network):
        logger.error("[%s] Telegram polling error: %s", self.name, error, exc_info=True)
        return

    # A recovery task is already handling the outage; do not stack another.
    if self._polling_error_task and not self._polling_error_task.done():
        return

    if is_conflict:
        self._polling_error_task = loop.create_task(self._handle_polling_conflict(error))
    else:
        logger.warning("[%s] Telegram network error, scheduling reconnect: %s", self.name, error)
        self._polling_error_task = loop.create_task(self._handle_polling_network_error(error))
285363

286364
# Store reference for retry use in _handle_polling_conflict and _handle_polling_network_error
287365
self._polling_error_callback_ref = _polling_error_callback

0 commit comments

Comments
 (0)