Skip to content

Commit b09ab7f

Browse files
committed
chore: bump version to 0.7.12 and enhance CloudTunnel connection handling
- Updated the version number to 0.7.12 in the VERSION file.
- Improved the connection logic in CloudTunnel to prefer IPv4 by default, with opt-in IPv6-only ("v6") and dual-stack ("any") modes selected through the TENBOX_CLOUD_PREFER_FAMILY environment variable.
- Enhanced error reporting on connection failure to include the last attempted address and the errno value with its description.
- Implemented exponential backoff for reconnect attempts (starting at 1s, doubling up to a 60s cap) to reduce log clutter during network issues.
1 parent d4d2c4a commit b09ab7f

2 files changed

Lines changed: 98 additions & 8 deletions

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.7.11
1+
0.7.12

src/daemon/cloud_tunnel.cpp

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,16 @@ extern "C" {
2525
#include <unistd.h>
2626

2727
#include <algorithm>
28+
#include <cerrno>
2829
#include <chrono>
30+
#include <cstdlib>
2931
#include <cstring>
3032
#include <filesystem>
3133
#include <fstream>
3234
#include <iostream>
3335
#include <random>
3436
#include <sstream>
37+
#include <string_view>
3538
#include <thread>
3639

3740
namespace tenbox::daemon {
@@ -511,27 +514,70 @@ bool CloudTunnel::Connect(std::string* error) {
511514
url.target += "host_id=" + host_id_;
512515
}
513516

517+
// Default to IPv4 only. The cloud gateway (my.tenbox.ai) is fronted by
518+
// Cloudflare, which is dual-stack — but in practice plenty of home /
519+
// SMB / single-board-computer environments (RK3588 boards, consumer
520+
// routers, etc.) advertise an IPv6 default route that doesn't actually
521+
// forward traffic. Without this hint glibc's RFC 6724 sort would put
522+
// AAAA first, the connect() below would either time out or stall, and
523+
// the user only sees a generic "failed to read WebSocket handshake"
524+
// log line with no clue that the daemon never got past TCP.
525+
//
526+
// Operators on IPv6-only networks can opt back in with
527+
// TENBOX_CLOUD_PREFER_FAMILY=v6 (single-stack v6) or =any
528+
// (dual-stack with v6 preferred — pre-fix behaviour).
514529
addrinfo hints{};
515530
hints.ai_socktype = SOCK_STREAM;
516-
hints.ai_family = AF_UNSPEC;
531+
hints.ai_family = AF_INET;
532+
if (const char* fam = std::getenv("TENBOX_CLOUD_PREFER_FAMILY")) {
533+
if (std::string_view(fam) == "v6") hints.ai_family = AF_INET6;
534+
else if (std::string_view(fam) == "any") hints.ai_family = AF_UNSPEC;
535+
// anything else (including "v4") falls through to AF_INET.
536+
}
537+
517538
addrinfo* result = nullptr;
518539
const int gai = ::getaddrinfo(url.host.c_str(), url.port.c_str(), &hints, &result);
519540
if (gai != 0) {
520541
if (error) *error = gai_strerror(gai);
521542
return false;
522543
}
523544

545+
// Walk every candidate so a transient failure on the first record
546+
// (e.g. one Cloudflare anycast IP that happens to be flapping) still
547+
// tries the next. Capture the last connect() errno so the caller can
548+
// log "tried 1.2.3.4:443 -> ECONNREFUSED" instead of the previous
549+
// opaque "failed to connect cloud gateway".
524550
int fd = -1;
551+
int last_errno = 0;
552+
char last_addr[64] = {};
525553
for (auto* rp = result; rp; rp = rp->ai_next) {
526554
fd = ::socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
527-
if (fd < 0) continue;
528-
if (::connect(fd, rp->ai_addr, rp->ai_addrlen) == 0) break;
555+
if (fd < 0) {
556+
last_errno = errno;
557+
continue;
558+
}
559+
if (::connect(fd, rp->ai_addr, rp->ai_addrlen) == 0) {
560+
// Stash the address we actually connected to in case TLS or
561+
// the WS handshake fails later and we want to log it.
562+
(void)::getnameinfo(rp->ai_addr, rp->ai_addrlen, last_addr,
563+
sizeof(last_addr), nullptr, 0, NI_NUMERICHOST);
564+
break;
565+
}
566+
last_errno = errno;
567+
(void)::getnameinfo(rp->ai_addr, rp->ai_addrlen, last_addr,
568+
sizeof(last_addr), nullptr, 0, NI_NUMERICHOST);
529569
::close(fd);
530570
fd = -1;
531571
}
532572
::freeaddrinfo(result);
533573
if (fd < 0) {
534-
if (error) *error = "failed to connect cloud gateway";
574+
if (error) {
575+
std::ostringstream msg;
576+
msg << "failed to connect cloud gateway (";
577+
if (last_addr[0]) msg << last_addr << ":" << url.port << ", ";
578+
msg << "errno=" << last_errno << " " << std::strerror(last_errno) << ")";
579+
*error = msg.str();
580+
}
535581
return false;
536582
}
537583
fd_ = fd;
@@ -705,6 +751,34 @@ bool CloudTunnel::ReadJson(nlohmann::json* value) {
705751
}
706752

707753
void CloudTunnel::ThreadMain() {
754+
// Exponential backoff for reconnect attempts. Caps at 60s so a flaky
755+
// network or temporary cloud outage does not keep the journal at the
756+
// previous 1-attempt-per-second cadence (which on a host with broken
757+
// IPv6 + AF_UNSPEC produced ~30k log lines/day for nothing). Reset
758+
// only after the link has been *up* for kBackoffResetThreshold so a
759+
// pathological connect-then-immediately-disconnect loop still backs
760+
// off instead of hot-spinning at 1Hz.
761+
using Clock = std::chrono::steady_clock;
762+
constexpr auto kBackoffMin = std::chrono::seconds(1);
763+
constexpr auto kBackoffMax = std::chrono::seconds(60);
764+
constexpr auto kBackoffResetThreshold = std::chrono::seconds(30);
765+
auto backoff = kBackoffMin;
766+
auto bump_backoff = [&]() {
767+
backoff = std::min(backoff * 2, kBackoffMax);
768+
};
769+
// Sleep `delay` but wake early on shutdown so SIGTERM doesn't have
770+
// to wait out a full 60s slot during the backoff phase.
771+
auto interruptible_sleep = [this](std::chrono::seconds delay) {
772+
const auto deadline = Clock::now() + delay;
773+
while (running_ && Clock::now() < deadline) {
774+
const auto remaining = std::chrono::duration_cast<std::chrono::milliseconds>(
775+
deadline - Clock::now());
776+
const auto slice = std::min<std::chrono::milliseconds>(
777+
remaining, std::chrono::milliseconds(200));
778+
std::this_thread::sleep_for(slice);
779+
}
780+
};
781+
708782
while (running_) {
709783
// Refresh the token snapshot at the top of each connect attempt so
710784
// a successful pair (which writes device.token mid-session) takes
@@ -721,11 +795,14 @@ void CloudTunnel::ThreadMain() {
721795

722796
std::string error;
723797
if (!Connect(&error)) {
724-
std::cerr << "cloud tunnel connect failed: " << error << "\n";
725-
std::this_thread::sleep_for(std::chrono::seconds(2));
798+
std::cerr << "cloud tunnel connect failed: " << error
799+
<< " (retrying in " << backoff.count() << "s)\n";
800+
interruptible_sleep(backoff);
801+
bump_backoff();
726802
continue;
727803
}
728804

805+
const auto connected_at = Clock::now();
729806
std::cerr << "cloud tunnel connected as " << host_id_ << "\n";
730807
SendJson(HelloPayload());
731808
// Once the WS upgrade succeeds and we know we still need pairing,
@@ -771,7 +848,20 @@ void CloudTunnel::ThreadMain() {
771848
}
772849

773850
Disconnect();
774-
if (running_) std::this_thread::sleep_for(std::chrono::seconds(1));
851+
// Reset the backoff only when the session was healthy long enough
852+
// that the disconnect can plausibly be a one-off (network blip,
853+
// server restart). Sub-threshold sessions keep escalating so a
854+
// pathological "WS upgrade succeeds, server kicks us 200ms later"
855+
// loop doesn't reconnect at 1Hz.
856+
const auto stayed_up = Clock::now() - connected_at;
857+
if (stayed_up >= kBackoffResetThreshold) {
858+
backoff = kBackoffMin;
859+
} else {
860+
bump_backoff();
861+
}
862+
if (running_) {
863+
interruptible_sleep(backoff);
864+
}
775865
}
776866
}
777867

0 commit comments

Comments
 (0)