@@ -25,13 +25,16 @@ extern "C" {
2525#include < unistd.h>
2626
2727#include < algorithm>
28+ #include < cerrno>
2829#include < chrono>
30+ #include < cstdlib>
2931#include < cstring>
3032#include < filesystem>
3133#include < fstream>
3234#include < iostream>
3335#include < random>
3436#include < sstream>
37+ #include < string_view>
3538#include < thread>
3639
3740namespace tenbox ::daemon {
@@ -511,27 +514,70 @@ bool CloudTunnel::Connect(std::string* error) {
511514 url.target += " host_id=" + host_id_;
512515 }
513516
517+ // Default to IPv4 only. The cloud gateway (my.tenbox.ai) is fronted by
518+ // Cloudflare, which is dual-stack — but in practice plenty of home /
519+ // SMB / single-board-computer environments (RK3588 boards, consumer
520+ // routers, etc.) advertise an IPv6 default route that doesn't actually
521+ // forward traffic. Without this hint glibc's RFC 6724 sort would put
522+ // AAAA first, the connect() below would either time out or stall, and
523+ // the user only sees a generic "failed to read WebSocket handshake"
524+ // log line with no clue that the daemon never got past TCP.
525+ //
526+ // Operators on IPv6-only networks can opt back in with
527+ // TENBOX_CLOUD_PREFER_FAMILY=v6 (single-stack v6) or =any
528+ // (dual-stack with v6 preferred — pre-fix behaviour).
514529 addrinfo hints{};
515530 hints.ai_socktype = SOCK_STREAM;
516- hints.ai_family = AF_UNSPEC;
531+ hints.ai_family = AF_INET;
532+ if (const char * fam = std::getenv (" TENBOX_CLOUD_PREFER_FAMILY" )) {
533+ if (std::string_view (fam) == " v6" ) hints.ai_family = AF_INET6;
534+ else if (std::string_view (fam) == " any" ) hints.ai_family = AF_UNSPEC;
535+ // anything else (including "v4") falls through to AF_INET.
536+ }
537+
517538 addrinfo* result = nullptr ;
518539 const int gai = ::getaddrinfo (url.host .c_str (), url.port .c_str (), &hints, &result);
519540 if (gai != 0 ) {
520541 if (error) *error = gai_strerror (gai);
521542 return false ;
522543 }
523544
545+ // Walk every candidate so a transient failure on the first record
546+ // (e.g. one Cloudflare anycast IP that happens to be flapping) still
547+ // tries the next. Capture the last connect() errno so the caller can
548+ // log "tried 1.2.3.4:443 -> ECONNREFUSED" instead of the previous
549+ // opaque "failed to connect cloud gateway".
524550 int fd = -1 ;
551+ int last_errno = 0 ;
552+ char last_addr[64 ] = {};
525553 for (auto * rp = result; rp; rp = rp->ai_next ) {
526554 fd = ::socket (rp->ai_family , rp->ai_socktype , rp->ai_protocol );
527- if (fd < 0 ) continue ;
528- if (::connect (fd, rp->ai_addr , rp->ai_addrlen ) == 0 ) break ;
555+ if (fd < 0 ) {
556+ last_errno = errno;
557+ continue ;
558+ }
559+ if (::connect (fd, rp->ai_addr , rp->ai_addrlen ) == 0 ) {
560+ // Stash the address we actually connected to in case TLS or
561+ // the WS handshake fails later and we want to log it.
562+ (void )::getnameinfo (rp->ai_addr , rp->ai_addrlen , last_addr,
563+ sizeof (last_addr), nullptr , 0 , NI_NUMERICHOST);
564+ break ;
565+ }
566+ last_errno = errno;
567+ (void )::getnameinfo (rp->ai_addr , rp->ai_addrlen , last_addr,
568+ sizeof (last_addr), nullptr , 0 , NI_NUMERICHOST);
529569 ::close (fd);
530570 fd = -1 ;
531571 }
532572 ::freeaddrinfo (result);
533573 if (fd < 0 ) {
534- if (error) *error = " failed to connect cloud gateway" ;
574+ if (error) {
575+ std::ostringstream msg;
576+ msg << " failed to connect cloud gateway (" ;
577+ if (last_addr[0 ]) msg << last_addr << " :" << url.port << " , " ;
578+ msg << " errno=" << last_errno << " " << std::strerror (last_errno) << " )" ;
579+ *error = msg.str ();
580+ }
535581 return false ;
536582 }
537583 fd_ = fd;
@@ -705,6 +751,34 @@ bool CloudTunnel::ReadJson(nlohmann::json* value) {
705751}
706752
707753void CloudTunnel::ThreadMain () {
754+ // Exponential backoff for reconnect attempts. Caps at 60s so a flaky
755+ // network or temporary cloud outage does not keep the journal at the
756+ // previous 1-attempt-per-second cadence (which on a host with broken
757+ // IPv6 + AF_UNSPEC produced ~30k log lines/day for nothing). Reset
758+ // only after the link has been *up* for kBackoffResetThreshold so a
759+ // pathological connect-then-immediately-disconnect loop still backs
760+ // off instead of hot-spinning at 1Hz.
761+ using Clock = std::chrono::steady_clock;
762+ constexpr auto kBackoffMin = std::chrono::seconds (1 );
763+ constexpr auto kBackoffMax = std::chrono::seconds (60 );
764+ constexpr auto kBackoffResetThreshold = std::chrono::seconds (30 );
765+ auto backoff = kBackoffMin ;
766+ auto bump_backoff = [&]() {
767+ backoff = std::min (backoff * 2 , kBackoffMax );
768+ };
769+ // Sleep `delay` but wake early on shutdown so SIGTERM doesn't have
770+ // to wait out a full 60s slot during the backoff phase.
771+ auto interruptible_sleep = [this ](std::chrono::seconds delay) {
772+ const auto deadline = Clock::now () + delay;
773+ while (running_ && Clock::now () < deadline) {
774+ const auto remaining = std::chrono::duration_cast<std::chrono::milliseconds>(
775+ deadline - Clock::now ());
776+ const auto slice = std::min<std::chrono::milliseconds>(
777+ remaining, std::chrono::milliseconds (200 ));
778+ std::this_thread::sleep_for (slice);
779+ }
780+ };
781+
708782 while (running_) {
709783 // Refresh the token snapshot at the top of each connect attempt so
710784 // a successful pair (which writes device.token mid-session) takes
@@ -721,11 +795,14 @@ void CloudTunnel::ThreadMain() {
721795
722796 std::string error;
723797 if (!Connect (&error)) {
724- std::cerr << " cloud tunnel connect failed: " << error << " \n " ;
725- std::this_thread::sleep_for (std::chrono::seconds (2 ));
798+ std::cerr << " cloud tunnel connect failed: " << error
799+ << " (retrying in " << backoff.count () << " s)\n " ;
800+ interruptible_sleep (backoff);
801+ bump_backoff ();
726802 continue ;
727803 }
728804
805+ const auto connected_at = Clock::now ();
729806 std::cerr << " cloud tunnel connected as " << host_id_ << " \n " ;
730807 SendJson (HelloPayload ());
731808 // Once the WS upgrade succeeds and we know we still need pairing,
@@ -771,7 +848,20 @@ void CloudTunnel::ThreadMain() {
771848 }
772849
773850 Disconnect ();
774- if (running_) std::this_thread::sleep_for (std::chrono::seconds (1 ));
851+ // Reset the backoff only when the session was healthy long enough
852+ // that the disconnect can plausibly be a one-off (network blip,
853+ // server restart). Sub-threshold sessions keep escalating so a
854+ // pathological "WS upgrade succeeds, server kicks us 200ms later"
855+ // loop doesn't reconnect at 1Hz.
856+ const auto stayed_up = Clock::now () - connected_at;
857+ if (stayed_up >= kBackoffResetThreshold ) {
858+ backoff = kBackoffMin ;
859+ } else {
860+ bump_backoff ();
861+ }
862+ if (running_) {
863+ interruptible_sleep (backoff);
864+ }
775865 }
776866}
777867
0 commit comments