Skip to content

Commit f917dca

Browse files
78 and cursoragent committed
fix(cloud_tunnel): gate background sends on connected_, not fd_
The tick thread (and other producers like the log flusher and runtime event callbacks) used `fd_ >= 0` as a proxy for "WS link is up". With TLS that gate opens far too early: Connect() sets `fd_` right after the TCP handshake, then runs SSL_connect() and the WebSocket upgrade. If TickMain woke up in that window it called SendJson -> SSL_write before SSL_connect() returned, and OpenSSL emitted the WebSocket frame as an early "Application Data" record sandwiched in front of the client's ChangeCipherSpec/Finished. Cloudflare answered with a fatal `unexpected_message` alert and reset the connection, surfacing as

    cloud tunnel connect failed: TLS handshake failed (error:0A0003F2:SSL routines::sslv3 alert unexpected message)

in tenboxd's journal on the affected hosts.

Add `std::atomic<bool> connected_` set to true only after the WS 101 upgrade lands and cleared on Disconnect(). SendJson now refuses to write before that, and TickMain / FlushLogBuffers / the Push* helpers / the host.connected status field all gate on the same flag.

Bumps VERSION to 0.7.14.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 101d324 commit f917dca

3 files changed

Lines changed: 35 additions & 12 deletions

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.7.13
1+
0.7.14

src/daemon/cloud_tunnel.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,10 @@ void CloudTunnel::Stop() {
485485
}
486486

487487
void CloudTunnel::Disconnect() {
488+
// Flip the gate first so any background producer that wakes up mid-
489+
// teardown bails out instead of racing into SSL_write on a half-closed
490+
// SSL object.
491+
connected_.store(false);
488492
if (ssl_) {
489493
// Best-effort shutdown; SSL_shutdown returning 0 means peer hasn't
490494
// sent close_notify yet, but we always close the underlying fd next
@@ -692,6 +696,10 @@ bool CloudTunnel::Connect(std::string* error) {
692696
if (error) *error = "cloud gateway did not accept WebSocket upgrade";
693697
return false;
694698
}
699+
// Only now is it safe for the tick thread / log flusher / event
700+
// callbacks to write WS frames. See cloud_tunnel.h::connected_ for why
701+
// gating on fd_ alone produced a TLS `unexpected_message` regression.
702+
connected_.store(true);
695703
return true;
696704
}
697705

@@ -729,6 +737,12 @@ bool CloudTunnel::TransportRecv(void* data, size_t size) {
729737

730738
bool CloudTunnel::SendJson(const nlohmann::json& value) {
731739
std::lock_guard<std::mutex> lock(send_mu_);
740+
// Guard against background producers (tick thread, log flusher, runtime
741+
// event callbacks) that race ahead of the TLS+WS handshake. Writing a
742+
// WebSocket frame onto the SSL session while SSL_connect() is still in
743+
// flight makes OpenSSL emit it as an early Application Data record,
744+
// which Cloudflare fatally rejects with `unexpected_message`.
745+
if (!connected_.load()) return false;
732746
if (fd_ < 0) return false;
733747
const std::string payload = value.dump();
734748
std::vector<uint8_t> frame;
@@ -1024,7 +1038,7 @@ nlohmann::json CloudTunnel::HostResourcesPayload() const {
10241038
{"runtime_path", config_.runtime_path},
10251039
{"daemon_version", TENBOX_VERSION},
10261040
{"daemon_uptime_seconds", std::max<int64_t>(0, UnixNow() - start_time_seconds_)},
1027-
{"cloud_connected", fd_ >= 0},
1041+
{"cloud_connected", connected_.load()},
10281042
{"tenbox_vm_memory_bytes", vm_rss},
10291043
{"image_cache_bytes", CachedDirectorySizeBytes(images_dir)},
10301044
{"encoder_caps", EncoderCapabilitiesCached()},
@@ -1950,19 +1964,20 @@ void CloudTunnel::TickMain() {
19501964
// we observe a connected_ transition so reconnects also push a fresh snapshot.
19511965
auto next_host = Clock::now();
19521966
auto next_vm = Clock::now() + kVmResourcesInterval;
1953-
int last_fd = fd_;
1967+
bool last_connected = connected_.load();
19541968

19551969
while (running_) {
19561970
const auto now = Clock::now();
1971+
const bool is_connected = connected_.load();
19571972

1958-
if (last_fd < 0 && fd_ >= 0) {
1973+
if (!last_connected && is_connected) {
19591974
// Tunnel just (re)connected — refresh the host snapshot
19601975
// immediately rather than waiting up to kHostResourcesInterval.
19611976
next_host = now;
19621977
}
1963-
last_fd = fd_;
1978+
last_connected = is_connected;
19641979

1965-
if (fd_ >= 0) {
1980+
if (is_connected) {
19661981
if (now >= next_host) {
19671982
// Reuse the full HostResourcesPayload so resource ticks also
19681983
// carry daemon_version / uptime / encoder_caps etc. The
@@ -2041,7 +2056,7 @@ void CloudTunnel::FlushLogBuffers() {
20412056
if (log_buffer_.empty()) return;
20422057
drained.swap(log_buffer_);
20432058
}
2044-
if (fd_ < 0) return; // Drop silently when offline; tail RPC will catch up.
2059+
if (!connected_.load()) return; // Drop silently when offline; tail RPC will catch up.
20452060
for (auto& [vm_id, lines] : drained) {
20462061
if (lines.empty()) continue;
20472062
nlohmann::json arr = nlohmann::json::array();
@@ -2076,7 +2091,7 @@ nlohmann::json CloudTunnel::VmResourcesSnapshot() const {
20762091
}
20772092

20782093
void CloudTunnel::PushVmStateChanged(const std::string& vm_id, const VmRuntimeInfo& info) {
2079-
if (fd_ < 0) return;
2094+
if (!connected_.load()) return;
20802095
(void)SendJson({
20812096
{"id", GenerateUuid()},
20822097
{"type", "vm.state_changed"},
@@ -2090,7 +2105,7 @@ void CloudTunnel::PushVmStateChanged(const std::string& vm_id, const VmRuntimeIn
20902105
}
20912106

20922107
void CloudTunnel::PushImageCachedAdded(const std::string& cache_id, const std::string& image_name) {
2093-
if (fd_ < 0) return;
2108+
if (!connected_.load()) return;
20942109
(void)SendJson({
20952110
{"id", GenerateUuid()},
20962111
{"type", "image.cached.added"},
@@ -2103,7 +2118,7 @@ void CloudTunnel::PushImageCachedAdded(const std::string& cache_id, const std::s
21032118
}
21042119

21052120
void CloudTunnel::PushImageCachedRemoved(const std::string& cache_id) {
2106-
if (fd_ < 0) return;
2121+
if (!connected_.load()) return;
21072122
(void)SendJson({
21082123
{"id", GenerateUuid()},
21092124
{"type", "image.cached.removed"},
@@ -2113,7 +2128,7 @@ void CloudTunnel::PushImageCachedRemoved(const std::string& cache_id) {
21132128
}
21142129

21152130
void CloudTunnel::PushDownloadProgress(const DownloadJob& job) {
2116-
if (fd_ < 0) return;
2131+
if (!connected_.load()) return;
21172132
(void)SendJson({
21182133
{"id", GenerateUuid()},
21192134
{"type", "image.download.progress"},
@@ -2123,7 +2138,7 @@ void CloudTunnel::PushDownloadProgress(const DownloadJob& job) {
21232138
}
21242139

21252140
void CloudTunnel::PushDownloadTerminal(const DownloadJob& job) {
2126-
if (fd_ < 0) return;
2141+
if (!connected_.load()) return;
21272142
const std::string type = job.status == "done"
21282143
? "image.download.completed"
21292144
: (job.status == "cancelled" ? "image.download.cancelled" : "image.download.failed");

src/daemon/cloud_tunnel.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,14 @@ class CloudTunnel {
114114
SSL_CTX* ssl_ctx_ = nullptr;
115115
SSL* ssl_ = nullptr;
116116
bool tls_enabled_ = false;
117+
// True only after the TLS handshake (if any) and the WebSocket upgrade
118+
// have completed and we are ready to send WS frames. The tick thread
119+
// and any other background producer must gate writes on this rather
120+
// than `fd_ >= 0`: with TLS the socket is alive long before SSL_write
121+
// can be safely called, and a stray frame written during the handshake
122+
// window lands on the wire as a bogus "Application Data" record that
123+
// Cloudflare answers with a fatal `unexpected_message` alert.
124+
std::atomic<bool> connected_{false};
117125
std::mutex send_mu_;
118126
std::string host_id_;
119127
// Pairing state: when device.token exists on disk we reconnect with the

0 commit comments

Comments
 (0)