Skip to content

Commit 76aa7c4

Browse files
committed
fix(smp): complete bidirectional chat in multi-task architecture
- Accept batched server responses with txCount > 1 - Forward TX2 MSG from SUB response batch to App Task - Handle re-delivery (msg_ns < recv) with ACK-only response - Add TCP Keep-Alive (idle=30, intvl=15, cnt=4) - Add SMP PING/PONG keepalive (30s interval) - Subscribe Reply Queue on main socket after 42d handshake Root cause: Server batches SUB OK + pending MSG in single block with txCount=2. Parser expected txCount==1, discarding the batch. MSG in TX2 was never read, server waited for ACK indefinitely. Session 31 - Six fixes from zero frames to bidirectional chat.
1 parent eddcd1a commit 76aa7c4

5 files changed

Lines changed: 369 additions & 58 deletions

File tree

docs/protocol-analysis/files.zip

-64.3 KB
Binary file not shown.

main/core/smp_tasks.c

Lines changed: 132 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ static uint8_t s_session_id[32]; // Phase 3 T4b: session ID for ACK signing
4747
// Phase 3 T3: Post-confirmation state (set by Reply Queue decrypt, read by 42d handler later)
4848
static uint8_t s_peer_sender_auth_key[44];
4949
static bool s_has_peer_sender_auth = false;
50+
static bool s_42d_completed = false; // Prevent 42d from running more than once
5051

5152
// Helper: log both internal and PSRAM heap
5253
static void log_heap(const char *label)
@@ -86,6 +87,15 @@ static void network_task(void *arg)
8687
ESP_LOGI(TAG, "Network task: socket timeout set to 1s");
8788
}
8889

90+
// PING keepalive state
91+
TickType_t last_ping_tick = 0;
92+
TickType_t last_pong_tick = 0;
93+
bool first_pong_logged = false;
94+
bool first_ping_sent = false;
95+
bool pong_timeout_warned = false;
96+
const TickType_t ping_interval_ticks = pdMS_TO_TICKS(30000); // 30s
97+
const TickType_t pong_timeout_ticks = pdMS_TO_TICKS(60000); // 60s warning
98+
8999
while (1) {
90100
// T6-Debug: Heartbeat counter
91101
static int loop_count = 0;
@@ -97,15 +107,81 @@ static void network_task(void *arg)
97107
// === 1. SSL READ (T1, timeout reduced from 5000 to 1000 for T4c) ===
98108
int content_len = smp_read_block(s_ssl, block, 1000);
99109

110+
// RAW FRAME DIAGNOSTIK: Hex dump BEFORE any parsing/filtering
111+
if (content_len > 0) {
112+
int dump_len = content_len < 64 ? content_len : 64;
113+
ESP_LOGI("RAW", "sock %d recv %d bytes:", s_sock_fd, content_len);
114+
ESP_LOG_BUFFER_HEX_LEVEL("RAW", block + 2, dump_len, ESP_LOG_INFO);
115+
}
116+
100117
if (content_len > 0) {
101-
ESP_LOGI(TAG, "Network task: Frame received, content_len=%d, writing to ring buffer",
102-
content_len);
103-
104-
BaseType_t sent = xRingbufferSend(net_to_app_buf, block,
105-
content_len + 2,
106-
pdMS_TO_TICKS(1000));
107-
if (sent != pdTRUE) {
108-
ESP_LOGW(TAG, "Network task: Ring buffer full, frame dropped!");
118+
// PONG detection: parse frame before forwarding
119+
bool is_pong = false;
120+
{
121+
uint8_t *resp = block + 2;
122+
int p = 0;
123+
if (p + 3 <= content_len) {
124+
p += 3; // txCount + 2 skip bytes
125+
if (p < content_len) {
126+
int authLen = resp[p++];
127+
if (p + authLen <= content_len) {
128+
p += authLen;
129+
// v7: no sessLen
130+
if (p < content_len) {
131+
int corrLen = resp[p++];
132+
if (p + corrLen <= content_len) {
133+
p += corrLen;
134+
if (p < content_len) {
135+
int entLen = resp[p++];
136+
if (p + entLen <= content_len) {
137+
p += entLen;
138+
if (p + 4 <= content_len &&
139+
resp[p] == 'P' && resp[p+1] == 'O' &&
140+
resp[p+2] == 'N' && resp[p+3] == 'G') {
141+
is_pong = true;
142+
}
143+
}
144+
}
145+
}
146+
}
147+
}
148+
}
149+
}
150+
}
151+
152+
if (is_pong) {
153+
last_pong_tick = xTaskGetTickCount();
154+
float delta_s = first_ping_sent
155+
? (float)(last_pong_tick - last_ping_tick) / (float)configTICK_RATE_HZ
156+
: 0.0f;
157+
ESP_LOGI("PING", "PONG received on sock %d (%.1fs after PING)",
158+
s_sock_fd, delta_s);
159+
pong_timeout_warned = false;
160+
161+
// First PONG: full hex dump for documentation
162+
if (!first_pong_logged) {
163+
first_pong_logged = true;
164+
int dump_len = content_len < 64 ? content_len : 64;
165+
ESP_LOGW("PING", "First PONG frame hex dump (%d bytes):", content_len);
166+
ESP_LOG_BUFFER_HEX_LEVEL("PING", block + 2, dump_len, ESP_LOG_WARN);
167+
}
168+
// Do NOT forward PONG to App Task
169+
} else {
170+
ESP_LOGI(TAG, "Network task: Frame received, content_len=%d, writing to ring buffer",
171+
content_len);
172+
173+
// Log if data frame arrives after PING is active (could be MSG!)
174+
if (first_ping_sent) {
175+
ESP_LOGI("PING", "Data frame on sock %d after PING active (content_len=%d)",
176+
s_sock_fd, content_len);
177+
}
178+
179+
BaseType_t sent = xRingbufferSend(net_to_app_buf, block,
180+
content_len + 2,
181+
pdMS_TO_TICKS(1000));
182+
if (sent != pdTRUE) {
183+
ESP_LOGW(TAG, "Network task: Ring buffer full, frame dropped!");
184+
}
109185
}
110186
} else if (content_len == -2) {
111187
// Timeout — log every 30th
@@ -148,6 +224,45 @@ static void network_task(void *arg)
148224
// Check for more commands
149225
cmd_item = xRingbufferReceive(app_to_net_buf, &cmd_size, 0);
150226
}
227+
228+
// === 3. PING KEEPALIVE ===
229+
{
230+
TickType_t now = xTaskGetTickCount();
231+
232+
if (now - last_ping_tick >= ping_interval_ticks) {
233+
// SMP PING: [sigLen=0][corrIdLen=24][24B random corrId][entityIdLen=0]["PING"]
234+
uint8_t ping_trans[30];
235+
int pp = 0;
236+
ping_trans[pp++] = 0; // no signature
237+
ping_trans[pp++] = 24; // corrId length = 0x18
238+
uint8_t ping_corr[24];
239+
esp_fill_random(ping_corr, 24);
240+
memcpy(&ping_trans[pp], ping_corr, 24);
241+
pp += 24;
242+
ping_trans[pp++] = 0; // no entityId
243+
ping_trans[pp++] = 'P';
244+
ping_trans[pp++] = 'I';
245+
ping_trans[pp++] = 'N';
246+
ping_trans[pp++] = 'G';
247+
248+
smp_write_command_block(s_ssl, block, ping_trans, pp);
249+
last_ping_tick = now;
250+
first_ping_sent = true;
251+
pong_timeout_warned = false;
252+
253+
ESP_LOGI("PING", "PING sent on sock %d (tick=%lu)",
254+
s_sock_fd, (unsigned long)now);
255+
}
256+
257+
// Warn once if no PONG within 60s
258+
if (first_ping_sent && !pong_timeout_warned &&
259+
last_pong_tick < last_ping_tick &&
260+
now - last_ping_tick >= pong_timeout_ticks) {
261+
ESP_LOGW("PING", "No PONG for 60s! Server may have dropped sub. sock=%d",
262+
s_sock_fd);
263+
pong_timeout_warned = true;
264+
}
265+
}
151266
}
152267

153268
ESP_LOGW(TAG, "Network task: SSL read loop ended, cleaning up");
@@ -352,7 +467,8 @@ void smp_app_run(QueueHandle_t kbd_queue)
352467
log_heap("after_rq_decrypt");
353468

354469
// === T4e: 42d Post-Confirmation Block ===
355-
if (s_has_peer_sender_auth) {
470+
if (s_has_peer_sender_auth && !s_42d_completed) {
471+
s_42d_completed = true; // Prevent re-trigger from re-deliveries
356472
ESP_LOGI(TAG_APP, "APP: 42d — Starting post-confirmation handshake");
357473

358474
// 1. Reconnect Reply Queue (eigene SSL-Verbindung, nicht Haupt-SSL)
@@ -448,6 +564,13 @@ void smp_app_run(QueueHandle_t kbd_queue)
448564

449565
skip_42d_app: ;
450566

567+
// ROOT CAUSE FIX: Close sock 56 so server delivers MSGs to sock 54!
568+
// Without this, server sees TWO active SUBs on Reply Queue
569+
// (sock 56 from queue_reconnect + sock 54 from subscribe_all)
570+
// and may deliver MSGs to sock 56, which nobody reads.
571+
queue_disconnect();
572+
ESP_LOGW(TAG_APP, "APP: 42d — sock 56 closed, MSGs will arrive on sock 54");
573+
451574
log_heap("after_42d");
452575
}
453576

main/net/smp_network.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "smp_types.h"
88
#include <string.h>
99
#include <sys/socket.h>
10+
#include <netinet/tcp.h>
1011
#include <netdb.h>
1112
#include <errno.h>
1213
#include <unistd.h>
@@ -46,6 +47,17 @@ int smp_tcp_connect(const char *host, int port) {
4647
return -1;
4748
}
4849

50+
// TCP Keep-Alive (matches Haskell client: keepIdle=30, keepIntvl=15, keepCnt=4)
51+
int keepalive = 1;
52+
setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive));
53+
int keepidle = 30;
54+
setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &keepidle, sizeof(keepidle));
55+
int keepintvl = 15;
56+
setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &keepintvl, sizeof(keepintvl));
57+
int keepcnt = 4;
58+
setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt));
59+
ESP_LOGI(TAG, "TCP Keep-Alive set: idle=%d, intvl=%d, cnt=%d", keepidle, keepintvl, keepcnt);
60+
4961
struct timeval tv;
5062
tv.tv_sec = 15;
5163
tv.tv_usec = 0;
@@ -59,6 +71,7 @@ int smp_tcp_connect(const char *host, int port) {
5971
}
6072

6173
freeaddrinfo(res);
74+
ESP_LOGI(TAG, "TCP connected: sock %d to %s:%d", sock, host, port);
6275
return sock;
6376
}
6477

main/protocol/smp_ratchet.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,16 @@ int ratchet_decrypt_body(ratchet_decrypt_mode_t mode,
814814
recv_chain_key[4], recv_chain_key[5], recv_chain_key[6], recv_chain_key[7]);
815815
}
816816

817+
// Re-delivery detection: In SAME mode, if msg_ns < msg_num_recv,
818+
// this message was already processed. Return early without decrypting.
819+
if (mode == RATCHET_MODE_SAME && msg_ns < ratchet_state.msg_num_recv) {
820+
ESP_LOGW(TAG, "");
821+
ESP_LOGW(TAG, " ⚠️ Re-delivery detected: msg_ns=%u < msg_num_recv=%u",
822+
msg_ns, ratchet_state.msg_num_recv);
823+
ESP_LOGW(TAG, " Skipping decrypt, caller should ACK and ignore.");
824+
return -10; // RE_DELIVERY
825+
}
826+
817827
// SCHRITT 3: Chain KDF
818828
// ADVANCE: chain starts at position 0, skip 0..msg_ns-1 (absolute)
819829
// SAME: chain starts at position msg_num_recv, skip msg_num_recv..msg_ns-1 (relative)

0 commit comments

Comments
 (0)