fix(distributed): leaf timeout between requests + streaming space loss

sherm8n · claude · sherm8n · commit e27caa861ee8 · 2026-04-09T20:07:18.000+01:00
Both found by live-testing 2-node native mode on real hardware with
TinyLlama. Both would have hit users on the multi-machine path.

1. Leaf timed out between requests
   recv_tensor had a 300s default applied to every receive, including
   the leaf's "wait for next prompt" call in broadcast_metadata. After
   5 minutes of user think-time the socket fired TimeoutError and the
   leaf loop crashed. Fix: broadcast_metadata and broadcast_metadata_objects
   now pass timeout=None on the receive side — the wait for the next
   user message is unbounded and shouldn't time out.

2. Streaming output had no spaces
   _generate_stream decoded one token at a time and yielded the result.
   For SentencePiece/BPE tokenizers (Llama/Mistral/Qwen), a token's
   leading-space information is lost when it is decoded on its own and
   only survives when multiple tokens are decoded together, so
   per-token decodes produced "Thereare50states" instead of
   "There are 50 states". Fix: track the running token id list, decode
   the full list each step, and yield only the new substring. This is
   the standard incremental-decode pattern used by HF TextStreamer.

Both verified live: leaf survived a multi-minute pause between
requests, and TinyLlama-Chat output now renders with spaces in the
chat UI.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/ravnest/communication/communication_dynamic.py b/ravnest/communication/communication_dynamic.py
@@ -42,8 +42,8 @@ def send_tensor(sock, tensor):
 
     @staticmethod
     def recv_tensor(sock, device="cpu", timeout=300):
-        """Receive a tensor from a socket."""
-        sock.settimeout(timeout)
+        """Receive a tensor from a socket. Pass timeout=None to block indefinitely."""
+        sock.settimeout(timeout)  # None = blocking forever
         header = TensorSocket._recv_exactly(sock, 4)
         if header is None:
             raise ConnectionError("Connection closed while reading header")
@@ -420,8 +420,11 @@ def broadcast_metadata(self, data):
                 if isinstance(key, str) and key.startswith("meta_"):
                     TensorSocket.send_tensor(sock, data)
         else:
-            # Receive from root
-            received = TensorSocket.recv_tensor(self.peers["meta_root"], device=str(self.device))
+            # Receive from root — block forever; user think-time between
+            # requests is unbounded so a fixed timeout isn't appropriate here
+            received = TensorSocket.recv_tensor(
+                self.peers["meta_root"], device=str(self.device), timeout=None
+            )
             data.copy_(received)
 
     def broadcast_metadata_objects(self, data):
@@ -430,7 +433,8 @@ def broadcast_metadata_objects(self, data):
                 if isinstance(key, str) and key.startswith("meta_"):
                     TensorSocket.send_object(sock, data)
         else:
-            received = TensorSocket.recv_object(self.peers["meta_root"])
+            # Block forever waiting for next prompt — user think-time is unbounded
+            received = TensorSocket.recv_object(self.peers["meta_root"], timeout=None)
             for i in range(len(data)):
                 if i < len(received):
                     data[i] = received[i]
diff --git a/ravnest/inference/inference_engine.py b/ravnest/inference/inference_engine.py
@@ -219,6 +219,11 @@ def _generate_stream(self, input_ids=None, max_seq_lengths=None, top_k=1, temper
         num_generated_tokens = 0
         is_generation_done = torch.tensor([False]*bs).to(self.node.device)
         pad_token_tensor = torch.tensor([self.tokenizer.pad_token_id]*bs).to(self.node.device)
+        # Track generated token ids for incremental decoding (BPE/SentencePiece
+        # tokenizers lose leading-space info when decoded one token at a time,
+        # so we decode the running list and yield only the new substring)
+        generated_ids_so_far = []
+        decoded_so_far = ""
         while num_generated_tokens < max_seq_length_in_batch:
             self.comm_session.forward_input_shapes[0][1] = seq_length
 
@@ -261,10 +266,16 @@ def _generate_stream(self, input_ids=None, max_seq_lengths=None, top_k=1, temper
                 new_token_mask = kwargs['attention_mask'].new_ones((bs,1))
                 kwargs['attention_mask'] = torch.cat((kwargs['attention_mask'], new_token_mask), axis=-1)
 
-            # Yield the decoded token for each sequence in the batch
+            # Yield the decoded token for each sequence in the batch.
+            # Decode the full running list and emit only the new substring,
+            # so BPE/SentencePiece leading spaces are preserved.
             if self.node_type != NodeTypes.LEAF:
-                token_text = self.tokenizer.decode(next_token_ids[0].item(), skip_special_tokens=True)
-                yield token_text
+                generated_ids_so_far.append(next_token_ids[0].item())
+                full_decoded = self.tokenizer.decode(generated_ids_so_far, skip_special_tokens=True)
+                if len(full_decoded) > len(decoded_so_far):
+                    delta = full_decoded[len(decoded_so_far):]
+                    decoded_so_far = full_decoded
+                    yield delta
 
             is_generation_done = self.is_generation_complete(is_generation_done, next_token_ids, num_generated_tokens, max_seq_lengths)