Skip to content

Commit 6dfba08

Browse files
alec-flowers and claude committed
feat: configurable NATS max_payload for long-ISL disagg serving
NATS default max_payload is 1MB, which causes request timeouts with disaggregated serving at long ISL (65K+ tokens). The prompt data exceeds 1MB when base64-encoded in the NATS message.

- Add infra.nats_max_payload_mb config option (default: None = NATS 1MB)
- When set, writes /tmp/nats.conf with the custom payload size
- Plumbed through schema -> do_sweep -> setup_head.py CLI arg

Recipe usage:

    infra:
      nats_max_payload_mb: 24  # for long-ISL disagg (65K+ tokens)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 083c68d commit 6dfba08

3 files changed

Lines changed: 29 additions & 5 deletions

File tree

src/srtctl/cli/do_sweep.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ def start_head_infrastructure(self, registry: ProcessRegistry) -> ManagedProcess
108108
"--log-dir",
109109
str(self.runtime.log_dir),
110110
]
111+
if self.config.infra.nats_max_payload_mb is not None:
112+
cmd += ["--nats-max-payload-mb", str(self.config.infra.nats_max_payload_mb)]
111113

112114
mounts = dict(self.runtime.container_mounts)
113115
mounts[setup_script] = setup_script_container

src/srtctl/cli/setup_head.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,13 @@ def setup_logging():
119119
)
120120

121121

122-
def start_nats(binary_path: str = "/configs/nats-server") -> subprocess.Popen:
122+
def start_nats(binary_path: str = "/configs/nats-server", max_payload_mb: int | None = None) -> subprocess.Popen:
123123
"""Start NATS server.
124124
125125
Args:
126126
binary_path: Path to nats-server binary
127+
max_payload_mb: Maximum message payload size in MB, or None for NATS default (1MB).
128+
Set to 24+ for disaggregated serving with long ISL (65K+ tokens).
127129
128130
Returns:
129131
Popen object for the NATS process
@@ -132,14 +134,24 @@ def start_nats(binary_path: str = "/configs/nats-server") -> subprocess.Popen:
132134
raise FileNotFoundError(f"NATS binary not found: {binary_path}")
133135

134136
# Use /tmp for JetStream storage - avoids "Temporary storage directory" warning
135-
# and ensures we're using fast local storage'
137+
# and ensures we're using fast local storage
136138
if os.path.exists("/tmp/nats"):
137139
shutil.rmtree("/tmp/nats")
138140
nats_store_dir = "/tmp/nats"
139141
os.makedirs(nats_store_dir, exist_ok=True)
140142

141-
logger.info("Starting NATS server...")
142-
cmd = [binary_path, "-js", "-sd", nats_store_dir]
143+
if max_payload_mb is not None:
144+
# Write NATS config with custom max_payload
145+
nats_config_path = "/tmp/nats.conf"
146+
max_payload_bytes = max_payload_mb * 1024 * 1024
147+
with open(nats_config_path, "w") as f:
148+
f.write(f"max_payload: {max_payload_bytes}\n")
149+
f.write(f"jetstream {{ store_dir: \"{nats_store_dir}\" }}\n")
150+
logger.info("Starting NATS server (max_payload: %dMB)...", max_payload_mb)
151+
cmd = [binary_path, "-c", nats_config_path]
152+
else:
153+
logger.info("Starting NATS server...")
154+
cmd = [binary_path, "-js", "-sd", nats_store_dir]
143155

144156
proc = subprocess.Popen(
145157
cmd,
@@ -246,6 +258,12 @@ def main():
246258
default="/configs/etcd",
247259
help="Path to etcd binary",
248260
)
261+
parser.add_argument(
262+
"--nats-max-payload-mb",
263+
type=int,
264+
default=None,
265+
help="NATS max message payload in MB (default: NATS default 1MB)",
266+
)
249267

250268
args = parser.parse_args()
251269

@@ -264,7 +282,7 @@ def main():
264282
etcd_proc = None
265283

266284
try:
267-
nats_proc = start_nats(args.nats_binary)
285+
nats_proc = start_nats(args.nats_binary, max_payload_mb=args.nats_max_payload_mb)
268286
etcd_proc = start_etcd(host_ip, args.etcd_binary, log_dir)
269287

270288
# Wait for services

src/srtctl/core/schema.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,9 +865,13 @@ class InfraConfig:
865865
etcd_nats_dedicated_node: If True, run etcd and nats on a dedicated node
866866
instead of the head node. This reserves the first node exclusively
867867
for infrastructure services. Default: False.
868+
nats_max_payload_mb: Maximum NATS message payload in MB. Default: None (uses
869+
NATS default of 1MB). Set to 24+ for disaggregated serving with long ISL
870+
(e.g. 65K+ tokens where prompt data exceeds 1MB in NATS messages).
868871
"""
869872

870873
etcd_nats_dedicated_node: bool = False
874+
nats_max_payload_mb: int | None = None
871875

872876
Schema: ClassVar[type[Schema]] = Schema
873877

0 commit comments

Comments
 (0)