fix: per-stage CLI runner deadlocked on output larger than the pipe buffer (issue #4)

SouravRoy-ETL · SouravRoy-ETL · commit 4effb4b77ba3 · 2026-05-29T13:13:52.000+05:30
The per-stage DuckDB runner spawned the CLI with stdout/stderr piped, then
polled try_wait() to completion and only read the output via
wait_with_output() AFTER the process exited. A child process's output pipe
has a fixed OS buffer (~64 KiB on Windows); once the CLI's result exceeded
it, DuckDB blocked writing stdout while the engine blocked waiting for exit.
Permanent deadlock.

This bit the per-stage path's node preview: `SELECT * FROM <node> LIMIT 100`.
For a wide table (a 36-column Oracle date dimension serialized to ~128 KiB of
JSON) it hung every run, on the SOURCE node's preview, before the sink stage
ever executed. Symptoms matched exactly: the source row count was correct
(COUNT(*) is tiny, well under the buffer), the sink produced no file (never
reached), and the temp DB was left on disk (the run never reached cleanup).
SQL Server "worked" only because that table was narrow enough to stay under
the buffer - it was table-width-specific, not Oracle-specific. The batched
(pure-SQL) path was immune because it already drains stdout on a thread.

The runner now drains stdout and stderr on dedicated threads while the
process runs, so any result size completes. Cancellation still kills the
child and joins the readers. Adds a regression test: a wide CSV whose
100-row preview exceeds the pipe buffer, forced onto the per-stage path,
must complete.
diff --git a/crates/duckdb-engine/src/lib.rs b/crates/duckdb-engine/src/lib.rs
@@ -165,35 +165,68 @@ impl DuckdbEngine {
             .spawn()
             .map_err(|e| EngineError::Other(format!("could not start duckdb: {}", e)))?;
 
-        loop {
+        // Drain stdout AND stderr on dedicated threads so the child can
+        // never deadlock against a full OS pipe buffer. The previous code
+        // polled try_wait() to completion and only called wait_with_output()
+        // *after* the process exited - but a Windows anonymous pipe holds
+        // only ~64 KiB, so once DuckDB's result exceeds that it blocks
+        // writing stdout while we block waiting for it to exit. A wide-table
+        // preview (`SELECT * ... LIMIT 100` over ~36 columns is ~128 KiB)
+        // hit this exactly, hanging the whole pipeline on the source node's
+        // preview before it ever reached the sink (issue #4). Concurrent
+        // readers keep the pipe drained regardless of result size.
+        use std::io::Read;
+        let mut stdout_pipe = child
+            .stdout
+            .take()
+            .ok_or_else(|| EngineError::Other("duckdb stdout not captured".into()))?;
+        let mut stderr_pipe = child
+            .stderr
+            .take()
+            .ok_or_else(|| EngineError::Other("duckdb stderr not captured".into()))?;
+        let stdout_reader = std::thread::spawn(move || {
+            let mut buf = Vec::new();
+            let _ = stdout_pipe.read_to_end(&mut buf);
+            buf
+        });
+        let stderr_reader = std::thread::spawn(move || {
+            let mut buf = Vec::new();
+            let _ = stderr_pipe.read_to_end(&mut buf);
+            buf
+        });
+
+        let status = loop {
             match child.try_wait() {
-                Ok(Some(_)) => break,
+                Ok(Some(s)) => break s,
                 Ok(None) => {
                     if self.cancel.load(Ordering::Relaxed) {
                         let _ = child.kill();
                         let _ = child.wait();
+                        // Killing closes the pipes, so the reader threads
+                        // unblock; join them so their handles are released.
+                        let _ = stdout_reader.join();
+                        let _ = stderr_reader.join();
                         return Err(EngineError::Cancelled);
                     }
                     std::thread::sleep(std::time::Duration::from_millis(40));
                 }
                 Err(e) => return Err(EngineError::Other(e.to_string())),
             }
-        }
+        };
 
-        let out = child
-            .wait_with_output()
-            .map_err(|e| EngineError::Other(e.to_string()))?;
-        if !out.status.success() {
-            let mut msg = String::from_utf8_lossy(&out.stderr).trim().to_string();
+        let stdout_bytes = stdout_reader.join().unwrap_or_default();
+        let stderr_bytes = stderr_reader.join().unwrap_or_default();
+        if !status.success() {
+            let mut msg = String::from_utf8_lossy(&stderr_bytes).trim().to_string();
             if msg.is_empty() {
-                msg = String::from_utf8_lossy(&out.stdout).trim().to_string();
+                msg = String::from_utf8_lossy(&stdout_bytes).trim().to_string();
             }
             if msg.is_empty() {
                 msg = "DuckDB CLI exited with an error".into();
             }
             return Err(EngineError::Query(msg));
         }
-        Ok(String::from_utf8_lossy(&out.stdout).into_owned())
+        Ok(String::from_utf8_lossy(&stdout_bytes).into_owned())
     }
 
     /// Run SQL and return the first JSON array of rows it printed
diff --git a/crates/duckdb-engine/tests/execution.rs b/crates/duckdb-engine/tests/execution.rs
@@ -237,6 +237,58 @@ fn csv_to_csv_roundtrip_preserves_rows() {
     assert_eq!(count(&format!("read_csv_auto('{}')", out)), 3);
 }
 
+#[test]
+fn per_stage_wide_preview_does_not_deadlock() {
+    // Regression for issue #4: the per-stage CLI runner buffered stdout
+    // in the OS pipe and only read it after the process exited. A wide
+    // node preview (`SELECT * ... LIMIT 100`) whose JSON exceeds the
+    // ~64 KiB Windows pipe buffer deadlocked - DuckDB blocked writing
+    // stdout while the engine blocked waiting for exit - hanging the
+    // whole pipeline on the source node's preview, before the sink ever
+    // ran. (An Oracle date-dimension with 36 columns produced a ~128 KiB
+    // preview and hit this every time.) The runner now drains stdout +
+    // stderr concurrently, so any result size completes.
+    //
+    // Reproduced here without a driver source: a wide CSV (its 100-row
+    // preview is ~150 KiB) plus memoryLimitMb on a node, which forces
+    // the per-stage path (the batched path drains on a thread already).
+    let tmp = tempfile::tempdir().unwrap();
+    let cols = 8usize;
+    let rows = 200usize;
+    let cell = "x".repeat(200); // 200-char cells -> ~1.6 KiB/row
+    let mut csv = String::new();
+    csv.push_str(
+        &(0..cols)
+            .map(|c| format!("c{}", c))
+            .collect::<Vec<_>>()
+            .join(","),
+    );
+    csv.push('\n');
+    for _ in 0..rows {
+        csv.push_str(
+            &(0..cols).map(|_| cell.as_str()).collect::<Vec<_>>().join(","),
+        );
+        csv.push('\n');
+    }
+    let in_path = write_file(tmp.path(), "wide.csv", &csv);
+    let out = out_path(tmp.path(), "wide_out.csv");
+
+    let engine = engine_or_skip!();
+    let d = doc(
+        json!([
+            node("s1", "src.csv", json!({ "path": in_path, "hasHeader": true })),
+            // memoryLimitMb forces the per-stage path (where the buggy
+            // runner lived); the value itself is irrelevant to the test.
+            node("k1", "snk.csv", json!({ "path": out, "hasHeader": true, "memoryLimitMb": 512 })),
+        ]),
+        json!([main_edge("e1", "s1", "k1")]),
+    );
+    let result = engine.execute_pipeline(&d);
+    assert_eq!(result.status, "ok", "wide per-stage run failed/hung: {:?}", result.error);
+    assert!(Path::new(&out).exists());
+    assert_eq!(count(&format!("read_csv_auto('{}')", out)), rows as i64);
+}
+
 #[test]
 fn aggregate_groups_and_sums() {
     let tmp = tempfile::tempdir().unwrap();