
Commit c7886ea

feat: sql jobs outputting to s3 + streaming for high-number of rows (#5704)
* stream to s3 boilerplate
* S3 works with new syntax
* snowflake s3 streaming support
* postgres s3 support
* fix postgres stream format
* mysql s3 streaming
* mssql s3 streaming
* new s3 mode syntax
* optional folder param
* rename folder to prefix
* json_stream_arr_values
* cargo toml rollback
* convert_ndjson with datafusion
* format conversion kinda works
* Fixed not finishing the datafusion writer
* support for pg and mssql
* fix file ext
* bigquery conversion and works with s3 streaming
* fix s3 flag parser
* snowflake s3 streaming support
* factor out duplicate code
* remove anyhow
* Err case for parse s3 mode
* Send error to mpsc
* bigquery s3 streaming fix for huge queries
* remove extra stuff
* snowflake s3 streaming support
* small regex mistake
* cfg(not(feature = "parquet"))
* fix CI (unused import)
* error handling fix (graphite)
1 parent 76258b7 commit c7886ea

File tree

12 files changed: +718 -144 lines changed


backend/Cargo.lock

Lines changed: 5 additions & 0 deletions

backend/parsers/windmill-parser-sql/src/lib.rs

Lines changed: 55 additions & 0 deletions
@@ -120,6 +120,60 @@ pub fn parse_db_resource(code: &str) -> Option<String> {
     cap.map(|x| x.get(1).map(|x| x.as_str().to_string()).unwrap())
 }
 
+#[derive(Clone, Copy, Debug)]
+pub enum S3ModeFormat {
+    Json,
+    Csv,
+    Parquet,
+}
+pub fn s3_mode_extension(format: S3ModeFormat) -> &'static str {
+    match format {
+        S3ModeFormat::Json => "json",
+        S3ModeFormat::Csv => "csv",
+        S3ModeFormat::Parquet => "parquet",
+    }
+}
+pub struct S3ModeArgs {
+    pub prefix: Option<String>,
+    pub storage: Option<String>,
+    pub format: S3ModeFormat,
+}
+pub fn parse_s3_mode(code: &str) -> anyhow::Result<Option<S3ModeArgs>> {
+    let cap = match RE_S3_MODE.captures(code) {
+        Some(x) => x,
+        None => return Ok(None),
+    };
+    let args_str = cap
+        .get(1)
+        .map(|x| x.as_str().to_string())
+        .unwrap_or_default();
+
+    let mut prefix = None;
+    let mut storage = None;
+    let mut format = S3ModeFormat::Json;
+
+    for kv in args_str.split(' ').map(|kv| kv.trim()) {
+        if kv.is_empty() {
+            continue;
+        }
+        let mut it = kv.split('=');
+        let (Some(key), Some(value)) = (it.next(), it.next()) else {
+            return Err(anyhow!("Invalid S3 mode argument: {}", kv));
+        };
+        match (key.trim(), value.trim()) {
+            ("prefix", _) => prefix = Some(value.to_string()),
+            ("storage", _) => storage = Some(value.to_string()),
+            ("format", "json") => format = S3ModeFormat::Json,
+            ("format", "parquet") => format = S3ModeFormat::Parquet,
+            ("format", "csv") => format = S3ModeFormat::Csv,
+            ("format", format) => return Err(anyhow!("Invalid S3 mode format: {}", format)),
+            (_, _) => return Err(anyhow!("Invalid S3 mode argument: {}", kv)),
+        }
+    }
+
+    Ok(Some(S3ModeArgs { prefix, storage, format }))
+}
+
 pub fn parse_sql_blocks(code: &str) -> Vec<&str> {
     let mut blocks = vec![];
     let mut last_idx = 0;
@@ -147,6 +201,7 @@ lazy_static::lazy_static! {
     static ref RE_NONEMPTY_SQL_BLOCK: Regex = Regex::new(r#"(?m)^\s*[^\s](?:[^-]|$)"#).unwrap();
 
     static ref RE_DB: Regex = Regex::new(r#"(?m)^-- database (\S+) *(?:\r|\n|$)"#).unwrap();
+    static ref RE_S3_MODE: Regex = Regex::new(r#"(?m)^-- s3( (.+))? *(?:\r|\n|$)"#).unwrap();
 
     // -- $1 name (type) = default
     static ref RE_ARG_MYSQL: Regex = Regex::new(r#"(?m)^-- \? (\w+) \((\w+)\)(?: ?\= ?(.+))? *(?:\r|\n|$)"#).unwrap();
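
Taken together, these parser additions let a SQL script opt into S3 output via a `-- s3` header line matched by `RE_S3_MODE`. A minimal usage sketch (not part of this commit; the header values, the standalone `main`, and the direct `anyhow` dependency in the caller are illustrative only):

```rust
// Hypothetical caller of the parser added above; header values are examples.
use windmill_parser_sql::{parse_s3_mode, s3_mode_extension};

fn main() -> anyhow::Result<()> {
    let code = r#"
-- s3 format=csv prefix=exports/daily storage=secondary
-- database f/examples/pg
SELECT * FROM big_table;
"#;

    if let Some(args) = parse_s3_mode(code)? {
        // `prefix` and `storage` are optional; `format` defaults to Json when omitted.
        println!(
            "streaming to s3: prefix={:?} storage={:?} ext={}",
            args.prefix,
            args.storage,
            s3_mode_extension(args.format)
        );
    } else {
        // No `-- s3` header: results are returned inline as before.
        println!("regular result handling");
    }
    Ok(())
}
```

Per the `match` in `parse_s3_mode`, an unrecognized key or format value is rejected with an error, while a bare `-- s3` line falls back to `S3ModeFormat::Json`.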

backend/windmill-common/Cargo.toml

Lines changed: 5 additions & 1 deletion
@@ -12,7 +12,7 @@ tantivy = []
 prometheus = ["dep:prometheus"]
 loki = ["dep:tracing-loki"]
 benchmark = []
-parquet = ["dep:object_store", "dep:aws-config", "dep:aws-sdk-sts"]
+parquet = ["dep:object_store", "dep:aws-config", "dep:aws-sdk-sts", "dep:datafusion"]
 aws_auth = ["dep:aws-sdk-sts", "dep:aws-config"]
 otel = ["dep:opentelemetry-semantic-conventions", "dep:opentelemetry-otlp", "dep:opentelemetry_sdk",
 "dep:opentelemetry", "dep:tracing-opentelemetry", "dep:opentelemetry-appender-tracing", "dep:tonic"]
@@ -44,6 +44,9 @@ tracing = { workspace = true }
 axum = { workspace = true }
 hyper = { workspace = true }
 tokio = { workspace = true }
+tokio-stream.workspace = true
+tokio-util.workspace = true
+datafusion = { workspace = true, optional = true}
 reqwest = { workspace = true }
 tracing-subscriber = { workspace = true }
 lazy_static.workspace = true
@@ -67,6 +70,7 @@ async-stream.workspace = true
 const_format.workspace = true
 crc.workspace = true
 windmill-macros.workspace = true
+windmill-parser-sql.workspace = true
 jsonwebtoken.workspace = true
 backon.workspace = true
backend/windmill-common/src/s3_helpers.rs

Lines changed: 207 additions & 1 deletion
@@ -16,10 +16,35 @@ use object_store::{aws::AmazonS3Builder, ClientOptions};
 use reqwest::header::HeaderMap;
 use serde::{Deserialize, Serialize};
 #[cfg(feature = "parquet")]
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 #[cfg(feature = "parquet")]
 use tokio::sync::RwLock;
 
+#[cfg(feature = "parquet")]
+use crate::error::to_anyhow;
+#[cfg(feature = "parquet")]
+use crate::utils::rd_string;
+#[cfg(feature = "parquet")]
+use bytes::Bytes;
+#[cfg(feature = "parquet")]
+use datafusion::arrow::array::{RecordBatch, RecordBatchWriter};
+#[cfg(feature = "parquet")]
+use datafusion::arrow::error::ArrowError;
+#[cfg(feature = "parquet")]
+use datafusion::arrow::json::writer::JsonArray;
+#[cfg(feature = "parquet")]
+use datafusion::arrow::{csv, json};
+#[cfg(feature = "parquet")]
+use datafusion::parquet::arrow::ArrowWriter;
+#[cfg(feature = "parquet")]
+use futures::TryStreamExt;
+#[cfg(feature = "parquet")]
+use std::io::Write;
+#[cfg(feature = "parquet")]
+use tokio::task;
+#[cfg(feature = "parquet")]
+use windmill_parser_sql::S3ModeFormat;
+
 #[cfg(feature = "parquet")]
 lazy_static::lazy_static! {
 
@@ -480,3 +505,184 @@ pub fn bundle(w_id: &str, hash: &str) -> String {
 pub fn raw_app(w_id: &str, version: &i64) -> String {
     format!("/home/rfiszel/raw_app/{}/{}", w_id, version)
 }
+
+// Originally used a Arc<Mutex<dyn RecordBatchWriter + Send>>
+// But cannot call .close() on it because it moves the value and the object is not Sized
+#[cfg(feature = "parquet")]
+enum RecordBatchWriterEnum {
+    Parquet(ArrowWriter<ChannelWriter>),
+    Csv(csv::Writer<ChannelWriter>),
+    Json(json::Writer<ChannelWriter, JsonArray>),
+}
+
+#[cfg(feature = "parquet")]
+impl RecordBatchWriter for RecordBatchWriterEnum {
+    fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
+        match self {
+            RecordBatchWriterEnum::Parquet(w) => w.write(batch).map_err(|e| e.into()),
+            RecordBatchWriterEnum::Csv(w) => w.write(batch),
+            RecordBatchWriterEnum::Json(w) => w.write(batch),
+        }
+    }
+
+    fn close(self) -> Result<(), ArrowError> {
+        match self {
+            RecordBatchWriterEnum::Parquet(w) => w.close().map_err(|e| e.into()).map(drop),
+            RecordBatchWriterEnum::Csv(w) => w.close(),
+            RecordBatchWriterEnum::Json(w) => w.close(),
+        }
+    }
+}
+
+#[cfg(feature = "parquet")]
+struct ChannelWriter {
+    sender: tokio::sync::mpsc::Sender<anyhow::Result<Bytes>>,
+}
+
+#[cfg(feature = "parquet")]
+impl Write for ChannelWriter {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        let data: Bytes = buf.to_vec().into();
+        self.sender.blocking_send(Ok(data)).map_err(|e| {
+            std::io::Error::new(
+                std::io::ErrorKind::BrokenPipe,
+                format!("Channel send error: {}", e),
+            )
+        })?;
+        Ok(buf.len())
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        Ok(())
+    }
+}
+
+#[cfg(not(feature = "parquet"))]
+pub async fn convert_json_line_stream<E: Into<anyhow::Error>>(
+    mut _stream: impl futures::TryStreamExt<Item = Result<serde_json::Value, E>> + Unpin,
+    _output_format: windmill_parser_sql::S3ModeFormat,
+) -> anyhow::Result<impl futures::TryStreamExt<Item = anyhow::Result<bytes::Bytes>>> {
+    Ok(async_stream::stream! {
+        yield Err(anyhow::anyhow!("Parquet feature is not enabled. Cannot convert JSON line stream."));
+    })
+}
+
+#[cfg(feature = "parquet")]
+pub async fn convert_json_line_stream<E: Into<anyhow::Error>>(
+    mut stream: impl TryStreamExt<Item = Result<serde_json::Value, E>> + Unpin,
+    output_format: S3ModeFormat,
+) -> anyhow::Result<impl TryStreamExt<Item = anyhow::Result<bytes::Bytes>>> {
+    const MAX_MPSC_SIZE: usize = 1000;
+
+    use datafusion::{execution::context::SessionContext, prelude::NdJsonReadOptions};
+    use futures::StreamExt;
+    use std::path::PathBuf;
+    use tokio::io::AsyncWriteExt;
+
+    let mut path = PathBuf::from(std::env::temp_dir());
+    path.push(format!("{}.json", rd_string(8)));
+    let path_str = path
+        .to_str()
+        .ok_or_else(|| anyhow::anyhow!("Invalid path"))?;
+
+    // Write the stream to a temporary file
+    let mut file: tokio::fs::File = tokio::fs::File::create(&path).await.map_err(to_anyhow)?;
+
+    while let Some(chunk) = stream.next().await {
+        match chunk {
+            Ok(chunk) => {
+                // Convert the chunk to bytes and write it to the file
+                let b: bytes::Bytes = serde_json::to_string(&chunk)?.into();
+                file.write_all(&b).await?;
+                file.write_all(b"\n").await?;
+            }
+            Err(e) => {
+                tokio::fs::remove_file(&path).await?;
+                return Err(e.into());
+            }
+        }
+    }
+
+    file.flush().await?;
+    file.sync_all().await?;
+    drop(file);
+
+    let ctx = SessionContext::new();
+    ctx.register_json(
+        "my_table",
+        path_str,
+        NdJsonReadOptions { ..Default::default() },
+    )
+    .await
+    .map_err(to_anyhow)?;
+
+    let df = ctx.sql("SELECT * FROM my_table").await.map_err(to_anyhow)?;
+    let schema = df.schema().clone().into();
+    let mut datafusion_stream = df.execute_stream().await.map_err(to_anyhow)?;
+
+    let (tx, rx) = tokio::sync::mpsc::channel(MAX_MPSC_SIZE);
+    let writer: Arc<Mutex<Option<RecordBatchWriterEnum>>> =
+        Arc::new(Mutex::new(Some(match output_format {
+            S3ModeFormat::Parquet => RecordBatchWriterEnum::Parquet(
+                ArrowWriter::try_new(ChannelWriter { sender: tx.clone() }, Arc::new(schema), None)
+                    .map_err(to_anyhow)?,
+            ),
+
+            S3ModeFormat::Csv => {
+                RecordBatchWriterEnum::Csv(csv::Writer::new(ChannelWriter { sender: tx.clone() }))
+            }
+            S3ModeFormat::Json => {
+                RecordBatchWriterEnum::Json(json::Writer::<_, JsonArray>::new(ChannelWriter {
+                    sender: tx.clone(),
+                }))
+            }
+        })));
+
+    // This spawn is so that the data is sent in the background. Else the function would deadlock
+    // when hitting the mpsc channel limit
+    task::spawn(async move {
+        while let Some(batch_result) = datafusion_stream.next().await {
+            let batch: RecordBatch = match batch_result {
+                Ok(batch) => batch,
+                Err(e) => {
+                    tracing::error!("Error in datafusion stream: {:?}", &e);
+                    match tx.send(Err(e.into())).await {
+                        Ok(_) => {}
+                        Err(e) => tracing::error!("Failed to write error to channel: {:?}", &e),
+                    }
+                    break;
+                }
+            };
+            let writer = writer.clone();
+            // Writer calls blocking_send which would crash if called from the async context
+            let write_result = task::spawn_blocking(move || {
+                // SAFETY: We await so the code is actually sequential, lock unwrap cannot panic
+                // Second unwrap is ok because we initialized the option with Some
+                writer.lock().unwrap().as_mut().unwrap().write(&batch)
+            })
+            .await;
+            match write_result {
+                Ok(Ok(_)) => {}
+                Ok(Err(e)) => {
+                    tracing::error!("Error writing batch: {:?}", &e);
+                    match tx.send(Err(e.into())).await {
+                        Ok(_) => {}
                        Err(e) => tracing::error!("Failed to write error to channel: {:?}", &e),
+                    }
+                }
+                Err(e) => tracing::error!("Error in blocking task: {:?}", &e),
+            };
+        }
+        task::spawn_blocking(move || {
+            writer.lock().unwrap().take().unwrap().close()?;
+            drop(writer);
+            Ok::<_, anyhow::Error>(())
+        })
+        .await??;
+        drop(ctx);
+        tokio::fs::remove_file(&path).await?;
+        Ok::<_, anyhow::Error>(())
+    });

+    Ok(tokio_stream::wrappers::ReceiverStream::new(rx))
+}
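
The new `convert_json_line_stream` helper is the piece the per-database executors hand their row streams to. A minimal consumer sketch, assuming `s3_helpers` is exposed as a public module of `windmill-common` built with the `parquet` feature; the sample rows and the `println!` standing in for the actual S3 upload are illustrative:

```rust
// Hypothetical consumer sketch (not part of the diff): the rows and the final
// print are placeholders; real workers pipe the byte stream to object storage.
use futures::{stream, TryStreamExt};
use windmill_common::s3_helpers::convert_json_line_stream;
use windmill_parser_sql::S3ModeFormat;

async fn rows_to_parquet_bytes() -> anyhow::Result<()> {
    // Pretend these rows came from a database driver, one JSON value per row.
    let rows = stream::iter(vec![
        Ok::<_, anyhow::Error>(serde_json::json!({"id": 1, "name": "a"})),
        Ok(serde_json::json!({"id": 2, "name": "b"})),
    ]);

    // Rows are spooled to a temporary NDJSON file, registered in DataFusion,
    // and re-encoded in the requested format (Parquet here).
    let bytes_stream = convert_json_line_stream(rows, S3ModeFormat::Parquet).await?;
    // The signature does not guarantee `Unpin`, so pin before iterating.
    let mut bytes_stream = Box::pin(bytes_stream);

    while let Some(chunk) = bytes_stream.try_next().await? {
        // Each `Bytes` chunk was pushed through the ChannelWriter by the
        // background task; hand it to the uploader here.
        println!("got {} bytes", chunk.len());
    }
    Ok(())
}
```

The temp-file spool plus the background `task::spawn` is what keeps memory bounded for high row counts: the caller only ever sees `Bytes` chunks arriving through the size-limited mpsc channel, never the full result set.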

backend/windmill-worker/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ deno_tls = { workspace = true, optional = true }
 deno_permissions = { workspace = true, optional = true }
 deno_io = { workspace = true, optional = true }
 deno_error = { workspace = true, optional = true }
+async-stream.workspace = true
 
 postgres-native-tls.workspace = true
 native-tls.workspace = true
