Skip to content

Commit db84a7c

Browse files
peaseelukekim
authored andcommitted
refactor: Separate a DataStorage and ETL Sink (#63)
* refactor: Separate a DataStorage and ETL Sink * chore: fmt
1 parent 5eee64f commit db84a7c

14 files changed

Lines changed: 256 additions & 304 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/data-generation/src/generator.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,19 @@ use tokio::task::JoinSet;
2424
use super::config::IngestorConfig;
2525
use super::dataset::Dataset;
2626
use super::metrics::{IngestResult, Metrics};
27-
use super::target::Target;
27+
use super::storage::DataStorage;
2828

2929
pub struct DataGenerator {
3030
dataset: Arc<dyn Dataset>,
31-
target: Arc<dyn Target>,
31+
target: Arc<dyn DataStorage>,
3232
metrics: Metrics,
3333
semaphore: Arc<Semaphore>,
3434
}
3535

3636
impl DataGenerator {
3737
pub fn new(
3838
dataset: Arc<dyn Dataset>,
39-
target: Arc<dyn Target>,
39+
target: Arc<dyn DataStorage>,
4040
config: &IngestorConfig,
4141
metrics: Metrics,
4242
) -> Self {

crates/data-generation/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,4 @@ pub mod config;
1818
pub mod dataset;
1919
pub mod generator;
2020
pub mod metrics;
21-
pub mod source;
2221
pub mod storage;
23-
pub mod target;

crates/data-generation/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ limitations under the License.
1515
*/
1616

1717
use clap::Parser;
18-
use data_generation::target::Target;
18+
use data_generation::storage::DataStorage;
1919
use tracing_subscriber::EnvFilter;
2020

2121
use std::sync::Arc;
@@ -74,7 +74,7 @@ fn build(args: &CommonArgs) -> anyhow::Result<DataGenerator> {
7474

7575
let ingestor = DataGenerator::new(
7676
dataset,
77-
target as Arc<dyn Target>,
77+
target as Arc<dyn DataStorage>,
7878
&ingestor_config,
7979
metrics,
8080
);

crates/data-generation/src/metrics.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use std::sync::Arc;
1818
use std::sync::atomic::{AtomicU64, Ordering};
1919
use std::time::{Duration, Instant};
2020

21-
use crate::target::WriteResult;
21+
use crate::storage::WriteResult;
2222

2323
#[derive(Clone)]
2424
pub struct Metrics {

crates/data-generation/src/source/mod.rs

Lines changed: 0 additions & 46 deletions
This file was deleted.

crates/data-generation/src/source/s3.rs

Lines changed: 0 additions & 72 deletions
This file was deleted.

crates/data-generation/src/storage/mod.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,54 @@ limitations under the License.
1515
*/
1616

1717
pub mod s3;
18+
19+
use arrow::array::RecordBatch;
20+
use async_trait::async_trait;
21+
use std::collections::HashMap;
22+
23+
pub struct ReadResult {
24+
pub batches: Vec<RecordBatch>,
25+
pub rows_read: u64,
26+
pub bytes_read: u64,
27+
}
28+
29+
pub struct WriteResult {
30+
pub rows_written: u64,
31+
pub bytes_written: u64,
32+
}
33+
34+
#[async_trait]
35+
pub trait DataStorage: Send + Sync + 'static {
36+
/// List available batch object paths for a given table.
37+
async fn list_batches(&self, table_name: &str) -> anyhow::Result<Vec<String>>;
38+
39+
/// Read a single batch from the source by its batch ID and table name.
40+
///
41+
/// Returns `Ok(None)` when the batch does not exist in the underlying
42+
/// storage (e.g. the table has fewer batches than others). The caller
43+
/// should treat this as the table having no more data.
44+
///
45+
/// The concrete implementation is responsible for mapping `(table_name,
46+
/// batch_id)` to the underlying storage path.
47+
async fn read_batch(
48+
&self,
49+
table_name: &str,
50+
batch_id: u64,
51+
) -> anyhow::Result<Option<ReadResult>>;
52+
53+
async fn write(
54+
&self,
55+
table_name: &str,
56+
batch_id: u64,
57+
batch: RecordBatch,
58+
) -> anyhow::Result<WriteResult>;
59+
60+
fn table_params(&self, table_name: &str) -> HashMap<String, serde_json::Value>;
61+
62+
/// Returns the list of file paths/URIs that would exist after a successful
63+
/// generation for the given table and batch IDs.
64+
///
65+
/// This is a planning method — no I/O is performed. Each implementation
66+
/// maps `(table_name, batch_id)` to its own path scheme (e.g. an S3 URI).
67+
fn expected_files(&self, table_name: &str, batch_ids: &[u64]) -> Vec<String>;
68+
}

crates/data-generation/src/storage/s3.rs

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,19 @@ use object_store::aws::AmazonS3Builder;
2121
use object_store::path::Path as ObjectPath;
2222

2323
use crate::config::TargetConfig;
24+
use crate::storage::DataStorage;
25+
26+
use arrow::array::RecordBatch;
27+
use async_trait::async_trait;
28+
use futures::TryStreamExt;
29+
use object_store::PutPayload;
30+
use parquet::arrow::ArrowWriter;
31+
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
32+
use parquet::basic::Compression;
33+
use parquet::file::properties::WriterProperties;
34+
use std::collections::HashMap;
35+
36+
use super::{ReadResult, WriteResult};
2437

2538
/// Unified S3 storage backend that implements both [`Source`] and [`Target`].
2639
///
@@ -90,3 +103,125 @@ impl S3Storage {
90103
}
91104
}
92105
}
106+
107+
#[async_trait]
108+
impl DataStorage for S3Storage {
109+
fn expected_files(&self, table_name: &str, batch_ids: &[u64]) -> Vec<String> {
110+
batch_ids
111+
.iter()
112+
.map(|id| {
113+
if self.prefix.is_empty() {
114+
format!("s3://{}/{table_name}/batch-{id:06}.parquet", self.bucket)
115+
} else {
116+
format!(
117+
"s3://{}/{}/{table_name}/batch-{id:06}.parquet",
118+
self.bucket, self.prefix
119+
)
120+
}
121+
})
122+
.collect()
123+
}
124+
125+
fn table_params(&self, table_name: &str) -> HashMap<String, serde_json::Value> {
126+
let mut params = HashMap::new();
127+
params.insert(
128+
"connector".to_string(),
129+
serde_json::Value::String("s3".to_string()),
130+
);
131+
params.insert(
132+
"from".to_string(),
133+
serde_json::Value::String(self.table_s3_path(table_name)),
134+
);
135+
params.insert(
136+
"file_format".to_string(),
137+
serde_json::Value::String("parquet".to_string()),
138+
);
139+
140+
if let Some(region) = &self.region {
141+
params.insert(
142+
"s3_region".to_string(),
143+
serde_json::Value::String(region.clone()),
144+
);
145+
}
146+
147+
params
148+
}
149+
150+
async fn write(
151+
&self,
152+
table_name: &str,
153+
batch_id: u64,
154+
batch: RecordBatch,
155+
) -> anyhow::Result<WriteResult> {
156+
let rows = batch.num_rows() as u64;
157+
let schema = batch.schema();
158+
159+
// Serialize RecordBatch to Parquet bytes in memory
160+
let props = WriterProperties::builder()
161+
.set_compression(Compression::SNAPPY)
162+
.build();
163+
164+
let mut buf = Vec::new();
165+
let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(props))?;
166+
writer.write(&batch)?;
167+
writer.close()?;
168+
169+
let bytes_written = buf.len() as u64;
170+
171+
// Upload to S3 with per-table directory structure
172+
let path = self.batch_object_path(table_name, batch_id);
173+
174+
self.store.put(&path, PutPayload::from(buf)).await?;
175+
176+
Ok(WriteResult {
177+
rows_written: rows,
178+
bytes_written,
179+
})
180+
}
181+
182+
async fn list_batches(&self, table_name: &str) -> anyhow::Result<Vec<String>> {
183+
let prefix = self.table_object_prefix(table_name);
184+
185+
let objects: Vec<_> = self.store.list(Some(&prefix)).try_collect().await?;
186+
187+
let paths: Vec<String> = objects
188+
.into_iter()
189+
.filter(|meta| meta.location.as_ref().ends_with(".parquet"))
190+
.map(|meta| meta.location.to_string())
191+
.collect();
192+
193+
Ok(paths)
194+
}
195+
196+
async fn read_batch(
197+
&self,
198+
table_name: &str,
199+
batch_id: u64,
200+
) -> anyhow::Result<Option<ReadResult>> {
201+
let location = self.batch_object_path(table_name, batch_id);
202+
203+
let get_result = match self.store.get(&location).await {
204+
Ok(r) => r,
205+
Err(object_store::Error::NotFound { .. }) => return Ok(None),
206+
Err(e) => return Err(e.into()),
207+
};
208+
let bytes = get_result.bytes().await?;
209+
let bytes_read = bytes.len() as u64;
210+
211+
let reader = ParquetRecordBatchReaderBuilder::try_new(bytes)?.build()?;
212+
213+
let mut batches = Vec::new();
214+
let mut rows_read = 0u64;
215+
for batch in reader {
216+
let batch = batch?;
217+
rows_read += batch.num_rows() as u64;
218+
batches.push(batch);
219+
}
220+
221+
Ok(Some(ReadResult {
222+
batches,
223+
rows_read,
224+
bytes_read,
225+
}))
226+
}
227+
}

0 commit comments

Comments
 (0)