Skip to content

Commit 35e5259

Browse files
Move to obj store for tpch gen (#323)
1 parent b86ea0b commit 35e5259

12 files changed

Lines changed: 374 additions & 199 deletions

File tree

.github/actions/setup-rust/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ runs:
3737
shell: bash
3838
run: |
3939
cargo install taplo-cli --locked
40-
cargo install cargo-machete
40+
cargo install cargo-machete just

.github/workflows/test.yml

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ on: [push, pull_request]
2020

2121
jobs:
2222
test:
23-
name: Core
23+
name: DB
2424
runs-on: ubuntu-latest
2525
strategy:
2626
matrix:
@@ -43,7 +43,7 @@ jobs:
4343
uses: ./.github/actions/setup-rust
4444
- name: Run core tests
4545
run: |
46-
cargo test
46+
cargo test db
4747
test-flightsql:
4848
name: Extension / FlightSQL
4949
runs-on: ubuntu-latest
@@ -72,6 +72,57 @@ jobs:
7272
# test to only be run against the server spun up in that test. With parallelism tests
7373
# can connec to server in different test which breaks determinism.
7474
cargo test --features=flightsql extension_cases::flightsql -- --test-threads=1
75+
test-cli:
76+
name: App / CLI
77+
runs-on: ubuntu-latest
78+
strategy:
79+
matrix:
80+
arch: [amd64]
81+
steps:
82+
- uses: actions/checkout@v3
83+
- name: Setup Rust Toolchain
84+
uses: ./.github/actions/setup-rust
85+
- name: Start LocalStack
86+
uses: LocalStack/setup-localstack@v0.2.3
87+
with:
88+
image-tag: 'latest'
89+
install-awslocal: 'true'
90+
configuration: DEBUG=1
91+
- name: Run Tests against LocalStack
92+
run: |
93+
awslocal s3 mb s3://test
94+
awslocal s3 mv data/aggregate_test_100.csv s3://test/
95+
awslocal s3 mb s3://tpch-db
96+
echo "Test Execution complete!"
97+
- name: Run CLI tests
98+
run: |
99+
cargo test cli_cases
100+
test-tui:
101+
name: App / TUI
102+
runs-on: ubuntu-latest
103+
strategy:
104+
matrix:
105+
arch: [amd64]
106+
steps:
107+
- uses: actions/checkout@v3
108+
- name: Setup Rust Toolchain
109+
uses: ./.github/actions/setup-rust
110+
- name: Start LocalStack
111+
uses: LocalStack/setup-localstack@v0.2.3
112+
with:
113+
image-tag: 'latest'
114+
install-awslocal: 'true'
115+
configuration: DEBUG=1
116+
- name: Run Tests against LocalStack
117+
run: |
118+
awslocal s3 mb s3://test
119+
awslocal s3 mv data/aggregate_test_100.csv s3://test/
120+
awslocal s3 mb s3://tpch-db
121+
echo "Test Execution complete!"
122+
- name: Run CLI tests
123+
run: |
124+
cargo test tui_cases
125+
75126
test-s3:
76127
name: Extension / S3
77128
runs-on: ubuntu-latest

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ lazy_static = "1.4.0"
3434
log = "0.4.22"
3535
metrics = { version = "0.24.0", optional = true }
3636
metrics-exporter-prometheus = { version = "0.16.0", optional = true }
37+
object_store = "0.11.2"
3738
parquet = "54"
3839
pin-project-lite = { version = "0.2.14" }
3940
prost = "0.13.1"
@@ -62,7 +63,7 @@ tracing = { version = "0.1.41", features = ["log"] }
6263
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
6364
tui-logger = { version = "0.12", features = ["tracing-support"] }
6465
tui-textarea = { version = "0.6.1", features = ["search"] }
65-
url = { version = "2.5.2", optional = true }
66+
url = { version = "2.5.2", features = ["serde"] }
6667
uuid = { version = "1.10.0", optional = true }
6768

6869
[dev-dependencies]
@@ -80,7 +81,7 @@ url = "2.5.2"
8081

8182
# When addding a new feature, also add it to the features tested list in CI (`.github/workflows/rust.yml`)
8283
[features]
83-
default = ["functions-parquet"]
84+
default = ["functions-parquet", "s3"]
8485
deltalake = ["datafusion-app/deltalake"]
8586
flightsql = [
8687
"datafusion-app/flightsql",
@@ -105,7 +106,7 @@ http = [
105106
"dep:uuid",
106107
]
107108
huggingface = ["datafusion-app/huggingface"]
108-
s3 = ["datafusion-app/s3", "url"]
109+
s3 = ["datafusion-app/s3"]
109110
udfs-wasm = ["datafusion-app/udfs-wasm"]
110111

111112
[[bin]]

justfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,7 @@ bench-http-custom file:
3636
echo "Running bench on $custom_bench_path"
3737
echo ""
3838
oha --urls-from-file "$custom_bench_path"
39+
40+
setup-test-env:
41+
localstack start -d
42+
awslocal s3api create-bucket --bucket tmp --acl public-read

src/config.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use serde::Deserialize;
3030

3131
#[cfg(any(feature = "flightsql", feature = "http"))]
3232
use datafusion_app::config::AuthConfig;
33+
use url::Url;
3334

3435
lazy_static! {
3536
pub static ref PROJECT_NAME: String = env!("CARGO_CRATE_NAME").to_uppercase().to_string();
@@ -194,7 +195,7 @@ fn default_interaction_config() -> InteractionConfig {
194195
#[derive(Debug, Clone, Deserialize)]
195196
pub struct DbConfig {
196197
#[serde(default = "default_db_path")]
197-
pub path: PathBuf,
198+
pub path: Url,
198199
}
199200

200201
impl Default for DbConfig {
@@ -209,9 +210,18 @@ fn default_db_config() -> DbConfig {
209210
}
210211
}
211212

212-
fn default_db_path() -> PathBuf {
213+
#[allow(unused)]
214+
fn default_db_path() -> Url {
213215
let base = directories::BaseDirs::new().expect("Base directories should be available");
214-
base.data_dir().to_path_buf().join("dft")
216+
let path = base
217+
.data_dir()
218+
.to_path_buf()
219+
.join("dft/")
220+
.to_str()
221+
.unwrap()
222+
.to_string();
223+
let with_schema = format!("file://{path}");
224+
Url::parse(&with_schema).unwrap()
215225
}
216226

217227
#[derive(Clone, Debug, Deserialize)]

src/db.rs

Lines changed: 47 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::{fs::read_dir, sync::Arc};
18+
use std::sync::Arc;
1919

2020
use color_eyre::{Report, Result};
2121
use datafusion::{
@@ -26,34 +26,26 @@ use datafusion::{
2626
},
2727
prelude::SessionContext,
2828
};
29-
use log::{error, info};
29+
use log::info;
3030

3131
use crate::config::DbConfig;
3232

3333
pub async fn register_db(ctx: &SessionContext, db_config: &DbConfig) -> Result<()> {
3434
info!("registering tables to database");
35-
let tables_path = db_config.path.join("tables");
36-
if !tables_path.exists() || !tables_path.is_dir() {
37-
info!("no tables directory configured, skipping table registration");
38-
return Ok(());
39-
}
40-
let catalogs = read_dir(tables_path)?;
41-
info!("...reading catalogs");
42-
for maybe_catalog in catalogs {
43-
let catalog = maybe_catalog?;
44-
let catalog_file_name = catalog.file_name();
45-
let catalog_name = catalog_file_name.to_str().ok_or(Report::msg(format!(
46-
"invalid catalog path {catalog_file_name:?}"
47-
)))?;
48-
// Every catalog should be a directory
49-
if !catalog.path().is_dir() {
50-
error!("catalog {catalog_name:?} is not a directory, skipping");
51-
continue;
52-
}
53-
let catalog_path = catalog.path();
54-
info!("...handling {:?} catalog", catalog_name);
35+
let tables_url = db_config.path.join("tables")?;
36+
let listing_tables_url = ListingTableUrl::parse(tables_url.clone())?;
37+
let store_url = listing_tables_url.object_store();
38+
let store = ctx.runtime_env().object_store(store_url)?;
39+
let tables_path = object_store::path::Path::from_url_path(tables_url.path())?;
40+
let catalogs = store.list_with_delimiter(Some(&tables_path)).await?;
41+
for catalog in catalogs.common_prefixes {
42+
let catalog_name = catalog
43+
.filename()
44+
.ok_or(Report::msg("missing catalog name"))?;
45+
info!("...handling {catalog_name} catalog");
5546
let maybe_catalog = ctx.catalog(catalog_name);
5647
let catalog_provider = match maybe_catalog {
48+
Some(catalog) => catalog,
5749
None => {
5850
info!("...catalog does not exist, createing");
5951
let mem_catalog_provider = Arc::new(MemoryCatalogProvider::new());
@@ -62,23 +54,16 @@ pub async fn register_db(ctx: &SessionContext, db_config: &DbConfig) -> Result<(
6254
"missing catalog {catalog_name}, shouldnt be possible"
6355
)))?
6456
}
65-
Some(catalog) => catalog,
6657
};
67-
for maybe_schema in read_dir(&catalog_path)? {
68-
let schema = maybe_schema?;
69-
let schema_file_name = schema.file_name();
70-
let schema_name = schema_file_name.to_str().ok_or(Report::msg(format!(
71-
"invalid schema path {schema_file_name:?}"
72-
)))?;
73-
// Every schema should be a directory
74-
if !schema.path().is_dir() {
75-
error!("schema {schema_name:?} is not a directory, skipping",);
76-
continue;
77-
}
78-
let schema_path = schema.path();
79-
info!("...handling {:?} schema", schema_name);
58+
let schemas = store.list_with_delimiter(Some(&catalog)).await?;
59+
for schema in schemas.common_prefixes {
60+
let schema_name = schema
61+
.filename()
62+
.ok_or(Report::msg("missing schema name"))?;
63+
info!("...handling {schema_name} schema");
8064
let maybe_schema = catalog_provider.schema(schema_name);
8165
let schema_provider = match maybe_schema {
66+
Some(schema) => schema,
8267
None => {
8368
info!("...schema does not exist, creating");
8469
let mem_schema_provider = Arc::new(MemorySchemaProvider::new());
@@ -89,24 +74,19 @@ pub async fn register_db(ctx: &SessionContext, db_config: &DbConfig) -> Result<(
8974
"missing schema {schema_name}, shouldnt be possible"
9075
)))?
9176
}
92-
Some(schema) => schema,
9377
};
94-
for maybe_table in read_dir(schema_path)? {
95-
let table = maybe_table?;
96-
// Every table should be a directory even if there is a single data file
97-
if !table.path().is_dir() {
98-
error!("table {:?} is not a directory, skipping", catalog.path());
99-
continue;
100-
}
101-
let table_path = table.path();
102-
let table_file_name = table.file_name();
103-
let table_name = table_file_name.to_str().ok_or(Report::msg(format!(
104-
"invalid table path {table_file_name:?}"
105-
)))?;
106-
info!("...handling table {table_name:?}");
107-
let table_url = ListingTableUrl::parse(table_path.to_str().ok_or(Report::msg(
108-
format!("Invalid table path for {table_path:?}"),
109-
))?)?;
78+
let tables = store.list_with_delimiter(Some(&schema)).await?;
79+
for table_path in tables.common_prefixes {
80+
let table_name = table_path
81+
.filename()
82+
.ok_or(Report::msg("missing table name"))?;
83+
info!("...handling table \"{catalog_name}.{schema_name}.{table_name}\"");
84+
85+
let p = tables_url
86+
.join(&format!("{catalog_name}/"))?
87+
.join(&format!("{schema_name}/"))?
88+
.join(&format!("{table_name}/"))?;
89+
let table_url = ListingTableUrl::parse(p)?;
11090
let file_format = ParquetFormat::new();
11191
let listing_options =
11292
ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet");
@@ -119,11 +99,12 @@ pub async fn register_db(ctx: &SessionContext, db_config: &DbConfig) -> Result<(
11999
.with_schema(resolved_schema);
120100
// Create a new TableProvider
121101
let provider = Arc::new(ListingTable::try_new(config)?);
122-
info!("...registering {table_name}");
102+
info!("...table registered");
123103
schema_provider.register_table(table_name.to_string(), provider)?;
124104
}
125105
}
126106
}
107+
127108
Ok(())
128109
}
129110

@@ -179,9 +160,9 @@ mod test {
179160
let ctx = setup();
180161
let dir = tempfile::tempdir().unwrap();
181162
let db_path = dir.path().join("db");
182-
let config = DbConfig {
183-
path: db_path.clone(),
184-
};
163+
let path = format!("file://{}/", db_path.to_str().unwrap());
164+
let db_url = url::Url::parse(&path).unwrap();
165+
let config = DbConfig { path: db_url };
185166
let data_path = db_path.join("tables").join("dft").join("stuff").join("hi");
186167

187168
let df = ctx.sql("SELECT 1").await.unwrap();
@@ -231,9 +212,9 @@ mod test {
231212
let ctx = setup();
232213
let dir = tempfile::tempdir().unwrap();
233214
let db_path = dir.path().join("db");
234-
let config = DbConfig {
235-
path: db_path.clone(),
236-
};
215+
let path = format!("file://{}/", db_path.to_str().unwrap());
216+
let db_url = url::Url::parse(&path).unwrap();
217+
let config = DbConfig { path: db_url };
237218
let data_1_path = db_path.join("tables").join("dft").join("stuff").join("hi");
238219
let data_2_path = db_path.join("tables").join("dft").join("stuff").join("bye");
239220

@@ -290,9 +271,9 @@ mod test {
290271
let ctx = setup();
291272
let dir = tempfile::tempdir().unwrap();
292273
let db_path = dir.path().join("db");
293-
let config = DbConfig {
294-
path: db_path.clone(),
295-
};
274+
let path = format!("file://{}/", db_path.to_str().unwrap());
275+
let db_url = url::Url::parse(&path).unwrap();
276+
let config = DbConfig { path: db_url };
296277
let data_1_path = db_path.join("tables").join("dft").join("stuff").join("hi");
297278
let data_2_path = db_path
298279
.join("tables")
@@ -353,9 +334,9 @@ mod test {
353334
let ctx = setup();
354335
let dir = tempfile::tempdir().unwrap();
355336
let db_path = dir.path().join("db");
356-
let config = DbConfig {
357-
path: db_path.clone(),
358-
};
337+
let path = format!("file://{}/", db_path.to_str().unwrap());
338+
let db_url = url::Url::parse(&path).unwrap();
339+
let config = DbConfig { path: db_url };
359340
let data_1_path = db_path.join("tables").join("dft2").join("stuff").join("hi");
360341
let data_2_path = db_path
361342
.join("tables")

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ async fn app_entry_point(cli: DftArgs) -> Result<()> {
6262
}
6363
let cfg = create_config(cli.config_path());
6464
if let Some(Command::GenerateTpch { scale_factor }) = cli.command {
65-
tpch::generate(cfg.clone(), scale_factor)?;
65+
tpch::generate(cfg.clone(), scale_factor).await?;
6666
return Ok(());
6767
}
6868

0 commit comments

Comments
 (0)