Skip to content

Commit 3e4eadc

Browse files
authored
Fix: initialize ETL before system adapter setup (#122)
* initialize pipeline earlier
* wip
* wip
* fix: Don't make spicebench build duckdb
* fix
1 parent 429feb5 commit 3e4eadc

5 files changed

Lines changed: 161 additions & 199 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 8 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,8 @@ on:
88
required: false
99
default: 'tpch'
1010
type: string
11-
system_under_test:
12-
required: false
13-
default: 'spice_cloud'
14-
type: string
15-
table_format:
16-
required: false
17-
default: 'parquet'
18-
type: string
19-
databricks_variant:
20-
required: false
21-
default: 'databricks'
11+
version:
12+
required: true
2213
type: string
2314
scale_factor:
2415
required: false
@@ -72,27 +63,10 @@ on:
7263
required: false
7364
default: 'spiceai-macos'
7465
type: string
75-
system_under_test:
76-
description: 'System under test (for version derivation, must match the run workflow)'
77-
required: false
78-
default: 'spice_cloud'
79-
type: choice
80-
options:
81-
- spice_cloud
82-
- databricks
83-
table_format:
84-
description: 'Table format (for version derivation, must match the run workflow)'
85-
required: false
86-
default: 'parquet'
66+
version:
67+
description: 'Data generation version identifier'
68+
required: true
8769
type: string
88-
databricks_variant:
89-
description: 'Databricks variant (for version derivation, must match the run workflow)'
90-
required: false
91-
default: 'databricks'
92-
type: choice
93-
options:
94-
- databricks
95-
- lakebase
9670
scale_factor:
9771
description: 'TPC-H scale factor'
9872
required: true
@@ -168,9 +142,7 @@ jobs:
168142
env:
169143
SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
170144
SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
171-
SYSTEM_UNDER_TEST: ${{ inputs.system_under_test || github.event.inputs.system_under_test || 'spice_cloud' }}
172-
TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
173-
DATABRICKS_VARIANT: ${{ inputs.databricks_variant || github.event.inputs.databricks_variant || 'databricks' }}
145+
VERSION: ${{ inputs.version || github.event.inputs.version }}
174146
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
175147
PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
176148
MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
@@ -181,18 +153,6 @@ jobs:
181153
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
182154
RUST_LOG: info
183155
run: |
184-
DATABRICKS_VARIANT_FOR_VERSION="na"
185-
if [ "${SYSTEM_UNDER_TEST}" = "databricks" ]; then
186-
DATABRICKS_VARIANT_FOR_VERSION="${DATABRICKS_VARIANT}"
187-
fi
188-
189-
SCALE_FACTOR_TAG="$(printf '%s' "${SCALE_FACTOR}" | sed 's/\./p/g')"
190-
191-
VERSION="auto-${SCENARIO}-sf${SCALE_FACTOR_TAG}-${SYSTEM_UNDER_TEST}-s3-hive-${TABLE_FORMAT}-${DATABRICKS_VARIANT_FOR_VERSION}-createdat"
192-
VERSION="$(printf '%s' "${VERSION}" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9._-' '-')"
193-
VERSION="${VERSION%-}"
194-
echo "Derived VERSION=${VERSION} from workflow inputs"
195-
196156
ARGS="--dataset ${SCENARIO}"
197157
ARGS="${ARGS} --scenario ${SCENARIO}"
198158
ARGS="${ARGS} --version ${VERSION}"
@@ -223,16 +183,13 @@ jobs:
223183
if: steps.cache-checkpointer.outputs.cache-hit != 'true'
224184
run: |
225185
mkdir -p ~/.spice/bin
226-
cargo build --release -p checkpointer
186+
cargo build --release --features duckdb -p checkpointer
227187
install -m 755 target/release/checkpointer ~/.spice/bin/checkpointer
228188
229189
- name: Run checkpointer
230190
env:
231191
SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
232-
SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
233-
SYSTEM_UNDER_TEST: ${{ inputs.system_under_test || github.event.inputs.system_under_test || 'spice_cloud' }}
234-
TABLE_FORMAT: ${{ inputs.table_format || github.event.inputs.table_format || 'parquet' }}
235-
DATABRICKS_VARIANT: ${{ inputs.databricks_variant || github.event.inputs.databricks_variant || 'databricks' }}
192+
VERSION: ${{ inputs.version || github.event.inputs.version }}
236193
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
237194
PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
238195
REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
@@ -241,18 +198,6 @@ jobs:
241198
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
242199
RUST_LOG: info
243200
run: |
244-
DATABRICKS_VARIANT_FOR_VERSION="na"
245-
if [ "${SYSTEM_UNDER_TEST}" = "databricks" ]; then
246-
DATABRICKS_VARIANT_FOR_VERSION="${DATABRICKS_VARIANT}"
247-
fi
248-
249-
SCALE_FACTOR_TAG="$(printf '%s' "${SCALE_FACTOR}" | sed 's/\./p/g')"
250-
251-
VERSION="auto-${SCENARIO}-sf${SCALE_FACTOR_TAG}-${SYSTEM_UNDER_TEST}-s3-hive-${TABLE_FORMAT}-${DATABRICKS_VARIANT_FOR_VERSION}-createdat"
252-
VERSION="$(printf '%s' "${VERSION}" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9._-' '-')"
253-
VERSION="${VERSION%-}"
254-
echo "Derived VERSION=${VERSION} from workflow inputs"
255-
256201
ARGS="--scenario ${SCENARIO}"
257202
ARGS="${ARGS} --version ${VERSION}"
258203
ARGS="${ARGS} --bucket ${BUCKET}"

.github/workflows/run_spicebench.yml

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,9 @@ on:
3535
required: false
3636
default: 'data-gen'
3737
type: string
38-
scale_factor:
39-
description: 'Scale factor used by data generation (included in derived ETL version key)'
40-
required: false
41-
default: '0.01'
38+
etl_version:
39+
description: 'Data generation version identifier (the {version} portion of {prefix}/{scenario}/{version}/)'
40+
required: true
4241
type: string
4342
etl_region:
4443
description: 'AWS region for the ETL S3 bucket'
@@ -63,7 +62,7 @@ on:
6362
jobs:
6463
run-spicebench:
6564
name: Run spicebench
66-
runs-on: spiceai-dev-runners
65+
runs-on: ubuntu-latest
6766
timeout-minutes: 600
6867
concurrency:
6968
group: spicebench-run
@@ -226,7 +225,7 @@ jobs:
226225
EXECUTOR_INSTANCE_TYPE: ${{ github.event.inputs.executor_instance_type || 'github-hosted-ubuntu-latest' }}
227226
ETL_BUCKET: ${{ github.event.inputs.etl_bucket }}
228227
ETL_PREFIX: ${{ github.event.inputs.etl_prefix || 'data-gen' }}
229-
SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '0.01' }}
228+
ETL_VERSION: ${{ github.event.inputs.etl_version }}
230229
ETL_REGION: ${{ github.event.inputs.etl_region || 'us-east-1' }}
231230
ETL_ENDPOINT: ${{ github.event.inputs.etl_endpoint }}
232231
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -235,18 +234,6 @@ jobs:
235234
SPIDAPTER_ICEBERG_CATALOG_FROM: iceberg:https://glue.us-west-1.amazonaws.com/iceberg/v1/catalogs/211125479522/namespaces
236235
RUST_LOG: 'info'
237236
run: |
238-
DATABRICKS_VARIANT_FOR_VERSION="na"
239-
if [ "${SYSTEM_UNDER_TEST}" = "databricks" ]; then
240-
DATABRICKS_VARIANT_FOR_VERSION="${DATABRICKS_VARIANT}"
241-
fi
242-
243-
SCALE_FACTOR_TAG="$(printf '%s' "${SCALE_FACTOR}" | sed 's/\./p/g')"
244-
245-
ETL_VERSION="auto-${SCENARIO}-sf${SCALE_FACTOR_TAG}-${SYSTEM_UNDER_TEST}-s3-hive-${DATABRICKS_TABLE_FORMAT}-${DATABRICKS_VARIANT_FOR_VERSION}-createdat"
246-
ETL_VERSION="$(printf '%s' "${ETL_VERSION}" | tr '[:upper:]' '[:lower:]' | tr -cs 'a-z0-9._-' '-')"
247-
ETL_VERSION="${ETL_VERSION%-}"
248-
echo "Derived ETL_VERSION=${ETL_VERSION} from workflow inputs"
249-
250237
ETL_ARGS="--etl-bucket ${ETL_BUCKET} --etl-version ${ETL_VERSION}"
251238
if [ -n "${ETL_PREFIX}" ]; then
252239
ETL_ARGS="${ETL_ARGS} --etl-prefix ${ETL_PREFIX}"

crates/checkpointer/Cargo.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,16 @@ anyhow.workspace = true
1313
arrow.workspace = true
1414
clap = { workspace = true, features = ["derive"] }
1515
data-generation = { path = "../data-generation" }
16-
etl = { path = "../etl", features = ["duckdb"] }
16+
etl = { path = "../etl" }
1717
test-framework = { path = "../test-framework" }
1818
object_store = { workspace = true }
1919
serde = { workspace = true, features = ["derive"] }
2020
serde_json.workspace = true
2121
parquet.workspace = true
2222
tokio.workspace = true
2323
tracing.workspace = true
24-
tracing-subscriber = { workspace = true, features = ["env-filter"] }
24+
tracing-subscriber = { workspace = true, features = ["env-filter"] }
25+
26+
[features]
27+
default = []
28+
duckdb = ["etl/duckdb"]

0 commit comments

Comments
 (0)