Skip to content

Commit 56f259b

Browse files
committed
feat: Enhance data generation workflow with additional inputs and caching
1 parent 4af0190 commit 56f259b

2 files changed

Lines changed: 160 additions & 26 deletions

File tree

.github/workflows/data_generation_run.yml

Lines changed: 94 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,27 +1,70 @@
11
name: data-generation run
2-
run-name: data-generation run - sf${{ github.event.inputs.scale_factor }} - ${{ github.event.inputs.num_steps }} steps
2+
run-name: data-generation run - ${{ github.event.inputs.scenario }} - sf${{ github.event.inputs.scale_factor }}
33

44
on:
5+
workflow_call:
6+
inputs:
7+
scenario:
8+
required: false
9+
default: 'tpch'
10+
type: string
11+
scale_factor:
12+
required: false
13+
default: '1.0'
14+
type: string
15+
bucket:
16+
required: true
17+
type: string
18+
prefix:
19+
required: false
20+
default: 'data-gen'
21+
type: string
22+
max_concurrency:
23+
required: false
24+
default: '8'
25+
type: string
26+
region:
27+
required: false
28+
default: 'us-east-1'
29+
type: string
30+
skip_initial:
31+
required: false
32+
default: false
33+
type: boolean
34+
runner_type:
35+
required: false
36+
default: 'ubuntu-latest'
37+
type: string
38+
secrets:
39+
AWS_ACCESS_KEY_ID:
40+
required: true
41+
AWS_SECRET_ACCESS_KEY:
42+
required: true
543
workflow_dispatch:
644
inputs:
45+
scenario:
46+
description: 'Dataset/scenario to generate (e.g. tpch)'
47+
required: true
48+
default: 'tpch'
49+
type: string
50+
runner_type:
51+
description: 'GitHub runner label to execute on'
52+
required: false
53+
default: 'ubuntu-latest'
54+
type: string
755
scale_factor:
856
description: 'TPC-H scale factor'
957
required: true
1058
default: '1.0'
1159
type: string
12-
num_steps:
13-
description: 'Number of data generation steps'
14-
required: false
15-
default: '100'
16-
type: string
1760
bucket:
1861
description: 'S3 bucket name'
1962
required: true
2063
type: string
2164
prefix:
22-
description: 'S3 key prefix for generated files'
65+
description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
2366
required: false
24-
default: 'data-gen/tpch'
67+
default: 'data-gen'
2568
type: string
2669
max_concurrency:
2770
description: 'Maximum number of concurrent S3 writes'
@@ -42,33 +85,60 @@ on:
4285
jobs:
4386
run-data-generation:
4487
name: Run data generation
45-
runs-on: ${{ github.event.inputs.runner_type }}
88+
runs-on: ${{ inputs.runner_type || github.event.inputs.runner_type || 'ubuntu-latest' }}
4689
timeout-minutes: 600
4790
steps:
4891
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6
4992

50-
- name: Install Rust
51-
uses: dtolnay/rust-toolchain@stable
93+
- name: Cache data-generation binary
94+
id: cache-data-generation
95+
uses: actions/cache@v4
96+
with:
97+
path: ~/.spice/bin/data-generation
98+
key: data-generation-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
99+
restore-keys: |
100+
data-generation-${{ runner.os }}-
101+
102+
- name: Setup Rust toolchain
103+
if: steps.cache-data-generation.outputs.cache-hit != 'true'
104+
uses: actions-rust-lang/setup-rust-toolchain@v1
105+
with:
106+
toolchain: 1.91
107+
cache: true
52108

53109
- name: Build data-generation
54-
run: cargo build --release -p data-generation
110+
if: steps.cache-data-generation.outputs.cache-hit != 'true'
111+
run: |
112+
mkdir -p ~/.spice/bin
113+
cargo build --release -p data-generation
114+
install -m 755 target/release/data-generation ~/.spice/bin/data-generation
55115
56116
- name: Run data generation
117+
env:
118+
SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
119+
SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '1.0' }}
120+
BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
121+
PREFIX_BASE: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
122+
MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
123+
REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
124+
SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
125+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
126+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
127+
RUST_LOG: info
57128
run: |
58-
ARGS="--scale-factor ${{ github.event.inputs.scale_factor }}"
59-
ARGS="${ARGS} --num-steps ${{ github.event.inputs.num_steps }}"
60-
ARGS="${ARGS} --bucket ${{ github.event.inputs.bucket }}"
61-
ARGS="${ARGS} --prefix ${{ github.event.inputs.prefix }}"
62-
ARGS="${ARGS} --max-concurrency ${{ github.event.inputs.max_concurrency }}"
63-
ARGS="${ARGS} --region ${{ github.event.inputs.region }}"
129+
PREFIX="${PREFIX_BASE}"
130+
PREFIX="${PREFIX%/}/${SCENARIO}"
64131
65-
if [ "${{ github.event.inputs.skip_initial }}" = "true" ]; then
132+
ARGS="--dataset ${SCENARIO}"
133+
ARGS="${ARGS} --scale-factor ${SCALE_FACTOR}"
134+
ARGS="${ARGS} --bucket ${BUCKET}"
135+
ARGS="${ARGS} --prefix ${PREFIX}"
136+
ARGS="${ARGS} --max-concurrency ${MAX_CONCURRENCY}"
137+
ARGS="${ARGS} --region ${REGION}"
138+
139+
if [ "${SKIP_INITIAL}" = "true" ]; then
66140
ARGS="${ARGS} --skip-initial"
67141
fi
68142
69143
echo "Running: data-generation run ${ARGS}"
70-
./target/release/data-generation run ${ARGS}
71-
env:
72-
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
73-
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
74-
RUST_LOG: info
144+
~/.spice/bin/data-generation run ${ARGS}

.github/workflows/run_spicebench.yml

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,74 @@
11
name: Run spicebench
2+
run-name: Run spicebench - ${{ github.event.inputs.scenario || 'tpch' }}
23

34
on:
45
workflow_dispatch:
6+
inputs:
7+
scenario:
8+
description: 'Scenario/query set to run (e.g. tpch)'
9+
required: true
10+
default: 'tpch'
11+
type: string
12+
run_data_generation:
13+
description: 'Run data-generation before spicebench'
14+
required: false
15+
default: false
16+
type: boolean
17+
data_generation_runner_type:
18+
description: 'Runner label for data-generation job'
19+
required: false
20+
default: 'ubuntu-latest'
21+
type: string
22+
data_generation_scale_factor:
23+
description: 'Data-generation scale factor'
24+
required: false
25+
default: '1.0'
26+
type: string
27+
data_generation_bucket:
28+
description: 'S3 bucket for generated data (required if run_data_generation=true)'
29+
required: false
30+
type: string
31+
data_generation_prefix:
32+
description: 'Base S3 prefix for generated data (scenario appended automatically)'
33+
required: false
34+
default: 'data-gen'
35+
type: string
36+
data_generation_max_concurrency:
37+
description: 'Data-generation max concurrent S3 writes'
38+
required: false
39+
default: '8'
40+
type: string
41+
data_generation_region:
42+
description: 'AWS region for data-generation'
43+
required: false
44+
default: 'us-east-1'
45+
type: string
46+
data_generation_skip_initial:
47+
description: 'Skip data-generation initial ingest'
48+
required: false
49+
default: false
50+
type: boolean
551

652
jobs:
53+
data-generation:
54+
name: Run data generation
55+
if: ${{ github.event.inputs.run_data_generation == 'true' }}
56+
uses: ./.github/workflows/data_generation_run.yml
57+
with:
58+
scenario: ${{ github.event.inputs.scenario }}
59+
runner_type: ${{ github.event.inputs.data_generation_runner_type }}
60+
scale_factor: ${{ github.event.inputs.data_generation_scale_factor }}
61+
bucket: ${{ github.event.inputs.data_generation_bucket }}
62+
prefix: ${{ github.event.inputs.data_generation_prefix }}
63+
max_concurrency: ${{ github.event.inputs.data_generation_max_concurrency }}
64+
region: ${{ github.event.inputs.data_generation_region }}
65+
skip_initial: ${{ github.event.inputs.data_generation_skip_initial == 'true' }}
66+
secrets: inherit
67+
768
run-spicebench:
869
name: Run spicebench
70+
needs: [data-generation]
71+
if: ${{ always() && (needs.data-generation.result == 'success' || needs.data-generation.result == 'skipped') }}
972
runs-on: ubuntu-latest
1073
timeout-minutes: 600
1174
steps:
@@ -40,7 +103,7 @@ jobs:
40103
uses: actions-rust-lang/setup-rust-toolchain@v1
41104
with:
42105
toolchain: 1.91
43-
cache: false
106+
cache: true
44107

45108
- name: Build spicebench
46109
if: steps.cache-spicebench.outputs.cache-hit != 'true'
@@ -54,9 +117,10 @@ jobs:
54117
SPICEAI_API_KEY: ${{ secrets.SPICEAI_API_KEY }}
55118
SPICE_CLOUD_API_URL: https://dev-api.spice.ai
56119
RUSTFLAGS: "-A warnings"
120+
SCENARIO: ${{ github.event.inputs.scenario || 'tpch' }}
57121
run: |
58122
~/.spice/bin/spicebench \
59123
--concurrency 2 \
60-
--query-set tpch \
124+
--query-set "${SCENARIO}" \
61125
--system-adapter-stdio-cmd docker \
62126
--system-adapter-stdio-args "run ghcr.io/spiceai/spidapter:latest -e SPICEAI_API_KEY -e SPICE_CLOUD_API_URL run stdio --verbose"

0 commit comments

Comments
 (0)