# Source captured from the GitHub Actions run page
# "data-generation run - tpch - sf1 #76". The page-navigation text that
# preceded the workflow is not part of it and is kept only as this comment.

# Display name of the workflow in the Actions UI.
name: data-generation run
# Per-run title: "<workflow name> - <scenario> - sf<scale factor>".
# `inputs.*` is consulted first, then the raw dispatch event payload, then a
# literal fallback. The folded scalar (>-) joins the lines below with single
# spaces and drops the trailing newline.
run-name: >-
  data-generation run
  - ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
  - sf${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
on:
  # Reusable entry point. Every input is optional and falls back to the
  # default listed here; the caller must forward the three MinIO secrets.
  workflow_call:
    inputs:
      scenario:
        type: string
        required: false
        default: 'tpch'
      scale_factor:
        type: string
        required: false
        default: '0.01'
      # NOTE(review): this default differs from the workflow_dispatch default
      # ('spicebench') - confirm that is intentional.
      bucket:
        type: string
        required: false
        default: 'spiceai-public-datasets'
      prefix:
        type: string
        required: false
        default: 'data-gen'
      region:
        type: string
        required: false
        default: 'us-east-1'
      num_steps:
        type: string
        required: false
        default: '25'
      # NOTE(review): the default interval (100) is larger than the default
      # num_steps (25) - confirm the checkpointer behaves as intended then.
      checkpoint_interval_steps:
        type: string
        required: false
        default: '100'
    secrets:
      MINIO_ENDPOINT:
        required: true
      MINIO_ACCESS_KEY_ID:
        required: true
      MINIO_SECRET_ACCESS_KEY:
        required: true

  # Manual entry point (the "Run workflow" form). Its defaults are smaller
  # than the workflow_call ones (5 steps, checkpoint every 3 steps).
  workflow_dispatch:
    inputs:
      scenario:
        description: 'Dataset/scenario to generate (e.g. tpch)'
        type: string
        required: true
        default: 'tpch'
      scale_factor:
        description: 'TPC-H scale factor'
        type: string
        required: true
        default: '0.01'
      bucket:
        description: 'S3 bucket name'
        type: string
        required: false
        default: 'spicebench'
      prefix:
        description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
        type: string
        required: false
        default: 'data-gen'
      region:
        description: 'AWS region'
        type: string
        required: true
        default: 'us-east-1'
      num_steps:
        description: 'Number of data generation steps (partitions for TPC-H dbgen)'
        type: string
        required: false
        default: '5'
      checkpoint_interval_steps:
        description: 'Every N steps to take a checkpoint'
        type: string
        required: false
        default: '3'
jobs:
  # Single job: restore or build the `data-generation` and `checkpointer`
  # binaries, run data generation against the S3-compatible endpoint, then
  # run the checkpointer with the same scenario/bucket/prefix settings.
  run-data-generation:
    name: Run data generation
    runs-on: spiceai-dev-runners
    timeout-minutes: 600
    steps:
      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6

      # Both binaries are cached under a key derived from every Cargo manifest,
      # lockfile and Rust source file, so any source change misses the cache.
      # A restore-keys (prefix) match does not report cache-hit == 'true', so
      # the build steps below still run and overwrite the restored binary.
      - name: Cache data-generation binary
        id: cache-data-generation
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/data-generation
          key: data-generation-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            data-generation-${{ runner.os }}-

      - name: Cache checkpointer binary
        id: cache-checkpointer
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/checkpointer
          key: checkpointer-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            checkpointer-${{ runner.os }}-

      # The toolchain and C compiler are only needed when at least one of the
      # two binaries has to be built.
      - name: Setup Rust toolchain
        if: steps.cache-data-generation.outputs.cache-hit != 'true' || steps.cache-checkpointer.outputs.cache-hit != 'true'
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          # Quoted so YAML keeps it a string; an unquoted version such as 1.90
          # would be read as the float 1.9.
          toolchain: '1.91'
          cache: false

      - name: Setup CC
        if: steps.cache-data-generation.outputs.cache-hit != 'true' || steps.cache-checkpointer.outputs.cache-hit != 'true'
        uses: ./.github/actions/setup-cc

      - name: Build data-generation
        if: steps.cache-data-generation.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release -p data-generation
          install -m 755 target/release/data-generation ~/.spice/bin/data-generation

      - name: Run data generation
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          # No literal fallback: the two triggers use different default buckets.
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          NUM_STEPS: ${{ inputs.num_steps || github.event.inputs.num_steps || '25' }}
          MINIO_ENDPOINT: ${{ secrets.MINIO_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.MINIO_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.MINIO_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # Fail early with a clear message rather than handing the CLI an
          # empty option value.
          : "${BUCKET:?bucket input must not be empty}"
          : "${MINIO_ENDPOINT:?MINIO_ENDPOINT secret must not be empty}"

          # A bash array keeps each value as exactly one argument, so inputs
          # containing spaces or glob characters are passed through unchanged.
          args=(
            --dataset "${SCENARIO}"
            --scenario "${SCENARIO}"
            --scale-factor "${SCALE_FACTOR}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --region "${REGION}"
            --num-steps "${NUM_STEPS}"
            --endpoint "${MINIO_ENDPOINT}"
          )
          echo "Running: data-generation run ${args[*]}"
          ~/.spice/bin/data-generation run "${args[@]}"

      - name: Build checkpointer
        if: steps.cache-checkpointer.outputs.cache-hit != 'true'
        shell: bash
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release --features duckdb -p checkpointer
          install -m 755 target/release/checkpointer ~/.spice/bin/checkpointer

      - name: Run checkpointer
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          # No literal fallback: the two triggers use different default buckets.
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          CHECKPOINT_INTERVAL_STEPS: ${{ inputs.checkpoint_interval_steps || github.event.inputs.checkpoint_interval_steps || '100' }}
          MINIO_ENDPOINT: ${{ secrets.MINIO_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.MINIO_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.MINIO_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          : "${BUCKET:?bucket input must not be empty}"
          : "${MINIO_ENDPOINT:?MINIO_ENDPOINT secret must not be empty}"

          # Normalise the scale factor into the version string handed to the
          # checkpointer: plain decimal notation (never exponent form) that
          # always contains a decimal point, e.g. "1" -> "1.0".
          VERSION=$(python3 - <<'PY'
          import os
          sf = float(os.environ["SCALE_FACTOR"])
          s = format(sf, ".15g")
          if "e" in s or "E" in s:
              s = ("%.15f" % sf).rstrip("0").rstrip(".")
          if "." not in s:
              s = f"{s}.0"
          print(s)
          PY
          )

          args=(
            --scenario "${SCENARIO}"
            --version "${VERSION}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --duckdb-path ./checkpointer.duckdb
            --checkpoint-dir ./checkpoints
          )
          # --region is only passed when a region value is present.
          if [ -n "${REGION}" ]; then
            args+=(--region "${REGION}")
          fi
          args+=(
            --checkpoint-interval-steps "${CHECKPOINT_INTERVAL_STEPS}"
            --endpoint "${MINIO_ENDPOINT}"
          )
          echo "Running: checkpointer ${args[*]}"
          ~/.spice/bin/checkpointer "${args[@]}"