# Skip to content
#
# data-generation run - tpch - sf10 #59
#
# data-generation run - tpch - sf10
#
# data-generation run - tpch - sf10 #59

# Workflow title shown in the Actions UI.
name: data-generation run
# Per-run title: "<name> - <scenario> - sf<scale factor>". Each value falls
# back from `inputs` to `github.event.inputs` and finally to a literal default.
# The folded scalar (>-) joins the lines below with single spaces.
run-name: >-
  data-generation run -
  ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }} -
  sf${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
# Triggers. The workflow can be called from another workflow (workflow_call)
# or started manually (workflow_dispatch). Both expose the same input names,
# but some defaults differ between the two (runner_type, num_steps,
# checkpoint_interval_steps).
on:
  # Reusable-workflow entry point; only `bucket` and the AWS secrets are required.
  workflow_call:
    inputs:
      scenario:
        type: string
        required: false
        default: 'tpch'
      scale_factor:
        type: string
        required: false
        default: '0.01'
      bucket:
        type: string
        required: true
      prefix:
        type: string
        required: false
        default: 'data-gen'
      max_concurrency:
        type: string
        required: false
        default: '8'
      region:
        type: string
        required: false
        default: 'us-east-1'
      skip_initial:
        type: boolean
        required: false
        default: false
      num_steps:
        type: string
        required: false
        default: '25'
      checkpoint_interval_steps:
        type: string
        required: false
        default: '100'
      runner_type:
        type: string
        required: false
        default: 'ubuntu-latest'
    # AWS credentials are passed to the run steps as environment variables.
    secrets:
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true

  # Manual entry point; descriptions are shown in the "Run workflow" form.
  workflow_dispatch:
    inputs:
      scenario:
        description: 'Dataset/scenario to generate (e.g. tpch)'
        type: string
        required: true
        default: 'tpch'
      runner_type:
        description: 'GitHub runner label to execute on'
        type: string
        required: false
        default: 'spiceai-macos'
      scale_factor:
        description: 'TPC-H scale factor'
        type: string
        required: true
        default: '0.01'
      bucket:
        description: 'S3 bucket name'
        type: string
        required: true
      prefix:
        description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
        type: string
        required: false
        default: 'data-gen'
      max_concurrency:
        description: 'Maximum number of concurrent S3 writes'
        type: string
        required: true
        default: '8'
      region:
        description: 'AWS region'
        type: string
        required: true
        default: 'us-east-1'
      skip_initial:
        description: 'Skip data ingested by running --initial command'
        type: boolean
        required: false
        default: false
      num_steps:
        description: 'Number of data generation steps (partitions for TPC-H dbgen)'
        type: string
        required: false
        default: '5'
      checkpoint_interval_steps:
        description: 'Every N steps to take a checkpoint'
        type: string
        required: false
        default: '3'
jobs:
  # Single job: restore or build the `data-generation` and `checkpointer`
  # binaries, run data generation against the S3 bucket, then run the
  # checkpointer with the same scenario/bucket/prefix.
  run-data-generation:
    name: Run data generation
    # Runner label comes from the caller/dispatch input, else ubuntu-latest.
    runs-on: ${{ inputs.runner_type || github.event.inputs.runner_type || 'ubuntu-latest' }}
    timeout-minutes: 600
    steps:
      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6

      # Both binary caches are restored up front so the toolchain step below
      # can see whether EITHER binary needs to be built. Keys hash every Cargo
      # manifest/lockfile and Rust source file in the checkout.
      # Per the actions/cache docs, `cache-hit` is 'true' only for an exact key
      # match, so a `restore-keys` fallback still triggers a rebuild.
      - name: Cache data-generation binary
        id: cache-data-generation
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/data-generation
          key: data-generation-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            data-generation-${{ runner.os }}-

      - name: Cache checkpointer binary
        id: cache-checkpointer
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/checkpointer
          key: checkpointer-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            checkpointer-${{ runner.os }}-

      # Install the pinned toolchain when either binary has to be compiled.
      # Gating this on the data-generation cache alone would leave the
      # checkpointer build without the pinned toolchain whenever only the
      # checkpointer cache misses.
      - name: Setup Rust toolchain
        if: >-
          steps.cache-data-generation.outputs.cache-hit != 'true' ||
          steps.cache-checkpointer.outputs.cache-hit != 'true'
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          # Quoted so YAML keeps it a string (an unquoted 1.90 would become 1.9).
          toolchain: '1.91'
          cache: false

      - name: Build data-generation
        if: steps.cache-data-generation.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release -p data-generation
          install -m 755 target/release/data-generation ~/.spice/bin/data-generation

      - name: Run data generation
        # Explicit bash: the script below relies on bash arrays.
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
          NUM_STEPS: ${{ inputs.num_steps || github.event.inputs.num_steps || '25' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # Arguments are collected in an array and expanded with "${args[@]}"
          # so each input value reaches the binary as exactly one argument,
          # even if it contains spaces or glob characters.
          args=(
            --dataset "${SCENARIO}"
            --scenario "${SCENARIO}"
            --scale-factor "${SCALE_FACTOR}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --max-concurrency "${MAX_CONCURRENCY}"
            --region "${REGION}"
            --num-steps "${NUM_STEPS}"
          )
          if [ "${SKIP_INITIAL}" = "true" ]; then
            args+=(--skip-initial)
          fi
          echo "Running: data-generation run ${args[*]}"
          ~/.spice/bin/data-generation run "${args[@]}"

      - name: Build checkpointer
        if: steps.cache-checkpointer.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release --features duckdb -p checkpointer
          install -m 755 target/release/checkpointer ~/.spice/bin/checkpointer

      - name: Run checkpointer
        # Explicit bash: the script below relies on bash arrays.
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          CHECKPOINT_INTERVAL_STEPS: ${{ inputs.checkpoint_interval_steps || github.event.inputs.checkpoint_interval_steps || '100' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # Render the scale factor as a plain decimal with at least one
          # fractional digit and no exponent (e.g. 10 -> 10.0, 0.01 -> 0.01).
          # NOTE(review): presumably this must match the version string that
          # data-generation writes under the prefix; confirm against that tool.
          VERSION=$(python3 - <<'PY'
          import os
          sf = float(os.environ["SCALE_FACTOR"])
          s = format(sf, ".15g")
          if "e" in s or "E" in s:
              s = ("%.15f" % sf).rstrip("0").rstrip(".")
          if "." not in s:
              s = f"{s}.0"
          print(s)
          PY
          )
          # Same array-based argument handling as the data-generation step;
          # argument order matches the original command line.
          args=(
            --scenario "${SCENARIO}"
            --version "${VERSION}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --duckdb-path ./checkpointer.duckdb
            --checkpoint-dir ./checkpoints
          )
          if [ -n "${REGION}" ]; then
            args+=(--region "${REGION}")
          fi
          args+=(--checkpoint-interval-steps "${CHECKPOINT_INTERVAL_STEPS}")
          echo "Running: checkpointer ${args[*]}"
          ~/.spice/bin/checkpointer "${args[@]}"