# data-generation run - tpch - sf1.0 #58
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow title.
name: data-generation run

# Per-run title built from the scenario and scale factor. Each value is taken
# from `inputs`, then `github.event.inputs`, then a literal fallback
# ('tpch' / '0.01'). The folded scalar joins the lines below with single
# spaces, producing one line of text.
run-name: >-
  data-generation run -
  ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }} -
  sf${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
on:
  # Reusable entry point: another workflow supplies these inputs and the two
  # AWS secrets.
  workflow_call:
    inputs:
      scenario:
        type: string
        required: false
        default: 'tpch'
      scale_factor:
        type: string
        required: false
        default: '0.01'
      # The only input without a default; callers must provide it.
      bucket:
        type: string
        required: true
      prefix:
        type: string
        required: false
        default: 'data-gen'
      max_concurrency:
        type: string
        required: false
        default: '8'
      region:
        type: string
        required: false
        default: 'us-east-1'
      skip_initial:
        type: boolean
        required: false
        default: false
      num_steps:
        type: string
        required: false
        default: '25'
      checkpoint_interval_steps:
        type: string
        required: false
        default: '100'
      runner_type:
        type: string
        required: false
        default: 'ubuntu-latest'
    secrets:
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true

  # Manual entry point. The defaults for runner_type, num_steps and
  # checkpoint_interval_steps differ from the workflow_call defaults above.
  workflow_dispatch:
    inputs:
      scenario:
        description: 'Dataset/scenario to generate (e.g. tpch)'
        type: string
        required: true
        default: 'tpch'
      runner_type:
        description: 'GitHub runner label to execute on'
        type: string
        required: false
        default: 'spiceai-macos'
      scale_factor:
        description: 'TPC-H scale factor'
        type: string
        required: true
        default: '0.01'
      bucket:
        description: 'S3 bucket name'
        type: string
        required: true
      prefix:
        description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
        type: string
        required: false
        default: 'data-gen'
      max_concurrency:
        description: 'Maximum number of concurrent S3 writes'
        type: string
        required: true
        default: '8'
      region:
        description: 'AWS region'
        type: string
        required: true
        default: 'us-east-1'
      skip_initial:
        description: 'Skip data ingested by running --initial command'
        type: boolean
        required: false
        default: false
      num_steps:
        description: 'Number of data generation steps (partitions for TPC-H dbgen)'
        type: string
        required: false
        default: '5'
      checkpoint_interval_steps:
        description: 'Every N steps to take a checkpoint'
        type: string
        required: false
        default: '3'
jobs:
  # Restores or builds the data-generation and checkpointer binaries, then
  # runs data generation followed by the checkpointer, both with the AWS
  # credentials exported in their environment.
  run-data-generation:
    name: Run data generation
    runs-on: ${{ inputs.runner_type || github.event.inputs.runner_type || 'ubuntu-latest' }}
    # NOTE(review): GitHub-hosted runners cap a job at 6 hours, so the full
    # 600 minutes is presumably only reachable on self-hosted runners.
    timeout-minutes: 600
    steps:
      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6

      # The key hashes every Cargo manifest, lockfile and Rust source file.
      # `cache-hit` is 'true' only on an exact key match; a `restore-keys`
      # prefix match restores an older binary but still triggers the rebuild
      # steps below.
      - name: Cache data-generation binary
        id: cache-data-generation
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/data-generation
          key: data-generation-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            data-generation-${{ runner.os }}-

      - name: Setup Rust toolchain
        if: steps.cache-data-generation.outputs.cache-hit != 'true'
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          # Quoted so YAML keeps the version as a string (an unquoted 1.90
          # would be read as the float 1.9).
          toolchain: '1.91'
          cache: false

      - name: Build data-generation
        if: steps.cache-data-generation.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release -p data-generation
          install -m 755 target/release/data-generation ~/.spice/bin/data-generation

      - name: Run data generation
        # The script uses bash arrays, so the shell is pinned explicitly.
        shell: bash
        env:
          # Inputs reach the script through the environment rather than being
          # interpolated into the script text.
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          MAX_CONCURRENCY: ${{ inputs.max_concurrency || github.event.inputs.max_concurrency || '8' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          SKIP_INITIAL: ${{ inputs.skip_initial || github.event.inputs.skip_initial || false }}
          NUM_STEPS: ${{ inputs.num_steps || github.event.inputs.num_steps || '25' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # Collect the arguments in an array so that a value containing
          # spaces or glob characters is passed as one unmodified argument.
          args=(
            --dataset "${SCENARIO}"
            --scenario "${SCENARIO}"
            --scale-factor "${SCALE_FACTOR}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --max-concurrency "${MAX_CONCURRENCY}"
            --region "${REGION}"
            --num-steps "${NUM_STEPS}"
          )
          if [ "${SKIP_INITIAL}" = "true" ]; then
            args+=(--skip-initial)
          fi
          echo "Running: data-generation run ${args[*]}"
          ~/.spice/bin/data-generation run "${args[@]}"

      - name: Cache checkpointer binary
        id: cache-checkpointer
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/checkpointer
          key: checkpointer-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            checkpointer-${{ runner.os }}-

      # The earlier toolchain step is skipped when the data-generation cache
      # hit. This step covers the remaining case, where only the checkpointer
      # has to be rebuilt, so the build below always has the pinned toolchain.
      - name: Setup Rust toolchain for checkpointer
        if: >-
          steps.cache-checkpointer.outputs.cache-hit != 'true' &&
          steps.cache-data-generation.outputs.cache-hit == 'true'
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: '1.91'
          cache: false

      - name: Build checkpointer
        if: steps.cache-checkpointer.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release --features duckdb -p checkpointer
          install -m 755 target/release/checkpointer ~/.spice/bin/checkpointer

      - name: Run checkpointer
        # The script uses bash arrays, so the shell is pinned explicitly.
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          CHECKPOINT_INTERVAL_STEPS: ${{ inputs.checkpoint_interval_steps || github.event.inputs.checkpoint_interval_steps || '100' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # Render the scale factor as a plain decimal with at least one
          # fractional digit and no exponent (e.g. 1 -> 1.0, 0.01 -> 0.01).
          VERSION=$(python3 - <<'PY'
          import os
          sf = float(os.environ["SCALE_FACTOR"])
          s = format(sf, ".15g")
          if "e" in s or "E" in s:
              s = ("%.15f" % sf).rstrip("0").rstrip(".")
          if "." not in s:
              s = f"{s}.0"
          print(s)
          PY
          )
          # Collect the arguments in an array so that a value containing
          # spaces or glob characters is passed as one unmodified argument.
          args=(
            --scenario "${SCENARIO}"
            --version "${VERSION}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --duckdb-path ./checkpointer.duckdb
            --checkpoint-dir ./checkpoints
          )
          if [ -n "${REGION}" ]; then
            args+=(--region "${REGION}")
          fi
          args+=(--checkpoint-interval-steps "${CHECKPOINT_INTERVAL_STEPS}")
          echo "Running: checkpointer ${args[*]}"
          ~/.spice/bin/checkpointer "${args[@]}"