data-generation run - tpch - sf10.0 #69
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow name as shown in the Actions UI.
name: 'data-generation run'
# Per-run title of the form "data-generation run - <scenario> - sf<scale>".
# Each value is read from the `inputs` context first, then from
# `github.event.inputs`, and finally falls back to a literal
# ('tpch' for the scenario, '0.01' for the scale factor).
# The folded scalar joins the lines below with single spaces.
run-name: >-
  data-generation run -
  ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }} -
  sf${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
# Triggers. The same seven inputs are exposed through both entry points.
# Note that the defaults are NOT the same on both sides: `bucket`,
# `num_steps` and `checkpoint_interval_steps` differ between
# `workflow_call` and `workflow_dispatch`.
on:
  # Reusable-workflow entry point: every input is optional, the three MinIO
  # secrets must be supplied by the caller.
  workflow_call:
    inputs:
      scenario:
        description: 'Dataset/scenario to generate (e.g. tpch)'
        required: false
        default: 'tpch'
        type: string
      scale_factor:
        description: 'TPC-H scale factor'
        required: false
        default: '0.01'
        type: string
      bucket:
        description: 'S3 bucket name'
        required: false
        default: 'spiceai-public-datasets'
        type: string
      prefix:
        description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
        required: false
        default: 'data-gen'
        type: string
      region:
        description: 'AWS region'
        required: false
        default: 'us-east-1'
        type: string
      num_steps:
        description: 'Number of data generation steps (partitions for TPC-H dbgen)'
        required: false
        default: '25'
        type: string
      checkpoint_interval_steps:
        description: 'Every N steps to take a checkpoint'
        required: false
        default: '100'
        type: string
    secrets:
      MINIO_ENDPOINT:
        description: 'Endpoint handed to both binaries via --endpoint'
        required: true
      MINIO_ACCESS_KEY_ID:
        description: 'Access key, exported to the run steps as AWS_ACCESS_KEY_ID'
        required: true
      MINIO_SECRET_ACCESS_KEY:
        description: 'Secret key, exported to the run steps as AWS_SECRET_ACCESS_KEY'
        required: true
  # Manual entry point (Actions UI / API).
  workflow_dispatch:
    inputs:
      scenario:
        description: 'Dataset/scenario to generate (e.g. tpch)'
        required: true
        default: 'tpch'
        type: string
      scale_factor:
        description: 'TPC-H scale factor'
        required: true
        default: '0.01'
        type: string
      bucket:
        description: 'S3 bucket name'
        required: false
        default: 'spicebench'
        type: string
      prefix:
        description: 'Base S3 key prefix for generated files (scenario is appended automatically)'
        required: false
        default: 'data-gen'
        type: string
      region:
        description: 'AWS region'
        required: true
        default: 'us-east-1'
        type: string
      num_steps:
        description: 'Number of data generation steps (partitions for TPC-H dbgen)'
        required: false
        default: '5'
        type: string
      checkpoint_interval_steps:
        description: 'Every N steps to take a checkpoint'
        required: false
        default: '3'
        type: string
# Single job: restore or build the two binaries, generate the dataset, then
# run the checkpointer with the same scenario/bucket/prefix settings.
jobs:
  run-data-generation:
    name: Run data generation
    runs-on: spiceai-dev-runners
    timeout-minutes: 600
    steps:
      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6

      # Both binaries are cached under a key derived from the runner OS and a
      # hash of every Cargo.lock, Cargo.toml and *.rs file. `cache-hit` is
      # 'true' only for an exact key match; a `restore-keys` prefix match
      # restores an older binary but leaves `cache-hit` false, so the build
      # step below still runs and overwrites it.
      # NOTE(review): actions/cache and setup-rust-toolchain are pinned by tag
      # while checkout is pinned by commit SHA - consider pinning these too.
      - name: Cache data-generation binary
        id: cache-data-generation
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/data-generation
          key: data-generation-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            data-generation-${{ runner.os }}-

      - name: Cache checkpointer binary
        id: cache-checkpointer
        uses: actions/cache@v4
        with:
          path: ~/.spice/bin/checkpointer
          key: checkpointer-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml', '**/*.rs') }}
          restore-keys: |
            checkpointer-${{ runner.os }}-

      # The toolchain and CC setup are only needed when at least one binary
      # has to be rebuilt.
      - name: Setup Rust toolchain
        if: steps.cache-data-generation.outputs.cache-hit != 'true' || steps.cache-checkpointer.outputs.cache-hit != 'true'
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          # Quoted so YAML keeps it a string; an unquoted version such as
          # 1.90 would be read as the float 1.9.
          toolchain: '1.91'
          cache: false

      - name: Setup CC
        if: steps.cache-data-generation.outputs.cache-hit != 'true' || steps.cache-checkpointer.outputs.cache-hit != 'true'
        uses: ./.github/actions/setup-cc

      - name: Build data-generation
        if: steps.cache-data-generation.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release -p data-generation
          install -m 755 target/release/data-generation ~/.spice/bin/data-generation

      - name: Run data generation
        # bash is required for the argument array used below.
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          NUM_STEPS: ${{ inputs.num_steps || github.event.inputs.num_steps || '25' }}
          MINIO_ENDPOINT: ${{ secrets.MINIO_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.MINIO_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.MINIO_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # BUCKET and MINIO_ENDPOINT are the only values without a literal
          # fallback above; stop early instead of passing an empty argument.
          : "${BUCKET:?bucket input must not be empty}"
          : "${MINIO_ENDPOINT:?MINIO_ENDPOINT secret must not be empty}"
          # An array keeps each value a single argument even if it contains
          # whitespace or glob characters. The scenario is passed as both
          # --dataset and --scenario.
          ARGS=(
            --dataset "${SCENARIO}"
            --scenario "${SCENARIO}"
            --scale-factor "${SCALE_FACTOR}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --region "${REGION}"
            --num-steps "${NUM_STEPS}"
            --endpoint "${MINIO_ENDPOINT}"
          )
          echo "Running: data-generation run ${ARGS[*]}"
          ~/.spice/bin/data-generation run "${ARGS[@]}"

      # The checkpointer is built only after data generation has finished.
      - name: Build checkpointer
        if: steps.cache-checkpointer.outputs.cache-hit != 'true'
        run: |
          mkdir -p ~/.spice/bin
          cargo build --release --features duckdb -p checkpointer
          install -m 755 target/release/checkpointer ~/.spice/bin/checkpointer

      - name: Run checkpointer
        # bash is required for the argument array used below.
        shell: bash
        env:
          SCENARIO: ${{ inputs.scenario || github.event.inputs.scenario || 'tpch' }}
          SCALE_FACTOR: ${{ inputs.scale_factor || github.event.inputs.scale_factor || '0.01' }}
          BUCKET: ${{ inputs.bucket || github.event.inputs.bucket }}
          PREFIX: ${{ inputs.prefix || github.event.inputs.prefix || 'data-gen' }}
          REGION: ${{ inputs.region || github.event.inputs.region || 'us-east-1' }}
          CHECKPOINT_INTERVAL_STEPS: ${{ inputs.checkpoint_interval_steps || github.event.inputs.checkpoint_interval_steps || '100' }}
          MINIO_ENDPOINT: ${{ secrets.MINIO_ENDPOINT }}
          AWS_ACCESS_KEY_ID: ${{ secrets.MINIO_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.MINIO_SECRET_ACCESS_KEY }}
          RUST_LOG: info
        run: |
          # BUCKET and MINIO_ENDPOINT are the only values without a literal
          # fallback above; stop early instead of passing an empty argument.
          : "${BUCKET:?bucket input must not be empty}"
          : "${MINIO_ENDPOINT:?MINIO_ENDPOINT secret must not be empty}"
          # Turn SCALE_FACTOR into the --version string: plain decimal
          # notation (no exponent) with at least one fractional digit,
          # e.g. 10 -> 10.0 and 0.01 -> 0.01.
          # NOTE(review): the two `if` statements are treated as siblings;
          # confirm against the committed file.
          VERSION=$(python3 - <<'PY'
          import os
          sf = float(os.environ["SCALE_FACTOR"])
          s = format(sf, ".15g")
          if "e" in s or "E" in s:
              s = ("%.15f" % sf).rstrip("0").rstrip(".")
          if "." not in s:
              s = f"{s}.0"
          print(s)
          PY
          )
          # An array keeps each value a single argument even if it contains
          # whitespace or glob characters.
          ARGS=(
            --scenario "${SCENARIO}"
            --version "${VERSION}"
            --bucket "${BUCKET}"
            --prefix "${PREFIX}"
            --duckdb-path ./checkpointer.duckdb
            --checkpoint-dir ./checkpoints
          )
          if [ -n "${REGION}" ]; then
            ARGS+=(--region "${REGION}")
          fi
          ARGS+=(
            --checkpoint-interval-steps "${CHECKPOINT_INTERVAL_STEPS}"
            --endpoint "${MINIO_ENDPOINT}"
          )
          echo "Running: checkpointer ${ARGS[*]}"
          ~/.spice/bin/checkpointer "${ARGS[@]}"