diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index cd538642a9..3681057db3 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -50,7 +50,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + uses: github/codeql-action/init@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -69,7 +69,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + uses: github/codeql-action/autobuild@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 https://git.io/JvXDl @@ -87,4 +87,4 @@ jobs: run: sccache --show-stats - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + uses: github/codeql-action/analyze@9e0d7b8d25671d64c341c19c0152d693099fb5ba # v4 diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 086440b298..d8eb105446 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -338,7 +338,7 @@ jobs: - name: Set up Github Token if: needs.check_changes.outputs.relevant_changes == 'true' - uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v2 + uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 # v2 id: github-app-token with: app-id: ${{ vars.ORG_MEMBERS_GITHUB_APP_ID }} diff --git a/.github/workflows/testoperator_run_append.yml b/.github/workflows/testoperator_run_append.yml index 488d746bf2..c9c68def80 100644 --- a/.github/workflows/testoperator_run_append.yml +++ b/.github/workflows/testoperator_run_append.yml @@ -13,6 +13,11 @@ on: required: true default: '720' type: string + concurrency: + description: 'Number of analytical query workers to run while append operations execute' + required: false + default: '1' + type: string spicepod_path: description: 'The spicepod file to test with' required: true @@ -147,6 +152,7 @@ jobs: --disable-progress-bars \ --metrics \ --duration ${{ github.event.inputs.duration }} \ + --concurrency ${{ github.event.inputs.concurrency }} \ --load-interval ${{ github.event.inputs.load_interval }} \ --load-steps ${{ github.event.inputs.load_steps }} \ ${{ github.event.inputs.query_overrides != '' && format('--query-overrides {0}', github.event.inputs.query_overrides) || '' }} \ diff --git a/Cargo.lock b/Cargo.lock index 482ed37301..86584075fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1601,9 +1601,9 @@ dependencies = [ [[package]] name = "aws-sdk-glue" -version = "1.145.0" +version = 
"1.145.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95190a2c2e0be9088d0f26ad4931d91defed3a5f584559ecb71189de6d4bc238" +checksum = "6eaa019d39389807e4681e6a040a2268fa5dce7c2be4407117d54f52c0c77bb5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -3407,6 +3407,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-table-providers", + "duckdb", "flatbuffers", "futures", "hash-index", @@ -3423,6 +3424,7 @@ dependencies = [ "serde", "serde_json", "snafu", + "telemetry", "tempfile", "test-framework", "test-log", @@ -6931,6 +6933,16 @@ dependencies = [ "util", ] +[[package]] +name = "earcutr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79127ed59a85d7687c409e9978547cffb7dc79675355ed22da6b66fd5f6ead01" +dependencies = [ + "itertools 0.11.0", + "num-traits", +] + [[package]] name = "ecdsa" version = "0.14.8" @@ -7680,6 +7692,12 @@ dependencies = [ "rand_distr 0.5.1", ] +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + [[package]] name = "fnv" version = "1.0.7" @@ -8241,6 +8259,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "geo" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fc1a1678e54befc9b4bcab6cd43b8e7f834ae8ea121118b0fd8c42747675b4a" +dependencies = [ + "earcutr", + "float_next_after", + "geo-types", + "geographiclib-rs", + "i_overlay", + "log", + "num-traits", + "robust", + "rstar", + "spade", +] + +[[package]] +name = "geo-traits" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7c353d12a704ccfab1ba8bfb1a7fe6cb18b665bf89d37f4f7890edcd260206" +dependencies = [ + "geo-types", +] + [[package]] name = "geo-types" version = "0.7.18" @@ -8249,7 +8294,91 @@ checksum = 
"24f8647af4005fa11da47cd56252c6ef030be8fa97bdbf355e7dfb6348f0a82c" dependencies = [ "approx", "num-traits", + "rayon", + "rstar", + "serde", +] + +[[package]] +name = "geoarrow-array" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1cc4106ac0a0a512c398961ce95d8150475c84a84e17c4511c3643fa120a17" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "geo-traits", + "geoarrow-schema", + "num-traits", + "wkb", + "wkt", +] + +[[package]] +name = "geoarrow-expr-geo" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa84300361ce57fb875bcaa6e32b95b0aff5c6b1af692b936bdd58ff343f4394" +dependencies = [ + "arrow-array", + "arrow-buffer", + "geo", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", +] + +[[package]] +name = "geoarrow-schema" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97be4e9f523f92bd6a0e0458323f4b783d073d011664decd8dbf05651704f34" +dependencies = [ + "arrow-schema", + "geo-traits", "serde", + "serde_json", + "thiserror 1.0.69", +] + +[[package]] +name = "geodatafusion" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cb8faa9b3bf4ae9f49b1f023b82d20626826f6448a7055498376146c10c4ead" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-schema", + "datafusion", + "geo", + "geo-traits", + "geoarrow-array", + "geoarrow-expr-geo", + "geoarrow-schema", + "geohash", + "thiserror 1.0.69", + "wkt", +] + +[[package]] +name = "geographiclib-rs" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5a7f08910fd98737a6eda7568e7c5e645093e073328eeef49758cfe8b0489c7" +dependencies = [ + "libm", +] + +[[package]] +name = "geohash" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fb94b1a65401d6cbf22958a9040aa364812c26674f841bee538b12c135db1e6" 
+dependencies = [ + "geo-types", + "libm", ] [[package]] @@ -8730,6 +8859,15 @@ dependencies = [ "byteorder", ] +[[package]] +name = "hash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.5.0" @@ -8852,13 +8990,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" dependencies = [ "atomic-polyfill", - "hash32", + "hash32 0.2.1", "rustc_version", "serde", "spin 0.9.8", "stable_deref_trait", ] +[[package]] +name = "heapless" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" +dependencies = [ + "hash32 0.3.1", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.4.1" @@ -9341,6 +9489,49 @@ dependencies = [ "tower-service", ] +[[package]] +name = "i_float" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "010025c2c532c8d82e42d0b8bb5184afa449fa6f06c709ea9adcb16c49ae405b" +dependencies = [ + "libm", +] + +[[package]] +name = "i_key_sort" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9190f86706ca38ac8add223b2aed8b1330002b5cdbbce28fb58b10914d38fc27" + +[[package]] +name = "i_overlay" +version = "4.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413183068e6e0289e18d7d0a1f661b81546e6918d5453a44570b9ab30cbed1b3" +dependencies = [ + "i_float", + "i_key_sort", + "i_shape", + "i_tree", + "rayon", +] + +[[package]] +name = "i_shape" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ea154b742f7d43dae2897fcd5ead86bc7b5eefcedd305a7ebf9f69d44d61082" +dependencies = [ + "i_float", +] + 
+[[package]] +name = "i_tree" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e6d558e6d4c7b82bc51d9c771e7a927862a161a7d87bf2b0541450e0e20915" + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -9980,6 +10171,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -13866,7 +14066,7 @@ dependencies = [ "cobs", "embedded-io 0.4.0", "embedded-io 0.6.1", - "heapless", + "heapless 0.7.17", "serde", ] @@ -15493,6 +15693,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "robust" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e27ee8bb91ca0adcf0ecb116293afa12d393f9c2b9b9cd54d33e8078fe19839" + [[package]] name = "rsa" version = "0.9.10" @@ -15514,6 +15720,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rstar" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" +dependencies = [ + "heapless 0.8.0", + "num-traits", + "smallvec 1.15.1", +] + [[package]] name = "rstest" version = "0.25.0" @@ -15679,6 +15896,7 @@ dependencies = [ "flight_client", "fundu", "futures", + "geodatafusion", "gethostname", "globset", "governor", @@ -17693,6 +17911,18 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fef461faaeb36c340b6c887167a9054a034f6acfc50a014ead26a02b4356b3de" +[[package]] +name = "spade" +version = "2.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9699399fd9349b00b184f5635b074f9ec93afffef30c853f8c875b32c0f8c7fa" +dependencies = [ + "hashbrown 0.16.1", + "num-traits", + "robust", + "smallvec 
1.15.1", +] + [[package]] name = "spark-connect-core" version = "0.0.1-beta.4" @@ -18820,7 +19050,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" dependencies = [ "fastrand 2.3.0", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix 1.1.4", "windows-sys 0.61.2", @@ -22959,6 +23189,31 @@ dependencies = [ "wasmparser 0.244.0", ] +[[package]] +name = "wkb" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a120b336c7ad17749026d50427c23d838ecb50cd64aaea6254b5030152f890a9" +dependencies = [ + "byteorder", + "geo-traits", + "num_enum", + "thiserror 1.0.69", +] + +[[package]] +name = "wkt" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb2b923ccc882312e559ffaa832a055ba9d1ac0cc8e86b3e25453247e4b81d7" +dependencies = [ + "geo-traits", + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "workers" version = "2.0.0-unstable" diff --git a/Cargo.toml b/Cargo.toml index 3ca0407d4b..620eb422fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,7 +127,7 @@ aws-sdk-cognitoidentity = "1.99.0" aws-sdk-cognitoidentityprovider = "1.116.0" aws-sdk-dynamodb = "1.111.0" aws-sdk-dynamodbstreams = "1.99.0" -aws-sdk-glue = "1.145.0" +aws-sdk-glue = "1.145.1" aws-sdk-s3 = "1.132.0" aws-sdk-s3vectors = "1.24.0" aws-sdk-secretsmanager = "1.104.0" @@ -191,6 +191,7 @@ duckdb = "1.10502" # 1.10502.x corresponds to DuckDB v1.5.2 dyn-hash = "1.0.0" fundu = "2.0.1" futures = "0.3.32" +geodatafusion = { version = "0.3", default-features = false } gethostname = "1.1.0" git2 = { version = "0.20", default-features = false, features = [ "https", diff --git a/bin/spiced/Cargo.toml b/bin/spiced/Cargo.toml index c7828e1016..39d0cf944b 100644 --- a/bin/spiced/Cargo.toml +++ b/bin/spiced/Cargo.toml @@ -123,6 +123,7 @@ duckdb = ["connector-duckdb", "runtime/duckdb"] 
dynamodb = ["runtime/dynamodb"] flightsql = ["connector-flightsql", "runtime/flightsql"] ftp = ["connector-ftp", "runtime/ftp"] +geo = ["runtime/geo"] http-functions = ["runtime/http-functions"] rate-control = ["runtime/rate-control"] wasm-functions = ["runtime/wasm-functions"] diff --git a/crates/cayenne/Cargo.toml b/crates/cayenne/Cargo.toml index 94877271da..1e83b6ba61 100644 --- a/crates/cayenne/Cargo.toml +++ b/crates/cayenne/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["Spice.ai OSS Authors"] -description = "Cayenne: A DuckLake-inspired lakehouse format using SQLite for metadata and Vortex files for data storage" +description = "Cayenne: A lakehouse format using SQLite or Turso for metadata and Vortex files for data storage" edition = "2024" license = "Apache-2.0" name = "cayenne" @@ -42,6 +42,7 @@ runtime-datafusion = { path = "../runtime-datafusion" } runtime-table-partition = { path = "../runtime-table-partition", optional = true } rusqlite = { workspace = true } serde = { workspace = true } +telemetry = { path = "../telemetry" } tokio-rusqlite = { workspace = true } serde_json = { workspace = true } snafu = { workspace = true } @@ -57,6 +58,7 @@ vortex = { workspace = true } vortex-datafusion = { workspace = true } vortex-scan = { workspace = true } vortex-session = { workspace = true } +duckdb = { workspace = true, features = ["bundled"], optional = true } # Force aegis to use pure-rust implementation for ARM64 builds to avoid C compilation issues [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] aegis = { version = "0.9.3", default-features = false, features = [ @@ -68,6 +70,7 @@ aegis = { version = "0.9.3", default-features = false, features = [ default = [] turso = ["dep:turso"] partition-table-provider = ["dep:runtime-table-partition"] +duckdb-bench = ["dep:duckdb"] [dev-dependencies] criterion = { version = "0.7", features = ["html_reports", "async_tokio"] } @@ -101,3 +104,117 @@ name = "mutation_writer" [[bench]] 
harness = false name = "listing_fence_overhead" + +[[bench]] +harness = false +name = "sorted_append_overhead" + +[[bench]] +harness = false +name = "inner_join_sort_merge_rewrite" + +[[bench]] +harness = false +name = "staging_move_concurrency" + +[[bench]] +harness = false +name = "column_stats_contention" + +[[bench]] +harness = false +name = "checkpoint_fence_stall" + +[[bench]] +harness = false +name = "metastore_connection_contention" + +[[bench]] +harness = false +name = "validate_on_conflict_buffering" + +[[bench]] +harness = false +name = "apply_on_conflict_per_row_alloc" + +[[bench]] +harness = false +name = "compaction_sort_serialization" + +[[bench]] +harness = false +name = "inline_memtable_read_overhead" + +[[bench]] +harness = false +name = "inline_upsert_rewrite_overhead" + +[[bench]] +harness = false +name = "cached_table_statistics_wide" + +[[bench]] +harness = false +name = "deletion_index_extend_map_clone" + +[[bench]] +harness = false +name = "deletion_vector_bitmap_to_treemap" + +[[bench]] +harness = false +name = "wide_table_key_probe_scan" + +[[bench]] +harness = false +name = "vs_duckdb_in_list_delete" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_ingest" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_scan" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_pk_lookup" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_delete" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_burst" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_upsert" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_groupby" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "vs_duckdb_join" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = 
"vs_duckdb_concurrent" +required-features = ["duckdb-bench"] + +[[bench]] +harness = false +name = "compaction_picker" diff --git a/crates/cayenne/README.md b/crates/cayenne/README.md index 740943214b..55510987cd 100644 --- a/crates/cayenne/README.md +++ b/crates/cayenne/README.md @@ -1,60 +1,77 @@ # Cayenne -A DuckLake-inspired lakehouse format for the Vortex accelerator that combines pluggable metastore backends (SQLite, Turso) for metadata management with Vortex files as the data lake. +A lakehouse format for the Vortex accelerator. Combines pluggable metastore backends (SQLite, Turso) for transactional metadata with Vortex files for columnar data, plus an LSM-style level-0 inline-data tier that absorbs small writes without writing data files. ## Overview -Cayenne provides a lakehouse format that enables efficient CRUD operations on columnar data with the following features: +Cayenne provides a lakehouse format that enables efficient CRUD operations on columnar data: -- **Pluggable Metastore Backends**: Transactional metadata management with support for SQLite and Turso (optional) -- **Vortex Data Files**: High-performance columnar storage with compression -- **Deletion Vectors**: Efficient delete tracking using Arrow IPC files, supporting both position-based and key-based deletion -- **Sequence-Based Ordering**: Iceberg-style sequence numbers for correct delete/insert ordering across snapshots -- **Partition Metadata**: File-based partitioning; metadata supports composite partition keys (current public API exposes a single partition column) -- **Staging WAL**: Crash-safe write-ahead log for in-progress writes +- **Pluggable metastore backends** (`metastore::sqlite::SqliteMetastore`, optional `metastore::turso::TursoMetastore`) for transactional metadata with `BEGIN ... COMMIT` semantics. +- **Vortex data files** as the persistent columnar tier, with configurable target file size, compression, and concurrent upload fan-out. 
+- **Inline-data memtable** (`cayenne_inlined_data` / `cayenne_inlined_delete` tables) absorbs small bursts directly in the metastore as Arrow IPC blobs, flushed to Vortex once accumulated rows / segments / bytes exceed configurable thresholds. +- **Deletion vectors** stored as Arrow IPC files for position-based deletion, plus an in-memory PK index (`DeletionIndex` / `KeyDeletionIndex`) for key-based deletion. Sequence-numbered for Iceberg-style upsert semantics. +- **Staging WAL** (`provider/staging_wal.rs`) provides crash-safe append commit via tmp+fsync+rename of the WAL marker, atomic rename of staged Vortex files into the current snapshot, and self-healing recovery on the next provider open. +- **Tiered small-files compaction** (`provider/compaction.rs`) triggered best-effort after writes and periodically by a per-table background compactor, gated by a shared per-accelerator semaphore so a fleet of tables can't oversubscribe the writer pool. +- **CDC apply pipelining** (`provider/mutation_writer::write_cdc_pipelined`): Stage A writes Vortex files into the staging dir under the staging WAL; Stage B (move + listing-cache invalidation) is spawned as a finalize task so the next burst's Stage A can begin work. Stage A and Stage B always preserve burst order. +- **Sequence-based ordering** (Iceberg-style) for correct delete/insert visibility across snapshots. +- **Partitioning** via composite partition keys; the current public API surface accepts a single partition column. +- **PK conflict detection opt-out** (`cayenne_pk_conflict_detection: none`) for append-only CDC workloads where the source enforces PK uniqueness and the ingestion path cannot replay existing rows. 
## Architecture ```text -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ CayenneTableProvider β”‚ -β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Metastore Backend β”‚ β”‚ -β”‚ β”‚ (SQLite or Turso) β”‚ β”‚ -β”‚ β”‚ β”‚ β”‚ -β”‚ β”‚ - Table Schemas & Config β”‚ β”‚ -β”‚ β”‚ - Delete File References β”‚ β”‚ -β”‚ β”‚ - Partition Metadata β”‚ β”‚ -β”‚ β”‚ - Insert Records (PK tracking) β”‚ β”‚ -β”‚ β”‚ - Snapshot Sequences β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Vortex Data Lake β”‚ β”‚ -β”‚ β”‚ β”‚ β”‚ -β”‚ β”‚ └─ / β”‚ β”‚ -β”‚ β”‚ β”œβ”€ / β”‚ β”‚ -β”‚ β”‚ β”‚ β”œβ”€ data_001.vortex β”‚ β”‚ -β”‚ β”‚ β”‚ β”œβ”€ data_002.vortex β”‚ β”‚ -β”‚ β”‚ β”‚ └─ deletions/ β”‚ β”‚ -β”‚ β”‚ β”‚ └─ del_001.arrow β”‚ β”‚ -β”‚ β”‚ └─ / β”‚ β”‚ -β”‚ β”‚ └─ ... 
β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Optional: Object Store β”‚ β”‚ -β”‚ β”‚ (S3, S3 Express One Zone) β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ CayenneTableProvider β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Metastore (SqliteMetastore or TursoMetastore) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ cayenne_table cayenne_partition β”‚ β”‚ +β”‚ β”‚ cayenne_delete_file cayenne_insert_record β”‚ β”‚ +β”‚ β”‚ cayenne_snapshot_sequence β”‚ β”‚ +β”‚ β”‚ cayenne_table_statistics β”‚ β”‚ +β”‚ β”‚ cayenne_inlined_data ← LSM level-0 memtable β”‚ β”‚ +β”‚ β”‚ cayenne_inlined_delete ← LSM level-0 tombstones β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Vortex Data Lake β€” listing tables per snapshot dir β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ / β”‚ β”‚ +β”‚ β”‚ β”œβ”€ / β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€ part-001.vortex β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€ 
part-002.vortex β”‚ β”‚ +β”‚ β”‚ β”‚ └─ deletions/del-001.arrow β”‚ β”‚ +β”‚ β”‚ β”œβ”€ / ← Stage A buffer β”‚ β”‚ +β”‚ β”‚ β”‚ β”œβ”€ WAL β”‚ β”‚ +β”‚ β”‚ β”‚ └─ part-…vortex β”‚ β”‚ +β”‚ β”‚ └─ / β”‚ β”‚ +β”‚ β”‚ └─ … β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ In-memory state β”‚ β”‚ +β”‚ β”‚ listing_fence (RwLock) β€” read/write barrier β”‚ β”‚ +β”‚ β”‚ listing_table (ArcSwap) β”‚ β”‚ +β”‚ β”‚ scan_listing_tables (cache, Mutex) β”‚ β”‚ +β”‚ β”‚ pk_deletion_strategy (ArcSwap) β”‚ β”‚ +β”‚ β”‚ protected_snapshots (RwLock) β”‚ β”‚ +β”‚ β”‚ inlined_row_count (AtomicI64) β€” memtable size β”‚ β”‚ +β”‚ β”‚ post_write_maintenance (debounced refresh + stats) β”‚ β”‚ +β”‚ β”‚ background_compactor (per-table) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Optional: Object Store (S3, S3 Express One Zone) β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` ## Key Components -### 1. Metastore Backend (`metastore.rs`) +### 1. 
Metastore backend (`metastore.rs`) -The `MetastoreBackend` trait defines a pluggable storage abstraction for metadata: +The `MetastoreBackend` trait defines the pluggable storage abstraction: ```rust #[async_trait] @@ -62,120 +79,216 @@ pub trait MetastoreBackend: Send + Sync { async fn init_schema(&self) -> CatalogResult<()>; async fn execute(&self, params: ExecuteParams<'_>) -> CatalogResult<()>; async fn execute_batch(&self, sql: &str) -> CatalogResult<()>; - async fn query_row(&self, params: QueryRowParams<'_>, f: F) -> CatalogResult; - async fn query(&self, params: QueryParams<'_>, f: F) -> CatalogResult>; + async fn query_row(&self, params: QueryRowParams<'_>, f: F) -> CatalogResult + where F: FnOnce(&dyn MetastoreRow) -> CatalogResult + Send, + T: Send; + async fn query(&self, params: QueryParams<'_>, f: F) -> CatalogResult> + where F: FnMut(&dyn MetastoreRow) -> CatalogResult + Send, + T: Send; + async fn begin_transaction(&self) -> CatalogResult>; async fn shutdown(&self) -> CatalogResult<()>; } ``` -Transactions are handled by a separate `MetastoreTransaction` trait: +Transactions go through a separate `MetastoreTransaction` trait that owns the transaction handle and is consumed by `commit` / `rollback`: ```rust #[async_trait] -pub trait MetastoreTransaction: Send + Sync { +pub trait MetastoreTransaction: Send { async fn execute(&self, params: ExecuteParams<'_>) -> CatalogResult<()>; - async fn query_row(&self, params: QueryRowParams<'_>, f: F) -> CatalogResult; - async fn query(&self, params: QueryParams<'_>, f: F) -> CatalogResult>; - async fn commit(self) -> CatalogResult<()>; - async fn rollback(self) -> CatalogResult<()>; + async fn execute_batch(&self, sql: &str) -> CatalogResult<()>; + async fn query_row(&self, params: QueryRowParams<'_>, f: F) -> CatalogResult where ...; + async fn query(&self, params: QueryParams<'_>, f: F) -> CatalogResult> where ...; + async fn commit(self: Box) -> CatalogResult<()>; + async fn rollback(self: Box) -> 
CatalogResult<()>; } ``` **Implementations:** -- **SQLite** (`metastore/sqlite.rs`): Default backend using rusqlite with WAL mode for concurrent access -- **Turso** (`metastore/turso.rs`): Optional backend using libsql/Turso (requires `turso` feature flag) +- **`SqliteMetastore`** (`metastore/sqlite.rs`): default; `tokio-rusqlite` with WAL mode and busy-timeout. All metastore operations serialize through one `tokio::sync::Mutex`, so writes across all tables sharing the same metastore are ordered. +- **`TursoMetastore`** (`metastore/turso.rs`): optional, gated on the `turso` feature. libSQL/Turso backend that supports `BEGIN CONCURRENT` for higher write parallelism. -### 2. Metadata Catalog (`catalog.rs`) +### 2. Metadata catalog (`catalog.rs`) -The `MetadataCatalog` trait defines the interface for metadata operations: +`MetadataCatalog` is the higher-level interface that the table provider uses; `CayenneCatalog` (`cayenne_catalog.rs`) is the concrete implementation backed by any `MetastoreBackend`. 
Selected methods (full signature in `catalog.rs`): ```rust #[async_trait] pub trait MetadataCatalog: Send + Sync { async fn init(&self) -> CatalogResult<()>; async fn list_table_names(&self) -> CatalogResult>; - async fn create_table(&self, options: CreateTableOptions) -> CatalogResult; + async fn create_table(&self, options: CreateTableOptions) -> CatalogResult; // table_id async fn get_table(&self, table_name: &str) -> CatalogResult; - async fn set_current_snapshot(&self, table_id: i64, snapshot_id: &str) -> CatalogResult<()>; - async fn increment_sequence_number(&self, table_id: i64) -> CatalogResult; - async fn get_sequence_number(&self, table_id: i64) -> CatalogResult; - async fn add_delete_file(&self, delete_file: DeleteFile) -> CatalogResult; - async fn get_table_delete_files(&self, table_id: i64) -> CatalogResult>; - async fn remove_delete_files(&self, table_id: i64, delete_file_ids: &[i64]) -> CatalogResult<()>; - async fn clear_delete_files(&self, table_id: i64) -> CatalogResult<()>; - async fn add_insert_record(&self, table_id: i64, pk_bytes: Vec, sequence_number: i64) -> CatalogResult<()>; - async fn add_insert_records_batch(&self, table_id: i64, pk_bytes_list: Vec>, sequence_number: i64) -> CatalogResult<()>; - async fn get_insert_records(&self, table_id: i64) -> CatalogResult, i64>>; - async fn clear_insert_records(&self, table_id: i64) -> CatalogResult<()>; - async fn set_snapshot_sequence(&self, table_id: i64, snapshot_id: &str, sequence_number: i64) -> CatalogResult<()>; - async fn get_snapshot_sequence(&self, table_id: i64, snapshot_id: &str) -> CatalogResult>; - async fn get_all_snapshot_sequences(&self, table_id: i64) -> CatalogResult>; - async fn clear_snapshot_sequence(&self, table_id: i64, snapshot_id: &str) -> CatalogResult<()>; - async fn commit_compaction(&self, table_id: i64, new_snapshot_id: &str) -> CatalogResult<()>; - async fn add_partition(&self, partition: PartitionMetadata) -> CatalogResult; - async fn get_partitions(&self, 
table_id: i64) -> CatalogResult>; async fn drop_table(&self, table_name: &str) -> CatalogResult; + + // Sequence numbers + async fn increment_sequence_number(&self, table_id: &str) -> CatalogResult; + async fn get_sequence_number(&self, table_id: &str) -> CatalogResult; + + // Delete files (position- and key-based) + async fn add_delete_file(&self, delete_file: DeleteFile) -> CatalogResult; + async fn get_table_delete_files(&self, table_id: &str) -> CatalogResult>; + async fn remove_delete_files(&self, table_id: &str, ids: &[String]) -> CatalogResult<()>; + async fn clear_delete_files(&self, table_id: &str) -> CatalogResult<()>; + + // Insert records for upsert re-insertion tracking + async fn add_insert_records_batch(&self, table_id: &str, pks: Vec>, seq: i64) -> CatalogResult<()>; + async fn get_insert_records(&self, table_id: &str) -> CatalogResult, i64>>; + async fn clear_insert_records(&self, table_id: &str) -> CatalogResult<()>; + + // Snapshot sequences (drives protected-snapshot filtering) + async fn set_snapshot_sequence(&self, table_id: &str, snapshot_id: &str, seq: i64) -> CatalogResult<()>; + async fn get_all_snapshot_sequences(&self, table_id: &str) -> CatalogResult>; + async fn clear_snapshot_sequence(&self, table_id: &str, snapshot_id: &str) -> CatalogResult<()>; + + // Atomic snapshot pointer flips (compaction and overwrite share retry-on-conflict logic) + async fn commit_compaction(&self, table_id: &str, new_snapshot_id: &str) -> CatalogResult<()>; + async fn commit_overwrite(&self, table_id: &str, new_snapshot_id: &str) -> CatalogResult<()>; + + // Partitions + async fn add_partition(&self, partition: PartitionMetadata) -> CatalogResult; + async fn get_partitions(&self, table_id: &str) -> CatalogResult>; + + // Persisted table statistics (column-level, loaded from Vortex footers) + async fn upsert_table_statistics(&self, stats: &TableStatistics) -> CatalogResult<()>; + async fn get_table_statistics(&self, table_id: &str) -> CatalogResult>; + async 
fn clear_table_statistics(&self, table_id: &str) -> CatalogResult<()>; + + // Inline-data memtable (small-write LSM level 0, stored as Arrow IPC blobs) + async fn add_inlined_data(&self, data: InlinedData) -> CatalogResult; + async fn get_inlined_data(&self, table_id: &str) -> CatalogResult>; + async fn get_inlined_data_for_partition(&self, table_id: &str, partition_key: &str) -> CatalogResult>; + async fn get_inlined_data_count(&self, table_id: &str) -> CatalogResult; + async fn get_inlined_data_stats(&self, table_id: &str) -> CatalogResult; + async fn clear_inlined_data(&self, table_id: &str) -> CatalogResult<()>; + + // Inline tombstones + async fn add_inlined_delete(&self, delete: InlinedDelete) -> CatalogResult; + async fn commit_inlined_mutation(&self, ...) -> CatalogResult<()>; // atomic data+delete update + async fn get_inlined_deletes(&self, table_id: &str) -> CatalogResult>; + async fn clear_inlined_deletes(&self, table_id: &str) -> CatalogResult<()>; + + async fn export_dataset_slice(&self, ...) -> CatalogResult<...>; // for snapshot/restore async fn shutdown(&self) -> CatalogResult<()>; } ``` -Implementation: `CayenneCatalog` (`cayenne_catalog.rs`), backed by any `MetastoreBackend`. - -### 3. Metadata Structures (`metadata.rs`) +`table_id` is a `String` (UUIDv7) β€” not an integer β€” so identifiers are stable across catalog dumps and snapshots. -Core data structures: +### 3. Metadata structures (`metadata.rs`) -- **`TableMetadata`**: Table schema, configuration, current snapshot ID, and sequence number -- **`DataFile`**: Reference to a Vortex data file with partition and sequence tracking -- **`DeleteFile`**: Reference to a deletion vector (Arrow IPC file) with sequence number -- **`VortexConfig`**: Vortex file compression and caching configuration +- **`TableMetadata`** β€” table schema, primary key, on-conflict policy, current snapshot id, sequence number, `VortexConfig`. 
+- **`DataFile`** β€” virtual file (a directory containing one or more Vortex files), with row count, byte size, partition id, sequence number, and a row-id base. +- **`DeleteFile`** β€” deletion vector reference (Arrow IPC file), with `DeletionType` (position- vs key-based) and sequence number. +- **`InlinedData`** β€” Arrow IPC blob stored inline in the metastore, with row count and sequence number. +- **`InlinedDelete`** β€” inline tombstone for upserted/deleted PKs that haven't yet been checkpointed to a delete-vector file. +- **`InlinedDataStats`** β€” `{ total_rows, segment_count, total_bytes }` aggregated from `cayenne_inlined_data` for memtable-pressure decisions. +- **`PartitionMetadata`** β€” composite partition key, partition path, record/byte counts. +- **`TableStatistics`** β€” serialized `FileStatistics` blob plus `num_rows`; populated from Vortex file footers and read by the DataFusion planner. +- **`VortexConfig`** β€” Vortex-side tuning. All fields configurable per dataset via `cayenne_*` runtime parameters: ```rust pub struct VortexConfig { - pub footer_cache_mb: usize, // default: 128 - pub segment_cache_mb: usize, // default: 256 - pub target_vortex_file_size_mb: usize, // default: 128 - pub sort_columns: Vec, // default: empty - pub compression_strategy: CompressionStrategy, // default: CompressionStrategy::default() - pub upload_concurrency: usize, // default: 4 + // Vortex caches and file shape + pub footer_cache_mb: usize, // default 128 (currently ignored in 2.0.0-unstable) + pub segment_cache_mb: usize, // default 256 (currently ignored in 2.0.0-unstable) + pub target_vortex_file_size_mb: usize, // default 128 + + // Encoding / sort + pub sort_columns: Vec, // default [] + pub compression_strategy: CompressionStrategy, // default Btrblocks + + // Writer concurrency + pub upload_concurrency: usize, // default available_parallelism() + pub write_concurrency: Option, // None = session target_partitions; forced to 1 if sort_columns set + + // 
Compaction + pub compaction_trigger_files: usize, // default 8 + pub compaction_max_levels: usize, // default 3 + pub compaction_max_files_per_pick: usize, // default 32 + pub compaction_background_interval_ms: u64, // default 30_000, 0 disables background loop + + // Inline-write admission (per-call gate) + pub inline_max_rows: usize, // default 1_024 + pub inline_max_bytes: usize, // default 1_048_576 (1 MiB serialized IPC) + pub inline_max_buffer_bytes: usize, // default 4_194_304 (4 MiB pre-decode buffer) + + // Inline-memtable flush triggers (cumulative gate) + pub inline_flush_max_rows: i64, // default 10_000 + pub inline_flush_max_segments: i64, // default 64 + pub inline_flush_max_bytes: i64, // default 8_388_608 (8 MiB total IPC) + + // PK conflict detection + pub pk_conflict_detection: PkConflictDetection, // default Auto; None opts into blind append for CDC } ``` -### 4. Deletion Vectors (`provider/delete/vector_io.rs`) +Two distinct threshold groups for inline data β€” `inline_max_*` is the *per-write admission* gate ("is this single write small enough to absorb into the memtable?"); `inline_flush_max_*` is the *cumulative flush* gate ("has the accumulated memtable grown enough that we should checkpoint it to Vortex?"). -Efficient delete tracking without rewriting data files. Deletion vectors are stored as Arrow IPC files and support two modes: +### 4. Deletion vectors (`provider/delete/vector_io.rs`) + +Two deletion modes, persisted as Arrow IPC files referenced by `cayenne_delete_file`: ```rust pub enum DeletionIdentifier { - /// Position-based: tracks specific row IDs within a data file - PositionBased { file_path: String, row_ids: Vec }, - /// Key-based: tracks primary key bytes for cross-file deletion + /// Position-based: row positions inside a specific data file. + PositionBased { row_ids: Vec }, + /// Key-based: PK bytes; survive partition reorganization and parallel coalescing. 
KeyBased(Vec>), } ``` -The `DeletionVectorWriter` writes deletion vectors as Arrow IPC files. The `DeletionVectorReader` reads them back for query-time filtering. +At scan time: + +- **Position-based** strategy attaches a `RoaringBitmap` per file via `Selection::ExcludeRoaring`, pushed down to the Vortex scan layer (`provider/vortex_format::DeletionFilteringVortexFormat`). +- **Key-based** strategy (Int64 PK or row-key) runs `Int64PkDeletionFilterExec` / `KeyBasedDeletionFilterExec` (`provider/delete/filter_exec.rs`) above the file scan. Each row's PK is bloom-prefiltered, then probed against the cached `DeletionIndex` / `KeyDeletionIndex`. + +The deletion index plus its companion insert-records index are published as a single atomic snapshot (`Int64PkDeletionSnapshot` / `RowConverterDeletionSnapshot` β€” `provider/deletion_strategy.rs`), held in one `ArcSwap` per table so concurrent scans observe consistent `(deleted, insert_records)` pairs even mid-upsert. -### 5. Table Provider (`provider/table.rs`) +### 5. Table provider (`provider/table.rs`) -DataFusion `TableProvider` implementation with builder pattern: +DataFusion `TableProvider` implementation. Constructed via `CayenneTableProviderBuilder`. 
The struct holds (abbreviated β€” full list in source): ```rust pub struct CayenneTableProvider { table_metadata: TableMetadata, catalog: Arc, - listing_table: Arc>>, + + // Listing-table state + listing_table: Arc>, // legacy stats path + listing_fence: Arc>, // read/write barrier + scan_listing_tables: Arc>>>, + table_statistics: Arc>>, + + // Filters and conflict resolution retention_filters: Vec, time_retention_filter_builder: Option, context: Arc, pk_deletion_strategy: PkDeletionStrategyWithCache, pk_row_converter: Option>, pk_column_indices: Vec, + + // Per-table locks write_lock: Arc>, + compaction_lock: Arc>, + + // Object store object_store_config: Option, - current_snapshot_id: Arc>, - protected_snapshots: Arc>>, + object_store_registered_runtime_envs: Arc>>, + + // Snapshot state + current_snapshot_id: Arc>, + protected_snapshots: Arc>>, + + // Memtable + maintenance + inlined_row_count: Arc, + new_files_since_last_compaction: Arc, + staging_wal_present: Arc, + staging_may_have_files: Arc, + post_write_compaction_scheduled: Arc, + post_write_maintenance: Arc, + background_compactor: Arc>, } ``` @@ -193,15 +306,40 @@ let provider = CayenneTableProviderBuilder::new(catalog, runtime_env) Provides: -- Query execution with automatic deletion vector filtering -- Insert operations via DataFusion's `insert_into()` API -- Delete via DataFusion's SQL `DELETE FROM` path -- Sequence-based ordering for correct delete/insert visibility -- Protected snapshot tracking for concurrent access +- Query execution with key- and position-based deletion-vector filtering, protected-snapshot routing, and inlined-data union. +- Insert operations via DataFusion's `insert_into` API (regular path) and the dedicated `write_cdc_append_stream` (CDC-pipelined path). +- Deletes via DataFusion's SQL `DELETE FROM` path. +- Sequence-based ordering for correct delete/insert visibility. +- Protected snapshot tracking for concurrent access. 
+- Per-scan ListingTable cache and per-`RuntimeEnv` object-store registration short-circuit. + +### 6. CDC apply pipeline (`provider/mutation_writer.rs`, `provider/staging_wal.rs`) -## CRUD Operations +`write_cdc_append_stream` is the entry point used by the runtime's CDC apply loop (`crates/runtime/src/accelerated_table/refresh_task/changes.rs`). Per burst: -### Create Table +1. Acquire `write_lock`. +2. `ensure_no_incomplete_write` — error if a previous burst's WAL is on disk and unreconciled. +3. `prepare_stream_for_insert` — if `pk_conflict_detection: auto` (default), build an existing-PK keyset via `load_existing_keyset` and resolve on-conflict deletions; if `pk_conflict_detection: none`, skip. +4. Decide `can_stage_for_pipeline`: simple appends (no sort columns, no partition column, no retention filters, no pending PK deletions, no file/on-conflict deletions) take the pipelined path; others fall back to a fully synchronous write. +5. **Stage A** — `write_to_snapshot` into the staging dir; `write_staging_wal` makes the file list durable via tmp+fsync+rename. +6. Return a `CayenneCdcWrite` holding the staged-write handle and the still-held write lock; the runtime spawns Stage B on a background task. +7. **Stage B** — under the listing fence: `move_files_to_current_snapshot`, `remove_staging_wal`, `publish_current_snapshot_files_changed` (invalidates DataFusion's list-files cache). The write lock drops when Stage B completes. + +Stage A and Stage B preserve burst order via the runtime's `PendingApplyFinalize` FIFO. The runtime acks the source-side LSN after Stage A returns (data durable) without waiting for Stage B (data visible), so PostgreSQL can recycle WAL ahead of visibility. + +### 7. Compaction (`provider/compaction.rs`) + +Tiered small-files compaction picks the smallest eligible file tier whose total size and file count exceed thresholds, and rewrites the current snapshot through the same `write_to_snapshot` + `commit_compaction` path as writes.
Triggered by: + +- **Inline post-write trigger** (`schedule_post_write_compaction`): `tokio::spawn` with an `AcqRel` dedup flag so at most one inline pass is queued per table. +- **Background compactor** (`BackgroundCompactor`): per-table periodic task gated by a shared per-accelerator semaphore (`Semaphore::new(available_parallelism())`). +- **Inline memtable flush** (`checkpoint_inlined_data_if_memtable_pressure_exceeded`): drains `cayenne_inlined_data` into a Vortex file when cumulative rows / segments / IPC bytes exceed `inline_flush_max_*`. + +All compaction triggers `try_lock` the table write lock and skip if a writer is active. + +## CRUD operations + +### Create table ```rust let options = CreateTableOptions { @@ -222,134 +360,189 @@ let provider = CayenneTableProviderBuilder::new(catalog, runtime_env) .await?; ``` -### Insert Data +### Insert data ```rust -// Insert record batches via DataFusion's insert_into() API use datafusion::prelude::*; let ctx = SessionContext::new(); ctx.register_table("my_table", Arc::new(provider))?; ctx.sql("INSERT INTO my_table SELECT * FROM source_table").await?.collect().await?; ``` -### Delete +For CDC apply, the runtime calls `provider.write_cdc_append_stream(stream, &task_ctx)` directly to take the pipelined path. -Deletes are performed through DataFusion's SQL `DELETE FROM` path: +### Delete ```sql DELETE FROM users WHERE id IN (1, 2, 3) ``` -Deletion vectors are written as Arrow IPC files, avoiding data file rewrites. - -### Query with Deletion Filters +Deletion vectors are written as Arrow IPC files for PK-keyed and position-keyed deletes; small batches land inline as `InlinedDelete` entries first and are flushed on memtable pressure. 
-Queries automatically apply deletion vectors and sequence-based ordering: +### Query ```sql SELECT * FROM users WHERE id > 100 --- Deletion vectors are applied transparently ``` -## DuckLake Specification Alignment +Deletion vectors, protected snapshots, inlined data union, and time-retention filters are all applied transparently. -Cayenne implements a subset of the DuckLake v0.3 specification: +## Relationship to the DuckLake specification -### Implemented +Cayenne shares some shape with the [DuckLake v1.0 specification](https://ducklake.select/docs/stable/specification/introduction) β€” both store transactional table metadata in a SQL database and put data in object storage β€” but the two formats are not interchangeable. The differences are deliberate, driven by Cayenne's use of the Vortex columnar format and the runtime's HTAP / CDC workloads. -- βœ… Table metadata management -- βœ… Delete file tracking with sequence numbers -- βœ… Partition metadata (composite partition keys) +### Shared concepts -### Minimal/Simplified +- Transactional catalog database (Cayenne supports SQLite or Turso; DuckLake also allows DuckDB, Postgres, MySQL). +- Sequence-numbered snapshots for visibility ordering. +- Per-table partition metadata and per-snapshot data layout. +- Delete-file references decoupled from data files (so deletes don't rewrite data). +- Inline data table for small-write absorption (`cayenne_inlined_data` mirrors `ducklake_inlined_data_tables` in concept). 
-- ⚠️ Schema evolution (simplified) -- ⚠️ Statistics tracking (basic) +### Major divergences from DuckLake v1.0 -### Not Implemented (Future) +| Area | DuckLake v1.0 | Cayenne | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| **Data file format** | Parquet (mandated) | Vortex | +| **Catalog table prefix** | `ducklake_` | `cayenne_` | +| **Data file metadata** | Explicit `ducklake_data_file` row per file with column stats in `ducklake_file_column_stats` | No explicit data-file table; Cayenne lets DataFusion's `ListingTable` enumerate Vortex files in each snapshot directory. Table-level stats only in `cayenne_table_statistics`. | +| **Snapshot model** | Dedicated `ducklake_snapshot` + `ducklake_snapshot_changes` change log | `current_snapshot_id` field on `cayenne_table` plus `cayenne_snapshot_sequence` for protected-snapshot routing | +| **Schema representation** | Column-level rows in `ducklake_column`, evolution via `ducklake_schema_versions`, `ducklake_column_mapping`, `ducklake_name_mapping` | Schema stored as a JSON blob (`schema_json`) on `cayenne_table`; schema evolution is intentionally simplified | +| **Namespaces / schemas** | `ducklake_schema` supports nested namespaces | Flat table namespace | +| **Upsert / PK semantics** | Snapshot-based merge | Iceberg-style PK insert tracking in `cayenne_insert_record`, paired with `cayenne_inlined_delete` tombstones | +| **GC** | `ducklake_files_scheduled_for_deletion` work queue | Old-snapshot cleanup triggered inline by compaction/sort/overwrite paths | +| **Views, SQL macros, tags** | First-class (`ducklake_view`, `ducklake_macro*`, `ducklake_tag`, `ducklake_column_tag`) | Not implemented | +| **Sort metadata** | 
`ducklake_sort_expression`, `ducklake_sort_info` | `sort_columns` is a per-dataset config field, not a catalog table | +| **Variant column stats** | `ducklake_file_variant_stats` | Not implemented | -- ❌ File compaction -- ❌ Snapshot expiration -- ❌ Column mapping -- ❌ MVCC (multi-version concurrency control) +### What Cayenne implements relative to its own goals -## Database Schema +- Table metadata with sequence-numbered operations +- Position- and key-based delete files +- Composite partition keys (single partition column via current public API) +- Tiered small-files compaction (inline + background) +- Inline-data memtable with per-write and cumulative-flush thresholds +- CDC apply pipelining with debounced post-write maintenance +- Protected-snapshot scan routing for upsert correctness -Cayenne uses these tables in the metastore (SQLite/Turso): +### Not implemented (and not currently planned) + +- Schema evolution at column-row granularity (column adds / drops / renames / mappings) +- SQL macros, views, table/column tags +- Snapshot expiration and time-travel queries +- Full MVCC + +If interoperability with a DuckLake catalog reader is a requirement, Cayenne is not the right tool. If a Vortex-native, CDC-friendly accelerator backed by SQLite or Turso fits the workload, Cayenne is purpose-built for that. + +## Database schema + +The metastore (SQLite or Turso) materializes these tables. DDL lives in `crates/cayenne/src/metastore/sqlite.rs:198+` and is mirrored by Turso. 
```sql CREATE TABLE IF NOT EXISTS cayenne_table ( - table_id INTEGER PRIMARY KEY AUTOINCREMENT, - table_uuid TEXT NOT NULL, - table_name TEXT NOT NULL, + table_id TEXT PRIMARY KEY, -- UUIDv7 + table_name TEXT NOT NULL UNIQUE, path TEXT NOT NULL, - path_is_relative BOOLEAN NOT NULL, + path_is_relative INTEGER NOT NULL, schema_json TEXT NOT NULL, primary_key_json TEXT, on_conflict_json TEXT, current_snapshot_id TEXT NOT NULL DEFAULT '', partition_column TEXT, vortex_config_json TEXT, - current_sequence_number BIGINT NOT NULL DEFAULT 0 + current_sequence_number INTEGER NOT NULL DEFAULT 0 ); CREATE TABLE IF NOT EXISTS cayenne_delete_file ( - delete_file_id INTEGER PRIMARY KEY AUTOINCREMENT, - table_id INTEGER NOT NULL, + delete_file_id TEXT PRIMARY KEY, -- UUIDv7 + table_id TEXT NOT NULL, + source_data_file_path TEXT, path TEXT NOT NULL, - path_is_relative BOOLEAN NOT NULL, + path_is_relative INTEGER NOT NULL, format TEXT NOT NULL, - delete_count BIGINT NOT NULL, - file_size_bytes BIGINT NOT NULL, - source_data_file_path TEXT, - sequence_number BIGINT NOT NULL DEFAULT 0, + delete_count INTEGER NOT NULL, + file_size_bytes INTEGER NOT NULL, + deletion_type TEXT NOT NULL, + sequence_number INTEGER NOT NULL DEFAULT 0, FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE ); CREATE TABLE IF NOT EXISTS cayenne_partition ( - partition_id INTEGER PRIMARY KEY AUTOINCREMENT, - table_id INTEGER NOT NULL, + partition_id TEXT PRIMARY KEY, + table_id TEXT NOT NULL, partition_columns_json TEXT NOT NULL, partition_values_json TEXT NOT NULL, partition_key TEXT NOT NULL, path TEXT NOT NULL, - path_is_relative BOOLEAN NOT NULL, - record_count BIGINT NOT NULL DEFAULT 0, - file_size_bytes BIGINT NOT NULL DEFAULT 0, + path_is_relative INTEGER NOT NULL, + record_count INTEGER NOT NULL DEFAULT 0, + file_size_bytes INTEGER NOT NULL DEFAULT 0, FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE, UNIQUE(table_id, partition_key) ); CREATE TABLE IF NOT 
EXISTS cayenne_insert_record ( insert_record_id INTEGER PRIMARY KEY AUTOINCREMENT, - table_id INTEGER NOT NULL, + table_id TEXT NOT NULL, pk_bytes BLOB NOT NULL, - sequence_number BIGINT NOT NULL, + sequence_number INTEGER NOT NULL, FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE, UNIQUE(table_id, pk_bytes) ); CREATE TABLE IF NOT EXISTS cayenne_snapshot_sequence ( - table_id INTEGER NOT NULL, + table_id TEXT NOT NULL, snapshot_id TEXT NOT NULL, - sequence_number BIGINT NOT NULL, + sequence_number INTEGER NOT NULL, FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE, PRIMARY KEY (table_id, snapshot_id) ); + +CREATE TABLE IF NOT EXISTS cayenne_table_statistics ( + table_id TEXT PRIMARY KEY, + num_rows INTEGER NOT NULL, + statistics_blob BLOB NOT NULL, + FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS cayenne_inlined_data ( + inlined_id TEXT PRIMARY KEY, + table_id TEXT NOT NULL, + partition_key TEXT, + data_ipc BLOB NOT NULL, -- Arrow IPC stream + record_count INTEGER NOT NULL, + sequence_number INTEGER NOT NULL, + created_at TEXT NOT NULL, + FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE +); +CREATE INDEX idx_cayenne_inlined_data_table_seq + ON cayenne_inlined_data(table_id, sequence_number); + +CREATE TABLE IF NOT EXISTS cayenne_inlined_delete ( + inlined_delete_id TEXT PRIMARY KEY, + table_id TEXT NOT NULL, + delete_ipc BLOB NOT NULL, -- Arrow IPC stream of PK row keys + sequence_number INTEGER NOT NULL, + created_at TEXT NOT NULL, + FOREIGN KEY (table_id) REFERENCES cayenne_table(table_id) ON DELETE CASCADE +); +CREATE INDEX idx_cayenne_inlined_delete_table_seq + ON cayenne_inlined_delete(table_id, sequence_number); ``` -## Usage Example +The DDL source is authoritative; treat this section as a quick reference. 
+ +## Usage example ```rust use cayenne::{ - CayenneCatalog, CayenneTableProviderBuilder, CreateTableOptions, + CayenneCatalog, CayenneTableProviderBuilder, + metadata::{CreateTableOptions, VortexConfig}, }; -// Create catalog (synchronous, returns CatalogResult) let catalog = Arc::new(CayenneCatalog::new("sqlite:///data/catalog.db")?); catalog.init().await?; -// Create table let options = CreateTableOptions { table_name: "events".to_string(), schema: Arc::new(Schema::new(vec![ @@ -361,101 +554,174 @@ let options = CreateTableOptions { on_conflict: None, base_path: "/data/events".to_string(), partition_column: None, - vortex_config: cayenne::metadata::VortexConfig::default(), + vortex_config: VortexConfig::default(), }; let provider = CayenneTableProviderBuilder::new(catalog, runtime_env) .create(options) .await?; -// Insert data via DataFusion's insert_into() API let ctx = SessionContext::new(); ctx.register_table("events", Arc::new(provider))?; -let batch = create_record_batch()?; -ctx.read_batch(batch)?.write_table("events", DataFrameWriteOptions::new()).await?; - -// Query (deletion vectors applied automatically) let df = ctx.sql("SELECT * FROM events WHERE event_id > 1000").await?; df.show().await?; ``` -## Implementation Status - -### Current Status +## Implementation status -- βœ… Trait abstractions defined -- βœ… Data structures implemented -- βœ… Deletion vector logic (Arrow IPC, position-based and key-based) -- βœ… SQLite catalog implementation -- βœ… Turso catalog implementation (optional feature) -- βœ… Table provider with scan and deletion filtering -- βœ… Insert operations via DataFusion -- βœ… Delete via DataFusion SQL `DELETE FROM` path -- βœ… Primary key support -- βœ… Streaming data ingestion and queries -- βœ… File-mode acceleration -- βœ… S3 Express One Zone support -- βœ… Partition support (composite partition keys) -- βœ… Upsert on conflict behavior -- βœ… Retention policies (time-based and SQL-based) -- βœ… Sequence-based ordering for 
delete/insert visibility -- βœ… Protected snapshot tracking -- βœ… Staging WAL for crash-safe writes -- βœ… Compaction via `commit_compaction` API +### Current status -### Known Limitations +- Pluggable metastore (SQLite default; Turso optional) +- Position- and key-based deletion vectors +- Primary keys, upsert on-conflict, retention policies (time- and SQL-based) +- Sequence-based ordering with protected snapshots +- Streaming data ingestion and queries +- File-mode acceleration +- S3 and S3 Express One Zone support +- Composite partition keys +- Staging WAL with crash-safe recovery +- Tiered small-files compaction (inline + background) +- Inline-data memtable (per-write admission + cumulative flush thresholds, both configurable) +- CDC apply pipelining with debounced post-write maintenance +- Per-dataset `cayenne_pk_conflict_detection` opt-out for append-only CDC +- CDC apply observability metrics (`dataset_acceleration_cdc_apply_*`) +- Same-source large-join `HashJoin β†’ SortMergeJoin` rewriter for spillable hash-join build sides -The following limitations apply to the Cayenne accelerator: +### Known limitations -#### Access Mode +#### Access mode -- **File mode only**: Cayenne only supports file-based acceleration (`mode: file`). In-memory mode is not supported. +Cayenne supports `mode: file` only. In-memory mode is not supported. 
-#### Data Types +#### Data types -Some Arrow data types are not natively supported by the Vortex format used by Cayenne: +Some Arrow data types are not natively supported by the Vortex format: - `Interval` types - `Duration` types - `Map` types - `FixedSizeBinary` types -- `Float16` types (automatically converted) +- `Float16` (automatically converted) - Timestamp units other than microseconds (automatically normalized) -To handle unsupported types, use the `cayenne_unsupported_type_action` parameter: +The `cayenne_unsupported_type_action` parameter controls handling: -- `string` (default): Convert unsupported types to UTF-8 strings -- `error`: Fail on unsupported types -- `warn`: Include in schema but may fail on insert -- `ignore`: Skip unsupported fields +- `string` (default): convert unsupported types to UTF-8 strings +- `error`: fail on unsupported types +- `warn`: include in schema but may fail on insert +- `ignore`: skip unsupported fields #### Indexes -- Secondary indexes are not supported. Primary keys are supported for efficient upserts and deletions. +Secondary indexes are not supported. Primary keys drive efficient upserts and deletions. #### MVCC -- Full MVCC (multi-version concurrency control) is not yet supported. +Full MVCC (multi-version concurrency control) is not supported. -### Future Enhancements +### Future enhancements +- Snapshot expiration and time-travel queries - Full MVCC support -- Advanced statistics +- Advanced statistics (column-level histograms, sketches) - Additional catalog backends (PostgreSQL, DuckDB) -- Snapshot expiration and time-travel queries +- Apply-side pipelining at finer granularity (Stage A of burst N+1 overlapping Stage B of burst N without write-lock serialization) +- Cached `insert_into` execution plan reuse across CDC bursts ## Benefits -1. **Efficient Deletes**: No data file rewrites, deletion vectors stored as Arrow IPC files -2. **ACID Transactions**: SQLite provides transaction guarantees for metadata -3. 
**Performance**: Vortex's compression and columnar format with configurable caching -4. **Simplicity**: Single SQLite file for metadata -5. **Flexibility**: Trait-based design allows multiple metastore backends -6. **Crash Safety**: Staging WAL ensures write atomicity -7. **Object Store Support**: Native S3 and S3 Express One Zone integration +1. **Efficient deletes**: deletion vectors stored as Arrow IPC files; no data-file rewrites. +2. **ACID metadata**: SQLite (or Turso) provides transaction guarantees for catalog operations. +3. **Performance**: Vortex columnar format with configurable compression and caches; inline memtable absorbs small writes without writing data files. +4. **Crash safety**: staging WAL with tmp+fsync+rename ensures atomic visibility, with self-healing recovery on next open. +5. **Object store support**: native S3 and S3 Express One Zone integration. +6. **CDC-friendly**: Stage A / Stage B pipelining, debounced maintenance, and optional blind-append mode for append-only ingestion. +7. **Flexibility**: trait-based metastore lets the same catalog logic run against SQLite or Turso. + +## Research behind Spice Cayenne + +Cayenne is an engineering synthesis of several lines of database research. The +references below are the ones most directly load-bearing for the design decisions +in this crate. + +### Lakehouse formats and metadata catalogs + +- **DuckLake** — DuckDB's specification for a SQL-catalogued lakehouse. Cayenne + shares high-level shape with DuckLake (transactional metadata catalog plus + object-store data) but diverges substantively (Vortex instead of Parquet, + no per-file data-file table, JSON-blob schema instead of column-level rows, + no views/macros/tags). See the *Relationship to the DuckLake specification* + section above for the full table-by-table comparison against v1.0.
+ - [DuckLake Specification v1.0](https://ducklake.select/docs/stable/specification/introduction) + - DuckDB blog: *"Announcing DuckLake"* — <https://duckdb.org/2025/05/27/ducklake.html> +- **Apache Iceberg** — table format with sequence-number-driven snapshot + visibility and position/equality delete files. Cayenne's + `cayenne_snapshot_sequence`, sequence-ordered insert/delete semantics, and + protected-snapshot scan routing follow Iceberg's model. The Iceberg spec is + authoritative for the visibility rules Cayenne reimplements for Vortex. + - [Apache Iceberg Spec](https://iceberg.apache.org/spec/) +- **Delta Lake** — Databricks' transactional log over Parquet. Not implemented + by Cayenne, but informs the trade-offs around `_delta_log`-style file logs vs. + Cayenne's catalog-table approach. + - Armbrust et al., *"Delta Lake: high-performance ACID table storage over + cloud object stores"*, VLDB 2020. + +### Columnar storage and compression + +- **Vortex** — Spiral DB's open-source columnar file format, the persistent + storage tier for Cayenne. Provides predicate pushdown, zone maps, and a + pluggable compression strategy. + - [spiraldb/vortex](https://github.com/spiraldb/vortex) +- **BtrBlocks** — adaptive columnar compression scheme used as one of Vortex's + strategies; Cayenne exposes it as `cayenne_compression_strategy: btrblocks`. + Kuschewski et al., *"BtrBlocks: Efficient Columnar Compression for Data + Lakes"*, SIGMOD 2023. +- **Apache Arrow** — in-memory columnar format and Arrow IPC stream encoding. + Cayenne serializes inline-memtable entries and key-based deletion vectors as + Arrow IPC blobs in the metastore. + - [Apache Arrow](https://arrow.apache.org/) + +### Write-optimized storage (LSM-tree) + +- **The Log-Structured Merge-Tree (LSM-Tree)** — O'Neil, Cheng, Gawlick, O'Neil, + *Acta Informatica* 33(4), 1996.
The level-0 ↔ on-disk-tiers structure + Cayenne uses for inline data (memtable in metastore + flush to Vortex files) + is the LSM pattern adapted to a transactional metastore. + - Author-hosted PDF: <https://www.cs.umb.edu/~poneil/lsmtree.pdf> +- **LSM-based Storage Techniques: A Survey** — Luo and Carey, *VLDB Journal* + 29(1), 2020. Surveys compaction strategies and tiering decisions relevant to + Cayenne's tiered small-files compactor. + +### Deletion vectors and bitmap indexes + +- **Roaring Bitmaps** — the bitmap encoding used by Cayenne's position-based + deletion vectors (`Selection::ExcludeRoaring` pushed into Vortex). + - Chambi, Lemire, Kaser, Godin, *"Better bitmap performance with Roaring + bitmaps"*, *Software: Practice and Experience* 46(5), 2016. + arXiv preprint: <https://arxiv.org/abs/1402.6407> + - [RoaringBitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec) + +### Query execution + +- **Apache DataFusion** — the embedded query engine Cayenne integrates with as + a `TableProvider`. Cayenne's optimizer rules (`CayenneJoinRewriter`, + `CayenneAntiJoinSortMergeRewriter`, `CayenneDynamicFilterSharing`, + `CayennePropagateFilterAcrossEquiJoinKeys`) plug into DataFusion's physical + and logical optimizer pipelines. + - [Apache DataFusion](https://datafusion.apache.org/) + +### Related work referenced in optimizer-rule design + +- The "no-spill build-side memory strategy" documented in + `crates/cayenne/src/optimizer_rules.rs` (Inner-join → SortMergeJoin rewrite + above a 10M-row build-side threshold) builds on classical join-spilling + literature; the rewriter targets the chbench q21 shape specifically.
## References -- [DuckLake Specification v0.3](https://ducklake.select/docs/stable/specification/introduction.html) -- [DuckLake Tables](https://ducklake.select/docs/stable/specification/tables/overview.html) +- [DuckLake Specification v1.0](https://ducklake.select/docs/stable/specification/introduction) +- [DuckLake Tables (v1.0)](https://ducklake.select/docs/stable/specification/tables/overview) - [Vortex Format](https://github.com/spiraldb/vortex) +- [Apache Iceberg Specification](https://iceberg.apache.org/spec/) +- [Apache Arrow](https://arrow.apache.org/) +- [Apache DataFusion](https://datafusion.apache.org/) diff --git a/crates/cayenne/benches/apply_on_conflict_per_row_alloc.rs b/crates/cayenne/benches/apply_on_conflict_per_row_alloc.rs new file mode 100644 index 0000000000..a16104e101 --- /dev/null +++ b/crates/cayenne/benches/apply_on_conflict_per_row_alloc.rs @@ -0,0 +1,235 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-row heap allocation cliff inside +//! `CayenneTableProvider::apply_on_conflict_to_batch` +//! (`crates/cayenne/src/provider/table.rs:3598-3741`). +//! +//! The upsert path performs three independent `OwnedRow` clones per +//! row, each of which is a heap allocation under +//! `arrow::row::RowConverter`'s `Box<[u8]>` payload: +//! +//! ```ignore +//! let key = rows.row(row_idx).owned(); // clone 1: from Arrow rows +//! ... +//! kept_keys.insert(key.clone()); // clone 2: HashSet +//! 
row_keys.push(key); // move (no alloc) +//! ... +//! // Second pass, upsert dedup +//! seen.insert(key.clone(), row_idx); // clone 3: HashMap +//! ``` +//! +//! Plus the `RowConverterBased` deletion-strategy branch +//! (`table.rs:3667-3674`) does a fourth heap allocation per +//! conflict-deleted row: +//! +//! ```ignore +//! let row_key = key.as_ref().to_vec().into_boxed_slice(); +//! ``` +//! +//! Each `OwnedRow` clone is a small `Box<[u8]>` allocation, ~16-24 +//! bytes payload + Rust allocator overhead (~50 ns malloc + ~30 ns +//! free on glibc/jemalloc, more on macOS). For a CDC commit at the +//! CH-benCH SF100 upsert-heavy shape β€” 100K-row coalesced batches on +//! `customer` and `stock` β€” that is **300K–400K small heap allocs +//! per commit**, ~15-20 ms of pure allocator overhead before any +//! Vortex byte is written. +//! +//! The TigerStyle remedy is a per-batch arena: encode all row keys +//! into one contiguous `Vec` once, hand out `&[u8]` slices indexed +//! by `(start, len)` to every downstream consumer (HashSet, HashMap, +//! delete spec). One allocation per batch instead of N allocations +//! per row. `arrow::row::Rows` already exposes this shape β€” its +//! `Rows::row(i)` borrows from a shared buffer; the production code +//! pays the heap allocation only because it materializes +//! `OwnedRow = Box<[u8]>` to satisfy `HashMap` ownership constraints. +//! +//! ## What this bench measures +//! +//! A focused shape bench β€” no Cayenne setup, no Vortex, no metastore. +//! Models the per-row inner loop of `apply_on_conflict_to_batch` for +//! the **upsert** path (the highest-cost branch). Three lanes: +//! +//! - `current_three_clones/` β€” three `Box<[u8]>` clones per +//! row plus two HashMap inserts. Mirrors the production hot loop. +//! - `single_owned_clone/` β€” strips clones 2 and 3 by keying +//! the HashMaps with `usize` row index (still one `Box<[u8]>` per +//! row for the `OwnedRow` materialization). Models a "small win" +//! 
refactor. +//! - `arena_indexed/` β€” one `Vec` arena holds every row +//! key end-to-end; HashMaps use `(start, len)` index pairs. Zero +//! per-row heap allocations after the initial batch reserve. +//! Models the structural fix. +//! +//! Row width is 16 bytes (matches Arrow `RowConverter` output for a +//! single `Int64` PK column with the standard row-encoding header). +//! +//! ## How to read +//! +//! `cargo bench --bench apply_on_conflict_per_row_alloc -p cayenne`. +//! Compare each lane at `rows=100_000`: +//! +//! - `current_three_clones` β€” wall time scales with `rows * (3 allocs +//! + 2 hashes + 1 vec push)`. The slope per row is the per-commit +//! tax that the unsorted CDC ingest pays. +//! - `arena_indexed` β€” wall time scales with `rows * (1 memcpy + 2 +//! hashes + 1 index push)`. Slope is bounded by HashMap insert +//! cost; allocator overhead disappears. +//! +//! The ratio between lanes is the maximum throughput headroom from +//! eliminating per-row clones. For PK-heavy CDC tables (`customer`, +//! `stock`, `district` in the May 15 2026 SF100 retest) this is the +//! per-commit-cost floor below which `pk_conflict_detection: Auto` +//! cannot go. + +#![allow(clippy::expect_used)] + +use std::collections::HashMap; +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Fixed row-key width β€” matches Arrow `RowConverter` output for a +/// single `Int64` PK column with the 1-byte null header. Widening to +/// 32 or 64 bytes (composite PKs) increases the absolute cost but +/// does not change the ratio between lanes; the cliff is allocator- +/// bound, not memcpy-bound. +const ROW_WIDTH: usize = 16; + +/// Row counts straddling realistic CDC batch sizes: +/// - 1 K: a typical small append. +/// - 8 K: a moderate coalesced burst. +/// - 100 K: an upsert-heavy table burst at CH-benCH SF100 shape. 
+const ROW_COUNTS: &[usize] = &[1_024, 8_192, 100_000]; + +fn make_key(idx: usize) -> Box<[u8]> { + let mut buf = vec![0u8; ROW_WIDTH]; + // Embed the row index so each key is unique. The `wrapping_mul` + // by a Knuth constant scatters the values across the key space so + // HashMap collisions match production cardinality, not a contiguous + // best case. + let scrambled = (idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + buf[..8].copy_from_slice(&scrambled.to_le_bytes()); + buf.into_boxed_slice() +} + +/// Mirrors the production hot loop: three clones per row plus two +/// HashMap inserts and one Vec push. +fn current_three_clones(rows: usize) -> usize { + let mut kept_keys: HashMap, usize> = HashMap::with_capacity(rows); + let mut seen: HashMap, usize> = HashMap::with_capacity(rows); + let mut row_keys: Vec> = Vec::with_capacity(rows); + + for row_idx in 0..rows { + // Clone 1: materialize `OwnedRow` from `Arrow Rows::row(i).owned()`. + let key = make_key(row_idx); + + // Clone 2: `kept_keys.insert(key.clone())`. + kept_keys.insert(key.clone(), row_idx); + + // Move into row_keys (no clone, but heap-occupying). + row_keys.push(key.clone()); + + // Clone 3: upsert dedup second pass `seen.insert(key.clone(), row_idx)`. + seen.insert(key, row_idx); + } + + black_box(&kept_keys); + black_box(&seen); + black_box(&row_keys); + kept_keys.len() +} + +/// Strips clones 2 and 3 by keying the HashMaps with `usize` row index. +/// One Box<[u8]> per row remains. +fn single_owned_clone(rows: usize) -> usize { + let mut kept_keys: HashMap, usize> = HashMap::with_capacity(rows); + + for row_idx in 0..rows { + // Single allocation per row. + let key = make_key(row_idx); + kept_keys.insert(key, row_idx); + } + + black_box(&kept_keys); + kept_keys.len() +} + +/// Arena-allocated: one contiguous `Vec` holds every key. HashMap +/// entries are `(start, len)` slices into the arena. Zero per-row heap +/// allocations after the initial `with_capacity`. 
+fn arena_indexed(rows: usize) -> usize { + let mut arena: Vec = Vec::with_capacity(rows * ROW_WIDTH); + // Owned `Vec` slot still required because borrows from `arena` + // would be invalidated by growth β€” but `arena` is pre-sized, so + // this is a single allocation up front. In production, the row + // builder would write directly into `arena` from the Arrow encoder. + let mut row_offsets: Vec<(usize, usize)> = Vec::with_capacity(rows); + let mut kept_indices: HashMap = HashMap::with_capacity(rows); + + for row_idx in 0..rows { + // Write the encoded row into the arena. + let start = arena.len(); + let scrambled = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + arena.extend_from_slice(&scrambled.to_le_bytes()); + arena.resize(start + ROW_WIDTH, 0); + row_offsets.push((start, ROW_WIDTH)); + + // Key the HashMap by a content hash rather than the byte slice, + // so we never allocate a `Box<[u8]>` per row. In production this + // would use `RowConverter`'s deterministic hash or an + // `ahash::RandomState`-keyed `HashMap<&[u8], usize>` with the + // arena slice as the borrow source. 
+ let h = scrambled; + kept_indices.insert(h, row_idx); + } + + black_box(&arena); + black_box(&row_offsets); + black_box(&kept_indices); + kept_indices.len() +} + +fn bench_apply_on_conflict_per_row_alloc(c: &mut Criterion) { + let mut group = c.benchmark_group("apply_on_conflict_per_row_alloc"); + for &rows in ROW_COUNTS { + group.throughput(Throughput::Elements( + u64::try_from(rows).unwrap_or(u64::MAX), + )); + + group.bench_with_input( + BenchmarkId::new("current_three_clones", rows), + &rows, + |b, &rows| b.iter(|| current_three_clones(black_box(rows))), + ); + + group.bench_with_input( + BenchmarkId::new("single_owned_clone", rows), + &rows, + |b, &rows| b.iter(|| single_owned_clone(black_box(rows))), + ); + + group.bench_with_input( + BenchmarkId::new("arena_indexed", rows), + &rows, + |b, &rows| b.iter(|| arena_indexed(black_box(rows))), + ); + } + group.finish(); +} + +criterion_group!(benches, bench_apply_on_conflict_per_row_alloc); +criterion_main!(benches); diff --git a/crates/cayenne/benches/cached_table_statistics_wide.rs b/crates/cayenne/benches/cached_table_statistics_wide.rs new file mode 100644 index 0000000000..efd9edf4ae --- /dev/null +++ b/crates/cayenne/benches/cached_table_statistics_wide.rs @@ -0,0 +1,248 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-scan cost of cloning cached `Statistics` for the +//! optimizer in `CayenneTableProvider::cached_table_statistics_for_optimizer` +//! 
(`crates/cayenne/src/provider/table.rs:3304-3331`). +//! +//! The optimizer hot path runs once per `TableProvider::statistics()` call, +//! which DataFusion makes for every scan and for several physical-optimizer +//! rules (column pruning, partition pruning, join order, exact-join-filter +//! sizing). The current implementation: +//! +//! ```ignore +//! let stats = stats.clone(); // O(num_columns) deep clone +//! if has_pending_visibility_changes { +//! Some(Self::statistics_to_inexact(stats)) // O(num_columns) re-build +//! } else { +//! Some(stats) +//! } +//! ``` +//! +//! Each `ColumnStatistics` carries up to five `Precision` fields +//! (`null_count`, `min_value`, `max_value`, `sum_value`, `distinct_count`) +//! plus `byte_size`. Cloning a `Precision` heap-allocates for +//! variable-width scalars (Utf8, Binary, List, Struct, decimal-256, …). On +//! pending overlays the entire `Vec` is consumed by `into_iter`, +//! `to_inexact` is called on every field of every column, and the +//! `Statistics` is rebuilt. +//! +//! For a 256-column table that is the practical ceiling that the recent +//! `TABLE_STATISTICS_FULL_COLUMN_SYNC_LIMIT = 256` workaround +//! (commit `2d5ced3d7f`) was chosen to bound. The workaround returns +//! top-level stats only (`num_rows`, `total_byte_size`, empty +//! `column_statistics`) for wider tables β€” preserving the planner from a +//! per-scan clone cliff at the cost of losing column min/max information +//! that the optimizer needs for partition pruning, exact-join-filter +//! sizing, and join-order cost models. +//! +//! Two unresolved concerns this bench surfaces: +//! +//! 1. **Cliff above 256**: tables with 257+ columns silently lose all +//! column-level statistics for optimizer planning. A plan that would +//! have pruned 95% of files on a 200-column table can degenerate to a +//! full scan on a 300-column table for the same query shape. +//! 2. **Cost below 256**: even at 100-200 columns the per-scan clone is a +//! 
measurable fraction of planning latency on overlay-active tables +//! (writes still pending, inline rows present). Reused across every +//! optimizer rule that calls `statistics()`, the cost compounds. +//! +//! The TigerStyle remedy is to share the cached `Statistics` by `Arc` and +//! lazy-transform only when an overlay is active (or never, if callers can +//! accept a `Cow<'_, Statistics>`-style API). One allocation per write, +//! not per scan. +//! +//! ## What this bench measures +//! +//! Pure CPU shape — no Cayenne setup, no metastore, no DataFusion planner. +//! Models the per-scan body of `cached_table_statistics_for_optimizer` at +//! four column counts that bracket the workaround threshold: +//! +//! - 64 columns: typical narrow table. +//! - 200 columns: just under the workaround threshold; still pays the clone. +//! - 256 columns: at the threshold; still pays the clone (workaround +//! triggers at `> 256`, i.e. 257+). +//! - 1024 columns: well past the threshold; pays the workaround's +//! top-level path and loses column stats entirely. +//! +//! Three lanes per width: +//! +//! - `full_clone_no_overlay/<columns>` — mirrors today's no-overlay path +//! (`stats.clone()` then return). Wall time is the deep `Vec<ColumnStatistics>` +//! clone. +//! - `full_clone_with_overlay/<columns>` — mirrors today's overlay path +//! (`stats.clone()` then `statistics_to_inexact`). Wall time is the +//! clone plus the per-column `to_inexact` rebuild — i.e. the path +//! taken on inserts-pending-checkpoint and pending-deletion tables. +//! - `top_level_only/<columns>` — mirrors the wide-table workaround +//! (`top_level_statistics_only`). Wall time is two `Precision` clones. +//! Used at 1024 columns to model the workaround floor. +//! +//! ## How to read +//! +//! `cargo bench --bench cached_table_statistics_wide -p cayenne`. +//! +//! - `full_clone_with_overlay/256` — per-scan tax on an overlay-active +//! 256-column table. At 10K scans/sec on the read path, multiplying by +//! 
this number gives the planner-side CPU floor. +//! - The ratio `full_clone_with_overlay/256` vs `top_level_only/256` is +//! the headroom from sharing stats via `Arc` (or moving the workaround +//! lower). Per-call clone dominates; the per-column copy is the +//! wallclock weight. +//! - The jump between `full_clone_with_overlay/64` and +//! `full_clone_with_overlay/256` is the symbol-of-cost the workaround +//! was sized to dodge. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use datafusion_common::stats::Precision; +use datafusion_common::{ColumnStatistics, ScalarValue, Statistics}; + +/// Column counts bracketing the wide-table workaround threshold of 256. +const COLUMN_COUNTS: &[usize] = &[64, 200, 256, 1024]; + +/// Build a `Statistics` shaped like a metastore-loaded snapshot: every +/// column has an exact min/max as `ScalarValue::Utf8` (the cliff is +/// variable-width allocator-bound, not memcpy-bound β€” Int64 stats are +/// faster but the production mix is dominated by string/decimal/timestamp +/// columns whose `ScalarValue` clones heap-allocate). +fn build_stats(num_columns: usize) -> Statistics { + let mut column_statistics = Vec::with_capacity(num_columns); + for i in 0..num_columns { + column_statistics.push(ColumnStatistics { + null_count: Precision::Exact(0), + min_value: Precision::Exact(ScalarValue::Utf8(Some(format!("min_value_{i:06}")))), + max_value: Precision::Exact(ScalarValue::Utf8(Some(format!("max_value_{i:06}")))), + sum_value: Precision::Absent, + distinct_count: Precision::Exact(1_024), + byte_size: Precision::Exact(8_192), + }); + } + + Statistics { + num_rows: Precision::Exact(1_000_000), + total_byte_size: Precision::Exact(64 * 1024 * 1024), + column_statistics, + } +} + +/// Mirrors `column_statistics_to_inexact` in +/// `crates/cayenne/src/provider/table.rs:3364-3373`. 
Reproduced inline +/// because the method is private to `CayenneTableProvider`. +fn column_statistics_to_inexact(stats: ColumnStatistics) -> ColumnStatistics { + ColumnStatistics { + null_count: stats.null_count.to_inexact(), + max_value: stats.max_value.to_inexact(), + min_value: stats.min_value.to_inexact(), + sum_value: stats.sum_value.to_inexact(), + distinct_count: stats.distinct_count.to_inexact(), + byte_size: stats.byte_size.to_inexact(), + } +} + +/// Mirrors `statistics_to_inexact` in +/// `crates/cayenne/src/provider/table.rs:3352-3362`. +fn statistics_to_inexact(stats: Statistics) -> Statistics { + Statistics { + num_rows: stats.num_rows.to_inexact(), + total_byte_size: stats.total_byte_size.to_inexact(), + column_statistics: stats + .column_statistics + .into_iter() + .map(column_statistics_to_inexact) + .collect(), + } +} + +/// Mirrors `top_level_statistics_only` in +/// `crates/cayenne/src/provider/table.rs:3333-3350`. The wide-table +/// workaround: returns an empty `column_statistics` and clones only the +/// two top-level `Precision` fields. 
+fn top_level_statistics_only(stats: &Statistics, inexact: bool) -> Statistics { + let num_rows = if inexact { + stats.num_rows.clone().to_inexact() + } else { + stats.num_rows.clone() + }; + let total_byte_size = if inexact { + stats.total_byte_size.clone().to_inexact() + } else { + stats.total_byte_size.clone() + }; + + Statistics { + num_rows, + total_byte_size, + column_statistics: Vec::new(), + } +} + +fn bench_full_clone_no_overlay(c: &mut Criterion) { + let mut group = c.benchmark_group("cached_table_statistics_full_clone_no_overlay"); + for &n in COLUMN_COUNTS { + let stats = build_stats(n); + group.throughput(Throughput::Elements(n as u64)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + let cloned = stats.clone(); + black_box(cloned); + }); + }); + } + group.finish(); +} + +fn bench_full_clone_with_overlay(c: &mut Criterion) { + let mut group = c.benchmark_group("cached_table_statistics_full_clone_with_overlay"); + for &n in COLUMN_COUNTS { + let stats = build_stats(n); + group.throughput(Throughput::Elements(n as u64)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + let cloned = stats.clone(); + let inexact = statistics_to_inexact(cloned); + black_box(inexact); + }); + }); + } + group.finish(); +} + +fn bench_top_level_only(c: &mut Criterion) { + let mut group = c.benchmark_group("cached_table_statistics_top_level_only"); + for &n in COLUMN_COUNTS { + let stats = build_stats(n); + group.throughput(Throughput::Elements(n as u64)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + let top_only = top_level_statistics_only(&stats, true); + black_box(top_only); + }); + }); + } + group.finish(); +} + +criterion_group!( + benches, + bench_full_clone_no_overlay, + bench_full_clone_with_overlay, + bench_top_level_only, +); +criterion_main!(benches); diff --git a/crates/cayenne/benches/checkpoint_fence_stall.rs 
b/crates/cayenne/benches/checkpoint_fence_stall.rs new file mode 100644 index 0000000000..01fe5d1c01 --- /dev/null +++ b/crates/cayenne/benches/checkpoint_fence_stall.rs @@ -0,0 +1,175 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: scan stall during inline-memtable checkpoint. +//! +//! `CayenneTableProvider::checkpoint_inlined_data` +//! (`crates/cayenne/src/provider/table.rs:5740-5830`) ends by holding the +//! `listing_fence.write()` guard +//! (`table.rs:5823-5827`) across `clear_inlined_metadata_after_checkpoint` +//! (`table.rs:5832-5841`), which issues **two** sequential awaited +//! metastore DELETEs: +//! +//! ```ignore +//! { +//! let _fence = self.listing_fence.write().await; +//! self.clear_inlined_metadata_after_checkpoint().await?; // 2 awaits +//! self.refresh_listing_table_under_held_fence()?; +//! } +//! +//! async fn clear_inlined_metadata_after_checkpoint(&self) -> Result<()> { +//! self.catalog.clear_inlined_data(&id).await?; // round trip 1 +//! self.catalog.clear_inlined_deletes(&id).await?; // round trip 2 +//! ... +//! } +//! ``` +//! +//! Every concurrent scan acquiring `listing_fence.read().await` +//! (`table.rs:6989`) blocks for the full duration of those two round +//! trips. The in-source comment at `table.rs:5819` claims this is +//! "microseconds in the typical case", which is only true on co-located +//! SQLite without `fsync`. On a remote metastore (Turso wire RTT ~10 ms, +//! 
managed PostgreSQL ~10-30 ms) two sequential round trips mean every +//! reader stalls 20-60 ms per checkpoint. Sustained inline ingestion +//! triggers `checkpoint_inlined_data` whenever +//! `inline_flush_max_bytes` / `inline_flush_max_rows` / +//! `inline_flush_max_segments` is crossed β€” typically several times per +//! minute at production ingest rates β€” so this is a recurring tail-latency +//! source, not a one-time cost. +//! +//! The fix is to fold the two DELETEs into a single metastore +//! transaction: `clear_inlined_data_and_deletes` issues one BEGIN + +//! two DELETEs + one COMMIT in one wire round-trip. The listing-fence +//! bracket then holds for only one RTT instead of two β€” the in-process +//! cost of the bracket is unchanged but the wire-bound term halves. +//! +//! ## What this bench measures +//! +//! Two lanes, identical fence-bracket pattern, identical "refresh +//! listing table" no-op, identical lock primitive (`tokio::sync::RwLock` +//! β€” same primitive used by `listing_fence` at `table.rs:880`). +//! +//! Per-call metastore work is simulated by `tokio::time::sleep(rtt)`. +//! Real `InMemory` round-trip time is below the timer resolution, so the +//! sleep is the *only* meaningful work β€” exactly the model we want +//! because it isolates the sequential-vs-batched pattern from any +//! confounding compute. +//! +//! - `checkpoint_fence_stall/current_two_sequential_deletes/` β€” +//! `fence.write().await; sleep(rtt).await; sleep(rtt).await; drop(fence);` +//! Mirrors today's two-DELETE shape. +//! - `checkpoint_fence_stall/achievable_single_batch_delete/` β€” +//! `fence.write().await; sleep(rtt).await; drop(fence);` Single +//! batched DELETE. +//! +//! ## How to read +//! +//! `cargo bench --bench checkpoint_fence_stall -p cayenne`. The +//! `current_two_sequential_deletes` lane is ~2Γ— the duration of +//! `achievable_single_batch_delete`. Because the lock is held for the +//! 
whole duration, **the duration of the current lane is also the +//! worst-case scan tail latency caused by one checkpoint** β€” every +//! concurrent reader stalls that long. The bench output makes the +//! tail-latency floor visible at three RTTs that cover production +//! deployments: +//! +//! - `rtt_1ms` β€” local SQLite with `fsync` (best case). +//! - `rtt_10ms` β€” same-zone network metastore (typical Turso / managed +//! Postgres). +//! - `rtt_30ms` β€” cross-region network metastore. +//! +//! Use the `current_two_sequential_deletes/rtt_30ms` value as the +//! upper bound on how long a scan can hang during one checkpoint. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use tokio::runtime::Runtime; +use tokio::sync::RwLock; + +/// Stand-in for the in-process work +/// `refresh_listing_table_under_held_fence` does after the metastore +/// returns. Real cost is sub-microsecond (`ArcSwap::store` + invalidate +/// the DataFusion list-files cache); we keep the symbol so both lanes +/// pay the same constant overhead. +#[inline(never)] +fn refresh_listing_table_no_op() { + black_box(0u64); +} + +/// Simulated round-trip times spanning the three realistic deployment +/// profiles. Local in-process SQLite without `fsync` (< 100 Β΅s) is not +/// included β€” at that scale the bench duration is dominated by lock +/// acquisition overhead and the regression is not visible. +const RTTS: &[(&str, Duration)] = &[ + ("rtt_1ms", Duration::from_millis(1)), + ("rtt_10ms", Duration::from_millis(10)), + ("rtt_30ms", Duration::from_millis(30)), +]; + +async fn current_two_sequential_deletes(fence: &RwLock<()>, rtt: Duration) { + let _guard = fence.write().await; + // clear_inlined_data β€” first metastore round trip. + tokio::time::sleep(rtt).await; + // clear_inlined_deletes β€” second metastore round trip. 
+ tokio::time::sleep(rtt).await; + refresh_listing_table_no_op(); +} + +async fn achievable_single_batch_delete(fence: &RwLock<()>, rtt: Duration) { + let _guard = fence.write().await; + // clear_inlined_data_and_deletes β€” single transaction, one round trip. + tokio::time::sleep(rtt).await; + refresh_listing_table_no_op(); +} + +fn bench_checkpoint_fence_stall(c: &mut Criterion) { + let rt = Runtime::new().expect("tokio runtime"); + let fence = Arc::new(RwLock::new(())); + + let mut group = c.benchmark_group("checkpoint_fence_stall"); + for &(label, rtt) in RTTS { + let fence_a = Arc::clone(&fence); + group.bench_with_input( + BenchmarkId::new("current_two_sequential_deletes", label), + &rtt, + |b, &rtt| { + let fence = Arc::clone(&fence_a); + b.to_async(&rt) + .iter(|| async { current_two_sequential_deletes(&fence, rtt).await }); + }, + ); + + let fence_b = Arc::clone(&fence); + group.bench_with_input( + BenchmarkId::new("achievable_single_batch_delete", label), + &rtt, + |b, &rtt| { + let fence = Arc::clone(&fence_b); + b.to_async(&rt) + .iter(|| async { achievable_single_batch_delete(&fence, rtt).await }); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_checkpoint_fence_stall); +criterion_main!(benches); diff --git a/crates/cayenne/benches/column_stats_contention.rs b/crates/cayenne/benches/column_stats_contention.rs new file mode 100644 index 0000000000..2887f3826d --- /dev/null +++ b/crates/cayenne/benches/column_stats_contention.rs @@ -0,0 +1,309 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: lock pattern in +//! `ColumnStatsAccumulator::update` +//! (`crates/cayenne/src/provider/table.rs:259-303`). +//! +//! The current `ColumnStatsAccumulator` +//! (`crates/cayenne/src/provider/table.rs:214-228`) holds **two** separate +//! `std::sync::Mutex`es: +//! +//! ```ignore +//! columns: std::sync::Mutex>, +//! columns_seeded: std::sync::Mutex>, +//! ``` +//! +//! `update()` acquires both β€” `columns` then `columns_seeded` β€” on every +//! `RecordBatch` from the write hot path (called from the streaming wrapper +//! at `crates/cayenne/src/provider/table.rs:2790`). Multi-partition writers +//! share a single `Arc` +//! (`table.rs:2782`), so each writer task serializes through *the same two +//! mutexes* on every batch. Per-batch fixed cost is the floor; under +//! contention it becomes the throughput ceiling. +//! +//! `ColumnStatsAccumulator` is `pub(crate)`, so this bench is a shape bench +//! β€” it models the exact `std::sync::Mutex>` + `std::sync::Mutex>` +//! pattern, with the same per-batch body shape (read columns slice, branch +//! on per-column seeded flag, mutate both vectors). Same precedent as +//! `listing_fence_overhead.rs` which benches the synchronization pattern +//! rather than the concrete `ListingTable` it guards. +//! +//! ## Three lanes +//! +//! - `current_two_locks/` β€” mirrors today's structure. Each +//! thread locks `columns`, then locks `columns_seeded`, does per-column +//! work, drops both guards. Models the production pattern. +//! - `single_combined_lock/` β€” merges the two `Mutex>` +//! fields into one `Mutex` where `State` owns both vectors. +//! One atomic acquisition per batch instead of two. Same contention +//! profile, smaller per-call constant. +//! - `per_thread_then_merge/` β€” each thread accumulates into a +//! thread-local accumulator with no synchronization at all; a single +//! 
final merge folds them together. Models the structural fix. Wall +//! time should scale near-linearly with thread count down to the +//! merge cost. +//! +//! ## How to read +//! +//! `cargo bench --bench column_stats_contention -p cayenne`. For threads=8 +//! and `BATCHES_PER_THREAD=512`: +//! +//! - `current_two_locks/8` is the regression baseline. As threads +//! increases, time stays nearly flat β€” i.e. the lock is the bottleneck. +//! - `single_combined_lock/8` should be ~2Γ— faster than `current_two_locks/8` +//! (one atomic CAS instead of two) but still serial. +//! - `per_thread_then_merge/8` should be ~Nx faster on an N-core box, +//! because the threads truly run in parallel. +//! +//! Use the gap between `current_two_locks` and `per_thread_then_merge` +//! to size the headroom from migrating to per-partition accumulators. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::sync::Mutex; +use std::sync::{Arc, atomic::AtomicI64}; +use std::thread; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Number of stat columns. Picked to match a typical accelerated table β€” +/// most production schemas have 4-32 columns. Per-column work scales with +/// this value linearly inside the locked critical section. +const NUM_COLUMNS: usize = 8; + +/// Per-thread batch count. Large enough to amortize thread spawn overhead; +/// small enough that the bench stays in the millisecond range. +const BATCHES_PER_THREAD: usize = 512; + +/// Concurrency levels straddling the typical writer-partition count +/// (`target_partitions` defaults to logical CPU count). +const THREAD_COUNTS: &[usize] = &[1, 4, 8, 16]; + +/// Stand-in for the work that +/// `crate::stats::column_stats_to_stats_set(...)` plus +/// `existing.merge_unordered(...)` does per column inside the locked +/// critical section. 
The exact wall-clock value does not matter for the +/// contention story; what matters is that there is *some* nonzero work +/// inside the lock, so contention is observable rather than instantaneous. +#[inline(never)] +fn per_column_work(state: u64, batch_contribution: u64) -> u64 { + // A few non-trivial integer ops so the optimizer cannot fold this + // into a single instruction. `black_box` keeps both inputs alive. + let a = black_box(state).wrapping_mul(0x9E37_79B9_7F4A_7C15); + let b = black_box(batch_contribution).wrapping_add(0xDEAD_BEEF_CAFE_BABE); + a ^ b.rotate_left(13) +} + +// --------------------------------------------------------------------------- +// Lane 1: current_two_locks β€” exact mirror of `ColumnStatsAccumulator`. +// --------------------------------------------------------------------------- + +struct CurrentTwoLocks { + columns: Mutex>, + columns_seeded: Mutex>, + row_count: AtomicI64, +} + +impl CurrentTwoLocks { + fn new() -> Self { + Self { + columns: Mutex::new(vec![0u64; NUM_COLUMNS]), + columns_seeded: Mutex::new(vec![false; NUM_COLUMNS]), + row_count: AtomicI64::new(0), + } + } + + fn update(&self, batch_rows: i64, batch_contribution: u64) { + let mut cols = self.columns.lock().expect("cols poisoned"); + let mut seeded = self.columns_seeded.lock().expect("seeded poisoned"); + self.row_count + .fetch_add(batch_rows, std::sync::atomic::Ordering::Relaxed); + for i in 0..NUM_COLUMNS { + let next = per_column_work(cols[i], batch_contribution); + if seeded[i] { + cols[i] = next; + } else { + cols[i] = next; + seeded[i] = true; + } + } + } +} + +// --------------------------------------------------------------------------- +// Lane 2: single_combined_lock β€” one Mutex owning both vectors. 
+// --------------------------------------------------------------------------- + +struct CombinedState { + columns: Vec, + columns_seeded: Vec, +} + +struct SingleCombinedLock { + state: Mutex, + row_count: AtomicI64, +} + +impl SingleCombinedLock { + fn new() -> Self { + Self { + state: Mutex::new(CombinedState { + columns: vec![0u64; NUM_COLUMNS], + columns_seeded: vec![false; NUM_COLUMNS], + }), + row_count: AtomicI64::new(0), + } + } + + fn update(&self, batch_rows: i64, batch_contribution: u64) { + let mut state = self.state.lock().expect("state poisoned"); + self.row_count + .fetch_add(batch_rows, std::sync::atomic::Ordering::Relaxed); + for i in 0..NUM_COLUMNS { + let next = per_column_work(state.columns[i], batch_contribution); + if state.columns_seeded[i] { + state.columns[i] = next; + } else { + state.columns[i] = next; + state.columns_seeded[i] = true; + } + } + } +} + +// --------------------------------------------------------------------------- +// Lane 3: per_thread_then_merge β€” thread-local accumulators, one merge at +// the end. Models the structural fix (per-partition accumulators that +// finalize into the shared one). 
+// --------------------------------------------------------------------------- + +#[derive(Clone)] +struct LocalAcc { + columns: Vec, + columns_seeded: Vec, + row_count: i64, +} + +impl LocalAcc { + fn new() -> Self { + Self { + columns: vec![0u64; NUM_COLUMNS], + columns_seeded: vec![false; NUM_COLUMNS], + row_count: 0, + } + } + + fn update(&mut self, batch_rows: i64, batch_contribution: u64) { + self.row_count = self.row_count.saturating_add(batch_rows); + for i in 0..NUM_COLUMNS { + let next = per_column_work(self.columns[i], batch_contribution); + if self.columns_seeded[i] { + self.columns[i] = next; + } else { + self.columns[i] = next; + self.columns_seeded[i] = true; + } + } + } + + fn merge(&mut self, other: &LocalAcc) { + self.row_count = self.row_count.saturating_add(other.row_count); + for i in 0..NUM_COLUMNS { + if other.columns_seeded[i] { + let next = per_column_work(self.columns[i], other.columns[i]); + self.columns[i] = next; + self.columns_seeded[i] = self.columns_seeded[i] || true; + } + } + } +} + +// --------------------------------------------------------------------------- +// Drivers. 
+// --------------------------------------------------------------------------- + +fn run_current(threads: usize) { + let acc = Arc::new(CurrentTwoLocks::new()); + thread::scope(|s| { + for t in 0..threads { + let acc = Arc::clone(&acc); + s.spawn(move || { + for b in 0..BATCHES_PER_THREAD { + acc.update(1024, (t as u64).wrapping_mul(b as u64 + 1)); + } + }); + } + }); + black_box(acc.row_count.load(std::sync::atomic::Ordering::Relaxed)); +} + +fn run_combined(threads: usize) { + let acc = Arc::new(SingleCombinedLock::new()); + thread::scope(|s| { + for t in 0..threads { + let acc = Arc::clone(&acc); + s.spawn(move || { + for b in 0..BATCHES_PER_THREAD { + acc.update(1024, (t as u64).wrapping_mul(b as u64 + 1)); + } + }); + } + }); + black_box(acc.row_count.load(std::sync::atomic::Ordering::Relaxed)); +} + +fn run_per_thread(threads: usize) { + let final_acc = Arc::new(Mutex::new(LocalAcc::new())); + thread::scope(|s| { + for t in 0..threads { + let final_acc = Arc::clone(&final_acc); + s.spawn(move || { + let mut local = LocalAcc::new(); + for b in 0..BATCHES_PER_THREAD { + local.update(1024, (t as u64).wrapping_mul(b as u64 + 1)); + } + final_acc.lock().expect("final acc").merge(&local); + }); + } + }); + black_box(final_acc.lock().expect("final").row_count); +} + +fn bench_column_stats_contention(c: &mut Criterion) { + let mut group = c.benchmark_group("column_stats_contention"); + for &t in THREAD_COUNTS { + let work_units = u64::try_from(t * BATCHES_PER_THREAD).unwrap_or(u64::MAX); + group.throughput(Throughput::Elements(work_units)); + + group.bench_with_input(BenchmarkId::new("current_two_locks", t), &t, |b, &t| { + b.iter(|| run_current(t)); + }); + + group.bench_with_input(BenchmarkId::new("single_combined_lock", t), &t, |b, &t| { + b.iter(|| run_combined(t)); + }); + + group.bench_with_input(BenchmarkId::new("per_thread_then_merge", t), &t, |b, &t| { + b.iter(|| run_per_thread(t)); + }); + } + group.finish(); +} + +criterion_group!(benches, 
bench_column_stats_contention); +criterion_main!(benches); diff --git a/crates/cayenne/benches/compaction_picker.rs b/crates/cayenne/benches/compaction_picker.rs new file mode 100644 index 0000000000..8eb7d9b95b --- /dev/null +++ b/crates/cayenne/benches/compaction_picker.rs @@ -0,0 +1,72 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Pure-CPU Criterion benchmark for [`cayenne::provider::compaction::pick_candidates`]. +//! +//! The picker runs on the hot write path after every Vortex flush. This bench +//! validates that even for large directories the picker stays O(n log n) and +//! fast in absolute terms. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; + +#[allow(dead_code)] +#[path = "../src/provider/compaction.rs"] +mod compaction; + +use compaction::{CompactionPickerConfig, FileEntry, pick_candidates}; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +fn synthetic_files(count: usize) -> Vec> { + // Distribute sizes across a range that exercises both Small and Mid tiers + // for a 128 MiB target. Sizes cycle from 1 MiB to 100 MiB. 
+ (0..count) + .map(|idx| { + let mib = 1 + ((idx * 37) % 100) as u64; + FileEntry { + path: format!("data_{idx:06}.vortex"), + size_bytes: mib * 1024 * 1024, + } + }) + .collect() +} + +fn bench_pick_candidates(c: &mut Criterion) { + let mut group = c.benchmark_group("compaction_picker_pick_candidates"); + let cfg = CompactionPickerConfig::new(8, 32, 128 * 1024 * 1024); + + for &count in &[10_usize, 100, 1_000, 10_000] { + let files = synthetic_files(count); + group.throughput(Throughput::Elements(count as u64)); + group.bench_with_input(BenchmarkId::from_parameter(count), &files, |b, files| { + b.iter(|| { + let candidate = pick_candidates( + black_box(files).iter().map(|entry| FileEntry { + path: entry.path.as_str(), + size_bytes: entry.size_bytes, + }), + black_box(&cfg), + ); + black_box(candidate); + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_pick_candidates); +criterion_main!(benches); diff --git a/crates/cayenne/benches/compaction_sort_serialization.rs b/crates/cayenne/benches/compaction_sort_serialization.rs new file mode 100644 index 0000000000..2205fa61e1 --- /dev/null +++ b/crates/cayenne/benches/compaction_sort_serialization.rs @@ -0,0 +1,275 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: compaction throughput cliff when `sort_columns` is +//! configured. +//! +//! `CayenneTableProvider::rewrite_current_snapshot_for_compaction` +//! 
(`crates/cayenne/src/provider/table.rs:4810-4889`) hard-codes +//! `target_partitions = 1` when `sort_columns` is set on the table: +//! +//! ```ignore +//! let target_partitions = if self.context.has_sort_columns() { +//! stream = self.sort_stream(stream)?; +//! 1 // ← single writer +//! } else { +//! ctx.state().config().target_partitions() // ← parallel writers +//! }; +//! ``` +//! +//! The trade-off is real: sorted output produces tight per-file zone maps, +//! which makes downstream OLAP queries dramatically faster on the +//! sort-column predicate. But the compaction rewrite itself loses all +//! writer parallelism: a 300M-row `order_line` table that finishes +//! compaction in minutes without sort_columns takes much longer with +//! sort_columns because a single Vortex writer thread serially encodes +//! every row. +//! +//! This was the question raised in the May 15 2026 SF100 retest: +//! *"How common is it to define sort columns on large tables in production +//! Cayenne deployments? Is the unsorted configuration representative of +//! typical usage?"* The 30× bootstrap improvement that report measured is +//! configuration-specific — production deployments that need `sort_columns` +//! for OLAP query performance pay the K× cliff this bench captures. +//! +//! The fix is a parallel sort-merge: range-partition the input by sort +//! key, sort each partition in parallel, and write each partition through +//! its own Vortex writer. Final output is split across K files (matching +//! today's `target_partitions=K` model) and each file is internally +//! sorted, so per-file zone maps stay tight. DataFusion already has +//! `SortPreservingMergeExec` for the merge layer; what is missing is the +//! `range-partition before sort` rewrite for the compaction path +//! specifically. +//! +//! ## What this bench measures +//! +//! Pure shape — no Vortex, no Cayenne setup. Models the +//! `target_partitions=1` (sorted) vs `target_partitions=K` (unsorted) +//!
cliff on a synthetic stream of N rows. +//! +//! Per-row "write work" is simulated by a small CPU-bound function +//! (`xor`, `wrapping_mul`, `memcpy`) so the parallelism story is +//! observable as wall-clock speedup. The exact per-row cost does not +//! matter — only the ratio between lanes. +//! +//! Three lanes per `N_rows`: +//! +//! - `serial_sort_then_write/N` — mirrors today's sort_columns +//! compaction path. Allocates a `Vec<Row>` of all rows, sorts it by +//! the synthetic sort key, then processes every row on one thread. +//! Time = sort + N · per-row-work. +//! - `parallel_write_unsorted/N` — mirrors today's unsorted compaction +//! path. Splits N rows into contiguous chunks across `K = num_cpus.min(16)` worker +//! threads. No sort. Time = N · per-row-work / K. +//! - `parallel_sort_then_merge_write/N` — models the proposed fix. +//! Range-partitions input across K threads, sorts each partition in +//! parallel, then each thread writes its partition. Time = sort/K + +//! N · per-row-work / K. Total output is sorted within each partition +//! (no global merge needed for compaction since each Vortex file is +//! independently zone-mapped). +//! +//! ## How to read +//! +//! `cargo bench --bench compaction_sort_serialization -p cayenne`. At +//! `N_rows = 4_000_000` on a multi-core box: +//! +//! - `serial_sort_then_write` is the regression baseline. Slope is +//! bounded by single-thread throughput. +//! - `parallel_write_unsorted` is the headroom **without** sort_columns +//! — the K× speedup over serial. +//! - `parallel_sort_then_merge_write` is the headroom **with** the +//! proposed fix — should approach `parallel_write_unsorted` minus the +//! per-partition sort cost (O((N/K) log (N/K))). +//! +//! The gap between `serial_sort_then_write` and +//! `parallel_sort_then_merge_write` is what production deployments using +//! `sort_columns` could reclaim at compaction time. For +//! N = 4_000_000 rows and K = 16, the gap should be ~10-14× (sort itself +//!
is sub-linear; the dominant savings come from parallel write work). + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::thread; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Total input rows. Three sizes to show the curve: +/// - 100 K: a small compaction (background tier 0). +/// - 1 M: a medium compaction (tier 1 / 2). +/// - 4 M: a large compaction (tier 3, single-partition production +/// `order_line` at SF10). +const ROW_COUNTS: &[usize] = &[100_000, 1_000_000, 4_000_000]; + +/// Worker count for parallel lanes. Capped at 16 so the bench runs in +/// reasonable time across hardware shapes; production picks +/// `SessionConfig::target_partitions()`, typically `num_cpus`. +fn worker_count() -> usize { + std::thread::available_parallelism() + .map_or(4, |n| n.get()) + .min(16) +} + +/// Synthetic row: 16 bytes of payload + an i64 sort key. Width is +/// representative of a narrow CDC row (PK + small payload). +#[derive(Clone)] +struct Row { + sort_key: i64, + _payload: [u8; 16], +} + +fn make_row(idx: usize) -> Row { + // Scrambled sort key so the input is unsorted but deterministic. + let scrambled = (idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + let sort_key = scrambled as i64; + let mut payload = [0u8; 16]; + payload[..8].copy_from_slice(&scrambled.to_le_bytes()); + Row { + sort_key, + _payload: payload, + } +} + +/// Simulated per-row work that a Vortex writer does: a few non-trivial +/// integer ops + a memcpy. Inline-never so the optimizer cannot hoist +/// it out of the loop or fuse it across rows. 
+#[inline(never)] +fn per_row_work(row: &Row, acc: u64) -> u64 { + let mut sink = [0u8; 16]; + sink.copy_from_slice(&row._payload); + let mixed = u64::from_le_bytes(sink[..8].try_into().expect("8 bytes")) + .wrapping_mul(0x9E37_79B9_7F4A_7C15); + acc.wrapping_add(mixed ^ row.sort_key as u64).rotate_left(7) +} + +fn generate_rows(n: usize) -> Vec { + (0..n).map(make_row).collect() +} + +/// Lane A: serial sort + single writer (today's sort_columns path). +fn serial_sort_then_write(n: usize) -> u64 { + let mut rows = generate_rows(n); + rows.sort_unstable_by_key(|r| r.sort_key); + + let mut acc = 0u64; + for row in &rows { + acc = per_row_work(row, acc); + } + black_box(&rows); + acc +} + +/// Lane B: parallel writer, no sort (today's unsorted path). +fn parallel_write_unsorted(n: usize) -> u64 { + let rows = generate_rows(n); + let k = worker_count(); + let chunk = n.div_ceil(k); + + let total: u64 = thread::scope(|s| { + let mut handles = Vec::with_capacity(k); + let rows_ref = &rows; + for w in 0..k { + let start = w * chunk; + let end = (start + chunk).min(n); + if start >= end { + break; + } + handles.push(s.spawn(move || { + let mut acc = 0u64; + for row in &rows_ref[start..end] { + acc = per_row_work(row, acc); + } + acc + })); + } + handles.into_iter().map(|h| h.join().expect("join")).sum() + }); + black_box(&rows); + total +} + +/// Lane C: parallel sort + parallel writer (proposed fix). Range- +/// partition by sort key bucket, sort each partition in parallel, write +/// in parallel. Each output partition is independently sorted, which is +/// sufficient for Cayenne's per-file zone maps. +fn parallel_sort_then_merge_write(n: usize) -> u64 { + let rows = generate_rows(n); + let k = worker_count(); + + // Range-partition by the high bits of sort_key. For our scrambled + // input the bucket distribution is approximately uniform β€” same + // shape as a real range-partition over a high-cardinality column. 
+ let mut buckets: Vec> = (0..k).map(|_| Vec::with_capacity(n / k + 1)).collect(); + let bits = (k as u64).next_power_of_two().trailing_zeros(); + for row in rows { + let key = row.sort_key as u64; + let bucket = ((key >> (64 - bits)) as usize).min(k - 1); + buckets[bucket].push(row); + } + + let total: u64 = thread::scope(|s| { + let mut handles = Vec::with_capacity(k); + for bucket in buckets { + handles.push(s.spawn(move || { + let mut local = bucket; + local.sort_unstable_by_key(|r| r.sort_key); + let mut acc = 0u64; + for row in &local { + acc = per_row_work(row, acc); + } + black_box(&local); + acc + })); + } + handles.into_iter().map(|h| h.join().expect("join")).sum() + }); + total +} + +fn bench_compaction_sort_serialization(c: &mut Criterion) { + let mut group = c.benchmark_group("compaction_sort_serialization"); + for &n in ROW_COUNTS { + group.throughput(Throughput::Elements(u64::try_from(n).unwrap_or(u64::MAX))); + + group.bench_with_input( + BenchmarkId::new("serial_sort_then_write", n), + &n, + |b, &n| { + b.iter(|| serial_sort_then_write(black_box(n))); + }, + ); + + group.bench_with_input( + BenchmarkId::new("parallel_write_unsorted", n), + &n, + |b, &n| { + b.iter(|| parallel_write_unsorted(black_box(n))); + }, + ); + + group.bench_with_input( + BenchmarkId::new("parallel_sort_then_merge_write", n), + &n, + |b, &n| { + b.iter(|| parallel_sort_then_merge_write(black_box(n))); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_compaction_sort_serialization); +criterion_main!(benches); diff --git a/crates/cayenne/benches/deletion_index_extend_map_clone.rs b/crates/cayenne/benches/deletion_index_extend_map_clone.rs new file mode 100644 index 0000000000..f5e6b1af37 --- /dev/null +++ b/crates/cayenne/benches/deletion_index_extend_map_clone.rs @@ -0,0 +1,176 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-write cost of the unconditional `HashMap` clone in +//! `DeletionIndex::extend_max` and `KeyDeletionIndex::extend_max` +//! (`crates/cayenne/src/provider/deletion_index.rs:159-208` and `:306-358`). +//! +//! Every PK-aware CDC write (delete or upsert with a non-empty deletion +//! set) calls `extend_max` to publish a new immutable deletion snapshot. +//! The bloom filter side is amortized to O(K) per call by the doubling +//! capacity heuristic (commit history), but the entry map itself is still +//! cloned in full on every call: +//! +//! ```ignore +//! pub fn extend_max(&self, additions: impl IntoIterator<Item = (i64, i64)>) -> Self { +//! let mut entries = self.entries.clone(); // <-- O(N) on every call +//! ... +//! } +//! ``` +//! +//! `HashMap<i64, i64>::clone()` for a `HashMap<i64, i64>` of N entries: +//! - allocates a fresh bucket vector (~2.5N slots at default load factor) +//! - memcpy-copies every occupied slot (16 bytes of payload + the hash) +//! - rehashes nothing (the clone keeps the same hash seed) +//! +//! At 100K entries that is ~2 MB of allocator traffic per CDC commit. The +//! existing `bench_extend_max_at_growing_cache_sizes` +//! (`deletion_index_probe.rs:218`) measures `extend_max` as a whole — bloom +//! + map clone bundled — so the map-clone slice of the budget is not +//! directly visible. This bench isolates it. +//! +//! The TigerStyle remedy is to store the entry map as +//! `Arc<HashMap<i64, i64>>` and use `Arc::make_mut` to copy-on-write only when +//! the writer actually mutates; in practice all `extend_max` calls +//!
mutate, but readers (`DeletionIndex::probe`) need only an `Arc::clone`. +//! Combined with persistent / structurally-shared maps (`im::HashMap` or +//! `imbl::HashMap`), the per-write cost drops to O(K log N) instead of +//! O(N), and steady-state CDC writes against a 1 M-entry deletion cache +//! stop scaling with cache size. +//! +//! ## What this bench measures +//! +//! Pure shape — no metastore, no Cayenne setup. Models the **map-clone +//! slice** of `extend_max` at four cache sizes that bracket realistic +//! deletion-cache shapes: +//! +//! - 1 K entries — a fresh table after the first few deletes. +//! - 10 K entries — typical operational state. +//! - 100 K entries — long-lived table that has absorbed many deletes +//! without a compaction. +//! - 1 M entries — the upper end before compaction absorbs deletions +//! into the data files. +//! +//! Two lanes per size: +//! +//! - `int64_map_clone_then_insert/<N>` — `HashMap<i64, i64>::clone()` +//! followed by inserting one fresh entry. Mirrors the body of +//! `DeletionIndex::extend_max`. +//! - `binary_map_clone_then_insert/<N>` — `HashMap<Box<[u8]>, i64>::clone()` +//! with 16-byte keys, plus one insert. Mirrors `KeyDeletionIndex::extend_max`, +//! which also has to clone every `Box<[u8]>` key (an additional heap +//! allocation per entry, not just memcpy). +//! +//! ## How to read +//! +//! `cargo bench --bench deletion_index_extend_map_clone -p cayenne`. +//! +//! - `int64_map_clone_then_insert/100000` is the per-CDC-commit tax for +//! the dominant integer-PK case. Multiply by your write rate to get +//! the allocator-bound floor on PK-deletion throughput. +//! - The ratio `int64_map_clone_then_insert/1000000` divided by +//! `int64_map_clone_then_insert/1000` shows linear scaling. The fix +//! should make this ratio approach 1 (i.e. constant time on the +//! common path). +//! - `binary_map_clone_then_insert` should be ~2-3 × `int64_map_clone_then_insert` +//!
at the same N, because each entry pays one extra `Box<[u8]>` allocation +//! on top of the memcpy. Composite-PK tables (Utf8 PKs, multi-column PKs) +//! land on this lane. + +#![allow(clippy::expect_used)] + +use std::collections::HashMap; +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Entry counts spanning fresh-table to long-lived-cache shapes. +const ENTRY_COUNTS: &[usize] = &[1_000, 10_000, 100_000, 1_000_000]; + +fn build_int64_map(n: usize) -> HashMap { + let mut map = HashMap::with_capacity(n); + for i in 0..n { + // Knuth-multiplicative scrambling so HashMap bucket distribution + // matches realistic collision profiles instead of a contiguous-key + // best case. + let scrambled = (i as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + map.insert(scrambled as i64, i as i64); + } + map +} + +fn build_binary_map(n: usize) -> HashMap, i64> { + let mut map = HashMap::with_capacity(n); + for i in 0..n { + let scrambled = (i as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + let mut buf = vec![0u8; 16]; + buf[..8].copy_from_slice(&scrambled.to_le_bytes()); + buf[8..].copy_from_slice(&(i as u64).to_le_bytes()); + map.insert(buf.into_boxed_slice(), i as i64); + } + map +} + +fn bench_int64_map_clone_then_insert(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_index_extend_map_clone_int64"); + for &n in ENTRY_COUNTS { + let base = build_int64_map(n); + group.throughput(Throughput::Elements(1)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &n| { + b.iter(|| { + // Exactly the body of `DeletionIndex::extend_max` for one + // fresh-key addition: clone the entire entry map, then + // insert one new entry past the populated range. 
+ let mut cloned = base.clone(); + cloned.insert((n as i64) + 1, 1); + black_box(cloned); + }); + }); + } + group.finish(); +} + +fn bench_binary_map_clone_then_insert(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_index_extend_map_clone_binary"); + for &n in ENTRY_COUNTS { + let base = build_binary_map(n); + let fresh_key_template = { + let mut buf = vec![0u8; 16]; + buf[..8].copy_from_slice(&((n as u64) + 1).to_le_bytes()); + buf.into_boxed_slice() + }; + group.throughput(Throughput::Elements(1)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + // Mirrors `KeyDeletionIndex::extend_max` for one fresh-key + // addition. The clone has to copy every `Box<[u8]>` key + // β€” an additional heap allocation per entry on top of the + // bucket memcpy. + let mut cloned = base.clone(); + cloned.insert(fresh_key_template.clone(), 1); + black_box(cloned); + }); + }); + } + group.finish(); +} + +criterion_group!( + benches, + bench_int64_map_clone_then_insert, + bench_binary_map_clone_then_insert, +); +criterion_main!(benches); diff --git a/crates/cayenne/benches/deletion_index_probe.rs b/crates/cayenne/benches/deletion_index_probe.rs index 7dde6df886..0fee9b9d4a 100644 --- a/crates/cayenne/benches/deletion_index_probe.rs +++ b/crates/cayenne/benches/deletion_index_probe.rs @@ -191,10 +191,98 @@ fn bench_concurrent_load_under_publish(c: &mut Criterion) { group.finish(); } +/// Micro-bench that quantifies the per-call cost of `DeletionIndex::extend_max` +/// as the cumulative deletion-cache size grows. This is the exact hot path +/// hit by every PK-aware upsert / delete on a table that accumulates +/// deletion entries. +/// +/// A previous revision rebuilt the bloom filter from scratch on every call +/// (iterating ALL existing entries to re-hash). That made per-call work +/// O(N) where N is the cumulative cache size. 
Across M writes the cost was +/// O(MΒ·N) β€” quadratic in the cache size, the root cause of the +/// user-reported ~200% ingestion regression. +/// +/// The current implementation keeps amortized cost at O(K) per call by: +/// - Tracking `bloom_capacity` and only rebuilding the bloom when entry +/// count crosses `2 * bloom_capacity` (geometric amortization). +/// - Inserting only newly-added keys into a clone of the existing bloom +/// in the common path. +/// +/// This bench runs `extend_max` at several pre-populated cache sizes and +/// reports per-call latency. Watch for these signals on regression: +/// - The 10K/100K/1M curves diverging from constant time (returning to +/// O(N)) is the regression returning. +/// - Sudden jumps at `2^k`-boundaries are the (intentional) amortized +/// full-rebuild cost; they should still be much cheaper than the +/// pre-fix worst case. +fn bench_extend_max_at_growing_cache_sizes(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_index_extend_max_growth"); + group.throughput(Throughput::Elements(1)); + + // For each pre-populated size, time one extend_max call that adds K=1 + // new key (the common per-row upsert pattern). Cache sizes are picked + // to span small (typical CDC), medium, and large (long-lived table) + // workloads. + for n in [100_usize, 1_000, 10_000, 100_000] { + let mut seed_map = HashMap::with_capacity(n); + for i in 0..n { + seed_map.insert(i as i64, 1_i64); + } + let base = DeletionIndex::from_map(seed_map); + + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &n| { + b.iter(|| { + // Always extend with one fresh key past the seeded range, so + // every iteration takes the Vacant branch. (If we extended + // with an existing key, the Occupied branch would short- + // circuit and obscure the new-key bloom-insert work.) 
+ let next = base.extend_max([((n as i64) + 1, 2)]); + black_box(next); + }); + }); + } + + group.finish(); +} + +/// Companion bench that quantifies the *opposite* end of the workload: +/// many small extend_max calls in a row from an empty start. This is the +/// "high-rate CDC into a fresh table" pattern that catches the O(NΒ²) +/// cumulative regression β€” naive iteration time grows quadratically with N +/// if the bloom is rebuilt from scratch on every call, but stays linear +/// (one bloom rebuild per doubling) with the current amortized +/// implementation. +fn bench_extend_max_cumulative_from_empty(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_index_extend_max_cumulative"); + + for total in [128_usize, 1_024, 8_192] { + group.throughput(Throughput::Elements(total as u64)); + group.bench_with_input(BenchmarkId::from_parameter(total), &total, |b, &total| { + b.iter(|| { + // Re-build from empty on every iteration so the cumulative + // work is observable; the benchmark reports total time + // divided by Throughput=total, giving "per-row insert" + // latency. With the regression (O(NΒ²) cumulative) the per- + // row number grows linearly with `total`; with the fix it + // stays roughly flat. 
+ let mut idx = DeletionIndex::empty(); + for i in 0..total as i64 { + idx = idx.extend_max([(i, 1)]); + } + black_box(idx); + }); + }); + } + + group.finish(); +} + criterion_group!( benches, bench_int64_probe, bench_row_keys_probe, - bench_concurrent_load_under_publish + bench_concurrent_load_under_publish, + bench_extend_max_at_growing_cache_sizes, + bench_extend_max_cumulative_from_empty, ); criterion_main!(benches); diff --git a/crates/cayenne/benches/deletion_vector_bitmap_to_treemap.rs b/crates/cayenne/benches/deletion_vector_bitmap_to_treemap.rs new file mode 100644 index 0000000000..4d19c6585a --- /dev/null +++ b/crates/cayenne/benches/deletion_vector_bitmap_to_treemap.rs @@ -0,0 +1,173 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-scan cost of converting per-file deletion vectors from +//! `RoaringBitmap` to `RoaringTreemap` in +//! `crates/cayenne/src/provider/vortex_format.rs:151-182`. +//! +//! Every `DeletionFilteringVortexFormat::create_physical_plan` call walks the +//! `FileScanConfig`'s file groups, looks up each file's deletion bitmap in the +//! `deletion_cache` (a `ArcSwap>>`), and β€” +//! for every file that has deletions β€” rebuilds a fresh `RoaringTreemap`: +//! +//! ```ignore +//! // attach_access_plan_to_file, vortex_format.rs:164 +//! let exclude: RoaringTreemap = bitmap.iter().map(u64::from).collect(); +//! let access_plan = VortexAccessPlan::default() +//! 
.with_selection(Selection::ExcludeRoaring(exclude)); +//! ``` +//! +//! The cache stores `Arc<RoaringBitmap>` (u32-keyed, compact form) because the +//! pre-cached deletion vectors were loaded as `RoaringBitmap`. The Vortex +//! `Selection::ExcludeRoaring` API consumes a `RoaringTreemap` (u64-keyed) for +//! billion-row tables. The conversion `bitmap.iter().map(u64::from).collect()` +//! materializes every deleted row id from the source bitmap, builds a fresh +//! `RoaringTreemap` containing the same elements, and discards both at the end +//! of the scan setup. +//! +//! Two consequences: +//! +//! 1. **Per-scan, per-file fixed cost**: a table with 1000 files where every +//! file carries 1000 deletions pays 1000 * (per-file conversion cost) on +//! every scan, *even when the underlying deletions are unchanged across +//! scans*. The deletion cache invalidates only on writes, but the converted +//! form is rebuilt per scan. +//! 2. **Quadratic-ish in deletion density**: as deletion rate per file rises +//! (e.g. after a large delete-by-predicate or a slow checkpoint absorption), +//! each per-file conversion grows linearly with the deletion count. +//! +//! The TigerStyle remedy is to store the converted form directly in the cache. +//! Two options: +//! - cache `Arc<RoaringTreemap>` instead of `Arc<RoaringBitmap>`, paying the +//! conversion once at deletion-cache publish time. The cache is published +//! under the write fence; readers only ever see the converted form. +//! - cache both shapes as `(Arc<RoaringBitmap>, OnceCell<Arc<RoaringTreemap>>)` +//! and lazily fill the treemap on first scan. Same amortization, slightly +//! more memory. +//! +//! Either fix drops the per-scan cost to `Arc::clone()` on the converted bitmap +//! — a single atomic refcount bump, independent of deletion count. +//! +//! ## What this bench measures +//! +//! Pure shape — no metastore, no Cayenne setup, no Vortex scan. Models the +//! conversion that every scan-time `attach_access_plan_to_file` invocation +//! performs on a single file's deletion bitmap.
+//! +//! Two lanes per deletion count: +//! +//! - `convert_per_scan/<N>` — mirrors today's +//! `bitmap.iter().map(u64::from).collect::<RoaringTreemap>()` on every scan. +//! Wall time is the iterator walk plus the new treemap allocation. +//! - `cached_arc_clone/<N>` — models the proposed cache: a single +//! pre-built `Arc<RoaringTreemap>` cloned per scan. Wall time is one +//! `Arc::clone` — a single atomic refcount bump. +//! +//! Deletion counts mirror realistic file-level deletion densities: +//! +//! - 100 deletions: a few CDC deletes scattered across files. +//! - 1 K deletions: typical mid-life file under steady deletion load. +//! - 10 K deletions: a file approaching the rewrite-by-compaction threshold. +//! - 100 K deletions: a "delete-heavy" file before compaction absorbs them. +//! - 1 M deletions: extreme — a near-empty file kept alive by zone-map +//! relevance for some other column. +//! +//! Per-file densities multiply: at 1000 files * 10 K deletions/file the +//! per-scan tax is 1000 * `convert_per_scan/10000`. +//! +//! ## How to read +//! +//! `cargo bench --bench deletion_vector_bitmap_to_treemap -p cayenne`. +//! +//! - `convert_per_scan/100000` — per-file fixed cost on a delete-heavy file. +//! Multiply by your `num_files_with_deletions` to get the per-scan floor. +//! - The ratio `convert_per_scan/N` ÷ `cached_arc_clone/N` is the headroom +//! from the fix. At N=1 K the ratio is dominated by the +//! `RoaringTreemap::new()` allocation; at N≥10 K it is dominated by the +//! `bitmap.iter()` walk plus `RoaringTreemap::insert` per element. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::sync::Arc; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use roaring::{RoaringBitmap, RoaringTreemap}; + +/// Deletion counts spanning realistic per-file shapes. +const DELETION_COUNTS: &[usize] = &[100, 1_000, 10_000, 100_000, 1_000_000]; + +/// Build a `RoaringBitmap` modelling realistic deletion locality.
We scatter +/// keys with a Knuth multiplicative scramble across a fixed 16M-row (24-bit) range to mimic CDC +/// deletes that touch sparse rows in a file (rather than a contiguous prefix +/// that compresses pathologically well). +fn build_bitmap(n: usize) -> RoaringBitmap { + let mut bitmap = RoaringBitmap::new(); + for i in 0..n { + let scrambled = (i as u32).wrapping_mul(0x9E37_79B9_u32); + bitmap.insert(scrambled & 0x00FF_FFFF); // limit to 16M-row range + } + bitmap +} + +/// Mirror the exact production conversion at +/// `vortex_format.rs:164`: +/// `bitmap.iter().map(u64::from).collect::<RoaringTreemap>()`. +fn convert_to_treemap(bitmap: &RoaringBitmap) -> RoaringTreemap { + bitmap.iter().map(u64::from).collect() +} + +fn bench_convert_per_scan(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_vector_bitmap_to_treemap_convert_per_scan"); + for &n in DELETION_COUNTS { + let bitmap = build_bitmap(n); + group.throughput(Throughput::Elements(n as u64)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + // Exactly the body of `attach_access_plan_to_file` for one + // file that has deletions. Discard the result via black_box + // so the optimizer cannot lift the conversion out of the + // iteration loop. + let treemap = convert_to_treemap(&bitmap); + black_box(treemap); + }); + }); + } + group.finish(); +} + +fn bench_cached_arc_clone(c: &mut Criterion) { + let mut group = c.benchmark_group("deletion_vector_bitmap_to_treemap_cached_arc_clone"); + for &n in DELETION_COUNTS { + let bitmap = build_bitmap(n); + // Pre-build the treemap once, share via Arc — models the fix where + // the deletion cache stores `Arc<RoaringTreemap>` directly. + let treemap: Arc = Arc::new(convert_to_treemap(&bitmap)); + group.throughput(Throughput::Elements(n as u64)); + group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| { + b.iter(|| { + // Per-scan cost in the proposed cache shape: one `Arc::clone` + // (a single atomic refcount bump) regardless of deletion count.
+ let cloned = Arc::clone(&treemap); + black_box(cloned); + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_convert_per_scan, bench_cached_arc_clone); +criterion_main!(benches); diff --git a/crates/cayenne/benches/inline_memtable_read_overhead.rs b/crates/cayenne/benches/inline_memtable_read_overhead.rs new file mode 100644 index 0000000000..d762bd8a07 --- /dev/null +++ b/crates/cayenne/benches/inline_memtable_read_overhead.rs @@ -0,0 +1,204 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-scan cost of the inline-memtable read path. +//! +//! `CayenneTableProvider::read_inlined_batches` +//! (`crates/cayenne/src/provider/table.rs:5592-5619`) is invoked from +//! every `scan()` whose table has a non-empty inline memtable (the fast +//! skip at `table.rs:7059` checks the cached row count). On a cache +//! miss it performs: +//! +//! ```ignore +//! let inlined = self.catalog.get_inlined_data(&id).await?; // 1 metastore RTT +//! let inlined_deletions = self.load_inlined_deletion_maps().await?; // 1 more metastore RTT +//! for entry in &inlined { +//! let entry_batches = deserialize_ipc_to_batch(&entry.data_ipc)?; // Arrow IPC decode +//! for batch in entry_batches { +//! if let Some(filtered) = self.filter_inlined_batch_for_deletions(...) { +//! batches.push(filtered); +//! } +//! } +//! } +//! ``` +//! +//! There is no in-memory cache of the deserialized `Vec<RecordBatch>` — +//!
every scan repeats the IPC decode and deletion-mask construction +//! even though the inlined state is **static** between writes and +//! checkpoints (writes set the cached row count via +//! `inlined_row_count`, checkpoints clear it; nothing else changes the +//! inlined data). +//! +//! Two consequences: +//! +//! 1. **Per-scan fixed cost**: a CDC table with 1 MiB of inlined data +//! pays ~100 µs–1 ms of IPC decode per scan plus 2 metastore RTTs +//! (now parallel via the pool, but still ~0.5–2 ms latency). +//! 2. **Freshness-probe tail spikes**: the May 15 2026 SF100 retest +//! reported the probe table's p99 freshness regressed from 931 ms +//! to 1607 ms (+73%). One mechanism that fits: the probe's reads +//! re-decode inlined data on every poll, and CPU contention from +//! high-WAL-table flushes lengthens the decode tail. +//! +//! The TigerStyle remedy is an in-memory cache keyed by inline +//! generation (an `AtomicU64` bumped by every `commit_inlined_mutation` +//! / `clear_inlined_data_and_deletes`). On scan, atomic-load the +//! generation; if it matches the cached generation, return the cached +//! `Arc<Vec<RecordBatch>>`. Otherwise rebuild + cache. Wait-free in +//! steady state. +//! +//! ## What this bench measures +//! +//! Pure shape — no metastore, no Cayenne setup. Models the **CPU-side** +//! cost of the read path: Arrow IPC deserialize + per-row deletion-mask +//! probe. +//! +//! Two lanes per inline data size: +//! +//! - `current_decode_per_scan/<rows>` — mirrors today's `read_inlined_batches`: +//! re-deserialize the IPC payload on every iteration and rebuild the +//! filtered batch. The "metastore round trip" is not modeled because +//! the pool already parallelizes it; what remains is the CPU-bound +//! IPC decode that no fix to the metastore can address. +//! - `cached_arc_clone/<rows>` — models the proposed cache: a single +//! pre-decoded `Arc<Vec<RecordBatch>>` cloned per scan. Wall time is +//! one `Arc::clone` plus the downstream usage (the `black_box`). +//! +//!
Inline sizes: +//! +//! - 1 KiB: a single small CDC envelope. +//! - 100 KiB: a few dozen envelopes, typical between checkpoints. +//! - 1 MiB: near the inline-memtable flush threshold. +//! +//! ## How to read +//! +//! `cargo bench --bench inline_memtable_read_overhead -p cayenne`. +//! +//! - `current_decode_per_scan/32768` (≈1 MiB) is the per-scan fixed cost a +//! freshness-probe table pays today between checkpoints. At 1000 +//! QPS this is the latency floor below which p99 cannot go. +//! - `cached_arc_clone/32768` (≈1 MiB) is the achievable floor. The ratio is +//! the QPS headroom from adding the cache. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{Int64Array, RecordBatch, StringArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::ipc::reader::StreamReader; +use arrow::ipc::writer::StreamWriter; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Row counts straddling realistic inline-memtable sizes: +/// - 64 rows ≈ ~1 KiB IPC payload (one envelope). +/// - 4096 rows ≈ ~100 KiB. +/// - 32768 rows ≈ ~1 MiB (near the typical +/// `inline_flush_max_bytes` threshold). +const INLINE_ROW_COUNTS: &[usize] = &[64, 4_096, 32_768]; + +fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ])) +} + +fn make_batch(rows: usize) -> RecordBatch { + let ids: Vec = (0..rows as i64).collect(); + let names: Vec = (0..rows).map(|i| format!("row_{i}")).collect(); + RecordBatch::try_new( + schema(), + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + ], + ) + .expect("batch") +} + +/// Serialize a `RecordBatch` to Arrow IPC bytes — matches the +/// production storage shape (`cayenne_inlined_data.data_ipc` blob).
+fn serialize_ipc(batch: &RecordBatch) -> Vec { + let mut buf = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()).expect("writer"); + writer.write(batch).expect("write"); + writer.finish().expect("finish"); + } + buf +} + +/// Mirrors `deserialize_ipc_to_batch` (`table.rs:793`): decode the IPC +/// stream into one or more `RecordBatch`es. +fn deserialize_ipc(blob: &[u8]) -> Vec { + let reader = StreamReader::try_new(blob, None).expect("ipc reader"); + reader + .collect::>>() + .expect("decode") +} + +/// Lane A: today's per-scan pattern — re-deserialize the IPC blob on +/// every scan and pretend to hand the batches to the downstream +/// `MemorySourceConfig`. +fn current_decode_per_scan(blob: &[u8]) -> usize { + let batches = deserialize_ipc(blob); + let total_rows: usize = batches.iter().map(RecordBatch::num_rows).sum(); + black_box(&batches); + total_rows +} + +/// Lane B: cached pre-decoded batches — one `Arc::clone` per scan. +fn cached_arc_clone(cached: &Arc>) -> usize { + let clone = Arc::clone(cached); + let total_rows: usize = clone.iter().map(RecordBatch::num_rows).sum(); + black_box(&clone); + total_rows +} + +fn bench_inline_memtable_read(c: &mut Criterion) { + let mut group = c.benchmark_group("inline_memtable_read_overhead"); + for &rows in INLINE_ROW_COUNTS { + let batch = make_batch(rows); + let blob = serialize_ipc(&batch); + let cached = Arc::new(vec![batch.clone()]); + + group.throughput(Throughput::Elements( + u64::try_from(rows).unwrap_or(u64::MAX), + )); + + group.bench_with_input( + BenchmarkId::new("current_decode_per_scan", rows), + &blob, + |b, blob| { + b.iter(|| current_decode_per_scan(black_box(blob.as_slice()))); + }, + ); + + group.bench_with_input( + BenchmarkId::new("cached_arc_clone", rows), + &cached, + |b, cached| { + b.iter(|| cached_arc_clone(black_box(cached))); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_inline_memtable_read); +criterion_main!(benches); diff --git
a/crates/cayenne/benches/inline_upsert_rewrite_overhead.rs b/crates/cayenne/benches/inline_upsert_rewrite_overhead.rs new file mode 100644 index 0000000000..a382518ed1 --- /dev/null +++ b/crates/cayenne/benches/inline_upsert_rewrite_overhead.rs @@ -0,0 +1,256 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-upsert cost of the inline-memtable rewrite path. +//! +//! `CayenneTableProvider::build_inlined_data_rewrite_for_pk_keys` +//! (`crates/cayenne/src/provider/table.rs:3917-3987`) is invoked from +//! every upsert / on-conflict insert whose deleted-PK set is non-empty +//! and the table has pending inline rows +//! (`apply_on_conflict_deletions` -> rewrite branch). On each call it +//! performs: +//! +//! ```ignore +//! let inlined_data = self.catalog.get_inlined_data(&id).await?; // 1 metastore RTT +//! let legacy_inlined_deletions = self.load_inlined_deletion_maps().await?; // 1 more metastore RTT +//! for entry in inlined_data { +//! let batches = deserialize_ipc_to_batch(&entry.data_ipc)?; // Arrow IPC decode +//! for batch in batches { +//! let Some(visible_batch) = self.filter_inlined_batch_for_deletions(...)? +//! else { continue }; +//! let (filtered_batch, removed_rows) = +//! self.filter_inlined_batch_for_pk_deletions(...); // new PK filter +//! ... +//! } +//! } +//! ``` +//! +//! The just-committed `read_inlined_batches` cache (keyed by an +//! 
`AtomicU64` inline generation) eliminates the same IPC-decode + +//! deletion-filter work for the **scan** path, but the upsert rewrite +//! path bypasses the cache and pays the full cost on every commit. +//! `commit_inlined_data_mutation` -> `build_inlined_data_rewrite_for_pk_keys` +//! is the inner loop of an on-conflict CDC stream where every envelope +//! upserts a single PK; at that shape the redundant decode dominates +//! the per-upsert CPU budget. +//! +//! Two consequences: +//! +//! 1. **Per-upsert fixed cost**: each upsert against a table with +//! 1 MiB of inlined data pays ~100 µs–1 ms of IPC decode plus two +//! metastore round-trips, even though `read_inlined_batches` may +//! have decoded the same payload milliseconds earlier. +//! 2. **Cache-coherence asymmetry**: writers serially invalidate the +//! scan cache (good), but each writer's *own* rewrite step then +//! re-pays the decode cost the cache was designed to amortize. +//! +//! The TigerStyle remedy is to share the existing +//! `read_inlined_batches` cache: have `build_inlined_data_rewrite_for_pk_keys` +//! call `read_inlined_batches` and apply only the new PK filter on top, +//! rather than re-reading and re-decoding `cayenne_inlined_data`. +//! +//! ## What this bench measures +//! +//! Pure CPU shape — no metastore, no Cayenne setup. Models the +//! per-upsert decode + double-filter cost. +//! +//! Two lanes per inline data size: +//! +//! - `decode_and_filter_per_upsert/<rows>` — mirrors today's +//! `build_inlined_data_rewrite_for_pk_keys`: deserialize the IPC +//! payload, build a deletion-mask (legacy inline deletes, modelled +//! as empty since legacy writes are gated on a separate code path), +//! and apply a PK-set filter producing the rewritten batch. +//! - `cached_filter_per_upsert/<rows>` — models the proposed share: +//! start from pre-decoded `Vec<RecordBatch>` (as if reusing the +//! scan cache), then apply only the new PK filter. +//! +//!
Inline sizes mirror `inline_memtable_read_overhead`: +//! +//! - 1 KiB: a single small CDC envelope. +//! - 100 KiB: a few dozen envelopes, typical between checkpoints. +//! - 1 MiB: near the inline-memtable flush threshold. +//! +//! ## How to read +//! +//! `cargo bench --bench inline_upsert_rewrite_overhead -p cayenne`. +//! +//! - `decode_and_filter_per_upsert/32768` (≈1 MiB) is the per-upsert CPU cost a +//! high-conflict CDC stream pays today. At 1000 upserts/sec this is +//! the latency floor below which p99 cannot go. +//! - `cached_filter_per_upsert/32768` (≈1 MiB) is the achievable floor if the +//! rewrite path reuses the scan cache. The ratio is the QPS +//! headroom from the sharing fix. + +#![allow(clippy::expect_used)] + +use std::collections::HashSet; +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{BooleanArray, Int64Array, RecordBatch, StringArray}; +use arrow::compute::filter_record_batch; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::ipc::reader::StreamReader; +use arrow::ipc::writer::StreamWriter; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Row counts straddling realistic inline-memtable sizes: +/// - 64 rows ≈ ~1 KiB IPC payload (one envelope). +/// - 4096 rows ≈ ~100 KiB. +/// - 32768 rows ≈ ~1 MiB (near the typical +/// `inline_flush_max_bytes` threshold). +const INLINE_ROW_COUNTS: &[usize] = &[64, 4_096, 32_768]; + +/// Fraction of inline rows whose PK is in the upsert delete-set on each +/// rewrite. 10 % matches the shape of a CDC stream that occasionally +/// re-keys but is mostly net-new rows; the absolute filter cost is +/// linear in this fraction, but the IPC decode is paid in full +/// regardless.
+const UPSERT_HIT_FRACTION: f64 = 0.10; + +fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ])) +} + +fn make_batch(rows: usize) -> RecordBatch { + let ids: Vec = (0..rows as i64).collect(); + let names: Vec = (0..rows).map(|i| format!("row_{i}")).collect(); + RecordBatch::try_new( + schema(), + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + ], + ) + .expect("batch") +} + +/// PK set the upsert is rewriting. Picks every Nth row, where N is +/// chosen so roughly `UPSERT_HIT_FRACTION` of the rows match. The actual +/// rewrite work `build_inlined_data_rewrite_for_pk_keys` does scales +/// with the **filter mask construction**, not with the number of hits, +/// because the mask is built row-by-row. +fn upsert_pk_set(rows: usize) -> HashSet { + let hits = ((rows as f64) * UPSERT_HIT_FRACTION).max(1.0) as usize; + let stride = rows / hits.max(1); + (0..rows).step_by(stride.max(1)).map(|i| i as i64).collect() +} + +/// Serialize a `RecordBatch` to Arrow IPC bytes — matches the +/// production storage shape (`cayenne_inlined_data.data_ipc` blob). +fn serialize_ipc(batch: &RecordBatch) -> Vec { + let mut buf = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()).expect("writer"); + writer.write(batch).expect("write"); + writer.finish().expect("finish"); + } + buf +} + +/// Mirrors `deserialize_ipc_to_batch` (`table.rs:793`): decode the IPC +/// stream into one or more `RecordBatch`es. +fn deserialize_ipc(blob: &[u8]) -> Vec { + let reader = StreamReader::try_new(blob, None).expect("ipc reader"); + reader + .collect::>>() + .expect("decode") +} + +/// Mirrors `filter_inlined_batch_for_pk_deletions` for the Int64 PK +/// strategy: build a `keep_mask` `Vec<bool>` by probing each row's PK +/// against the upsert delete-set, then materialize the filtered batch.
+fn apply_pk_filter(batch: &RecordBatch, deleted: &HashSet) -> RecordBatch { + let pk_array = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("Int64 PK"); + let mut keep_mask = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keep_mask.push(!deleted.contains(&pk_array.value(row))); + } + let mask = BooleanArray::from(keep_mask); + filter_record_batch(batch, &mask).expect("filter") +} + +/// Lane A: today's per-upsert pattern — decode IPC blob, run +/// (modelled) legacy-deletion filter, then PK-set filter, then +/// materialize the rewritten batch. +fn decode_and_filter_per_upsert(blob: &[u8], deleted: &HashSet) -> usize { + let batches = deserialize_ipc(blob); + let mut total_rows = 0_usize; + for batch in &batches { + // legacy-deletion filter is modelled as a no-op here: in steady + // state writes go through `commit_inlined_data_mutation` which + // never writes `cayenne_inlined_delete`. The decode cost is + // paid in full regardless of legacy-delete population, so this + // accurately captures the per-upsert ceiling. + let filtered = apply_pk_filter(batch, deleted); + total_rows += filtered.num_rows(); + } + black_box(&batches); + total_rows +} + +/// Lane B: cached pre-decoded batches — apply only the new PK filter +/// (no IPC decode, no extra metastore round-trip).
+fn cached_filter_per_upsert(cached: &Arc>, deleted: &HashSet) -> usize { + let mut total_rows = 0_usize; + for batch in cached.iter() { + let filtered = apply_pk_filter(batch, deleted); + total_rows += filtered.num_rows(); + } + total_rows +} + +fn bench_inline_upsert_rewrite(c: &mut Criterion) { + let mut group = c.benchmark_group("inline_upsert_rewrite_overhead"); + for &rows in INLINE_ROW_COUNTS { + let batch = make_batch(rows); + let blob = serialize_ipc(&batch); + let cached = Arc::new(vec![batch.clone()]); + let deleted = upsert_pk_set(rows); + + group.throughput(Throughput::Elements( + u64::try_from(rows).unwrap_or(u64::MAX), + )); + + group.bench_with_input( + BenchmarkId::new("decode_and_filter_per_upsert", rows), + &(blob.clone(), deleted.clone()), + |b, (blob, deleted)| { + b.iter(|| decode_and_filter_per_upsert(black_box(blob.as_slice()), deleted)); + }, + ); + + group.bench_with_input( + BenchmarkId::new("cached_filter_per_upsert", rows), + &(Arc::clone(&cached), deleted), + |b, (cached, deleted)| { + b.iter(|| cached_filter_per_upsert(black_box(cached), deleted)); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_inline_upsert_rewrite); +criterion_main!(benches); diff --git a/crates/cayenne/benches/inner_join_sort_merge_rewrite.rs b/crates/cayenne/benches/inner_join_sort_merge_rewrite.rs new file mode 100644 index 0000000000..49e5d0f872 --- /dev/null +++ b/crates/cayenne/benches/inner_join_sort_merge_rewrite.rs @@ -0,0 +1,283 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: latency cliff caused by `CayenneAntiJoinSortMergeRewriter` +//! firing on `Inner`-joins above the 10M-row build-side threshold. +//! +//! When the same-source inner-join build side exceeds +//! [`crate::ANTI_JOIN_SORT_MERGE_MIN_EXACT_ROWS`] (10M), the rewriter at +//! `crates/cayenne/src/optimizer_rules.rs:360-430` replaces the +//! `HashJoinExec` with `SortMergeJoinExec` + explicit `SortExec` inputs on +//! both sides. The rationale is correctness/safety: `HashJoinExec`'s build +//! side is non-spillable, so a large hash-table can OOM the runtime. +//! +//! But the rewrite is *expensive when the original hash-join would have fit +//! in memory*: +//! +//! - Both inputs are fully materialized and sorted (`SortExec` × 2, +//! `O(N log N)` time per side, plus full-row width in memory or on +//! spill files). +//! - The sort-merge merge pass walks both inputs end-to-end. +//! - Total cost is typically 5–10× the pure-hash-join cost when the +//! build side fits, and uses several times more peak memory because +//! `SortExec` materializes both sides instead of just one hash table. +//! +//! TPC-DS at SF10+ has multiple fact tables above the 10M threshold +//! (`store_sales` ~29M at SF10, `web_sales` ~7M at SF10 grows to +//! ~72M at SF100, `catalog_sales` ~14M at SF10, `inventory` ~117M at SF10), +//! so the rewriter fires on most fact-side inner joins at production scale +//! factors. End-to-end TPC-DS-on-Cayenne shows substantial query-time and +//! memory regressions as a result. The `pairs.yaml` testoperator manifest +//! at `tools/testoperator/dispatch/perf-cayenne-vs-duckdb/` carries the +//! end-to-end benchmark; this Criterion bench is the focused per-rule +//! reproducer. +//! +//! ## What this bench measures +//! +//! Two lanes, identical query shape — a self-join over an int64 key column, +//! aggregating the row count.
The only difference is the preloaded table +//! size: +//! +//! - `below_threshold/<N>` for `N < ANTI_JOIN_SORT_MERGE_MIN_EXACT_ROWS` — +//! the rule does not fire; `HashJoinExec` runs unchanged. +//! - `above_threshold/<N>` for `N > ANTI_JOIN_SORT_MERGE_MIN_EXACT_ROWS` — +//! the rule fires; `SortMergeJoinExec` with `SortExec` inputs runs +//! instead. +//! +//! Because the row counts straddle the threshold by a small margin, the +//! raw data-size delta between the two lanes is modest (~2–3x), but the +//! query-time delta should be much larger if the rewrite is the +//! regression. Criterion's report makes that cliff visible. +//! +//! ## How to read the report +//! +//! After running `cargo bench --bench inner_join_sort_merge_rewrite -p cayenne`, +//! look at `inner_join_sort_merge_rewrite/below_threshold/<N>` versus +//! `inner_join_sort_merge_rewrite/above_threshold/<N>`. If the rewriter is +//! the cause of the TPC-DS regression, the time-per-row in the +//! `above_threshold` lane will be **significantly higher** than in the +//! `below_threshold` lane — disproportionate to the modest table-size +//! delta. +//! +//! A future fix (raise the threshold, make it memory-pool-aware, gate on +//! `cayenne_sort_merge_min_rows`, or split inner-join handling from +//! anti/semi-join handling) should bring the `above_threshold` curve back +//! into line with the `below_threshold` curve, scaled by raw data volume.
+ +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{Int64Array, RecordBatch}; +use arrow::datatypes::{DataType, Field, Schema}; +use cayenne::metadata::CreateTableOptions; +use cayenne::{CayenneCatalog, CayenneTableProvider, MetadataCatalog}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion::datasource::TableProvider; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::prelude::SessionContext; +use datafusion_expr::dml::InsertOp; +use tempfile::TempDir; +use tokio::runtime::Runtime; + +/// Below the rewriter's 10M-row gate +/// (`ANTI_JOIN_SORT_MERGE_MIN_EXACT_ROWS`). +const BELOW_THRESHOLD_ROWS: usize = 5_000_000; +/// Just above the rewriter's 10M-row gate — small margin so the data-size +/// delta vs the below-lane is modest. +const ABOVE_THRESHOLD_ROWS: usize = 12_000_000; + +/// Insert chunk size — chosen large enough that per-burst overhead is +/// amortized but small enough that preloading 12M rows keeps the in-flight +/// batch under a few hundred MB.
+const PRELOAD_CHUNK: usize = 100_000; + +struct BenchTable { + _temp_dir: TempDir, + table: Arc, +} + +fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("value", DataType::Int64, false), + ])) +} + +fn make_batch(start: i64, rows: usize) -> RecordBatch { + let ids = (start..start + rows as i64).collect::>(); + let values = ids.iter().map(|id| id * 100).collect::>(); + RecordBatch::try_new( + schema(), + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(Int64Array::from(values)), + ], + ) + .expect("batch") +} + +async fn append_batch(table: &Arc, batch: RecordBatch) -> u64 { + let ctx = SessionContext::new(); + let input_schema = Arc::clone(batch.schema_ref()); + let input_exec = + MemorySourceConfig::try_new_exec(&[vec![batch]], input_schema, None).expect("memory exec"); + let insert_plan = table + .insert_into(&ctx.state(), input_exec, InsertOp::Append) + .await + .expect("insert plan"); + let results = datafusion_physical_plan::collect(insert_plan, ctx.task_ctx()) + .await + .expect("insert collect"); + results + .first() + .and_then(|batch| { + batch + .column(0) + .as_any() + .downcast_ref::() + }) + .map_or(0, |rows| rows.value(0)) +} + +async fn setup_table(table_name: &str, rows: usize) -> BenchTable { + let temp_dir = tempfile::tempdir().expect("temp dir"); + let data_path = temp_dir.path().join("data"); + tokio::fs::create_dir_all(&data_path) + .await + .expect("data dir"); + let db_path = temp_dir.path().join("bench.db"); + let catalog = Arc::new( + CayenneCatalog::new(format!("sqlite://{}", db_path.to_string_lossy())).expect("catalog"), + ); + catalog.init().await.expect("catalog init"); + + let ctx = SessionContext::new(); + let table = Arc::new( + CayenneTableProvider::create_table( + Arc::clone(&catalog) as Arc, + CreateTableOptions { + table_name: table_name.to_string(), + schema: schema(), + primary_key: vec![], + on_conflict: None, + base_path: data_path.to_string_lossy().to_string(), + 
partition_column: None, + vortex_config: cayenne::metadata::VortexConfig::default(), + }, + ctx.runtime_env(), + ) + .await + .expect("table"), + ); + + let mut written: usize = 0; + while written < rows { + let this_chunk = PRELOAD_CHUNK.min(rows - written); + let batch = make_batch(written as i64, this_chunk); + let n = append_batch(&table, batch).await; + assert_eq!(n as usize, this_chunk); + written += this_chunk; + } + + BenchTable { + _temp_dir: temp_dir, + table, + } +} + +/// Run a self-equi-join on `id` aggregating into a single row count. The +/// shape mirrors a TPC-DS fact-table self-join (e.g. `store_sales` +/// joined back to itself by `ss_ticket_number`) — the inner-join build +/// side is the same Cayenne-backed scan as the probe side, so the +/// rewriter's same-source precondition fires. +async fn run_self_join(table: &Arc) -> i64 { + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(table) as Arc) + .expect("register"); + + let df = ctx + .sql( + "SELECT COUNT(*) FROM t AS a INNER JOIN t AS b ON a.id = b.id \ + WHERE a.value > 0 AND b.value > 0", + ) + .await + .expect("sql"); + + let batches = df.collect().await.expect("collect"); + batches + .first() + .and_then(|batch| { + batch + .column(0) + .as_any() + .downcast_ref::() + }) + .map(|arr| arr.value(0)) + .unwrap_or(0) +} + +fn bench_inner_join_sort_merge_rewrite(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("inner_join_sort_merge_rewrite"); + // The preload is multi-second; cap samples to keep bench wall-time + // bounded while still resolving the regression cliff. + group.sample_size(10); + + // Preload each lane ONCE before the timing loop. Query lanes are + // pure reads, so the same fixture can be reused across all samples.
+ let below = Arc::new(rt.block_on(setup_table("below_bench", BELOW_THRESHOLD_ROWS))); + let above = Arc::new(rt.block_on(setup_table("above_bench", ABOVE_THRESHOLD_ROWS))); + + { + let below = Arc::clone(&below); + group.bench_with_input( + BenchmarkId::new("below_threshold", BELOW_THRESHOLD_ROWS), + &BELOW_THRESHOLD_ROWS, + |b, &_| { + b.iter(|| { + rt.block_on(async { + let n = run_self_join(&below.table).await; + black_box(n); + }); + }); + }, + ); + } + + { + let above = Arc::clone(&above); + group.bench_with_input( + BenchmarkId::new("above_threshold", ABOVE_THRESHOLD_ROWS), + &ABOVE_THRESHOLD_ROWS, + |b, &_| { + b.iter(|| { + rt.block_on(async { + let n = run_self_join(&above.table).await; + black_box(n); + }); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_inner_join_sort_merge_rewrite); +criterion_main!(benches); diff --git a/crates/cayenne/benches/metastore_connection_contention.rs b/crates/cayenne/benches/metastore_connection_contention.rs new file mode 100644 index 0000000000..d85ac608a2 --- /dev/null +++ b/crates/cayenne/benches/metastore_connection_contention.rs @@ -0,0 +1,225 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: cross-table CDC throughput ceiling from the +//! single-connection metastore mutex. +//! +//! `SqliteMetastore` (`crates/cayenne/src/metastore/sqlite.rs:38-50`) and +//! `TursoMetastore` (`crates/cayenne/src/metastore/turso.rs`) each hold one +//! 
`tokio::sync::Mutex` for the whole catalog: +//! +//! ```ignore +//! pub struct SqliteMetastore { +//! connection_string: String, +//! conn: OnceCell<Arc<Mutex<Connection>>>, +//! } +//! ``` +//! +//! **Every** metastore call from **every** Cayenne table sharing one +//! catalog acquires this same mutex — `execute`, `query`, `query_row`, +//! `begin_transaction`, and the newer `execute_transaction_batch` (added +//! to halve the in-checkpoint round-trips, but still funneling through +//! the same connection). The mutex is held across each `.await` of the +//! underlying `tokio_rusqlite` call, so concurrent CDC commits from +//! different tables serialize on this mutex. +//! +//! Under a workload with **N** independently-replicating tables (the +//! CH-benCH SF100 retest had 14), the metastore-bound term of every +//! commit becomes `N · RTT` instead of `RTT` — a 14× ceiling on +//! aggregate metastore throughput at the SF100 shape. This matches the +//! observed behavior in the May 15 2026 retest: 6 of 14 tables +//! accumulated hundreds of MB of un-drained WAL while the +//! low-write-volume probe table stayed current — the probe's commit +//! waited behind the high-volume tables on the shared mutex, and any +//! table whose Postgres-side WAL rate exceeded +//! `(mutex_throughput / N_tables)` fell permanently behind. +//! +//! The fix is a connection pool of K independent +//! `tokio_rusqlite::Connection` instances behind a pool primitive +//! (`bb8`, `deadpool`, or a simple `Vec<Mutex<Connection>>`). K = N +//! lifts the ceiling entirely; K = small constant > 1 lifts it +//! proportionally. SQLite-WAL allows concurrent readers + one writer at +//! a time, so with K connections reads no longer queue behind an +//! in-flight write; write transactions still serialize inside SQLite, +//! but no longer behind unrelated calls on the in-process Rust mutex. +//! Turso's MVCC supports `BEGIN CONCURRENT` so it gains even more from K > 1. +//! +//! ## What this bench measures +//! +//! Pure mutex contention pattern — no real SQLite, no on-disk work. +//!
Simulated per-call metastore work is `tokio::time::sleep(rtt)` (one +//! RTT models the full `execute_transaction_batch` round trip after the +//! iteration-3 fix landed in `cayenne_catalog.rs:1716`). Isolates the +//! scheduling pattern (single mutex vs pooled connections) from +//! SQLite-specific cost. +//! +//! Two lanes per `(N_tables, RTT)` pair: +//! +//! - `current_single_mutex/N=...` β€” all N workers contend on one +//! `tokio::sync::Mutex<()>`. Total wall time β‰ˆ `N Β· commits Β· RTT` +//! because the mutex serializes every commit. +//! - `achievable_per_table_pool/N=...` β€” each worker has its own +//! `tokio::sync::Mutex<()>` (modeling a per-table connection in a +//! pool of size K = N). Total wall time β‰ˆ `commits Β· RTT` because +//! the N workers run in true parallel. +//! +//! ## How to read +//! +//! `cargo bench --bench metastore_connection_contention -p cayenne`. +//! The throughput report makes the ceiling visible: +//! +//! - `current_single_mutex/N=14/rtt_10ms` throughput is ~100 commits/s +//! total regardless of N β€” that's the per-process metastore cap. +//! - `achievable_per_table_pool/N=14/rtt_10ms` is ~1400 commits/s β€” +//! one RTT batch in parallel. +//! +//! At SF100's 14 tables, the gap is 14Γ—. At SF1000 with more tables +//! (or more concurrent compactions / catalog operations) the gap grows +//! linearly. **The `current_single_mutex` lane is the metastore-bound +//! throughput ceiling Spice's CDC pipeline cannot exceed today.** +//! +//! The bench also exercises two RTTs (`rtt_1ms` for local SQLite with +//! WAL+normal-sync, `rtt_10ms` for a network metastore like Turso) so +//! the ceiling is legible in both deployment shapes. + +#![allow(clippy::expect_used)] + +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use tokio::runtime::Runtime; +use tokio::sync::Mutex; + +/// Per-call simulated metastore round trip. 
After the iteration-3 fix +/// (`execute_transaction_batch`), one commit β‰ˆ one round trip. Two +/// realistic shapes: +/// - 1 ms: local SQLite, WAL mode, NORMAL sync (Cayenne's default +/// tokio-rusqlite config β€” see `metastore/sqlite.rs:97-108`). +/// - 10 ms: same-zone network metastore (Turso, managed Postgres). +const RTTS: &[(&str, Duration)] = &[ + ("rtt_1ms", Duration::from_millis(1)), + ("rtt_10ms", Duration::from_millis(10)), +]; + +/// CDC table counts. 14 matches CH-benCH (12 TPC-C tables + 1 probe + +/// 1 marker, per the May 15 2026 SF100 retest). 4 is a typical small +/// pipeline. 32 stresses the ceiling at higher cardinality. +const TABLE_COUNTS: &[usize] = &[4, 14, 32]; + +/// Commits per worker per iteration. Picked so the simulated total +/// work lands in the low-millisecond range at `rtt_1ms` and the +/// high-millisecond range at `rtt_10ms` β€” Criterion can collect 10+ +/// samples in 2 s. +const COMMITS_PER_WORKER: usize = 8; + +/// One simulated CDC commit: acquire the connection mutex, do the +/// metastore round trip, release. Models the metastore-bound term of +/// `CayenneCatalog::commit_compaction` / +/// `clear_inlined_data_and_deletes` / `commit_inlined_mutation` β€” +/// after the iteration-3 fix, all of these are single-batch +/// `execute_transaction_batch` calls. +async fn one_commit(mutex: &Mutex<()>, rtt: Duration) { + let _guard = mutex.lock().await; + tokio::time::sleep(rtt).await; +} + +/// Lane A: all workers contend on one `Mutex<()>` β€” mirrors today's +/// `SqliteMetastore.conn`. 
+async fn run_single_mutex(n_tables: usize, rtt: Duration) { + let mutex = Arc::new(Mutex::new(())); + let mut handles = Vec::with_capacity(n_tables); + for _ in 0..n_tables { + let mutex = Arc::clone(&mutex); + handles.push(tokio::spawn(async move { + for _ in 0..COMMITS_PER_WORKER { + one_commit(&mutex, rtt).await; + } + })); + } + for h in handles { + h.await.expect("join"); + } + black_box(mutex); +} + +/// Lane B: each worker has its own `Mutex<()>` β€” models a connection +/// pool sized at N (one connection per table). +async fn run_per_table_pool(n_tables: usize, rtt: Duration) { + let mutexes: Vec>> = (0..n_tables).map(|_| Arc::new(Mutex::new(()))).collect(); + let mut handles = Vec::with_capacity(n_tables); + for mutex in &mutexes { + let mutex = Arc::clone(mutex); + handles.push(tokio::spawn(async move { + for _ in 0..COMMITS_PER_WORKER { + one_commit(&mutex, rtt).await; + } + })); + } + for h in handles { + h.await.expect("join"); + } + black_box(mutexes); +} + +fn bench_metastore_connection_contention(c: &mut Criterion) { + // Multi-thread runtime β€” the contention story requires multiple + // worker threads. A current-thread runtime would serialize every + // task and hide the gap. 
+ let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("tokio runtime"); + + let mut group = c.benchmark_group("metastore_connection_contention"); + for &(rtt_label, rtt) in RTTS { + for &n in TABLE_COUNTS { + let commits_total = u64::try_from(n * COMMITS_PER_WORKER).unwrap_or(u64::MAX); + group.throughput(Throughput::Elements(commits_total)); + + let id = format!("N={n}/{rtt_label}"); + group.bench_with_input( + BenchmarkId::new("current_single_mutex", &id), + &n, + |b, &n| { + b.to_async(&rt).iter(|| async move { + run_single_mutex(n, rtt).await; + }); + }, + ); + + group.bench_with_input( + BenchmarkId::new("achievable_per_table_pool", &id), + &n, + |b, &n| { + b.to_async(&rt).iter(|| async move { + run_per_table_pool(n, rtt).await; + }); + }, + ); + } + } + group.finish(); +} + +criterion_group!(benches, bench_metastore_connection_contention); +criterion_main!(benches); + +#[allow(dead_code)] +fn _runtime_local_for_clippy() -> Runtime { + Runtime::new().expect("runtime") +} diff --git a/crates/cayenne/benches/mutation_writer.rs b/crates/cayenne/benches/mutation_writer.rs index 5b12a3d8e2..23edba9d34 100644 --- a/crates/cayenne/benches/mutation_writer.rs +++ b/crates/cayenne/benches/mutation_writer.rs @@ -330,6 +330,75 @@ fn bench_directory_durability_primitives(c: &mut Criterion) { ); }); + // Quantifies the cost a "duplicate directory fsync" regression imposes on + // the staged-append commit path. The two benchmarks below replicate the + // exact post-rename pattern used in `move_staging_files_local` (open the + // directory + `sync_all` on the inode). The duplicate variant calls it + // back-to-back without any filesystem mutation in between β€” semantically + // identical to a single fsync on the same on-disk state, but pays the + // syscall and journal cost twice. + // + // Concretely: a previous revision of `move_staging_files_local` + // accidentally fsynced `target_dir` twice in a row. 
This is the cost it + // added per staged-append commit on local FS. If anyone reintroduces a + // duplicate fsync on this hot path, the `duplicate_dir_fsync` line of + // this group will be ~2Γ— the `single_dir_fsync` line in the criterion + // report, making the regression obvious. + group.bench_function("single_dir_fsync", |b| { + b.iter_batched( + || { + let temp = tempfile::tempdir().expect("tempdir for bench"); + let dir = temp.path().join("target_snapshot"); + std::fs::create_dir_all(&dir).expect("create snapshot dir"); + (temp, dir) + }, + |(_keep_alive, dir)| { + rt.block_on(async { + let path = dir.clone(); + tokio::task::spawn_blocking(move || { + let f = std::fs::File::open(&path).expect("open dir"); + f.sync_all().expect("fsync dir"); + }) + .await + .expect("join"); + black_box(dir); + }); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("duplicate_dir_fsync", |b| { + b.iter_batched( + || { + let temp = tempfile::tempdir().expect("tempdir for bench"); + let dir = temp.path().join("target_snapshot"); + std::fs::create_dir_all(&dir).expect("create snapshot dir"); + (temp, dir) + }, + |(_keep_alive, dir)| { + rt.block_on(async { + let path1 = dir.clone(); + tokio::task::spawn_blocking(move || { + let f = std::fs::File::open(&path1).expect("open dir"); + f.sync_all().expect("fsync dir"); + }) + .await + .expect("join1"); + let path2 = dir.clone(); + tokio::task::spawn_blocking(move || { + let f = std::fs::File::open(&path2).expect("open dir"); + f.sync_all().expect("fsync dir"); + }) + .await + .expect("join2"); + black_box(dir); + }); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); } diff --git a/crates/cayenne/benches/sorted_append_overhead.rs b/crates/cayenne/benches/sorted_append_overhead.rs new file mode 100644 index 0000000000..e31166e9b3 --- /dev/null +++ b/crates/cayenne/benches/sorted_append_overhead.rs @@ -0,0 +1,236 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the 
"License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-append cost when `sort_columns` is configured. +//! +//! Guards the sort-on-compact write path: a single append on a table with +//! `cayenne_sort_columns` set should stay proportional to the incoming append +//! size, not to the preloaded table size. Sort work is intentionally deferred +//! to compaction; the append path should not call +//! `CayenneTableProvider::sort_and_rewrite_data` after every write. +//! +//! For comparison the bench also measures the same append on an otherwise +//! identical table without `sort_columns`. That lane stays roughly constant +//! in the preload size because the append path is `O(K)` where `K` is the +//! incoming row count (no full-table rewrite). +//! +//! ## Why this matters +//! +//! Sustained CDC ingestion on a sort-column table should not rewrite the full +//! table on every coalesced burst. The benchmark is intentionally a regression +//! test: sorted and unsorted append lanes should both stay roughly constant in +//! the preload size. +//! +//! ## How to read the report +//! +//! Criterion will produce one group `sorted_append_overhead/{lane}/{preload}`. +//! Look for: +//! - `unsorted/` time roughly constant across preload sizes β€” the +//! `O(K)` baseline. +//! - `sorted/` time roughly constant across preload sizes β€” if it +//! grows with preload size, the write path has regressed back into doing +//! full-table sort rewrites. +//! +//! The append payload size is held constant at [`APPEND_ROWS`] across all +//! 
cases so the only varying input is the preloaded table size. + +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{Int64Array, StringArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatch; +use cayenne::metadata::{CreateTableOptions, VortexConfig}; +use cayenne::{CayenneCatalog, CayenneTableProvider, MetadataCatalog}; +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use datafusion::datasource::TableProvider; +use datafusion::datasource::memory::MemorySourceConfig; +use datafusion::prelude::SessionContext; +use datafusion_expr::dml::InsertOp; +use tempfile::TempDir; +use tokio::runtime::Runtime; + +const APPEND_ROWS: usize = 1_024; +const PRELOAD_SIZES: &[usize] = &[8_192, 65_536, 524_288]; + +struct BenchTable { + _temp_dir: TempDir, + table: Arc, + schema: Arc, +} + +fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ])) +} + +fn make_batch(schema: Arc, start: i64, rows: usize) -> RecordBatch { + let ids = (start..start + rows as i64).collect::>(); + let names = ids + .iter() + .map(|id| format!("name_{id}")) + .collect::>(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + ], + ) + .expect("batch") +} + +async fn append_batch(table: &Arc, batch: RecordBatch) -> u64 { + let ctx = SessionContext::new(); + let input_schema = Arc::clone(batch.schema_ref()); + let input_exec = + MemorySourceConfig::try_new_exec(&[vec![batch]], input_schema, None).expect("memory exec"); + let insert_plan = table + .insert_into(&ctx.state(), input_exec, InsertOp::Append) + .await + .expect("insert plan"); + let results = datafusion_physical_plan::collect(insert_plan, ctx.task_ctx()) + .await + .expect("insert collect"); + results + .first() + 
.and_then(|batch| { + batch + .column(0) + .as_any() + .downcast_ref::() + }) + .map_or(0, |rows| rows.value(0)) +} + +async fn setup_table(table_name: &str, sorted: bool) -> BenchTable { + let temp_dir = tempfile::tempdir().expect("temp dir"); + let data_path = temp_dir.path().join("data"); + tokio::fs::create_dir_all(&data_path) + .await + .expect("data dir"); + let db_path = temp_dir.path().join("bench.db"); + let catalog = Arc::new( + CayenneCatalog::new(format!("sqlite://{}", db_path.to_string_lossy())).expect("catalog"), + ); + catalog.init().await.expect("catalog init"); + + let mut vortex_config = VortexConfig::default(); + if sorted { + vortex_config.sort_columns = vec!["id".to_string()]; + } + + let ctx = SessionContext::new(); + let schema = schema(); + let table = Arc::new( + CayenneTableProvider::create_table( + Arc::clone(&catalog) as Arc, + CreateTableOptions { + table_name: table_name.to_string(), + schema: Arc::clone(&schema), + primary_key: vec![], + on_conflict: None, + base_path: data_path.to_string_lossy().to_string(), + partition_column: None, + vortex_config, + }, + ctx.runtime_env(), + ) + .await + .expect("table"), + ); + + BenchTable { + _temp_dir: temp_dir, + table, + schema, + } +} + +/// Preload `rows` rows into the table, chunked so no single insert +/// dominates the preload time and so that the sorted-table preload itself +/// is representative of steady-state apply (each chunk triggers +/// `sort_and_rewrite_data` exactly the way a CDC burst would). 
+async fn preload(bench: &BenchTable, rows: usize) { + const CHUNK: usize = 4_096; + let mut written: usize = 0; + while written < rows { + let this_chunk = CHUNK.min(rows - written); + let batch = make_batch(Arc::clone(&bench.schema), written as i64, this_chunk); + let n = append_batch(&bench.table, batch).await; + assert_eq!(n as usize, this_chunk); + written += this_chunk; + } +} + +fn bench_sorted_append_overhead(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("sorted_append_overhead"); + // Sorted preloads at the largest preload size dominate setup time; keep + // the sample count low to bound total bench wall time. + group.sample_size(10); + + for &preload_rows in PRELOAD_SIZES { + group.throughput(Throughput::Elements(APPEND_ROWS as u64)); + + for sorted in [false, true] { + let lane = if sorted { "sorted" } else { "unsorted" }; + + group.bench_with_input( + BenchmarkId::new(lane, preload_rows), + &preload_rows, + |b, &preload_rows| { + b.iter_batched( + || { + rt.block_on(async { + let bench = setup_table("sorted_append_bench", sorted).await; + preload(&bench, preload_rows).await; + bench + }) + }, + |bench| { + rt.block_on(async { + let batch = make_batch( + Arc::clone(&bench.schema), + preload_rows as i64, + APPEND_ROWS, + ); + let written = append_batch(&bench.table, batch).await; + black_box((bench, written)); + }); + }, + // Preload reuses a fresh temp dir per iteration; the + // sorted-preload cost is bounded by APPEND_ROWS but + // the dataset is reset between iterations, so use + // `PerIteration` rather than the cheaper + // `LargeInput`. 
+ BatchSize::PerIteration, + ); + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_sorted_append_overhead); +criterion_main!(benches); diff --git a/crates/cayenne/benches/staging_move_concurrency.rs b/crates/cayenne/benches/staging_move_concurrency.rs new file mode 100644 index 0000000000..f496145783 --- /dev/null +++ b/crates/cayenne/benches/staging_move_concurrency.rs @@ -0,0 +1,239 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-file serial latency of the S3 staged-file move +//! during the CDC pipelined finalize barrier. +//! +//! `CayenneTableProvider::move_staging_files_s3` +//! (`crates/cayenne/src/provider/table.rs:2122-2221`) moves files out of +//! `_staging//` into the live snapshot directory in two phases: +//! +//! ```ignore +//! // Phase 1: copy +//! for meta in &objects { +//! store.copy(&meta.location, &target_path).await?; +//! copied_locations.push(meta.location.clone()); +//! } +//! // Phase 2: delete staging originals +//! for location in &copied_locations { +//! store.delete(location).await?; +//! } +//! ``` +//! +//! Both phases iterate **serially** with `.await` between each S3 round trip. +//! The move runs under `apply_under_barrier` +//! (`crates/cayenne/src/provider/staging_wal.rs:307-333`), which holds +//! `visibility_lock` plus the `listing_fence` write guard +//! (`table.rs:880`) across the entire move. Every concurrent scan that +//! 
reaches `listing_fence.read().await` blocks until the move completes. +//! +//! For a CDC burst that produced `N` Vortex files, the held-fence time +//! includes `2 · N · RTT_s3` — copy RTT plus delete RTT per file. On S3 +//! with ~10–30 ms per op, a 64-file burst stalls every reader for +//! ~1.3–3.8 s. The same antipattern exists in: +//! +//! - `crates/cayenne/src/provider/table.rs:1721-1731` (`delete_prefix_with_object_store`) +//! - `crates/cayenne/src/provider/partitioned_wal.rs:287-307` (3 S3 ops where 1 suffices) +//! +//! The fix is `stream::iter(...).map(...).buffer_unordered(parallelism).try_collect()` +//! — a small constant change that brings the fence-held time down to +//! `RTT_s3 · (N / parallelism) + RTT_s3 · (N / parallelism)`. For +//! `parallelism=16` and N=64 that is ~8 RTTs total instead of 128. +//! +//! ## What this bench measures +//! +//! Two lanes, identical work — move `N` 4 KiB objects between two +//! `object_store::memory::InMemory` prefixes. Per-op latency is simulated +//! by `tokio::time::sleep(SIMULATED_S3_RTT)` immediately before each +//! `copy` / `delete`. This isolates the scheduling pattern (serial loop +//! vs `buffer_unordered`) from real-network jitter. +//! +//! - `staging_move/current_serial/<N>` — mirrors the loop in +//! `move_staging_files_s3`. Time grows linearly with `N`. +//! - `staging_move/achievable_concurrent/<N>` — `buffer_unordered(16)` +//! over both phases. Time grows as `N / 16` (one RTT batch + a tail). +//! +//! Both lanes use the same store, the same byte payload, and the same +//! source/destination paths so the only difference is dispatch pattern. +//! +//! ## How to read the report +//! +//! After `cargo bench --bench staging_move_concurrency -p cayenne`: +//! +//! - Look at `staging_move/current_serial/64` vs +//! `staging_move/achievable_concurrent/64`. The ratio is approximately +//! `(64 * 2) / (ceil(64 / 16) * 2)` = 128 / 8 = 16×. That ratio is the +//!
/// Simulated S3 per-op round trip, slept immediately before every `copy`
/// and `delete` in both lanes.
///
/// The requested delay is 50 µs. NOTE(review): `tokio::time::sleep` is
/// documented to operate at millisecond granularity, so each simulated op
/// likely costs on the order of a millisecond rather than 50 µs, and the
/// serial lane at 256 files (512 sleeps) likely runs for hundreds of
/// milliseconds per iteration — confirm against a real run before relying
/// on absolute numbers. Both lanes sleep the same way, so the
/// serial-vs-concurrent comparison remains meaningful.
const SIMULATED_S3_RTT: Duration = Duration::from_micros(50);

/// File counts straddle the small-burst / large-burst boundary. 16 is a
/// typical CDC append (a few small Vortex files); 256 is a fan-out burst
/// or a partitioned table.
const FILE_COUNTS: &[usize] = &[16, 64, 256];

/// Concurrency for the achievable-concurrent lane. Matches a reasonable
/// `buffer_unordered` width for S3 — large enough to saturate, small
/// enough to avoid hammering the underlying store with thousands of
/// in-flight requests.
const CONCURRENCY: usize = 16;
+const PAYLOAD_BYTES: usize = 4 * 1024; + +fn payload() -> PutPayload { + PutPayload::from(vec![0u8; PAYLOAD_BYTES]) +} + +fn staging_path(i: usize) -> ObjectStorePath { + ObjectStorePath::from(format!("_staging/burst-1/data-{i:06}.vortex")) +} + +fn target_path(i: usize) -> ObjectStorePath { + ObjectStorePath::from(format!("current/data-{i:06}.vortex")) +} + +/// Seed `n` staging files in a fresh `InMemory` store. The cost of this +/// setup is deliberately outside Criterion's measurement window via +/// `iter_batched`. +async fn seed_store(n: usize) -> Arc { + let store = Arc::new(InMemory::new()); + for i in 0..n { + store + .put(&staging_path(i), payload()) + .await + .expect("seed put"); + } + store +} + +/// Mirrors `CayenneTableProvider::move_staging_files_s3` +/// (`crates/cayenne/src/provider/table.rs:2122-2221`): Phase 1 copies +/// every staged file to the target prefix serially, then Phase 2 deletes +/// each staged original serially. Each `.await` represents one S3 round +/// trip held under the listing-fence write guard. +async fn serial_copy_then_delete(store: Arc, n: usize) { + let mut copied = Vec::with_capacity(n); + // Phase 1: copy. + for i in 0..n { + let src = staging_path(i); + let dst = target_path(i); + tokio::time::sleep(SIMULATED_S3_RTT).await; + store.copy(&src, &dst).await.expect("copy"); + copied.push(src); + } + // Phase 2: delete. + for src in &copied { + tokio::time::sleep(SIMULATED_S3_RTT).await; + store.delete(src).await.expect("delete"); + } +} + +/// Achievable pattern: `buffer_unordered` across both phases. Same +/// two-phase ordering as the serial variant (Phase 2 only begins after +/// Phase 1 fully drains) so crash-safety semantics are preserved. +async fn concurrent_copy_then_delete(store: Arc, n: usize) { + // Phase 1: copy. 
+ let store_phase1 = Arc::clone(&store); + stream::iter(0..n) + .map(|i| { + let store = Arc::clone(&store_phase1); + async move { + let src = staging_path(i); + let dst = target_path(i); + tokio::time::sleep(SIMULATED_S3_RTT).await; + store.copy(&src, &dst).await + } + }) + .buffer_unordered(CONCURRENCY) + .try_collect::>() + .await + .expect("phase 1 copy"); + + // Phase 2: delete. + let store_phase2 = Arc::clone(&store); + stream::iter(0..n) + .map(|i| { + let store = Arc::clone(&store_phase2); + async move { + let src = staging_path(i); + tokio::time::sleep(SIMULATED_S3_RTT).await; + store.delete(&src).await + } + }) + .buffer_unordered(CONCURRENCY) + .try_collect::>() + .await + .expect("phase 2 delete"); +} + +fn bench_staging_move(c: &mut Criterion) { + let rt = Runtime::new().expect("tokio runtime"); + + let mut group = c.benchmark_group("staging_move"); + // Throughput per file makes the per-file scheduling cost legible in + // Criterion's report. + for &n in FILE_COUNTS { + group.throughput(Throughput::Elements(u64::try_from(n).unwrap_or(u64::MAX))); + + // Setup runs inside the async body β€” `iter_batched` with a sync + // closure cannot use `Runtime::block_on` because it executes inside + // the runtime that `to_async` has already entered. The per-iteration + // seed cost is `n` cheap `InMemory::put` calls (no simulated RTT) + // and is identical across both lanes, so it does not skew the + // serial-vs-concurrent ratio that this bench measures. 
+ group.bench_with_input(BenchmarkId::new("current_serial", n), &n, |b, &n| { + b.to_async(&rt).iter(|| async move { + let store = seed_store(n).await; + serial_copy_then_delete(black_box(store), black_box(n)).await; + }); + }); + + group.bench_with_input(BenchmarkId::new("achievable_concurrent", n), &n, |b, &n| { + b.to_async(&rt).iter(|| async move { + let store = seed_store(n).await; + concurrent_copy_then_delete(black_box(store), black_box(n)).await; + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_staging_move); +criterion_main!(benches); diff --git a/crates/cayenne/benches/validate_on_conflict_buffering.rs b/crates/cayenne/benches/validate_on_conflict_buffering.rs new file mode 100644 index 0000000000..024c7aa7ee --- /dev/null +++ b/crates/cayenne/benches/validate_on_conflict_buffering.rs @@ -0,0 +1,234 @@ +/* +Copyright 2026 The Spice.ai OSS Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Regression bench: per-commit cost of `validate_on_conflict`'s +//! unbounded buffering on the CDC ingestion path. +//! +//! `CayenneTableProvider::validate_on_conflict` +//! (`crates/cayenne/src/provider/table.rs:3491-3571`) drains the *entire* +//! incoming CDC batch into three heap-resident structures **before any +//! Vortex file is written**: +//! +//! ```ignore +//! while let Some(batch_result) = stream.next().await { +//! ... +//! incoming_keys.extend(kept_keys.iter().cloned()); // HashSet +//! all_kept_keys.extend(kept_keys); // HashSet +//! 
if let Some(batch) = filtered_batch { +//! filtered_batches.push(batch); // Vec +//! } +//! } +//! ``` +//! +//! Triggered on every CDC commit when +//! `pk_conflict_detection: Auto` (the default). The CH-benCH SF100 +//! retest reported 6 of 14 tables accumulating hundreds of MB of +//! un-drained WAL under sustained write load β€” this materialization is +//! one of the largest per-commit fixed costs, and it sits on the +//! critical path **before** any Vortex write begins. +//! +//! With `cdc_max_coalesced_bytes: 256 MB` (the SF100 spicepod default), +//! one coalesced burst allocates up to that much heap on the input +//! decode side, plus an `OwnedRow` for every row (~16-64 bytes +//! depending on PK shape), plus a `HashSet` entry for every +//! row. For PK-heavy tables (customer with ~500-byte rows updating per +//! Payment, stock with ~10 updates per NewOrder) this is the +//! commit-rate bottleneck after the metastore round trip. +//! +//! The TigerStyle remedy is a **bounded staging buffer**: pre-allocate +//! a fixed cap (e.g. 64 MiB), stream batches through dedup with only a +//! sliding window of keys, and apply backpressure to the upstream CDC +//! source when full. Today there is no cap and no backpressure. +//! +//! ## What this bench measures +//! +//! Pure shape β€” no Vortex, no metastore, no Cayenne setup. Models the +//! drain-into-Vec + grow-HashSet pattern on a synthetic CDC stream of +//! M batches Γ— K rows each, using a fixed PK width that matches Arrow +//! `RowConverter::convert_columns` output (16 bytes β€” same shape as +//! the production `OwnedRow` for a single `Int64` or `Decimal` PK). +//! +//! Two lanes: +//! +//! - `current_unbounded_accumulation/` mirrors today's +//! `validate_on_conflict`. Heap grows linearly with `MΒ·K`. +//! - `bounded_streaming/` processes each batch in isolation, drops +//! `filtered_batches` after handing off, and uses a sliding `dedup_window` +//! of only the most recent batch's keys. 
Heap stays constant at `K` +//! entries regardless of `M`. +//! +//! ## How to read +//! +//! `cargo bench --bench validate_on_conflict_buffering -p cayenne`. +//! Compare: +//! +//! - `current_unbounded_accumulation/M=512` (β‰ˆ a 256 MB CDC burst at +//! 1 KiB/row) β€” wall time scales linearly with `M` because each +//! batch adds K HashSet inserts plus a `RecordBatch` clone into the +//! growing Vec. The slope per batch is the per-commit overhead the +//! SF100 retest is paying. +//! - `bounded_streaming/M=512` β€” wall time is roughly constant per +//! batch, scaling with total `MΒ·K` work but with no per-batch alloc +//! growth. The gap visualizes the achievable per-commit cost. +//! +//! The ratio between lanes at `M=512` is the per-commit-cost overhead +//! that the materialization adds. At `M=512, K=1024` it is the cost +//! difference between writing 512 batches one-at-a-time vs first +//! collecting them all into a Vec then writing. + +#![allow(clippy::expect_used)] + +use std::collections::HashSet; +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; + +/// Number of rows per batch β€” matches a typical CDC envelope after +/// `cdc_max_coalesced_envelopes: 1024`. Each "row" is one fixed-width +/// PK encoding. +const ROWS_PER_BATCH: usize = 1024; + +/// Fixed PK width. 16 bytes matches Arrow `RowConverter` output for a +/// single `Int64` or `Decimal128` column with the standard row encoding +/// header. Most TPC-C / CH-benCH PKs are single integer columns; the +/// widest realistic PK in CH-benCH is `customer(c_w_id, c_d_id, c_id)` +/// which encodes to ~24 bytes β€” same order of magnitude. +const PK_WIDTH: usize = 16; + +/// Batch counts straddling the typical CDC burst sizes: +/// - 8 batches β‰ˆ a single small append (8 K rows total). +/// - 64 batches β‰ˆ a moderate coalesced burst (64 K rows). 
+/// - 512 batches β‰ˆ a full `cdc_max_coalesced_bytes: 256 MB` burst at +/// ~512 KiB per batch. +const BATCH_COUNTS: &[usize] = &[8, 64, 512]; + +type PkKey = [u8; PK_WIDTH]; + +/// Stand-in for `RecordBatch` β€” a `Box<[u8]>` payload sized to roughly +/// match a 1 KiB-per-row Arrow batch. The exact shape does not matter; +/// what matters is that pushing into a `Vec` clones an +/// `Arc`-equivalent (here, moves a `Box`) and holding many of them +/// retains heap memory. +struct Batch { + keys: Vec, + /// Dummy payload representing the column data β€” `Box<[u8]>` so the + /// allocation is real and `Vec` retains memory linearly + /// with `M`. + _payload: Box<[u8]>, +} + +fn make_batch(batch_idx: usize) -> Batch { + let mut keys = Vec::with_capacity(ROWS_PER_BATCH); + let base = (batch_idx as u64).wrapping_mul(ROWS_PER_BATCH as u64); + for r in 0..ROWS_PER_BATCH { + let mut key = [0u8; PK_WIDTH]; + key[..8].copy_from_slice(&(base + r as u64).to_le_bytes()); + keys.push(key); + } + Batch { + keys, + _payload: vec![0u8; ROWS_PER_BATCH].into_boxed_slice(), + } +} + +/// Mirrors `validate_on_conflict` (`table.rs:3491-3571`): drain stream +/// into Vec, grow HashSet across batches, retain +/// everything until the caller pulls. +fn current_unbounded_accumulation(m: usize) -> usize { + let mut filtered_batches: Vec = Vec::new(); + let mut incoming_keys: HashSet = HashSet::with_capacity(1024); + let mut all_kept_keys: HashSet = HashSet::with_capacity(1024); + + for batch_idx in 0..m { + let batch = make_batch(batch_idx); + + // Per-row dedup: every key from this batch goes into both + // hashsets, mirroring the `incoming_keys.extend(kept_keys.iter().cloned())` + // + `all_kept_keys.extend(kept_keys)` pattern. + for key in &batch.keys { + if !incoming_keys.contains(key) { + incoming_keys.insert(*key); + all_kept_keys.insert(*key); + } + } + + // Retain the batch in the growing Vec. 
+ filtered_batches.push(batch); + } + + // The function does not free `filtered_batches`/`incoming_keys` β€” + // they are returned to the caller and only freed after the + // downstream Vortex write completes. + let kept = filtered_batches.iter().map(|b| b.keys.len()).sum::(); + black_box(&filtered_batches); + black_box(&incoming_keys); + black_box(&all_kept_keys); + kept +} + +/// Bounded streaming alternative: dedup window is at most one batch +/// (or up to a small fixed cap), `filtered_batches` is never retained. +/// Each batch is handed off to a hypothetical downstream consumer and +/// immediately dropped. +fn bounded_streaming(m: usize) -> usize { + let mut total_kept = 0usize; + // Sliding window of recent keys, bounded at `ROWS_PER_BATCH`. In + // production this would be a `parking_lot::Mutex>` + // sized at a few Γ— batch_size, or an LSM-style bloom filter. + let mut window: HashSet = HashSet::with_capacity(ROWS_PER_BATCH); + + for batch_idx in 0..m { + let batch = make_batch(batch_idx); + + window.clear(); + for key in &batch.keys { + if window.insert(*key) { + total_kept += 1; + } + } + + // Hand off batch to downstream β€” modeled as `black_box` so the + // optimizer cannot drop the work. Then the batch is dropped + // immediately, freeing its heap. 
+ black_box(&batch); + } + + total_kept +} + +fn bench_validate_on_conflict_buffering(c: &mut Criterion) { + let mut group = c.benchmark_group("validate_on_conflict_buffering"); + for &m in BATCH_COUNTS { + let total_rows = u64::try_from(m * ROWS_PER_BATCH).unwrap_or(u64::MAX); + group.throughput(Throughput::Elements(total_rows)); + + group.bench_with_input( + BenchmarkId::new("current_unbounded_accumulation", m), + &m, + |b, &m| { + b.iter(|| current_unbounded_accumulation(black_box(m))); + }, + ); + + group.bench_with_input(BenchmarkId::new("bounded_streaming", m), &m, |b, &m| { + b.iter(|| bounded_streaming(black_box(m))); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_validate_on_conflict_buffering); +criterion_main!(benches); diff --git a/crates/cayenne/benches/vs_duckdb_burst.rs b/crates/cayenne/benches/vs_duckdb_burst.rs new file mode 100644 index 0000000000..515e837f95 --- /dev/null +++ b/crates/cayenne/benches/vs_duckdb_burst.rs @@ -0,0 +1,112 @@ +// Copyright 2026 The Spice.ai OSS Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 + +//! CDC-style append-burst throughput: Cayenne vs DuckDB. +//! +//! Models the apply-loop pattern from the runtime's CDC pipeline: many small +//! bursts arriving back-to-back, each going through the engine's full +//! per-burst commit path (Cayenne: inline + staged-WAL finalize + listing +//! refresh + stats persist; DuckDB: per-statement WAL append + B-tree update). +//! +//! Each iteration writes `burst_count` bursts of `burst_rows` rows each. The +//! timed region covers all bursts so per-burst fixed cost is amortized into +//! the throughput number. The total row count is the Criterion throughput +//! denominator, so the result is directly comparable across engines and +//! lanes. +//! +//! 
Lanes (compile-time gated): +//! - `cayenne` β€” Cayenne with the SQLite metastore (default) +//! - `cayenne_turso` β€” Cayenne with the Turso metastore (--features turso) +//! - `duckdb` β€” DuckDB file-mode with `INSERT INTO ... VALUES (...)` + +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] +#![allow(clippy::cast_possible_truncation)] + +#[path = "vs_duckdb_helpers/common.rs"] +mod common; + +use std::hint::black_box; + +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use tokio::runtime::Runtime; + +use common::{ + CAYENNE_LANES, cayenne_insert, duckdb_insert_rows, make_batch, schema, setup_cayenne_for, + setup_duckdb, +}; + +const BURST_ROWS: usize = 64; +const BURST_COUNTS: &[usize] = &[16, 64, 256]; + +fn bench_burst(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("vs_duckdb_burst"); + group.sample_size(10); + + for &burst_count in BURST_COUNTS { + let total_rows = (burst_count * BURST_ROWS) as u64; + group.throughput(Throughput::Elements(total_rows)); + + // Pre-build the burst payload so the timed region only pays the + // engine's commit cost, not Arrow row construction. Each burst gets a + // distinct id range so PK collisions never happen on the no-PK path. 
+ let batches: Vec<_> = (0..burst_count) + .map(|i| make_batch(schema(), (i * BURST_ROWS) as i64, BURST_ROWS)) + .collect(); + + for &lane in CAYENNE_LANES { + let lane_label = lane.lane(); + let batches_setup = batches.clone(); + group.bench_with_input( + BenchmarkId::new(lane_label, burst_count), + &burst_count, + |b, &_burst_count| { + b.iter_batched( + || { + let fixture = rt.block_on(setup_cayenne_for("burst_bench", lane)); + (fixture, batches_setup.clone()) + }, + |(fixture, burst_batches)| { + rt.block_on(async { + for batch in burst_batches { + let _ = cayenne_insert(&fixture.table, batch).await; + } + }); + black_box(fixture); + }, + BatchSize::PerIteration, + ); + }, + ); + } + + let batches_setup = batches.clone(); + group.bench_with_input( + BenchmarkId::new("duckdb", burst_count), + &burst_count, + |b, &_burst_count| { + b.iter_batched( + || (setup_duckdb("burst_bench"), batches_setup.clone()), + |(fixture, burst_batches)| { + for batch in burst_batches { + duckdb_insert_rows(&fixture.conn, "burst_bench", &batch); + } + black_box(fixture); + }, + BatchSize::PerIteration, + ); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_burst); +criterion_main!(benches); diff --git a/crates/cayenne/benches/vs_duckdb_concurrent.rs b/crates/cayenne/benches/vs_duckdb_concurrent.rs new file mode 100644 index 0000000000..1926183aa1 --- /dev/null +++ b/crates/cayenne/benches/vs_duckdb_concurrent.rs @@ -0,0 +1,196 @@ +// Copyright 2026 The Spice.ai OSS Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 + +//! Scan-under-write contention: Cayenne vs DuckDB. +//! +//! For each lane, the bench preloads `BASE_ROWS` rows, then spawns a +//! background writer that loops forever appending small bursts of new rows. +//! 
In the timed region, the foreground runs a scan query repeatedly. Criterion +//! reports scan latency *under* sustained write pressure, which is what the +//! Spice CH-benCH retest report measured at the system level (Finding 2: +//! mixed OLAP performance under concurrent write load). +//! +//! - Cayenne: background appends run on the Tokio runtime; each burst goes +//! through the full append path (acquire write_lock, write Vortex files, +//! refresh listing, persist stats, commit catalog metadata). +//! - DuckDB: background appends run on a dedicated `std::thread` with its +//! own `Connection` to the same file-backed DB. DuckDB serializes writes +//! internally; concurrent scans see snapshot-isolated state. +//! +//! Background lifecycle is RAII: dropping a `RunningWriter` signals the +//! writer to stop and joins it. This guarantees clean teardown between +//! benchmark groups. + +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] +#![allow(clippy::cast_possible_truncation)] + +#[path = "vs_duckdb_helpers/common.rs"] +mod common; + +use std::hint::black_box; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use duckdb::Connection; +use tokio::runtime::Runtime; + +use common::{ + CAYENNE_LANES, CayenneFixture, DuckDbFixture, Metastore, cayenne_insert, cayenne_query, + duckdb_insert_parquet, duckdb_insert_rows, duckdb_query_scalar, make_batch, schema, + setup_cayenne_for, setup_duckdb, write_parquet, +}; + +const BASE_ROWS: usize = 50_000; +const BG_BURST_ROWS: usize = 64; + +const CAYENNE_SCAN_SQL: &str = "SELECT SUM(value) FROM t WHERE id BETWEEN 1000 AND 11000"; +const DUCKDB_SCAN_SQL: &str = + "SELECT SUM(value) FROM concurrent_bench WHERE id BETWEEN 1000 AND 11000"; + +/// Background writer handle for the Cayenne lane. Drop to stop + join. 
+struct CayenneBgWriter { + stop: Arc, + handle: Option>, + rt_handle: tokio::runtime::Handle, +} + +impl CayenneBgWriter { + fn spawn(rt: &Runtime, fixture: &CayenneFixture) -> Self { + let stop = Arc::new(AtomicBool::new(false)); + let stop_clone = Arc::clone(&stop); + let table = Arc::clone(&fixture.table); + let handle = rt.spawn(async move { + let mut cursor = BASE_ROWS as i64; + while !stop_clone.load(Ordering::Relaxed) { + let batch = make_batch(schema(), cursor, BG_BURST_ROWS); + cursor += BG_BURST_ROWS as i64; + let _ = cayenne_insert(&table, batch).await; + } + }); + Self { + stop, + handle: Some(handle), + rt_handle: rt.handle().clone(), + } + } +} + +impl Drop for CayenneBgWriter { + fn drop(&mut self) { + self.stop.store(true, Ordering::Relaxed); + if let Some(handle) = self.handle.take() { + // Block the bench thread on the background task settling. Using + // `block_on` here is safe because Criterion's bench thread is not + // the runtime worker β€” the runtime owns its own threads. + let _ = self.rt_handle.block_on(handle); + } + } +} + +/// Background writer handle for the DuckDB lane. Drop to stop + join. 
+struct DuckDbBgWriter { + stop: Arc, + handle: Option>, +} + +impl DuckDbBgWriter { + fn spawn(fixture: &DuckDbFixture) -> Self { + let stop = Arc::new(AtomicBool::new(false)); + let stop_clone = Arc::clone(&stop); + let db_path = fixture.db_path(); + let handle = std::thread::spawn(move || { + let conn = Connection::open(&db_path).expect("bg duckdb open"); + let mut cursor = BASE_ROWS as i64; + while !stop_clone.load(Ordering::Relaxed) { + let batch = make_batch(schema(), cursor, BG_BURST_ROWS); + cursor += BG_BURST_ROWS as i64; + duckdb_insert_rows(&conn, "concurrent_bench", &batch); + } + }); + Self { + stop, + handle: Some(handle), + } + } +} + +impl Drop for DuckDbBgWriter { + fn drop(&mut self) { + self.stop.store(true, Ordering::Relaxed); + if let Some(handle) = self.handle.take() { + let _ = handle.join(); + } + } +} + +async fn load_cayenne(lane: Metastore) -> CayenneFixture { + let fixture = setup_cayenne_for("concurrent_bench", lane).await; + let _ = cayenne_insert(&fixture.table, make_batch(schema(), 0, BASE_ROWS)).await; + fixture +} + +fn load_duckdb(parquet_path: &std::path::Path) -> DuckDbFixture { + let fixture = setup_duckdb("concurrent_bench"); + duckdb_insert_parquet(&fixture.conn, "concurrent_bench", parquet_path); + fixture +} + +fn bench_concurrent(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("vs_duckdb_concurrent"); + // Lower sample size β€” each iteration runs against a moving table + // (background writer keeps appending) and the goal is the relative + // delta vs `vs_duckdb_scan`, not absolute precision. 
+ group.sample_size(10); + group.measurement_time(Duration::from_secs(15)); + + let parquet_dir = tempfile::tempdir().expect("parquet dir"); + let parquet_path = parquet_dir.path().join("base.parquet"); + write_parquet(&make_batch(schema(), 0, BASE_ROWS), &parquet_path); + + for &lane in CAYENNE_LANES { + let lane_label = lane.lane(); + let fixture = rt.block_on(load_cayenne(lane)); + let bg = CayenneBgWriter::spawn(&rt, &fixture); + + let table = Arc::clone(&fixture.table); + group.bench_function(BenchmarkId::new(lane_label, "scan_under_write"), |b| { + b.iter(|| { + rt.block_on(async { + let batches = cayenne_query(&table, CAYENNE_SCAN_SQL).await; + black_box(batches); + }); + }); + }); + + // Explicit drop order: stop the background writer before the + // fixture, so the writer doesn't try to insert into a torn-down + // table during cleanup. + drop(bg); + drop(fixture); + } + + let duckdb_fixture = load_duckdb(&parquet_path); + let bg = DuckDbBgWriter::spawn(&duckdb_fixture); + group.bench_function(BenchmarkId::new("duckdb", "scan_under_write"), |b| { + b.iter(|| { + let v = duckdb_query_scalar(&duckdb_fixture.conn, DUCKDB_SCAN_SQL); + black_box(v); + }); + }); + drop(bg); + drop(duckdb_fixture); + + group.finish(); +} + +criterion_group!(benches, bench_concurrent); +criterion_main!(benches); diff --git a/crates/cayenne/benches/vs_duckdb_delete.rs b/crates/cayenne/benches/vs_duckdb_delete.rs new file mode 100644 index 0000000000..62056f4a2c --- /dev/null +++ b/crates/cayenne/benches/vs_duckdb_delete.rs @@ -0,0 +1,194 @@ +// Copyright 2026 The Spice.ai OSS Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 + +//! Delete + re-scan throughput: Cayenne vs DuckDB. +//! +//! Cayenne's deletion-vector path is one of its biggest architectural +//! 
wins over DuckDB β€” DuckDB rewrites the affected blocks; Cayenne +//! writes an Arrow IPC deletion vector and applies it transparently at +//! read time. This bench measures the delta on the two halves of that +//! tradeoff: +//! +//! 1. `delete` β€” wall time to execute a `DELETE FROM t WHERE …` +//! touching ~10% of rows. +//! 2. `scan_after_delete` β€” full-table `SELECT SUM(value) FROM t` immediately +//! after the delete, exercising the read-time +//! deletion-vector filter on Cayenne and DuckDB's +//! rewritten blocks. + +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] + +#[path = "vs_duckdb_helpers/common.rs"] +mod common; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::UInt64Array; +use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use datafusion::prelude::SessionContext; +use datafusion_expr::{col, lit}; +use tokio::runtime::Runtime; + +use common::{ + CayenneFixture, DuckDbFixture, capture_comparison_plans, cayenne_insert, cayenne_query, + duckdb_insert_parquet, duckdb_query_scalar, make_batch, schema, setup_cayenne_pk, + setup_duckdb_pk, write_parquet, +}; + +const TABLE_SIZES: &[usize] = &[16_384, 131_072, 1_048_576]; + +async fn cayenne_delete_range(fixture: &CayenneFixture, lo: i64, hi: i64) -> u64 { + let ctx = SessionContext::new(); + let filter = col("id").gt_eq(lit(lo)).and(col("id").lt_eq(lit(hi))); + let plan = fixture + .table + .delete_from(&ctx.state(), vec![filter]) + .await + .expect("cayenne delete plan"); + let results = datafusion_physical_plan::collect(plan, ctx.task_ctx()) + .await + .expect("cayenne delete collect"); + results + .first() + .and_then(|b| b.column(0).as_any().downcast_ref::()) + .and_then(|a| a.values().first()) + .copied() + .unwrap_or(0) +} + +fn duckdb_delete_range(fixture: &DuckDbFixture, table: &str, lo: i64, hi: i64) { + fixture + .conn + .execute_batch(&format!( + "DELETE FROM {table} WHERE id BETWEEN {lo} AND {hi};" + 
)) + .expect("duckdb delete"); +} + +async fn load_cayenne(rows: usize) -> CayenneFixture { + let fixture = setup_cayenne_pk("del_bench").await; + let batch = make_batch(schema(), 0, rows); + let _ = cayenne_insert(&fixture.table, batch).await; + fixture +} + +fn load_duckdb(parquet_path: &std::path::Path) -> DuckDbFixture { + let fixture = setup_duckdb_pk("del_bench"); + duckdb_insert_parquet(&fixture.conn, "del_bench", parquet_path); + fixture +} + +fn bench_delete(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("vs_duckdb_delete"); + group.sample_size(10); + + let parquet_dir = tempfile::tempdir().expect("parquet dir"); + + for &rows in TABLE_SIZES { + group.throughput(Throughput::Elements(rows as u64)); + + let parquet_path = parquet_dir.path().join(format!("rows_{rows}.parquet")); + let batch = make_batch(schema(), 0, rows); + write_parquet(&batch, &parquet_path); + + // Delete the middle ~10% of rows; both engines see the same range. 
+ let lo = (rows as i64) * 45 / 100; + let hi = (rows as i64) * 55 / 100; + let cayenne_delete_sql = format!("DELETE FROM t WHERE id BETWEEN {lo} AND {hi}"); + let duckdb_delete_sql = format!("DELETE FROM del_bench WHERE id BETWEEN {lo} AND {hi}"); + + let plan_cayenne_fixture = rt.block_on(load_cayenne(rows)); + let plan_duckdb_fixture = load_duckdb(&parquet_path); + rt.block_on(capture_comparison_plans( + &format!("delete/{rows}/delete"), + &plan_cayenne_fixture.table, + &plan_duckdb_fixture.conn, + &cayenne_delete_sql, + &duckdb_delete_sql, + )); + + // --- delete (timed; setup is re-run per iteration to keep state clean) --- + group.bench_with_input(BenchmarkId::new("cayenne/delete", rows), &rows, |b, &_| { + b.iter_batched( + || rt.block_on(load_cayenne(rows)), + |fixture| { + rt.block_on(async { + let deleted = cayenne_delete_range(&fixture, lo, hi).await; + black_box((fixture, deleted)); + }); + }, + BatchSize::PerIteration, + ); + }); + let path = parquet_path.clone(); + group.bench_with_input(BenchmarkId::new("duckdb/delete", rows), &rows, |b, &_| { + b.iter_batched( + || load_duckdb(&path), + |fixture| { + duckdb_delete_range(&fixture, "del_bench", lo, hi); + black_box(fixture); + }, + BatchSize::PerIteration, + ); + }); + + // --- scan_after_delete (load + delete once outside the timed region, + // then query many times to measure read-time filtering cost) --- + let cayenne_fixture = Arc::new(rt.block_on(async { + let fixture = load_cayenne(rows).await; + let _ = cayenne_delete_range(&fixture, lo, hi).await; + fixture + })); + let duckdb_fixture = Arc::new({ + let fixture = load_duckdb(&parquet_path); + duckdb_delete_range(&fixture, "del_bench", lo, hi); + fixture + }); + + rt.block_on(capture_comparison_plans( + &format!("delete/{rows}/scan_after_delete"), + &cayenne_fixture.table, + &duckdb_fixture.conn, + "SELECT SUM(value) FROM t", + "SELECT SUM(value) FROM del_bench", + )); + + let cf = Arc::clone(&cayenne_fixture); + group.bench_with_input( + 
BenchmarkId::new("cayenne/scan_after_delete", rows), + &rows, + |b, &_| { + b.iter(|| { + rt.block_on(async { + let batches = cayenne_query(&cf.table, "SELECT SUM(value) FROM t").await; + black_box(batches); + }); + }); + }, + ); + let df = Arc::clone(&duckdb_fixture); + group.bench_with_input( + BenchmarkId::new("duckdb/scan_after_delete", rows), + &rows, + |b, &_| { + b.iter(|| { + let v = duckdb_query_scalar(&df.conn, "SELECT SUM(value) FROM del_bench"); + black_box(v); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_delete); +criterion_main!(benches); diff --git a/crates/cayenne/benches/vs_duckdb_groupby.rs b/crates/cayenne/benches/vs_duckdb_groupby.rs new file mode 100644 index 0000000000..1b91f65671 --- /dev/null +++ b/crates/cayenne/benches/vs_duckdb_groupby.rs @@ -0,0 +1,139 @@ +// Copyright 2026 The Spice.ai OSS Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 + +//! GROUP BY aggregation throughput: Cayenne vs DuckDB. +//! +//! Aggregation kernels are sensitive to group cardinality: low-cardinality +//! groups stay in CPU caches and stress hash-aggregate intrinsics; +//! high-cardinality groups thrash the hash table and stress probe / rehash +//! paths. This bench runs the same query at three cardinalities for each +//! row count, so the engine-to-engine delta and the cardinality sensitivity +//! both show up in the Criterion report. +//! +//! Lanes (compile-time gated): +//! - `cayenne` β€” Cayenne with the SQLite metastore (default) +//! - `cayenne_turso` β€” Cayenne with the Turso metastore (--features turso) +//! 
- `duckdb` β€” DuckDB file-mode + +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] +#![allow(clippy::cast_possible_truncation)] + +#[path = "vs_duckdb_helpers/common.rs"] +mod common; + +use std::hint::black_box; +use std::path::Path; +use std::sync::Arc; + +use arrow::array::RecordBatch; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use tokio::runtime::Runtime; + +use common::{ + CAYENNE_LANES, CayenneFixture, DuckDbFixture, Metastore, capture_comparison_plans, + cayenne_insert, cayenne_query, duckdb_insert_parquet, duckdb_query_count, make_batch_grouped, + schema, setup_cayenne_for, setup_duckdb, write_parquet, +}; + +const ROW_COUNTS: &[usize] = &[16_384, 131_072]; +const GROUP_CARDINALITIES: &[usize] = &[8, 1_024, 16_384]; + +async fn load_cayenne(lane: Metastore, rows: usize, groups: usize) -> CayenneFixture { + let fixture = setup_cayenne_for("groupby_bench", lane).await; + let batch = make_batch_grouped(schema(), 0, rows, groups); + let _ = cayenne_insert(&fixture.table, batch).await; + fixture +} + +fn load_duckdb(parquet_path: &Path) -> DuckDbFixture { + let fixture = setup_duckdb("groupby_bench"); + duckdb_insert_parquet(&fixture.conn, "groupby_bench", parquet_path); + fixture +} + +fn bench_groupby(c: &mut Criterion) { + let rt = Runtime::new().expect("runtime"); + let mut group = c.benchmark_group("vs_duckdb_groupby"); + group.sample_size(10); + + let parquet_dir = tempfile::tempdir().expect("parquet dir"); + + for &rows in ROW_COUNTS { + for &groups in GROUP_CARDINALITIES { + let effective_groups = groups.min(rows); + group.throughput(Throughput::Elements(rows as u64)); + + let parquet_path = parquet_dir + .path() + .join(format!("rows_{rows}_groups_{effective_groups}.parquet")); + let batch: RecordBatch = make_batch_grouped(schema(), 0, rows, effective_groups); + write_parquet(&batch, &parquet_path); + + let duckdb_fixture = Arc::new(load_duckdb(&parquet_path)); + + let cayenne_sql = 
"SELECT name, COUNT(*), SUM(value) FROM t GROUP BY name"; + let duckdb_sql = "SELECT name, COUNT(*), SUM(value) FROM groupby_bench GROUP BY name"; + + // Plan capture uses the SQLite lane β€” Turso would emit the same + // DataFusion plan because the metastore only affects metadata I/O, + // not query planning. + let plan_fixture = + Arc::new(rt.block_on(load_cayenne(Metastore::Sqlite, rows, effective_groups))); + rt.block_on(capture_comparison_plans( + &format!("groupby/{rows}/groups_{effective_groups}/group_by_name"), + &plan_fixture.table, + &duckdb_fixture.conn, + cayenne_sql, + duckdb_sql, + )); + + for &lane in CAYENNE_LANES { + let lane_label = lane.lane(); + let cayenne_fixture = + Arc::new(rt.block_on(load_cayenne(lane, rows, effective_groups))); + let cf = Arc::clone(&cayenne_fixture); + group.bench_with_input( + BenchmarkId::new( + format!("{lane_label}/group_by_name"), + format!("rows={rows}/groups={effective_groups}"), + ), + &rows, + |b, &_rows| { + b.iter(|| { + rt.block_on(async { + let batches = cayenne_query(&cf.table, cayenne_sql).await; + black_box(batches); + }); + }); + }, + ); + } + + let df = Arc::clone(&duckdb_fixture); + group.bench_with_input( + BenchmarkId::new( + "duckdb/group_by_name", + format!("rows={rows}/groups={effective_groups}"), + ), + &rows, + |b, &_rows| { + b.iter(|| { + let n = duckdb_query_count(&df.conn, duckdb_sql); + black_box(n); + }); + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_groupby); +criterion_main!(benches); diff --git a/crates/cayenne/benches/vs_duckdb_helpers/common.rs b/crates/cayenne/benches/vs_duckdb_helpers/common.rs new file mode 100644 index 0000000000..d8425ed643 --- /dev/null +++ b/crates/cayenne/benches/vs_duckdb_helpers/common.rs @@ -0,0 +1,720 @@ +// Copyright 2026 The Spice.ai OSS Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 + +//! Shared helpers for the Cayenne-vs-DuckDB micro-benchmarks. +//! +//! Each `vs_duckdb_*` bench compares Cayenne and DuckDB on the same Arrow +//! input, doing identical logical work. Helpers in this module own the +//! pieces that are identical across benches β€” schema, fixture generation, +//! parquet materialization, and the canonical Cayenne / DuckDB setup paths. +//! +//! Included via `#[path = "vs_duckdb_helpers/common.rs"] mod common;` +//! from each bench file. Placing the helper inside a subdirectory keeps +//! Cargo's bench auto-discovery from picking it up as a standalone target, +//! so no `autobenches = false` is required on the cayenne crate. + +#![allow(dead_code)] +#![allow(clippy::expect_used)] +#![allow(clippy::cast_possible_wrap)] +#![allow(clippy::cast_sign_loss)] + +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use arrow::array::{Int64Array, RecordBatch, StringArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::util::pretty::pretty_format_batches; +use cayenne::metadata::CreateTableOptions; +use cayenne::{CayenneCatalog, CayenneTableProvider, MetadataCatalog}; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::parquet::arrow::ArrowWriter; +use datafusion::parquet::file::properties::WriterProperties; +use datafusion_table_providers::util::{ + column_reference::ColumnReference, on_conflict::OnConflict, +}; +use duckdb::Connection; +use tempfile::TempDir; + +/// Which Cayenne metastore backend to use in a fixture. +/// +/// `Sqlite` is Cayenne's default (no `cayenne_metastore` param). `Turso` is +/// available when the bench is built with `--features turso` and matches +/// `cayenne_metastore: turso` in spicepods. The DuckDB side is unaffected; +/// pairing a `Turso` Cayenne fixture against the same DuckDB fixture isolates +/// the metastore's contribution to overall numbers. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Metastore { + Sqlite, + #[cfg(feature = "turso")] + Turso, +} + +impl Metastore { + /// Stable lane label used in `BenchmarkId`s. + #[must_use] + pub fn lane(self) -> &'static str { + match self { + Metastore::Sqlite => "cayenne", + #[cfg(feature = "turso")] + Metastore::Turso => "cayenne_turso", + } + } + + fn connection_string(self, db_path: &Path) -> String { + let path = db_path.to_string_lossy(); + match self { + Metastore::Sqlite => format!("sqlite://{path}"), + #[cfg(feature = "turso")] + Metastore::Turso => format!("libsql://{path}"), + } + } +} + +/// All Cayenne lanes a bench should run. Compile-time gated on the `turso` +/// feature so benches built without it cleanly drop to a single lane. +pub const CAYENNE_LANES: &[Metastore] = &[ + Metastore::Sqlite, + #[cfg(feature = "turso")] + Metastore::Turso, +]; + +/// Canonical schema for the comparison benches. +/// +/// Three columns chosen to mirror the shape of a TPC-H `customer` / `orders` +/// row that's been keyed on a single int64 primary key: +/// - `id`: int64 PK (dense, monotonic) +/// - `name`: utf8 (variable-width, low cardinality on repeat) +/// - `value`: int64 (numeric payload) +pub fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Int64, false), + ])) +} + +/// Build a deterministic batch of `rows` rows starting at `start_id`. +/// +/// `name` is unique per row (`name_{id}`) so GROUP BY on `name` yields one +/// group per row. Use [`make_batch_grouped`] when low cardinality is wanted. 
+pub fn make_batch(schema: Arc, start_id: i64, rows: usize) -> RecordBatch { + let ids: Vec = (0..rows as i64).map(|i| start_id + i).collect(); + let names: Vec = ids.iter().map(|id| format!("name_{id}")).collect(); + let values: Vec = ids.iter().map(|id| id * 100).collect(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + Arc::new(Int64Array::from(values)), + ], + ) + .expect("batch") +} + +/// Build a deterministic batch with `groups` distinct `name` values, used by +/// the GROUP BY bench so the aggregation kernel produces a bounded number of +/// output groups regardless of row count. +pub fn make_batch_grouped( + schema: Arc, + start_id: i64, + rows: usize, + groups: usize, +) -> RecordBatch { + let group_count = groups.max(1); + let ids: Vec = (0..rows as i64).map(|i| start_id + i).collect(); + let names: Vec = (0..rows) + .map(|i| format!("group_{}", i % group_count)) + .collect(); + let values: Vec = ids.iter().map(|id| id * 100).collect(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + Arc::new(Int64Array::from(values)), + ], + ) + .expect("batch") +} + +/// Build a small "dimension" batch for the join bench. `id` is a foreign key +/// into the fact table; `region` is a 4-way low-cardinality dimension. +pub fn make_dim_batch(schema: Arc, rows: usize) -> RecordBatch { + const REGIONS: [&str; 4] = ["NA", "EU", "APAC", "LATAM"]; + let ids: Vec = (0..rows as i64).collect(); + let regions: Vec<&str> = (0..rows).map(|i| REGIONS[i % REGIONS.len()]).collect(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(regions)), + ], + ) + .expect("dim batch") +} + +/// Schema for the dim table used by the join bench. 
+pub fn dim_schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("region", DataType::Utf8, false), + ])) +} + +/// Write a single record batch to a parquet file so both engines can ingest +/// from the same on-disk source β€” the realistic Spice ingestion path. +pub fn write_parquet(batch: &RecordBatch, path: &Path) { + let file = std::fs::File::create(path).expect("create parquet"); + let props = WriterProperties::builder().build(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).expect("arrow writer"); + writer.write(batch).expect("write"); + writer.close().expect("close"); +} + +/// A clean Cayenne table backed by a fresh metastore + temp data dir. +/// +/// The backend (`SQLite` or `Turso`) is selected at fixture-creation time +/// via [`Metastore`] so each bench can run multiple metastore lanes. +pub struct CayenneFixture { + pub _temp_dir: TempDir, + pub table: Arc, + pub catalog: Arc, +} + +pub async fn setup_cayenne(table_name: &str) -> CayenneFixture { + setup_cayenne_with(table_name, Metastore::Sqlite, vec![], None, schema()).await +} + +pub async fn setup_cayenne_pk(table_name: &str) -> CayenneFixture { + setup_cayenne_with( + table_name, + Metastore::Sqlite, + vec!["id".to_string()], + Some(OnConflict::Upsert(ColumnReference::new(vec![ + "id".to_string(), + ]))), + schema(), + ) + .await +} + +/// Build a Cayenne fixture with a chosen metastore backend (default `schema()`). +pub async fn setup_cayenne_for(table_name: &str, metastore: Metastore) -> CayenneFixture { + setup_cayenne_with(table_name, metastore, vec![], None, schema()).await +} + +/// Build a Cayenne fixture with a chosen metastore backend AND a single-column +/// `id` primary key with upsert on-conflict resolution. 
+pub async fn setup_cayenne_pk_for(table_name: &str, metastore: Metastore) -> CayenneFixture { + setup_cayenne_with( + table_name, + metastore, + vec!["id".to_string()], + Some(OnConflict::Upsert(ColumnReference::new(vec![ + "id".to_string(), + ]))), + schema(), + ) + .await +} + +/// Build a Cayenne fixture that uses the dim-table schema (for the join bench). +pub async fn setup_cayenne_dim_for(table_name: &str, metastore: Metastore) -> CayenneFixture { + setup_cayenne_with(table_name, metastore, vec![], None, dim_schema()).await +} + +async fn setup_cayenne_with( + table_name: &str, + metastore: Metastore, + primary_key: Vec, + on_conflict: Option, + table_schema: Arc, +) -> CayenneFixture { + let temp_dir = tempfile::tempdir().expect("temp dir"); + let data_path = temp_dir.path().join("data"); + tokio::fs::create_dir_all(&data_path) + .await + .expect("data dir"); + let db_path = temp_dir.path().join("catalog.db"); + let catalog = + Arc::new(CayenneCatalog::new(metastore.connection_string(&db_path)).expect("catalog")); + catalog.init().await.expect("catalog init"); + + let table = Arc::new( + CayenneTableProvider::create_table( + Arc::clone(&catalog) as Arc, + CreateTableOptions { + table_name: table_name.to_string(), + schema: table_schema, + primary_key, + on_conflict, + base_path: data_path.to_string_lossy().to_string(), + partition_column: None, + vortex_config: cayenne::metadata::VortexConfig::default(), + }, + Arc::new(RuntimeEnv::default()), + ) + .await + .expect("cayenne create_table"), + ); + + CayenneFixture { + _temp_dir: temp_dir, + table, + catalog: Arc::clone(&catalog) as Arc, + } +} + +/// A clean DuckDB file-mode database with the same schema. +/// +/// File-backed (not in-memory) for parity with Cayenne, which only supports +/// `mode: file`. Comparing Cayenne-file vs DuckDB-memory would not be fair +/// (see `tools/testoperator/dispatch/perf-cayenne-vs-duckdb/README.md`). 
+pub struct DuckDbFixture { + pub _temp_dir: TempDir, + pub conn: Connection, +} + +impl DuckDbFixture { + /// Path to the on-disk `.duckdb` file. Used by the concurrent bench to + /// open a second connection from a background thread (DuckDB connections + /// are not `Send`). + #[must_use] + pub fn db_path(&self) -> PathBuf { + self._temp_dir.path().join("duck.db") + } +} + +pub fn setup_duckdb(table_name: &str) -> DuckDbFixture { + setup_duckdb_with_pk(table_name, false) +} + +pub fn setup_duckdb_pk(table_name: &str) -> DuckDbFixture { + setup_duckdb_with_pk(table_name, true) +} + +fn setup_duckdb_with_pk(table_name: &str, with_pk: bool) -> DuckDbFixture { + let temp_dir = tempfile::tempdir().expect("temp dir"); + let db_path = temp_dir.path().join("duck.db"); + let conn = Connection::open(&db_path).expect("duckdb open"); + let pk_clause = if with_pk { " PRIMARY KEY" } else { "" }; + conn.execute_batch(&format!( + "CREATE TABLE {table_name} (id BIGINT{pk_clause}, name VARCHAR NOT NULL, value BIGINT NOT NULL);" + )) + .expect("duckdb create table"); + DuckDbFixture { + _temp_dir: temp_dir, + conn, + } +} + +/// DuckDB fixture for the join bench: a `t` fact table (default schema) and +/// a `d` dim table (id, region). Both engines see the same shape so the +/// resulting join plans are directly comparable. +pub fn setup_duckdb_with_dim(fact_table: &str, dim_table: &str) -> DuckDbFixture { + let fixture = setup_duckdb(fact_table); + fixture + .conn + .execute_batch(&format!( + "CREATE TABLE {dim_table} (id BIGINT NOT NULL, region VARCHAR NOT NULL);" + )) + .expect("duckdb create dim table"); + fixture +} + +/// Upsert via DuckDB's `INSERT ... ON CONFLICT DO UPDATE`. Apples-to-apples +/// with Cayenne's `OnConflict::Upsert` on the `id` primary key. 
+pub fn duckdb_upsert_parquet(conn: &Connection, table_name: &str, parquet_path: &Path) { + conn.execute_batch(&format!( + "INSERT INTO {table_name} SELECT * FROM read_parquet('{}') \ + ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, value = EXCLUDED.value;", + parquet_path.display() + )) + .expect("duckdb upsert parquet"); +} + +/// Insert a small VALUES tuple list β€” used by the burst bench to mirror the +/// fine-grained per-burst insert path without paying parquet decode cost. +pub fn duckdb_insert_rows(conn: &Connection, table_name: &str, batch: &RecordBatch) { + use arrow::array::{Array, Int64Array, StringArray}; + + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("ids"); + let names = batch + .column(1) + .as_any() + .downcast_ref::() + .expect("names"); + let values = batch + .column(2) + .as_any() + .downcast_ref::() + .expect("values"); + + let mut sql = format!("INSERT INTO {table_name} VALUES "); + for i in 0..batch.num_rows() { + if i > 0 { + sql.push(','); + } + sql.push_str(&format!( + "({}, '{}', {})", + ids.value(i), + names.value(i).replace('\'', "''"), + values.value(i) + )); + } + sql.push(';'); + conn.execute_batch(&sql).expect("duckdb insert rows"); +} + +/// Insert the rows of `batch` into DuckDB's dim table. +pub fn duckdb_insert_dim_rows(conn: &Connection, table_name: &str, batch: &RecordBatch) { + use arrow::array::{Array, Int64Array, StringArray}; + + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("ids"); + let regions = batch + .column(1) + .as_any() + .downcast_ref::() + .expect("regions"); + + let mut sql = format!("INSERT INTO {table_name} VALUES "); + for i in 0..batch.num_rows() { + if i > 0 { + sql.push(','); + } + sql.push_str(&format!( + "({}, '{}')", + ids.value(i), + regions.value(i).replace('\'', "''"), + )); + } + sql.push(';'); + conn.execute_batch(&sql).expect("duckdb insert dim rows"); +} + +/// Bulk-insert via DuckDB's native parquet loader. 
This is DuckDB's +/// fastest ingestion path and the apples-to-apples comparison for +/// Cayenne's parquet-source insert path. +pub fn duckdb_insert_parquet(conn: &Connection, table_name: &str, parquet_path: &Path) { + conn.execute_batch(&format!( + "INSERT INTO {table_name} SELECT * FROM read_parquet('{}');", + parquet_path.display() + )) + .expect("duckdb insert parquet"); +} + +/// Insert an Arrow batch through Cayenne via the DataFusion `insert_into` API. +/// Mirrors how spiced loads accelerator data in production. +pub async fn cayenne_insert(table: &Arc, batch: RecordBatch) -> u64 { + use datafusion::datasource::TableProvider; + use datafusion::datasource::memory::MemorySourceConfig; + use datafusion::prelude::SessionContext; + use datafusion_expr::dml::InsertOp; + + let ctx = SessionContext::new(); + let schema = Arc::clone(batch.schema_ref()); + let input_exec = + MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None).expect("memory exec"); + let insert_plan = table + .insert_into(&ctx.state(), input_exec, InsertOp::Append) + .await + .expect("cayenne insert plan"); + let results = datafusion_physical_plan::collect(insert_plan, ctx.task_ctx()) + .await + .expect("cayenne insert collect"); + results + .first() + .and_then(|batch| { + batch + .column(0) + .as_any() + .downcast_ref::() + }) + .map_or(0, |rows| rows.value(0)) +} + +/// Insert from a parquet file through Cayenne via DataFusion's parquet +/// reader. Mirrors spiced's `file:` connector β†’ accelerator ingestion path +/// and gives parity with `duckdb_insert_parquet` (both engines now consume +/// the same on-disk parquet, including the decode work). 
+pub async fn cayenne_insert_from_parquet( + table: &Arc, + parquet_path: &Path, +) -> u64 { + use datafusion::datasource::TableProvider; + use datafusion::prelude::{ParquetReadOptions, SessionContext}; + use datafusion_expr::dml::InsertOp; + + let parquet_path = parquet_path.to_string_lossy().into_owned(); + let ctx = SessionContext::new(); + let df = ctx + .read_parquet::<&str>(parquet_path.as_str(), ParquetReadOptions::default()) + .await + .expect("cayenne read_parquet"); + let input_exec = df + .create_physical_plan() + .await + .expect("cayenne physical plan"); + let insert_plan = table + .insert_into(&ctx.state(), input_exec, InsertOp::Append) + .await + .expect("cayenne insert plan"); + let results = datafusion_physical_plan::collect(insert_plan, ctx.task_ctx()) + .await + .expect("cayenne insert collect"); + results + .first() + .and_then(|batch| { + batch + .column(0) + .as_any() + .downcast_ref::() + }) + .map_or(0, |rows| rows.value(0)) +} + +/// Run a SQL query through Cayenne and return the collected batches. +pub async fn cayenne_query(table: &Arc, sql: &str) -> Vec { + use datafusion::datasource::TableProvider; + use datafusion::prelude::SessionContext; + + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(table) as Arc) + .expect("register table"); + let df = ctx.sql(sql).await.expect("cayenne sql"); + df.collect().await.expect("cayenne collect") +} + +/// Run a SQL query against two Cayenne tables registered as `t` and `d`. +/// Used by the join bench so the SQL matches the DuckDB form. 
+pub async fn cayenne_query_join( + fact: &Arc, + dim: &Arc, + sql: &str, +) -> Vec { + use datafusion::datasource::TableProvider; + use datafusion::prelude::SessionContext; + + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::clone(fact) as Arc) + .expect("register fact"); + ctx.register_table("d", Arc::clone(dim) as Arc) + .expect("register dim"); + let df = ctx.sql(sql).await.expect("cayenne join sql"); + df.collect().await.expect("cayenne join collect") +} + +/// Capture optimized and executed plans for a Cayenne/DuckDB query pair. +/// +/// Files are written to `target/cayenne_vs_duckdb_plans/