Skip to content

Commit ba8fbb2

Browse files
authored
feat: add lance_dataset_calculate_data_stats for per-field storage stats (#47)
Exposes upstream's `Dataset::calculate_data_stats()` so callers can size each field on disk for query planning and cost estimation.

## Surface

`lance_dataset_calculate_data_stats()` walks every fragment and returns an opaque `LanceDataStatistics` snapshot, following the same handle + indexed accessor shape as `lance_dataset_versions()`:

- `lance_data_statistics_count()` — number of fields
- `lance_data_statistics_field_id_at(i)` — schema field id
- `lance_data_statistics_bytes_on_disk_at(i)` — compressed on-disk size
- `lance_data_statistics_close()` — free the snapshot

Entries are ordered by schema field id, one per field (nested struct/list children included).

The C++ side adds `Dataset::calculate_data_stats()` returning `std::vector<FieldStatistics>`.

One caveat worth flagging in the docs: `bytes_on_disk` is 0 for datasets written with the legacy (v1) storage format, which doesn't track per-field sizes. Since 0 is also the error sentinel for the accessors, the headers tell callers to check `lance_last_error_code()` when they pass an untrusted index.

## Tests

Rust integration tests cover the single-fragment happy path (field ids and non-zero sizes, including the field-id-0-vs-error-sentinel disambiguation), a three-field schema, and multi-fragment aggregation (verified by comparing a two-fragment dataset against a single-fragment baseline with identical rows in the first fragment, so the assertion actually proves summation).

Two edge cases that exercise the documented contract: a legacy v1 dataset (every field reports 0 bytes, error stays clear) and an empty-schema dataset (count 0 with no error, distinguishing it from the NULL-handle error).

The full rejection surface is covered too — NULL dataset, NULL handle on every accessor, out-of-range index, and NULL-safe close.

C and C++ smoke tests run against the freshly-written (v2) dataset before the mutation tests reshape it.
1 parent 69671a6 commit ba8fbb2

7 files changed

Lines changed: 603 additions & 0 deletions

File tree

include/lance/lance.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ typedef struct LanceDataset LanceDataset;
140140
typedef struct LanceScanner LanceScanner;
141141
typedef struct LanceBatch LanceBatch;
142142
typedef struct LanceVersions LanceVersions;
143+
typedef struct LanceDataStatistics LanceDataStatistics;
143144

144145
/* ─── Dataset lifecycle ─── */
145146

@@ -205,6 +206,48 @@ int64_t lance_versions_timestamp_ms_at(const LanceVersions* versions, size_t ind
205206
/** Close and free a versions handle. Safe to call with NULL. */
206207
void lance_versions_close(LanceVersions* versions);
207208

209+
/* ─── Data statistics ─── */
210+
211+
/**
212+
* Compute per-field data statistics (compressed on-disk byte size) for query
213+
* planning. Walks every fragment, so this performs I/O. Caller frees the
214+
* returned handle with lance_data_statistics_close().
215+
*
216+
* Entries are ordered by schema field id, one per field (including nested
217+
* struct/list children).
218+
* @return handle on success, or NULL on error
219+
*/
220+
LanceDataStatistics* lance_dataset_calculate_data_stats(const LanceDataset* dataset);
221+
222+
/**
223+
* Number of fields in the statistics snapshot. Clears the thread-local error
224+
* on success. Returns 0 and sets LANCE_ERR_INVALID_ARGUMENT on a NULL handle;
225+
* a dataset with an empty schema also yields 0 with no error set, so check
226+
* lance_last_error_code() to distinguish the error case from an empty result.
227+
*/
228+
uint64_t lance_data_statistics_count(const LanceDataStatistics* stats);
229+
230+
/**
231+
* Schema field id at `index` (0 <= index < count).
232+
* Returns 0 on error (NULL handle or out-of-range index), setting
233+
* LANCE_ERR_INVALID_ARGUMENT. Because 0 is itself a valid field id, check
234+
* lance_last_error_code() when passing an untrusted index; iterating
235+
* `0..count` never errors.
236+
*/
237+
uint32_t lance_data_statistics_field_id_at(const LanceDataStatistics* stats, size_t index);
238+
239+
/**
240+
* Compressed on-disk byte size of the field at `index`.
241+
* Returns 0 on error (NULL handle or out-of-range index), setting
242+
* LANCE_ERR_INVALID_ARGUMENT. A field written with the legacy (v1) storage
243+
* format also reports 0 but sets no error, so check lance_last_error_code() to
244+
* distinguish a genuine 0 from the error sentinel.
245+
*/
246+
uint64_t lance_data_statistics_bytes_on_disk_at(const LanceDataStatistics* stats, size_t index);
247+
248+
/** Close and free a data statistics handle. Safe to call with NULL. */
249+
void lance_data_statistics_close(LanceDataStatistics* stats);
250+
208251
/**
209252
* Restore the dataset to an older version by committing a new manifest that
210253
* carries the fragments of `version`. If `version` is already the latest,

include/lance/lance.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ struct VersionInfo {
9797
int64_t timestamp_ms;
9898
};
9999

100+
/// Per-field storage statistics for query planning.
///
/// One element of the vector returned by `Dataset::calculate_data_stats()`.
101+
/// `id` is the schema field id; `bytes_on_disk` is the compressed on-disk size
102+
/// (0 for datasets written with the legacy v1 storage format).
103+
struct FieldStatistics {
104+
uint32_t id;  ///< Schema field id.
105+
uint64_t bytes_on_disk;  ///< Compressed on-disk size in bytes; 0 for legacy v1 data.
106+
};
107+
100108
// ─── Write mode ──────────────────────────────────────────────────────────────
101109

102110
enum class WriteMode : int32_t {
@@ -349,6 +357,28 @@ class Dataset {
349357
return out;
350358
}
351359

360+
/// Compute per-field data statistics (compressed on-disk byte size) for
361+
/// query planning, ordered by schema field id. Performs I/O over every
362+
/// fragment. Throws lance::Error on failure.
363+
std::vector<FieldStatistics> calculate_data_stats() const {
364+
auto* raw = lance_dataset_calculate_data_stats(handle_.get());
365+
if (!raw) check_error();
366+
Handle<LanceDataStatistics, lance_data_statistics_close> snap(raw);
367+
368+
uint64_t n = lance_data_statistics_count(snap.get());
369+
std::vector<FieldStatistics> out;
370+
out.reserve(static_cast<size_t>(n));
371+
for (uint64_t i = 0; i < n; i++) {
372+
FieldStatistics fs;
373+
fs.id = lance_data_statistics_field_id_at(snap.get(), static_cast<size_t>(i));
374+
fs.bytes_on_disk =
375+
lance_data_statistics_bytes_on_disk_at(snap.get(), static_cast<size_t>(i));
376+
if (lance_last_error_code() != LANCE_OK) check_error();
377+
out.push_back(fs);
378+
}
379+
return out;
380+
}
381+
352382
/// Commit a new manifest that aliases `version` as the latest. The
353383
/// returned Dataset points at the target version; this handle is
354384
/// unchanged. If `version` is already the latest, no new manifest is

src/data_statistics.rs

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright The Lance Authors
3+
4+
//! Data statistics C API: per-field storage statistics for query planning.
5+
//!
6+
//! `lance_dataset_calculate_data_stats` walks every fragment to total each
7+
//! field's compressed on-disk byte size, returning the result as an opaque
8+
//! `LanceDataStatistics` snapshot. Accessors read entries by index, and
9+
//! `lance_data_statistics_close` frees it.
10+
11+
use lance::dataset::statistics::DatasetStatisticsExt;
12+
use lance_core::Result;
13+
14+
use crate::dataset::LanceDataset;
15+
use crate::error::{LanceErrorCode, clear_last_error, ffi_try, set_last_error};
16+
use crate::runtime::block_on;
17+
18+
/// Opaque snapshot of a dataset's per-field data statistics.
///
/// Built by `lance_dataset_calculate_data_stats` and handed to C callers as a
/// raw pointer; the `lance_data_statistics_*_at` accessors read `fields` by
/// index and `lance_data_statistics_close` frees it.
#[derive(Debug)]
pub struct LanceDataStatistics {
    /// One entry per field, in the order produced when the snapshot was built.
    fields: Vec<FieldStat>,
}

/// Statistics for a single field.
///
/// `Copy` so the accessors can hand entries out of the snapshot by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct FieldStat {
    /// Schema field id.
    id: u32,
    /// Compressed on-disk size in bytes.
    bytes_on_disk: u64,
}
28+
29+
/// Compute per-field data statistics for the dataset. The caller frees the
30+
/// returned handle with `lance_data_statistics_close`. Returns NULL on error.
31+
///
32+
/// Entries are ordered by the dataset's schema field id, one per field
33+
/// (including nested struct/list children). `bytes_on_disk` is the field's
34+
/// compressed on-disk size; it is 0 for datasets written with the legacy (v1)
35+
/// storage format, which does not track per-field sizes.
///
/// # Safety
///
/// `dataset` must be either NULL or a pointer to a live `LanceDataset`. A NULL
/// pointer is rejected as an error by `calculate_inner`; any other invalid
/// pointer is undefined behavior.
36+
#[unsafe(no_mangle)]
37+
pub unsafe extern "C" fn lance_dataset_calculate_data_stats(
38+
dataset: *const LanceDataset,
39+
) -> *mut LanceDataStatistics {
40+
// SAFETY: `calculate_inner` checks `dataset` for NULL before dereferencing;
// validity of a non-NULL pointer is the caller's obligation (see `# Safety`).
ffi_try!(unsafe { calculate_inner(dataset) }, null)
41+
}
42+
43+
unsafe fn calculate_inner(dataset: *const LanceDataset) -> Result<*mut LanceDataStatistics> {
44+
if dataset.is_null() {
45+
return Err(lance_core::Error::InvalidInput {
46+
source: "dataset must not be NULL".into(),
47+
location: snafu::location!(),
48+
});
49+
}
50+
// SAFETY: `dataset` is non-null (checked above) and points at a live
51+
// `LanceDataset` created by `lance_dataset_open`; we take only a shared
52+
// borrow, which is sound for the duration of this call.
53+
let ds = unsafe { &*dataset };
54+
let snapshot = ds.snapshot();
55+
let stats = block_on(snapshot.calculate_data_stats())?;
56+
let fields = stats
57+
.fields
58+
.into_iter()
59+
.map(|f| FieldStat {
60+
id: f.id,
61+
bytes_on_disk: f.bytes_on_disk,
62+
})
63+
.collect();
64+
Ok(Box::into_raw(Box::new(LanceDataStatistics { fields })))
65+
}
66+
67+
/// Return the number of fields in the statistics snapshot.
68+
///
69+
/// Clears the thread-local error on success. Returns 0 and sets
70+
/// `InvalidArgument` on a NULL handle. A dataset with an empty schema also
71+
/// yields 0 with no error set, so check `lance_last_error_code()` to
72+
/// distinguish the error case from an empty result.
73+
#[unsafe(no_mangle)]
74+
pub unsafe extern "C" fn lance_data_statistics_count(stats: *const LanceDataStatistics) -> u64 {
75+
if stats.is_null() {
76+
set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL");
77+
return 0;
78+
}
79+
// SAFETY: `stats` is non-null (checked above) and was produced by
80+
// `lance_dataset_calculate_data_stats` via `Box::into_raw`; the accessors
81+
// only ever take shared borrows, so no mutable alias exists.
82+
let s = unsafe { &*stats };
83+
let count = s.fields.len() as u64;
84+
clear_last_error();
85+
count
86+
}
87+
88+
/// Return the schema field id at `index` (0 <= index < count).
89+
///
90+
/// Returns 0 and sets the thread-local error on NULL or out-of-range input.
91+
/// Because 0 is itself a valid field id, check `lance_last_error_code()` when
92+
/// passing an untrusted index; iterating `0..count` never triggers the error
93+
/// path.
94+
#[unsafe(no_mangle)]
95+
pub unsafe extern "C" fn lance_data_statistics_field_id_at(
96+
stats: *const LanceDataStatistics,
97+
index: usize,
98+
) -> u32 {
99+
unsafe { entry_at(stats, index) }.map(|f| f.id).unwrap_or(0)
100+
}
101+
102+
/// Return the compressed on-disk byte size of the field at `index`.
103+
///
104+
/// Returns 0 and sets the thread-local error on NULL or out-of-range input.
105+
/// A genuine 0 (legacy storage, or an empty field) is indistinguishable from
106+
/// the error sentinel by value alone — check `lance_last_error_code()`.
107+
#[unsafe(no_mangle)]
108+
pub unsafe extern "C" fn lance_data_statistics_bytes_on_disk_at(
109+
stats: *const LanceDataStatistics,
110+
index: usize,
111+
) -> u64 {
112+
unsafe { entry_at(stats, index) }
113+
.map(|f| f.bytes_on_disk)
114+
.unwrap_or(0)
115+
}
116+
117+
/// Close and free a data statistics handle. Safe to call with NULL.
118+
#[unsafe(no_mangle)]
119+
pub unsafe extern "C" fn lance_data_statistics_close(stats: *mut LanceDataStatistics) {
120+
if !stats.is_null() {
121+
unsafe {
122+
let _ = Box::from_raw(stats);
123+
}
124+
}
125+
}
126+
127+
// ---------------------------------------------------------------------------
128+
// Internal helpers
129+
// ---------------------------------------------------------------------------
130+
131+
/// Copy the field stat at `index` out of the handle. Sets the thread-local
132+
/// error and returns `None` on NULL handle or out-of-range index.
133+
unsafe fn entry_at(stats: *const LanceDataStatistics, index: usize) -> Option<FieldStat> {
134+
if stats.is_null() {
135+
set_last_error(LanceErrorCode::InvalidArgument, "stats is NULL");
136+
return None;
137+
}
138+
// SAFETY: `stats` is non-null (checked above) and was produced by
139+
// `lance_dataset_calculate_data_stats` via `Box::into_raw`; we take only a
140+
// shared borrow.
141+
let s = unsafe { &*stats };
142+
match s.fields.get(index).copied() {
143+
Some(f) => {
144+
clear_last_error();
145+
Some(f)
146+
}
147+
None => {
148+
set_last_error(
149+
LanceErrorCode::InvalidArgument,
150+
format!(
151+
"field statistics index {} out of range; count = {}",
152+
index,
153+
s.fields.len()
154+
),
155+
);
156+
None
157+
}
158+
}
159+
}

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ mod alter_columns;
2020
mod async_dispatcher;
2121
mod batch;
2222
mod compact;
23+
mod data_statistics;
2324
mod dataset;
2425
mod delete;
2526
mod drop_columns;
@@ -40,6 +41,7 @@ pub use add_columns::*;
4041
pub use alter_columns::*;
4142
pub use batch::*;
4243
pub use compact::*;
44+
pub use data_statistics::*;
4345
pub use dataset::*;
4446
pub use delete::*;
4547
pub use drop_columns::*;

0 commit comments

Comments
 (0)