Skip to content

Commit 556b3bc

Browse files
Xuanwowestonpace
andauthored
refactor: allow datafiles to contain columns without field id (#5348)
This PR removes the check that requires all columns in data files to have a field ID. The check was only for safety and doesn’t enforce any real restriction in our logic. I’m removing it so we can keep building Blob V2. --- **This PR was primarily authored with Codex using GPT-5-Codex and then hand-reviewed by me. I AM responsible for every change made in this PR. I aimed to keep it aligned with our goals, though I may have missed minor issues. Please flag anything that feels off, I'll fix it quickly.** --------- Signed-off-by: Xuanwo <github@xuanwo.io> Co-authored-by: Weston Pace <weston.pace@gmail.com>
1 parent b218725 commit 556b3bc

2 files changed

Lines changed: 29 additions & 4 deletions

File tree

rust/lance-table/src/format/fragment.rs

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ pub struct DataFile {
3333
///
3434
/// Note that -1 is a possibility and it indices that the field has
3535
/// no top-level column in the file.
36+
///
37+
/// Columns that lack a field id may still exist as extra entries in
38+
/// `column_indices`; such columns are ignored by field-id–based projection.
39+
/// For example, some fields, such as blob fields, occupy multiple
40+
/// columns in the file but only have a single field id.
3641
#[serde(default)]
3742
pub column_indices: Vec<i32>,
3843
/// The major version of the file format used to write this file.
@@ -139,10 +144,12 @@ impl DataFile {
139144
location!(),
140145
));
141146
}
142-
} else if self.fields.len() != self.column_indices.len() {
147+
} else if self.column_indices.len() < self.fields.len() {
148+
// Every recorded field id must have a column index, but not every column needs
149+
// to be associated with a field id (extra columns are allowed).
143150
return Err(Error::corrupt_file(
144151
base_path.child(self.path.clone()),
145-
"contained an unequal number of fields / column_indices",
152+
"contained fewer column_indices than fields",
146153
location!(),
147154
));
148155
}
@@ -531,6 +538,7 @@ mod tests {
531538
use arrow_schema::{
532539
DataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
533540
};
541+
use object_store::path::Path;
534542
use serde_json::{json, Value};
535543

536544
#[test]
@@ -618,4 +626,23 @@ mod tests {
618626
let frag2 = Fragment::from_json(&json).unwrap();
619627
assert_eq!(fragment, frag2);
620628
}
629+
630+
#[test]
631+
fn data_file_validate_allows_extra_columns() {
632+
let data_file = DataFile {
633+
path: "foo.lance".to_string(),
634+
fields: vec![1, 2],
635+
// One extra column without a field id mapping
636+
column_indices: vec![0, 1, 2],
637+
file_major_version: MAJOR_VERSION as u32,
638+
file_minor_version: MINOR_VERSION as u32,
639+
file_size_bytes: Default::default(),
640+
base_id: None,
641+
};
642+
643+
let base_path = Path::from("base");
644+
data_file
645+
.validate(&base_path)
646+
.expect("validation should allow extra columns without field ids");
647+
}
621648
}

rust/lance/src/dataset/fragment.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,8 +1194,6 @@ impl FileFragment {
11941194
/// Verifies:
11951195
/// * All field ids in the fragment are distinct
11961196
/// * Within each data file, field ids are in increasing order
1197-
/// * All fields in the schema have a corresponding field in one of the data
1198-
/// files
11991197
/// * All data files exist and have the same length
12001198
/// * Field ids are distinct between data files.
12011199
/// * Deletion file exists and has rowids in the correct range

0 commit comments

Comments
 (0)