Skip to content

geoarrow-flatgeobuf does not handle writing arrow record batches if they contain columns with 100% null values #1419

@ttomasz

Description

@ttomasz

When writing multiple batches to an fgb file, if some of the batches contain columns that only have null values, the resulting fgb file will be invalid.

Some errors that I got, depending on which tool read the file or how it was generated:

IO Error:
GDAL Error (1): Invalid size detected: string value
range end index 1822473494 out of range for slice of length 333
ColumnNotFound

Example code:

#[cfg(test)]
mod test {
    use std::io::Seek;
    use std::sync::Arc;

    use arrow_array::{RecordBatch, create_array};
    use arrow_schema::{DataType, Field, Schema};
    use flatgeobuf::geozero::ToJson;
    use flatgeobuf::{FallibleStreamingIterator, FeatureProperties, FgbReader};
    use geoarrow_array::GeoArrowArray;
    use geoarrow_array::builder::PointBuilder;
    use geoarrow_flatgeobuf::writer::{FlatGeobufWriter, FlatGeobufWriterOptions};
    use geoarrow_schema::{Dimension, PointType};
    use tempfile::tempfile;
    use wkt::wkt;

    /// Reproducer: writes two record batches that share one schema to a
    /// temporary FlatGeobuf file, where the second batch's `string1` column
    /// holds only nulls, then reads the file back and prints every feature.
    ///
    /// Each `unwrap` is intentional: any write or read failure should abort
    /// the test so the error is visible in the test output.
    #[test]
    fn test_sparse_fields() {
        let point_type = PointType::new(Dimension::XY, Default::default());

        // Only used to derive the Arrow field for the geometry column.
        let sample_points = PointBuilder::from_points(
            [wkt! { POINT (0. 1.) }, wkt! { POINT (1. 2.) }].iter(),
            point_type.clone(),
        )
        .finish();

        // Three nullable attribute columns followed by the geometry column.
        let schema = Arc::new(Schema::new(vec![
            Arc::new(Field::new("u8", DataType::UInt8, true)),
            Arc::new(Field::new("string1", DataType::Utf8, true)),
            Arc::new(Field::new("string2", DataType::Utf8, true)),
            Arc::new(sample_points.data_type().to_field("geometry", true)),
        ]));

        // First batch: every column is fully populated.
        let dense_points = PointBuilder::from_points(
            [wkt! { POINT (0. 1.) }, wkt! { POINT (1. 2.) }].iter(),
            point_type.clone(),
        )
        .finish()
        .into_array_ref();
        let dense_batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                create_array!(UInt8, [1, 2]),
                create_array!(Utf8, ["a", "b"]),
                create_array!(Utf8, ["zzz", "zzz"]),
                dense_points,
            ],
        )
        .unwrap();

        // Second batch: `string1` consists solely of nulls.
        //
        // Variant WITHOUT nulls, reported as working -- swap the `string1`
        // column below for this one to compare:
        //     create_array!(Utf8, ["c", "d"]),
        let sparse_points = PointBuilder::from_points(
            [wkt! { POINT (3. 4.) }, wkt! { POINT (4. 5.) }].iter(),
            point_type.clone(),
        )
        .finish()
        .into_array_ref();
        let sparse_batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                create_array!(UInt8, [3, 4]),
                create_array!(Utf8, [None::<String>, None]),
                create_array!(Utf8, ["zzz", "zzz"]),
                sparse_points,
            ],
        )
        .unwrap();

        // Write both batches through a single writer into one temp file.
        let mut tmp = tempfile().unwrap();
        let writer_options = FlatGeobufWriterOptions::new("test".to_string());
        let mut writer = FlatGeobufWriter::try_new(&mut tmp, schema, writer_options).unwrap();
        writer.write(&dense_batch).unwrap();
        writer.write(&sparse_batch).unwrap();
        writer.finish().unwrap();

        // Go back to the start of the file and dump every feature.
        tmp.rewind().unwrap();
        let opened = FgbReader::open(&mut tmp).unwrap();
        let mut reader = opened.select_all().unwrap();
        while let Some(feature) = reader.next().unwrap() {
            // Each property is fetched right before it is printed, so the
            // output produced before a failing lookup stays the same.
            let number = feature.property::<u8>("u8").unwrap();
            println!("u8 (should be one of: 1,2,3,4): {}", number);

            let first_text = feature.property::<String>("string1").unwrap();
            println!("string1 (should be one of: a,b,c,d): {}", first_text);

            let second_text = feature.property::<String>("string2").unwrap();
            println!("string2 (should be 'zzz'): {}", second_text);

            let as_json = feature.to_json().unwrap();
            println!("{}", as_json);
            println!("-----------------------------------");
        }
    }
}

Deps:

arrow-array = "56.2.0"
arrow-schema = "56.2.0"
flatgeobuf = "5.0.0"
geoarrow-array = "0.6.2"
geozero = "0.15.1"
wkt = "0.14.0"
tempfile = "3.24.0"

I can provide some files but this probably demonstrates the issue. Note that wrong values are also read in case some field is missing.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions