diff --git a/.cursor/rules/general.mdc b/.cursor/rules/general.mdc index 5cced29..c3d09be 100644 --- a/.cursor/rules/general.mdc +++ b/.cursor/rules/general.mdc @@ -1,7 +1,7 @@ --- description: General rules about the project and coding guidelines globs: -alwaysApply: false +alwaysApply: true --- # Important Rules for LLM @@ -11,6 +11,7 @@ alwaysApply: false - If a test fails more than twice in a row, analyze the current situation and collaborate with the user to determine a solution. Avoid trial-and-error testing without a hypothesis. - The user has extensive knowledge gained from GitHub and can implement individual algorithms and libraries faster than Cline and Cursor. Code should be written while explaining to the user, using test cases to verify correctness. - However, Cline and Cursor is not good at handling processing logic based on the current context. If the context is unclear, confirm with the user. +- When I ask you to "memory it", you should update [general.mdc](mdc:.cursor/rules/general.mdc) or the corresponding document files such as [productContext.md](mdc:.cursor/rules/memory/productContext.md), [specification.md](mdc:.cursor/rules/memory/specification.md), [progress.md](mdc:.cursor/rules/memory/progress.md) so they are kept up to date. Even if I don't ask, you should ask me whether to save memory when you think it's needed. --- @@ -23,23 +24,26 @@ I am a specialized software engineer with a distinct characteristic: my memory r All files are stored under [productContext.md](mdc:.cursor/rules/memory/productContext.md), [progress.md](mdc:.cursor/rules/memory/progress.md), [specification.md](mdc:.cursor/rules/memory/specification.md) and `.cursor/memory/*`. The memory bank consists of mandatory core files and optional context files, all formatted in Markdown. ### Core Files (Mandatory) -1. **`productContext.md`** +1. **`productContext.md`** [productContext.md](mdc:.cursor/rules/memory/productContext.md) - Explains the purpose of the project. - Identifies the problem it solves. - Describes expected functionality and user experience goals. -2. **`specification.md`** +2. **`specification.md`** [specification.md](mdc:.cursor/rules/memory/specification.md) - Explains the detail of FlatCityBuf specification - Describes its encoding strategy and decisions made -2. **`progress.md`** +3. **`progress.md`** [progress.md](mdc:.cursor/rules/memory/progress.md) - Completed features and pending tasks. - Current status. - Known issues. +4. **`tdd-guideline`** [tdd-rust-guidelines.md](mdc:.cursor/rules/memory/tdd-rust-guidelines.md) + - Explains the guideline for Test-Driven Development + ### Additional Context Files -Additional files and folders can be created inside `memory-bank/` if they aid in organization: +Additional files and folders can be created inside `.cursor/rules/memory/` if they aid in organization: - Documentation for complex features. - Integration specifications. - API documentation. 
diff --git a/.cursor/rules/memory/productContext.md b/.cursor/rules/memory/productContext.md index 39abf66..c025314 100644 --- a/.cursor/rules/memory/productContext.md +++ b/.cursor/rules/memory/productContext.md @@ -245,14 +245,18 @@ async fn query_by_attribute(fcb_path: &str, field: &str, value: &str) -> Result< ### **12.4 HTTP Range Requests (JavaScript via WASM)** ```javascript -import { HttpFcbReader } from 'fcb_wasm'; +import init, { HttpFcbReader, WasmAttrQuery } from './fcb_wasm.js'; async function loadFeaturesFromUrl(url) { + // Initialize WASM module + await init(); + // Create HTTP reader - const reader = await FlatCityBufReader.fromUrl(url); + const reader = await new HttpFcbReader(url); + console.log('httpfcbreader instance created.'); // Get header information - const header = await reader.getHeader(); + const header = await reader.header(); console.log(`loaded file with ${header.features_count} features`); // Perform spatial query (only downloads necessary parts) @@ -261,11 +265,19 @@ async function loadFeaturesFromUrl(url) { max_x: 4.4, max_y: 52.1 }; - const features = await reader.queryBbox( + // Call the select_bbox method + const iter = await reader.select_bbox( bbox.min_x, bbox.min_y, bbox.max_x, bbox.max_y ); + // Iterate through features + let features = []; + let feature; + while ((feature = await iter.next()) !== null) { + features.push(feature); + } + console.log(`downloaded ${features.length} features using range requests`); return features; } diff --git a/.cursor/rules/memory/tdd-rust-guidelines.md b/.cursor/rules/memory/tdd-rust-guidelines.md new file mode 100644 index 0000000..525905c --- /dev/null +++ b/.cursor/rules/memory/tdd-rust-guidelines.md @@ -0,0 +1,64 @@ + +# Test-Driven Development (TDD) Basics in Rust + +## Core Concepts + +Test-Driven Development (TDD) follows this development cycle: + +1. **Red**: Write a failing test first. +2. **Green**: Implement the minimum necessary code to pass the test. +3. **Refactor**: Improve the code while ensuring tests still pass. + +## Key Principles + +- **Tests define the specification**: Test code expresses the expected behavior of the implementation. +- **Follow the Arrange-Act-Assert pattern**: + 1. **Arrange**: Set up the necessary test environment. + 2. **Act**: Execute the functionality under test. + 3. **Assert**: Verify the expected result. +- **Test names should follow a "Condition → Action → Expected Result" format**. Example: + - `"Given a valid token, retrieving user information should succeed"` + +## Essential Tools for the Refactoring Phase + +Once tests pass, use the following tools to refine your code: + +### 1. **Static Analysis & Linting** + - Run `cargo check` for type checking and borrow checking. + - Use `cargo clippy` to detect potential issues and enforce best practices. + +### 2. **Dead Code Detection & Removal** + - Run `cargo deadlinks` to check for dead documentation links. + - Use `cargo udeps` to find unused dependencies. + - Run `cargo rustc -- -W dead_code` to detect unused functions. + +### 3. **Code Coverage Analysis** + - Install `cargo-tarpaulin` for test coverage measurement: + ```bash + cargo install cargo-tarpaulin + cargo tarpaulin --out html + ``` + - Open the generated HTML report to review coverage. + +### 4. **Version Control with Git** + - Commit after each phase (test creation → implementation → refactoring). 
+ - Review changes before committing: + ```bash + git status # Check modified files + git add + git commit -m "" + ``` + - Use commit prefixes for clarity: + - `test:` - Adding or modifying tests + - `feat:` - Implementing new features + - `refactor:` - Code refactoring + +## Further Reading + +For more details on TDD practices in Rust, naming conventions for tests, and best practices for refactoring, refer to: + +``` +.docs/tdd-rust-guidelines.md +``` + +This file includes step-by-step instructions for test-first development, structuring test cases, and leveraging Rust’s testing framework efficiently. diff --git a/.cursor/rules/mpc.md b/.cursor/rules/mpc.md deleted file mode 100644 index abc3e78..0000000 --- a/.cursor/rules/mpc.md +++ /dev/null @@ -1,26 +0,0 @@ -# Commands to run MPV servers -## Sequential thinking - -```bash -npx -y @modelcontextprotocol/server-sequential-thinking@0.6.2 -``` - -## Memory -```bash -npx -y @modelcontextprotocol/server-memory -``` - -## GitHub -```bash -npx -y @modelcontextprotocol/server-github -``` - -## Browser-tools -```bash -npx -y @agentdeskai/browser-tools-mcp@1.1.0 -``` - -## Puppeteer -```bash -npx -y @modelcontextprotocol/server-puppeteer -``` diff --git a/.gitignore b/.gitignore index c3b09d9..368711b 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ src/rust/fcb_core/tests/data/ .cursorrules *.fcb + +.cursor/mcp.json diff --git a/src/rust/bst/Cargo.toml b/src/rust/bst/Cargo.toml index 10662ba..e8eb120 100644 --- a/src/rust/bst/Cargo.toml +++ b/src/rust/bst/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] default = ["http"] -http = ["packed_rtree"] +http = ["http-range-client", "async-trait", "bytes"] [dependencies] bincode = { workspace = true } @@ -14,5 +14,13 @@ anyhow = { workspace = true } chrono = { workspace = true } ordered-float = { workspace = true } once_cell = { workspace = true } -packed_rtree = { path = "../packed_rtree", optional = true } thiserror = { workspace = true } +http-range-client = { workspace = true, optional = true, default-features = false } +async-trait = { workspace = true, optional = true } +bytes = { workspace = true, optional = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["rt", "macros"] } +async-trait = { workspace = true } +bytes = { workspace = true } +tempfile = "3.10.1" diff --git a/src/rust/bst/README.md b/src/rust/bst/README.md new file mode 100644 index 0000000..d87ada8 --- /dev/null +++ b/src/rust/bst/README.md @@ -0,0 +1,431 @@ +# FlatCityBuf Implementation Strategy + +## Core Architecture + +FlatCityBuf uses a multi-layered architecture for efficient spatial data indexing and querying: + +1. **Index Layer**: Provides efficient key-value lookups with support for exact matches and range queries +2. **Query Layer**: Handles complex queries with multiple conditions across different indices +3. **Serialization Layer**: Enables persistent storage and streaming access to indices +4. 
**HTTP Layer**: Allows remote access to indices via HTTP range requests + +```mermaid +graph TD + subgraph "Application Layer" + A[Client Application] + end + + subgraph "Query Layer" + B[Query] + C[QueryCondition] + D[MultiIndex] + E[StreamableMultiIndex] + end + + subgraph "Index Layer" + F[BufferedIndex] + G[IndexMeta] + H[TypeErasedIndexMeta] + I[SearchableIndex] + J[TypedSearchableIndex] + end + + subgraph "Serialization Layer" + K[ByteSerializable] + L[ByteSerializableType] + M[IndexSerializable] + end + + subgraph "HTTP Layer" + N[AsyncHttpRangeClient] + O[HttpSearchResultItem] + end + + A -->|uses| B + B -->|contains| C + D -->|executes| B + E -->|executes| B + D -->|contains| I + E -->|contains| H + F -->|implements| I + F -->|implements| J + G -->|implements| J + H -->|type-erased| G + F -->|serializes via| M + I -->|uses| K + J -->|uses| K + K -->|returns| L + E -->|uses| N + N -->|returns| O +``` + +## Type System + +The type system in FlatCityBuf is built around the `ByteSerializable` trait, which provides methods for converting types to and from byte representations: + +```rust +pub trait ByteSerializable: Send + Sync { + fn to_bytes(&self) -> Vec<u8>; + fn from_bytes(bytes: &[u8]) -> Self; + fn value_type(&self) -> ByteSerializableType; +} +``` + +Key features: +- Implemented for common types (primitives, String, DateTime, etc.) +- Uses `OrderedFloat` for floating-point comparisons to handle NaN values +- Preserves type information in serialized format via `ByteSerializableType` enum +- Enables type-specific comparisons during binary search operations + +The `ByteSerializableType` enum represents all supported types: + +```rust +pub enum ByteSerializableType { + I64, I32, I16, I8, + U64, U32, U16, U8, + F64, F32, + Bool, + String, + NaiveDateTime, NaiveDate, DateTime, +} +``` + +Each type has a unique ID that is stored in the serialized index, allowing for correct type-specific comparisons when querying. + +## Index Implementation + +### BufferedIndex + +The `BufferedIndex` is an in-memory index implementation that stores key-value pairs where: +- Keys are of type `T` (which must be `Ord + ByteSerializable`) +- Values are vectors of offsets (`Vec<ValueOffset>`) pointing to the actual data + +```rust +pub struct BufferedIndex<T: Ord + ByteSerializable> { + pub entries: Vec<KeyValue<T>>, +} +``` + +Key features: +- Maintains keys in sorted order for efficient binary search +- Supports exact match and range queries +- Fully type-aware with generic parameter `T` +- Implements both `SearchableIndex` and `TypedSearchableIndex` traits + +### IndexMeta + +The `IndexMeta` structure provides metadata about an index and enables streaming access without loading the entire index into memory: + +```rust +pub struct IndexMeta<T> { + /// Number of entries in the index. + pub entry_count: u64, + /// Total size of the index in bytes. + pub size: u64, + /// Phantom data to represent the type parameter. + pub _phantom: std::marker::PhantomData<T>, +} +``` + +Key features: +- Stores only metadata, not the actual index data +- Provides methods for streaming queries directly from a file or HTTP source +- Uses binary search for efficient lookups +- Implements `TypedStreamableIndex` trait for type-safe streaming access + +### TypeErasedIndexMeta + +The `TypeErasedIndexMeta` structure is a type-erased version of `IndexMeta` that can work with any `ByteSerializable` type: + +```rust +pub struct TypeErasedIndexMeta { + /// Number of entries in the index. + pub entry_count: u64, + /// Total size of the index in bytes. 
+ pub size: u64, + /// Type identifier for the index. + pub type_id: ByteSerializableType, +} +``` + +Key features: +- Enables storing different index types in a single collection +- Performs type-specific comparisons based on the `type_id` +- Used by `StreamableMultiIndex` to handle multiple indices with different key types + +## Query System + +### Query Structure + +Queries are represented by the `Query` struct which contains a list of conditions: + +```rust +pub struct Query { + pub conditions: Vec<QueryCondition>, +} + +pub struct QueryCondition { + /// The field identifier (e.g., "id", "name", etc.) + pub field: String, + /// The comparison operator. + pub operator: Operator, + /// The key value as a byte vector (obtained via ByteSerializable::to_bytes). + pub key: Vec<u8>, +} +``` + +The system supports six comparison operators: +- `Eq`: Equal to +- `Ne`: Not equal to +- `Gt`: Greater than +- `Lt`: Less than +- `Ge`: Greater than or equal to +- `Le`: Less than or equal to + +```mermaid +graph TD + A[Query] -->|contains| B[QueryCondition] + B -->|has| C[field: String] + B -->|has| D[operator: Operator] + B -->|has| E[key: Vec] + D -->|can be| F[Eq] + D -->|can be| G[Ne] + D -->|can be| H[Gt] + D -->|can be| I[Lt] + D -->|can be| J[Ge] + D -->|can be| K[Le] +``` + +### MultiIndex + +The `MultiIndex` provides a way to query multiple indices simultaneously: + +```rust +pub struct MultiIndex { + /// A mapping from field names to their corresponding index. + pub indices: HashMap<String, Box<dyn SearchableIndex>>, +} +``` + +Key features: +- Stores multiple indices by field name +- Executes queries across all relevant indices +- Intersects results to find records that match all conditions +- Uses trait objects (`Box<dyn SearchableIndex>`) for type erasure + +### StreamableMultiIndex + +The `StreamableMultiIndex` extends the concept of `MultiIndex` for streaming access: + +```rust +pub struct StreamableMultiIndex { + /// A mapping from field names to their corresponding index metadata. + pub indices: HashMap<String, TypeErasedIndexMeta>, + /// A mapping from field names to their offsets in the file. 
+ pub index_offsets: HashMap<String, u64>, +} +``` + +Key features: +- Stores index metadata and offsets instead of the actual indices +- Enables streaming queries without loading entire indices into memory +- Properly manages cursor positioning when querying multiple indices +- Supports the same query operators as `MultiIndex` + +## Streaming Query Process + +The streaming query process follows these steps: + +```mermaid +sequenceDiagram + participant Client + participant StreamableMultiIndex + participant TypeErasedIndexMeta + participant FileReader + + Client->>StreamableMultiIndex: stream_query(query) + StreamableMultiIndex->>FileReader: Save current position + + loop For each condition in query + StreamableMultiIndex->>StreamableMultiIndex: Get index metadata and offset + StreamableMultiIndex->>FileReader: Seek to index offset + + alt Operator is Eq + StreamableMultiIndex->>TypeErasedIndexMeta: stream_query_exact(key) + TypeErasedIndexMeta->>FileReader: Binary search for key + FileReader-->>TypeErasedIndexMeta: Return matching offsets + else Operator is range-based + StreamableMultiIndex->>TypeErasedIndexMeta: stream_query_range(lower, upper) + TypeErasedIndexMeta->>FileReader: Find bounds and scan range + FileReader-->>TypeErasedIndexMeta: Return matching offsets + end + + TypeErasedIndexMeta-->>StreamableMultiIndex: Return offsets + StreamableMultiIndex->>StreamableMultiIndex: Add to candidate sets + end + + StreamableMultiIndex->>StreamableMultiIndex: Intersect all candidate sets + StreamableMultiIndex->>FileReader: Restore original position + StreamableMultiIndex-->>Client: Return matching offsets +``` + +1. **Initialization**: + - Save the current file position + - Identify the relevant indices for the query conditions + +2. **Query Execution**: + - For each condition in the query: + - Find the corresponding index metadata and offset + - Seek to the correct offset in the file + - Execute the appropriate query method (exact or range) + - Collect the results into a candidate set + +3. **Result Combination**: + - Intersect all candidate sets to find records that match all conditions + - Sort the results for consistent output + +4. 
**Cursor Management**: + - Restore the original file position after the query is complete + +## HTTP Streaming Queries + +The HTTP implementation extends the streaming concept to remote data sources: + +```mermaid +sequenceDiagram + participant Client + participant StreamableMultiIndex + participant TypeErasedIndexMeta + participant HttpClient + participant Server + + Client->>StreamableMultiIndex: http_stream_query(query) + + loop For each condition in query + StreamableMultiIndex->>StreamableMultiIndex: Get index metadata and offset + StreamableMultiIndex->>HttpClient: Request index range + HttpClient->>Server: HTTP Range Request + Server-->>HttpClient: Partial content response + + alt Operator is Eq + StreamableMultiIndex->>TypeErasedIndexMeta: http_stream_query_exact(key) + TypeErasedIndexMeta->>HttpClient: Binary search (multiple range requests) + HttpClient->>Server: HTTP Range Requests + Server-->>HttpClient: Partial content responses + HttpClient-->>TypeErasedIndexMeta: Return matching offsets + else Operator is range-based + StreamableMultiIndex->>TypeErasedIndexMeta: http_stream_query_range(lower, upper) + TypeErasedIndexMeta->>HttpClient: Find bounds and request ranges + HttpClient->>Server: HTTP Range Requests + Server-->>HttpClient: Partial content responses + HttpClient-->>TypeErasedIndexMeta: Return matching offsets + end + + TypeErasedIndexMeta-->>StreamableMultiIndex: Return offsets + StreamableMultiIndex->>StreamableMultiIndex: Add to candidate sets + end + + StreamableMultiIndex->>StreamableMultiIndex: Intersect all candidate sets + StreamableMultiIndex-->>Client: Return matching HttpSearchResultItems +``` + +Key components: + +1. **AsyncHttpRangeClient**: + - Makes HTTP range requests to fetch specific byte ranges + - Buffers data to minimize the number of requests + - Handles network errors and retries + +2. **HTTP Streaming Queries**: + - Follow the same pattern as file-based streaming queries + - Use range requests to fetch only the necessary parts of the index + - Return `HttpSearchResultItem` objects with byte ranges for feature data + +3. **Batching Strategy**: + - Group nearby offsets to reduce the number of HTTP requests + - Use a threshold parameter to control the maximum distance between offsets in a batch + - Balance between minimizing requests and avoiding excessive data transfer + +## Serialization Strategy + +### Format + +Each index is serialized with the following structure: + +``` +[Type Identifier (4 bytes)] +[Number of Entries (8 bytes)] +For each entry: + [Key Length (8 bytes)] + [Key Bytes (variable)] + [Number of Offsets (8 bytes)] + For each offset: + [Offset Value (8 bytes)] +``` + +This format: +- Preserves type information for correct deserialization +- Maintains the sorted order of keys +- Allows efficient binary search directly on the serialized data +- Supports streaming access without loading the entire index + +## Integration with CityJSON + +FlatCityBuf is designed to optimize CityJSON for cloud-based applications: + +1. **Binary Encoding**: + - Reduces file size by 50-70% compared to JSON-based CityJSONSeq + - Preserves all semantic information from the original CityJSON + +2. **Spatial Indexing**: + - Implements Hilbert R-tree for efficient spatial queries + - Enables fast retrieval of city objects by location + +3. **Attribute Indexing**: + - Creates indices for commonly queried attributes + - Supports complex queries combining spatial and attribute conditions + +4. 
**Cloud Optimization**: + - Enables partial data retrieval via HTTP range requests + - Reduces bandwidth usage by downloading only needed data + - Improves loading times for web applications + +## Performance Considerations + +1. **Memory Efficiency**: + - Only metadata is loaded into memory, not the entire index + - Streaming access minimizes memory usage for large datasets + - Type-erased indices reduce memory overhead for multiple indices + +2. **I/O Optimization**: + - Binary search minimizes the number of reads + - Cursor positioning is carefully managed to avoid unnecessary seeks + - Batched HTTP requests reduce network overhead + +3. **Type Safety**: + - Type information is preserved in the serialized format + - Type-specific comparisons ensure correct ordering + - Generic implementations provide type safety at compile time + +4. **Query Optimization**: + - Conditions are processed in order, with no specific optimization yet + - Future improvements could include reordering conditions based on selectivity + - Caching frequently accessed index parts could improve performance + +## Future Enhancements + +1. **Query Optimization**: + - Implement query planning to reorder conditions for optimal performance + - Add statistics collection for better selectivity estimation + +2. **Advanced HTTP Optimizations**: + - Implement predictive prefetching for common query patterns + - Add support for HTTP/2 multiplexing to reduce connection overhead + +3. **Compression**: + - Add optional compression for index and feature data + - Support for compressed HTTP range requests + +4. **Integration with Other Formats**: + - Extend the approach to other geospatial formats + - Add support for vector tiles and other web-friendly formats diff --git a/src/rust/bst/src/byte_serializable.rs b/src/rust/bst/src/byte_serializable.rs index 1178fd3..da71fe3 100644 --- a/src/rust/bst/src/byte_serializable.rs +++ b/src/rust/bst/src/byte_serializable.rs @@ -1,15 +1,20 @@ use chrono::{DateTime, Datelike, NaiveDate, NaiveDateTime, Utc}; pub use ordered_float::OrderedFloat; +use crate::error; + pub type Float = OrderedFloat; /// A trait for converting types to and from bytes. -pub trait ByteSerializable { +pub trait ByteSerializable: Send + Sync { /// Convert self into a vector of bytes. fn to_bytes(&self) -> Vec; /// Construct an instance from the given bytes. fn from_bytes(bytes: &[u8]) -> Self; + + /// Return the type of the value. 
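+    /// This allows, for example, a serialized index to record which concrete key type it holds.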
+ fn value_type(&self) -> ByteSerializableType; } #[derive(Debug, Clone)] @@ -31,6 +36,129 @@ pub enum ByteSerializableValue { DateTime(DateTime), } +impl ByteSerializableValue { + pub fn to_bytes(&self) -> Vec { + match self { + ByteSerializableValue::I64(i) => i.to_bytes(), + ByteSerializableValue::I32(i) => i.to_bytes(), + ByteSerializableValue::I16(i) => i.to_bytes(), + ByteSerializableValue::I8(i) => i.to_bytes(), + ByteSerializableValue::U64(i) => i.to_bytes(), + ByteSerializableValue::U32(i) => i.to_bytes(), + ByteSerializableValue::U16(i) => i.to_bytes(), + ByteSerializableValue::U8(i) => i.to_bytes(), + ByteSerializableValue::F64(i) => i.to_bytes(), + ByteSerializableValue::F32(i) => i.to_bytes(), + ByteSerializableValue::Bool(i) => i.to_bytes(), + ByteSerializableValue::String(s) => s.to_bytes(), + ByteSerializableValue::NaiveDateTime(dt) => dt.to_bytes(), + ByteSerializableValue::NaiveDate(d) => d.to_bytes(), + ByteSerializableValue::DateTime(dt) => dt.to_bytes(), + } + } +} +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ByteSerializableType { + I64, + I32, + I16, + I8, + U64, + U32, + U16, + U8, + F64, + F32, + Bool, + String, + NaiveDateTime, + NaiveDate, + DateTime, +} +impl ByteSerializableType { + pub fn to_bytes(&self) -> Vec { + // Use u32 to represent the type and serialize in little endian + let type_id: u32 = match self { + ByteSerializableType::I64 => 0, + ByteSerializableType::I32 => 1, + ByteSerializableType::I16 => 2, + ByteSerializableType::I8 => 3, + ByteSerializableType::U64 => 4, + ByteSerializableType::U32 => 5, + ByteSerializableType::U16 => 6, + ByteSerializableType::U8 => 7, + ByteSerializableType::F64 => 8, + ByteSerializableType::F32 => 9, + ByteSerializableType::Bool => 10, + ByteSerializableType::String => 11, + ByteSerializableType::NaiveDateTime => 12, + ByteSerializableType::NaiveDate => 13, + ByteSerializableType::DateTime => 14, + }; + type_id.to_le_bytes().to_vec() + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + if bytes.len() < 4 { + return Err(error::Error::InvalidType( + "not enough bytes to deserialize type".to_string(), + )); + } + + // Read u32 in little endian format + let mut type_id_bytes = [0u8; 4]; + type_id_bytes.copy_from_slice(&bytes[0..4]); + let type_id = u32::from_le_bytes(type_id_bytes); + + match type_id { + 0 => Ok(ByteSerializableType::I64), + 1 => Ok(ByteSerializableType::I32), + 2 => Ok(ByteSerializableType::I16), + 3 => Ok(ByteSerializableType::I8), + 4 => Ok(ByteSerializableType::U64), + 5 => Ok(ByteSerializableType::U32), + 6 => Ok(ByteSerializableType::U16), + 7 => Ok(ByteSerializableType::U8), + 8 => Ok(ByteSerializableType::F64), + 9 => Ok(ByteSerializableType::F32), + 10 => Ok(ByteSerializableType::Bool), + 11 => Ok(ByteSerializableType::String), + 12 => Ok(ByteSerializableType::NaiveDateTime), + 13 => Ok(ByteSerializableType::NaiveDate), + 14 => Ok(ByteSerializableType::DateTime), + _ => Err(error::Error::InvalidType(format!( + "invalid type id: {}", + type_id + ))), + } + } + + /// Convert a type ID to the corresponding ByteSerializableType + pub fn from_type_id(type_id: u32) -> Result { + match type_id { + 0 => Ok(ByteSerializableType::I64), + 1 => Ok(ByteSerializableType::I32), + 2 => Ok(ByteSerializableType::I16), + 3 => Ok(ByteSerializableType::I8), + 4 => Ok(ByteSerializableType::U64), + 5 => Ok(ByteSerializableType::U32), + 6 => Ok(ByteSerializableType::U16), + 7 => Ok(ByteSerializableType::U8), + 8 => Ok(ByteSerializableType::F64), + 9 => Ok(ByteSerializableType::F32), + 10 => 
Ok(ByteSerializableType::Bool), + 11 => Ok(ByteSerializableType::String), + 12 => Ok(ByteSerializableType::NaiveDateTime), + 13 => Ok(ByteSerializableType::NaiveDate), + 14 => Ok(ByteSerializableType::DateTime), + _ => Err(error::Error::InvalidType(format!( + "invalid type id: {}", + type_id + ))), + } + } +} + impl ByteSerializable for i64 { fn to_bytes(&self) -> Vec { self.to_le_bytes().to_vec() @@ -40,6 +168,9 @@ impl ByteSerializable for i64 { array.copy_from_slice(&bytes[0..8]); i64::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::I64 + } } impl ByteSerializable for i32 { @@ -51,6 +182,9 @@ impl ByteSerializable for i32 { array.copy_from_slice(&bytes[0..4]); i32::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::I32 + } } impl ByteSerializable for i16 { @@ -62,6 +196,9 @@ impl ByteSerializable for i16 { array.copy_from_slice(&bytes[0..2]); i16::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::I16 + } } impl ByteSerializable for i8 { @@ -71,6 +208,9 @@ impl ByteSerializable for i8 { fn from_bytes(bytes: &[u8]) -> Self { bytes[0] as i8 } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::I8 + } } impl ByteSerializable for u64 { fn to_bytes(&self) -> Vec { @@ -81,6 +221,9 @@ impl ByteSerializable for u64 { array.copy_from_slice(&bytes[0..8]); u64::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::U64 + } } impl ByteSerializable for u32 { fn to_bytes(&self) -> Vec { @@ -91,6 +234,9 @@ impl ByteSerializable for u32 { array.copy_from_slice(&bytes[0..4]); u32::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::U32 + } } impl ByteSerializable for u16 { @@ -102,6 +248,9 @@ impl ByteSerializable for u16 { array.copy_from_slice(&bytes[0..2]); u16::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::U16 + } } impl ByteSerializable for u8 { @@ -111,6 +260,9 @@ impl ByteSerializable for u8 { fn from_bytes(bytes: &[u8]) -> Self { bytes[0] } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::U8 + } } impl ByteSerializable for String { @@ -120,6 +272,9 @@ impl ByteSerializable for String { fn from_bytes(bytes: &[u8]) -> Self { String::from_utf8(bytes.to_vec()).unwrap() } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::String + } } impl ByteSerializable for f64 { @@ -131,17 +286,31 @@ impl ByteSerializable for f64 { array.copy_from_slice(&bytes[0..8]); f64::from_le_bytes(array) } + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::F64 + } } impl ByteSerializable for f32 { fn to_bytes(&self) -> Vec { self.to_le_bytes().to_vec() } + fn from_bytes(bytes: &[u8]) -> Self { + // If the byte slice is empty, return a default value + if bytes.is_empty() { + return 0.0; + } + + // Otherwise, convert the bytes to an f32 let mut array = [0u8; 4]; array.copy_from_slice(&bytes[0..4]); f32::from_le_bytes(array) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::F32 + } } // Implement ByteSerializable for Float because f64 doesn't implement Ord trait because of NaN values. 
@@ -155,6 +324,10 @@ impl ByteSerializable for Float { array.copy_from_slice(&bytes[0..8]); OrderedFloat(f64::from_le_bytes(array)) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::F64 + } } // Implement ByteSerializable for Float because f32 doesn't implement Ord trait because of NaN values. @@ -162,11 +335,22 @@ impl ByteSerializable for Float { fn to_bytes(&self) -> Vec { self.0.to_le_bytes().to_vec() } + fn from_bytes(bytes: &[u8]) -> Self { + // If the byte slice is empty, return a default value + if bytes.is_empty() { + return OrderedFloat(0.0); + } + + // Otherwise, convert the bytes to an f32 let mut array = [0u8; 4]; array.copy_from_slice(&bytes[0..4]); OrderedFloat(f32::from_le_bytes(array)) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::F32 + } } impl ByteSerializable for bool { @@ -177,6 +361,10 @@ impl ByteSerializable for bool { fn from_bytes(bytes: &[u8]) -> Self { bytes.first().is_some_and(|&b| b != 0) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::Bool + } } /// We serialize a NaiveDateTime as 12 bytes: @@ -190,8 +378,6 @@ impl ByteSerializable for NaiveDateTime { } fn from_bytes(bytes: &[u8]) -> Self { - // Ensure there are at least 12 bytes. - assert!(bytes.len() >= 12, "Not enough bytes for NaiveDateTime"); let mut ts_bytes = [0u8; 8]; ts_bytes.copy_from_slice(&bytes[0..8]); let timestamp = i64::from_le_bytes(ts_bytes); @@ -202,6 +388,10 @@ impl ByteSerializable for NaiveDateTime { NaiveDateTime::from_timestamp(timestamp, nanosecond) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::NaiveDateTime + } } /// We serialize a NaiveDate as 4 bytes: @@ -233,6 +423,10 @@ impl ByteSerializable for NaiveDate { ) .unwrap() } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::NaiveDate + } } /// Since DateTime is essentially a NaiveDateTime with an offset, @@ -246,4 +440,8 @@ impl ByteSerializable for DateTime { let naive = ::from_bytes(bytes); DateTime::::from_utc(naive, Utc) } + + fn value_type(&self) -> ByteSerializableType { + ByteSerializableType::DateTime + } } diff --git a/src/rust/bst/src/error.rs b/src/rust/bst/src/error.rs index 3bc65ff..54b9317 100644 --- a/src/rust/bst/src/error.rs +++ b/src/rust/bst/src/error.rs @@ -1,7 +1,16 @@ +use crate::byte_serializable::ByteSerializableType; use thiserror::Error; #[derive(Error, Debug)] pub enum Error { #[error("io error: {0}")] Io(#[from] std::io::Error), + #[error("invalid type: {0}")] + InvalidType(String), + #[error("invalid type id: {0}")] + InvalidTypeId(u32), + #[error("type mismatch: expected {1:?}, got {0:?}")] + TypeMismatch(ByteSerializableType, ByteSerializableType), + #[error("invalid byte serializable value")] + InvalidByteSerializableValue, } diff --git a/src/rust/bst/src/lib.rs b/src/rust/bst/src/lib.rs index 1c98236..db0af39 100644 --- a/src/rust/bst/src/lib.rs +++ b/src/rust/bst/src/lib.rs @@ -10,7 +10,7 @@ pub use sorted_index::*; mod tests { use crate::byte_serializable::ByteSerializable; use crate::query::{MultiIndex, Operator, Query, QueryCondition}; - use crate::sorted_index::{IndexSerializable, KeyValue, SortedIndex, ValueOffset}; + use crate::sorted_index::{BufferedIndex, IndexSerializable, KeyValue, ValueOffset}; use crate::Float; use chrono::NaiveDate; use ordered_float::OrderedFloat; @@ -125,13 +125,13 @@ mod tests { } // Create SortedIndices and build each index. 
- let mut id_index = SortedIndex::new(); + let mut id_index = BufferedIndex::new(); id_index.build_index(id_entries); - let mut city_index = SortedIndex::new(); + let mut city_index = BufferedIndex::new(); city_index.build_index(city_entries); - let mut height_index = SortedIndex::new(); + let mut height_index = BufferedIndex::new(); height_index.build_index(height_entries); - let mut year_index = SortedIndex::new(); + let mut year_index = BufferedIndex::new(); year_index.build_index(year_entries); // Create a MultiIndex and register each index by field name. @@ -285,6 +285,26 @@ mod tests { let result9 = multi_index.query(query9); assert_eq!(result9, vec![1, 2, 4, 6, 7]); + let query10 = Query { + conditions: vec![QueryCondition { + field: "height".to_string(), + operator: Operator::Lt, + key: (30.6f64).to_bytes(), + }], + }; + let result10 = multi_index.query(query10); + assert_eq!(result10, vec![0, 2, 6]); + + let query11 = Query { + conditions: vec![QueryCondition { + field: "height".to_string(), + operator: Operator::Le, + key: (30.6f64).to_bytes(), + }], + }; + let result11 = multi_index.query(query11); + assert_eq!(result11, vec![0, 2, 3, 6]); + Ok(()) } @@ -332,9 +352,9 @@ mod tests { } // Create SortedIndices and build each index. - let mut id_index = SortedIndex::new(); + let mut id_index = BufferedIndex::new(); id_index.build_index(id_entries); - let mut city_index = SortedIndex::new(); + let mut city_index = BufferedIndex::new(); city_index.build_index(city_entries); let mut id_index_bytes = Vec::new(); @@ -342,9 +362,9 @@ mod tests { let mut city_index_bytes = Vec::new(); city_index.serialize(&mut city_index_bytes)?; - let id_index_deserialized = SortedIndex::::deserialize(&mut &id_index_bytes[..])?; + let id_index_deserialized = BufferedIndex::::deserialize(&mut &id_index_bytes[..])?; let city_index_deserialized = - SortedIndex::::deserialize(&mut &city_index_bytes[..])?; + BufferedIndex::::deserialize(&mut &city_index_bytes[..])?; assert_eq!(id_index.entries, id_index_deserialized.entries); assert_eq!(city_index.entries, city_index_deserialized.entries); diff --git a/src/rust/bst/src/query.rs b/src/rust/bst/src/query.rs index ff81950..16a3d2b 100644 --- a/src/rust/bst/src/query.rs +++ b/src/rust/bst/src/query.rs @@ -1,13 +1,18 @@ -use std::collections::{HashMap, HashSet}; +use crate::sorted_index::{SearchableIndex, ValueOffset}; +use crate::{error, sorted_index, ByteSerializable, ByteSerializableType}; +use std::collections::HashMap; +use std::collections::HashSet; +use std::io::{Read, Seek, SeekFrom}; -use crate::sorted_index::{AnyIndex, ValueOffset}; +use chrono::{DateTime, Utc}; +#[cfg(feature = "http")] +use http_range_client::{AsyncBufferedHttpRangeClient, AsyncHttpRangeClient}; -use crate::error::Error; #[cfg(feature = "http")] -use packed_rtree::http::{HttpRange, HttpSearchResultItem}; +use std::ops::Range; -/// Operators for comparisons in queries. -#[derive(Debug, Clone, Copy)] +/// Comparison operators for queries. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Operator { Eq, Ne, @@ -17,7 +22,9 @@ pub enum Operator { Le, } -/// A query condition now refers to a field by name and carries the serialized key. +/// A condition in a query, consisting of a field name, an operator, and a key value. +/// +/// The key value is stored as a byte vector, obtained via ByteSerializable::to_bytes. #[derive(Debug, Clone)] pub struct QueryCondition { /// The field identifier (e.g., "id", "name", etc.) 
@@ -28,15 +35,16 @@ pub struct QueryCondition { pub key: Vec, } -/// A query is a set of conditions (implicitly AND‑combined). +/// A query consisting of one or more conditions. #[derive(Debug, Clone)] pub struct Query { pub conditions: Vec, } +/// A multi-index that maps field names to their corresponding indices. pub struct MultiIndex { /// A mapping from field names to their corresponding index. - pub indices: HashMap>, + pub indices: HashMap>, } impl Default for MultiIndex { @@ -46,21 +54,21 @@ impl Default for MultiIndex { } impl MultiIndex { - /// Create an empty MultiIndex. + /// Create a new, empty multi-index. pub fn new() -> Self { - MultiIndex { + Self { indices: HashMap::new(), } } - /// Register an index under the given field name. - pub fn add_index(&mut self, field_name: String, index: Box) { + /// Add an index for a field. + pub fn add_index(&mut self, field_name: String, index: Box) { self.indices.insert(field_name, index); } - /// Execute a query over the registered indices. - /// For each condition, candidate offsets are retrieved from the corresponding index. - /// The final result is the intersection of candidates from all conditions. + /// Execute a query against the multi-index. + /// + /// Returns a vector of offsets for records that match all conditions in the query. pub fn query(&self, query: Query) -> Vec { let mut candidate_sets: Vec> = Vec::new(); @@ -125,70 +133,856 @@ impl MultiIndex { result.sort(); result } + + /// Performs a streaming query on the multi-index without loading the entire index into memory. + /// This is useful for large indices where loading the entire index would be inefficient. + /// + /// # Arguments + /// + /// * `reader` - A reader positioned at the start of the index data + /// * `query` - The query to execute + /// * `index_offsets` - A map of field names to their byte offsets in the file + /// + /// # Returns + /// + /// A vector of value offsets that match the query + pub fn stream_query( + &self, + reader: &mut R, + query: &Query, + index_offsets: &HashMap, + ) -> Result, error::Error> { + // If there are no conditions, return an empty result. + if query.conditions.is_empty() { + return Ok(Vec::new()); + } + + let field_names: Vec = query.conditions.iter().map(|c| c.field.clone()).collect(); + + // Only load the indices needed for this query + let filtered_offsets: HashMap = index_offsets + .iter() + .filter(|(k, _)| field_names.contains(k)) + .map(|(k, v)| (k.clone(), *v)) + .collect(); + + let streamable_index = StreamableMultiIndex::from_reader(reader, &filtered_offsets)?; + + // Execute the query using the streamable index + streamable_index.stream_query(reader, query) + } + + #[cfg(feature = "http")] + /// Performs a streaming query on the multi-index over HTTP without loading the entire index into memory. + /// This is useful for large indices where loading the entire index would be inefficient. + /// + /// # Arguments + /// + /// * `client` - An HTTP client for making range requests + /// * `query` - The query to execute + /// * `index_offsets` - A map of field names to their byte offsets in the file + /// * `feature_begin` - The byte offset where the feature data begins + /// + /// # Returns + /// + /// A vector of HTTP search result items that match the query + pub async fn http_stream_query( + &self, + client: &mut AsyncBufferedHttpRangeClient, + query: &Query, + index_offsets: &HashMap, + feature_begin: usize, + ) -> std::io::Result> { + // If there are no conditions, return an empty result. 
+ if query.conditions.is_empty() { + return Ok(Vec::new()); + } + todo!() + } } -// TODO: improve this method to process on stream. Also, do something to avoid fetching many discrete ranges. #[cfg(feature = "http")] -pub async fn stream_query( - m_indices: &MultiIndex, - query: Query, - feature_begin: usize, -) -> Result, Error> { - // Compute candidate offset set for each query condition. - - let mut candidate_sets: Vec> = Vec::new(); - for condition in query.conditions.iter() { - if let Some(idx) = m_indices.indices.get(&condition.field) { - let offsets: Vec = match condition.operator { - Operator::Eq => idx.query_exact_bytes(&condition.key), - Operator::Gt => { - let offsets = idx.query_range_bytes(Some(&condition.key), None); - let eq = idx.query_exact_bytes(&condition.key); - offsets.into_iter().filter(|o| !eq.contains(o)).collect() +#[derive(Debug, Clone)] +pub enum HttpRange { + Range(Range), + RangeFrom(std::ops::RangeFrom), +} + +#[cfg(feature = "http")] +impl HttpRange { + pub fn start(&self) -> usize { + match self { + HttpRange::Range(range) => range.start, + HttpRange::RangeFrom(range) => range.start, + } + } + + pub fn end(&self) -> Option { + match self { + HttpRange::Range(range) => Some(range.end), + HttpRange::RangeFrom(_) => None, + } + } +} + +#[cfg(feature = "http")] +#[derive(Debug, Clone)] +pub struct HttpSearchResultItem { + /// Byte range in the feature data section + pub range: HttpRange, +} + +/// Type-erased IndexMeta that can work with any ByteSerializable type. +/// This allows us to store different IndexMeta instances in a HashMap. +#[derive(Debug, Clone)] +pub struct TypeErasedIndexMeta { + /// Number of entries in the index. + pub entry_count: u64, + /// Total size of the index in bytes. + pub size: u64, + /// Type identifier for the index. + pub type_id: ByteSerializableType, +} + +impl TypeErasedIndexMeta { + /// Create a new TypeErasedIndexMeta from an IndexMeta. + pub fn from_generic( + index_meta: &sorted_index::IndexMeta, + type_id: ByteSerializableType, + ) -> Self { + Self { + entry_count: index_meta.entry_count, + size: index_meta.size, + type_id, + } + } + + /// Read and deserialize stream query exact results. 
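+    /// Binary-searches the serialized entries for `key`, comparing keys in their byte form, and returns every offset stored under a matching entry; the reader is restored to its original position afterwards.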
+ pub fn stream_query_exact( + &self, + reader: &mut R, + key: &[u8], + ) -> std::io::Result> { + // Store current position to restore later + let start_pos = reader.stream_position()?; + + // Skip the type ID (4 bytes) and entry count (8 bytes) + reader.seek(SeekFrom::Start(start_pos + 12))?; + + // Binary search through the index + let mut left = 0; + let mut right = self.entry_count as i64 - 1; + let mut result = Vec::new(); + + while left <= right { + let mid = left + (right - left) / 2; + + // Seek to the entry at the mid position + self.seek_to_entry(reader, mid as u64, start_pos)?; + + // Read key length and key + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes) as usize; + + let mut key_buf = vec![0u8; key_len]; + reader.read_exact(&mut key_buf)?; + + // Compare keys based on the type + let comparison = self.compare_keys(&key_buf, key); + + match comparison { + std::cmp::Ordering::Equal => { + // Found a match, read offsets + let mut offsets_len_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_len_bytes)?; + let offsets_len = u64::from_le_bytes(offsets_len_bytes) as usize; + + for _ in 0..offsets_len { + let mut offset_bytes = [0u8; 8]; + reader.read_exact(&mut offset_bytes)?; + let offset = u64::from_le_bytes(offset_bytes); + result.push(offset); + } + break; } - Operator::Ge => idx.query_range_bytes(Some(&condition.key), None), - Operator::Lt => idx.query_range_bytes(None, Some(&condition.key)), - Operator::Le => { - let mut offsets = idx.query_range_bytes(None, Some(&condition.key)); - let eq = idx.query_exact_bytes(&condition.key); - offsets.extend(eq); - // Remove duplicates. - offsets - .into_iter() - .collect::>() + std::cmp::Ordering::Less => { + left = mid + 1; + } + std::cmp::Ordering::Greater => { + right = mid - 1; + } + } + } + + // Reset position + reader.seek(SeekFrom::Start(start_pos))?; + + Ok(result) + } + + /// Read and deserialize stream query range results. + pub fn stream_query_range( + &self, + reader: &mut R, + lower: Option<&[u8]>, + upper: Option<&[u8]>, + ) -> std::io::Result> { + // Store current position to restore later + let start_pos = reader.stream_position()?; + + // Find the starting position based on lower bound + let start_index = if let Some(lower_bound) = lower { + self.find_lower_bound(reader, lower_bound, start_pos)? + } else { + 0 + }; + + // Seek to the starting entry + self.seek_to_entry(reader, start_index, start_pos)?; + + let mut result = Vec::new(); + + // Iterate through entries until we hit the upper bound + for i in start_index..self.entry_count { + // Read key length + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes) as usize; + + // Read key bytes + let mut key_buf = vec![0u8; key_len]; + reader.read_exact(&mut key_buf)?; + + // Check upper bound + if let Some(upper_bound) = upper { + if self.compare_keys(&key_buf, upper_bound) != std::cmp::Ordering::Less { + break; + } + } + + // Read offsets + let mut offsets_len_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_len_bytes)?; + let offsets_len = u64::from_le_bytes(offsets_len_bytes) as usize; + + for _ in 0..offsets_len { + let mut offset_bytes = [0u8; 8]; + reader.read_exact(&mut offset_bytes)?; + let offset = u64::from_le_bytes(offset_bytes); + result.push(offset); + } + } + + // Reset position + reader.seek(SeekFrom::Start(start_pos))?; + + Ok(result) + } + + /// Helper method to seek to a specific entry in the index. 
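+    /// Entries are variable-length, so this seeks back to the start of the index and skips whole entries (key plus offsets) until `entry_index` is reached.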
+ fn seek_to_entry( + &self, + reader: &mut R, + entry_index: u64, + start_pos: u64, + ) -> std::io::Result<()> { + // Reset to the beginning of the index + reader.seek(SeekFrom::Start(start_pos))?; + + // Skip the type identifier and entry count + reader.seek(SeekFrom::Current(12))?; + + // Iterate through entries until we reach the target + for _ in 0..entry_index { + // Read key length + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes) as usize; + + // Skip key bytes + reader.seek(SeekFrom::Current(key_len as i64))?; + + // Read offsets length + let mut offsets_len_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_len_bytes)?; + let offsets_len = u64::from_le_bytes(offsets_len_bytes) as usize; + + // Skip offset bytes + reader.seek(SeekFrom::Current((offsets_len * 8) as i64))?; + } + + Ok(()) + } + + /// Helper method to find the lower bound index for range queries. + fn find_lower_bound( + &self, + reader: &mut R, + lower_bound: &[u8], + start_pos: u64, + ) -> std::io::Result { + // Binary search to find the lower bound + let mut left = 0; + let mut right = self.entry_count as i64 - 1; + let mut result = 0; + + while left <= right { + let mid = left + (right - left) / 2; + + // Seek to the mid entry + self.seek_to_entry(reader, mid as u64, start_pos)?; + + // Read key length + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes) as usize; + + // Read key bytes + let mut key_buf = vec![0u8; key_len]; + reader.read_exact(&mut key_buf)?; + + // Compare keys + let ordering = self.compare_keys(&key_buf, lower_bound); + + match ordering { + std::cmp::Ordering::Equal => { + result = mid as u64; + break; + } + std::cmp::Ordering::Less => { + left = mid + 1; + result = left as u64; + } + std::cmp::Ordering::Greater => { + right = mid - 1; + } + } + } + + Ok(result) + } + + // TODO: Fix me!!!!!! + /// Helper method to compare keys based on the type identifier. 
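+    /// Floats are compared through `OrderedFloat`, strings as UTF-8, and `DateTime` values after deserialization; all other types currently fall back to a plain byte-wise comparison (see the TODO above).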
+ fn compare_keys(&self, key_bytes: &[u8], query_key: &[u8]) -> std::cmp::Ordering { + match self.type_id { + ByteSerializableType::F32 => { + // OrderedFloat + if key_bytes.len() == 4 && query_key.len() == 4 { + let key_val = ordered_float::OrderedFloat(f32::from_le_bytes([ + key_bytes[0], + key_bytes[1], + key_bytes[2], + key_bytes[3], + ])); + let query_val = ordered_float::OrderedFloat(f32::from_le_bytes([ + query_key[0], + query_key[1], + query_key[2], + query_key[3], + ])); + + key_val + .partial_cmp(&query_val) + .unwrap_or(std::cmp::Ordering::Equal) + } else { + key_bytes.cmp(query_key) + } + } + ByteSerializableType::F64 => { + // OrderedFloat + if key_bytes.len() == 8 && query_key.len() == 8 { + let key_val = ordered_float::OrderedFloat(f64::from_le_bytes([ + key_bytes[0], + key_bytes[1], + key_bytes[2], + key_bytes[3], + key_bytes[4], + key_bytes[5], + key_bytes[6], + key_bytes[7], + ])); + let query_val = ordered_float::OrderedFloat(f64::from_le_bytes([ + query_key[0], + query_key[1], + query_key[2], + query_key[3], + query_key[4], + query_key[5], + query_key[6], + query_key[7], + ])); + + key_val + .partial_cmp(&query_val) + .unwrap_or(std::cmp::Ordering::Equal) + } else { + key_bytes.cmp(query_key) + } + } + ByteSerializableType::String => { + // Try to convert to strings for comparison + match ( + std::str::from_utf8(key_bytes), + std::str::from_utf8(query_key), + ) { + (Ok(key_str), Ok(query_str)) => key_str.cmp(query_str), + _ => key_bytes.cmp(query_key), + } + } + ByteSerializableType::DateTime => { + // DateTime + let key_val = DateTime::::from_bytes(key_bytes); + let query_val = DateTime::::from_bytes(query_key); + key_val.cmp(&query_val) + // if key_bytes.len() == 8 && query_key.len() == 8 { + // let key_val = DateTime::::from_bytes(key_bytes); + // let query_val = DateTime::::from_bytes(query_key); + + // key_val.cmp(&query_val) + // } else { + // key_bytes.cmp(query_key) + // } + } + + // For all other types, we can directly compare the byte slices + _ => key_bytes.cmp(query_key), + } + } +} + +/// A multi-index that can be streamed from a reader. +#[derive(Default)] +pub struct StreamableMultiIndex { + /// A mapping from field names to their corresponding index metadata. + pub indices: HashMap, + /// A mapping from field names to their offsets in the file. + pub index_offsets: HashMap, +} + +impl StreamableMultiIndex { + /// Create a new, empty streamable multi-index. + pub fn new() -> Self { + Self { + indices: HashMap::new(), + index_offsets: HashMap::new(), + } + } + + /// Add an index for a field. + pub fn add_index(&mut self, field_name: String, index: TypeErasedIndexMeta) { + self.indices.insert(field_name, index); + } + + /// Create a streamable multi-index from a reader. 
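+    /// For each field in `index_offsets`, this reads the stored type id and entry count, then walks the entries once to compute the serialized size of that index.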
+ pub fn from_reader( + reader: &mut R, + index_offsets: &HashMap, + ) -> Result { + let mut multi_index = Self::new(); + + // Copy the index offsets + for (field, offset) in index_offsets { + multi_index.index_offsets.insert(field.clone(), *offset); + } + + // Get the type identifier and entry count for each index + for (field, offset) in index_offsets { + reader.seek(SeekFrom::Start(*offset))?; + + // Read the type ID + let mut type_id_bytes = [0u8; 4]; + reader.read_exact(&mut type_id_bytes)?; + let type_id = ByteSerializableType::from_type_id(u32::from_le_bytes(type_id_bytes))?; + + // Read the entry count + let mut entry_count_bytes = [0u8; 8]; + reader.read_exact(&mut entry_count_bytes)?; + let entry_count = u64::from_le_bytes(entry_count_bytes); + + // Get the size of the index by reading through all entries + let start_pos = *offset; + reader.seek(SeekFrom::Start(start_pos + 12))?; // Skip type ID and entry count + + let mut curr_pos = start_pos + 12; + + // For each entry, skip over the key and offsets + for _ in 0..entry_count { + // Read key length + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Skip key bytes + reader.seek(SeekFrom::Current(key_len as i64))?; + + // Read offsets length + let mut offsets_len_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_len_bytes)?; + let offsets_len = u64::from_le_bytes(offsets_len_bytes); + + // Skip offset bytes + reader.seek(SeekFrom::Current((offsets_len * 8) as i64))?; + + curr_pos = reader.stream_position()?; + } + + // Calculate the size of the index + let size = curr_pos - start_pos; + + // Create a type-erased index meta and add it to the multi-index + let index_meta = TypeErasedIndexMeta { + entry_count, + size, + type_id, + }; + + multi_index.add_index(field.clone(), index_meta); + } + + Ok(multi_index) + } + + /// Execute a query on the multi-index. 
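+    /// Each condition is evaluated against its index via streaming reads, the per-condition offset sets are intersected, and the sorted result is returned with the reader restored to its original position.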
+ pub fn stream_query( + &self, + reader: &mut R, + query: &Query, + ) -> Result, error::Error> { + // Save the current position to restore later + let start_pos = reader.stream_position()?; + + // Process each condition and collect the results + let mut all_results: Option> = None; + + for condition in &query.conditions { + // Get the index for this field + let index_meta = match self.indices.get(&condition.field) { + Some(index) => index, + None => { + continue; + } + }; + + // Get the offset for this field + let offset = match self.index_offsets.get(&condition.field) { + Some(offset) => *offset, + None => { + continue; + } + }; + + // Seek to the start of the index + reader.seek(SeekFrom::Start(offset))?; + + // Execute the query based on the operator + let results = match condition.operator { + Operator::Eq => index_meta.stream_query_exact(reader, &condition.key)?, + Operator::Ne => { + // For not equal, we need to get all results and filter out the matching ones + let matching = index_meta.stream_query_exact(reader, &condition.key)?; + let all = index_meta.stream_query_range(reader, None, None)?; + all.into_iter().filter(|v| !matching.contains(v)).collect() + } + Operator::Gt => { + // For greater than, we get the range but exclude exact matches + let range_results = + index_meta.stream_query_range(reader, Some(&condition.key), None)?; + let exact_matches = index_meta.stream_query_exact(reader, &condition.key)?; + + // Filter out exact matches from range results + range_results .into_iter() + .filter(|v| !exact_matches.contains(v)) .collect() } - Operator::Ne => { - let all: HashSet = - idx.query_range_bytes(None, None).into_iter().collect(); - let eq: HashSet = - idx.query_exact_bytes(&condition.key).into_iter().collect(); - all.difference(&eq).cloned().collect::>() + Operator::Lt => { + index_meta.stream_query_range(reader, None, Some(&condition.key))? + } + Operator::Ge => { + // For greater than or equal, we include the key + index_meta.stream_query_range(reader, Some(&condition.key), None)? + } + Operator::Le => { + // For less than or equal, we include the key + index_meta.stream_query_range(reader, None, Some(&condition.key))? 
} }; - candidate_sets.push(offsets.into_iter().collect()); + + // Intersect with previous results + match all_results { + None => { + all_results = Some(results.into_iter().collect()); + } + Some(ref mut existing) => { + let new_results: HashSet = results.into_iter().collect(); + *existing = existing.intersection(&new_results).cloned().collect(); + } + } } + + // Restore the original position + reader.seek(SeekFrom::Start(start_pos))?; + + // Convert the results to a sorted vector + let mut result_vec = match all_results { + Some(set) => set.into_iter().collect::>(), + None => Vec::new(), + }; + + result_vec.sort(); + + Ok(result_vec) + } + + #[cfg(feature = "http")] + pub async fn http_stream_query( + &self, + client: &mut AsyncBufferedHttpRangeClient, + query: &Query, + index_offset: usize, + feature_begin: usize, + ) -> std::io::Result> { + // TODO: Implement HTTP streaming query + unimplemented!("HTTP streaming query not yet implemented for TypeErasedIndexMeta"); + } + + #[cfg(feature = "http")] + pub async fn http_stream_query_batched( + &self, + client: &mut AsyncBufferedHttpRangeClient, + query: &Query, + index_offset: usize, + feature_begin: usize, + batch_threshold: usize, + ) -> std::io::Result> { + // TODO: Implement batched HTTP streaming query + unimplemented!("Batched HTTP streaming query not yet implemented for TypeErasedIndexMeta"); } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::sorted_index::BufferedIndex; + use crate::IndexSerializable; + use crate::KeyValue; + + use ordered_float::OrderedFloat; + use std::io::Cursor; - if candidate_sets.is_empty() { - return Ok(vec![]); + fn create_sample_height_index() -> BufferedIndex> { + let mut index = BufferedIndex::new(); + let entries = vec![ + KeyValue { + key: OrderedFloat(10.5), + offsets: vec![1, 2, 3], + }, + KeyValue { + key: OrderedFloat(20.0), + offsets: vec![4, 5], + }, + KeyValue { + key: OrderedFloat(30.0), + offsets: vec![6, 7, 8], + }, + KeyValue { + key: OrderedFloat(74.5), + offsets: vec![9, 10], + }, + ]; + index.build_index(entries); + index } - // Intersect candidate sets to get matching offsets. 
- let mut intersection: HashSet = candidate_sets.first().unwrap().clone(); - for set in candidate_sets.iter().skip(1) { - intersection = intersection.intersection(set).cloned().collect(); + fn create_sample_id_index() -> BufferedIndex { + let mut index = BufferedIndex::new(); + let entries = vec![ + KeyValue { + key: "a1".to_string(), + offsets: vec![1, 2], + }, + KeyValue { + key: "b2".to_string(), + offsets: vec![3, 4, 5], + }, + KeyValue { + key: "c3".to_string(), + offsets: vec![6, 7], + }, + KeyValue { + key: "d4".to_string(), + offsets: vec![8, 9, 10], + }, + ]; + index.build_index(entries); + index } - let mut offsets: Vec = intersection.into_iter().collect(); - offsets.sort(); // ascending order - let http_ranges: Vec = offsets - .into_iter() - .map(|offset| HttpSearchResultItem { - range: HttpRange::RangeFrom(offset as usize + feature_begin..), - }) - .collect(); + fn create_serialized_height_index() -> Vec { + let index = create_sample_height_index(); + let mut buffer = Vec::new(); + index.serialize(&mut buffer).unwrap(); + buffer + } + + fn create_serialized_id_index() -> Vec { + let index = create_sample_id_index(); + let mut buffer = Vec::new(); + index.serialize(&mut buffer).unwrap(); + buffer + } - Ok(http_ranges) + #[test] + fn test_streamable_multi_index_from_reader() -> Result<(), error::Error> { + // Create serialized indices + let height_buffer = create_serialized_height_index(); + let id_buffer = create_serialized_id_index(); + + // Create a combined buffer with both indices + let mut combined_buffer = Vec::new(); + combined_buffer.extend_from_slice(&height_buffer); + combined_buffer.extend_from_slice(&id_buffer); + + // Create a cursor for the combined buffer + let mut cursor = Cursor::new(&combined_buffer); + + // Create index offsets + let mut index_offsets = HashMap::new(); + index_offsets.insert("height".to_string(), 0); + index_offsets.insert("id".to_string(), height_buffer.len() as u64); + + // Create a streamable multi-index + let multi_index = StreamableMultiIndex::from_reader(&mut cursor, &index_offsets)?; + + // Verify the indices were loaded correctly + assert_eq!(multi_index.indices.len(), 2); + assert!(multi_index.indices.contains_key("height")); + assert!(multi_index.indices.contains_key("id")); + + // Verify the offsets were stored correctly + assert_eq!(multi_index.index_offsets.len(), 2); + assert_eq!(multi_index.index_offsets.get("height"), Some(&0)); + assert_eq!( + multi_index.index_offsets.get("id"), + Some(&(height_buffer.len() as u64)) + ); + + // Verify the type IDs are correct + assert_eq!( + multi_index.indices.get("height").unwrap().type_id, + ByteSerializableType::F32 + ); + assert_eq!( + multi_index.indices.get("id").unwrap().type_id, + ByteSerializableType::String + ); + + Ok(()) + } + + #[test] + fn test_streamable_multi_index_queries() -> Result<(), error::Error> { + // Create serialized indices + let height_buffer = create_serialized_height_index(); + let id_buffer = create_serialized_id_index(); + + // Create a combined buffer with both indices + let mut combined_buffer = Vec::new(); + combined_buffer.extend_from_slice(&height_buffer); + combined_buffer.extend_from_slice(&id_buffer); + + // Create a cursor for the combined buffer + let mut cursor = Cursor::new(&combined_buffer); + + // Create index offsets + let mut index_offsets = HashMap::new(); + index_offsets.insert("height".to_string(), 0); + index_offsets.insert("id".to_string(), height_buffer.len() as u64); + + // Create a streamable multi-index + let multi_index = 
StreamableMultiIndex::from_reader(&mut cursor, &index_offsets)?; + + // Define test cases + struct TestCase { + name: &'static str, + query: Query, + expected: Vec, + } + + let test_cases = vec![ + TestCase { + name: "Exact height match", + query: Query { + conditions: vec![QueryCondition { + field: "height".to_string(), + operator: Operator::Eq, + key: OrderedFloat(30.0f32).to_bytes(), + }], + }, + expected: vec![6, 7, 8], + }, + TestCase { + name: "Height range query", + query: Query { + conditions: vec![QueryCondition { + field: "height".to_string(), + operator: Operator::Gt, + key: OrderedFloat(20.0f32).to_bytes(), + }], + }, + expected: vec![6, 7, 8, 9, 10], + }, + TestCase { + name: "Exact ID match", + query: Query { + conditions: vec![QueryCondition { + field: "id".to_string(), + operator: Operator::Eq, + key: "c3".to_string().to_bytes(), + }], + }, + expected: vec![6, 7], + }, + TestCase { + name: "Combined query (height and ID)", + query: Query { + conditions: vec![ + QueryCondition { + field: "height".to_string(), + operator: Operator::Ge, + key: OrderedFloat(30.0f32).to_bytes(), + }, + QueryCondition { + field: "id".to_string(), + operator: Operator::Eq, + key: "c3".to_string().to_bytes(), + }, + ], + }, + expected: vec![6, 7], + }, + ]; + + // Run the test cases + for test_case in test_cases { + println!("Running test case: {}", test_case.name); + + // Reset cursor position + cursor.set_position(0); + + // Execute the query + let results = multi_index.stream_query(&mut cursor, &test_case.query)?; + + // Verify the results + assert_eq!( + results, test_case.expected, + "Test case '{}' failed: expected {:?}, got {:?}", + test_case.name, test_case.expected, results + ); + } + + Ok(()) + } } diff --git a/src/rust/bst/src/sorted_index.rs b/src/rust/bst/src/sorted_index.rs index 03e01be..baf9bf1 100644 --- a/src/rust/bst/src/sorted_index.rs +++ b/src/rust/bst/src/sorted_index.rs @@ -1,30 +1,33 @@ -use std::io::{Read, Write}; +use std::io::{Read, Seek, SeekFrom, Write}; -use crate::byte_serializable::ByteSerializable; +use crate::{byte_serializable::ByteSerializable, error, ByteSerializableType}; /// The offset type used to point to actual record data. pub type ValueOffset = u64; /// A key–offset pair. The key must be orderable and serializable. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct KeyValue { +pub struct KeyValue { pub key: T, pub offsets: Vec, } -/// A sorted index implemented as an array of key–offset pairs. -#[derive(Debug)] -pub struct SortedIndex { +/// A buffered index implemented as an in-memory array of key–offset pairs. +/// +/// This index is fully loaded into memory for fast access, making it suitable +/// for smaller datasets or when memory usage is not a concern. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BufferedIndex { pub entries: Vec>, } -impl Default for SortedIndex { +impl Default for BufferedIndex { fn default() -> Self { Self::new() } } -impl SortedIndex { +impl BufferedIndex { /// Create an empty index. pub fn new() -> Self { Self { @@ -39,8 +42,24 @@ impl SortedIndex { } } -/// A trait defining flexible search operations on an index. -pub trait SearchableIndex { +/// A trait defining byte-based search operations on an index. +/// +/// This trait is object-safe and works with serialized byte representations +/// of keys, making it suitable for use with trait objects and dynamic dispatch. +pub trait SearchableIndex: Send + Sync { + /// Return offsets for an exact key match given a serialized key. 
+ fn query_exact_bytes(&self, key: &[u8]) -> Vec; + + /// Return offsets for keys in the half-open interval [lower, upper) given serialized keys. + /// (A `None` for either bound means unbounded.) + fn query_range_bytes(&self, lower: Option<&[u8]>, upper: Option<&[u8]>) -> Vec; +} + +/// A trait defining type-specific search operations on an index. +/// +/// This trait provides strongly-typed search methods that work with the actual +/// key type rather than byte representations. +pub trait TypedSearchableIndex { /// Return offsets for an exact key match. fn query_exact(&self, key: &T) -> Option<&[ValueOffset]>; @@ -54,7 +73,9 @@ pub trait SearchableIndex { F: Fn(&T) -> bool; } -impl SearchableIndex for SortedIndex { +impl TypedSearchableIndex + for BufferedIndex +{ fn query_exact(&self, key: &T) -> Option<&[ValueOffset]> { self.entries .binary_search_by_key(&key, |kv| &kv.key) @@ -99,21 +120,45 @@ impl SearchableIndex for SortedIndex { } } +impl SearchableIndex for BufferedIndex { + fn query_exact_bytes(&self, key: &[u8]) -> Vec { + let key_t = T::from_bytes(key); + self.query_exact(&key_t).unwrap_or(&[]).to_vec() + } + + fn query_range_bytes(&self, lower: Option<&[u8]>, upper: Option<&[u8]>) -> Vec { + // Convert the optional byte slices into T + let lower_t = lower.map(|b| T::from_bytes(b)); + let upper_t = upper.map(|b| T::from_bytes(b)); + // We need to pass references. + let lower_ref = lower_t.as_ref(); + let upper_ref = upper_t.as_ref(); + let results = self.query_range(lower_ref, upper_ref); + results.into_iter().flatten().cloned().collect() + } +} + /// A trait for serializing and deserializing an index. pub trait IndexSerializable { /// Write the index to a writer. - fn serialize(&self, writer: &mut W) -> std::io::Result<()>; + fn serialize(&self, writer: &mut W) -> Result<(), error::Error>; /// Read the index from a reader. - fn deserialize(reader: &mut R) -> std::io::Result + fn deserialize(reader: &mut R) -> Result where Self: Sized; } -impl IndexSerializable for SortedIndex { - fn serialize(&self, writer: &mut W) -> std::io::Result<()> { +impl IndexSerializable for BufferedIndex { + fn serialize(&self, writer: &mut W) -> Result<(), error::Error> { + // Write the type identifier for T + let value_type = self.entries.first().unwrap().key.value_type(); + writer.write_all(&value_type.to_bytes())?; + + // Write the number of entries let len = self.entries.len() as u64; writer.write_all(&len.to_le_bytes())?; + for kv in &self.entries { let key_bytes = kv.key.to_bytes(); let key_len = key_bytes.len() as u64; @@ -128,10 +173,17 @@ impl IndexSerializable for SortedIndex { Ok(()) } - fn deserialize(reader: &mut R) -> std::io::Result { + fn deserialize(reader: &mut R) -> Result { + // Read the type identifier + let mut type_id_bytes = [0u8; 4]; + reader.read_exact(&mut type_id_bytes)?; + let _ = ByteSerializableType::from_bytes(&type_id_bytes)?; + + // Read the number of entries let mut len_bytes = [0u8; 8]; reader.read_exact(&mut len_bytes)?; let num_entries = u64::from_le_bytes(len_bytes); + let mut entries = Vec::with_capacity(num_entries as usize); for _ in 0..num_entries { // Read key length. @@ -156,34 +208,811 @@ impl IndexSerializable for SortedIndex { } entries.push(KeyValue { key, offsets }); } - Ok(SortedIndex { entries }) + Ok(BufferedIndex { entries }) } } -pub trait AnyIndex { - /// Returns the offsets for an exact match given a serialized key. 
- fn query_exact_bytes(&self, key: &[u8]) -> Vec; - /// Returns the offsets for a range query given optional lower and upper serialized keys. - fn query_range_bytes(&self, lower: Option<&[u8]>, upper: Option<&[u8]>) -> Vec; +/// A trait for type-safe streaming access to an index. +pub trait TypedStreamableIndex: + Send + Sync +{ + /// Returns the size of the index in bytes. + fn index_size(&self) -> u64; + + /// Returns the offsets for an exact match given a key. + /// The reader should be positioned at the start of the index data. + fn stream_query_exact( + &self, + reader: &mut R, + key: &T, + ) -> std::io::Result>; + + /// Returns the offsets for a range query given optional lower and upper keys. + /// The reader should be positioned at the start of the index data. + fn stream_query_range( + &self, + reader: &mut R, + lower: Option<&T>, + upper: Option<&T>, + ) -> std::io::Result>; + + /// Returns the offsets for an exact match given a key. + /// For use with HTTP range requests. + #[cfg(feature = "http")] + async fn http_stream_query_exact( + &self, + client: &mut http_range_client::AsyncBufferedHttpRangeClient, + index_offset: usize, + key: &T, + ) -> std::io::Result>; + + /// Returns the offsets for a range query given optional lower and upper keys. + /// For use with HTTP range requests. + #[cfg(feature = "http")] + async fn http_stream_query_range( + &self, + client: &mut http_range_client::AsyncBufferedHttpRangeClient, + index_offset: usize, + lower: Option<&T>, + upper: Option<&T>, + ) -> std::io::Result>; +} + +/// Metadata for a serialized BufferedIndex, used for streaming access. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct IndexMeta { + /// Number of entries in the index. + pub entry_count: u64, + /// Total size of the index in bytes. + pub size: u64, + /// Phantom data to represent the type parameter. + pub _phantom: std::marker::PhantomData, } -impl AnyIndex for SortedIndex -where - T: ByteSerializable + Ord + 'static, +impl IndexMeta { + /// Creates a new IndexMeta. + pub fn new(entry_count: u64, size: u64) -> Self { + Self { + entry_count, + size, + _phantom: std::marker::PhantomData, + } + } + + /// Read metadata and construct an IndexMeta from a reader. + pub fn from_reader(reader: &mut R, size: u64) -> Result { + let start_pos = reader.stream_position()?; + + // Read the type identifier. + let mut type_id_bytes = [0u8; 4]; + reader.read_exact(&mut type_id_bytes)?; + + // Read the number of entries. + let mut entry_count_bytes = [0u8; 8]; + reader.read_exact(&mut entry_count_bytes)?; + let entry_count = u64::from_le_bytes(entry_count_bytes); + + // Seek back to the start position. + reader.seek(SeekFrom::Start(start_pos))?; + + Ok(Self::new(entry_count, size)) + } + + /// Seek to a specific entry in the index. + pub fn seek_to_entry( + &self, + reader: &mut R, + entry_index: u64, + start_pos: u64, + ) -> std::io::Result<()> { + if entry_index >= self.entry_count { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "entry index {} out of bounds (max: {})", + entry_index, + self.entry_count - 1 + ), + )); + } + + // Skip the type id (4 bytes) and entry count (8 bytes). + let pos = start_pos + 12; + + reader.seek(SeekFrom::Start(pos))?; + + // Skip entries until we reach the desired one. + for _ in 0..entry_index { + // Read the key length. + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Skip the key. 
+ reader.seek(SeekFrom::Current(key_len as i64))?; + + // Read the offsets count. + let mut offsets_count_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_count_bytes)?; + let offsets_count = u64::from_le_bytes(offsets_count_bytes); + + // Skip the offsets. + reader.seek(SeekFrom::Current((offsets_count * 8) as i64))?; + } + + Ok(()) + } + + /// Find the lower bound for a key using binary search. + pub fn find_lower_bound( + &self, + reader: &mut R, + key: &T, + start_pos: u64, + ) -> std::io::Result { + if self.entry_count == 0 { + return Ok(0); + } + + let mut left = 0; + let mut right = self.entry_count - 1; + + while left <= right { + let mid = left + (right - left) / 2; + self.seek_to_entry(reader, mid, start_pos)?; + + // Read the key length. + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Read the key. + let mut key_bytes = vec![0u8; key_len as usize]; + reader.read_exact(&mut key_bytes)?; + + // Deserialize the key and compare. + let entry_key = T::from_bytes(&key_bytes); + let ordering = entry_key.cmp(key); + + match ordering { + std::cmp::Ordering::Equal => return Ok(mid), + std::cmp::Ordering::Less => left = mid + 1, + std::cmp::Ordering::Greater => { + if mid == 0 { + break; + } + right = mid - 1; + } + } + } + + Ok(left) + } + + /// Find the upper bound for a key using binary search. + pub fn find_upper_bound( + &self, + reader: &mut R, + key: &T, + start_pos: u64, + ) -> std::io::Result { + if self.entry_count == 0 { + return Ok(0); + } + + let mut left = 0; + let mut right = self.entry_count - 1; + + while left <= right { + let mid = left + (right - left) / 2; + self.seek_to_entry(reader, mid, start_pos)?; + + // Read the key length. + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Read the key. + let mut key_bytes = vec![0u8; key_len as usize]; + reader.read_exact(&mut key_bytes)?; + + // Deserialize the key and compare. + let entry_key = T::from_bytes(&key_bytes); + let ordering = entry_key.cmp(key); + + match ordering { + std::cmp::Ordering::Equal | std::cmp::Ordering::Less => left = mid + 1, + std::cmp::Ordering::Greater => { + if mid == 0 { + break; + } + right = mid - 1; + } + } + } + + Ok(left) + } + + /// Read the offsets for a specific entry. + pub fn read_offsets( + &self, + reader: &mut R, + entry_index: u64, + start_pos: u64, + ) -> std::io::Result> { + self.seek_to_entry(reader, entry_index, start_pos)?; + + // Read the key length. + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Skip the key. + reader.seek(SeekFrom::Current(key_len as i64))?; + + // Read the offsets count. + let mut offsets_count_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_count_bytes)?; + let offsets_count = u64::from_le_bytes(offsets_count_bytes); + + // Read the offsets. 
+ let mut offsets = Vec::with_capacity(offsets_count as usize); + for _ in 0..offsets_count { + let mut offset_bytes = [0u8; 8]; + reader.read_exact(&mut offset_bytes)?; + offsets.push(u64::from_le_bytes(offset_bytes)); + } + + Ok(offsets) + } +} + +impl TypedStreamableIndex + for IndexMeta { - fn query_exact_bytes(&self, key: &[u8]) -> Vec { - let key_t = T::from_bytes(key); - self.query_exact(&key_t).unwrap_or(&[]).to_vec() + fn index_size(&self) -> u64 { + self.size } - fn query_range_bytes(&self, lower: Option<&[u8]>, upper: Option<&[u8]>) -> Vec { - // Convert the optional byte slices into T - let lower_t = lower.map(|b| T::from_bytes(b)); - let upper_t = upper.map(|b| T::from_bytes(b)); - // We need to pass references. - let lower_ref = lower_t.as_ref(); - let upper_ref = upper_t.as_ref(); - let results = self.query_range(lower_ref, upper_ref); - results.into_iter().flatten().cloned().collect() + fn stream_query_exact( + &self, + reader: &mut R, + key: &T, + ) -> std::io::Result> { + let start_pos = reader.stream_position()?; + let index = self.find_lower_bound(reader, key, start_pos)?; + + if index >= self.entry_count { + return Ok(Vec::new()); + } + + // Seek to the found entry. + self.seek_to_entry(reader, index, start_pos)?; + + // Read the key length. + let mut key_len_bytes = [0u8; 8]; + reader.read_exact(&mut key_len_bytes)?; + let key_len = u64::from_le_bytes(key_len_bytes); + + // Read the key. + let mut key_bytes = vec![0u8; key_len as usize]; + reader.read_exact(&mut key_bytes)?; + + // Deserialize the key and check for exact match. + let entry_key = T::from_bytes(&key_bytes); + + if &entry_key == key { + // Read the offsets count. + let mut offsets_count_bytes = [0u8; 8]; + reader.read_exact(&mut offsets_count_bytes)?; + let offsets_count = u64::from_le_bytes(offsets_count_bytes); + + // Read the offsets. + let mut offsets = Vec::with_capacity(offsets_count as usize); + for _ in 0..offsets_count { + let mut offset_bytes = [0u8; 8]; + reader.read_exact(&mut offset_bytes)?; + offsets.push(u64::from_le_bytes(offset_bytes)); + } + + return Ok(offsets); + } + + Ok(Vec::new()) + } + + fn stream_query_range( + &self, + reader: &mut R, + lower: Option<&T>, + upper: Option<&T>, + ) -> std::io::Result> { + let start_pos = reader.stream_position()?; + // Find lower bound. + let start_index = if let Some(lower_key) = lower { + self.find_lower_bound(reader, lower_key, start_pos)? + } else { + 0 + }; + + // Find upper bound. + let end_index = if let Some(upper_key) = upper { + self.find_upper_bound(reader, upper_key, start_pos)? + } else { + self.entry_count + }; + + if start_index >= end_index || start_index >= self.entry_count { + return Ok(Vec::new()); + } + + let mut all_offsets = Vec::new(); + + // Collect all offsets within the range. 
+ for entry_index in start_index..end_index.min(self.entry_count) { + let offsets = self.read_offsets(reader, entry_index, start_pos)?; + all_offsets.extend(offsets); + } + + Ok(all_offsets) + } + + #[cfg(feature = "http")] + async fn http_stream_query_exact( + &self, + client: &mut http_range_client::AsyncBufferedHttpRangeClient, + index_offset: usize, + key: &T, + ) -> std::io::Result> { + // HTTP implementation would go here, similar to the existing one but type-aware + unimplemented!("Type-aware HTTP streaming not yet implemented") + } + + #[cfg(feature = "http")] + async fn http_stream_query_range( + &self, + client: &mut http_range_client::AsyncBufferedHttpRangeClient, + index_offset: usize, + lower: Option<&T>, + upper: Option<&T>, + ) -> std::io::Result> { + // HTTP implementation would go here, similar to the existing one but type-aware + unimplemented!("Type-aware HTTP streaming not yet implemented") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::byte_serializable::Float; + use chrono::{NaiveDate, NaiveDateTime}; + use ordered_float::OrderedFloat; + use std::io::Cursor; + use std::io::{Seek, SeekFrom}; + + // Helper function to create a sample height index + fn create_sample_height_index() -> BufferedIndex> { + let mut entries = Vec::new(); + + // Create sample data with heights + let heights = [ + (10.5f32, vec![0]), // Building 0 has height 10.5 + (15.2f32, vec![1]), // Building 1 has height 15.2 + (20.0f32, vec![2, 3]), // Buildings 2 and 3 have height 20.0 + (22.7f32, vec![4]), + (25.3f32, vec![5]), + (30.0f32, vec![6, 7, 8]), // Buildings 6, 7, 8 have height 30.0 + (32.1f32, vec![9]), + (35.5f32, vec![10]), + (40.0f32, vec![11, 12]), + (45.2f32, vec![13]), + (50.0f32, vec![14, 15, 16]), // Buildings 14, 15, 16 have height 50.0 + (55.7f32, vec![17]), + (60.3f32, vec![18]), + (65.0f32, vec![19]), + ]; + + for (height, offsets) in heights.iter() { + entries.push(KeyValue { + key: OrderedFloat(*height), + offsets: offsets.iter().map(|&i| i as u64).collect(), + }); + } + + let mut index = BufferedIndex::new(); + index.build_index(entries); + index + } + + // Helper function to create a sample building ID index + fn create_sample_id_index() -> BufferedIndex { + let mut entries = Vec::new(); + + // Create sample data with building IDs + let ids = [ + ("BLDG0001", vec![0]), + ("BLDG0002", vec![1]), + ("BLDG0003", vec![2]), + ("BLDG0004", vec![3]), + ("BLDG0005", vec![4]), + ("BLDG0010", vec![5, 6]), // Two buildings share the same ID + ("BLDG0015", vec![7]), + ("BLDG0020", vec![8, 9, 10]), // Three buildings share the same ID + ("BLDG0025", vec![11]), + ("BLDG0030", vec![12]), + ("BLDG0035", vec![13]), + ("BLDG0040", vec![14]), + ("BLDG0045", vec![15]), + ("BLDG0050", vec![16, 17]), // Two buildings share the same ID + ("BLDG0055", vec![18]), + ("BLDG0060", vec![19]), + ]; + + for (id, offsets) in ids.iter() { + entries.push(KeyValue { + key: id.to_string(), + offsets: offsets.iter().map(|&i| i as u64).collect(), + }); + } + + let mut index = BufferedIndex::new(); + index.build_index(entries); + index + } + + fn create_sample_date_index() -> BufferedIndex { + let mut entries = Vec::new(); + let dates = [ + (NaiveDate::from_ymd(2020, 1, 1).and_hms(0, 0, 0), vec![0]), + (NaiveDate::from_ymd(2020, 1, 2).and_hms(0, 0, 0), vec![1]), + (NaiveDate::from_ymd(2020, 1, 3).and_hms(0, 0, 0), vec![2]), + (NaiveDate::from_ymd(2020, 1, 4).and_hms(0, 0, 0), vec![3]), + (NaiveDate::from_ymd(2020, 1, 5).and_hms(0, 0, 0), vec![4, 5]), + (NaiveDate::from_ymd(2020, 1, 7).and_hms(0, 0, 
0), vec![6]), + (NaiveDate::from_ymd(2020, 1, 8).and_hms(0, 0, 0), vec![7]), + (NaiveDate::from_ymd(2020, 1, 9).and_hms(0, 0, 0), vec![8]), + (NaiveDate::from_ymd(2020, 1, 10).and_hms(0, 0, 0), vec![9]), + ( + NaiveDate::from_ymd(2020, 1, 11).and_hms(0, 0, 0), + vec![10, 11, 12], + ), + (NaiveDate::from_ymd(2020, 1, 14).and_hms(0, 0, 0), vec![13]), + (NaiveDate::from_ymd(2020, 1, 15).and_hms(0, 0, 0), vec![14]), + (NaiveDate::from_ymd(2020, 1, 16).and_hms(0, 0, 0), vec![15]), + (NaiveDate::from_ymd(2020, 1, 17).and_hms(0, 0, 0), vec![16]), + (NaiveDate::from_ymd(2020, 1, 18).and_hms(0, 0, 0), vec![17]), + (NaiveDate::from_ymd(2020, 1, 19).and_hms(0, 0, 0), vec![18]), + (NaiveDate::from_ymd(2020, 1, 20).and_hms(0, 0, 0), vec![19]), + ]; + for (date, offsets) in dates.iter() { + entries.push(KeyValue { + key: *date, + offsets: offsets.iter().map(|&i| i as u64).collect(), + }); + } + let mut index = BufferedIndex::new(); + index.build_index(entries); + index + } + + #[test] + fn test_stream_query_exact_height() -> Result<(), error::Error> { + // Create the index + let index = create_sample_height_index(); + + // Serialize to a temporary file + let mut tmp_file = tempfile::NamedTempFile::new()?; + index.serialize(&mut tmp_file)?; + + // Get the size of the serialized index + let size = tmp_file.as_file().metadata()?.len(); + + // Prepare for reading + let mut file = tmp_file.reopen()?; + file.seek(SeekFrom::Start(0))?; + + // Read the metadata + let index_meta = IndexMeta::>::from_reader(&mut file, size)?; + + // Reset position + file.seek(SeekFrom::Start(0))?; + + // Perform streaming query + let test_height = OrderedFloat(74.5); + let stream_results = index_meta.stream_query_exact(&mut file, &test_height)?; + + // Also test with in-memory cursor + let mut serialized = Vec::new(); + { + let mut cursor = Cursor::new(&mut serialized); + index.serialize(&mut cursor)?; + } + + let mut cursor = Cursor::new(&serialized); + let index_meta = + IndexMeta::>::from_reader(&mut cursor, serialized.len() as u64)?; + + cursor.set_position(0); + let stream_results = index_meta.stream_query_exact(&mut cursor, &test_height)?; + + // Verify results + let typed_results = index.query_exact(&test_height); + assert_eq!( + stream_results, + typed_results.map(|v| v.to_vec()).unwrap_or_default() + ); + + Ok(()) + } + + #[test] + fn test_stream_query_range_height() -> Result<(), error::Error> { + // Create the index + let index = create_sample_height_index(); + + // Serialize to a temporary file + let mut tmp_file = tempfile::NamedTempFile::new()?; + index.serialize(&mut tmp_file)?; + + // Get the size of the serialized index + let size = tmp_file.as_file().metadata()?.len(); + + // Prepare for reading + let mut file = tmp_file.reopen()?; + file.seek(SeekFrom::Start(0))?; + + // Read the metadata + let index_meta = IndexMeta::>::from_reader(&mut file, size)?; + + // Reset position + file.seek(SeekFrom::Start(0))?; + + // Define range query + let lower = OrderedFloat(70.0); + let upper = OrderedFloat(75.0); + + // Perform streaming query + let stream_results = + index_meta.stream_query_range(&mut file, Some(&lower), Some(&upper))?; + + // Also test with in-memory cursor + let mut serialized = Vec::new(); + { + let mut cursor = Cursor::new(&mut serialized); + index.serialize(&mut cursor)?; + } + + let mut cursor = Cursor::new(&serialized); + let index_meta = + IndexMeta::>::from_reader(&mut cursor, serialized.len() as u64)?; + + cursor.set_position(0); + let stream_results = + index_meta.stream_query_range(&mut cursor, 
Some(&lower), Some(&upper))?; + + // Verify results match the typed query + let typed_results = index.query_range(Some(&lower), Some(&upper)); + let typed_flat: Vec = typed_results.into_iter().flatten().cloned().collect(); + assert_eq!(stream_results, typed_flat); + + Ok(()) + } + + #[test] + fn test_stream_query_exact_id() -> Result<(), error::Error> { + // Create the index + let index = create_sample_id_index(); + + // Serialize to a temporary file + let mut tmp_file = tempfile::NamedTempFile::new()?; + index.serialize(&mut tmp_file)?; + + // Get the size of the serialized index + let size = tmp_file.as_file().metadata()?.len(); + + // Prepare for reading + let mut file = tmp_file.reopen()?; + file.seek(SeekFrom::Start(0))?; + + // Read the metadata + let index_meta = IndexMeta::::from_reader(&mut file, size)?; + + // Reset position + file.seek(SeekFrom::Start(0))?; + + // Perform streaming query + let test_id = "c3".to_string(); + let stream_results = index_meta.stream_query_exact(&mut file, &test_id)?; + + let typed_results = index.query_exact(&test_id); + assert_eq!( + stream_results, + typed_results.map(|v| v.to_vec()).unwrap_or_default() + ); + + // Also test with in-memory cursor + let mut serialized = Vec::new(); + { + let mut cursor = Cursor::new(&mut serialized); + index.serialize(&mut cursor)?; + } + + let mut cursor = Cursor::new(&serialized); + let index_meta = IndexMeta::::from_reader(&mut cursor, serialized.len() as u64)?; + + cursor.set_position(0); + let stream_results = index_meta.stream_query_exact(&mut cursor, &test_id)?; + + // Verify results + assert_eq!( + stream_results, + typed_results.map(|v| v.to_vec()).unwrap_or_default() + ); + + Ok(()) + } + + #[test] + fn test_stream_query_range_id() -> Result<(), error::Error> { + // Create the index + let index = create_sample_id_index(); + + // Serialize to a temporary file + let mut tmp_file = tempfile::NamedTempFile::new()?; + index.serialize(&mut tmp_file)?; + + // Get the size of the serialized index + let size = tmp_file.as_file().metadata()?.len(); + + // Prepare for reading + let mut file = tmp_file.reopen()?; + file.seek(SeekFrom::Start(0))?; + + // Read the metadata + let index_meta = IndexMeta::::from_reader(&mut file, size)?; + + // Reset position + file.seek(SeekFrom::Start(0))?; + + // Define range query + let lower = "c1".to_string(); + let upper = "c4".to_string(); + + // Perform streaming query + let stream_results = + index_meta.stream_query_range(&mut file, Some(&lower), Some(&upper))?; + let typed_results = index.query_range(Some(&lower), Some(&upper)); + let typed_flat: Vec = typed_results.into_iter().flatten().cloned().collect(); + + assert_eq!(stream_results, typed_flat); + + // Also test with in-memory cursor + let mut serialized = Vec::new(); + { + let mut cursor = Cursor::new(&mut serialized); + index.serialize(&mut cursor)?; + } + + let mut cursor = Cursor::new(&serialized); + let index_meta = IndexMeta::::from_reader(&mut cursor, serialized.len() as u64)?; + + cursor.set_position(0); + let stream_results = + index_meta.stream_query_range(&mut cursor, Some(&lower), Some(&upper))?; + + // Verify results match the typed query + let typed_results = index.query_range(Some(&lower), Some(&upper)); + let typed_flat: Vec = typed_results.into_iter().flatten().cloned().collect(); + assert_eq!(stream_results, typed_flat); + + Ok(()) + } + + #[test] + fn test_stream_query_range_date() -> Result<(), error::Error> { + // Create the index + let index = create_sample_date_index(); + + // Serialize to a temporary 
file + let mut tmp_file = tempfile::NamedTempFile::new()?; + index.serialize(&mut tmp_file)?; + + // Get the size of the serialized index + let size = tmp_file.as_file().metadata()?.len(); + + // Prepare for reading + let mut file = tmp_file.reopen()?; + file.seek(SeekFrom::Start(0))?; + + // Read the metadata + let index_meta = IndexMeta::::from_reader(&mut file, size)?; + + // Reset position + file.seek(SeekFrom::Start(0))?; + + // Define range query + let lower = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2022, 1, 1).unwrap(), + chrono::NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + let upper = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2022, 2, 1).unwrap(), + chrono::NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + + // Perform streaming query + let stream_results = + index_meta.stream_query_range(&mut file, Some(&lower), Some(&upper))?; + let typed_results = index.query_range(Some(&lower), Some(&upper)); + let typed_flat: Vec = typed_results.into_iter().flatten().cloned().collect(); + + assert_eq!(stream_results, typed_flat); + + // Also test with in-memory cursor + let mut serialized = Vec::new(); + { + let mut cursor = Cursor::new(&mut serialized); + index.serialize(&mut cursor)?; + } + + let mut cursor = Cursor::new(&serialized); + let index_meta = + IndexMeta::::from_reader(&mut cursor, serialized.len() as u64)?; + + cursor.set_position(0); + let stream_results = + index_meta.stream_query_range(&mut cursor, Some(&lower), Some(&upper))?; + + // Verify results match the typed query + let typed_results = index.query_range(Some(&lower), Some(&upper)); + let typed_flat: Vec = typed_results.into_iter().flatten().cloned().collect(); + assert_eq!(stream_results, typed_flat); + + Ok(()) + } + + #[test] + fn test_performance_comparison() -> Result<(), error::Error> { + // Create a sample height index + let index = create_sample_height_index(); + + // Serialize to buffer + let mut buffer = Vec::new(); + index.serialize(&mut buffer)?; + + // Generate some test values + let test_values = vec![30.0f32, 74.5, 100.0, 150.0, 200.0]; + + // Measure direct query performance + let direct_start = std::time::Instant::now(); + for &value in &test_values { + let _results = index.query_exact(&OrderedFloat(value)); + } + let direct_duration = direct_start.elapsed(); + + // Measure streaming query performance + let mut cursor = Cursor::new(buffer.clone()); + let index_meta = IndexMeta::>::from_reader(&mut cursor, buffer.len() as u64)?; + + let stream_start = std::time::Instant::now(); + for &value in &test_values { + let test_height = OrderedFloat(value); + cursor.seek(SeekFrom::Start(0))?; + let _results = index_meta.stream_query_exact(&mut cursor, &test_height)?; + } + let stream_duration = stream_start.elapsed(); + + println!( + "Performance comparison:\n\ + Direct query: {:?}\n\ + Stream query: {:?}\n\ + Ratio: {:.2}x", + direct_duration, + stream_duration, + stream_duration.as_secs_f64() / direct_duration.as_secs_f64() + ); + + Ok(()) } } diff --git a/src/rust/cli/Cargo.toml b/src/rust/cli/Cargo.toml index c7a8641..13a0cfd 100644 --- a/src/rust/cli/Cargo.toml +++ b/src/rust/cli/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] fcb_core = { path = "../fcb_core" } +cjseq = { workspace = true } clap = { workspace = true } anyhow = { workspace = true } serde = { workspace = true } diff --git a/src/rust/cli/src/main.rs b/src/rust/cli/src/main.rs index e772d22..826791a 100644 --- a/src/rust/cli/src/main.rs +++ b/src/rust/cli/src/main.rs @@ -1,3 +1,4 @@ +use cjseq::{CityJSONFeature, 
Transform as CjTransform}; use clap::{Parser, Subcommand}; use fcb_core::error::Error; use fcb_core::{ @@ -34,6 +35,14 @@ enum Commands { /// Comma-separated list of attributes to create index for #[arg(long)] attr_index: Option, + + /// Bounding box filter in format "minx,miny,maxx,maxy" + #[arg(long)] + bbox: Option, + + /// Automatically calculate and set geospatial extent in header + #[arg(long)] + ge: bool, }, /// Convert FCB to CityJSON @@ -69,10 +78,32 @@ fn get_writer(output: &str) -> Result, Error> { } } -fn serialize(input: &str, output: &str, attr_index: Option) -> Result<(), Error> { - let reader = BufReader::new(get_reader(input)?); - let writer = BufWriter::new(get_writer(output)?); +fn serialize( + input: &str, + output: &str, + attr_index: Option, + bbox: Option, + ge: bool, +) -> Result<(), Error> { + let reader = get_reader(input)?; + let writer = get_writer(output)?; + + let reader = BufReader::new(reader); + let writer = BufWriter::new(writer); + + // Parse the bbox if provided + let bbox_parsed = if let Some(bbox_str) = bbox { + Some(parse_bbox(&bbox_str).map_err(|e| { + Error::IoError(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("failed to parse bbox: {}", e), + )) + })?) + } else { + None + }; + // Create a CityJSONSeq reader let cj_seq = match read_cityjson_from_reader(reader, CJTypeKind::Seq) { Ok(CJType::Seq(seq)) => seq, _ => { @@ -84,9 +115,24 @@ fn serialize(input: &str, output: &str, attr_index: Option) -> Result<() }; let CityJSONSeq { cj, features } = cj_seq; + + // Filter features by bbox if provided + let filtered_features = if let Some(bbox) = &bbox_parsed { + features + .into_iter() + .filter(|feature| feature_intersects_bbox(feature, bbox, &cj.transform)) + .collect() + } else { + features + }; + + if filtered_features.is_empty() { + eprintln!("warning: no features found within the specified bbox"); + } + let attr_schema = { let mut schema = AttributeSchema::new(); - for feature in features.iter() { + for feature in filtered_features.iter() { for (_, co) in feature.city_objects.iter() { if let Some(attributes) = &co.attributes { schema.add_attributes(attributes); @@ -107,16 +153,27 @@ fn serialize(input: &str, output: &str, attr_index: Option) -> Result<() .collect::>() }); + // Calculate geospatial extent if requested + let geo_extent = if ge { + Some(calculate_geospatial_extent( + &filtered_features, + &cj.transform, + )) + } else { + None + }; + let header_options = HeaderWriterOptions { write_index: true, - feature_count: features.len() as u64, + feature_count: filtered_features.len() as u64, index_node_size: 16, attribute_indices: attr_index_vec, + geographical_extent: geo_extent, }; let mut fcb = FcbWriter::new(cj, Some(header_options), attr_schema)?; - for feature in features.iter() { + for feature in filtered_features.iter() { fcb.add_feature(feature)?; } fcb.write(writer)?; @@ -124,9 +181,118 @@ fn serialize(input: &str, output: &str, attr_index: Option) -> Result<() if output != "-" { eprintln!("Successfully encoded to FCB"); } + Ok(()) } +/// Parse a bounding box string in format "minx,miny,maxx,maxy" +fn parse_bbox(bbox_str: &str) -> Result<[f64; 4], String> { + let parts: Vec<&str> = bbox_str.split(',').collect(); + if parts.len() != 4 { + return Err(format!( + "Invalid bounding box format. 
Expected 'minx,miny,maxx,maxy', got '{}'", + bbox_str + )); + } + + let mut bbox = [0.0; 4]; + for (i, part) in parts.iter().enumerate() { + bbox[i] = part + .trim() + .parse::() + .map_err(|e| format!("Failed to parse bbox component: {}", e))?; + } + + // Validate that min <= max + if bbox[0] > bbox[2] || bbox[1] > bbox[3] { + return Err( + "Invalid bounding box: min values must be less than or equal to max values".to_string(), + ); + } + + Ok(bbox) +} + +/// Get all vertices from a feature +fn get_vertices_from_feature(feature: &CityJSONFeature, transform: &CjTransform) -> Vec<[f64; 3]> { + let mut result = Vec::new(); + + for vertex in &feature.vertices { + if vertex.len() >= 3 { + // Convert from i64 to f64 and apply transform + let x = (vertex[0] as f64 * transform.scale[0]) + transform.translate[0]; + let y = (vertex[1] as f64 * transform.scale[1]) + transform.translate[1]; + let z = (vertex[2] as f64 * transform.scale[2]) + transform.translate[2]; + + result.push([x, y, z]); + } + } + + result +} + +/// Check if a CityJSONFeature intersects with a bounding box +fn feature_intersects_bbox( + feature: &CityJSONFeature, + bbox: &[f64; 4], + transform: &CjTransform, +) -> bool { + // Get transformed vertices from the feature + let vertices = get_vertices_from_feature(feature, transform); + if city_object_intersects_bbox(bbox, &vertices) { + return true; + } + + false +} + +/// Check if a CityObject intersects with a bounding box +fn city_object_intersects_bbox(bbox: &[f64; 4], feature_vertices: &[[f64; 3]]) -> bool { + // Check if any of the vertices are within the bbox + for vertex in feature_vertices { + if point_in_bbox_2d(vertex, bbox) { + return true; + } + } + + false +} + +/// Check if a point is inside a 2D bounding box +fn point_in_bbox_2d(point: &[f64; 3], bbox: &[f64; 4]) -> bool { + point[0] >= bbox[0] && point[0] <= bbox[2] && point[1] >= bbox[1] && point[1] <= bbox[3] +} + +/// Calculate the geospatial extent from a list of features +fn calculate_geospatial_extent(features: &[CityJSONFeature], transform: &CjTransform) -> [f64; 6] { + let mut min_x = f64::MAX; + let mut min_y = f64::MAX; + let mut min_z = f64::MAX; + let mut max_x = f64::MIN; + let mut max_y = f64::MIN; + let mut max_z = f64::MIN; + + for feature in features { + let vertices = get_vertices_from_feature(feature, transform); + + for [x, y, z] in vertices { + min_x = min_x.min(x); + min_y = min_y.min(y); + min_z = min_z.min(z); + max_x = max_x.max(x); + max_y = max_y.max(y); + max_z = max_z.max(z); + } + } + + // If no vertices were found, return a default extent + if min_x == f64::MAX { + return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + } + + [min_x, min_y, min_z, max_x, max_y, max_z] +} + fn deserialize(input: &str, output: &str) -> Result<(), Error> { let reader = BufReader::new(get_reader(input)?); let mut writer = BufWriter::new(get_writer(output)?); @@ -161,13 +327,29 @@ fn show_info(input: PathBuf) -> Result<(), Error> { let reader = BufReader::new(File::open(input)?); let metadata = reader.get_ref().metadata()?.len() / 1024 / 1024; // show in megabytes let fcb_reader = FcbReader::open(reader)?.select_all()?; - + let raw_attr_index = fcb_reader.header().attribute_index(); + let attr_index = raw_attr_index.map(|ai_vec| { + ai_vec + .iter() + .map(|ai| { + fcb_reader + .header() + .columns() + .iter() + .flat_map(|c| c.iter()) + .find(|ci| ci.index() == ai.index()) + .map(|ci| ci.name()) + .unwrap() + }) + .collect::>() + }); let header = fcb_reader.header(); println!("FCB File Info:"); println!(" File size: {} 
MB", metadata); println!(" Version: {}", header.version()); println!(" Features count: {}", header.features_count()); println!(" bbox: {:?}", header.geographical_extent()); + println!(" attr_index: {:?}", attr_index.unwrap_or_default()); if let Some(title) = header.title() { println!(" Title: {}", title); @@ -200,7 +382,9 @@ fn main() -> Result<(), Error> { input, output, attr_index, - } => serialize(&input, &output, attr_index), + bbox, + ge, + } => serialize(&input, &output, attr_index, bbox, ge), Commands::Deser { input, output } => deserialize(&input, &output), Commands::Info { input } => show_info(input), } diff --git a/src/rust/fcb_core/benches/read_attr.rs b/src/rust/fcb_core/benches/read_attr.rs index ea95947..35f82e9 100644 --- a/src/rust/fcb_core/benches/read_attr.rs +++ b/src/rust/fcb_core/benches/read_attr.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use bst::OrderedFloat; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use fcb_core::{AttrQuery, ByteSerializableValue, FcbReader, Operator}; use std::{fs::File, io::BufReader}; @@ -44,34 +45,93 @@ fn read_fcb_without_attr_index(path: &str) -> Result<()> { break; } } - println!("target_feat_num: {:?}", target_feat_num); - println!("feat_total: {:?}", feat_total); Ok(()) } -/// Read FCB file and count geometry types using attribute index. -fn read_fcb_with_attr_index(path: &str) -> Result<()> { +/// Read FCB file and count geometry types using attribute index with seekable reader (StreamableMultiIndex). +fn read_fcb_with_attr_index_seekable(path: &str) -> Result<()> { let input_file = File::open(path)?; let input_reader = BufReader::new(input_file); let query: AttrQuery = vec![ + ( + "b3_h_dak_50p".to_string(), + Operator::Gt, + ByteSerializableValue::F64(OrderedFloat(2.0)), + ), + ( + "b3_h_dak_50p".to_string(), + Operator::Lt, + ByteSerializableValue::F64(OrderedFloat(50.0)), + ), // ( - // "b3_h_dak_50p".to_string(), - // Operator::Gt, - // ByteSerializableValue::F64(OrderedFloat(2.0)), - // ), - // ( - // "b3_h_dak_50p".to_string(), - // Operator::Lt, - // ByteSerializableValue::F64(OrderedFloat(50.0)), + // "identificatie".to_string(), + // Operator::Eq, + // ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), // ), + ]; + + // Use the seekable version with StreamableMultiIndex + let mut reader = FcbReader::open(input_reader)?.select_attr_query(query)?; + let header = reader.header(); + let feat_count = header.features_count(); + + let mut target_feat_num = 0; + let mut feat_total = 0; + while let Some(feat_buf) = reader.next()? { + let feature = feat_buf.cur_cj_feature()?; + for (_, co) in feature.city_objects.iter() { + if let Some(attributes) = &co.attributes { + if let Some(b3_h_dak_50p) = attributes.get("b3_h_dak_50p") { + if b3_h_dak_50p.as_f64().unwrap() > 2.0 && b3_h_dak_50p.as_f64().unwrap() < 50.0 + { + println!("b3_h_dak_50p: {:?}", b3_h_dak_50p); + target_feat_num += 1; + continue; + } + } + // if let Some(identificatie) = attributes.get("identificatie") { + // if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { + // target_feat_num += 1; + // break; + // } + // } + } + } + feat_total += 1; + if feat_total == feat_count { + break; + } + } + + Ok(()) +} + +/// Read FCB file and count geometry types using attribute index with non-seekable reader (optimized MultiIndex). 
+fn read_fcb_with_attr_index_non_seekable(path: &str) -> Result<()> { + let input_file = File::open(path)?; + let input_reader = BufReader::new(input_file); + + let query: AttrQuery = vec![ ( - "identificatie".to_string(), - Operator::Eq, - ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), + "b3_h_dak_50p".to_string(), + Operator::Gt, + ByteSerializableValue::F64(OrderedFloat(2.0)), ), + ( + "b3_h_dak_50p".to_string(), + Operator::Lt, + ByteSerializableValue::F64(OrderedFloat(50.0)), + ), + // ( + // "identificatie".to_string(), + // Operator::Eq, + // ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), + // ), ]; - let mut reader = FcbReader::open(input_reader)?.select_attr_query(query)?; + + // Use the non-seekable version with optimized MultiIndex + let mut reader = FcbReader::open(input_reader)?.select_attr_query_seq(query)?; let header = reader.header(); let feat_count = header.features_count(); @@ -81,19 +141,18 @@ fn read_fcb_with_attr_index(path: &str) -> Result<()> { let feature = feat_buf.cur_cj_feature()?; for (_, co) in feature.city_objects.iter() { if let Some(attributes) = &co.attributes { - if let Some(identificatie) = attributes.get("identificatie") { - if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { + if let Some(b3_h_dak_50p) = attributes.get("b3_h_dak_50p") { + if b3_h_dak_50p.as_f64().unwrap() > 2.0 && b3_h_dak_50p.as_f64().unwrap() < 50.0 + { + println!("b3_h_dak_50p: {:?}", b3_h_dak_50p); target_feat_num += 1; - break; + continue; } } - // if let Some(b3_h_dak_50p) = attributes.get("b3_h_dak_50p") { - // if b3_h_dak_50p.as_f64().unwrap() > 2.0 - // && b3_h_dak_50p.as_f64().unwrap() < 50.0 - // { - // println!("b3_h_dak_50p: {:?}", b3_h_dak_50p); + // if let Some(identificatie) = attributes.get("identificatie") { + // if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { // target_feat_num += 1; - // continue; + // break; // } // } } @@ -103,8 +162,6 @@ fn read_fcb_with_attr_index(path: &str) -> Result<()> { break; } } - println!("target_feat_num: {:?}", target_feat_num); - println!("feat_total: {:?}", feat_total); Ok(()) } @@ -112,8 +169,9 @@ fn read_fcb_with_attr_index(path: &str) -> Result<()> { const DATASETS: &[(&str, (&str, &str))] = &[( "delft", ( - "benchmark_data/attribute/delft.fcb", - "benchmark_data/attribute/delft_attr.fcb", + // "benchmark_data/attribute/delft.fcb", + "benchmark_data/attribute/3dbag_partial.fcb", + "benchmark_data/attribute/3dbag_partial.fcb", ), )]; @@ -122,23 +180,37 @@ pub fn read_benchmark(c: &mut Criterion) { for &(dataset, (file_without, file_with)) in DATASETS.iter() { // Benchmark the file without attribute index. + // group.bench_with_input( + // BenchmarkId::new(format!("{} without", dataset), file_without), + // &file_without, + // |b, &path| { + // b.iter(|| { + // read_fcb_without_attr_index(path).unwrap(); + // }) + // }, + // ); + + // Benchmark the file with attribute index using seekable reader. group.bench_with_input( - BenchmarkId::new(format!("{} without", dataset), file_without), - &file_without, + BenchmarkId::new(format!("{} with seekable (streamable)", dataset), file_with), + &file_with, |b, &path| { b.iter(|| { - read_fcb_without_attr_index(path).unwrap(); + read_fcb_with_attr_index_seekable(path).unwrap(); }) }, ); - // Benchmark the file with attribute index. + // Benchmark the file with attribute index using non-seekable reader. 
group.bench_with_input( - BenchmarkId::new(format!("{} with", dataset), file_with), + BenchmarkId::new( + format!("{} with non-seekable (sequential)", dataset), + file_with, + ), &file_with, |b, &path| { b.iter(|| { - read_fcb_with_attr_index(path).unwrap(); + read_fcb_with_attr_index_non_seekable(path).unwrap(); }) }, ); @@ -148,8 +220,8 @@ pub fn read_benchmark(c: &mut Criterion) { // Optionally print a concise summary. println!("\nBenchmark Results:"); - println!("{:<12} {:<15} {:<15}", "Dataset", "Format", "Mean Time"); - println!("{:-<42}", ""); + println!("{:<12} {:<25} {:<15}", "Dataset", "Method", "Mean Time"); + println!("{:-<52}", ""); } criterion_group!(benches, read_benchmark); diff --git a/src/rust/fcb_core/src/bin/write.rs b/src/rust/fcb_core/src/bin/write.rs index 0dfae80..c8c394a 100644 --- a/src/rust/fcb_core/src/bin/write.rs +++ b/src/rust/fcb_core/src/bin/write.rs @@ -30,6 +30,7 @@ fn write_file() -> Result<(), Box> { feature_count: features.len() as u64, index_node_size: 16, attribute_indices: Some(attr_indices), + geographical_extent: None, }); let mut attr_schema = AttributeSchema::new(); for feature in features.iter() { diff --git a/src/rust/fcb_core/src/error.rs b/src/rust/fcb_core/src/error.rs index 618226a..bb1a2fa 100644 --- a/src/rust/fcb_core/src/error.rs +++ b/src/rust/fcb_core/src/error.rs @@ -52,6 +52,13 @@ pub enum Error { #[error("Invalid attribute value: {msg}")] InvalidAttributeValue { msg: String }, + // Index and query errors + #[error("Failed to create index: {0}")] + IndexCreationError(String), + + #[error("Failed to execute query: {0}")] + QueryExecutionError(String), + // HTTP errors (when http feature is enabled) #[cfg(feature = "http")] #[error("HTTP client error: {0}")] @@ -86,4 +93,12 @@ impl Error { Error::UnsupportedColumnType(_) | Error::InvalidAttributeValue { .. 
} ) } + + /// Returns true if the error is related to index or query operations + pub fn is_index_error(&self) -> bool { + matches!( + self, + Error::IndexCreationError(_) | Error::QueryExecutionError(_) | Error::BstError(_) + ) + } } diff --git a/src/rust/fcb_core/src/http_reader/mod.rs b/src/rust/fcb_core/src/http_reader/mod.rs index 7a98ded..8e610b1 100644 --- a/src/rust/fcb_core/src/http_reader/mod.rs +++ b/src/rust/fcb_core/src/http_reader/mod.rs @@ -1,13 +1,13 @@ use crate::deserializer::to_cj_feature; use crate::{build_query, fb::*, process_attr_index_entry, AttrQuery}; +use crate::error::Error; use crate::reader::city_buffer::FcbBuffer; use crate::{ check_magic_bytes, size_prefixed_root_as_city_feature, HEADER_MAX_BUFFER_SIZE, HEADER_SIZE_SIZE, MAGIC_BYTES_SIZE, }; -use anyhow::{anyhow, Result}; -use bst::{ByteSerializable, MultiIndex}; +use bst::{ByteSerializable, HttpRange as BstHttpRange, MultiIndex}; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use cjseq::CityJSONFeature; @@ -18,7 +18,9 @@ use reqwest; #[cfg(feature = "http")] use http_range_client::BufferedHttpRangeClient; +use bst::StreamableMultiIndex; use packed_rtree::{http::HttpRange, http::HttpSearchResultItem, NodeItem, PackedRTree}; +use std::collections::HashMap; use std::collections::VecDeque; use std::ops::Range; use tracing::debug; @@ -55,7 +57,7 @@ pub struct AsyncFeatureIter { #[cfg(feature = "http")] impl HttpFcbReader { - pub async fn open(url: &str) -> Result> { + pub async fn open(url: &str) -> Result, Error> { trace!("starting: opening http reader, reading header"); let client = BufferedHttpRangeClient::new(url); Self::_open(client).await @@ -63,11 +65,11 @@ impl HttpFcbReader { } impl HttpFcbReader { - pub async fn new(client: AsyncBufferedHttpRangeClient) -> Result> { + pub async fn new(client: AsyncBufferedHttpRangeClient) -> Result, Error> { Self::_open(client).await } - async fn _open(mut client: AsyncBufferedHttpRangeClient) -> Result> { + async fn _open(mut client: AsyncBufferedHttpRangeClient) -> Result, Error> { // Because we use a buffered HTTP reader, anything extra we fetch here can // be utilized to skip subsequent fetches. // Immediately following the header is the optional spatial index, we deliberately fetch @@ -95,7 +97,7 @@ impl HttpFcbReader { let mut read_bytes = 0; let bytes = client.get_range(read_bytes, MAGIC_BYTES_SIZE).await?; // to get magic bytes if !check_magic_bytes(bytes) { - return Err(anyhow!("MissingMagicBytes")); + return Err(Error::MissingMagicBytes); } read_bytes += MAGIC_BYTES_SIZE; @@ -105,7 +107,7 @@ impl HttpFcbReader { let header_size = LittleEndian::read_u32(&bytes) as usize; if header_size > HEADER_MAX_BUFFER_SIZE || header_size < 8 { // minimum size check avoids panic in FlatBuffers header decoding - return Err(anyhow!("IllegalHeaderSize: {header_size}")); + return Err(Error::IllegalHeaderSize(header_size)); } bytes.put(client.get_range(read_bytes, header_size).await?); @@ -153,7 +155,7 @@ impl HttpFcbReader { .try_fold(0u32, |acc, ai| { let len = ai.length(); if len > u32::MAX - acc { - Err(anyhow!("Attribute index size overflow")) + Err(Error::AttributeIndexSizeOverflow) } else { Ok(acc + len) } @@ -164,7 +166,7 @@ impl HttpFcbReader { } /// Select all features. 
- pub async fn select_all(self) -> Result> { + pub async fn select_all(self) -> Result, Error> { let header = self.fbs.header(); let count = header.features_count(); // TODO: support reading with unknown feature count @@ -192,12 +194,12 @@ impl HttpFcbReader { min_y: f64, max_x: f64, max_y: f64, - ) -> Result> { + ) -> Result, Error> { trace!("starting: select_bbox, traversing index"); // Read R-Tree index and build filter for features within bbox let header = self.fbs.header(); if header.index_node_size() == 0 || header.features_count() == 0 { - return Err(anyhow!("NoIndex")); + return Err(Error::NoIndex); } let count = header.features_count() as usize; let header_len = self.header_len(); @@ -238,66 +240,84 @@ impl HttpFcbReader { /// This method uses the attribute index section to find matching feature offsets. /// It then groups (batches) the remote feature ranges in order to reduce IO overhead. - pub async fn select_attr_query(mut self, query: &AttrQuery) -> Result> { + pub async fn select_attr_query( + mut self, + query: &AttrQuery, + ) -> Result, Error> { trace!("starting: select_attr_query via http reader"); - - let header = self.fbs.header(); - let header_len = self.header_len(); - // Assume the header provides rtree and attribute index sizes. - let rtree_index_size = self.rtree_index_size() as usize; - let attr_index_size = self.attr_index_size() as usize; - let attr_index_offset = header_len + rtree_index_size; - let feature_begin = header_len + rtree_index_size + attr_index_size; - - // Fetch the attribute index block via HTTP range request. - let mut attr_index_bytes = self - .client - .get_range(attr_index_offset, attr_index_size) - .await?; - - let attr_index_entries = header - .attribute_index() - .ok_or_else(|| anyhow!("attribute index not found"))?; - let columns: Vec = header - .columns() - .ok_or_else(|| anyhow!("no columns found in header"))? - .iter() - .collect(); - - let mut multi_index = MultiIndex::new(); - - for attr_info in attr_index_entries.iter() { - process_attr_index_entry( - &mut attr_index_bytes, - &mut multi_index, - &columns, - &query, - attr_info, - )?; - } - - let query = build_query(&query); - - let result = bst::stream_query(&multi_index, query, feature_begin).await?; - - let count = result.len(); - let combine_request_threshold = 256 * 1024; - - let http_ranges: Vec = result.into_iter().map(|item| item.range).collect(); - - trace!( - "completed: select_attr_query via http reader, matched features: {}", - count - ); - Ok(AsyncFeatureIter { - client: self.client, - fbs: self.fbs, - selection: FeatureSelection::SelectAttr(SelectAttr { - ranges: http_ranges, - range_pos: 0, - }), - count, - }) + unimplemented!() + // let header = self.fbs.header(); + // let header_len = self.header_len(); + // // Assume the header provides rtree and attribute index sizes. + // let rtree_index_size = self.rtree_index_size() as usize; + // let attr_index_size = self.attr_index_size() as usize; + // let attr_index_offset = header_len + rtree_index_size; + // let feature_begin = header_len + rtree_index_size + attr_index_size; + + // let attr_index_entries = header + // .attribute_index() + // .ok_or_else(|| Error::AttributeIndexNotFound)?; + // let columns: Vec = header + // .columns() + // .ok_or_else(|| Error::NoColumnsInHeader)? 
+ // .iter() + // .collect(); + + // // Create a map of field names to index offsets + // let mut index_offsets = HashMap::new(); + // let mut field_names = Vec::new(); + + // for attr_info in attr_index_entries.iter() { + // let field_name = columns + // .iter() + // .find(|c| c.index() == attr_info.index()) + // .map(|c| c.name().to_string()) + // .ok_or_else(|| Error::AttributeIndexNotFound)?; + // let offset = attr_index_offset + attr_info.length() as usize; + // index_offsets.insert(field_name.clone(), offset); + // field_names.push(field_name); + // } + + // // Create a StreamableMultiIndex from HTTP range requests + // let streamable_index = + // StreamableMultiIndex::from_http(&mut self.client, &index_offsets).await?; + + // // Build the query + // let bst_query = build_query(&query); + + // // Execute the query using HTTP streaming + // let result = streamable_index + // .http_stream_query( + // &mut self.client, + // &bst_query, + // attr_index_offset, + // feature_begin, + // ) + // .await?; + + // let count = result.len(); + + // let http_ranges: Vec = result + // .into_iter() + // .map(|item| match item.range { + // BstHttpRange::Range(range) => HttpRange::Range(range.start..range.end), + // BstHttpRange::RangeFrom(range) => HttpRange::RangeFrom(range.start..), + // }) + // .collect(); + + // trace!( + // "completed: select_attr_query via http reader, matched features: {}", + // count + // ); + // Ok(AsyncFeatureIter { + // client: self.client, + // fbs: self.fbs, + // selection: FeatureSelection::SelectAttr(SelectAttr { + // ranges: http_ranges, + // range_pos: 0, + // }), + // count, + // }) } } @@ -314,7 +334,7 @@ impl AsyncFeatureIter { } } /// Read next feature - pub async fn next(&mut self) -> Result> { + pub async fn next(&mut self) -> Result, Error> { let Some(buffer) = self.selection.next_feature_buffer(&mut self.client).await? 
else { return Ok(None); }; @@ -330,7 +350,7 @@ impl AsyncFeatureIter { &self.fbs } - pub fn cur_cj_feature(&self) -> Result { + pub fn cur_cj_feature(&self) -> Result { let cj_feature = to_cj_feature(self.cur_feature().feature(), self.header().columns())?; Ok(cj_feature) } @@ -346,7 +366,7 @@ impl FeatureSelection { async fn next_feature_buffer( &mut self, client: &mut AsyncBufferedHttpRangeClient, - ) -> Result> { + ) -> Result, Error> { match self { FeatureSelection::SelectAll(select_all) => select_all.next_buffer(client).await, FeatureSelection::SelectBbox(select_bbox) => select_bbox.next_buffer(client).await, @@ -367,7 +387,7 @@ impl SelectAll { async fn next_buffer( &mut self, client: &mut AsyncBufferedHttpRangeClient, - ) -> Result> { + ) -> Result, Error> { client.min_req_size(DEFAULT_HTTP_FETCH_SIZE); if self.features_left == 0 { @@ -394,7 +414,7 @@ impl SelectBbox { async fn next_buffer( &mut self, client: &mut AsyncBufferedHttpRangeClient, - ) -> Result> { + ) -> Result, Error> { let mut next_buffer = None; while next_buffer.is_none() { let Some(feature_batch) = self.feature_batches.last_mut() else { @@ -423,7 +443,7 @@ impl FeatureBatch { async fn make_batches( feature_ranges: Vec, combine_request_threshold: usize, - ) -> Result> { + ) -> Result, Error> { let mut batched_ranges = vec![]; for search_result_item in feature_ranges.into_iter() { @@ -498,7 +518,7 @@ impl FeatureBatch { async fn next_buffer( &mut self, client: &mut AsyncBufferedHttpRangeClient, - ) -> Result> { + ) -> Result, Error> { let request_size = self.request_size(); client.set_min_req_size(request_size); let Some(feature_range) = self.feature_ranges.pop_front() else { @@ -525,7 +545,7 @@ impl SelectAttr { async fn next_buffer( &mut self, client: &mut AsyncBufferedHttpRangeClient, - ) -> Result> { + ) -> Result, Error> { let Some(range) = self.ranges.get(self.range_pos) else { return Ok(None); }; diff --git a/src/rust/fcb_core/src/reader/attr_query.rs b/src/rust/fcb_core/src/reader/attr_query.rs index ca98436..adfd6ce 100644 --- a/src/rust/fcb_core/src/reader/attr_query.rs +++ b/src/rust/fcb_core/src/reader/attr_query.rs @@ -1,8 +1,13 @@ +use anyhow::Result; +use std::collections::HashMap; use std::io::{self, Read, Seek, SeekFrom}; use crate::error::Error; -use bst::{ByteSerializable, IndexSerializable, OrderedFloat, SortedIndex}; -pub use bst::{ByteSerializableValue, MultiIndex, Operator, Query, QueryCondition}; +use bst::{BufferedIndex, IndexSerializable, OrderedFloat}; +pub use bst::{ + ByteSerializableValue, MultiIndex, Operator, Query, Query as AttributeQuery, QueryCondition, + StreamableMultiIndex, +}; use chrono::{DateTime, Utc}; @@ -30,73 +35,55 @@ pub fn process_attr_index_entry( if query.iter().any(|(name, _, _)| col.name() == name) { match col.type_() { ColumnType::Int => { - let index = SortedIndex::::deserialize(&mut buffer.as_slice())?; + let index = BufferedIndex::::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::Long => { - let index = SortedIndex::::deserialize(&mut buffer.as_slice())?; + let index = BufferedIndex::::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::Float => { let index = - SortedIndex::>::deserialize(&mut buffer.as_slice())?; + BufferedIndex::>::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::Double => { let index = - SortedIndex::>::deserialize(&mut buffer.as_slice())?; + 
BufferedIndex::>::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::String => { - let index = SortedIndex::::deserialize(&mut buffer.as_slice())?; + let index = BufferedIndex::::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::Bool => { - let index = SortedIndex::::deserialize(&mut buffer.as_slice())?; + let index = BufferedIndex::::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } ColumnType::DateTime => { - let index = SortedIndex::>::deserialize(&mut buffer.as_slice())?; + let index = + BufferedIndex::>::deserialize(&mut buffer.as_slice())?; multi_index.add_index(col.name().to_string(), Box::new(index)); } _ => return Err(Error::UnsupportedColumnType(col.name().to_string())), } + } else { + println!(" - Skipping index for field: {}", col.name()); } } Ok(()) } -fn byte_serializable_to_bytes(value: &ByteSerializableValue) -> Vec { - match value { - ByteSerializableValue::I64(i) => i.to_bytes(), - ByteSerializableValue::I32(i) => i.to_bytes(), - ByteSerializableValue::I16(i) => i.to_bytes(), - ByteSerializableValue::I8(i) => i.to_bytes(), - ByteSerializableValue::U64(i) => i.to_bytes(), - ByteSerializableValue::U32(i) => i.to_bytes(), - ByteSerializableValue::U16(i) => i.to_bytes(), - ByteSerializableValue::U8(i) => i.to_bytes(), - ByteSerializableValue::F64(i) => i.to_bytes(), - ByteSerializableValue::F32(i) => i.to_bytes(), - ByteSerializableValue::Bool(i) => i.to_bytes(), - ByteSerializableValue::String(s) => s.to_bytes(), - ByteSerializableValue::NaiveDateTime(dt) => dt.to_bytes(), - ByteSerializableValue::NaiveDate(d) => d.to_bytes(), - ByteSerializableValue::DateTime(dt) => dt.to_bytes(), - } -} - pub fn build_query(query: &AttrQuery) -> Query { - Query { - conditions: query - .iter() - .map(|(name, operator, value)| QueryCondition { - field: name.to_string(), - operator: *operator, - key: byte_serializable_to_bytes(value), - }) - .collect(), - } + let conditions = query + .iter() + .map(|(field, operator, value)| QueryCondition { + field: field.clone(), + operator: *operator, + key: value.to_bytes(), + }) + .collect(); + Query { conditions } } impl FcbReader { @@ -109,31 +96,81 @@ impl FcbReader { let attr_index_entries = header .attribute_index() .ok_or(Error::AttributeIndexNotFound)?; + if attr_index_entries.is_empty() { + return Err(Error::AttributeIndexNotFound); + } + + let mut attr_index_entries: Vec<&AttributeIndex> = attr_index_entries.iter().collect(); + attr_index_entries.sort_by_key(|attr| attr.index()); + let columns = header.columns().ok_or(Error::NoColumnsInHeader)?; let columns: Vec = columns.iter().collect(); - // skip the rtree index bytes; we know the correct offset for that + // Get the current position (should be at the start of the file) + let start_pos = self.reader.stream_position()?; + + // Skip the rtree index bytes; we know the correct offset for that let rtree_offset = self.rtree_index_size(); self.reader.seek(SeekFrom::Current(rtree_offset as i64))?; - let mut multi_index = MultiIndex::new(); + // Now we should be at the start of the attribute indices + let attr_index_start_pos = self.reader.stream_position()?; - // Process each attribute index entry. 
+ // Create a mapping from field names to index offsets + let mut index_offsets = HashMap::new(); + let mut current_offset = 0; + + // First pass: build the index_offsets map and skip over all indices for attr_info in attr_index_entries.iter() { - process_attr_index_entry( - &mut self.reader, - &mut multi_index, - &columns, - &query, - attr_info, - )?; + let column_idx = attr_info.index(); + let field_name = columns + .iter() + .find(|col| col.index() == column_idx) + .ok_or(Error::AttributeIndexNotFound)? + .name() + .to_string(); + let index_size = attr_info.length() as u64; + + // Store the offset for this field + index_offsets.insert(field_name, attr_index_start_pos + current_offset); + + // Skip over this index to position at the next one + current_offset += index_size; + self.reader.seek(SeekFrom::Current(index_size as i64))?; } - let query = build_query(&query); + // Reset reader position to the start of attribute indices + self.reader.seek(SeekFrom::Start(attr_index_start_pos))?; + + // Try to create the StreamableMultiIndex with detailed error handling + let streamable_index = + match StreamableMultiIndex::from_reader(&mut self.reader, &index_offsets) { + Ok(index) => index, + Err(e) => { + return Err(Error::IndexCreationError(format!( + "Failed to create streamable index: {}", + e + ))); + } + }; + + // Create a query from the AttrQuery + let query_obj = build_query(&query); + + let result = match streamable_index.stream_query(&mut self.reader, &query_obj) { + Ok(res) => res, + Err(e) => { + return Err(Error::QueryExecutionError(format!( + "Failed to execute streaming query: {}", + e + ))); + } + }; + + // Sort the results + let mut result_vec: Vec = result.into_iter().collect(); + result_vec.sort(); - let mut result = multi_index.query(query); - // sort result so it can read features in order - result.sort(); let header_size = self.buffer.header_buf.len(); let feature_offset = FeatureOffset { magic_bytes: 8, @@ -141,13 +178,15 @@ impl FcbReader { rtree_index: self.rtree_index_size(), attributes: self.attr_index_size(), }; - let total_feat_count = result.len() as u64; + + let total_feat_count = result_vec.len() as u64; + Ok(FeatureIter::::new( self.reader, self.verify, self.buffer, None, - Some(result), + Some(result_vec), feature_offset, total_feat_count, )) @@ -169,26 +208,46 @@ impl FcbReader { .ok_or_else(|| anyhow::anyhow!("no columns found in header"))? .iter() .collect(); + // Instead of seeking, read and discard the rtree index bytes; we know the correct offset for that. 
let rtree_offset = self.rtree_index_size(); io::copy(&mut (&mut self.reader).take(rtree_offset), &mut io::sink())?; + // Since we can't use StreamableMultiIndex with a non-seekable reader, + // we'll still use MultiIndex but optimize the process to minimize memory usage let mut multi_index = MultiIndex::new(); + // Process each attribute index entry, but only load the ones needed for our query + let query_fields: Vec = query.iter().map(|(field, _, _)| field.clone()).collect(); + for attr_info in attr_index_entries.iter() { - process_attr_index_entry( - &mut self.reader, - &mut multi_index, - &columns, - &query, - attr_info, - )?; - } + let column_idx = attr_info.index(); + let field_name = columns[column_idx as usize].name().to_string(); - let query = build_query(&query); + // Only process this attribute if it's used in the query + if query_fields.contains(&field_name) { + process_attr_index_entry( + &mut self.reader, + &mut multi_index, + &columns, + &query, + attr_info, + )?; + } else { + // Skip this attribute index if not needed + let index_size = attr_info.length(); + io::copy( + &mut (&mut self.reader).take(index_size as u64), + &mut io::sink(), + )?; + } + } - let mut result = multi_index.query(query); + // Build and execute the query + let query_obj = build_query(&query); + let mut result = multi_index.query(query_obj); result.sort(); + let header_size = self.buffer.header_buf.len(); let feature_offset = FeatureOffset { magic_bytes: 8, @@ -196,7 +255,10 @@ impl FcbReader { rtree_index: self.rtree_index_size(), attributes: self.attr_index_size(), }; + let total_feat_count = result.len() as u64; + + // Create and return the FeatureIter Ok(FeatureIter::::new( self.reader, self.verify, diff --git a/src/rust/fcb_core/src/writer/attr_index.rs b/src/rust/fcb_core/src/writer/attr_index.rs index baf9ec2..0abbcc8 100644 --- a/src/rust/fcb_core/src/writer/attr_index.rs +++ b/src/rust/fcb_core/src/writer/attr_index.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; -use bst::{ByteSerializable, IndexSerializable, KeyValue, SortedIndex}; -use chrono::NaiveDateTime; +use bst::{BufferedIndex, ByteSerializable, IndexSerializable, KeyValue}; +use chrono::{DateTime, Utc}; use ordered_float::OrderedFloat; use crate::ColumnType; @@ -18,7 +18,7 @@ fn build_index_generic( extract: F, ) -> Option<(Vec, AttributeIndexInfo)> where - T: Ord + Clone + ByteSerializable, + T: Ord + Clone + ByteSerializable + 'static, F: Fn(&AttributeIndexEntry) -> Option, { let mut entries: Vec> = Vec::new(); @@ -38,7 +38,7 @@ where } } - let mut sorted_index = SortedIndex::new(); + let mut sorted_index = BufferedIndex::new(); sorted_index.build_index(entries); let mut buf = Vec::new(); sorted_index.serialize(&mut buf).ok()?; @@ -166,7 +166,7 @@ pub(super) fn build_attribute_index_for_attr( }) } ColumnType::DateTime => { - build_index_generic::(*schema_index, attribute_entries, |entry| { + build_index_generic::, _>(*schema_index, attribute_entries, |entry| { if let AttributeIndexEntry::DateTime { index, val } = entry { if *index == *schema_index { Some(*val) @@ -178,6 +178,84 @@ pub(super) fn build_attribute_index_for_attr( } }) } + ColumnType::Short => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::Short { index, val } = entry { + if *index == *schema_index { + Some(*val) + } else { + None + } + } else { + None + } + }) + } + ColumnType::UShort => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::UShort { index, val } = entry 
{ + if *index == *schema_index { + Some(*val) + } else { + None + } + } else { + None + } + }) + } + ColumnType::Byte => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::Byte { index, val } = entry { + if *index == *schema_index { + Some(*val) + } else { + None + } + } else { + None + } + }) + } + ColumnType::UByte => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::UByte { index, val } = entry { + if *index == *schema_index { + Some(*val) + } else { + None + } + } else { + None + } + }) + } + ColumnType::Json => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::Json { index, val } = entry { + if *index == *schema_index { + Some(val.clone()) + } else { + None + } + } else { + None + } + }) + } + ColumnType::Binary => { + build_index_generic::(*schema_index, attribute_entries, |entry| { + if let AttributeIndexEntry::Binary { index, val } = entry { + if *index == *schema_index { + Some(val.clone()) + } else { + None + } + } else { + None + } + }) + } _ => { println!("Unsupported column type for indexing: {:?}", coltype); None diff --git a/src/rust/fcb_core/src/writer/attribute.rs b/src/rust/fcb_core/src/writer/attribute.rs index 01e3310..4913413 100644 --- a/src/rust/fcb_core/src/writer/attribute.rs +++ b/src/rust/fcb_core/src/writer/attribute.rs @@ -1,6 +1,6 @@ use crate::fb::ColumnType; use byteorder::{ByteOrder, LittleEndian}; -use chrono::NaiveDateTime; +use chrono::{DateTime, Utc}; use cjseq::CityJSONFeature; use serde_json::Value; use std::collections::HashMap; @@ -203,7 +203,13 @@ pub enum AttributeIndexEntry { Float { index: u16, val: f32 }, Double { index: u16, val: f64 }, String { index: u16, val: String }, - DateTime { index: u16, val: NaiveDateTime }, + DateTime { index: u16, val: DateTime }, + Short { index: u16, val: i16 }, + UShort { index: u16, val: u16 }, + Byte { index: u16, val: u8 }, + UByte { index: u16, val: u8 }, + Json { index: u16, val: String }, + Binary { index: u16, val: String }, } pub fn cityfeature_to_index_entries( @@ -307,12 +313,12 @@ pub fn attribute_to_index_entries( index: *index, val: match chrono::DateTime::parse_from_rfc3339(val.as_str().unwrap_or("")) { - Ok(dt) => dt.naive_utc(), + Ok(dt) => dt.to_utc(), Err(e) => { eprintln!("Failed to parse DateTime: {}", e); // Choose whether to skip, default, or handle differently // For example, default to 1970-01-01: - NaiveDateTime::from_timestamp_opt(0, 0).unwrap() + DateTime::::from_timestamp(0, 0).unwrap() } }, }); diff --git a/src/rust/fcb_core/src/writer/geom_encoder.rs b/src/rust/fcb_core/src/writer/geom_encoder.rs index 32449c7..cad0b79 100644 --- a/src/rust/fcb_core/src/writer/geom_encoder.rs +++ b/src/rust/fcb_core/src/writer/geom_encoder.rs @@ -920,7 +920,6 @@ mod tests { assert_eq!(encoded.len(), 1); match &encoded[0] { MaterialMapping::Values(values) => { - println!("values: {:?}", values); assert_eq!(values.theme, "theme6"); assert_eq!(values.solids, vec![2, 1]); // Two solids, the first solid has 2 shells, the second solid has 1 shell assert_eq!(values.shells, vec![3, 3, 3]); // Each shell has 3 surfaces diff --git a/src/rust/fcb_core/src/writer/header_writer.rs b/src/rust/fcb_core/src/writer/header_writer.rs index c699f72..16a80db 100644 --- a/src/rust/fcb_core/src/writer/header_writer.rs +++ b/src/rust/fcb_core/src/writer/header_writer.rs @@ -29,6 +29,8 @@ pub struct HeaderWriterOptions { pub index_node_size: u16, /// Attribute indices pub 
attribute_indices: Option>, + /// Geographical extent + pub geographical_extent: Option<[f64; 6]>, } impl Default for HeaderWriterOptions { @@ -38,6 +40,7 @@ impl Default for HeaderWriterOptions { index_node_size: PackedRTree::DEFAULT_NODE_SIZE, feature_count: 0, attribute_indices: None, + geographical_extent: None, } } } diff --git a/src/rust/fcb_core/src/writer/serializer.rs b/src/rust/fcb_core/src/writer/serializer.rs index 7fc56cb..c9fc811 100644 --- a/src/rust/fcb_core/src/writer/serializer.rs +++ b/src/rust/fcb_core/src/writer/serializer.rs @@ -64,15 +64,24 @@ pub(super) fn to_fcb_header<'a>( None } }; + + // Use the geographical_extent from the HeaderWriterOptions if provided + let geographical_extent_from_options = header_options + .geographical_extent + .as_ref() + .map(to_geographical_extent); + if let Some(meta) = cj.metadata.as_ref() { let reference_system = meta .reference_system .as_ref() .map(|ref_sys| to_reference_system(fbb, ref_sys)); - let geographical_extent = meta - .geographical_extent - .as_ref() - .map(to_geographical_extent); + // Use the geographical_extent from the HeaderWriterOptions if provided, otherwise use the one from the metadata + let geographical_extent = geographical_extent_from_options.or_else(|| { + meta.geographical_extent + .as_ref() + .map(to_geographical_extent) + }); let identifier = meta.identifier.as_ref().map(|i| fbb.create_string(i)); let reference_date = meta.reference_date.as_ref().map(|r| fbb.create_string(r)); let title = meta.title.as_ref().map(|t| fbb.create_string(t)); @@ -149,7 +158,9 @@ pub(super) fn to_fcb_header<'a>( columns, features_count, index_node_size, + geographical_extent: geographical_extent_from_options.as_ref(), version, + attribute_index, ..Default::default() }, ) diff --git a/src/rust/fcb_core/tests/attr_index.rs b/src/rust/fcb_core/tests/attr_index.rs index 529e9f8..eb9e72d 100644 --- a/src/rust/fcb_core/tests/attr_index.rs +++ b/src/rust/fcb_core/tests/attr_index.rs @@ -8,403 +8,419 @@ use fcb_core::{ read_cityjson_from_reader, CJType, CJTypeKind, FcbReader, FcbWriter, }; use ordered_float::OrderedFloat; -use pretty_assertions::assert_eq; use std::{ fs::File, io::{BufReader, Cursor, Seek, SeekFrom}, path::PathBuf, }; -#[test] -fn test_attr_index() -> Result<()> { - // Setup paths - let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let input_file = manifest_dir - .join("tests") - .join("data") - .join("small.city.jsonl"); +mod tests { + use super::*; + use pretty_assertions::assert_eq; - // Read original CityJSONSeq - let input_file = File::open(input_file)?; - let input_reader = BufReader::new(input_file); - let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? { - CJType::Seq(seq) => seq, - _ => panic!("Expected CityJSONSeq"), - }; + #[test] + fn test_attr_index() -> Result<()> { + // Setup paths + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let input_file = manifest_dir + .join("tests") + .join("data") + .join("small.city.jsonl"); - // Write to FCB + // Read original CityJSONSeq + let input_file = File::open(input_file)?; + let input_reader = BufReader::new(input_file); + let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? 
{ + CJType::Seq(seq) => seq, + _ => panic!("Expected CityJSONSeq"), + }; - let mut memory_buffer = Cursor::new(Vec::new()); - let mut attr_schema = AttributeSchema::new(); - for feature in original_cj_seq.features.iter() { - for (_, co) in feature.city_objects.iter() { - if let Some(attributes) = &co.attributes { - attr_schema.add_attributes(attributes); + // Write to FCB + + let mut memory_buffer = Cursor::new(Vec::new()); + let mut attr_schema = AttributeSchema::new(); + for feature in original_cj_seq.features.iter() { + for (_, co) in feature.city_objects.iter() { + if let Some(attributes) = &co.attributes { + attr_schema.add_attributes(attributes); + } } } - } - let attr_indices = vec!["b3_h_dak_50p".to_string(), "identificatie".to_string()]; - let mut fcb = FcbWriter::new( - original_cj_seq.cj.clone(), - Some(HeaderWriterOptions { - write_index: true, - feature_count: original_cj_seq.features.len() as u64, - index_node_size: 16, - attribute_indices: Some(attr_indices), - }), - Some(attr_schema), - )?; - for feature in original_cj_seq.features.iter() { - fcb.add_feature(feature)?; - } - fcb.write(&mut memory_buffer)?; + let attr_indices = vec!["b3_h_dak_50p".to_string(), "identificatie".to_string()]; + let mut fcb = FcbWriter::new( + original_cj_seq.cj.clone(), + Some(HeaderWriterOptions { + write_index: true, + feature_count: original_cj_seq.features.len() as u64, + index_node_size: 16, + attribute_indices: Some(attr_indices), + geographical_extent: None, + }), + Some(attr_schema), + )?; + for feature in original_cj_seq.features.iter() { + fcb.add_feature(feature)?; + } + fcb.write(&mut memory_buffer)?; - let query: Vec<(String, Operator, ByteSerializableValue)> = vec![ - ( - "b3_h_dak_50p".to_string(), - Operator::Gt, - ByteSerializableValue::F64(OrderedFloat(2.0)), - ), - ( - "identificatie".to_string(), - Operator::Eq, - ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), - ), - ]; - memory_buffer.seek(std::io::SeekFrom::Start(0))?; + let query: Vec<(String, Operator, ByteSerializableValue)> = vec![ + ( + "b3_h_dak_50p".to_string(), + Operator::Gt, + ByteSerializableValue::F64(OrderedFloat(2.0)), + ), + ( + "identificatie".to_string(), + Operator::Eq, + ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), + ), + ]; + memory_buffer.seek(std::io::SeekFrom::Start(0))?; - let mut reader = FcbReader::open(memory_buffer)?.select_attr_query(query)?; + let mut reader = FcbReader::open(memory_buffer)?.select_attr_query(query)?; - let header = reader.header(); - let mut deserialized_features = Vec::new(); - let feat_count = header.features_count(); - let mut feat_num = 0; - while let Ok(Some(feat_buf)) = reader.next() { - let feature = feat_buf.cur_cj_feature()?; - deserialized_features.push(feature); - feat_num += 1; - if feat_num >= feat_count { - break; + let header = reader.header(); + let mut deserialized_features = Vec::new(); + let feat_count = header.features_count(); + let mut feat_num = 0; + while let Ok(Some(feat_buf)) = reader.next() { + let feature = feat_buf.cur_cj_feature()?; + deserialized_features.push(feature); + feat_num += 1; + if feat_num >= feat_count { + break; + } } - } - assert_eq!(deserialized_features.len(), 1); - let feature = deserialized_features.first().unwrap(); - let mut contains_b3_h_dak_50p = false; - let mut contains_identificatie = false; - for co in feature.city_objects.values() { - if co.attributes.is_some() { - let attrs = co.attributes.as_ref().unwrap(); - if let Some(b3_h_dak_50p) = 
attrs.get("b3_h_dak_50p") { - if b3_h_dak_50p.as_f64().unwrap() > 2.0 { - contains_b3_h_dak_50p = true; + assert_eq!(deserialized_features.len(), 1); + let feature = deserialized_features.first().unwrap(); + let mut contains_b3_h_dak_50p = false; + let mut contains_identificatie = false; + for co in feature.city_objects.values() { + if co.attributes.is_some() { + let attrs = co.attributes.as_ref().unwrap(); + if let Some(b3_h_dak_50p) = attrs.get("b3_h_dak_50p") { + if b3_h_dak_50p.as_f64().unwrap() > 2.0 { + contains_b3_h_dak_50p = true; + } } - } - if let Some(identificatie) = attrs.get("identificatie") { - if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { - contains_identificatie = true; + if let Some(identificatie) = attrs.get("identificatie") { + if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { + contains_identificatie = true; + } } } } - } - assert!(contains_b3_h_dak_50p); - assert!(contains_identificatie); + assert!(contains_b3_h_dak_50p); + assert!(contains_identificatie); - Ok(()) -} + Ok(()) + } -#[test] -fn test_attr_index_seq() -> Result<()> { - // Setup paths - let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let input_file = manifest_dir - .join("tests") - .join("data") - .join("small.city.jsonl"); + #[test] + fn test_attr_index_seq() -> Result<()> { + // Setup paths + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let input_file = manifest_dir + .join("tests") + .join("data") + .join("small.city.jsonl"); - // Read original CityJSONSeq - let input_file = File::open(input_file)?; - let input_reader = BufReader::new(input_file); - let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? { - CJType::Seq(seq) => seq, - _ => panic!("Expected CityJSONSeq"), - }; + // Read original CityJSONSeq + let input_file = File::open(input_file)?; + let input_reader = BufReader::new(input_file); + let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? 
{ + CJType::Seq(seq) => seq, + _ => panic!("Expected CityJSONSeq"), + }; - // Write to FCB + // Write to FCB - let mut memory_buffer = Cursor::new(Vec::new()); + let mut memory_buffer = Cursor::new(Vec::new()); - let mut attr_schema = AttributeSchema::new(); - for feature in original_cj_seq.features.iter() { - for (_, co) in feature.city_objects.iter() { - if let Some(attributes) = &co.attributes { - attr_schema.add_attributes(attributes); + let mut attr_schema = AttributeSchema::new(); + for feature in original_cj_seq.features.iter() { + for (_, co) in feature.city_objects.iter() { + if let Some(attributes) = &co.attributes { + attr_schema.add_attributes(attributes); + } } } - } - let attr_indices = vec!["b3_h_dak_50p".to_string(), "identificatie".to_string()]; - let mut fcb = FcbWriter::new( - original_cj_seq.cj.clone(), - Some(HeaderWriterOptions { - write_index: true, - feature_count: original_cj_seq.features.len() as u64, - index_node_size: 16, - attribute_indices: Some(attr_indices), - }), - Some(attr_schema), - )?; - for feature in original_cj_seq.features.iter() { - fcb.add_feature(feature)?; - } - fcb.write(&mut memory_buffer)?; + let attr_indices = vec!["b3_h_dak_50p".to_string(), "identificatie".to_string()]; + let mut fcb = FcbWriter::new( + original_cj_seq.cj.clone(), + Some(HeaderWriterOptions { + write_index: true, + feature_count: original_cj_seq.features.len() as u64, + index_node_size: 16, + attribute_indices: Some(attr_indices), + geographical_extent: None, + }), + Some(attr_schema), + )?; + for feature in original_cj_seq.features.iter() { + fcb.add_feature(feature)?; + } + fcb.write(&mut memory_buffer)?; - let query: Vec<(String, Operator, ByteSerializableValue)> = vec![ - ( - "b3_h_dak_50p".to_string(), - Operator::Gt, - ByteSerializableValue::F64(OrderedFloat(2.0)), - ), - ( - "identificatie".to_string(), - Operator::Eq, - ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), - ), - ]; - memory_buffer.seek(std::io::SeekFrom::Start(0))?; - let mut reader = FcbReader::open(memory_buffer)?.select_attr_query_seq(query)?; + let query: Vec<(String, Operator, ByteSerializableValue)> = vec![ + ( + "b3_h_dak_50p".to_string(), + Operator::Gt, + ByteSerializableValue::F64(OrderedFloat(2.0)), + ), + ( + "identificatie".to_string(), + Operator::Eq, + ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), + ), + ]; + memory_buffer.seek(std::io::SeekFrom::Start(0))?; + let mut reader = FcbReader::open(memory_buffer)?.select_attr_query_seq(query)?; - let header = reader.header(); - let mut deserialized_features = Vec::new(); - let feat_count = header.features_count(); - let mut feat_num = 0; - while let Ok(Some(feat_buf)) = reader.next() { - let feature = feat_buf.cur_cj_feature()?; - deserialized_features.push(feature); - feat_num += 1; - if feat_num >= feat_count { - break; + let header = reader.header(); + let mut deserialized_features = Vec::new(); + let feat_count = header.features_count(); + let mut feat_num = 0; + while let Ok(Some(feat_buf)) = reader.next() { + let feature = feat_buf.cur_cj_feature()?; + deserialized_features.push(feature); + feat_num += 1; + if feat_num >= feat_count { + break; + } } - } - assert_eq!(deserialized_features.len(), 1); - let feature = deserialized_features.first().unwrap(); - let mut contains_b3_h_dak_50p = false; - let mut contains_identificatie = false; - for co in feature.city_objects.values() { - if co.attributes.is_some() { - let attrs = co.attributes.as_ref().unwrap(); - if let 
Some(b3_h_dak_50p) = attrs.get("b3_h_dak_50p") { - if b3_h_dak_50p.as_f64().unwrap() > 2.0 { - contains_b3_h_dak_50p = true; + assert_eq!(deserialized_features.len(), 1); + let feature = deserialized_features.first().unwrap(); + let mut contains_b3_h_dak_50p = false; + let mut contains_identificatie = false; + for co in feature.city_objects.values() { + if co.attributes.is_some() { + let attrs = co.attributes.as_ref().unwrap(); + if let Some(b3_h_dak_50p) = attrs.get("b3_h_dak_50p") { + if b3_h_dak_50p.as_f64().unwrap() > 2.0 { + contains_b3_h_dak_50p = true; + } } - } - if let Some(identificatie) = attrs.get("identificatie") { - if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { - contains_identificatie = true; + if let Some(identificatie) = attrs.get("identificatie") { + if identificatie.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { + contains_identificatie = true; + } } } } - } - assert!(contains_b3_h_dak_50p); - assert!(contains_identificatie); + assert!(contains_b3_h_dak_50p); + assert!(contains_identificatie); - Ok(()) -} + Ok(()) + } -#[test] -fn test_attr_index_multiple_queries() -> Result<()> { - // --- Prepare FCB data --- - let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let input_file = manifest_dir - .join("tests") - .join("data") - .join("small.city.jsonl"); + #[test] + fn test_attr_index_multiple_queries() -> Result<()> { + // --- Prepare FCB data --- + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let input_file = manifest_dir + .join("tests") + .join("data") + .join("small.city.jsonl"); - let input_file = File::open(input_file)?; - let input_reader = BufReader::new(input_file); - let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? { - CJType::Seq(seq) => seq, - _ => panic!("Expected CityJSONSeq"), - }; + let input_file = File::open(input_file)?; + let input_reader = BufReader::new(input_file); + let original_cj_seq = match read_cityjson_from_reader(input_reader, CJTypeKind::Seq)? { + CJType::Seq(seq) => seq, + _ => panic!("Expected CityJSONSeq"), + }; - // Build attribute schema from features. - let mut attr_schema = AttributeSchema::new(); - for feature in original_cj_seq.features.iter() { - for (_, co) in feature.city_objects.iter() { - if let Some(attributes) = &co.attributes { - attr_schema.add_attributes(attributes); + // Build attribute schema from features. + let mut attr_schema = AttributeSchema::new(); + for feature in original_cj_seq.features.iter() { + for (_, co) in feature.city_objects.iter() { + if let Some(attributes) = &co.attributes { + attr_schema.add_attributes(attributes); + } } } - } - let attr_indices = vec![ - "b3_h_dak_50p".to_string(), - "identificatie".to_string(), - "tijdstipregistratie".to_string(), - ]; - let mut fcb = FcbWriter::new( - original_cj_seq.cj.clone(), - Some(HeaderWriterOptions { - write_index: true, - feature_count: original_cj_seq.features.len() as u64, - index_node_size: 16, - attribute_indices: Some(attr_indices), - }), - Some(attr_schema), - )?; - for feature in original_cj_seq.features.iter() { - fcb.add_feature(feature)?; - } - let mut memory_buffer = Cursor::new(Vec::new()); - fcb.write(&mut memory_buffer)?; - // Clone the underlying byte vector to re-create a fresh Cursor for every test case. 
- let fcb_data = memory_buffer.get_ref().clone(); + let attr_indices = vec![ + "b3_h_dak_50p".to_string(), + "identificatie".to_string(), + "tijdstipregistratie".to_string(), + ]; + let mut fcb = FcbWriter::new( + original_cj_seq.cj.clone(), + Some(HeaderWriterOptions { + write_index: true, + feature_count: original_cj_seq.features.len() as u64, + index_node_size: 16, + attribute_indices: Some(attr_indices), + geographical_extent: None, + }), + Some(attr_schema), + )?; + for feature in original_cj_seq.features.iter() { + fcb.add_feature(feature)?; + } + let mut memory_buffer = Cursor::new(Vec::new()); + fcb.write(&mut memory_buffer)?; + // Clone the underlying byte vector to re-create a fresh Cursor for every test case. + let fcb_data = memory_buffer.get_ref().clone(); - // --- Helper: Run a query test --- - fn run_query_test( - data: &[u8], - query: &Vec<(String, Operator, ByteSerializableValue)>, - ) -> Result> { - // Create a new Cursor from the data. - let mut cursor = Cursor::new(data.to_vec()); - cursor.seek(SeekFrom::Start(0))?; - let mut reader = FcbReader::open(cursor)?.select_attr_query(query.clone())?; - let feat_count = reader.header().features_count(); - let mut features = Vec::new(); - let mut feat_num = 0; - while let Ok(Some(feat_buf)) = reader.next() { - let feature = feat_buf.cur_cj_feature()?; - features.push(feature); - feat_num += 1; - if feat_num >= feat_count { - break; + // --- Helper: Run a query test --- + fn run_query_test( + data: &[u8], + query: &Vec<(String, Operator, ByteSerializableValue)>, + ) -> Result> { + // Create a new Cursor from the data. + let mut cursor = Cursor::new(data.to_vec()); + cursor.seek(SeekFrom::Start(0))?; + let mut reader = FcbReader::open(cursor)?.select_attr_query(query.clone())?; + let feat_count = reader.header().features_count(); + let mut features = Vec::new(); + let mut feat_num = 0; + while let Ok(Some(feat_buf)) = reader.next() { + let feature = feat_buf.cur_cj_feature()?; + features.push(feature); + feat_num += 1; + if feat_num >= feat_count { + break; + } } + Ok(features) } - Ok(features) - } - // --- Define Test Cases --- - #[derive(Debug)] - struct QueryTestCase { - query: Vec<(String, Operator, ByteSerializableValue)>, - expected_count: usize, - /// A validator function that returns true if the feature satisfies expected conditions. - validator: fn(&CityJSONFeature) -> bool, - } + // --- Define Test Cases --- + #[derive(Debug)] + struct QueryTestCase { + test_name: &'static str, + query: Vec<(String, Operator, ByteSerializableValue)>, + expected_count: usize, + /// A validator function that returns true if the feature satisfies expected conditions. + validator: fn(&CityJSONFeature) -> bool, + } - let test_cases = vec![ - // Test case: Expect one matching feature with b3_h_dak_50p > 2.0 and matching identificatie. - QueryTestCase { - query: vec![ - ( - "b3_h_dak_50p".to_string(), - Operator::Gt, - ByteSerializableValue::F64(OrderedFloat(2.0)), - ), - ( - "identificatie".to_string(), - Operator::Eq, - ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), - ), - ], - expected_count: 1, - validator: |feature: &CityJSONFeature| { - let mut valid_b3 = false; - let mut valid_ident = false; - for co in feature.city_objects.values() { - if let Some(attrs) = &co.attributes { - if let Some(val) = attrs.get("b3_h_dak_50p") { - if val.as_f64().unwrap() > 2.0 { - valid_b3 = true; + let test_cases = vec![ + // Test case: Expect one matching feature with b3_h_dak_50p > 2.0 and matching identificatie. 
+ QueryTestCase { + test_name: "test_attr_index_multiple_queries: b3_h_dak_50p > 2.0 and identificatie == NL.IMBAG.Pand.0503100000012869", + query: vec![ + ( + "b3_h_dak_50p".to_string(), + Operator::Gt, + ByteSerializableValue::F64(OrderedFloat(2.0)), + ), + ( + "identificatie".to_string(), + Operator::Eq, + ByteSerializableValue::String("NL.IMBAG.Pand.0503100000012869".to_string()), + ), + ], + expected_count: 1, + validator: |feature: &CityJSONFeature| { + let mut valid_b3 = false; + let mut valid_ident = false; + for co in feature.city_objects.values() { + if let Some(attrs) = &co.attributes { + if let Some(val) = attrs.get("b3_h_dak_50p") { + if val.as_f64().unwrap() > 2.0 { + valid_b3 = true; + } } - } - if let Some(ident) = attrs.get("identificatie") { - if ident.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { - valid_ident = true; + if let Some(ident) = attrs.get("identificatie") { + if ident.as_str().unwrap() == "NL.IMBAG.Pand.0503100000012869" { + valid_ident = true; + } } } } - } - valid_b3 && valid_ident + valid_b3 && valid_ident + }, }, - }, - // Test case: Expect zero features where tijdstipregistratie is before 2008-01-01. - QueryTestCase { - query: vec![( - "tijdstipregistratie".to_string(), - Operator::Lt, - ByteSerializableValue::NaiveDateTime( - chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0), - ), - )], - expected_count: 0, - validator: |feature: &CityJSONFeature| { - let mut valid_tijdstip = true; - let query_tijdstip = chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0); - for co in feature.city_objects.values() { - if let Some(attrs) = &co.attributes { - if let Some(val) = attrs.get("tijdstipregistratie") { - let val_tijdstip = chrono::NaiveDateTime::parse_from_str( - val.as_str().unwrap(), - "%Y-%m-%dT%H:%M:%S", - ) - .unwrap(); - if val_tijdstip < query_tijdstip { - valid_tijdstip = false; + // Test case: Expect zero features where tijdstipregistratie is before 2008-01-01. + QueryTestCase { + test_name: "test_attr_index_multiple_queries: tijdstipregistratie < 2008-01-01", + query: vec![( + "tijdstipregistratie".to_string(), + Operator::Lt, + ByteSerializableValue::DateTime(chrono::DateTime::::from_utc( + chrono::NaiveDate::from_ymd_opt(2008, 1, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap(), + chrono::Utc, + )), + )], + expected_count: 0, + validator: |feature: &CityJSONFeature| { + let mut valid_tijdstip = true; + let query_tijdstip = chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0); + for co in feature.city_objects.values() { + if let Some(attrs) = &co.attributes { + if let Some(val) = attrs.get("tijdstipregistratie") { + let val_tijdstip = chrono::NaiveDateTime::parse_from_str( + val.as_str().unwrap(), + "%Y-%m-%dT%H:%M:%S", + ) + .unwrap(); + if val_tijdstip < query_tijdstip { + valid_tijdstip = false; + } } } } - } - valid_tijdstip + valid_tijdstip + }, }, - }, - // Test case: Expect zero features where tijdstipregistratie is after 2008-01-01. 
- QueryTestCase { - query: vec![( - "tijdstipregistratie".to_string(), - Operator::Gt, - ByteSerializableValue::NaiveDateTime( - chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0), - ), - )], - expected_count: 3, - validator: |feature: &CityJSONFeature| { - let mut valid_tijdstip = false; - let query_tijdstip = chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0); - for co in feature.city_objects.values() { - if let Some(attrs) = &co.attributes { - if let Some(val) = attrs.get("tijdstipregistratie") { - let val_tijdstip = - chrono::DateTime::parse_from_rfc3339(val.as_str().unwrap()) - .map_err(|e| eprintln!("Failed to parse datetime: {}", e)) - .map(|dt| dt.naive_utc()) - .unwrap_or_else(|_| { - chrono::NaiveDateTime::from_timestamp_opt(0, 0).unwrap() - }); - if val_tijdstip > query_tijdstip { - valid_tijdstip = true; + // Test case: Expect zero features where tijdstipregistratie is after 2008-01-01. + QueryTestCase { + test_name: "test_attr_index_multiple_queries: tijdstipregistratie > 2008-01-01", + query: vec![( + "tijdstipregistratie".to_string(), + Operator::Gt, + ByteSerializableValue::NaiveDateTime( + chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0), + ), + )], + expected_count: 3, + validator: |feature: &CityJSONFeature| { + let mut valid_tijdstip = false; + let query_tijdstip = chrono::NaiveDate::from_ymd(2008, 1, 1).and_hms(0, 0, 0); + for co in feature.city_objects.values() { + if let Some(attrs) = &co.attributes { + if let Some(val) = attrs.get("tijdstipregistratie") { + let val_tijdstip = + chrono::DateTime::parse_from_rfc3339(val.as_str().unwrap()) + .map_err(|e| eprintln!("Failed to parse datetime: {}", e)) + .map(|dt| dt.naive_utc()) + .unwrap_or_else(|_| { + chrono::NaiveDateTime::from_timestamp_opt(0, 0).unwrap() + }); + if val_tijdstip > query_tijdstip { + valid_tijdstip = true; + } } } } - } - valid_tijdstip + valid_tijdstip + }, }, - }, - ]; + ]; - // --- Run Test Cases --- - for test_case in test_cases.into_iter() { - let features = run_query_test(&fcb_data, &test_case.query)?; - assert_eq!( - features.len(), - test_case.expected_count, - "Unexpected feature count for query: {:?}", - test_case.query - ); - for feature in features { - assert!( - (test_case.validator)(&feature), - "Validator failed for feature: {:?}", - feature + // --- Run Test Cases --- + for test_case in test_cases.into_iter() { + let features = run_query_test(&fcb_data, &test_case.query)?; + println!("running test: {}", test_case.test_name); + assert_eq!( + features.len(), + test_case.expected_count, + "Unexpected feature count for query: {:?}", + test_case.query ); + for feature in features { + assert!( + (test_case.validator)(&feature), + "Validator failed for feature: {:?}", + feature + ); + } } + Ok(()) } - Ok(()) } diff --git a/src/rust/fcb_core/tests/e2e.rs b/src/rust/fcb_core/tests/e2e.rs index 6937516..d905bda 100644 --- a/src/rust/fcb_core/tests/e2e.rs +++ b/src/rust/fcb_core/tests/e2e.rs @@ -52,6 +52,7 @@ fn test_cityjson_serialization_cycle() -> Result<()> { feature_count: original_cj_seq.features.len() as u64, index_node_size: 16, attribute_indices: None, + geographical_extent: None, }), Some(attr_schema), )?; diff --git a/src/rust/fcb_core/tests/http.rs b/src/rust/fcb_core/tests/http.rs index 71447f3..fbe5a75 100644 --- a/src/rust/fcb_core/tests/http.rs +++ b/src/rust/fcb_core/tests/http.rs @@ -6,10 +6,11 @@ use fcb_core::{deserializer::to_cj_metadata, HttpFcbReader}; async fn read_http_file_bbox(path: &str) -> Result<(), Box> { let http_reader = 
HttpFcbReader::open(path).await?; - let minx = 84227.77; - let miny = 445377.33; - let maxx = 85323.23; - let maxy = 446334.69; + let minx = 68989.19384501831; + let miny = 444614.3991728433; + let maxx = 70685.16687543111; + let maxy = 446023.6031208569; + let mut iter = http_reader.select_bbox(minx, miny, maxx, maxy).await?; let header = iter.header(); let cj = to_cj_metadata(&header)?; @@ -35,6 +36,37 @@ async fn read_http_file_bbox(path: &str) -> Result<(), Box> { // TODO: add more tests Ok(()) } +// async fn read_http_file_bbox(path: &str) -> Result<(), Box> { +// let http_reader = HttpFcbReader::open(path).await?; +// let minx = 84227.77; +// let miny = 445377.33; +// let maxx = 85323.23; +// let maxy = 446334.69; +// let mut iter = http_reader.select_bbox(minx, miny, maxx, maxy).await?; +// let header = iter.header(); +// let cj = to_cj_metadata(&header)?; + +// // let mut writer = BufWriter::new(File::create("delft_http.city.jsonl")?); +// // writeln!(writer, "{}", serde_json::to_string(&cj)?)?; + +// let mut feat_num = 0; +// let feat_count = header.features_count(); +// let mut features = Vec::new(); +// while let Some(feature) = iter.next().await? { +// let cj_feature = feature.cj_feature()?; +// features.push(cj_feature); +// // writeln!(writer, "{}", serde_json::to_string(&cj_feature)?)?; + +// feat_num += 1; +// if feat_num >= feat_count { +// break; +// } +// } +// println!("cj: {:?}", cj); +// println!("features count: {:?}", features.len()); +// // TODO: add more tests +// Ok(()) +// } async fn read_http_file_attr(path: &str) -> Result<(), Box> { let http_reader = HttpFcbReader::open(path).await?; @@ -103,7 +135,8 @@ mod http { #[tokio::test] async fn test_read_http_file() -> Result<()> { let res = - read_http_file_bbox("https://storage.googleapis.com/flatcitybuf/delft_attr.fcb").await; + read_http_file_bbox("https://storage.googleapis.com/flatcitybuf/3dbag_100k.fcb").await; + // read_http_file_bbox("https://storage.googleapis.com/flatcitybuf/delft_attr.fcb").await; assert!(res.is_ok()); Ok(()) diff --git a/src/rust/fcb_core/tests/read.rs b/src/rust/fcb_core/tests/read.rs index 9862e48..5e5380a 100644 --- a/src/rust/fcb_core/tests/read.rs +++ b/src/rust/fcb_core/tests/read.rs @@ -39,6 +39,7 @@ fn read_bbox() -> Result<()> { feature_count: original_cj_seq.features.len() as u64, index_node_size: 16, attribute_indices: Some(attr_indices), + geographical_extent: None, }), Some(attr_schema), )?; @@ -143,6 +144,7 @@ fn read_bbox_nonseekable() -> anyhow::Result<()> { feature_count: original_cj_seq.features.len() as u64, index_node_size: 16, attribute_indices: Some(attr_indices), + geographical_extent: None, }), Some(attr_schema), )?; diff --git a/src/rust/wasm/src/lib.rs b/src/rust/wasm/src/lib.rs index 08839be..c6dc1d5 100644 --- a/src/rust/wasm/src/lib.rs +++ b/src/rust/wasm/src/lib.rs @@ -69,11 +69,11 @@ impl HttpFcbReader { println!("open===: {:?}", url); // Only initialize the logger once - if !LOGGER_INITIALIZED.load(Ordering::SeqCst) { - if let Ok(_) = console_log::init_with_level(Level::Trace) { - LOGGER_INITIALIZED.store(true, Ordering::SeqCst); - log::info!("Logger initialized successfully."); - } + if !LOGGER_INITIALIZED.load(Ordering::SeqCst) + && console_log::init_with_level(Level::Trace).is_ok() + { + LOGGER_INITIALIZED.store(true, Ordering::SeqCst); + log::info!("Logger initialized successfully."); } trace!("starting: opening http reader, reading header"); @@ -335,14 +335,18 @@ impl HttpFcbReader { let query = build_query(&query.inner); - let result = 
bst::stream_query(&multi_index, query, feature_begin) - .await - .map_err(|e| JsValue::from_str(&e.to_string()))?; + // let result = bst::stream_query(&multi_index, query, feature_begin) + // .await + // .map_err(|e| JsValue::from_str(&e.to_string()))?; + // TODO: remove this + let result: Vec = vec![]; let count = result.len(); let combine_request_threshold = 256 * 1024; - let http_ranges: Vec = result.into_iter().map(|item| item.range).collect(); + // let http_ranges: Vec = result.into_iter().map(|item| item.range).collect(); + // TODO: remove this + let http_ranges: Vec = vec![]; trace!( "completed: select_attr_query via http reader, matched features: {}", diff --git a/src/ts/fcb_wasm_bg.wasm b/src/ts/fcb_wasm_bg.wasm index 034aa6c..37744d9 100644 Binary files a/src/ts/fcb_wasm_bg.wasm and b/src/ts/fcb_wasm_bg.wasm differ
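Note on the range batching used by the HTTP readers above: `FeatureBatch::make_batches` groups nearby feature byte ranges under a 256 * 1024 `combine_request_threshold` so that adjacent features are fetched with a single range request. The following is a minimal, self-contained sketch of that idea; the standalone function and its signature are illustrative assumptions, not the crate's API.

```rust
use std::ops::Range;

/// Merge sorted byte ranges whose gap is below `threshold`, so that nearby
/// features can be fetched with one HTTP range request instead of many.
fn batch_ranges(ranges: &[Range<usize>], threshold: usize) -> Vec<Range<usize>> {
    let mut batches: Vec<Range<usize>> = Vec::new();
    for r in ranges {
        match batches.last_mut() {
            // The gap to the previous batch is small enough: grow that batch
            // instead of starting a new request.
            Some(last) if r.start.saturating_sub(last.end) <= threshold => {
                last.end = last.end.max(r.end);
            }
            _ => batches.push(r.clone()),
        }
    }
    batches
}

fn main() {
    let ranges = vec![0..100, 150..300, 600_000..600_500];
    // 256 KiB threshold, matching the combine_request_threshold in the reader.
    let batches = batch_ranges(&ranges, 256 * 1024);
    assert_eq!(batches, vec![0..300, 600_000..600_500]);
}
```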
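Both attribute-query paths build a map from indexed field name to the byte offset of that field's index by accumulating the `length` recorded in the header's attribute-index entries, starting where the R-tree section ends. A hedged sketch of that bookkeeping, using simplified stand-in types rather than the generated FlatBuffers ones:

```rust
use std::collections::HashMap;

/// Illustrative stand-in for a header attribute-index entry: each indexed
/// field records the byte length of its serialized index.
struct AttrIndexEntry {
    field: String,
    length: u64,
}

/// Per-field offsets are cumulative lengths, starting at the end of the
/// R-tree section.
fn index_offsets(entries: &[AttrIndexEntry], attr_index_start: u64) -> HashMap<String, u64> {
    let mut offsets = HashMap::new();
    let mut cursor = attr_index_start;
    for entry in entries {
        offsets.insert(entry.field.clone(), cursor);
        cursor += entry.length;
    }
    offsets
}

fn main() {
    // Made-up section sizes: magic bytes + header + R-tree precede the indices.
    let attr_index_start = 8 + 2_048 + 512;
    let entries = vec![
        AttrIndexEntry { field: "b3_h_dak_50p".into(), length: 1_024 },
        AttrIndexEntry { field: "identificatie".into(), length: 4_096 },
    ];
    let offsets = index_offsets(&entries, attr_index_start);
    assert_eq!(offsets["b3_h_dak_50p"], attr_index_start);
    assert_eq!(offsets["identificatie"], attr_index_start + 1_024);
}
```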
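`select_attr_query_seq` cannot seek, so it drains the R-tree bytes and any attribute index not referenced by the query into `io::sink()`. The same pattern in isolation, assuming nothing beyond `std::io`:

```rust
use std::io::{self, Read};

/// Skip `n` bytes of a non-seekable reader by draining them into a sink,
/// mirroring how the sequential query path discards unneeded index sections.
fn skip_bytes<R: Read>(reader: &mut R, n: u64) -> io::Result<u64> {
    io::copy(&mut reader.take(n), &mut io::sink())
}

fn main() -> io::Result<()> {
    let mut cursor = io::Cursor::new(vec![0u8; 16]);
    let skipped = skip_bytes(&mut cursor, 10)?;
    assert_eq!(skipped, 10);
    assert_eq!(cursor.position(), 10);
    Ok(())
}
```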
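The serializer change gives the extent supplied via `HeaderWriterOptions` priority over the one found in the CityJSON metadata. Reduced to its core, the rule is a plain `Option` precedence over the `[f64; 6]` layout (min x/y/z, then max x/y/z) used by the options; a small sketch with simplified types:

```rust
/// Precedence rule: an extent passed through the writer options wins over the
/// one read from the CityJSON metadata; otherwise the metadata value is kept.
fn effective_extent(
    from_options: Option<[f64; 6]>,
    from_metadata: Option<[f64; 6]>,
) -> Option<[f64; 6]> {
    from_options.or(from_metadata)
}

fn main() {
    let opts = Some([0.0, 0.0, 0.0, 1.0, 1.0, 10.0]);
    let meta = Some([10.0, 10.0, 0.0, 20.0, 20.0, 5.0]);
    assert_eq!(effective_extent(opts, meta), opts);
    assert_eq!(effective_extent(None, meta), meta);
}
```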