Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ Optional metadata join from `index_metadata.pickle`:
- `user_id`
- `seq_id`

Extract summary also includes `index_properties` sourced from `header.bin`:

- `m`
- `ef_construction`
- `cur_element_count`
- `max_elements`
- `persisted_version`
- `word_size_bytes`

Output format:

- `parquet`
Expand Down Expand Up @@ -101,7 +110,15 @@ Parquet/Arrow exports.
"scanned": 10000,
"emitted": 8000,
"deleted_skipped": 2000,
"dimension": 384
"dimension": 384,
"index_properties": {
"m": 16,
"ef_construction": 200,
"cur_element_count": 10000,
"max_elements": 12000,
"persisted_version": 1,
"word_size_bytes": 8
}
}
}
```
Expand Down
18 changes: 14 additions & 4 deletions hnswtoolbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,21 @@ type ExtractRequest struct {
BatchSize int
}

// ExtractIndexProperties holds the index properties reported under the
// "index_properties" key of an extract summary. Per the README these values
// are sourced from header.bin. Field widths mirror the Rust-side struct of the
// same name in src/extractor.rs (u64, i32 and u32 respectively).
type ExtractIndexProperties struct {
// M is decoded from the "m" key.
// NOTE(review): presumably the HNSW M parameter; confirm against the header format.
M uint64 `json:"m"`
// EfConstruction is decoded from the "ef_construction" key.
EfConstruction uint64 `json:"ef_construction"`
// CurElementCount is decoded from the "cur_element_count" key.
CurElementCount uint64 `json:"cur_element_count"`
// MaxElements is decoded from the "max_elements" key.
MaxElements uint64 `json:"max_elements"`
// PersistedVersion is decoded from the "persisted_version" key; it is signed
// to match the i32 used on the Rust side.
PersistedVersion int32 `json:"persisted_version"`
// WordSizeBytes is decoded from the "word_size_bytes" key (8 in the README
// example).
WordSizeBytes uint32 `json:"word_size_bytes"`
}

// ExtractSummary is the Go view of the "summary" object in an extract
// response: record counters, the vector dimension, and the nested index
// properties.
type ExtractSummary struct {
Scanned uint64 `json:"scanned"`
Emitted uint64 `json:"emitted"`
DeletedSkipped uint64 `json:"deleted_skipped"`
Dimension int `json:"dimension"`
Scanned uint64 `json:"scanned"`
Emitted uint64 `json:"emitted"`
DeletedSkipped uint64 `json:"deleted_skipped"`
Dimension int `json:"dimension"`
// IndexProperties is decoded from the nested "index_properties" object.
IndexProperties ExtractIndexProperties `json:"index_properties"`
}

type ExtractResponse struct {
Expand Down
51 changes: 51 additions & 0 deletions hnswtoolbox_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package hnswtoolbox

import (
"encoding/json"
"testing"
)

// TestExtractResponseUnmarshalIncludesIndexProperties decodes a sample extract
// response and checks that every field of summary.index_properties lands on
// the matching ExtractIndexProperties field.
func TestExtractResponseUnmarshalIncludesIndexProperties(t *testing.T) {
	payload := []byte(`{
"output_path": "/tmp/rebuilt.parquet",
"output_format": "parquet",
"summary": {
"scanned": 10000,
"emitted": 8000,
"deleted_skipped": 2000,
"dimension": 384,
"index_properties": {
"m": 16,
"ef_construction": 200,
"cur_element_count": 10000,
"max_elements": 12000,
"persisted_version": 1,
"word_size_bytes": 8
}
}
}`)

	var decoded ExtractResponse
	err := json.Unmarshal(payload, &decoded)
	if err != nil {
		// Nothing below is meaningful if decoding failed, so stop here.
		t.Fatalf("unmarshal extract response: %v", err)
	}

	// Work on a copy of the nested struct to keep the checks short.
	props := decoded.Summary.IndexProperties

	if got := props.M; got != 16 {
		t.Errorf("M mismatch: got %d", got)
	}
	if got := props.EfConstruction; got != 200 {
		t.Errorf("EfConstruction mismatch: got %d", got)
	}
	if got := props.CurElementCount; got != 10000 {
		t.Errorf("CurElementCount mismatch: got %d", got)
	}
	if got := props.MaxElements; got != 12000 {
		t.Errorf("MaxElements mismatch: got %d", got)
	}
	if got := props.PersistedVersion; got != 1 {
		t.Errorf("PersistedVersion mismatch: got %d", got)
	}
	if got := props.WordSizeBytes; got != 8 {
		t.Errorf("WordSizeBytes mismatch: got %d", got)
	}
}
38 changes: 38 additions & 0 deletions src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,24 @@ pub struct ExtractOptions {
pub metadata: Option<HashMap<u64, MetadataEntry>>,
}

/// Index properties reported in the `index_properties` field of
/// `ExtractSummary`.
///
/// Each value is copied from the corresponding `header` field where the
/// summary is built; field names already serialize as snake_case keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ExtractIndexProperties {
/// Copied from `header.m`.
pub m: u64,
/// Copied from `header.ef_construction`.
pub ef_construction: u64,
/// Copied from `header.cur_element_count`.
pub cur_element_count: u64,
/// Copied from `header.max_elements`.
pub max_elements: u64,
/// Copied from `header.persisted_version`.
pub persisted_version: i32,
/// Result of `header.word_size.bytes()` cast to `u32`.
pub word_size_bytes: u32,
}

/// Serializable summary of an extraction: record counters, the vector
/// dimension, and header-derived index properties.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractSummary {
pub scanned: u64,
pub emitted: u64,
// NOTE(review): presumably counts records skipped because they carry the
// delete marker; confirm against the extraction loop.
pub deleted_skipped: u64,
/// Set from `header.dimension()`.
pub dimension: usize,
/// Properties copied from the header; see `ExtractIndexProperties`.
pub index_properties: ExtractIndexProperties,
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -131,6 +143,14 @@ where
emitted: 0,
deleted_skipped: 0,
dimension: header.dimension(),
index_properties: ExtractIndexProperties {
m: header.m,
ef_construction: header.ef_construction,
cur_element_count: header.cur_element_count,
max_elements: header.max_elements,
persisted_version: header.persisted_version,
word_size_bytes: header.word_size.bytes() as u32,
},
};

let delete_marker_offset = header.delete_marker_offset();
Expand Down Expand Up @@ -506,6 +526,12 @@ mod tests {
assert_eq!(summary.emitted, 2);
assert_eq!(summary.deleted_skipped, 1);
assert_eq!(summary.dimension, 3);
assert_eq!(summary.index_properties.m, 4);
assert_eq!(summary.index_properties.ef_construction, 100);
assert_eq!(summary.index_properties.cur_element_count, 3);
assert_eq!(summary.index_properties.max_elements, 3);
assert_eq!(summary.index_properties.persisted_version, 1);
assert_eq!(summary.index_properties.word_size_bytes, 8);
assert_eq!(out.len(), 2);
assert_eq!(out[0].label, 10);
assert_eq!(out[1].label, 30);
Expand Down Expand Up @@ -683,6 +709,18 @@ mod tests {
summary_without_deleted.deleted_skipped,
fixture_records.len() as u64 - expected_non_deleted
);
prop_assert_eq!(summary_without_deleted.index_properties.m, 4);
prop_assert_eq!(summary_without_deleted.index_properties.ef_construction, 100);
prop_assert_eq!(
summary_without_deleted.index_properties.cur_element_count,
fixture_records.len() as u64
);
prop_assert_eq!(
summary_without_deleted.index_properties.max_elements,
capacity as u64
);
prop_assert_eq!(summary_without_deleted.index_properties.persisted_version, 1);
prop_assert_eq!(summary_without_deleted.index_properties.word_size_bytes, 8);

for exported in &exported_without_deleted {
let source = &fixture_records[exported.internal_id as usize];
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ pub mod metadata;

pub use error::ExtractError;
pub use extractor::{
extract_index, extract_index_to_columnar, ExtractOptions, ExtractSummary, ExtractedRecord,
OutputFormat,
extract_index, extract_index_to_columnar, ExtractIndexProperties, ExtractOptions,
ExtractSummary, ExtractedRecord, OutputFormat,
};
pub use header::{HeaderWordSize, PersistentHeader, HNSW_PERSISTENCE_VERSION};
pub use importer::{
Expand Down