Skip to content

Commit dd8d5f7

Browse files
authored
Merge pull request #1 from amikos-tech/codex/extract-index-properties
Emit core HNSW index properties in extract summary
2 parents 9c76757 + 98159e8 commit dd8d5f7

5 files changed

Lines changed: 123 additions & 7 deletions

File tree

README.md

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,15 @@ Optional metadata join from `index_metadata.pickle`:
2020
- `user_id`
2121
- `seq_id`
2222

23+
Extract summary also includes `index_properties` sourced from `header.bin`:
24+
25+
- `m`
26+
- `ef_construction`
27+
- `cur_element_count`
28+
- `max_elements`
29+
- `persisted_version`
30+
- `word_size_bytes`
31+
2332
Output format:
2433

2534
- `parquet`
@@ -101,7 +110,15 @@ Parquet/Arrow exports.
101110
"scanned": 10000,
102111
"emitted": 8000,
103112
"deleted_skipped": 2000,
104-
"dimension": 384
113+
"dimension": 384,
114+
"index_properties": {
115+
"m": 16,
116+
"ef_construction": 200,
117+
"cur_element_count": 10000,
118+
"max_elements": 12000,
119+
"persisted_version": 1,
120+
"word_size_bytes": 8
121+
}
105122
}
106123
}
107124
```

hnswtoolbox.go

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -44,11 +44,21 @@ type ExtractRequest struct {
4444
BatchSize int
4545
}
4646

47+
type ExtractIndexProperties struct {
48+
M uint64 `json:"m"`
49+
EfConstruction uint64 `json:"ef_construction"`
50+
CurElementCount uint64 `json:"cur_element_count"`
51+
MaxElements uint64 `json:"max_elements"`
52+
PersistedVersion int32 `json:"persisted_version"`
53+
WordSizeBytes uint32 `json:"word_size_bytes"`
54+
}
55+
4756
type ExtractSummary struct {
48-
Scanned uint64 `json:"scanned"`
49-
Emitted uint64 `json:"emitted"`
50-
DeletedSkipped uint64 `json:"deleted_skipped"`
51-
Dimension int `json:"dimension"`
57+
Scanned uint64 `json:"scanned"`
58+
Emitted uint64 `json:"emitted"`
59+
DeletedSkipped uint64 `json:"deleted_skipped"`
60+
Dimension int `json:"dimension"`
61+
IndexProperties ExtractIndexProperties `json:"index_properties"`
5262
}
5363

5464
type ExtractResponse struct {

hnswtoolbox_test.go

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
1+
package hnswtoolbox
2+
3+
import (
4+
"encoding/json"
5+
"testing"
6+
)
7+
8+
func TestExtractResponseUnmarshalIncludesIndexProperties(t *testing.T) {
9+
raw := []byte(`{
10+
"output_path": "/tmp/rebuilt.parquet",
11+
"output_format": "parquet",
12+
"summary": {
13+
"scanned": 10000,
14+
"emitted": 8000,
15+
"deleted_skipped": 2000,
16+
"dimension": 384,
17+
"index_properties": {
18+
"m": 16,
19+
"ef_construction": 200,
20+
"cur_element_count": 10000,
21+
"max_elements": 12000,
22+
"persisted_version": 1,
23+
"word_size_bytes": 8
24+
}
25+
}
26+
}`)
27+
28+
var response ExtractResponse
29+
if err := json.Unmarshal(raw, &response); err != nil {
30+
t.Fatalf("unmarshal extract response: %v", err)
31+
}
32+
33+
if response.Summary.IndexProperties.M != 16 {
34+
t.Errorf("M mismatch: got %d", response.Summary.IndexProperties.M)
35+
}
36+
if response.Summary.IndexProperties.EfConstruction != 200 {
37+
t.Errorf("EfConstruction mismatch: got %d", response.Summary.IndexProperties.EfConstruction)
38+
}
39+
if response.Summary.IndexProperties.CurElementCount != 10000 {
40+
t.Errorf("CurElementCount mismatch: got %d", response.Summary.IndexProperties.CurElementCount)
41+
}
42+
if response.Summary.IndexProperties.MaxElements != 12000 {
43+
t.Errorf("MaxElements mismatch: got %d", response.Summary.IndexProperties.MaxElements)
44+
}
45+
if response.Summary.IndexProperties.PersistedVersion != 1 {
46+
t.Errorf("PersistedVersion mismatch: got %d", response.Summary.IndexProperties.PersistedVersion)
47+
}
48+
if response.Summary.IndexProperties.WordSizeBytes != 8 {
49+
t.Errorf("WordSizeBytes mismatch: got %d", response.Summary.IndexProperties.WordSizeBytes)
50+
}
51+
}

src/extractor.rs

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,12 +31,24 @@ pub struct ExtractOptions {
3131
pub metadata: Option<HashMap<u64, MetadataEntry>>,
3232
}
3333

34+
#[derive(Debug, Clone, Serialize)]
35+
#[serde(rename_all = "snake_case")]
36+
pub struct ExtractIndexProperties {
37+
pub m: u64,
38+
pub ef_construction: u64,
39+
pub cur_element_count: u64,
40+
pub max_elements: u64,
41+
pub persisted_version: i32,
42+
pub word_size_bytes: u32,
43+
}
44+
3445
#[derive(Debug, Clone, Serialize)]
3546
pub struct ExtractSummary {
3647
pub scanned: u64,
3748
pub emitted: u64,
3849
pub deleted_skipped: u64,
3950
pub dimension: usize,
51+
pub index_properties: ExtractIndexProperties,
4052
}
4153

4254
#[derive(Debug, Clone)]
@@ -131,6 +143,14 @@ where
131143
emitted: 0,
132144
deleted_skipped: 0,
133145
dimension: header.dimension(),
146+
index_properties: ExtractIndexProperties {
147+
m: header.m,
148+
ef_construction: header.ef_construction,
149+
cur_element_count: header.cur_element_count,
150+
max_elements: header.max_elements,
151+
persisted_version: header.persisted_version,
152+
word_size_bytes: header.word_size.bytes() as u32,
153+
},
134154
};
135155

136156
let delete_marker_offset = header.delete_marker_offset();
@@ -506,6 +526,12 @@ mod tests {
506526
assert_eq!(summary.emitted, 2);
507527
assert_eq!(summary.deleted_skipped, 1);
508528
assert_eq!(summary.dimension, 3);
529+
assert_eq!(summary.index_properties.m, 4);
530+
assert_eq!(summary.index_properties.ef_construction, 100);
531+
assert_eq!(summary.index_properties.cur_element_count, 3);
532+
assert_eq!(summary.index_properties.max_elements, 3);
533+
assert_eq!(summary.index_properties.persisted_version, 1);
534+
assert_eq!(summary.index_properties.word_size_bytes, 8);
509535
assert_eq!(out.len(), 2);
510536
assert_eq!(out[0].label, 10);
511537
assert_eq!(out[1].label, 30);
@@ -683,6 +709,18 @@ mod tests {
683709
summary_without_deleted.deleted_skipped,
684710
fixture_records.len() as u64 - expected_non_deleted
685711
);
712+
prop_assert_eq!(summary_without_deleted.index_properties.m, 4);
713+
prop_assert_eq!(summary_without_deleted.index_properties.ef_construction, 100);
714+
prop_assert_eq!(
715+
summary_without_deleted.index_properties.cur_element_count,
716+
fixture_records.len() as u64
717+
);
718+
prop_assert_eq!(
719+
summary_without_deleted.index_properties.max_elements,
720+
capacity as u64
721+
);
722+
prop_assert_eq!(summary_without_deleted.index_properties.persisted_version, 1);
723+
prop_assert_eq!(summary_without_deleted.index_properties.word_size_bytes, 8);
686724

687725
for exported in &exported_without_deleted {
688726
let source = &fixture_records[exported.internal_id as usize];

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@ pub mod metadata;
77

88
pub use error::ExtractError;
99
pub use extractor::{
10-
extract_index, extract_index_to_columnar, ExtractOptions, ExtractSummary, ExtractedRecord,
11-
OutputFormat,
10+
extract_index, extract_index_to_columnar, ExtractIndexProperties, ExtractOptions,
11+
ExtractSummary, ExtractedRecord, OutputFormat,
1212
};
1313
pub use header::{HeaderWordSize, PersistentHeader, HNSW_PERSISTENCE_VERSION};
1414
pub use importer::{

0 commit comments

Comments
 (0)