Skip to content

Commit 7ffd9dc

Browse files
committed
add columnar hnsw build path and bit-exact f32 export tests
1 parent 52bc02a commit 7ffd9dc

10 files changed

Lines changed: 1216 additions & 13 deletions

File tree

Cargo.lock

Lines changed: 56 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ crate-type = ["rlib", "cdylib"]
1010
arrow-array = "54.2"
1111
arrow-ipc = "54.2"
1212
arrow-schema = "54.2"
13+
fast-hnsw = "1.0.0"
1314
parquet = { version = "54.2", features = ["arrow"] }
1415
serde = { version = "1.0", features = ["derive"] }
1516
serde-pickle = "1.2"

README.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,20 @@ Output format:
2525
- `parquet`
2626
- `arrow_ipc`
2727

28+
## What It Builds
29+
30+
From a columnar export (`parquet` or `arrow_ipc`) containing at least:
31+
32+
- `vector` (a fixed-size list or a variable-length list of float32/float64 values)
33+
- optional `label` (if absent, `internal_id` is used instead; if that is also absent, the row index)
34+
- optional `deleted` (bool/int)
35+
36+
Build output:
37+
38+
- A new persisted HNSW file (`.hnsw`) produced by the Rust `fast-hnsw` format.
39+
40+
Note: this build output is not Chroma/hnswlib's native persistence layout (`header.bin`, `data_level0.bin`, ...).
41+
2842
## Build
2943

3044
```bash
@@ -49,12 +63,16 @@ Dynamic library output (platform-specific):
4963
Use `extract_index(...)` for record-by-record streaming callbacks, or
5064
`extract_index_to_columnar(...)` for file output.
5165

66+
Use `build_index_from_columnar(...)` to build a new HNSW index file from
67+
Parquet/Arrow exports.
68+
5269
## FFI API
5370

5471
### Exported Symbols
5572

5673
- `hnsw_toolbox_version() -> *const c_char`
5774
- `hnsw_toolbox_extract_index(request_json: *const c_char) -> *mut c_char`
75+
- `hnsw_toolbox_build_index(request_json: *const c_char) -> *mut c_char`
5876
- `hnsw_toolbox_get_last_error() -> *const c_char`
5977
- `hnsw_toolbox_free_string(ptr: *mut c_char)`
6078

@@ -88,6 +106,44 @@ Use `extract_index(...)` for record-by-record streaming callbacks, or
88106
}
89107
```
90108

109+
### Build Request JSON
110+
111+
```json
112+
{
113+
"input_path": "/tmp/extracted.parquet",
114+
"output_path": "/tmp/rebuilt.hnsw",
115+
"input_format": "parquet",
116+
"metric": "euclidean",
117+
"include_deleted": false,
118+
"m": 16,
119+
"m0": 32,
120+
"ef_construction": 200,
121+
"batch_size": 1024,
122+
"capacity": 100000,
123+
"seed": 42
124+
}
125+
```
126+
127+
`input_format`: `"parquet"` or `"arrow_ipc"` (defaults to `"parquet"`).
128+
`metric`: `"euclidean"`, `"squared_euclidean"`, `"cosine"`, `"dot_product"`, `"manhattan"` (defaults to `"euclidean"`).
129+
130+
### Build Response JSON
131+
132+
```json
133+
{
134+
"input_path": "/tmp/extracted.parquet",
135+
"output_path": "/tmp/rebuilt.hnsw",
136+
"input_format": "parquet",
137+
"metric": "euclidean",
138+
"summary": {
139+
"scanned": 10000,
140+
"inserted": 8000,
141+
"deleted_skipped": 2000,
142+
"dimension": 384
143+
}
144+
}
145+
```
146+
91147
## Go Usage (Purego, No CGO)
92148

93149
```go
@@ -113,6 +169,25 @@ if err != nil {
113169
_ = resp
114170
```
115171

172+
Build a new index:
173+
174+
```go
175+
buildResp, err := hnswtoolbox.BuildIndex(hnswtoolbox.BuildRequest{
176+
InputPath: "/tmp/extracted.parquet",
177+
OutputPath: "/tmp/rebuilt.hnsw",
178+
InputFormat: hnswtoolbox.InputFormatParquet,
179+
Metric: hnswtoolbox.DistanceMetricEuclidean,
180+
IncludeDeleted: false,
181+
M: 16,
182+
EfConstruction: 200,
183+
BatchSize: 1024,
184+
})
185+
if err != nil {
186+
panic(err)
187+
}
188+
_ = buildResp
189+
```
190+
116191
## Validation
117192

118193
```bash

hnswtoolbox.go

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,23 @@ const (
1818
OutputFormatArrowIPC OutputFormat = "arrow_ipc"
1919
)
2020

21+
// InputFormat names the columnar file format a build request reads from.
// BuildIndex copies it into the "input_format" field of the request JSON;
// the empty value is omitted there, and the README documents "parquet" as
// the default in that case.
type InputFormat string
22+
23+
// Supported InputFormat values, as spelled in the request JSON.
const (
24+
InputFormatParquet InputFormat = "parquet"
25+
InputFormatArrowIPC InputFormat = "arrow_ipc"
26+
)
27+
28+
// DistanceMetric names the distance function requested for the built index.
// BuildIndex copies it into the "metric" field of the request JSON; the
// empty value is omitted there, and the README documents "euclidean" as the
// default in that case.
type DistanceMetric string
29+
30+
// Supported DistanceMetric values, as spelled in the request JSON.
const (
31+
DistanceMetricEuclidean DistanceMetric = "euclidean"
32+
DistanceMetricSquaredEuclidean DistanceMetric = "squared_euclidean"
33+
DistanceMetricCosine DistanceMetric = "cosine"
34+
DistanceMetricDotProduct DistanceMetric = "dot_product"
35+
DistanceMetricManhattan DistanceMetric = "manhattan"
36+
)
37+
2138
type ExtractRequest struct {
2239
IndexDir string
2340
OutputPath string
@@ -40,6 +57,35 @@ type ExtractResponse struct {
4057
Summary ExtractSummary `json:"summary"`
4158
}
4259

60+
// BuildRequest is the caller-facing argument to BuildIndex.
//
// InputPath and OutputPath are required; BuildIndex rejects values that are
// empty or whitespace-only. Every other field is optional: BuildIndex copies
// it into buildPayload, whose omitempty tags drop zero values (empty string,
// false, 0, nil pointer) from the JSON sent to the native library.
//
// M0, Capacity and Seed are pointers, so a non-nil pointer is always sent,
// even when it points at 0.
// NOTE(review): the meaning and defaults of M, M0, EfConstruction, BatchSize,
// Capacity and Seed are decided by the native library and are not visible
// here — confirm against the Rust implementation.
type BuildRequest struct {
61+
InputPath string
62+
OutputPath string
63+
InputFormat InputFormat
64+
Metric DistanceMetric
65+
IncludeDeleted bool
66+
M int
67+
M0 *int
68+
EfConstruction int
69+
BatchSize int
70+
Capacity *int
71+
Seed *uint64
72+
}
73+
74+
// BuildSummary holds the counters decoded from the "summary" object of a
// build response.
// NOTE(review): judging by the field names and the README example,
// Scanned presumably counts input rows read, Inserted the vectors added to
// the index, DeletedSkipped the rows dropped because they were marked
// deleted, and Dimension the vector length — confirm against the native side.
type BuildSummary struct {
75+
Scanned uint64 `json:"scanned"`
76+
Inserted uint64 `json:"inserted"`
77+
DeletedSkipped uint64 `json:"deleted_skipped"`
78+
Dimension int `json:"dimension"`
79+
}
80+
81+
// BuildResponse is the decoded form of the JSON string returned by the
// native build call. BuildIndex unmarshals into it and returns a pointer to
// the result.
type BuildResponse struct {
82+
InputPath string `json:"input_path"`
83+
OutputPath string `json:"output_path"`
84+
InputFormat InputFormat `json:"input_format"`
85+
Metric DistanceMetric `json:"metric"`
86+
Summary BuildSummary `json:"summary"`
87+
}
88+
4389
type extractPayload struct {
4490
IndexDir string `json:"index_dir"`
4591
OutputPath string `json:"output_path"`
@@ -49,6 +95,20 @@ type extractPayload struct {
4995
BatchSize int `json:"batch_size,omitempty"`
5096
}
5197

98+
// buildPayload is the JSON wire form of a BuildRequest, marshalled by
// BuildIndex and handed to the native build function.
//
// input_path and output_path are always emitted. Every other field carries
// omitempty, so its zero value (empty string, false, 0, nil pointer) is left
// out of the JSON entirely.
type buildPayload struct {
99+
InputPath string `json:"input_path"`
100+
OutputPath string `json:"output_path"`
101+
InputFormat InputFormat `json:"input_format,omitempty"`
102+
Metric DistanceMetric `json:"metric,omitempty"`
103+
IncludeDeleted bool `json:"include_deleted,omitempty"`
104+
M int `json:"m,omitempty"`
105+
M0 *int `json:"m0,omitempty"`
106+
EfConstruction int `json:"ef_construction,omitempty"`
107+
BatchSize int `json:"batch_size,omitempty"`
108+
Capacity *int `json:"capacity,omitempty"`
109+
Seed *uint64 `json:"seed,omitempty"`
110+
}
111+
52112
var (
53113
stateMu sync.Mutex
54114
callMu sync.Mutex
@@ -57,6 +117,7 @@ var (
57117
loaded bool
58118

59119
fnExtractIndex func(*byte) *byte
120+
fnBuildIndex func(*byte) *byte
60121
fnLastError func() *byte
61122
fnFreeCString func(*byte)
62123
fnVersion func() *byte
@@ -82,6 +143,10 @@ func Init(libraryPath string) error {
82143
_ = purego.Dlclose(handle)
83144
return err
84145
}
146+
if err := register(handle, &fnBuildIndex, "hnsw_toolbox_build_index"); err != nil {
147+
_ = purego.Dlclose(handle)
148+
return err
149+
}
85150
if err := register(handle, &fnLastError, "hnsw_toolbox_get_last_error"); err != nil {
86151
_ = purego.Dlclose(handle)
87152
return err
@@ -189,6 +254,59 @@ func ExtractIndex(request ExtractRequest) (*ExtractResponse, error) {
189254
return &response, nil
190255
}
191256

257+
// BuildIndex asks the native library to build an HNSW index file from a
// columnar export and returns the decoded response.
//
// It returns an error when ensureLoaded fails, when InputPath or OutputPath
// is empty or whitespace-only, when the request cannot be marshalled, when
// the native call returns a nil pointer (the error text is then taken from
// fnLastError, with a fixed fallback if that text is blank), or when the
// response JSON cannot be parsed.
//
// NOTE(review): callMu is unlocked by hand on both the failure and the
// success path. A panic between Lock and Unlock would leave it held —
// consider wrapping the FFI section in a helper that uses defer.
func BuildIndex(request BuildRequest) (*BuildResponse, error) {
258+
if err := ensureLoaded(); err != nil {
259+
return nil, err
260+
}
261+
if strings.TrimSpace(request.InputPath) == "" {
262+
return nil, errors.New("InputPath is required")
263+
}
264+
if strings.TrimSpace(request.OutputPath) == "" {
265+
return nil, errors.New("OutputPath is required")
266+
}
267+
268+
// Field-for-field copy into the wire struct; its omitempty tags decide
// which optional values actually reach the JSON.
payload := buildPayload{
269+
InputPath: request.InputPath,
270+
OutputPath: request.OutputPath,
271+
InputFormat: request.InputFormat,
272+
Metric: request.Metric,
273+
IncludeDeleted: request.IncludeDeleted,
274+
M: request.M,
275+
M0: request.M0,
276+
EfConstruction: request.EfConstruction,
277+
BatchSize: request.BatchSize,
278+
Capacity: request.Capacity,
279+
Seed: request.Seed,
280+
}
281+
282+
rawPayload, err := json.Marshal(payload)
283+
if err != nil {
284+
return nil, fmt.Errorf("failed to marshal build request: %w", err)
285+
}
286+
// Append a trailing NUL byte so the buffer can be passed as a C string.
cPayload := append(rawPayload, 0)
287+
288+
// callMu is held across the native call and the reads that follow it
// (last error on failure, response string on success).
// NOTE(review): presumably so the last-error text belongs to this call — confirm.
callMu.Lock()
289+
responsePtr := fnBuildIndex(&cPayload[0])
290+
if responsePtr == nil {
291+
errorMessage := goStringFromPtr(fnLastError())
292+
callMu.Unlock()
293+
if strings.TrimSpace(errorMessage) == "" {
294+
errorMessage = "hnsw_toolbox_build_index failed without error message"
295+
}
296+
return nil, errors.New(errorMessage)
297+
}
298+
299+
// Read the response into a Go string before handing the native pointer
// back to the library's free function.
// NOTE(review): assumes goStringFromPtr copies the bytes — confirm.
responseJSON := goStringFromPtr(responsePtr)
300+
fnFreeCString(responsePtr)
301+
callMu.Unlock()
302+
303+
var response BuildResponse
304+
if err := json.Unmarshal([]byte(responseJSON), &response); err != nil {
305+
return nil, fmt.Errorf("failed to parse build response: %w", err)
306+
}
307+
return &response, nil
308+
}
309+
192310
func isLoaded() bool {
193311
stateMu.Lock()
194312
defer stateMu.Unlock()

0 commit comments

Comments
 (0)