Skip to content

Commit 8972063

Browse files
committed
feat: Expose checkIntegrity() method to Python bindings
Add a check_integrity() method to the Python Index class that validates the HNSW graph structure, including:
- Connection validity (no invalid neighbor IDs)
- No self-loops
- No duplicate connections
- No orphan nodes (elements with no inbound connections)

Returns a dict with: valid, connections_checked, element_count, min_inbound, max_inbound, errors[].

This enables CIDX to perform health checks on HNSW indexes.
1 parent c1b9b79 commit 8972063

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed

python_bindings/bindings.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,94 @@ class Index {
720720
size_t getCurrentCount() const {
721721
return appr_alg->cur_element_count;
722722
}
723+
724+
725+
py::dict checkIntegrity() {
    /**
     * Python-friendly integrity check that collects every problem it finds
     * into a result dict instead of asserting.
     *
     * Walks the link list of every element at every level it participates
     * in and verifies that:
     *   - each neighbour id refers to an existing element,
     *   - no element links to itself,
     *   - no link list contains the same neighbour twice,
     *   - (when the index holds more than one element) every element has at
     *     least one inbound link, counted across all levels.
     *
     * Returns a dict with:
     *   - valid: bool - true when no errors were recorded
     *   - connections_checked: int - total link-list entries visited
     *   - element_count: int - number of elements inspected
     *   - min_inbound: int - minimum inbound connections per node
     *   - max_inbound: int - maximum inbound connections per node
     *   - errors: list[str] - one human-readable message per problem found
     *
     * min_inbound / max_inbound are reported as 0 (and the orphan check is
     * skipped) when the index holds at most one element. When the index is
     * not initialized the same keys are returned with zeroed counters and a
     * single "Index not initialized" error.
     */
    if (!appr_alg) {
        py::list not_initialized;
        not_initialized.append("Index not initialized");
        // Same key set as the normal return path so callers can read every
        // field without checking for its presence first.
        return py::dict(
            "valid"_a = false,
            "connections_checked"_a = 0,
            "element_count"_a = 0,
            "min_inbound"_a = 0,
            "max_inbound"_a = 0,
            "errors"_a = not_initialized
        );
    }

    // Read the element count once so the buffer size, the loop bounds and the
    // range checks below all agree with each other.
    const size_t element_count = static_cast<size_t>(appr_alg->cur_element_count);

    std::vector<std::string> errors;
    size_t connections_checked = 0;  // size_t: large indexes can exceed INT_MAX links
    std::vector<int> inbound_connections_num(element_count, 0);

    for (size_t i = 0; i < element_count; i++) {
        for (int l = 0; l <= appr_alg->element_levels_[i]; l++) {
            hnswlib::linklistsizeint *ll_cur =
                appr_alg->get_linklist_at_level(static_cast<hnswlib::tableint>(i), l);
            const int size = appr_alg->getListCount(ll_cur);
            // Neighbour ids start one slot after the list header.
            const hnswlib::tableint *data = reinterpret_cast<hnswlib::tableint *>(ll_cur + 1);
            std::unordered_set<hnswlib::tableint> seen;

            const std::string where =
                "Element " + std::to_string(i) + " at level " + std::to_string(l);

            for (int j = 0; j < size; j++) {
                const hnswlib::tableint neighbor = data[j];
                const bool in_range = neighbor < element_count;

                // Check: connection points to valid element
                if (!in_range) {
                    errors.push_back(where + " has invalid connection to " +
                                     std::to_string(neighbor));
                }
                // Check: no self-loops
                if (neighbor == i) {
                    errors.push_back(where + " has self-loop");
                }
                // Check: no duplicates (insert reports whether the id was new)
                if (!seen.insert(neighbor).second) {
                    errors.push_back(where + " has duplicate connection to " +
                                     std::to_string(neighbor));
                }
                // Only in-range ids may index the inbound counter table.
                if (in_range) {
                    inbound_connections_num[neighbor]++;
                }
                connections_checked++;
            }
        }
    }

    // Check for orphan nodes (no inbound connections). A single-element index
    // has nothing that could link to it, so the check only applies to > 1.
    int min_inbound = 0, max_inbound = 0;
    if (element_count > 1) {
        min_inbound = inbound_connections_num[0];
        max_inbound = inbound_connections_num[0];
        for (size_t i = 0; i < element_count; i++) {
            const int inbound = inbound_connections_num[i];
            if (inbound == 0) {
                errors.push_back("Element " + std::to_string(i) +
                                 " has no inbound connections (orphan)");
            }
            min_inbound = std::min(inbound, min_inbound);
            max_inbound = std::max(inbound, max_inbound);
        }
    }

    py::list error_list;
    for (const auto &err : errors) {
        error_list.append(err);
    }

    return py::dict(
        "valid"_a = errors.empty(),
        "connections_checked"_a = connections_checked,
        "element_count"_a = element_count,
        "min_inbound"_a = min_inbound,
        "max_inbound"_a = max_inbound,
        "errors"_a = error_list
    );
}
723811
};
724812

725813
template<typename dist_t, typename data_t = float>
@@ -950,6 +1038,15 @@ PYBIND11_PLUGIN(hnswlib) {
9501038
.def("resize_index", &Index<float>::resizeIndex, py::arg("new_size"))
9511039
.def("get_max_elements", &Index<float>::getMaxElements)
9521040
.def("get_current_count", &Index<float>::getCurrentCount)
1041+
.def("check_integrity", &Index<float>::checkIntegrity,
1042+
"Check index integrity and return detailed results.\n\n"
1043+
"Returns a dict with:\n"
1044+
" - valid: bool - whether integrity check passed\n"
1045+
" - connections_checked: int - total connections verified\n"
1046+
" - element_count: int - number of elements in index\n"
1047+
" - min_inbound: int - minimum inbound connections per node\n"
1048+
" - max_inbound: int - maximum inbound connections per node\n"
1049+
" - errors: list[str] - list of any errors found\n")
9531050
.def_readonly("space", &Index<float>::space_name)
9541051
.def_readonly("dim", &Index<float>::dim)
9551052
.def_readwrite("num_threads", &Index<float>::num_threads_default)

0 commit comments

Comments
 (0)