Skip to content
Open
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
e011555
workflow fix
CompromisedKiwi Jan 20, 2026
bfb16fa
coarce migration
CompromisedKiwi Jan 21, 2026
54e43c1
underline
CompromisedKiwi Jan 21, 2026
894bb73
c++17
CompromisedKiwi Jan 22, 2026
b0a1ca0
rename
CompromisedKiwi Jan 22, 2026
8e7e017
save
CompromisedKiwi Jan 23, 2026
fa1d7d2
Merge branch 'main' into yzh/migrate_doc_node
CompromisedKiwi Jan 28, 2026
7ea108e
undo workflow fix
CompromisedKiwi Jan 28, 2026
f0f6657
refactor
CompromisedKiwi Jan 30, 2026
1854448
adaptor
CompromisedKiwi Jan 30, 2026
1484fc7
finish doc_node init
CompromisedKiwi Feb 2, 2026
a69f82f
children
CompromisedKiwi Feb 3, 2026
a6cfceb
doc_node hpp
CompromisedKiwi Feb 4, 2026
0170a0e
DocNode done
CompromisedKiwi Feb 4, 2026
459cfd4
pending review
CompromisedKiwi Feb 5, 2026
5ea167c
NodeTransform done
CompromisedKiwi Feb 6, 2026
e4070f8
rename
CompromisedKiwi Feb 6, 2026
6017ffa
save
CompromisedKiwi Feb 7, 2026
cc7ab7e
Merge branch 'main' into yzh/migrate_doc_node
CompromisedKiwi Feb 7, 2026
615b7b0
Module
CompromisedKiwi Feb 7, 2026
0b193c8
map_params
CompromisedKiwi Feb 10, 2026
0d88ea6
save
CompromisedKiwi Feb 10, 2026
02cbec4
Integrate utf8proc to split text to readable chars.
CompromisedKiwi Feb 10, 2026
af7e617
UnicodeProcessor
CompromisedKiwi Feb 12, 2026
1c7ee82
text splitter base cpp finish
CompromisedKiwi Feb 13, 2026
9ef9bd8
keys
CompromisedKiwi Feb 13, 2026
068ca98
export
CompromisedKiwi Feb 13, 2026
19e00dd
sentence_splitter
CompromisedKiwi Feb 13, 2026
e0c3acc
compile_options
CompromisedKiwi Feb 24, 2026
06aa586
tests in cpp side
CompromisedKiwi Feb 24, 2026
a214e35
libstdc++.so.6
CompromisedKiwi Feb 27, 2026
e865ab6
DocNode manage itself.
CompromisedKiwi Feb 27, 2026
2fd8583
finish cpp side tests
CompromisedKiwi Mar 2, 2026
ac9dad3
cpp env switch
CompromisedKiwi Mar 4, 2026
4ab5a93
no need to test cpp override
CompromisedKiwi Mar 4, 2026
b38affc
cpp tests passed.
CompromisedKiwi Mar 5, 2026
79218fb
merge
CompromisedKiwi Mar 5, 2026
ee3ecbc
install and third parties so.
CompromisedKiwi Mar 5, 2026
42252a7
Reuse python side tests.
CompromisedKiwi Mar 6, 2026
06eabd4
LD_PRELOAD
CompromisedKiwi Mar 11, 2026
fa73e50
feat: add cpp_class decorator for C++ class replacement
CompromisedKiwi Mar 12, 2026
08f3333
docnode cpp ext repaired
CompromisedKiwi Mar 12, 2026
2c893df
save
CompromisedKiwi Mar 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ jobs:
cpp_ext_test:
name: C++ Extension Test (${{ matrix.os }})
needs: [ clone ]
if: always()
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/publish_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,6 @@ jobs:
name: repo-with-docs
path: ./repo_artifact

- name: Install Python dev headers (Ubuntu only)
if: startsWith(matrix.os, 'ubuntu')
run: |
sudo apt-get update
sudo apt-get install -y python3-dev

- name: Extract repo-with-docs
run: |
set -ex
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ test/
dist/
tmp/
build
.cache/
*.lock
*.db
mkdocs.yml
Expand Down Expand Up @@ -64,3 +65,4 @@ docs/zh/assets
build*
lazyllm_cpp.egg-info/
!build*.sh
lazyllm/cpp_lib/
48 changes: 40 additions & 8 deletions csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,48 @@
cmake_minimum_required(VERSION 3.16)
project(LazyLLMCPP LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
find_package(pybind11 CONFIG REQUIRED)
# Third party libs
include(cmake/third_party.cmake)

# Config lazyllm_core lib with pure cpp code.
file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/core/src/*.cpp")
add_library(lazyllm_core STATIC ${LAZYLLM_CORE_SOURCES})
target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/core/include)
target_link_libraries(lazyllm_core PUBLIC xxhash)
target_link_libraries(lazyllm_core PUBLIC tiktoken)
target_link_libraries(lazyllm_core PUBLIC utf8proc)
target_compile_options(lazyllm_core PRIVATE -Werror -Wshadow)

# Config lazyllm_adaptor lib which maintains callback invocations.
file(GLOB_RECURSE LAZYLLM_ADAPTOR_SOURCES CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/adaptor/*.cpp")
add_library(lazyllm_adaptor STATIC ${LAZYLLM_ADAPTOR_SOURCES})
target_include_directories(lazyllm_adaptor PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/adaptor)
target_link_libraries(lazyllm_adaptor PUBLIC pybind11::headers Python3::Python lazyllm_core)
target_compile_options(lazyllm_adaptor PRIVATE -Werror -Wshadow)

# Config lazyllm_cpp lib with binding information.
set(LAZYLLM_BINDING_SOURCES binding/lazyllm.cpp binding/doc.cpp)
file(GLOB_RECURSE LAZYLLM_BINDING_SOURCES CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/binding/*.cpp")
set(INTERFACE_TARGET_NAME lazyllm_cpp)
pybind11_add_module(${INTERFACE_TARGET_NAME} ${LAZYLLM_BINDING_SOURCES})
target_link_libraries(${INTERFACE_TARGET_NAME} PRIVATE lazyllm_core)
target_link_libraries(${INTERFACE_TARGET_NAME} PRIVATE lazyllm_core lazyllm_adaptor)
target_compile_options(${INTERFACE_TARGET_NAME} PRIVATE -Werror -Wshadow)

# Ensure lazyllm_cpp can find third-party shared libraries under lazyllm/cpp_lib.
set(_lazyllm_cpp_rpath "$ORIGIN/cpp_lib")
if (APPLE)
set(_lazyllm_cpp_rpath "@loader_path/cpp_lib")
endif()
set_target_properties(${INTERFACE_TARGET_NAME} PROPERTIES
BUILD_RPATH "${_lazyllm_cpp_rpath}"
INSTALL_RPATH "${_lazyllm_cpp_rpath}"
)

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
# SHOW_SYMBOL
Expand All @@ -26,7 +51,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
endif()

# Install
install(TARGETS ${INTERFACE_TARGET_NAME} LIBRARY DESTINATION lazyllm)
install(TARGETS ${INTERFACE_TARGET_NAME}
LIBRARY DESTINATION lazyllm COMPONENT lazyllm_cpp
RUNTIME DESTINATION lazyllm COMPONENT lazyllm_cpp
)
install(TARGETS tiktoken utf8proc
LIBRARY DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp
RUNTIME DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp
)


# TESTS
Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions csrc/adaptor/adaptor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#include "adaptor_base_wrapper.hpp"
#include "document_store.hpp"
37 changes: 37 additions & 0 deletions csrc/adaptor/adaptor_base_wrapper.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma once

#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include <pybind11/pybind11.h>

#include "adaptor_base.hpp"


namespace lazyllm {

/// Adaptor that forwards named calls to attributes of a wrapped Python object.
///
/// `call` resolves the attribute on the stored Python object while holding the
/// GIL and passes it to `call_impl`, which subclasses implement to translate the
/// std::any arguments and the result.
class LAZYLLM_HIDDEN AdaptorBaseWrapper : public AdaptorBase {
    // Wrapped Python object; attributes are looked up on it for every call.
    pybind11::object _py_obj;

public:
    AdaptorBaseWrapper(const pybind11::object &obj) : _py_obj(obj) {}
    virtual ~AdaptorBaseWrapper() = default;

    /// Look up `func_name` on the wrapped object and dispatch to `call_impl`.
    ///
    /// The lookup uses Python `None` as the getattr default, so a missing
    /// attribute is handed to `call_impl` as `None` instead of failing here.
    std::any call(
        const std::string& func_name,
        const std::unordered_map<std::string, std::any>& args) const override final
    {
        // Hold the GIL for the attribute lookup and for everything call_impl does.
        pybind11::gil_scoped_acquire gil_guard;
        const pybind11::object fallback = pybind11::none();
        pybind11::object callable = pybind11::getattr(_py_obj, func_name.c_str(), fallback);
        return call_impl(func_name, callable, args);
    }

    /// Subclass hook that performs the actual call.
    ///
    /// @param func_name name the caller asked for
    /// @param func      attribute of that name on the wrapped object (or `None`)
    /// @param args      named arguments wrapped in std::any
    /// @return the result wrapped in std::any
    virtual std::any call_impl(
        const std::string& func_name,
        const pybind11::object& func,
        const std::unordered_map<std::string, std::any>& args) const = 0;
};

}
119 changes: 119 additions & 0 deletions csrc/adaptor/document_store.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "adaptor_base_wrapper.hpp"
#include "doc_node.hpp"

namespace lazyllm {

/// Plain record describing a node group: its parent group name, a display
/// name, and a type tag.
struct NodeGroup {
    /// Type tag of a group; defaults to ORIGINAL in the constructor.
    enum class Type {
        ORIGINAL,
        CHUNK,
        SUMMARY,
        IMAGE_INFO,
        QUESTION_ANSWER,
        OTHER
    };

    std::string _parent;        // name of the parent group
    std::string _display_name;  // display name of this group
    Type _type;                 // type tag

    /// Build a group record; `type` defaults to Type::ORIGINAL.
    NodeGroup(const std::string& parent, const std::string& display_name, const Type& type = Type::ORIGINAL)
        : _parent(parent),
          _display_name(display_name),
          _type(type)
    {}
};

class LAZYLLM_HIDDEN DocumentStore : public AdaptorBaseWrapper {
public:
// A DocumentStore always wraps a Python store object, so it cannot be
// default-constructed.
DocumentStore() = delete;

// Wrap the Python `store` object and keep a copy of the node-group table `map`
// (group name -> NodeGroup).
explicit DocumentStore(const pybind11::object& store,
                       const std::unordered_map<std::string, NodeGroup> &map)
    : AdaptorBaseWrapper(store),
      _node_groups_map(map)
{}

// Factory backed by a per-Python-object cache: asking again for the same store
// object returns the adaptor that is still alive instead of building a new one.
// Returns nullptr when `store` is Python None.
// Note: on a cache hit the existing adaptor is returned as-is and `map` is not used.
static std::shared_ptr<DocumentStore> from_store(
    const pybind11::object& store, const std::unordered_map<std::string, NodeGroup>& map) {
    if (store.is_none()) {
        return nullptr;
    }

    // The cache is only read and written while the GIL is held.
    pybind11::gil_scoped_acquire gil_guard;
    auto &registry = store_cache();
    PyObject *const identity = store.ptr();

    const auto hit = registry.find(identity);
    if (hit != registry.end()) {
        // Entries are weak; an expired one falls through and is replaced below.
        std::shared_ptr<DocumentStore> alive = hit->second.lock();
        if (alive) return alive;
    }

    // Miss or expired entry: build a fresh adaptor and (re)register it.
    std::shared_ptr<DocumentStore> fresh = std::make_shared<DocumentStore>(store, map);
    registry[identity] = fresh;
    return fresh;
}

// Collect the direct children of `node`, keyed by child group name.
//
// For every registered group whose `_parent` equals the node's group and which
// "is_group_active" reports as active, the nodes of that group are fetched via
// "get_nodes" for the node's KB_ID / DOC_ID (read from its global metadata), and
// only those whose get_parent_node() is `node` are kept. An active child group
// with no matching nodes still gets an (empty) entry.
//
// std::any_cast throws std::bad_any_cast if a metadata value is not a
// std::string or if a call() result does not hold the expected type.
// NOTE(review): "doc_ids" is passed as std::vector<std::string>; confirm the
// "get_nodes" handler reads the same key and type.
DocNode::Children get_node_children(const DocNode* node) const {
    std::string& kb_id = std::any_cast<std::string&>(
        node->_p_global_metadata->at(std::string(RAGMetadataKeys::KB_ID)));
    std::string& doc_id = std::any_cast<std::string&>(
        node->_p_global_metadata->at(std::string(RAGMetadataKeys::DOC_ID)));
    const auto& parent_group = node->_group_name;

    DocNode::Children result;
    for (const auto& entry : _node_groups_map) {
        const std::string& child_group = entry.first;
        const NodeGroup& info = entry.second;

        // Only groups derived directly from the node's own group are considered.
        if (info._parent != parent_group) continue;

        const bool active = std::any_cast<bool>(call("is_group_active", {{"group", child_group}}));
        if (!active) continue;

        const auto candidates = std::any_cast<std::vector<PDocNode>>(call("get_nodes", {
            {"group_name", child_group},
            {"kb_id", kb_id},
            {"doc_ids", std::vector<std::string>({doc_id})}
        }));

        // Keep only the nodes that point back at `node` as their parent.
        std::vector<PDocNode> direct_children;
        direct_children.reserve(candidates.size());
        for (const auto& candidate : candidates) {
            if (candidate->get_parent_node() == node) {
                direct_children.push_back(candidate);
            }
        }
        result[child_group] = direct_children;
    }
    return result;
}

private:
std::unordered_map<std::string, NodeGroup> _node_groups_map;

// Translate a named adaptor call into a call on the Python store.
//
// `func_name` selects the operation, `func` is the Python callable supplied by
// the caller for that name, and `args` carries the std::any-wrapped arguments:
//   "is_group_active"   : "group" (std::string)                      -> bool
//   "get_node"          : "group_name", "uid", "kb_id" (std::string) -> DocNode*
//   "get_nodes"         : "group_name", "kb_id" (std::string) and either
//                         "doc_ids" (std::vector<std::string>) or the legacy
//                         "doc_id" (std::string)                     -> std::vector<DocNode*>
//   "get_node_children" : "node" (DocNode*)                          -> DocNode::Children
//
// Throws std::out_of_range for a missing argument key, std::bad_any_cast for an
// argument of the wrong type, and std::runtime_error for an unknown `func_name`
// or when "get_node" finds nothing.
std::any call_impl(
    const std::string& func_name,
    const pybind11::object& func,
    const std::unordered_map<std::string, std::any>& args) const override
{
    if (func_name == "is_group_active") {
        // Unwrap the std::any explicitly, as the other branches do; handing the
        // std::any itself to the Python call would rely on a std::any type caster
        // that is not visible in this header.
        return func(std::any_cast<std::string>(args.at("group"))).cast<bool>();
    }

    if (func_name == "get_node") {
        const std::string& uid = std::any_cast<const std::string&>(args.at("uid"));
        pybind11::list found = func(
            pybind11::arg("group_name") = std::any_cast<std::string>(args.at("group_name")),
            pybind11::arg("uids") = std::vector<std::string>({uid}),
            pybind11::arg("kb_id") = std::any_cast<std::string>(args.at("kb_id")),
            pybind11::arg("display") = true
        ).cast<pybind11::list>();
        // The Python side may return an empty list; report that explicitly
        // instead of failing on the [0] access.
        if (found.empty()) {
            throw std::runtime_error(
                "DocumentStore's get_node returned an empty list for uid: " + uid);
        }
        return found[0].cast<DocNode*>();
    }

    if (func_name == "get_nodes") {
        // Callers pass "doc_ids" as a vector of ids. The previous code read
        // "doc_id" as a single string, which made args.at() throw for such callers;
        // the single-string key is still accepted for backward compatibility.
        std::vector<std::string> doc_ids;
        const auto plural = args.find("doc_ids");
        if (plural != args.end()) {
            doc_ids = std::any_cast<const std::vector<std::string>&>(plural->second);
        } else {
            doc_ids.push_back(std::any_cast<std::string>(args.at("doc_id")));
        }
        // NOTE(review): get_node_children any_casts this result to
        // std::vector<PDocNode>; that only succeeds if PDocNode is DocNode* — confirm.
        return func(
            pybind11::arg("group_name") = std::any_cast<std::string>(args.at("group_name")),
            pybind11::arg("kb_id") = std::any_cast<std::string>(args.at("kb_id")),
            pybind11::arg("doc_ids") = doc_ids
        ).cast<std::vector<DocNode*>>();
    }

    if (func_name == "get_node_children") {
        // Implemented on the C++ side; `func` is not used for this operation.
        return get_node_children(std::any_cast<DocNode*>(args.at("node")));
    }

    throw std::runtime_error("Unknown DocumentStore function: " + func_name);
}

// Cache by Python object identity to ensure one wrapper per store instance.
static std::unordered_map<PyObject *, std::weak_ptr<DocumentStore>> &store_cache() {
static std::unordered_map<PyObject *, std::weak_ptr<DocumentStore>> cache;
return cache;
}
Comment on lines +113 to +116

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

store_cache() 返回一个静态的 unordered_map,但在多线程环境下对它的访问(查找和插入)没有加锁,这会引发竞态条件。from_store 函数可能被多个线程同时调用,需要使用互斥锁来保护对 cache 的访问。

};

} // namespace lazyllm
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ void addDocStr(py::object obj, std::string docs) {
}
}

void exportDoc(py::module& m) {
// Register `add_doc(obj, docs)` on module `m`, bound to addDocStr.
void exportAddDocStr(py::module& m) {
    m.def(
        "add_doc",
        &addDocStr,
        "Add docstring to a function or method",
        py::arg("obj"),
        py::arg("docs"));
}
Loading
Loading