Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions lazyllm/tools/rag/doc_to_db/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,30 @@ class SchemaExtractor:
'map': dict,
}

def __init__(self, db_config: Dict[str, Any], llm: LLMBase, *, table_prefix: Optional[str] = None,
force_refresh: bool = False, extraction_mode: ExtractionMode = ExtractionMode.TEXT,
max_len: int = ONE_DOC_LENGTH_LIMIT, num_workers: int = 4):
def __init__(self, db_config: Optional[Dict[str, Any]] = None, llm: LLMBase = None, *,
table_prefix: Optional[str] = None, force_refresh: bool = False,
extraction_mode: ExtractionMode = ExtractionMode.TEXT, max_len: int = ONE_DOC_LENGTH_LIMIT,
num_workers: int = 4, sql_manager: Optional[SqlManager] = None):
if (db_config is None) == (sql_manager is None):
raise ValueError('Exactly one of db_config or sql_manager must be provided')
if not isinstance(llm, LLMBase):
raise TypeError('llm must be an instance of LLMBase')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

llm 参数现在是可选的,默认值为 None,但是这里的检查 if not isinstance(llm, LLMBase): 会在 llmNone 时引发 TypeError。这会阻止在没有 LLM 的情况下初始化 SchemaExtractor,而这似乎是本次重构的一个预期用例。这个检查应该被更新以处理 None 的情况。

Suggested change
if not isinstance(llm, LLMBase):
raise TypeError('llm must be an instance of LLMBase')
if llm is not None and not isinstance(llm, LLMBase):

self._llm = llm
self._table_prefix = table_prefix or self.TABLE_PREFIX
self._sql_manager = None
self._db_config = db_config
if sql_manager is not None:
self._sql_manager = sql_manager
self._db_config = {
'db_type': sql_manager._db_type,
'user': getattr(sql_manager, '_user', None),
'password': getattr(sql_manager, '_password', None),
'host': getattr(sql_manager, '_host', None),
'port': getattr(sql_manager, '_port', None),
'db_name': getattr(sql_manager, '_db_name', None),
'options_str': getattr(sql_manager, '_options_str', None),
}
Comment on lines +81 to +89

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

直接访问 sql_manager 的私有成员(例如 _db_type, _user)破坏了封装性,并使 SchemaExtractorSqlManager 的内部实现紧密耦合。这将使得未来对 SqlManager 的修改变得困难且容易出错。更好的做法是让 SqlManager 提供一个公共方法(例如 get_config())来暴露其配置细节。

Comment on lines +81 to +89

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Accessing private attributes of SqlManager (e.g., _db_type, _user) makes SchemaExtractor tightly coupled to the internal implementation of SqlManager. This can lead to maintenance issues if SqlManager is refactored in the future.

Consider adding a public method to SqlManager, such as get_db_config(), to expose its configuration in a controlled way. This would improve encapsulation and make the code more robust and maintainable.

else:
self._db_config = db_config
self._table_cache: Dict[str, Type[_TableBase]] = {}
self._schema_registry: Dict[str, Type[BaseModel]] = {}
self._force_refresh = force_refresh
Expand Down Expand Up @@ -179,7 +194,8 @@ def _schema_table_desc(model: Type[BaseModel]) -> str:

@once_wrapper
def _lazy_init(self):
self._sql_manager = self._init_sql_manager(self._db_config) if self._db_config else None
if self._sql_manager is None:
self._sql_manager = self._init_sql_manager(self._db_config) if self._db_config else None
if self._sql_manager:
self._ensure_management_tables()

Expand Down
68 changes: 39 additions & 29 deletions lazyllm/tools/rag/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from .doc_manager import DocManager
from .doc_impl import DocImpl, StorePlaceholder, EmbedPlaceholder, BuiltinGroups, DocumentProcessor, NodeGroupType
from .doc_node import DocNode
from .doc_to_db import DocInfoSchema, DocToDbProcessor, extract_db_schema_from_files, SchemaExtractor
from .doc_to_db import SchemaExtractor
from lazyllm.tools.rag.doc_to_db.model import SchemaSetInfo, Table_ALGO_KB_SCHEMA
from .store import LAZY_ROOT_NAME, EMBED_DEFAULT_KEY
from .store.store_base import DEFAULT_KB_ID
from .index_base import IndexBase
Expand Down Expand Up @@ -175,7 +176,6 @@ def __init__(self, dataset_path: Optional[str] = None, embed: Optional[Union[Cal
display_name=display_name, description=description,
schema_extractor=self._schema_extractor)
self._curr_group = name
self._doc_to_db_processor: DocToDbProcessor = None
self._graph_document: weakref.ref = None

@staticmethod
Expand Down Expand Up @@ -230,60 +230,70 @@ def url(self):
def connect_sql_manager(
self,
sql_manager: SqlManager,
schma: Optional[DocInfoSchema] = None,
schma: Optional[BaseModel] = None, #basemodel
force_refresh: bool = True,
):
def format_schema_to_dict(schema: DocInfoSchema):
if schema is None:
return None, None
desc_dict = {ele['key']: ele['desc'] for ele in schema}
type_dict = {ele['key']: ele['type'] for ele in schema}
return desc_dict, type_dict

def compare_schema(old_schema: DocInfoSchema, new_schema: DocInfoSchema):
old_desc_dict, old_type_dict = format_schema_to_dict(old_schema)
new_desc_dict, new_type_dict = format_schema_to_dict(new_schema)
return old_desc_dict == new_desc_dict and old_type_dict == new_type_dict

def compare_schema(rows: Union[List, object, None], schma: BaseModel, extractor: SchemaExtractor):
if schma is None:
return False
sid = extractor.register_schema_set(schma)

if not rows:
return False
if not isinstance(rows, (list, tuple, set)):
rows = [rows]

return any(getattr(row, 'schema_set_id', row) == sid for row in rows)

# 1. Check valid arguments
if sql_manager.check_connection().status != DBStatus.SUCCESS:
raise RuntimeError(f'Failed to connect to sql manager: {sql_manager._gen_conn_url()}')
pre_doc_table_schema = None
if self._doc_to_db_processor:
pre_doc_table_schema = self._doc_to_db_processor.doc_info_schema
assert pre_doc_table_schema or schma, 'doc_table_schma must be given'

schema_equal = compare_schema(pre_doc_table_schema, schma)

rows: List = []
if self._schema_extractor:
mgr = self._schema_extractor.sql_manager
Bind = mgr.get_table_orm_class(Table_ALGO_KB_SCHEMA['name'])
with mgr.get_session() as s:
rows = s.query(Bind).filter_by(algo_id=self._impl._algo_name).all()
# algoid + kbid (self._impl.algo_name)

assert rows or schma, 'doc_table_schma must be given'

extractor = self._schema_extractor or SchemaExtractor(sql_manager)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This line introduces a critical issue with two failure modes:

  1. If self._schema_extractor is None, the code attempts to instantiate SchemaExtractor(sql_manager). This will fail with a TypeError because the SchemaExtractor constructor requires db_config (a dictionary) and llm as its first two positional arguments. The current call incorrectly passes an SqlManager object for db_config and omits the required llm.

  2. If self._schema_extractor was initialized with an LLMBase instance, extractor here would be that LLMBase instance. The subsequent call to compare_schema would then fail with an AttributeError because it expects a SchemaExtractor instance and calls methods like register_schema_set.

To fix this, the function must ensure it has a valid SchemaExtractor instance. This might involve ensuring self._impl._create_schema_extractor() is called beforehand and then consistently using the resulting extractor instance. The state management of _schema_extractor between Document and DocImpl may also need to be reviewed for consistency.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

SchemaExtractor(sql_manager) 这个调用是错误的。在 SchemaExtractor.__init__ 中,sql_manager 是一个仅关键字参数(keyword-only argument),所以这个调用会引发 TypeError。即使它是一个位置参数,它也会被错误地赋给 db_config,从而在后续导致错误。这个调用应该使用关键字参数,例如 SchemaExtractor(sql_manager=sql_manager, ...)。此外,SchemaExtractor 初始化可能需要的 llm 实例没有被提供,这个问题也需要解决。

schema_equal = compare_schema(rows, schma, extractor)
assert (
schema_equal or force_refresh is True
), 'When changing doc_table_schema, force_refresh should be set to True'

# 2. Init handler if needed
need_init_processor = False
if self._doc_to_db_processor is None:
if self._schema_extractor is None:
need_init_processor = True
else:
# avoid reinit for the same db
if sql_manager != self._doc_to_db_processor.sql_manager:
if sql_manager != self._schema_extractor.sql_manager:
need_init_processor = True
if need_init_processor:
self._doc_to_db_processor = DocToDbProcessor(sql_manager)
# reuse the extractor instance used for schema comparison/registration
self._schema_extractor = extractor

# 3. Reset doc_table_schema if needed
if schma and not schema_equal:
# This api call will clear existing db table 'lazyllm_doc_elements'
self._doc_to_db_processor._reset_doc_info_schema(schma)
schema_set_id = self._schema_extractor.register_schema_set(schma)
self._schema_extractor.register_schema_set_to_kb(
algo_id = self._impl._algo_name, schema_set_id= schema_set_id, force_refresh=True)

def get_sql_manager(self):
if self._doc_to_db_processor is None:
if self._schema_extractor is None:
raise ValueError('Please call connect_sql_manager to init handler first')
return self._doc_to_db_processor.sql_manager
return self._schema_extractor.sql_manager

def extract_db_schema(
self, llm: Union[OnlineChatModule, TrainableModule], print_schema: bool = False
) -> DocInfoSchema:
file_paths = self._list_all_files_in_dataset()
schema = extract_db_schema_from_files(file_paths, llm)
) -> SchemaSetInfo:
Comment on lines 305 to +307

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

重构后,extract_db_schema 方法中的 llm 参数不再被使用。该方法现在调用 _analyze_schema_by_llm,它依赖于在 Document 初始化时在 SchemaExtractor 中配置的 llm 实例。这对 API 用户来说是有误导性的,并且相比之前直接使用传入 llm 的行为是一种功能退步。请从方法签名中移除 llm 参数,或者更新实现来使用它。

schema = self._forward('_analyze_schema_by_llm')
if print_schema:
lazyllm.LOG.info(f'Extracted Schema:\n\t{schema}\n')
return schema
Expand Down
56 changes: 0 additions & 56 deletions tests/charge_tests/Tools/test_doc_to_db.py

This file was deleted.

Loading
Loading