Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 142 additions & 9 deletions backend/app/services/ontology_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Dict, Any, List, Optional
from ..utils.llm_client import LLMClient
from ..utils.locale import get_language_instruction
from ..utils.file_parser import split_text_into_chunks

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -227,6 +228,10 @@ def generate(

# Maximum length, in characters, of the document text passed to the LLM (50,000).
# _build_document_context returns shorter input unchanged and compresses longer
# input until it fits within this limit.
MAX_TEXT_LENGTH_FOR_LLM = 50000
# chunk_size handed to split_text_into_chunks when over-long input is chunked.
LONG_TEXT_CHUNK_SIZE = 8000
# overlap handed to split_text_into_chunks when over-long input is chunked.
LONG_TEXT_CHUNK_OVERLAP = 200
# Upper bound on how many chunks _select_representative_chunks keeps.
MAX_LONG_TEXT_CHUNKS = 60
# Lower bound on the per-chunk excerpt budget (in characters); the budget is
# never shrunk below this value while fitting the context into the limit.
MIN_LONG_TEXT_EXCERPT = 400

def _build_user_message(
self,
Expand All @@ -236,14 +241,7 @@ def _build_user_message(
) -> str:
"""构建用户消息"""

# 合并文本
combined_text = "\n\n---\n\n".join(document_texts)
original_length = len(combined_text)

# 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建)
if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM:
combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."
combined_text = self._build_document_context(document_texts)

message = f"""## 模拟需求

Expand Down Expand Up @@ -273,6 +271,142 @@ def _build_user_message(
"""

return message

def _build_document_context(self, document_texts: List[str]) -> str:
    """Build the document context that is sent to the LLM for ontology analysis.

    Input that fits within ``MAX_TEXT_LENGTH_FOR_LLM`` is returned as the
    documents joined by a ``---`` separator. Longer input is split into
    chunks, a spread of chunks is selected, and an excerpt of each selected
    chunk is rendered, so the context is sampled across the whole text
    instead of only its beginning.

    Args:
        document_texts: Raw text of each source document.

    Returns:
        The context string, at most ``MAX_TEXT_LENGTH_FOR_LLM`` characters
        long, or ``""`` when long input yields no chunks.
    """
    max_length = self.MAX_TEXT_LENGTH_FOR_LLM

    joined = "\n\n---\n\n".join(document_texts)
    total_length = len(joined)
    if total_length <= max_length:
        return joined

    all_chunks = self._collect_document_chunks(document_texts)
    if not all_chunks:
        return ""

    picked = self._select_representative_chunks(all_chunks)

    def render(limit: int) -> str:
        # Single place that renders the picked chunks for a given excerpt size.
        return self._render_chunked_context(
            selected_chunks=picked,
            original_length=total_length,
            total_chunks=len(all_chunks),
            excerpt_limit=limit,
        )

    floor = self.MIN_LONG_TEXT_EXCERPT
    per_chunk = self._calculate_excerpt_budget(len(picked))
    rendered = render(per_chunk)

    # Shrink the per-chunk excerpt by 15% per round until the context fits
    # or the excerpt size reaches its floor.
    while len(rendered) > max_length and per_chunk > floor:
        per_chunk = max(floor, int(per_chunk * 0.85))
        rendered = render(per_chunk)

    if len(rendered) <= max_length:
        return rendered

    # Last resort: hard-cut the rendered context and append a notice.
    suffix = "\n\n...(分块上下文已压缩到本体分析长度限制内)..."
    return rendered[:max_length - len(suffix)] + suffix

def _collect_document_chunks(self, document_texts: List[str]) -> List[Dict[str, Any]]:
    """Split every document into chunks and tag each chunk with its position.

    Args:
        document_texts: Raw text of each source document.

    Returns:
        One dict per chunk, in document order, with the keys
        ``document_index`` and ``chunk_index`` (both 1-based),
        ``total_document_chunks`` and ``text``.
    """
    collected: List[Dict[str, Any]] = []
    for doc_no, doc_text in enumerate(document_texts, start=1):
        pieces = split_text_into_chunks(
            doc_text,
            chunk_size=self.LONG_TEXT_CHUNK_SIZE,
            overlap=self.LONG_TEXT_CHUNK_OVERLAP,
        )
        piece_count = len(pieces)
        collected.extend(
            {
                "document_index": doc_no,
                "chunk_index": piece_no,
                "total_document_chunks": piece_count,
                "text": piece,
            }
            for piece_no, piece in enumerate(pieces, start=1)
        )

    return collected

def _select_representative_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Pick evenly spaced chunks so the sample spans start, middle and end.

    Args:
        chunks: All chunks in reading order.

    Returns:
        ``chunks`` itself when it holds no more than ``MAX_LONG_TEXT_CHUNKS``
        entries; otherwise a new list of evenly spaced entries that always
        includes the first and last chunk (only the first chunk when the
        limit is 1 or less).
    """
    limit = self.MAX_LONG_TEXT_CHUNKS
    if len(chunks) <= limit:
        return chunks

    if limit <= 1:
        return [chunks[0]]

    final_pos = len(chunks) - 1
    gaps = limit - 1
    # A set removes any position that two slots happen to round onto.
    positions = sorted({round(slot * final_pos / gaps) for slot in range(limit)})
    return [chunks[pos] for pos in positions]

def _calculate_excerpt_budget(self, selected_count: int) -> int:
    """Work out how many characters each selected chunk may contribute.

    600 characters are reserved for the context header and 120 per chunk
    for its title line; what is left of ``MAX_TEXT_LENGTH_FOR_LLM`` is
    divided evenly between the chunks.

    Args:
        selected_count: Number of chunks that will be rendered.

    Returns:
        The per-chunk character budget, never below
        ``MIN_LONG_TEXT_EXCERPT``.
    """
    floor = self.MIN_LONG_TEXT_EXCERPT
    overhead = 600 + 120 * selected_count

    usable = self.MAX_TEXT_LENGTH_FOR_LLM - overhead
    minimum_total = floor * selected_count
    if usable < minimum_total:
        usable = minimum_total

    # Guard the division when no chunk was selected.
    divisor = selected_count if selected_count > 1 else 1
    share = usable // divisor
    return share if share > floor else floor

def _render_chunked_context(
    self,
    selected_chunks: List[Dict[str, Any]],
    original_length: int,
    total_chunks: int,
    excerpt_limit: int,
) -> str:
    """Render the selected chunks as one prompt-ready context string.

    Args:
        selected_chunks: Chunk dicts to show, each carrying
            ``document_index``, ``chunk_index``, ``total_document_chunks``
            and ``text``.
        original_length: Character count of the full original text.
        total_chunks: Number of chunks the original text was split into.
        excerpt_limit: Character limit applied to each chunk's excerpt.

    Returns:
        Two summary lines followed by one titled excerpt per chunk, with
        the sections separated by blank lines.
    """
    shown = len(selected_chunks)
    sections = [
        f"【长文本自动分块摘要】原文共{original_length}字,"
        f"已分为{total_chunks}个文本块用于全局覆盖分析。",
        f"以下展示其中{shown}个代表性文本块的摘录,"
        "覆盖开头、中段和结尾;请基于这些跨全文线索设计本体,不要只依赖第一段内容。",
    ]

    for entry in selected_chunks:
        body = self._excerpt_text(entry["text"], excerpt_limit)
        title = (
            f"--- 文档 {entry['document_index']} / "
            f"分块 {entry['chunk_index']}/{entry['total_document_chunks']} ---"
        )
        sections.append(title + "\n" + body)

    return "\n\n".join(sections)

@staticmethod
def _excerpt_text(text: str, char_limit: int) -> str:
    """Shorten ``text`` to at most ``char_limit`` characters, keeping both ends.

    The text is stripped first. If it still exceeds the limit, its head and
    tail are kept and joined by an "omitted" marker, so a long chunk is not
    reduced to its opening lines only. When the limit is too small to hold
    the marker plus a useful amount of text, only the head is returned.

    Args:
        text: Chunk text to shorten.
        char_limit: Maximum length of the result; values below zero are
            treated as zero.

    Returns:
        The stripped text when it fits, otherwise an excerpt no longer than
        ``max(char_limit, 0)`` characters.
    """
    text = text.strip()
    if len(text) <= char_limit:
        return text

    # A negative limit would be interpreted by the slice below as "drop the
    # last N characters" and return almost the entire text; clamp it so the
    # result can never exceed the requested size.
    char_limit = max(char_limit, 0)

    marker = "\n...(本分块中间内容省略)...\n"
    # Too little room for marker + meaningful head/tail: keep the head only.
    if char_limit <= len(marker) + 20:
        return text[:char_limit]

    remaining = char_limit - len(marker)
    head_len = remaining // 2
    # tail_len is always > 0 here, so text[-tail_len:] never degenerates
    # into the whole string.
    tail_len = remaining - head_len
    return f"{text[:head_len].rstrip()}{marker}{text[-tail_len:].lstrip()}"

def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""验证和后处理结果"""
Expand Down Expand Up @@ -503,4 +637,3 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str:
code_lines.append('}')

return '\n'.join(code_lines)

52 changes: 52 additions & 0 deletions backend/tests/test_ontology_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from app.services.ontology_generator import OntologyGenerator


def _generator_for_test() -> OntologyGenerator:
    """Return a generator whose long-text limits are shrunk for fast tests."""
    overrides = {
        "MAX_TEXT_LENGTH_FOR_LLM": 2000,
        "LONG_TEXT_CHUNK_SIZE": 500,
        "LONG_TEXT_CHUNK_OVERLAP": 0,
        "MAX_LONG_TEXT_CHUNKS": 3,
        "MIN_LONG_TEXT_EXCERPT": 120,
    }
    # A placeholder object stands in for the LLM client.
    instance = OntologyGenerator(llm_client=object())
    for attribute, value in overrides.items():
        setattr(instance, attribute, value)
    return instance


def test_short_ontology_context_keeps_original_text():
    """Input below the length limit is passed through without a chunk summary."""
    body = "short document body"

    result = _generator_for_test()._build_document_context([body])

    assert result == body
    assert "长文本自动分块摘要" not in result


def test_long_ontology_context_samples_across_document():
    """Long input yields a bounded context sampled from start, middle and end."""
    gen = _generator_for_test()
    source = "".join(["BEGIN", "a" * 1050, "MIDDLE", "b" * 1050, "END"])

    result = gen._build_document_context([source])

    assert len(result) <= gen.MAX_TEXT_LENGTH_FOR_LLM
    expected_fragments = (
        "长文本自动分块摘要",
        "BEGIN",
        "MIDDLE",
        "END",
        "分块 1/",
        "分块 3/",
        "分块 5/",
    )
    for fragment in expected_fragments:
        assert fragment in result


def test_very_long_ontology_context_selects_representative_chunks():
    """With more chunks than the cap, exactly the capped number is rendered."""
    gen = _generator_for_test()
    middle_parts = [f"CHUNK{i:02d}-" + (str(i) * 490) for i in range(12)]
    source = "".join(["BEGIN", *middle_parts, "FINALEND"])

    result = gen._build_document_context([source])

    assert len(result) <= gen.MAX_TEXT_LENGTH_FOR_LLM
    assert "BEGIN" in result
    assert "FINALEND" in result
    rendered_sections = result.count("--- 文档 1 / 分块")
    assert rendered_sections == gen.MAX_LONG_TEXT_CHUNKS