Skip to content

Commit 54da98f

Browse files
Windfarer authored and lyingbug committed
feat: add docx max pages env config
1 parent 3756c7c commit 54da98f

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

.env.example

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -291,6 +291,10 @@ DOCREADER_ADDR=docreader:50051
291291
# Docreader 连接方式
292292
DOCREADER_TRANSPORT=grpc
293293

294+
# Docreader 中 DOCX 解析的最大页数,默认 100
295+
# 用于限制超大 Word 文档的解析开销;超过页数的内容将不会继续解析
296+
# DOCREADER_DOCX_MAX_PAGES=100
297+
294298
# 如果使用Weaviate作为向量存储,需要配置以下参数
295299
# 注意:容器内访问请使用 service:port(不要用 localhost,也不要用宿主机映射端口)
296300
# Weaviate HTTP 地址(Docker 内:weaviate:8080;宿主机访问:localhost:9035)

docreader/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class DocReaderConfig:
5252
grpc_max_file_size_mb: int
5353
grpc_port: int
5454

55+
# Parser
56+
docx_max_pages: int
57+
5558
# Proxy
5659
external_http_proxy: str
5760
external_https_proxy: str
@@ -70,6 +73,7 @@ def load_config() -> DocReaderConfig:
7073
* 1024
7174
)
7275
grpc_port = _get_int(["DOCREADER_GRPC_PORT", "PORT"], 50051)
76+
docx_max_pages = _get_int(["DOCREADER_DOCX_MAX_PAGES"], 100)
7377

7478
external_http_proxy = _get_str(
7579
["DOCREADER_EXTERNAL_HTTP_PROXY", "EXTERNAL_HTTP_PROXY"], ""
@@ -86,6 +90,7 @@ def load_config() -> DocReaderConfig:
8690
grpc_max_workers=grpc_max_workers,
8791
grpc_max_file_size_mb=grpc_max_file_size_mb,
8892
grpc_port=grpc_port,
93+
docx_max_pages=docx_max_pages,
8994
external_http_proxy=external_http_proxy,
9095
external_https_proxy=external_https_proxy,
9196
image_output_dir=image_output_dir,
@@ -101,6 +106,7 @@ def dump_config(mask_secrets: bool = True) -> Dict[str, Any]:
101106
"DOCREADER_GRPC_MAX_WORKERS": cfg.grpc_max_workers,
102107
"DOCREADER_GRPC_MAX_FILE_SIZE_MB": cfg.grpc_max_file_size_mb,
103108
"DOCREADER_GRPC_PORT": cfg.grpc_port,
109+
"DOCREADER_DOCX_MAX_PAGES": cfg.docx_max_pages,
104110
"DOCREADER_EXTERNAL_HTTP_PROXY": cfg.external_http_proxy,
105111
"DOCREADER_EXTERNAL_HTTPS_PROXY": cfg.external_https_proxy,
106112
"DOCREADER_IMAGE_OUTPUT_DIR": cfg.image_output_dir,

docreader/parser/docx_parser.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
4141
)
4242
from PIL import Image
4343

44+
from docreader.config import CONFIG
4445
from docreader.models.document import Document as DocumentModel
4546
from docreader.parser.base_parser import BaseParser
4647
from docreader.utils import endecode
@@ -76,7 +77,7 @@ class DocxParser(BaseParser):
7677

7778
def __init__(
7879
self,
79-
max_pages: int = 100, # Maximum number of pages to process
80+
max_pages: Optional[int] = None, # Maximum number of pages to process
8081
**kwargs,
8182
):
8283
"""Initialize DOCX document parser
@@ -95,8 +96,8 @@ def __init__(
9596
max_pages: Maximum number of pages to process
9697
"""
9798
super().__init__(**kwargs)
98-
self.max_pages = max_pages
99-
logger.info(f"DocxParser initialized with max_pages={max_pages}")
99+
self.max_pages = CONFIG.docx_max_pages if max_pages is None else max_pages
100+
logger.info(f"DocxParser initialized with max_pages={self.max_pages}")
100101

101102
def parse_into_text(self, content: bytes) -> DocumentModel:
102103
"""Parse DOCX document, extract text content and image Markdown links"""

0 commit comments

Comments
 (0)