Skip to content

Commit 390c120

Browse files
committed
fix(file_parser): handle non-UTF-8 encoded text files with automatic encoding detection
1 parent 0efd935 commit 390c120

4 files changed

Lines changed: 64 additions & 6 deletions

File tree

backend/app/utils/file_parser.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,56 @@
88
from typing import List, Optional
99

1010

11+
def _detect_encoding(data: bytes) -> Optional[str]:
    """Guess the text encoding of ``data`` using optional third-party detectors.

    ``charset_normalizer`` is tried first, then ``chardet``. Both are imported
    lazily so a missing package simply skips that detector.

    Args:
        data: Raw bytes whose encoding should be guessed.

    Returns:
        The encoding name reported by the first detector that produces one,
        or ``None`` when no detector is available or none yields a guess.
    """
    # Detection is strictly best-effort: a missing package or an internal
    # detector failure must never prevent the file from being read, so any
    # exception just moves on to the next strategy.
    try:
        from charset_normalizer import from_bytes

        best = from_bytes(data).best()
        if best and best.encoding:
            return best.encoding
    except Exception:
        pass

    try:
        import chardet

        result = chardet.detect(data)
        if result and result.get('encoding'):
            return result['encoding']
    except Exception:
        pass

    return None


def _read_text_with_fallback(file_path: str) -> str:
    """Read a text file, detecting its encoding when it is not valid UTF-8.

    Multi-level fallback strategy:
        1. Decode as UTF-8 (a leading byte-order mark, if any, is dropped).
        2. Detect the encoding with charset_normalizer.
        3. Fall back to chardet for detection.
        4. As a last resort decode as UTF-8 with ``errors='replace'``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text content.

    Raises:
        OSError: If the file cannot be read.
    """
    data = Path(file_path).read_bytes()

    # Fast path. 'utf-8-sig' behaves like 'utf-8' but strips a leading BOM,
    # which would otherwise surface as a stray '\ufeff' in the text.
    try:
        return data.decode('utf-8-sig')
    except UnicodeDecodeError:
        pass

    encoding = _detect_encoding(data)
    if encoding:
        try:
            return data.decode(encoding, errors='replace')
        except LookupError:
            # The detector reported a name Python has no codec for;
            # fall through to the UTF-8 last resort instead of crashing.
            pass

    # Last resort: never fail on undecodable bytes, substitute U+FFFD.
    return data.decode('utf-8-sig', errors='replace')
59+
60+
1161
class FileParser:
1262
"""文件解析器"""
1363

@@ -62,15 +112,13 @@ def _extract_from_pdf(file_path: str) -> str:
62112

63113
@staticmethod
64114
def _extract_from_md(file_path: str) -> str:
65-
"""从Markdown提取文本"""
66-
with open(file_path, 'r', encoding='utf-8') as f:
67-
return f.read()
115+
"""从Markdown提取文本,支持自动编码检测"""
116+
return _read_text_with_fallback(file_path)
68117

69118
@staticmethod
70119
def _extract_from_txt(file_path: str) -> str:
71-
"""从TXT提取文本"""
72-
with open(file_path, 'r', encoding='utf-8') as f:
73-
return f.read()
120+
"""从TXT提取文本,支持自动编码检测"""
121+
return _read_text_with_fallback(file_path)
74122

75123
@classmethod
76124
def extract_from_multiple(cls, file_paths: List[str]) -> str:

backend/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ dependencies = [
2525

2626
# 文件处理
2727
"PyMuPDF>=1.24.0",
28+
# 编码检测(支持非UTF-8编码的文本文件)
29+
"charset-normalizer>=3.0.0",
30+
"chardet>=5.0.0",
2831

2932
# 工具库
3033
"python-dotenv>=1.0.0",

backend/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ camel-ai==0.2.78
2323

2424
# ============= 文件处理 =============
2525
PyMuPDF>=1.24.0
26+
# 编码检测(支持非UTF-8编码的文本文件)
27+
charset-normalizer>=3.0.0
28+
chardet>=5.0.0
2629

2730
# ============= 工具库 =============
2831
# 环境变量加载

backend/uv.lock

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)