|
8 | 8 | from typing import List, Optional |
9 | 9 |
|
10 | 10 |
|
def _read_text_with_fallback(file_path: str) -> str:
    """Read a text file, detecting its encoding when it is not valid UTF-8.

    Decoding strategy, tried in order:
      1. Strict UTF-8 (a leading UTF-8 byte-order mark is stripped).
      2. The encoding guessed by ``charset_normalizer``, if it is importable.
      3. The encoding guessed by ``chardet``, if it is importable.
      4. UTF-8 with ``errors='replace'`` as the last resort.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text. In steps 2-4 undecodable bytes are replaced
        with U+FFFD rather than raising.

    Raises:
        OSError: If the file cannot be read.
    """
    data = Path(file_path).read_bytes()

    # 'utf-8-sig' is strict UTF-8 that also drops a leading BOM, so files
    # saved as "UTF-8 with BOM" do not leak U+FEFF into the returned text.
    try:
        return data.decode('utf-8-sig')
    except UnicodeDecodeError:
        pass

    encoding = None

    # Detection is deliberately best-effort: the detector packages are
    # optional, so a missing module or a detector failure just moves on
    # to the next strategy.
    try:
        from charset_normalizer import from_bytes
        best = from_bytes(data).best()
        if best and best.encoding:
            encoding = best.encoding
    except Exception:
        pass

    # Fall back to chardet only when charset_normalizer gave no answer.
    if not encoding:
        try:
            import chardet
            result = chardet.detect(data)
            encoding = result.get('encoding') if result else None
        except Exception:
            pass

    if encoding:
        try:
            return data.decode(encoding, errors='replace')
        except LookupError:
            # The detector named a codec this Python build does not
            # provide; use the UTF-8 fallback instead of crashing.
            pass

    # Last resort: lossy UTF-8 decoding never raises.
    return data.decode('utf-8-sig', errors='replace')
| 59 | + |
| 60 | + |
11 | 61 | class FileParser: |
12 | 62 | """文件解析器""" |
13 | 63 |
|
@@ -62,15 +112,13 @@ def _extract_from_pdf(file_path: str) -> str: |
62 | 112 |
|
@staticmethod
def _extract_from_md(file_path: str) -> str:
    """Return the text of a Markdown file, with automatic encoding detection.

    Args:
        file_path: Path of the Markdown file.

    Returns:
        The decoded file content, as produced by
        ``_read_text_with_fallback``.
    """
    markdown_text = _read_text_with_fallback(file_path)
    return markdown_text
68 | 117 |
|
@staticmethod
def _extract_from_txt(file_path: str) -> str:
    """Return the text of a plain-text file, with automatic encoding detection.

    Args:
        file_path: Path of the TXT file.

    Returns:
        The decoded file content, as produced by
        ``_read_text_with_fallback``.
    """
    plain_text = _read_text_with_fallback(file_path)
    return plain_text
74 | 122 |
|
75 | 123 | @classmethod |
76 | 124 | def extract_from_multiple(cls, file_paths: List[str]) -> str: |
|
0 commit comments