Skip to content

Commit ed41a19

Browse files
committed
去除STT中的markdown格式和emoji
1 parent e6d616c commit ed41a19

5 files changed

Lines changed: 49 additions & 7 deletions

File tree

.claude/settings.local.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
"Bash(nul:*)",
2727
"WebFetch(domain:developers.home-assistant.dev)",
2828
"Bash(chcp:*)",
29-
"Bash(python3:*)"
29+
"Bash(python3:*)",
30+
"mcp__serena__list_dir",
31+
"Bash(dir:*)"
3032
],
3133
"deny": [],
3234
"ask": []

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
backup/
2+
.spec-workflow
3+
.claude/

custom_components/ai_hub/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
"iot_class": "cloud_polling",
1010
"issue_tracker": "https://github.com/ha-china/ai_hub/issues",
1111
"requirements": ["edge-tts", "aiofiles", "aiohttp"],
12-
"version": "v2025.11.15"
12+
"version": "v2025.11.16"
1313
}

custom_components/ai_hub/markdown_filter.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,32 @@
11
import re
22

3+
# Inclusive code point ranges that are treated as emoji.
_EMOJI_RANGES = (
    (0x1F1E6, 0x1F1FF),  # Regional indicator symbols (pairs form flag emojis)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs (incl. skin tones)
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
    (0x2600, 0x26FF),    # Miscellaneous Symbols
    (0x2700, 0x27BF),    # Dingbats
)

# Character class matching exactly one emoji code point from the ranges above.
_EMOJI_CHAR_CLASS = "[{}]".format(
    "".join("{}-{}".format(chr(low), chr(high)) for low, high in _EMOJI_RANGES)
)
# Text/emoji presentation selectors (U+FE0E / U+FE0F) that may trail an emoji.
_EMOJI_VARIATION_SELECTORS = "[\uFE0E\uFE0F]*"
# Zero-width joiner used to glue several emojis into one glyph.
_EMOJI_ZWJ = "\u200D"

# One emoji, its trailing variation selectors, any ZWJ-joined follow-up
# emojis, and a joiner left dangling directly after the sequence.
# Selectors and joiners are only consumed when attached to an emoji, so a
# ZWJ used inside ordinary text (e.g. Indic or Persian script) is preserved.
_EMOJI_SEQUENCE_RE = re.compile(
    _EMOJI_CHAR_CLASS + _EMOJI_VARIATION_SELECTORS
    + "(?:" + _EMOJI_ZWJ + _EMOJI_CHAR_CLASS + _EMOJI_VARIATION_SELECTORS + ")*"
    + _EMOJI_ZWJ + "?"
)


def _remove_emojis(text: str) -> str:
    """Remove emojis from ``text`` and return the remaining characters.

    Every code point inside ``_EMOJI_RANGES`` is dropped. Variation
    selectors and zero-width joiners that belong to a removed emoji are
    dropped with it, so no invisible residue is left in the output.
    All other characters are kept unchanged and in order.

    Args:
        text: Input string; a falsy value (``""`` or ``None``) is accepted.

    Returns:
        The text without emojis, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    return _EMOJI_SEQUENCE_RE.sub("", text)
28+
29+
330
_MARKDOWN_FILTER_PATTERNS = [
431
re.compile(r'^#{1,6}\s+.*$', re.MULTILINE),
532
re.compile(r'^\s*[-*+]\s+', re.MULTILINE),
@@ -11,8 +38,6 @@
1138
re.compile(r'\*([^*\n]*)\*'), # 修复斜体:保留内容
1239
re.compile(r'__([^_\n]*)__'), # 修复粗体:保留内容
1340
re.compile(r'_([^_\n]*)_'), # 修复斜体:保留内容
14-
re.compile(r'^\|[^\n]*\|$', re.MULTILINE),
15-
re.compile(r'^\|[\s-]*\|[\s-]*\|$', re.MULTILINE),
1641
re.compile(r'~~([^~\n]*)~~'), # 修复删除线:保留内容
1742
re.compile(r'`([^`\n]*)`'), # 修复行内代码:保留内容
1843
re.compile(r'^-{3,}$|^_{3,}$|^\*{3,}$', re.MULTILINE),
@@ -23,13 +48,16 @@
2348
re.compile(r'^`[a-zA-Z0-9_-]*$', re.MULTILINE)
2449
]
2550

26-
_BASE_FILTER_PATTERNS = [re.compile(r'')]
51+
_BASE_FILTER_PATTERNS = []
2752

2853
def filter_markdown_content(content: str) -> str:
2954
"""无条件过滤markdown格式内容,保留英文单词间的空格"""
3055
if not content:
3156
return ""
3257

58+
# 首先清除 emoji
59+
content = _remove_emojis(content)
60+
3361
# 定义需要保留内容的模式(这些模式有捕获组,用于移除markdown语法但保留内容)
3462
patterns_with_capture = [
3563
re.compile(r'\*\*([^*\n]*)\*\*'), # 粗体:保留内容
@@ -78,6 +106,9 @@ def filter_markdown_streaming(content: str) -> str:
78106
if not content:
79107
return ""
80108

109+
# 首先清除 emoji
110+
content = _remove_emojis(content)
111+
81112
# 定义需要保留内容的模式(这些模式有捕获组,用于移除markdown语法但保留内容)
82113
patterns_with_capture = [
83114
re.compile(r'\*\*([^*\n]*)\*\*'), # 粗体:保留内容
@@ -131,4 +162,4 @@ def filter_markdown_content_legacy(content: str, filter_enabled: bool = False) -
131162
if filter_enabled:
132163
return filter_markdown_content(content)
133164

134-
return content.strip()
165+
return content.strip()

custom_components/ai_hub/stt.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
DOMAIN,
2626
)
2727
from .entity import AIHubEntityBase
28+
from .markdown_filter import filter_markdown_content
2829
from homeassistant.helpers import device_registry as dr
2930

3031
_LOGGER = logging.getLogger(__name__)
@@ -350,11 +351,16 @@ async def async_process_audio_stream(
350351
raise HomeAssistantError("API 响应格式错误,无法找到转录文本")
351352

352353
_LOGGER.info("=== STT识别成功: '%s' ===", transcribed_text)
354+
355+
# 应用 markdown_filter 清理可能的 markdown 格式内容
356+
cleaned_text = filter_markdown_content(transcribed_text)
357+
_LOGGER.info("应用 markdown_filter 后: '%s' → '%s'", transcribed_text, cleaned_text)
358+
353359
_LOGGER.info("返回SpeechResult对象,格式检查...")
354360

355361
# Create SpeechResult object using the correct format like zhipuai
356362
result = stt.SpeechResult(
357-
transcribed_text.strip(),
363+
cleaned_text.strip(),
358364
stt.SpeechResultState.SUCCESS
359365
)
360366
_LOGGER.info("SpeechResult创建成功,text='%s'", result.text)

0 commit comments

Comments
 (0)