去除STT中的markdown格式和emoji

Desmond-Dong · Desmond-Dong · commit ed41a19dfca8 · 2025-12-02T20:10:01.000+08:00
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -26,7 +26,9 @@
       "Bash(nul:*)",
       "WebFetch(domain:developers.home-assistant.dev)",
       "Bash(chcp:*)",
-      "Bash(python3:*)"
+      "Bash(python3:*)",
+      "mcp__serena__list_dir",
+      "Bash(dir:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+backup/
+.spec-workflow
+.claude/
diff --git a/custom_components/ai_hub/manifest.json b/custom_components/ai_hub/manifest.json
@@ -9,5 +9,5 @@
   "iot_class": "cloud_polling",
   "issue_tracker": "https://github.com/ha-china/ai_hub/issues",
   "requirements": ["edge-tts", "aiofiles", "aiohttp"],
-  "version": "v2025.11.15"
+  "version": "v2025.11.16"
 }
diff --git a/custom_components/ai_hub/markdown_filter.py b/custom_components/ai_hub/markdown_filter.py
@@ -1,5 +1,32 @@
 import re
 
+def _remove_emojis(text: str) -> str:
+    """Return text without code points from the emoji blocks listed below ("" for falsy input)."""
+    if not text:
+        return ""
+
+    result = []
+    for char in text:
+        code = ord(char)
+        # Skip code points inside the emoji blocks listed below; keep everything else
+        if (
+            (0x1F300 <= code <= 0x1F5FF) or  # Miscellaneous Symbols and Pictographs
+            (0x1F600 <= code <= 0x1F64F) or  # Emoticons
+            (0x1F680 <= code <= 0x1F6FF) or  # Transport and Map Symbols
+            (0x1F900 <= code <= 0x1F9FF) or  # Supplemental Symbols and Pictographs
+            (0x1FA70 <= code <= 0x1FAFF) or  # Symbols and Pictographs Extended-A
+            (0x2600 <= code <= 0x26FF) or    # Miscellaneous Symbols
+            (0x2700 <= code <= 0x27BF)       # Dingbats
+        ):
+            # Emoji code point: drop it. U+FE0F, U+200D and U+1F1E6-U+1F1FF are outside these ranges.
+            continue
+        else:
+            # Not in any listed block, so keep the character
+            result.append(char)
+
+    return ''.join(result)
+
+
 _MARKDOWN_FILTER_PATTERNS = [
     re.compile(r'^#{1,6}\s+.*$', re.MULTILINE),
     re.compile(r'^\s*[-*+]\s+', re.MULTILINE),
@@ -11,8 +38,6 @@
     re.compile(r'\*([^*\n]*)\*'),          # 修复斜体：保留内容
     re.compile(r'__([^_\n]*)__'),          # 修复粗体：保留内容
     re.compile(r'_([^_\n]*)_'),            # 修复斜体：保留内容
-    re.compile(r'^\|[^\n]*\|$', re.MULTILINE),
-    re.compile(r'^\|[\s-]*\|[\s-]*\|$', re.MULTILINE),
     re.compile(r'~~([^~\n]*)~~'),          # 修复删除线：保留内容
     re.compile(r'`([^`\n]*)`'),            # 修复行内代码：保留内容
     re.compile(r'^-{3,}$|^_{3,}$|^\*{3,}$', re.MULTILINE),
@@ -23,13 +48,16 @@
     re.compile(r'^`[a-zA-Z0-9_-]*$', re.MULTILINE)
 ]
 
-_BASE_FILTER_PATTERNS = [re.compile(r'')]
+_BASE_FILTER_PATTERNS = []
 
 def filter_markdown_content(content: str) -> str:
     """无条件过滤markdown格式内容，保留英文单词间的空格"""
     if not content:
         return ""
 
+    # 首先清除 emoji
+    content = _remove_emojis(content)
+
     # 定义需要保留内容的模式（这些模式有捕获组，用于移除markdown语法但保留内容）
     patterns_with_capture = [
         re.compile(r'\*\*([^*\n]*)\*\*'),      # 粗体：保留内容
@@ -78,6 +106,9 @@ def filter_markdown_streaming(content: str) -> str:
     if not content:
         return ""
 
+    # 首先清除 emoji
+    content = _remove_emojis(content)
+
     # 定义需要保留内容的模式（这些模式有捕获组，用于移除markdown语法但保留内容）
     patterns_with_capture = [
         re.compile(r'\*\*([^*\n]*)\*\*'),      # 粗体：保留内容
@@ -131,4 +162,4 @@ def filter_markdown_content_legacy(content: str, filter_enabled: bool = False) -
     if filter_enabled:
         return filter_markdown_content(content)
 
-    return content.strip() 
+    return content.strip()
diff --git a/custom_components/ai_hub/stt.py b/custom_components/ai_hub/stt.py
@@ -25,6 +25,7 @@
     DOMAIN,
 )
 from .entity import AIHubEntityBase
+from .markdown_filter import filter_markdown_content
 from homeassistant.helpers import device_registry as dr
 
 _LOGGER = logging.getLogger(__name__)
@@ -350,11 +351,16 @@ async def async_process_audio_stream(
                         raise HomeAssistantError("API 响应格式错误，无法找到转录文本")
 
                     _LOGGER.info("=== STT识别成功: '%s' ===", transcribed_text)
+
+                    # 应用 markdown_filter 清理可能的 markdown 格式内容
+                    cleaned_text = filter_markdown_content(transcribed_text)
+                    _LOGGER.info("应用 markdown_filter 后: '%s' → '%s'", transcribed_text, cleaned_text)
+
                     _LOGGER.info("返回SpeechResult对象，格式检查...")
 
                     # Create SpeechResult object using the correct format like zhipuai
                     result = stt.SpeechResult(
-                        transcribed_text.strip(),
+                        cleaned_text.strip(),
                         stt.SpeechResultState.SUCCESS
                     )
                     _LOGGER.info("SpeechResult创建成功，text='%s'", result.text)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+backup/`
	`2`	`+.spec-workflow`
	`3`	`+.claude/`
Original file line number	Diff line number	Diff line change
`@@ -9,5 +9,5 @@`
`9`	`9`	`"iot_class": "cloud_polling",`
`10`	`10`	`"issue_tracker": "https://github.com/ha-china/ai_hub/issues",`
`11`	`11`	`"requirements": ["edge-tts", "aiofiles", "aiohttp"],`
`12`		`- "version": "v2025.11.15"`
	`12`	`+ "version": "v2025.11.16"`
`13`	`13`	`}`