11import re
22
3+ def _remove_emojis (text : str ) -> str :
4+ """Remove all emojis from text"""
5+ if not text :
6+ return ""
7+
8+ result = []
9+ for char in text :
10+ code = ord (char )
11+ # Skip characters in emoji ranges (remove emojis), keep non-emoji characters
12+ if (
13+ (0x1F300 <= code <= 0x1F5FF ) or # Supplemental Symbols and Pictographs
14+ (0x1F600 <= code <= 0x1F64F ) or # Emoticons
15+ (0x1F680 <= code <= 0x1F6FF ) or # Transport & Map Symbols
16+ (0x1F900 <= code <= 0x1F9FF ) or # Miscellaneous Symbols
17+ (0x1FA70 <= code <= 0x1FAFF ) or # Symbols and Pictographs Extended-A
18+ (0x2600 <= code <= 0x26FF ) or # Miscellaneous Symbols
19+ (0x2700 <= code <= 0x27BF ) # Dingbats
20+ ):
21+ # This is an emoji, skip it (don't add to result)
22+ continue
23+ else :
24+ # This is not an emoji, keep it
25+ result .append (char )
26+
27+ return '' .join (result )
28+
29+
330_MARKDOWN_FILTER_PATTERNS = [
431 re .compile (r'^#{1,6}\s+.*$' , re .MULTILINE ),
532 re .compile (r'^\s*[-*+]\s+' , re .MULTILINE ),
1138 re .compile (r'\*([^*\n]*)\*' ), # 修复斜体:保留内容
1239 re .compile (r'__([^_\n]*)__' ), # 修复粗体:保留内容
1340 re .compile (r'_([^_\n]*)_' ), # 修复斜体:保留内容
14- re .compile (r'^\|[^\n]*\|$' , re .MULTILINE ),
15- re .compile (r'^\|[\s-]*\|[\s-]*\|$' , re .MULTILINE ),
1641 re .compile (r'~~([^~\n]*)~~' ), # 修复删除线:保留内容
1742 re .compile (r'`([^`\n]*)`' ), # 修复行内代码:保留内容
1843 re .compile (r'^-{3,}$|^_{3,}$|^\*{3,}$' , re .MULTILINE ),
2348 re .compile (r'^`[a-zA-Z0-9_-]*$' , re .MULTILINE )
2449]
2550
26- _BASE_FILTER_PATTERNS = [re . compile ( r'' ) ]
51+ _BASE_FILTER_PATTERNS = []
2752
2853def filter_markdown_content (content : str ) -> str :
2954 """无条件过滤markdown格式内容,保留英文单词间的空格"""
3055 if not content :
3156 return ""
3257
58+ # 首先清除 emoji
59+ content = _remove_emojis (content )
60+
3361 # 定义需要保留内容的模式(这些模式有捕获组,用于移除markdown语法但保留内容)
3462 patterns_with_capture = [
3563 re .compile (r'\*\*([^*\n]*)\*\*' ), # 粗体:保留内容
@@ -78,6 +106,9 @@ def filter_markdown_streaming(content: str) -> str:
78106 if not content :
79107 return ""
80108
109+ # 首先清除 emoji
110+ content = _remove_emojis (content )
111+
81112 # 定义需要保留内容的模式(这些模式有捕获组,用于移除markdown语法但保留内容)
82113 patterns_with_capture = [
83114 re .compile (r'\*\*([^*\n]*)\*\*' ), # 粗体:保留内容
@@ -131,4 +162,4 @@ def filter_markdown_content_legacy(content: str, filter_enabled: bool = False) -
131162 if filter_enabled :
132163 return filter_markdown_content (content )
133164
134- return content .strip ()
165+ return content .strip ()
0 commit comments