Skip to content

Commit 3a00607

Browse files
authored
Merge pull request #47 from brandonhthiere/pr46-pinyin-fuzzy
feat(search): 增加拼音与模糊(编辑距离)搜索及前缀联想
2 parents 165c61a + 031f11e commit 3a00607

3 files changed

Lines changed: 129 additions & 0 deletions

File tree

pinyin_index.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Lightweight pinyin + fuzzy matching (no heavy dependencies).
2+
3+
A small built-in CJK->pinyin map covers common characters; unknown characters
4+
fall back to themselves. This powers pinyin / pinyin-initial recall and a
5+
bounded edit-distance fuzzy fallback for typo tolerance.
6+
"""
7+
from typing import List
8+
9+
# Compact map of common Chinese characters to pinyin (extend as needed).
10+
_PINYIN = {
11+
"北": "bei", "京": "jing", "上": "shang", "海": "hai", "广": "guang",
12+
"州": "zhou", "深": "shen", "圳": "zhen", "中": "zhong", "国": "guo",
13+
"学": "xue", "习": "xi", "笔": "bi", "记": "ji", "工": "gong",
14+
"作": "zuo", "生": "sheng", "活": "huo", "项": "xiang", "目": "mu",
15+
"想": "xiang", "法": "fa", "今": "jin", "天": "tian", "明": "ming",
16+
"编": "bian", "程": "cheng", "教": "jiao", "搜": "sou", "索": "suo",
17+
"任": "ren", "务": "wu", "文": "wen", "件": "jian", "数": "shu",
18+
"据": "ju", "时": "shi", "间": "jian", "标": "biao", "题": "ti",
19+
}
20+
21+
22+
def to_pinyin(text: str) -> str:
    """Return the lowercased full-pinyin rendering of ``text``.

    Characters found in ``_PINYIN`` are replaced by their pinyin. Every other
    character, ASCII or not, is kept in place unchanged.

    Unmapped non-ASCII characters used to be dropped, which contradicted the
    documented "fall back to themselves" behaviour and glued together the
    syllables on either side of the dropped character (so two mapped
    characters separated by an unmapped one rendered as one contiguous
    pinyin run and could produce false substring matches).

    Args:
        text: Arbitrary text, possibly mixing Chinese and ASCII.

    Returns:
        The converted text, lowercased.
    """
    return "".join(_PINYIN.get(ch, ch) for ch in text).lower()
26+
27+
28+
def to_initials(text: str) -> str:
    """Return the pinyin initials of ``text`` (e.g. 北京 -> bj).

    A character found in ``_PINYIN`` contributes the first letter of its
    pinyin. An ASCII letter or digit contributes itself, lowercased. Anything
    else (punctuation, whitespace, unmapped non-ASCII) contributes nothing.
    """

    def _initials():
        for char in text:
            syllable = _PINYIN.get(char)
            if syllable:
                yield syllable[0]
                continue
            if char.isascii() and char.isalnum():
                yield char.lower()

    return "".join(_initials())
38+
39+
40+
def matches_pinyin(query: str, text: str) -> bool:
    """Report whether ``query`` occurs in the pinyin forms of ``text``.

    The lowercased query is looked up as a substring of the full pinyin of
    ``text`` first, then of its pinyin initials. An empty query never matches.
    """
    needle = query.lower()
    if not needle:
        return False
    if needle in to_pinyin(text):
        return True
    return needle in to_initials(text)
44+
45+
46+
def edit_distance(a: str, b: str) -> int:
    """Return the case-insensitive Levenshtein distance between ``a`` and ``b``.

    Uses the row-by-row dynamic-programming formulation, keeping only the
    previous and the current row of the table.
    """
    source, target = a.lower(), b.lower()
    if source == target:
        return 0

    # Row 0: turning the empty prefix of `source` into target[:j] costs j inserts.
    previous_row = list(range(len(target) + 1))
    for row_no, src_ch in enumerate(source, start=1):
        # Column 0: turning source[:row_no] into "" costs row_no deletions.
        current_row = [row_no]
        for col_no, tgt_ch in enumerate(target, start=1):
            deletion = previous_row[col_no] + 1
            insertion = current_row[col_no - 1] + 1
            substitution = previous_row[col_no - 1] + (0 if src_ch == tgt_ch else 1)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]
58+
59+
60+
def fuzzy_match(query: str, term: str, max_distance: int = 1) -> bool:
    """Report whether ``query`` approximately matches ``term``.

    A match is either of:

    * ``query`` occurs anywhere inside ``term`` (substring, not only prefix), or
    * the Levenshtein distance between them is at most ``max_distance``.

    Both checks ignore case. Previously the substring check was
    case-sensitive while the edit-distance check was not, so ``"PYTH"``
    failed against ``"python"`` although ``"pyth"`` matched.

    Args:
        query: The user's search text. An empty query never matches.
        term: The candidate token to compare against.
        max_distance: Maximum number of single-character edits tolerated.

    Returns:
        True when ``query`` matches ``term`` under the rules above.
    """
    if not query:
        return False
    needle, candidate = query.lower(), term.lower()
    if needle in candidate:
        return True
    # The edit distance can never be smaller than the length difference, so
    # skip the O(len * len) table when the lengths alone rule out a match.
    if abs(len(needle) - len(candidate)) > max_distance:
        return False
    return edit_distance(needle, candidate) <= max_distance
67+
68+
69+
def suggest(prefix: str, terms: List[str], limit: int = 5) -> List[str]:
    """Return up to ``limit`` terms that start with ``prefix``.

    A term qualifies when the lowercased prefix starts any of: the lowercased
    term itself, its full pinyin, or its pinyin initials. Input order of
    ``terms`` is preserved.
    """
    needle = prefix.lower()

    def _is_hit(term: str) -> bool:
        # Cheapest check first; the pinyin forms are only built when needed.
        if term.lower().startswith(needle):
            return True
        if to_pinyin(term).startswith(needle):
            return True
        return to_initials(term).startswith(needle)

    matched = list(filter(_is_hit, terms))
    return matched[:limit]

search_engine.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,22 @@ def search_bm25(self, query: str, notes: Dict, limit: int = 20,
236236
scores[note_id] *= weights["tags"]
237237
return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]
238238

239+
def search_fuzzy(self, query: str, notes: Dict, limit: int = 20) -> List[Tuple[str, float]]:
    """Augment exact (BM25) results with pinyin and edit-distance fallbacks.

    Notes already returned by ``search_bm25`` keep their score. Every other
    non-``None`` note is checked, in order, for a pinyin match and then for
    a fuzzy (edit-distance) match on any of its tokens, and is added with a
    fallback score.

    Fallback scores default to 2.0 (pinyin) and 1.0 (fuzzy). When exact
    results exist, they are capped at half (pinyin) and a quarter (fuzzy) of
    the weakest positive exact score. Without the cap, an exact hit scoring
    below 2.0 would be ranked under a pinyin or fuzzy hit, contradicting the
    intended "exact > pinyin > fuzzy" order.

    Args:
        query: The user's search text.
        notes: Mapping of note id to note object. Notes are expected to
            expose ``title``, ``content`` and ``tags``.
        limit: Maximum number of results returned.

    Returns:
        ``(note_id, score)`` pairs sorted by descending score, at most
        ``limit`` of them. Empty when ``query`` or ``notes`` is empty, the
        same guard ``search`` applies.
    """
    import pinyin_index  # noqa: PLC0415

    if not query or not notes:
        return []

    results = dict(self.search_bm25(query, notes, limit))

    pinyin_score = 2.0
    weakest_exact = min(results.values(), default=None)
    if weakest_exact is not None and weakest_exact > 0:
        # Keep every fallback strictly below the weakest exact hit.
        pinyin_score = min(pinyin_score, weakest_exact / 2)
    fuzzy_score = pinyin_score / 2  # fuzzy is always the lowest tier

    q = query.lower()
    for note_id, note in notes.items():
        if note is None or note_id in results:
            continue
        hay = f"{note.title} {note.content} {' '.join(note.tags)}"
        if pinyin_index.matches_pinyin(q, hay):
            results[note_id] = pinyin_score
        elif any(pinyin_index.fuzzy_match(q, tok) for tok in self.tokenize(hay)):
            results[note_id] = fuzzy_score
    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:limit]
254+
239255
def search(self, query: str, notes: Dict, limit: int = 20) -> List[Tuple[str, float]]:
240256
if not query or not notes:
241257
return []

tests/test_pinyin_fuzzy.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Pinyin + fuzzy matching helpers and integration with search_fuzzy."""
2+
import pinyin_index
3+
from note_model import NoteManager
4+
from search_engine import SearchEngine
5+
6+
7+
def test_pinyin_full_and_initials():
    """A fully mapped word converts to its full pinyin and to its initials."""
    word = "北京"
    full = pinyin_index.to_pinyin(word)
    initials = pinyin_index.to_initials(word)
    assert full == "beijing"
    assert initials == "bj"
10+
11+
12+
def test_matches_pinyin():
    """Full pinyin and initials both match; unrelated pinyin does not."""
    title = "北京游记"
    for query in ("beijing", "bj"):
        assert pinyin_index.matches_pinyin(query, title)
    assert not pinyin_index.matches_pinyin("shanghai", title)
16+
17+
18+
def test_edit_distance_and_fuzzy():
    """One dropped letter is distance 1 and fuzzy-matches; unrelated text does not."""
    correct, typo = "python", "pythn"
    assert pinyin_index.edit_distance(correct, typo) == 1
    assert pinyin_index.fuzzy_match(typo, correct, max_distance=1)
    assert not pinyin_index.fuzzy_match("xyz", correct, max_distance=1)
22+
23+
24+
def test_search_fuzzy_recall_pinyin(tmp_path):
    """A note titled in Chinese is recalled when searching by its pinyin."""
    manager = NoteManager(tmp_path / "n.db", tmp_path)
    note = manager.create_note("北京游记", "记录北京的行程")

    engine = SearchEngine(tmp_path / "i.json")
    engine.build_index(manager.notes)

    ranked = engine.search_fuzzy("beijing", manager.notes)
    found_ids = [note_id for note_id, _score in ranked]
    assert note.id in found_ids
31+
32+
33+
def test_search_fuzzy_typo(tmp_path):
    """A query with a single typo still recalls the note via the fuzzy fallback."""
    manager = NoteManager(tmp_path / "n.db", tmp_path)
    note = manager.create_note("Python notes", "python python python")

    engine = SearchEngine(tmp_path / "i.json")
    engine.build_index(manager.notes)

    # "pythn" is "python" with one letter missing.
    ranked = engine.search_fuzzy("pythn", manager.notes)
    found_ids = [note_id for note_id, _score in ranked]
    assert note.id in found_ids

0 commit comments

Comments
 (0)