|
5 | 5 | # |
6 | 6 | # File : kanjidraw/lib.py |
7 | 7 | # Maintainer : Felix C. Stegerman <[email protected]> |
8 | | -# Date : 2021-05-09 |
| 8 | +# Date : 2021-05-10 |
9 | 9 | # |
10 | 10 | # Copyright : Copyright (C) 2021 Felix C. Stegerman |
11 | | -# Version : v0.1.0 |
| 11 | +# Version : v0.1.1 |
12 | 12 | # License : AGPLv3+ |
13 | 13 | # |
14 | 14 | # -- ; }}}1 |
15 | 15 |
|
16 | 16 | # {{{1 |
17 | 17 | r""" |
18 | 18 |
|
19 | | -Handwritten kanji recognition library. |
| 19 | +Handwritten kanji recognition: library. |
| 20 | +
|
| 21 | +>>> strokes = [[125.5875, 28.6875, 48.45, 196.35], [104.55, 93.7125, 195.7125, 223.125]] |
| 22 | +>>> for s, k in matches(strokes): print(int(s), k) |
| 23 | +99 人 |
| 24 | +96 九 |
| 25 | +96 乂 |
| 26 | +93 八 |
| 27 | +93 入 |
| 28 | +90 儿 |
| 29 | +89 勹 |
| 30 | +88 乃 |
| 31 | +87 又 |
| 32 | +87 几 |
| 33 | +87 冂 |
| 34 | +85 匕 |
| 35 | +82 亻 |
| 36 | +81 卜 |
| 37 | +78 亠 |
| 38 | +75 冖 |
| 39 | +
|
| 40 | +>>> strokes = [[17.85, 102.0, 83.5125, 51.0], [45.9, 26.775, 49.0875, 210.375], [33.7875, 152.3625, 61.8375, 133.875], [103.9125, 54.825, 211.0125, 58.65], [139.6125, 31.2375, 142.8, 64.3875], [178.5, 42.075, 179.1375, 66.3], [108.375, 121.125, 106.4625, 182.325], [113.475, 114.1125, 205.275, 189.975], [163.2, 116.025, 164.475, 181.05], [126.225, 148.5375, 198.2625, 158.7375], [109.0125, 200.8125, 205.9125, 191.25]] |
| 41 | +>>> for s, k in matches(strokes): print(int(s), k) |
| 42 | +91 描 |
| 43 | +89 猫 |
| 44 | +86 桷 |
| 45 | +85 淌 |
| 46 | +84 猟 |
| 47 | +83 掩 |
| 48 | +83 培 |
| 49 | +82 猛 |
| 50 | +82 控 |
| 51 | +82 清 |
| 52 | +82 捨 |
| 53 | +81 陪 |
| 54 | +81 措 |
| 55 | +81 袷 |
| 56 | +81 掠 |
| 57 | +81 淹 |
| 58 | +80 桶 |
| 59 | +80 掘 |
| 60 | +80 舳 |
| 61 | +80 猪 |
| 62 | +80 掃 |
| 63 | +79 情 |
| 64 | +79 捺 |
| 65 | +79 陷 |
| 66 | +79 探 |
20 | 67 |
|
21 | 68 | """ # }}}1 |
22 | 69 |
|
23 | | -import gzip, json, os, re, sys |
| 70 | +import gzip, itertools, json, os, re, sys |
24 | 71 | import xml.etree.ElementTree as ET |
25 | 72 |
|
26 | 73 | from collections import namedtuple |
27 | 74 | from enum import Enum |
28 | 75 |
|
29 | | -__version__ = "0.1.0" |
| 76 | +__version__ = "0.1.1" |
30 | 77 |
|
31 | 78 | DATAFILE = os.path.join(os.path.dirname(__file__), "data.json") |
32 | 79 |
|
|
41 | 88 | STROKE_LOCATION_WEIGHT = 0.6 |
42 | 89 | CLOSE_WEIGHT = 0.7 |
43 | 90 |
|
44 | | -DirAndLoc = namedtuple("DirAndLoc", "starts ends dirs moves".split()) |
| 91 | +SEDM = namedtuple("SEDM", "starts ends dirs moves".split()) |
45 | 92 |
|
46 | 93 | class Direction(Enum): # {{{1 |
47 | 94 | X, N, NE, E, SE, S, SW, W, NW = range(-1, 8) |
@@ -131,15 +178,32 @@ def strict_match(a, b): # {{{1 |
131 | 178 | # }}}1 |
132 | 179 |
|
def _directions_and_locations(lines):
    """
    Summarise a sequence of lines as an SEDM tuple.

    Each line is indexed as a 4-element sequence: the first two values
    are passed to Location.of_point() for the `starts` field and the
    last two for the `ends` field.  `dirs` holds Direction.of_line() of
    every line; `moves` holds Direction.of_move() of every line paired
    with the line before it, so it has one entry fewer than the others.
    """
    starts = tuple(Location.of_point(*line[:2]) for line in lines)
    ends = tuple(Location.of_point(*line[2:]) for line in lines)
    dirs = tuple(Direction.of_line(line) for line in lines)
    # pair every line (from the second onwards) with its predecessor
    moves = tuple(
        Direction.of_move(current, previous)
        for current, previous in zip(lines[1:], lines[:-1])
    )
    return SEDM(starts, ends, dirs, moves)
140 | 187 |
|
def matches(lines, data = None, match = strict_match,
            max_results = 25, cutoff = 0.75):
    """
    Find best matches; returns an iterator of (score, kanji) pairs for
    the first max_results matches that have a score >= max_score *
    cutoff, ordered from highest to lowest score.

    lines        -- the drawn strokes; candidates are looked up by
                    len(lines) and each is scored with match()
    data         -- mapping of stroke count to a {kanji: lines} mapping;
                    kanji_data() is used when None
    match        -- scoring function, called as match(lines, candidate)
    max_results  -- maximum number of pairs returned
    cutoff       -- fraction of the best score a match must reach

    When data has no entry for len(lines), or that entry is empty, the
    result is an empty iterator (instead of a KeyError/IndexError).
    """
    if data is None: data = kanji_data()
    try:
        candidates = data[len(lines)]
    except KeyError:
        # no kanji with this number of strokes (e.g. nothing drawn yet)
        return iter(())
    scored = sorted(
        ( (match(lines, l), k) for k, l in candidates.items() ),
        reverse = True
    )
    if not scored:
        # empty bucket: there is no best score to derive a threshold from
        return iter(())
    threshold = scored[0][0] * cutoff
    return itertools.takewhile(
        lambda m: m[0] >= threshold, scored[:max_results]
    )
| 199 | + |
def kanji_data():
    """
    Return the kanji data, loading it with _load_json() on first use.

    The loaded value is cached on the function itself (kanji_data._data),
    so later calls return the same object without loading again.
    """
    cached = kanji_data._data
    if cached is None:
        cached = kanji_data._data = _load_json()
    return cached
kanji_data._data = None   # cache slot; None means "not loaded yet"
| 204 | + |
141 | 205 | # FIXME: better kanji unicode ranges |
142 | | -def parse_kanjivg(file): # {{{1 |
| 206 | +def _parse_kanjivg(file): # {{{1 |
143 | 207 | data = {} |
144 | 208 | with gzip.open(file) as f: |
145 | 209 | for e in ET.parse(f).getroot(): |
@@ -179,29 +243,16 @@ def _path_to_line(path): # {{{1 |
179 | 243 | return tuple( int(v * 255 / 109) for v in [x1, y1, x2, y2] ) |
180 | 244 | # }}}1 |
181 | 245 |
|
def _load_json(file = DATAFILE):
    """
    Load data from JSON file.

    Returns a dict with the top-level keys of the JSON object converted
    to int; the values are returned as parsed.
    """
    # JSON text is UTF-8; don't depend on the platform's default encoding
    with open(file, encoding = "utf-8") as fh:
        raw = json.load(fh)
    # JSON object keys are always strings; restore the integer keys
    return { int(k): v for k, v in raw.items() }
186 | 250 |
|
def _save_json(file, data):
    """
    Save data to JSON file.

    The file is opened for writing (replacing any existing content) and
    data is serialised with its keys sorted.
    """
    with open(file, "w") as out:
        out.write(json.dumps(data, sort_keys = True))
191 | 255 |
|
192 | | -def matches(lines, data, match = strict_match, max_results = 25, |
193 | | - cutoff = 0.75): |
194 | | - """ |
195 | | - Find best matches; yields a (score, kanji) pair for the first |
196 | | - max_results matches that have a score >= max_score * cutoff. |
197 | | - """ |
198 | | - it = data[len(lines)].items() |
199 | | - ms = sorted(( (match(lines, l), k) for k, l in it ), reverse = True) |
200 | | - max_score = ms[0][0] |
201 | | - for m in ms[:max_results]: |
202 | | - if m[0] < max_score * cutoff: break |
203 | | - yield m |
204 | | - |
205 | 256 | if __name__ == "__main__": |
206 | 257 | if "--doctest" in sys.argv: |
207 | 258 | verbose = "--verbose" in sys.argv |
|
0 commit comments