forked from MetrodataTeam/pinyin-data
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_regex.py
More file actions
30 lines (29 loc) · 857 Bytes
/
prepare_regex.py
File metadata and controls
30 lines (29 loc) · 857 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
# Build a compact regex character class from the characters listed in
# ``pinyin.txt`` and write it to ``regex.txt``.

# One data line looks like ``U+4E00: yī  # 一``: a code point, its readings,
# and the character itself after the trailing ``#``.
_ENTRY_RE = re.compile(r'U\+(.+):.*#\s+(.+)')

# The first lines of the data file are header lines and carry no entry.
_HEADER_LINES = 2


def _read_characters(lines):
    """Return the set of characters declared by the data lines in *lines*.

    The first ``_HEADER_LINES`` lines are skipped unconditionally.  Blank
    lines and lines starting with ``#`` are ignored.

    Raises:
        ValueError: if a remaining line does not look like an entry, or if
            the text after the trailing ``#`` is not exactly one character.
    """
    characters = set()
    for line_no, line in enumerate(lines, start=1):
        if line_no <= _HEADER_LINES:
            continue
        text = line.strip()
        if not text or text.startswith('#'):
            continue
        match = _ENTRY_RE.match(text)
        if match is None:
            raise ValueError(
                'line %d is not a valid entry: %r' % (line_no, text))
        character = match.group(2)
        if len(character) != 1:
            raise ValueError(
                'line %d: expected a single character after "#", got %r'
                % (line_no, character))
        characters.add(character)
    return characters


def _format_run(first, last):
    """Render the inclusive code point run ``first..last`` for a char class."""
    if first == last:
        return chr(first)
    if last == first + 1:
        # A two-member run is no shorter written as a range; list both.
        return chr(first) + chr(last)
    return chr(first) + '-' + chr(last)


def _build_regex(characters):
    """Return ``[...]+`` matching one or more of *characters*.

    Consecutive code points are collapsed into ``a-z`` style ranges.

    NOTE: characters are emitted verbatim, without regex escaping.
    Presumably the input only contains CJK ideographs, none of which are
    special inside a character class -- verify before reusing elsewhere.

    Raises:
        ValueError: if *characters* is empty (``[]+`` is not a valid regex).
    """
    # Sort by code point and drop duplicates so runs can be detected in a
    # single pass regardless of the order of the input.
    code_points = sorted({ord(character) for character in characters})
    if not code_points:
        raise ValueError('no characters to build a regex from')

    pieces = ['[']
    run_start = run_end = code_points[0]
    for code_point in code_points[1:]:
        if code_point == run_end + 1:
            run_end = code_point
            continue
        pieces.append(_format_run(run_start, run_end))
        run_start = run_end = code_point
    pieces.append(_format_run(run_start, run_end))
    pieces.append(']+')
    return ''.join(pieces)


def main():
    """Read ``pinyin.txt``, print a short summary, and write ``regex.txt``."""
    # The data is non-ASCII; do not depend on the platform default encoding.
    with open('pinyin.txt', encoding='utf-8') as f:
        characters = _read_characters(f)
    regex = _build_regex(characters)
    print(len(regex), regex[:10], regex[-10:])
    with open('regex.txt', 'w', encoding='utf-8') as f:
        f.write(regex)


if __name__ == '__main__':
    main()