-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcanton_dict.py
311 lines (268 loc) · 13.2 KB
/
canton_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Simple Bot to reply to Telegram messages
# This program is dedicated to the public domain under the CC0 license.
"""
This Bot uses the Updater class to handle the bot.
First, a few handler functions are defined. Then, those functions are passed to
the Dispatcher and registered at their respective places.
Then, the bot is started and runs until we press Ctrl-C on the command line.
Usage:
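Inline-mode Cantonese dictionary bot. It looks up the first character of the
inline query and offers its pronunciation audio, examples, brief explanation,
English meaning, homonyms and Cangjie code.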
Press Ctrl-C on the command line or send a signal to the process to stop the
bot.
"""
import ast
import logging
import re
from urllib.parse import quote
from urllib.request import urlopen
from uuid import uuid4

import pymysql
from telegram import InlineQueryResultArticle, InlineQueryResultAudio, ParseMode, InputTextMessageContent
from telegram.ext import Updater, InlineQueryHandler, CommandHandler
from telegram.ext.dispatcher import run_async
# Enable logging: INFO and above, with timestamp, logger name and level.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger used by the handlers below.
logger = logging.getLogger(__name__)
# Regular expressions applied to the lexicon result page after its newlines
# have been removed.  All of them are raw strings so that ``\s`` and ``\/``
# reach the regex engine untouched and do not trigger Python's
# "invalid escape sequence" warning.

# Despite its name this matches one HTML tag (``<...>``); it is substituted
# with ``re_nothing`` to strip markup from a fragment.
re_square_brackets = r"<[^<]*?>"
re_nothing = r''
# Cangjie code row.
# NOTE(review): CantonDict slices the match with [16:-11], which implies six
# characters before ``</td>`` (an ``&nbsp;`` entity?) rather than the single
# character written here -- confirm against the live page.
re_cangjie = r'<th>倉頡碼</th>\s*<td>.*? </td>'
# Brief explanation, up to the next line break tag.
re_brief_explain = r'略說:</span>\s*.*?<br />'
# Relative paths of the pronunciation audio files.
re_audios = r'sound/.*?Mp3'
# Word-example container.
re_examples = r'<div style="overflow-y: auto; overflow-x: hidden; width: 100%; height: 40px">\s*.*?</div>'
# Pronunciation note cell.
re_can_note = r'<td class="char_can_note">\s*.*?</td>'
# English part-of-speech and meaning cells.
# NOTE(review): ``>*`` means "zero or more '>'"; ``\s*`` may have been
# intended.  Left unchanged because it still matches the cell.
re_eng_pos = r'<td class="char_eng_pos">*.*?</td>'
re_eng_meaning = r'<td class="char_eng_meaning">*.*?</td>'
# Homophone cell.
re_can_phon = r'<td class="char_can_phon" colspan="3" rowspan="2">\s*.*?<\/td>'
# Settings for the MySQL database that caches scraped entries.  Host, user,
# password and database name are blank here and must be filled in before use.
# NOTE(review): connect_timeout only bounds the initial connect; 1209600 s
# (14 days) looks like it was meant as a keep-alive -- confirm the intent.
_DB_SETTINGS = {
    'host': '',
    'user': '',
    'password': '',
    'connect_timeout': 1209600,
    'db': '',
    'use_unicode': True,
    'charset': "utf8",
}

# Single module-level connection used by the inline query handler.
connection = pymysql.connect(**_DB_SETTINGS)
class CantonDict:
    """Dictionary data for a single character.

    An instance is built in one of two ways, chosen by the constructor
    argument's type:

    * ``str``   -- the lexicon search page is fetched and scraped;
    * ``tuple`` -- the values come from a cached database row in the column
      order word, url, examples, brief_explains, audios, cangjie_pattern,
      char_can_notes, eng_meanings, word_example, homonyms.

    Every attribute exists after construction.  When the page lookup fails
    they keep their empty defaults, so ``url`` being empty signals "nothing
    found".
    """

    @staticmethod
    def _sub_until_stable(pattern, text):
        """Delete matches of *pattern* from *text* until nothing changes."""
        while True:
            reduced = re.sub(pattern, re_nothing, text)
            if reduced == text:
                return text
            text = reduced

    @staticmethod
    def _strip_tags(fragment):
        """Return *fragment* with all HTML tags removed."""
        return CantonDict._sub_until_stable(re_square_brackets, fragment)

    @staticmethod
    def _parse_stored_list(text):
        """Rebuild a list from its cached form (``str(list)`` minus brackets).

        The cached text is a comma-separated sequence of Python string
        literals, so wrapping it in brackets and evaluating it as a literal
        restores the items exactly, including items that contain commas.
        Text that is not valid literal syntax falls back to a plain comma
        split with surrounding quotes and whitespace removed.
        """
        if not text:
            return []
        try:
            items = ast.literal_eval('[' + text + ']')
        except (ValueError, SyntaxError):
            return [piece.strip().strip('\'"') for piece in text.split(',')]
        return [str(item) for item in items]

    def __init__str(self, word):
        """Fill the instance by scraping the lexicon search page for *word*.

        Returns without touching the attributes when *word* cannot be encoded
        as UTF-8 or the page reports that the word is not recorded.  Network
        errors raised by ``urlopen`` propagate to the caller.
        """
        try:
            encoded_word = word.encode('utf8')
        except UnicodeEncodeError:  # no this word in utf8
            print("no this word in utf8")
            return

        # Percent-encode the UTF-8 bytes of the word for the query string.
        url = 'http://humanum.arts.cuhk.edu.hk/Lexis/lexi-mf/search.php?word={0}'.format(
            quote(word, safe=''))
        with urlopen(url) as response:
            contents = response.read().decode('utf-8')

        # if no this word
        if "字未收錄於本資料庫" in contents:
            print("This word is not recorded.")
            return

        # The patterns assume the page is a single line.
        page = contents.replace("\r", "").replace("\n", "")

        # Cangjie code.  The slice offsets are kept from the original
        # implementation; a missing row yields an empty code instead of an
        # IndexError.
        cangjie_match = re.search(re_cangjie, page)
        if cangjie_match:
            cangjie_pattern = cangjie_match.group(0).replace(" ", "")[16:-11]
        else:
            cangjie_pattern = ''

        # Brief explanations, stored as UTF-8 bytes.
        brief_explains = [
            self._strip_tags(explanation).encode().strip()
            for explanation in re.findall(re_brief_explain, page)
        ]

        # Pronunciation audio paths.
        pronunciation = re.findall(re_audios, page)

        # Word examples: raw HTML matches plus their tag-free text.
        word_example = re.findall(re_examples, page)
        examples = [self._strip_tags(example).strip() for example in word_example]

        # Pronunciation notes: drop tags, then the "\tphonetic...;" residue.
        notes = []
        for cell in re.findall(re_can_note, page):
            cell = self._strip_tags(cell)
            cell = self._sub_until_stable(r'\tphonetic.*?;', cell)
            notes.append(cell.strip())

        # English meanings.  zip() pairs only what both lists provide, so a
        # page with unequal cell counts no longer raises IndexError.
        eng_meanings = [
            "({0}) {1}".format(self._strip_tags(pos), self._strip_tags(meaning))
            for pos, meaning in zip(re.findall(re_eng_pos, page),
                                    re.findall(re_eng_meaning, page))
        ]

        # Homophones.
        homonyms_list = [
            self._strip_tags(row).strip() for row in re.findall(re_can_phon, page)
        ]

        self.homonyms = homonyms_list
        self.url = url
        self.examples = examples
        self.brief_explains = brief_explains
        self.audios = pronunciation
        self.cangjie_pattern = cangjie_pattern
        self.char_can_notes = notes
        self.eng_meanings = '\n'.join(eng_meanings)
        self.word_example = word_example
        # Kept as UTF-8 bytes on this path, as before.
        self.word = encoded_word

    def __init__(self, input):
        """Create an entry from a word (``str``) or a cached row (``tuple``).

        The parameter is named ``input`` (shadowing the builtin) to keep the
        existing call signature.  Any other argument type leaves the instance
        with its empty defaults.
        """
        # Defaults so every attribute exists even when the lookup fails.
        self.word = input
        self.url = ''
        self.examples = []
        self.brief_explains = []
        self.audios = []
        self.cangjie_pattern = ''
        self.char_can_notes = []
        self.eng_meanings = ''
        self.word_example = []
        self.homonyms = []

        if isinstance(input, str):
            self.__init__str(input)
        elif isinstance(input, tuple):
            self.word = input[0]
            self.url = input[1]
            self.examples = self._parse_stored_list(input[2])
            # Left as the stored text; it is the repr of the bytes values.
            self.brief_explains = input[3]
            self.audios = self._parse_stored_list(input[4])
            self.cangjie_pattern = input[5]
            self.char_can_notes = self._parse_stored_list(input[6])
            self.eng_meanings = input[7]
            self.word_example = self._parse_stored_list(input[8])
            self.homonyms = self._parse_stored_list(input[9])


# Define a few command handlers. These usually take the two arguments bot and
# update. Error handlers also receive the raised TelegramError object in error.
@run_async
def start(bot, update):
    """Answer the /start command with a pointer to inline mode."""
    usage_hint = '請使用行內模式\nPlease use inline mode\nhttps://github.com/tlyeung/cantondict'
    update.message.reply_text(usage_hint)
@run_async
def help(bot, update):
    """Answer the /help command with a pointer to inline mode."""
    usage_hint = '請使用行內模式\nPlease use inline mode\nhttps://github.com/tlyeung/cantondict'
    update.message.reply_text(usage_hint)
def escape_markdown(text):
    """Helper function to escape telegram markup symbols.

    Prefixes each ``*``, ``_``, `````, and ``[`` in *text* with a backslash
    and returns the resulting string.

    This is a plain synchronous helper: it is deliberately not wrapped in
    ``run_async``, because that decorator would hand the call to the
    dispatcher's worker pool and give the caller a promise object instead of
    the escaped text.
    """
    # Raw string so the backslashes reach the regex engine unchanged.
    escape_chars = r'\*_`\['
    return re.sub(r'([%s])' % escape_chars, r'\\\1', text)
def _plain(text):
    """Return *text* without apostrophes and surrounding whitespace.

    Values read back from the cache may still carry the quotes and leading
    spaces of their stored list form; this makes them presentable.
    """
    return text.replace("'", "").strip()


def _flatten(values):
    """Serialise a list for the cache: its ``str()`` form minus brackets."""
    return str(values).replace("[", "").replace("]", "")


def _brief_explanation_text(brief_explains):
    """Return the first brief explanation as text, or '' if unavailable.

    *brief_explains* is either a list of UTF-8 ``bytes`` (freshly scraped) or
    the cached text form of such a list.
    """
    if isinstance(brief_explains, str):
        try:
            brief_explains = ast.literal_eval(brief_explains.replace('\\\\', '\\'))
        except (ValueError, SyntaxError):
            logger.warning('Cannot parse cached brief_explains %r', brief_explains)
            return ''
    # Several cached explanations evaluate to a tuple; show the first one,
    # matching what is shown for a freshly scraped entry.
    if isinstance(brief_explains, (list, tuple)):
        if not brief_explains:
            return ''
        brief_explains = brief_explains[0]
    if isinstance(brief_explains, bytes):
        return brief_explains.decode('utf8')
    return str(brief_explains)


def _lookup_entry(char):
    """Return the CantonDict for *char*, or None when nothing was found.

    The database cache is consulted first; on a miss the entry is scraped
    and, if the lookup produced data, written to the cache.
    """
    # NOTE(review): `connection` is shared by every @run_async worker thread;
    # PyMySQL connections are not meant for concurrent use -- consider a lock
    # or a connection per call.
    # Re-establish the connection if the server dropped it while idle.
    connection.ping(reconnect=True)
    with connection.cursor() as cursor:
        cursor.execute("select * from CantonDict where word=%s limit 1", (char,))
        row = cursor.fetchone()
    if row:
        return CantonDict(row)

    entry = CantonDict(char)
    # A failed lookup leaves the entry without a usable url; do not cache it.
    if not getattr(entry, 'url', None):
        return None

    sql = ("INSERT INTO `CantonDict`"
           " (`word`, `url`,`examples`,`brief_explains`,`audios`"
           ",`cangjie_pattern`,`char_can_notes`,`eng_meanings`,`word_example`,`homonyms`)"
           " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    with connection.cursor() as cursor:
        cursor.execute(sql,
                       (char,
                        entry.url,
                        _flatten(entry.examples),
                        _flatten(entry.brief_explains),
                        _flatten(entry.audios),
                        entry.cangjie_pattern,
                        _flatten(entry.char_can_notes),
                        entry.eng_meanings,
                        _flatten(entry.word_example),
                        _flatten(entry.homonyms)))
    connection.commit()
    return entry


def _build_results(char, entry):
    """Turn *entry* into the list of inline results offered for *char*.

    Order: pronunciation audio, examples, brief explanation, English
    meaning, homonyms, Cangjie code, link to the source page.
    """
    results = []
    audios = [path.strip() for path in entry.audios]
    notes = [_plain(note) for note in entry.char_can_notes]
    examples = [_plain(example) for example in entry.examples]
    homonyms = [_plain(homonym) for homonym in entry.homonyms]

    # One audio result per pronunciation, when notes line up with audios.
    if len(audios) == len(notes):
        for index, (audio, note) in enumerate(zip(audios, notes)):
            if not audio:
                continue
            # There may be fewer examples than pronunciations.
            example = examples[index] if index < len(examples) else ''
            if note and example:
                title = "「{0}」讀音:{1},{2}".format(char, note, example)
            else:
                title = "「{0}」讀音:{1}{2}".format(char, note, example)
            results.append(InlineQueryResultAudio(
                id=uuid4(),
                title=title,
                audio_url='http://humanum.arts.cuhk.edu.hk/Lexis/lexi-mf/{0}'.format(audio)))

    # Examples, only when at least one of them is non-empty.
    if entry.word_example and any(examples):
        results.append(InlineQueryResultArticle(
            id=uuid4(),
            title="「{0}」例子".format(char),
            input_message_content=InputTextMessageContent(
                "「{0}」例子\n{1}".format(char, "\n".join(examples)))))

    # Brief explanation; cached and fresh entries get the same prefix.
    if entry.brief_explains:
        explanation = _brief_explanation_text(entry.brief_explains)
        if explanation:
            results.append(InlineQueryResultArticle(
                id=uuid4(),
                title="「{0}」略說".format(char),
                input_message_content=InputTextMessageContent(
                    "「{0}」{1}".format(char, explanation))))

    if entry.eng_meanings:
        results.append(InlineQueryResultArticle(
            id=uuid4(),
            title="「{0}」English meaning".format(char),
            input_message_content=InputTextMessageContent(
                "「{0}」English meaning\n{1}".format(char, entry.eng_meanings))))

    # Homonyms, labelled with the syllable taken from the audio file name
    # ("sound/<syllable>.Mp3").
    if len(audios) == len(homonyms):
        for audio, homonym in zip(audios, homonyms):
            if not audio:
                continue
            syllable = audio[6:-4]
            results.append(InlineQueryResultArticle(
                id=uuid4(),
                title="「{0}」({1})同音字".format(char, syllable),
                input_message_content=InputTextMessageContent(
                    "「{0}」{1} 同音字\n{2}".format(char, syllable, homonym))))

    if entry.cangjie_pattern:
        results.append(InlineQueryResultArticle(
            id=uuid4(),
            title="「{0}」倉頡碼".format(char),
            input_message_content=InputTextMessageContent(
                "「{0}」倉頡碼:{1}".format(char, entry.cangjie_pattern))))

    if entry.url:
        results.append(InlineQueryResultArticle(
            id=uuid4(),
            title="「{0}」更多".format(char),
            input_message_content=InputTextMessageContent(entry.url)))
    return results


@run_async
def inline_query(bot, update):
    """Answer an inline query with dictionary data for its first character.

    Only the first character of the query text is looked up.  An empty query,
    or a character for which nothing is found, is answered with an empty
    result list.
    """
    word = update.inline_query.query
    results = list()
    if word:
        char = word[0]
        entry = _lookup_entry(char)
        if entry is not None:
            results = _build_results(char, entry)
    update.inline_query.answer(results)
@run_async
def error_handle(bot, update, error):
    """Log the update together with the error it caused, at WARNING level."""
    report = 'Update "%s" caused error "%s"' % (update, error)
    logger.warning(report)
@run_async
def process(word):
    """Placeholder; currently does nothing with *word*."""
    return None
def main():
    """Register the handlers and poll Telegram until the process is stopped."""
    # The bot token is blank here and must be filled in.
    updater = Updater("")
    dispatcher = updater.dispatcher

    # /start and /help commands.
    for command, callback in (("start", start), ("help", help)):
        dispatcher.add_handler(CommandHandler(command, callback))

    # Inline queries carry the actual dictionary lookups.
    dispatcher.add_handler(InlineQueryHandler(inline_query))

    # Log every error raised while handling an update.
    dispatcher.add_error_handler(error_handle)

    # start_polling() returns immediately; idle() then blocks until Ctrl-C or
    # a SIGINT/SIGTERM/SIGABRT so the bot can shut down gracefully.
    updater.start_polling()
    updater.idle()


if __name__ == '__main__':
    main()