@@ -50,14 +50,12 @@ def process(wikitext):
50
50
words = collections .OrderedDict ()
51
51
52
52
def add_word (word ):
53
- if word .startswith ("形容" ):
54
- return
55
- for garbage in ("、" , "[" , "]" , "…" ):
53
+ for garbage in ("[" , "]" , "…" , ":" , ":" , ")" , ")" , '"' , "“" , "”" , "-{" , "}-" , "简称" , "簡稱" ):
56
54
word = word .replace (garbage , "" )
57
55
words [word .strip ()] = None
58
56
59
57
def add_words (word ):
60
- for word_separator in ("、" , "/" , "|" , "," , "。" ):
58
+ for word_separator in ("、" , "/" , "|" , "," , "。" , "?" , "?" , "(" , "(" ):
61
59
if word_separator in word :
62
60
for w in word .split (word_separator ):
63
61
# recursively resolve
@@ -66,14 +64,29 @@ def add_words(word):
66
64
else :
67
65
add_word (word )
68
66
67
def iter_bolds(line):
    """Yield the text of every '''bold''' span found in one line of wikitext.

    A bold span is the text between an opening ``'''`` marker and the next
    ``'''`` marker. Spans are yielded in order of appearance, without the
    markers; an empty span (``''''''``) yields an empty string.

    Args:
        line: A single line of wikitext.

    Yields:
        The text enclosed by each pair of ``'''`` markers.

    Raises:
        ValueError: If an opening ``'''`` has no matching closing ``'''``.
            Spans that were closed properly before the unmatched marker are
            still yielded before the error is raised.
    """
    rest = line
    while "'''" in rest:
        # Discard everything up to and including the opening marker; the loop
        # condition guarantees the marker is present, so only the closing
        # marker needs to be checked.
        _, _, rest = rest.partition("'''")
        bold, closing, rest = rest.partition("'''")
        if not closing:
            # Raise explicitly instead of using `assert`: assertions are
            # removed under `python -O`, which would let the unterminated
            # tail be yielded as if it were a bold span.
            raise ValueError("Unclosed ''' in line: " + line)
        yield bold
74
+
69
75
for line in wikitext .split ("\n " ):
70
- if line .startswith ("*" ):
71
- # Lists
72
- for table_separator in (":" , ":" ):
73
- if table_separator in line :
74
- word = line .split (table_separator )[0 ].strip ("*" ).strip ()
75
- add_words (word )
76
- break
76
+ if not line .startswith ("*" ):
77
+ continue
78
+ # Lists
79
+ line = line .strip ("*" ).strip ()
80
+ pre_colon , sep , post_colon = line .partition ("''':" )
81
+ if not sep :
82
+ pre_colon , sep , post_colon = line .partition ("''':" )
83
+ for bold in iter_bolds (pre_colon + sep ):
84
+ # Add bold words before colon
85
+ add_words (bold )
86
+ for bold in iter_bolds (post_colon ):
87
+ # Add bold words after colon (or line w/o colon), skipping the origin of abbreviation (length probably <= 2)
88
+ if len (bold ) > 2 :
89
+ add_words (bold )
77
90
78
91
return words
79
92
0 commit comments