Skip to content

Commit 8d031d5

Browse files
Rongronggg9 authored and felixonmars committed
web-slang: Refine list extracting, word separating & trimming
This aims to extract more words from the page and make the result more precise.
1 parent a76ad0a commit 8d031d5

File tree

1 file changed

+24
-11
lines changed

1 file changed

+24
-11
lines changed

zhwiki-web-slang.py

+24-11
Original file line number | Diff line number | Diff line change
@@ -50,14 +50,12 @@ def process(wikitext):
5050
words = collections.OrderedDict()
5151

5252
def add_word(word):
53-
if word.startswith("形容"):
54-
return
55-
for garbage in ("、", "[", "]", "…"):
53+
for garbage in ("[", "]", "…", ":", ":", ")", ")", '"', "“", "”", "-{", "}-", "简称", "簡稱"):
5654
word = word.replace(garbage, "")
5755
words[word.strip()] = None
5856

5957
def add_words(word):
60-
for word_separator in ("、", "/", "|", ",", "。"):
58+
for word_separator in ("、", "/", "|", ",", "。", "?", "?", "(", "("):
6159
if word_separator in word:
6260
for w in word.split(word_separator):
6361
# recursively resolve
@@ -66,14 +64,29 @@ def add_words(word):
6664
else:
6765
add_word(word)
6866

67+
def iter_bolds(line):
68+
line_bak = line
69+
while "'''" in line:
70+
_, sep1, line = line.partition("'''")
71+
bold, sep2, line = line.partition("'''")
72+
assert sep1 and sep2, ValueError("Unclosed ''' in line: " + line_bak)
73+
yield bold
74+
6975
for line in wikitext.split("\n"):
70-
if line.startswith("*"):
71-
# Lists
72-
for table_separator in (":", ":"):
73-
if table_separator in line:
74-
word = line.split(table_separator)[0].strip("*").strip()
75-
add_words(word)
76-
break
76+
if not line.startswith("*"):
77+
continue
78+
# Lists
79+
line = line.strip("*").strip()
80+
pre_colon, sep, post_colon = line.partition("''':")
81+
if not sep:
82+
pre_colon, sep, post_colon = line.partition("''':")
83+
for bold in iter_bolds(pre_colon + sep):
84+
# Add bold words before colon
85+
add_words(bold)
86+
for bold in iter_bolds(post_colon):
87+
# Add bold words after colon (or line w/o colon), skipping the origin of abbreviation (length probably <= 2)
88+
if len(bold) > 2:
89+
add_words(bold)
7790

7891
return words
7992

0 commit comments

Comments
 (0)