Skip to content
This repository was archived by the owner on Mar 9, 2023. It is now read-only.

Commit b9a5fcf

Browse files
authored
Merge pull request #35 from WorksApplications/fix-bug
fix bugs & add test in .travis.yml
2 parents 28a5fdc + 28a294a commit b9a5fcf

19 files changed

+148
-108
lines changed

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,8 @@ python:
66
- '3.7'
77
install:
88
- pip install flake8
9+
before_script:
10+
- mv .travis/system.dic.test tests/resources/system.dic
911
script:
1012
- ./scripts/format.sh
13+
- python -m unittest discover tests

.travis/system.dic.test

6.73 KB
Binary file not shown.

README.md

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,15 @@ tokenizer_obj.tokenize(mode, "シュミレーション")[0].normalized_form()
103103

104104
### Code format
105105

106-
You have to run `./scripts/format.sh` and check if your code is in rule before PR.
107-
This code formatting script will be integrated to CI system later. `flake8` is required.
106+
You can use `./scripts/format.sh` to check whether your code follows the formatting rules. `flake8` is required.
108107

109108
### Test
110109

111-
You have to run `./script/test.sh` and check if not your change cause regression.
112-
This test script will be integrated to CI system later. Current test assumes `sudachi-dictionary-20190531-core.dic`
113-
is in `resources` directory as `system.dic`. We will change it to special dictionary for test like Sudachi (Java) in all tests.
114-
Some of the tests use `system.dic` built by Sudachi. This is an example command to get `system.dic` for test
115-
```bash
116-
git clone https://github.com/WorksApplications/Sudachi.git
117-
cd Sudahi
118-
mvn test
119-
cp target/test-classes/system.dic ${SudachiPy}/tests/resources/
120-
```
110+
You can use `./scripts/test.sh` to check that your change does not cause a regression.
111+
The current tests assume that:
112+
113+
- `sudachi-dictionary-20190531-core.dic` is in the `resources` directory as `system.dic`.
114+
115+
- the `system.dic` for tests is in the `tests/resources` directory.
116+
117+
The `system.dic` for tests is provided as `.travis/system.dic.test`. Copy it to `tests/resources/system.dic` before running the tests.

scripts/test.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22

33
# print error message only when it fails
44
# python unittest prints messages to stderr even when it succeeds
5+
# You need to prepare system.dic in resources and tests/resources
6+
# see README
57

68
cd $(dirname $0)
7-
RES=`cd ..; python -m unittest discover tests 2>&1`
9+
RES=`cd ..; python -m unittest discover tests -p '*test*.py' 2>&1`
810
RES_TAIL=`echo "$RES" | tail -1`
911
if [[ $RES_TAIL != "OK" ]]; then
10-
echo "$RES"
12+
>&2 echo "$RES"
1113
fi

sudachipy/dictionarylib/charactercategory.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def read_character_definition(self, char_def=None):
4848
cols = re.split(r"\s+", line)
4949
if len(cols) < 2:
5050
f.close()
51-
raise AttributeError("invalid format at line ", i)
51+
raise AttributeError("invalid format at line {}".format(i))
5252
if not re.match("0x", cols[0]):
5353
continue
5454
range_ = self.Range()
@@ -58,14 +58,14 @@ def read_character_definition(self, char_def=None):
5858
range_.high = int(r[1], 16)
5959
if range_.low > range_.high:
6060
f.close()
61-
raise AttributeError("invalid range at line ", i)
61+
raise AttributeError("invalid range at line {}".format(i))
6262
for j in range(1, len(cols)):
6363
if re.match("#", cols[j]) or cols[j] is '':
6464
break
6565
type_ = categorytype.CategoryType.get(cols[j])
6666
if type_ is None:
6767
f.close()
68-
raise AttributeError(cols[j], " is invalid type at line ", i)
68+
raise AttributeError("{} is invalid type at line {}".format(cols[j], i))
6969
range_.categories.add(type_)
7070
self.range_list.append(range_)
7171
default_range = self.Range()

sudachipy/dictionarylib/dictionaryheader.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,16 @@
22

33

44
class DictionaryHeader:
5-
description_size = 256
6-
storage_size = 8 + 8 + description_size
5+
__description_size = 256
6+
storage_size = 8 + 8 + __description_size
77

88
def __init__(self, bytes_, offset):
99
self.version, self.create_time = struct.unpack_from("<2Q", bytes_, offset)
1010
offset += 16
11-
self.description = bytes_[offset:offset+self.description_size].decode("utf-8")
11+
12+
len_ = 0
13+
while len_ < self.__description_size:
14+
if bytes_[offset + len_] == 0:
15+
break
16+
len_ += 1
17+
self.description = bytes_[offset:offset + len_].decode("utf-8")

sudachipy/dictionarylib/doublearraylexicon.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ class DoubleArrayLexicon(lexicon.Lexicon):
1313
def __init__(self, bytes_, offset):
1414
self.trie = dartsclone.doublearray.DoubleArray()
1515
bytes_.seek(offset)
16-
self.size = int.from_bytes(bytes_.read(4), 'little')
16+
size = int.from_bytes(bytes_.read(4), 'little')
1717
offset += 4
1818
bytes_.seek(offset)
19-
array = struct.unpack_from("<{}I".format(self.size), bytes_, offset)
20-
self.trie.set_array(array, self.size)
19+
array = struct.unpack_from("<{}I".format(size), bytes_, offset)
20+
self.trie.set_array(array, size)
2121
offset += self.trie.total_size()
2222

2323
self.word_id_table = wordidtable.WordIdTable(bytes_, offset)
@@ -49,3 +49,6 @@ def get_cost(self, word_id):
4949

5050
def get_word_info(self, word_id):
5151
return self.word_infos.get_word_info(word_id)
52+
53+
def size(self):
54+
return self.word_params.size

sudachipy/dictionarylib/wordinfolist.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,49 +9,50 @@ def __init__(self, bytes_, offset, word_size):
99
self.offset = offset
1010

1111
def get_word_info(self, word_id):
12+
orig_pos = self.bytes.tell()
1213
index = self.word_id_to_offset(word_id)
13-
14-
surface = self.buffer_to_string(index)
15-
index += 1 + 2 * len(surface)
16-
head_word_length = self.bytes[index]
17-
index += 1
18-
pos_id = int.from_bytes(self.bytes[index:index+2], 'little')
19-
index += 2
20-
normalized_form = self.buffer_to_string(index)
21-
index += 1 + 2 * len(normalized_form)
14+
self.bytes.seek(index)
15+
surface = self.buffer_to_string()
16+
head_word_length = self.buffer_to_string_length()
17+
pos_id = int.from_bytes(self.bytes.read(2), 'little')
18+
normalized_form = self.buffer_to_string()
2219
if not normalized_form:
2320
normalized_form = surface
24-
dictionary_form_word_id = int.from_bytes(self.bytes[index:index+4], "little", signed=True)
25-
index += 4
26-
reading_form = self.buffer_to_string(index)
27-
index += 1 + 2 * len(reading_form)
28-
a_unit_split = self.buffer_to_int_array(index)
29-
index += 1 + 4 * len(a_unit_split)
30-
b_unit_split = self.buffer_to_int_array(index)
31-
index += 1 + 4 * len(b_unit_split)
32-
word_structure = self.buffer_to_int_array(index)
21+
dictionary_form_word_id = int.from_bytes(self.bytes.read(4), 'little', signed=True)
22+
reading_form = self.buffer_to_string()
23+
a_unit_split = self.buffer_to_int_array()
24+
b_unit_split = self.buffer_to_int_array()
25+
word_structure = self.buffer_to_int_array()
3326

3427
dictionary_form = surface
3528
if dictionary_form_word_id >= 0 and dictionary_form_word_id != word_id:
3629
wi = self.get_word_info(dictionary_form_word_id)
3730
dictionary_form = wi.surface
3831

32+
self.bytes.seek(orig_pos)
33+
3934
return wordinfo.WordInfo(surface, head_word_length, pos_id, normalized_form,
4035
dictionary_form_word_id, dictionary_form, reading_form,
4136
a_unit_split, b_unit_split, word_structure)
4237

4338
def word_id_to_offset(self, word_id):
4439
i = self.offset + 4 * word_id
45-
return int.from_bytes(self.bytes[i:i+4], "little", signed=False)
46-
47-
def buffer_to_string(self, offset):
48-
length = self.bytes[offset]
49-
offset += 1
50-
end = offset + 2 * length
51-
return self.bytes[offset:end].decode("utf-16-le")
52-
53-
def buffer_to_int_array(self, offset):
54-
length = self.bytes[offset]
55-
offset += 1
56-
array = struct.unpack_from("<{}I".format(length), self.bytes, offset)
40+
return int.from_bytes(self.bytes[i:i+4], 'little', signed=False)
41+
42+
def buffer_to_string_length(self):
43+
length = self.bytes.read_byte()
44+
if length < 128:
45+
return length
46+
low = self.bytes.read_byte()
47+
return ((length & 0x7F) << 8) | low
48+
49+
def buffer_to_string(self):
50+
length = self.buffer_to_string_length()
51+
return self.bytes.read(2 * length).decode('utf-16-le')
52+
53+
def buffer_to_int_array(self):
54+
length = self.bytes.read_byte()
55+
array = []
56+
for _ in range(length):
57+
array.append(int.from_bytes(self.bytes.read(4), 'little', signed=True))
5758
return array

sudachipy/plugin/input_text/default_input_text_plugin.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,15 @@ def read_rewrite_lists(self, rewrite_def):
6464
with open(rewrite_def, "r", encoding="utf-8") as f:
6565
for i, line in enumerate(f):
6666
line = line.strip()
67-
if not line or line.startswith("#"):
67+
if (not line) or line.startswith("#"):
6868
continue
6969
cols = line.split()
7070

7171
# ignored normalize list
7272
if len(cols) == 1:
7373
key = cols[0]
74+
if len(key) != 1:
75+
raise RuntimeError("{} is not character at line {}".format(key, i))
7476
self.ignore_normalize_set.add(key)
7577
# replace char list
7678
elif len(cols) == 2:

sudachipy/utf8inputtextbuilder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def build(self):
5252
j = 0
5353
for i in range(len(self.modified_text)):
5454
# 注: サロゲートペア文字は考慮していない
55-
for k in range(self.utf8_byte_length(ord(self.modified_text[i]))):
55+
for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))):
5656
byte_indexes[j] = i
5757
offsets[j] = self.text_offsets[i]
5858
j += 1

0 commit comments

Comments
 (0)