Skip to content
This repository was archived by the owner on Mar 9, 2023. It is now read-only.

Commit 620f7c3

Browse files
authored
Cython based optimization (#123)
* Remove unnecessary deep copy * Add lru_cache on get_word_info * Add lru_cache to get_word_info This seems to be a small speedup. * Basic Cythonization Unlike the other branch, the tests pass on this one. Benchmark time went down by a third compared to the previous commit. I'm not sure the _c functions are necessary here - I think that's what cpdef functions are for, but I had difficulty getting them working. Will need to give that another look. * Use cpdef functions Didn't have any issues this time, and it's cleaner with no clear performance difference. * Move build_lattice to Cython, intern some slow parts This should cut execution time by roughly 25% compared to the last commit. * Don't use deepcopy This is not an appropriate use of deepcopy and it's slow. * Add cython to setup_requires * Fix setup.py * Make INHIBITED_CONNECTION literal Minor speed boost. * Bring the matrix into the lattice building This provides a notable speedup. * Various cythonizations Improvements are relatively minor compared to previous commit, but there are a few seconds of speedup. * Inline function for small speed boost * Change import order, make lru cache size explicit Maybe this will make Travis happy? * Add a build command * Use INT_MAX * Remove comment Missed this before, this is fine.
1 parent 4d50586 commit 620f7c3

File tree

14 files changed

+148
-84
lines changed

14 files changed

+148
-84
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ python:
66
- '3.7'
77
install:
88
- pip install flake8 flake8-import-order flake8-builtins && pip install -r requirements.txt
9+
- python setup.py build_ext --inplace
910
before_script:
1011
- cp .travis/system.dic.test tests/resources/system.dic && cp .travis/user.dic.test tests/resources/user.dic
1112
script:

setup.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,17 @@
1414

1515
from setuptools import setup, find_packages
1616

17+
from distutils.extension import Extension
18+
19+
extensions = [
20+
Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
21+
Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
22+
Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
23+
]
24+
1725
setup(name="SudachiPy",
1826
use_scm_version=True,
19-
setup_requires=['setuptools_scm'],
27+
setup_requires=['setuptools_scm', 'cython'],
2028
description="Python version of Sudachi, the Japanese Morphological Analyzer",
2129
long_description=open('README.md', encoding='utf-8').read(),
2230
long_description_content_type="text/markdown",
@@ -33,4 +41,5 @@
3341
"sortedcontainers~=2.1.0",
3442
'dartsclone~=0.9.0',
3543
],
44+
ext_modules=extensions,
3645
)

sudachipy/dictionarylib/lexiconset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from functools import lru_cache
1516
from typing import List
1617

1718
from .lexicon import Lexicon
@@ -57,6 +58,7 @@ def get_cost(self, word_id: int) -> int:
5758
return self.lexicons[self.get_dictionary_id(word_id)]\
5859
.get_cost(self.get_word_id1(word_id))
5960

61+
@lru_cache(1024)
6062
def get_word_info(self, word_id: int) -> 'WordInfo': # noqa: F821
6163
dic_id = self.get_dictionary_id(word_id)
6264
winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))

sudachipy/dictionarylib/wordinfolist.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import struct
16+
from functools import lru_cache
1617

1718
from .wordinfo import WordInfo
1819

@@ -23,6 +24,7 @@ def __init__(self, bytes_, offset, word_size):
2324
self.offset = offset
2425
self._word_size = word_size
2526

27+
@lru_cache(2048)
2628
def get_word_info(self, word_id):
2729
orig_pos = self.bytes.tell()
2830
index = self.word_id_to_offset(word_id)

sudachipy/lattice.pxd

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from .latticenode cimport LatticeNode
2+
3+
cdef extern from "limits.h":
4+
cdef int INT_MAX
5+
6+
cdef class Lattice:
7+
8+
cdef int size
9+
cdef int capacity
10+
cdef LatticeNode eos_node
11+
12+
cdef list end_lists
13+
cdef object grammar
14+
cdef object eos_params
15+
cdef const short[:,:] connect_costs
16+
17+
cpdef void resize(self, int size)
18+
cpdef void insert(self, int begin, int end, LatticeNode node)
19+
cdef void connect_node(self, LatticeNode r_node)
20+
cdef void connect_eos_node(self)
Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,26 @@
1515
from typing import List, Optional
1616

1717
from .dictionarylib.grammar import Grammar
18-
from .latticenode import LatticeNode
18+
from .latticenode cimport LatticeNode
1919

20+
cdef class Lattice:
2021

21-
class Lattice:
22+
def __init__(self, grammar: Grammar):
23+
self.size = 0
24+
self.capacity = 0
2225

23-
size = 0
24-
capacity = 0
25-
eos_node = None
2626

27-
def __init__(self, grammar: Grammar):
2827
self.end_lists = []
2928
self.grammar = grammar
3029
self.eos_params = grammar.get_eos_parameter()
31-
bos_node = LatticeNode()
30+
cdef LatticeNode bos_node = LatticeNode()
3231
bos_params = grammar.get_bos_parameter()
3332
bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
3433
bos_node.is_connected_to_bos = True
3534
self.end_lists.append([bos_node])
35+
self.connect_costs = self.grammar._matrix_view
3636

37-
def resize(self, size: int) -> None:
37+
cpdef void resize(self, int size):
3838
if size > self.capacity:
3939
self.expand(size)
4040
self.size = size
@@ -69,7 +69,7 @@ def get_minumum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
6969
min_arg = node
7070
return min_arg
7171

72-
def insert(self, begin: int, end: int, node: LatticeNode) -> None:
72+
cpdef void insert(self, int begin, int end, LatticeNode node):
7373
self.end_lists[end].append(node)
7474
node.begin = begin
7575
node.end = end
@@ -85,15 +85,20 @@ def create_node() -> LatticeNode:
8585
def has_previous_node(self, index: int) -> bool:
8686
return bool(self.end_lists[index])
8787

88-
def connect_node(self, r_node: LatticeNode) -> None:
88+
cdef void connect_node(self, LatticeNode r_node):
8989
begin = r_node.begin
90-
r_node.total_cost = float('inf')
90+
r_node.total_cost = INT_MAX
91+
92+
cdef LatticeNode l_node
93+
cdef int connect_cost
9194
for l_node in self.end_lists[begin]:
9295
if not l_node.is_connected_to_bos:
9396
continue
9497
# right_id and left_id look reversed, but it works ...
95-
connect_cost = self.grammar.get_connect_cost(l_node.right_id, r_node.left_id)
96-
if connect_cost == Grammar.INHIBITED_CONNECTION:
98+
connect_cost = self.connect_costs[l_node.right_id, r_node.left_id]
99+
100+
# 0x7fff == Grammar.INHIBITED_CONNECTION:
101+
if connect_cost == 0x7fff:
97102
continue
98103
cost = l_node.total_cost + connect_cost
99104
if cost < r_node.total_cost:
@@ -103,7 +108,7 @@ def connect_node(self, r_node: LatticeNode) -> None:
103108
r_node.is_connected_to_bos = r_node.best_previous_node is not None
104109
r_node.total_cost += r_node.cost
105110

106-
def connect_eos_node(self) -> None:
111+
cdef void connect_eos_node(self):
107112
self.connect_node(self.eos_node)
108113

109114
def get_best_path(self) -> List[LatticeNode]:

sudachipy/latticenode.pxd

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
cdef class LatticeNode:
2+
3+
cdef int begin
4+
cdef int end
5+
cdef int total_cost
6+
cdef int word_id
7+
cdef bint _is_oov
8+
cdef LatticeNode best_previous_node
9+
cdef bint is_connected_to_bos
10+
cdef object extra_word_info
11+
cdef object undefined_word_info
12+
cdef bint is_defined
13+
cdef object lexicon
14+
cdef int left_id
15+
cdef int right_id
16+
cdef int cost
17+
Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# cython: profile=True
2+
13
# Copyright (c) 2019 Works Applications Co., Ltd.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,27 +17,22 @@
1517
from .dictionarylib.wordinfo import WordInfo
1618

1719
__NULL_SURFACE = '(null)'
18-
UNK = WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
19-
__NULL_SURFACE, __NULL_SURFACE, [], [], [])
20-
20+
UNK =\
21+
WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
22+
__NULL_SURFACE, __NULL_SURFACE, [], [], [])
2123

22-
class LatticeNode:
23-
24-
begin = 0
25-
end = 0
26-
total_cost = 0
27-
word_id = 0
28-
_is_oov = False
29-
best_previous_node = None
30-
is_connected_to_bos = None
31-
extra_word_info = None
32-
lexicon = None
33-
left_id = None
34-
right_id = None
35-
cost = None
24+
cdef class LatticeNode:
3625

3726
def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):
3827

28+
self.begin = 0
29+
self.end = 0
30+
self.word_id = 0
31+
self._is_oov = False
32+
self.best_previous_node = None
33+
self.is_connected_to_bos = False
34+
self.extra_word_info = None
35+
3936
self.is_defined = True
4037
if lexicon is left_id is right_id is cost is word_id is None:
4138
self.is_defined = False
@@ -54,9 +51,15 @@ def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
5451
def get_begin(self) -> int:
5552
return self.begin
5653

54+
def set_begin(self, begin) -> None:
55+
self.begin = begin
56+
5757
def get_end(self) -> int:
5858
return self.end
5959

60+
def set_end(self, end) -> None:
61+
self.end = end
62+
6063
def set_range(self, begin: int, end: int) -> None:
6164
self.begin = begin
6265
self.end = end

sudachipy/plugin/oov/mecab_oov_plugin.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def __init__(self):
3232

3333
class OOV:
3434
def __init__(self):
35-
self.left_id = None
36-
self.right_id = None
37-
self.cost = None
35+
self.left_id = -1
36+
self.right_id = -1
37+
self.cost = -1
3838
self.pos_id = None
3939

4040
def __init__(self, json_obj=None):

sudachipy/plugin/oov/oov_provider_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: b
3333
def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]:
3434
nodes = self.provide_oov(input_text, offset, has_other_words)
3535
for node in nodes:
36-
node.begin = offset
37-
node.end = offset + node.get_word_info().length()
36+
node.set_begin(offset)
37+
node.set_end(offset + node.get_word_info().length())
3838
return nodes
3939

4040
@staticmethod

0 commit comments

Comments
 (0)