Skip to content

Commit 94aa3fe

Browse files
committed
Use an STL map (libcpp.map) instead of array.array for unicodeindices
1 parent d8f08f6 commit 94aa3fe

File tree

3 files changed

+28
-32
lines changed

3 files changed

+28
-32
lines changed

src/includes.pxi

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
21
cimport cpython.unicode
3-
from cython.operator cimport preincrement as inc, dereference as deref
42
from libcpp.map cimport map
53
from libcpp.string cimport string as cpp_string
4+
from cython.operator cimport postincrement, dereference
65
from cpython.buffer cimport Py_buffer, PyBUF_SIMPLE, PyObject_CheckBuffer, \
76
PyObject_GetBuffer, PyBuffer_Release
8-
from cpython cimport array
97
from cpython.version cimport PY_MAJOR_VERSION
108

119

src/match.pxi

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ cdef class Match:
1818

1919
property lastgroup:
2020
def __get__(self):
21-
2221
if self._lastindex < 1:
2322
return None
2423
for name, n in self.re.groupindex.items():
@@ -105,7 +104,8 @@ cdef class Match:
105104
def groupdict(self):
106105
result = self._groupdict()
107106
if self.encoded:
108-
return {a: None if b is None else b.decode('utf8') for a, b in result.items()}
107+
return {a: None if b is None else b.decode('utf8')
108+
for a, b in result.items()}
109109
return result
110110

111111
def expand(self, object template):
@@ -264,12 +264,13 @@ cdef class Match:
264264

265265
cdef list _convert_spans(self, spans,
266266
char * cstring, int size, int * cpos, int * upos):
267-
positions = [x for x, _ in spans] + [y for _, y in spans]
268-
positions = array.array(b'l' if PY2 else 'l', sorted(set(positions)))
269-
posdict = dict(zip(
270-
positions,
271-
unicodeindices(positions, cstring, size, cpos, upos)))
272-
return [(posdict[x], posdict[y]) for x, y in spans]
267+
cdef map[int, int] positions
268+
cdef int x, y
269+
for x, y in spans:
270+
positions[x] = x
271+
positions[y] = y
272+
unicodeindices(positions, cstring, size, cpos, upos)
273+
return [(positions[x], positions[y]) for x, y in spans]
273274

274275
def __dealloc__(self):
275276
delete_StringPiece_array(self.matches)

src/re2.pyx

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@ include "includes.pxi"
106106

107107
import re
108108
import sys
109-
import array
110109
import warnings
111110

112111

@@ -404,23 +403,22 @@ cdef utf8indices(char * cstring, int size, int *pos, int *endpos):
404403
endpos[0] = newendpos
405404

406405

407-
cdef array.array unicodeindices(array.array positions,
406+
cdef void unicodeindices(map[int, int] &positions,
408407
char * cstring, int size, int * cpos, int * upos):
409-
"""Convert an array of UTF-8 byte indices to unicode indices."""
408+
"""Convert UTF-8 byte indices to unicode indices."""
410409
cdef unsigned char * s = <unsigned char *>cstring
411-
cdef int i = 0
412-
cdef array.array result = array.clone(positions, len(positions), False)
413-
414-
if positions.data.as_longs[i] == -1:
415-
result.data.as_longs[i] = -1
416-
i += 1
417-
if i == len(positions):
418-
return result
419-
if positions.data.as_longs[i] == cpos[0]:
420-
result.data.as_longs[i] = upos[0]
421-
i += 1
422-
if i == len(positions):
423-
return result
410+
cdef map[int, int].iterator it = positions.begin()
411+
412+
if dereference(it).first == -1:
413+
dereference(it).second = -1
414+
postincrement(it)
415+
if it == positions.end():
416+
return
417+
if dereference(it).first == cpos[0]:
418+
dereference(it).second = upos[0]
419+
postincrement(it)
420+
if it == positions.end():
421+
return
424422

425423
while cpos[0] < size:
426424
if s[cpos[0]] < 0x80:
@@ -442,12 +440,11 @@ cdef array.array unicodeindices(array.array positions,
442440
upos[0] += 1
443441
emit_endif()
444442

445-
if positions.data.as_longs[i] == cpos[0]:
446-
result.data.as_longs[i] = upos[0]
447-
i += 1
448-
if i == len(positions):
443+
if dereference(it).first == cpos[0]:
444+
dereference(it).second = upos[0]
445+
postincrement(it)
446+
if it == positions.end():
449447
break
450-
return result
451448

452449

453450
__all__ = [

0 commit comments

Comments
 (0)