Skip to content

Commit 182461a

Browse files
authored
Improve character range matching with binary search (#524)
Co-authored-by: Zoltan Herczeg <[email protected]>
1 parent 1e09555 commit 182461a

File tree

16 files changed

+1590
-466
lines changed

16 files changed

+1590
-466
lines changed

.github/workflows/dev.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
run: ./autogen.sh
5151

5252
- name: Configure
53-
run: ./configure CC='clang -fsanitize=undefined,address,integer -fno-sanitize-recover=undefined,integer -fno-sanitize=unsigned-integer-overflow,function' CPPFLAGS='-Wall -Wextra -Werror -Wno-error=unused-but-set-parameter -Wno-error=deprecated-declarations -Wno-error=incompatible-library-redeclaration -Wno-error=incompatible-pointer-types-discards-qualifiers' --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --with-link-size=3
53+
run: ./configure CC='clang -fsanitize=undefined,address,integer -fno-sanitize-recover=undefined,integer -fno-sanitize=unsigned-integer-overflow,unsigned-shift-base,function' CPPFLAGS='-Wall -Wextra -Werror -Wno-error=unused-but-set-parameter -Wno-error=deprecated-declarations -Wno-error=incompatible-library-redeclaration -Wno-error=incompatible-pointer-types-discards-qualifiers' --enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-debug --with-link-size=3
5454

5555
- name: Build
5656
run: make -j3
@@ -83,7 +83,7 @@ jobs:
8383
./autogen.sh
8484
8585
- name: Configure
86-
run: ./configure CC='clang -fsanitize=undefined,address,integer -fno-sanitize-recover=undefined,integer -fno-sanitize=function' --enable-pcre2-16 --enable-pcre2-32 --enable-debug
86+
run: ./configure CC='clang -fsanitize=undefined,address,integer -fno-sanitize-recover=undefined,integer -fno-sanitize=unsigned-shift-base,function' --enable-pcre2-16 --enable-pcre2-32 --enable-debug
8787

8888
- name: Build
8989
run: make CPPFLAGS='-pedantic -Wall -Wextra -Wpedantic -Wdeclaration-after-statement -Wshadow -Wno-overlength-strings -Werror -Wno-error=incompatible-pointer-types-discards-qualifiers' -j3

maint/ManyConfigTests

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ if [ $usemain -ne 0 ]; then
349349
"--disable-unicode --enable-newline-is-crlf --disable-shared" \
350350
"--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
351351
"--enable-newline-is-any --disable-static" \
352-
"--disable-unicode --enable-pcre2-16" \
352+
"--disable-unicode --enable-pcre2-16 --enable-debug" \
353353
"--enable-pcre2-16 --disable-shared" \
354354
"--disable-unicode --enable-pcre2-32" \
355355
"--enable-pcre2-32 --disable-shared" \

maint/README

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,14 @@ new release.
160160
. Run tests with two or more different compilers (e.g. clang and gcc), and make
161161
use of -fsanitize=address and friends where possible. For gcc,
162162
-fsanitize=undefined -std=gnu99 picks up undefined behaviour at runtime. For
163-
clang, -fsanitize=address,undefined,integer can be used but
163+
clang, -fsanitize=address,undefined,integer can be used but an exception is
164+
needed to allow XCLASS with very large ranges in the 32-bit library so it
165+
should be followed by -fno-sanitize=unsigned-shift-base. Additionally,
164166
-fno-sanitize=unsigned-integer-overflow must be added when compiling with
165-
JIT. Newer versions of the compiler also need -fno-sanitize=function, at
166-
least until pcre2test stops using generic pointers on its callbacks. Another
167-
useful clang option is -fsanitize=signed-integer-overflow.
167+
JIT. Newer versions of clang also need -fno-sanitize=function, at least
168+
until pcre2test stops using generic pointers on its callbacks. Another
169+
useful clang option is -fsanitize=signed-integer-overflow but that should
170+
already be included if using "integer".
168171

169172
. Do a test build using CMake. Remove src/config.h first, lest it override the
170173
version that CMake creates. Also ensure there is no leftover CMakeCache.txt

src/pcre2_compile.c

Lines changed: 159 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5481,8 +5481,10 @@ return TRUE;
54815481
#define XCLASS_HAS_8BIT_CHARS 0x2
54825482
/* XClass has properties. */
54835483
#define XCLASS_HAS_PROPS 0x4
5484+
/* XClass has character lists. */
5485+
#define XCLASS_HAS_CHAR_LISTS 0x8
54845486
/* XClass matches all characters >= 256. */
5485-
#define XCLASS_HIGH_ANY 0x8
5487+
#define XCLASS_HIGH_ANY 0x10
54865488

54875489
#endif
54885490

@@ -5945,7 +5947,7 @@ for (;; pptr++)
59455947

59465948
if (cranges->range_list_size > 0)
59475949
{
5948-
uint32_t *ranges = (uint32_t*)(cranges + 1);
5950+
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
59495951

59505952
if (ranges[0] <= 255)
59515953
xclass_props |= XCLASS_HAS_8BIT_CHARS;
@@ -6403,76 +6405,86 @@ for (;; pptr++)
64036405
range += 2;
64046406
}
64056407

6406-
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
6408+
if (cranges->char_lists_size > 0)
64076409
{
6408-
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
6409-
range[1] >= GET_MAX_CHAR_VALUE(utf));
6410-
should_flip_negation = TRUE;
6411-
range = end;
6410+
/* The cranges structure is still used and freed later. */
6411+
PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
6412+
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
64126413
}
6413-
6414-
while (range < end)
6414+
else
64156415
{
6416-
uint32_t range_start = range[0];
6417-
uint32_t range_end = range[1];
6416+
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
6417+
{
6418+
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
6419+
range[1] >= GET_MAX_CHAR_VALUE(utf));
6420+
should_flip_negation = TRUE;
6421+
range = end;
6422+
}
64186423

6419-
range += 2;
6420-
xclass_props |= XCLASS_REQUIRED;
6424+
while (range < end)
6425+
{
6426+
uint32_t range_start = range[0];
6427+
uint32_t range_end = range[1];
64216428

6422-
if (range_start < 256) range_start = 256;
6429+
range += 2;
6430+
xclass_props |= XCLASS_REQUIRED;
6431+
6432+
if (range_start < 256) range_start = 256;
6433+
6434+
if (lengthptr != NULL)
6435+
{
6436+
#ifdef SUPPORT_UNICODE
6437+
if (utf)
6438+
{
6439+
*lengthptr += 1;
6440+
6441+
if (range_start < range_end)
6442+
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
6443+
6444+
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
6445+
continue;
6446+
}
6447+
#endif /* SUPPORT_UNICODE */
6448+
6449+
*lengthptr += range_start < range_end ? 3 : 2;
6450+
continue;
6451+
}
64236452

6424-
if (lengthptr != NULL)
6425-
{
64266453
#ifdef SUPPORT_UNICODE
64276454
if (utf)
64286455
{
6429-
*lengthptr += 1;
6430-
64316456
if (range_start < range_end)
6432-
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
6457+
{
6458+
*class_uchardata++ = XCL_RANGE;
6459+
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
6460+
}
6461+
else
6462+
*class_uchardata++ = XCL_SINGLE;
64336463

6434-
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
6464+
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
64356465
continue;
64366466
}
64376467
#endif /* SUPPORT_UNICODE */
64386468

6439-
*lengthptr += range_start < range_end ? 3 : 2;
6440-
continue;
6441-
}
6442-
6443-
#ifdef SUPPORT_UNICODE
6444-
if (utf)
6445-
{
6469+
/* Without UTF support, character values are constrained
6470+
by the bit length, and can only be > 256 for 16-bit and
6471+
32-bit libraries. */
6472+
#if PCRE2_CODE_UNIT_WIDTH != 8
64466473
if (range_start < range_end)
64476474
{
64486475
*class_uchardata++ = XCL_RANGE;
6449-
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
6476+
*class_uchardata++ = range_start;
64506477
}
64516478
else
64526479
*class_uchardata++ = XCL_SINGLE;
64536480

6454-
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
6455-
continue;
6456-
}
6457-
#endif /* SUPPORT_UNICODE */
6458-
6459-
/* Without UTF support, character values are constrained by the bit length,
6460-
and can only be > 256 for 16-bit and 32-bit libraries. */
6461-
#if PCRE2_CODE_UNIT_WIDTH != 8
6462-
if (range_start < range_end)
6463-
{
6464-
*class_uchardata++ = XCL_RANGE;
6465-
*class_uchardata++ = range_start;
6481+
*class_uchardata++ = range_end;
6482+
#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
64666483
}
6467-
else
6468-
*class_uchardata++ = XCL_SINGLE;
64696484

6470-
*class_uchardata++ = range_end;
6471-
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
6485+
if (lengthptr == NULL)
6486+
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
64726487
}
6473-
6474-
if (lengthptr == NULL)
6475-
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
64766488
}
64776489
#endif
64786490

@@ -6502,7 +6514,8 @@ for (;; pptr++)
65026514
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
65036515
if ((xclass_props & XCLASS_REQUIRED) != 0)
65046516
{
6505-
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
6517+
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
6518+
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
65066519
*code++ = OP_XCLASS;
65076520
code += LINK_SIZE;
65086521
*code = negate_class? XCL_NOT:0;
@@ -6526,6 +6539,101 @@ for (;; pptr++)
65266539
}
65276540
else code = class_uchardata;
65286541

6542+
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
6543+
{
6544+
/* Char lists size is an even number,
6545+
because all items are 16 or 32 bit values. */
6546+
size_t char_lists_size = cranges->char_lists_size;
6547+
PCRE2_ASSERT((char_lists_size & 0x1) == 0);
6548+
6549+
if (lengthptr != NULL)
6550+
{
6551+
/* At this point, we don't know the precise location
6552+
so the maximum alignment is added to the length. */
6553+
#if PCRE2_CODE_UNIT_WIDTH == 8
6554+
*lengthptr += 2 /* sizeof(type) in PCRE2_UCHARs */ +
6555+
3 /* maximum alignment. */;
6556+
#elif PCRE2_CODE_UNIT_WIDTH == 16
6557+
*lengthptr += 1 /* sizeof(type) in PCRE2_UCHARs */ +
6558+
1 /* maximum alignment. */;
6559+
char_lists_size >>= 1;
6560+
#else
6561+
*lengthptr += 1 /* sizeof(type) in PCRE2_UCHARs */;
6562+
/* Padding, when the size is not divisible by 4. */
6563+
if ((char_lists_size & 0x2) != 0)
6564+
char_lists_size += 2;
6565+
char_lists_size >>= 2;
6566+
#endif
6567+
6568+
if (OFLOW_MAX - *lengthptr < char_lists_size)
6569+
{
6570+
*errorcodeptr = ERR20; /* Integer overflow */
6571+
return 0;
6572+
}
6573+
6574+
*lengthptr += char_lists_size;
6575+
6576+
if (*lengthptr > MAX_PATTERN_SIZE)
6577+
{
6578+
*errorcodeptr = ERR20; /* Pattern is too large */
6579+
return 0;
6580+
}
6581+
}
6582+
else
6583+
{
6584+
uint8_t *char_buffer = (uint8_t*)code;
6585+
6586+
PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
6587+
#if PCRE2_CODE_UNIT_WIDTH == 8
6588+
/* Encode as high / low bytes. */
6589+
code[0] = (uint8_t)(XCL_LIST |
6590+
(cranges->char_lists_types >> 8));
6591+
code[1] = (uint8_t)cranges->char_lists_types;
6592+
char_buffer += 2;
6593+
6594+
/* Compute alignment. */
6595+
if (((uintptr_t)char_buffer & 0x1) != 0)
6596+
{
6597+
code[0] |= 1u << (XCL_ALIGNMENT_SHIFT - 8);
6598+
char_buffer += 1;
6599+
}
6600+
6601+
if (((uintptr_t)char_buffer & 0x2) != (char_lists_size & 0x2))
6602+
{
6603+
code[0] |= 2u << (XCL_ALIGNMENT_SHIFT - 8);
6604+
char_buffer += 2;
6605+
}
6606+
#elif PCRE2_CODE_UNIT_WIDTH == 16
6607+
code[0] = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
6608+
char_buffer += 2;
6609+
6610+
/* Compute alignment. */
6611+
if (((uintptr_t)char_buffer & 0x2) != (char_lists_size & 0x2))
6612+
{
6613+
code[0] |= 2u << XCL_ALIGNMENT_SHIFT;
6614+
char_buffer += 2;
6615+
}
6616+
#else
6617+
code[0] = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
6618+
char_buffer += 4;
6619+
6620+
/* Padding. */
6621+
if ((char_lists_size & 0x2) != 0)
6622+
{
6623+
code[0] |= 2u << XCL_ALIGNMENT_SHIFT;
6624+
char_buffer += 2;
6625+
}
6626+
#endif
6627+
memcpy(char_buffer,
6628+
(uint8_t*)(cranges + 1) + cranges->char_lists_start,
6629+
char_lists_size);
6630+
6631+
code = (PCRE2_UCHAR*)(char_buffer + char_lists_size);
6632+
6633+
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
6634+
}
6635+
}
6636+
65296637
/* Now fill in the complete length of the item */
65306638

65316639
PUT(previous, 1, (int)(code - previous));
@@ -6549,7 +6657,7 @@ for (;; pptr++)
65496657
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
65506658
cb->classbits.classwords[0] == ~(uint32_t)0)
65516659
{
6552-
uint32_t *classwords = cb->classbits.classwords;
6660+
const uint32_t *classwords = cb->classbits.classwords;
65536661
int i;
65546662

65556663
for (i = 0; i < 8; i++)
@@ -11222,7 +11330,9 @@ version of the pattern, free it before returning. Also free the list of named
1122211330
groups if a larger one had to be obtained, and likewise the group information
1122311331
vector. */
1122411332

11333+
#ifdef SUPPORT_UNICODE
1122511334
PCRE2_ASSERT(cb.cranges == NULL);
11335+
#endif
1122611336

1122711337
EXIT:
1122811338
#ifdef SUPPORT_VALGRIND

0 commit comments

Comments
 (0)