Skip to content

Commit 2bba84b

Browse files
committed
Optimize character category matching in JIT
1 parent 542cb11 commit 2bba84b

File tree

2 files changed

+60
-33
lines changed

2 files changed

+60
-33
lines changed

src/pcre2_jit_compile.c

Lines changed: 59 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6735,6 +6735,7 @@ JUMPTO(SLJIT_JUMP, mainloop);
67356735
/* Bit mask in which the bits numbered start..end (both inclusive) are set. */
#define UCPCAT_RANGE(start, end) ((1 << ((end) + 1)) - (1 << (start)))
67366736
/* Mask of every category bit from ucp_Ll up to and including ucp_Lu
(presumably the letter categories - confirm against the ucp_ enum order). */
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
67376737
/* Mask of every category bit from ucp_Nd up to and including ucp_No
(presumably the number categories - confirm against the ucp_ enum order). */
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
6738+
/* Mask with every category bit from 0 up to and including ucp_Zs set. */
#define UCPCAT_ALL UCPCAT_RANGE(0, ucp_Zs)
67386739
#endif
67396740

67406741
static void check_wordboundary(compiler_common *common, BOOL ucp)
@@ -7615,6 +7616,8 @@ BOOL utf = common->utf;
76157616

76167617
#ifdef SUPPORT_UNICODE
76177618
sljit_u32 unicode_status = 0;
7619+
sljit_u32 category_list = 0;
7620+
sljit_u32 items;
76187621
int typereg = TMP1;
76197622
const sljit_u32 *other_cases;
76207623
#endif /* SUPPORT_UNICODE */
@@ -7633,6 +7636,7 @@ if (cc[-1] & XCL_MAP)
76337636
while (*cc != XCL_END)
76347637
{
76357638
compares++;
7639+
76367640
if (*cc == XCL_SINGLE)
76377641
{
76387642
cc ++;
@@ -7659,6 +7663,7 @@ while (*cc != XCL_END)
76597663
{
76607664
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
76617665
cc++;
7666+
76627667
if (*cc == PT_CLIST && cc[-1] == XCL_PROP)
76637668
{
76647669
other_cases = PRIV(ucd_caseless_sets) + cc[1];
@@ -7675,25 +7680,34 @@ while (*cc != XCL_END)
76757680
min = 0;
76767681
}
76777682

7683+
items = 0;
7684+
76787685
switch(*cc)
76797686
{
76807687
case PT_ANY:
76817688
/* Any either accepts everything or ignored. */
76827689
if (cc[-1] == XCL_PROP)
7683-
{
7684-
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
7685-
if (list == backtracks)
7686-
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
7687-
return;
7688-
}
7690+
items = UCPCAT_ALL;
76897691
break;
76907692

76917693
case PT_LAMP:
7694+
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
7695+
break;
7696+
76927697
case PT_GC:
7698+
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
7699+
break;
7700+
76937701
case PT_PC:
7702+
items = UCPCAT(cc[1]);
7703+
break;
7704+
76947705
case PT_WORD:
7706+
items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
7707+
break;
7708+
76957709
case PT_ALNUM:
7696-
unicode_status |= XCLASS_HAS_TYPE;
7710+
items = UCPCAT_L | UCPCAT_N;
76977711
break;
76987712

76997713
case PT_SCX:
@@ -7736,11 +7750,32 @@ while (*cc != XCL_END)
77367750
SLJIT_UNREACHABLE();
77377751
break;
77387752
}
7753+
7754+
if (items > 0)
7755+
{
7756+
if (cc[-1] == XCL_NOTPROP)
7757+
items ^= UCPCAT_ALL;
7758+
category_list |= items;
7759+
unicode_status |= XCLASS_HAS_TYPE;
7760+
compares--;
7761+
}
7762+
77397763
cc += 2;
77407764
}
77417765
#endif /* SUPPORT_UNICODE */
77427766
}
7743-
SLJIT_ASSERT(compares > 0);
7767+
SLJIT_ASSERT(compares > 0 || category_list > 0);
7768+
7769+
#ifdef SUPPORT_UNICODE
7770+
if (category_list == UCPCAT_ALL)
7771+
{
7772+
/* All characters are accepted, same as dotall. */
7773+
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
7774+
if (list == backtracks)
7775+
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
7776+
return;
7777+
}
7778+
#endif /* SUPPORT_UNICODE */
77447779

77457780
/* We are not necessary in utf mode even in 8 bit mode. */
77467781
cc = ccbegin;
@@ -7841,6 +7876,9 @@ if (unicode_status & XCLASS_NEEDS_UCD)
78417876

78427877
ccbegin = cc;
78437878

7879+
if (category_list != 0)
7880+
compares++;
7881+
78447882
if (unicode_status & XCLASS_HAS_BIDICL)
78457883
{
78467884
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
@@ -8045,8 +8083,16 @@ if (unicode_status & XCLASS_NEEDS_UCD)
80458083
if (unicode_status & XCLASS_SAVE_CHAR)
80468084
typereg = RETURN_ADDR;
80478085

8048-
OP1(SLJIT_MOV_U8, typereg, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
8049-
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, typereg, 0);
8086+
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
8087+
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);
8088+
8089+
if (category_list > 0)
8090+
{
8091+
compares--;
8092+
invertcmp = (compares == 0 && list != backtracks);
8093+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);
8094+
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
8095+
}
80508096
}
80518097
}
80528098
#endif /* SUPPORT_UNICODE */
@@ -8126,26 +8172,16 @@ while (*cc != XCL_END)
81268172
break;
81278173

81288174
case PT_LAMP:
8129-
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt));
8130-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8131-
break;
8132-
81338175
case PT_GC:
8134-
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]));
8135-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8136-
break;
8137-
81388176
case PT_PC:
8139-
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(cc[1]));
8140-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8141-
break;
8142-
81438177
case PT_SC:
81448178
case PT_SCX:
81458179
case PT_BOOL:
81468180
case PT_BIDICL:
8181+
case PT_WORD:
8182+
case PT_ALNUM:
81478183
compares++;
8148-
/* Do nothing. */
8184+
/* Already handled. */
81498185
break;
81508186

81518187
case PT_SPACE:
@@ -8165,16 +8201,6 @@ while (*cc != XCL_END)
81658201
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81668202
break;
81678203

8168-
case PT_WORD:
8169-
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N);
8170-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8171-
break;
8172-
8173-
case PT_ALNUM:
8174-
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_L | UCPCAT_N);
8175-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8176-
break;
8177-
81788204
case PT_CLIST:
81798205
other_cases = PRIV(ucd_caseless_sets) + cc[1];
81808206

src/pcre2_jit_test.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,7 @@ static struct regression_test_case regression_test_cases[] = {
423423
{ CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
424424
{ MUP, 0, 0, 0 | F_NOMATCH, "[^[:print:]\\x{f6f6}]", "\xef\x9b\xb6" },
425425
{ MUP, 0, 0, 0, "[[:xdigit:]\\x{6500}]#", "\xe6\x94\x80#" },
426+
{ MUP, 0, 0, 0, "[\\pC\\PC]#", "A#" },
426427

427428
/* Possible empty brackets. */
428429
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },

0 commit comments

Comments
 (0)