Skip to content

Commit 2239414

Browse files
authored
Fix OP_REFI for caseless_restrict (#516)
1 parent 440f5d1 commit 2239414

File tree

9 files changed

+222
-24
lines changed

9 files changed

+222
-24
lines changed

HACKING

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,8 +365,10 @@ Changeable options
365365
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
366366
some others may be changed in the middle of patterns by items such as (?i).
367367
Their processing is handled entirely at compile time by generating different
368-
opcodes for the different settings. The runtime functions do not need to keep
369-
track of an option's state.
368+
opcodes for the different settings. Some options are copied into the opcode's
369+
data, for opcodes such as OP_REFI which depend on the (?r)
370+
(PCRE2_EXTRA_CASELESS_RESTRICT) option. The runtime functions do not need to
371+
keep track of an option's state.
370372

371373
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
372374
are tracked and processed during the parsing pre-pass. The others are handled
@@ -639,6 +641,9 @@ generates OP_DNREF or OP_DNREFI. These are followed by two counts: the index
639641
required name, followed by the number of groups with the same name. The
640642
matching code can then search for the first one that is set.
641643

644+
OP_REFI and OP_DNREFI are further followed by one code unit containing the
645+
case-insensitivity flags (a bitmask of REFI_FLAG_xxx values).
646+
642647

643648
Repeating character classes and back references
644649
-----------------------------------------------

src/pcre2_compile.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7188,6 +7188,9 @@ for (;; pptr++)
71887188
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
71897189
PUT2INC(code, 0, index);
71907190
PUT2INC(code, 0, count);
7191+
if ((options & PCRE2_CASELESS) != 0)
7192+
*code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7193+
REFI_FLAG_CASELESS_RESTRICT : 0;
71917194
}
71927195
break;
71937196

@@ -8142,6 +8145,9 @@ for (;; pptr++)
81428145
if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
81438146
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
81448147
PUT2INC(code, 0, meta_arg);
8148+
if ((options & PCRE2_CASELESS) != 0)
8149+
*code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8150+
REFI_FLAG_CASELESS_RESTRICT : 0;
81458151

81468152
/* Update the map of back references, and keep the highest one. We
81478153
could do this in parse_regex() for numerical back references, but not

src/pcre2_internal.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1780,9 +1780,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
17801780
1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \
17811781
0, /* XCLASS - variable length */ \
17821782
1+IMM2_SIZE, /* REF */ \
1783-
1+IMM2_SIZE, /* REFI */ \
1783+
1+IMM2_SIZE+1, /* REFI */ \
17841784
1+2*IMM2_SIZE, /* DNREF */ \
1785-
1+2*IMM2_SIZE, /* DNREFI */ \
1785+
1+2*IMM2_SIZE+1, /* DNREFI */ \
17861786
1+LINK_SIZE, /* RECURSE */ \
17871787
1+2*LINK_SIZE+1, /* CALLOUT */ \
17881788
0, /* CALLOUT_STR - variable length */ \
@@ -1829,6 +1829,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
18291829

18301830
#define RREF_ANY 0xffff
18311831

1832+
/* Constants used by OP_REFI and OP_DNREFI to control matching behaviour. */
1833+
1834+
#define REFI_FLAG_CASELESS_RESTRICT 0x1
1835+
18321836

18331837
/* ---------- Private structures that are mode-independent. ---------- */
18341838

src/pcre2_jit_compile.c

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,7 @@ while (cc < ccend)
11491149
/* Fall through. */
11501150
case OP_REF:
11511151
common->optimized_cbracket[GET2(cc, 1)] = 0;
1152-
cc += 1 + IMM2_SIZE;
1152+
cc += PRIV(OP_lengths)[*cc];
11531153
break;
11541154

11551155
case OP_ASSERT_NA:
@@ -1181,8 +1181,16 @@ while (cc < ccend)
11811181
cc += 1 + IMM2_SIZE;
11821182
break;
11831183

1184-
case OP_DNREF:
11851184
case OP_DNREFI:
1185+
#ifdef SUPPORT_UNICODE
1186+
if (common->iref_ptr == 0)
1187+
{
1188+
common->iref_ptr = common->ovector_start;
1189+
common->ovector_start += 3 * sizeof(sljit_sw);
1190+
}
1191+
#endif /* SUPPORT_UNICODE */
1192+
/* Fall through */
1193+
case OP_DNREF:
11861194
case OP_DNCREF:
11871195
count = GET2(cc, 1 + IMM2_SIZE);
11881196
slot = common->name_table + GET2(cc, 1) * common->name_entry_size;
@@ -1191,7 +1199,7 @@ while (cc < ccend)
11911199
common->optimized_cbracket[GET2(slot, 0)] = 0;
11921200
slot += common->name_entry_size;
11931201
}
1194-
cc += 1 + 2 * IMM2_SIZE;
1202+
cc += PRIV(OP_lengths)[*cc];
11951203
break;
11961204

11971205
case OP_RECURSE:
@@ -9424,6 +9432,10 @@ jump_list *no_match = NULL;
94249432
int source_reg = COUNT_MATCH;
94259433
int source_end_reg = ARGUMENTS;
94269434
int char1_reg = STACK_LIMIT;
9435+
PCRE2_UCHAR refi_flag = 0;
9436+
9437+
if (*cc == OP_REFI || *cc == OP_DNREFI)
9438+
refi_flag = cc[PRIV(OP_lengths)[*cc] - 1];
94279439
#endif /* SUPPORT_UNICODE */
94289440

94299441
if (ref)
@@ -9438,7 +9450,7 @@ else
94389450
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0);
94399451

94409452
#if defined SUPPORT_UNICODE
9441-
if (common->utf && *cc == OP_REFI)
9453+
if (common->utf && (*cc == OP_REFI || *cc == OP_DNREFI))
94429454
{
94439455
SLJIT_ASSERT(common->iref_ptr != 0);
94449456

@@ -9491,6 +9503,8 @@ if (common->utf && *cc == OP_REFI)
94919503
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
94929504
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
94939505

9506+
if (refi_flag & REFI_FLAG_CASELESS_RESTRICT)
9507+
add_jump(compiler, &no_match, CMP(SLJIT_LESS, char1_reg, 0, SLJIT_IMM, 128));
94949508
add_jump(compiler, &no_match, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
94959509
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
94969510
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_caseless_sets));
@@ -9594,6 +9608,9 @@ if (ref)
95949608
offset = GET2(cc, 1) << 1;
95959609
else
95969610
cc += IMM2_SIZE;
9611+
9612+
if (*ccbegin == OP_REFI || *ccbegin == OP_DNREFI)
9613+
cc += 1;
95979614
type = cc[1 + IMM2_SIZE];
95989615

95999616
SLJIT_COMPILE_ASSERT((OP_CRSTAR & 0x1) == 0, crstar_opcode_must_be_even);
@@ -12687,25 +12704,31 @@ while (cc < ccend)
1268712704

1268812705
case OP_REF:
1268912706
case OP_REFI:
12690-
if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRPOSRANGE)
12707+
{
12708+
int op_len = PRIV(OP_lengths)[*cc];
12709+
if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE)
1269112710
cc = compile_ref_iterator_matchingpath(common, cc, parent);
1269212711
else
1269312712
{
1269412713
compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE);
12695-
cc += 1 + IMM2_SIZE;
12714+
cc += op_len;
1269612715
}
12716+
}
1269712717
break;
1269812718

1269912719
case OP_DNREF:
1270012720
case OP_DNREFI:
12701-
if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRPOSRANGE)
12721+
{
12722+
int op_len = PRIV(OP_lengths)[*cc];
12723+
if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE)
1270212724
cc = compile_ref_iterator_matchingpath(common, cc, parent);
1270312725
else
1270412726
{
1270512727
compile_dnref_search(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks);
1270612728
compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE);
12707-
cc += 1 + 2 * IMM2_SIZE;
12729+
cc += op_len;
1270812730
}
12731+
}
1270912732
break;
1271012733

1271112734
case OP_RECURSE:
@@ -12992,7 +13015,7 @@ PCRE2_SPTR cc = current->cc;
1299213015
BOOL ref = (*cc == OP_REF || *cc == OP_REFI);
1299313016
PCRE2_UCHAR type;
1299413017

12995-
type = cc[ref ? 1 + IMM2_SIZE : 1 + 2 * IMM2_SIZE];
13018+
type = cc[PRIV(OP_lengths)[*cc]];
1299613019

1299713020
if ((type & 0x1) == 0)
1299813021
{

src/pcre2_match.c

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ seems unlikely.)
348348
Arguments:
349349
offset index into the offset vector
350350
caseless TRUE if caseless
351+
caseopts bitmask of REFI_FLAG_XYZ values
351352
F the current backtracking frame pointer
352353
mb points to match block
353354
lengthptr pointer for returning the length matched
@@ -358,8 +359,8 @@ Returns: = 0 successful match; number of code units matched is set
358359
*/
359360

360361
static int
361-
match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
362-
PCRE2_SIZE *lengthptr)
362+
match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,
363+
match_block *mb, PCRE2_SIZE *lengthptr)
363364
{
364365
PCRE2_SPTR p;
365366
PCRE2_SIZE length;
@@ -389,6 +390,7 @@ if (caseless)
389390
{
390391
#if defined SUPPORT_UNICODE
391392
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
393+
BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;
392394

393395
if (utf || (mb->poptions & PCRE2_UCP) != 0)
394396
{
@@ -424,6 +426,11 @@ if (caseless)
424426
if (c != d && c != (uint32_t)((int)d + ur->other_case))
425427
{
426428
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
429+
430+
/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets
431+
that start with an ASCII character. */
432+
if (caseless_restrict && *pp < 128) return -1; /* No match */
433+
427434
for (;;)
428435
{
429436
if (c < *pp) return -1; /* No match */
@@ -5006,16 +5013,18 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
50065013
#define Lmin F->temp_32[0]
50075014
#define Lmax F->temp_32[1]
50085015
#define Lcaseless F->temp_32[2]
5016+
#define Lcaseopts F->temp_32[3]
50095017
#define Lstart F->temp_sptr[0]
50105018
#define Loffset F->temp_size
50115019

50125020
case OP_DNREF:
50135021
case OP_DNREFI:
50145022
Lcaseless = (Fop == OP_DNREFI);
5023+
Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0;
50155024
{
50165025
int count = GET2(Fecode, 1+IMM2_SIZE);
50175026
PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5018-
Fecode += 1 + 2*IMM2_SIZE;
5027+
Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 1 : 0);
50195028

50205029
while (count-- > 0)
50215030
{
@@ -5029,8 +5038,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
50295038
case OP_REF:
50305039
case OP_REFI:
50315040
Lcaseless = (Fop == OP_REFI);
5041+
Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;
50325042
Loffset = (GET2(Fecode, 1) << 1) - 2;
5033-
Fecode += 1 + IMM2_SIZE;
5043+
Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);
50345044

50355045
/* Set up for repetition, or handle the non-repeated case. The maximum and
50365046
minimum must be in the heap frame, but as they are short-term values, we
@@ -5062,7 +5072,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
50625072

50635073
default: /* No repeat follows */
50645074
{
5065-
rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
5075+
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);
50665076
if (rrc != 0)
50675077
{
50685078
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
@@ -5096,7 +5106,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
50965106
for (i = 1; i <= Lmin; i++)
50975107
{
50985108
PCRE2_SIZE slength;
5099-
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5109+
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
51005110
if (rrc != 0)
51015111
{
51025112
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
@@ -5120,7 +5130,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
51205130
RMATCH(Fecode, RM20);
51215131
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
51225132
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
5123-
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5133+
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
51245134
if (rrc != 0)
51255135
{
51265136
if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
@@ -5145,7 +5155,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
51455155
for (i = Lmin; i < Lmax; i++)
51465156
{
51475157
PCRE2_SIZE slength;
5148-
rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5158+
rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
51495159
if (rrc != 0)
51505160
{
51515161
/* Can't use CHECK_PARTIAL because we don't want to update Feptr in
@@ -5196,7 +5206,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
51965206
for (i = Lmin; i < Lmax; i++)
51975207
{
51985208
PCRE2_SIZE slength;
5199-
(void)match_ref(Loffset, Lcaseless, F, mb, &slength);
5209+
(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);
52005210
Feptr += slength;
52015211
}
52025212
}

src/pcre2_printint.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,21 +633,25 @@ for(;;)
633633

634634
case OP_REFI:
635635
flag = "/i";
636+
extra = code[1 + IMM2_SIZE];
636637
/* Fall through */
637638
case OP_REF:
638639
fprintf(f, " %s \\%d", flag, GET2(code,1));
640+
if (extra != 0) fprintf(f, " 0x%02x", extra);
639641
ccode = code + OP_lengths[*code];
640642
goto CLASS_REF_REPEAT;
641643

642644
case OP_DNREFI:
643645
flag = "/i";
646+
extra = code[1 + 2*IMM2_SIZE];
644647
/* Fall through */
645648
case OP_DNREF:
646649
{
647650
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
648651
fprintf(f, " %s \\k<", flag);
649652
print_custring(f, entry);
650653
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
654+
if (extra != 0) fprintf(f, " 0x%02x", extra);
651655
}
652656
ccode = code + OP_lengths[*code];
653657
goto CLASS_REF_REPEAT;

src/pcre2_study.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ for (;;)
540540
}
541541
}
542542
else d = 0;
543-
cc += 1 + 2*IMM2_SIZE;
543+
cc += PRIV(OP_lengths)[*cc];
544544
goto REPEAT_BACK_REFERENCE;
545545

546546
/* Single back reference by number. References by name are converted to by
@@ -593,7 +593,7 @@ for (;;)
593593
backref_cache[0] = recno;
594594
}
595595

596-
cc += 1 + IMM2_SIZE;
596+
cc += PRIV(OP_lengths)[*cc];
597597

598598
/* Handle repeated back references */
599599

testdata/testinput5

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2289,6 +2289,52 @@
22892289
s\x{212a}s
22902290
K\x{17f}K
22912291

2292+
/(.) \1/i,utf,caseless_restrict
2293+
s S
2294+
k K
2295+
\= Expect no match
2296+
s \x{17f}
2297+
k \x{212a}
2298+
2299+
/(.) (?r:\1)/i,utf
2300+
s S
2301+
k K
2302+
\= Expect no match
2303+
s \x{17f}
2304+
k \x{212a}
2305+
2306+
/(.) \1/i,utf
2307+
s S
2308+
k K
2309+
s \x{17f}
2310+
k \x{212a}
2311+
2312+
/(?:(?<A>ss)|(?<A>kk)) \k<A>/i,utf,dupnames,caseless_restrict
2313+
sS Ss
2314+
kK Kk
2315+
\= Expect no match
2316+
sS \x{17f}s
2317+
kK \x{212a}k
2318+
2319+
/(?:(?<A>ss)|(?<A>kk)) \k<A>/i,utf,dupnames
2320+
sS Ss
2321+
kK Kk
2322+
sS \x{17f}s
2323+
kK \x{212a}k
2324+
2325+
/(?:(?<A>s)|(?<A>k)) \k<A>{3,}!/i,utf,dupnames,caseless_restrict
2326+
s SsSs!
2327+
k KkKk!
2328+
\= Expect no match
2329+
s \x{17f}sSs\x{17f}!
2330+
k \x{212a}kKk\x{212a}!
2331+
2332+
/(?:(?<A>s)|(?<A>k)) \k<A>{3,}!/i,utf,dupnames
2333+
s SsSs!
2334+
k KkKk!
2335+
s \x{17f}sSs\x{17f}!
2336+
k \x{212a}kKk\x{212a}!
2337+
22922338
# End caseless restrict tests
22932339

22942340
# TESTS for PCRE2_EXTRA_ASCII_xxx - again, tests with and without.

0 commit comments

Comments
 (0)