Skip to content

Commit 29ea849

Browse files
committed
Support possessive repeats for backreferences
1 parent d96f139 commit 29ea849

File tree

11 files changed

+428
-128
lines changed

11 files changed

+428
-128
lines changed

src/pcre2_compile.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8066,6 +8066,13 @@ for (;; pptr++)
80668066
tempcode += GET(tempcode, 1);
80678067
break;
80688068
#endif
8069+
8070+
case OP_REF:
8071+
case OP_REFI:
8072+
case OP_DNREF:
8073+
case OP_DNREFI:
8074+
tempcode += PRIV(OP_lengths)[*tempcode];
8075+
break;
80698076
}
80708077

80718078
/* If tempcode is equal to code (which points to the end of the repeated

src/pcre2_jit_compile.c

Lines changed: 122 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ typedef struct ref_iterator_backtrack {
321321
backtrack_common common;
322322
/* Next iteration. */
323323
struct sljit_label *matchingpath;
324+
BOOL possessive_or_exact;
324325
} ref_iterator_backtrack;
325326

326327
typedef struct recurse_entry {
@@ -1142,7 +1143,11 @@ cc += PRIV(OP_lengths)[*cc];
11421143
/* Although do_casefulcmp() uses only one local, the allocate_stack()
11431144
calls during the repeat destroy LOCAL1 variables. */
11441145
if (*cc >= OP_CRSTAR && *cc <= OP_CRPOSRANGE)
1146+
{
11451147
locals_size += 2 * SSIZE_OF(sw);
1148+
if (*cc >= OP_CRPOSRANGE && GET2(cc, 1 + IMM2_SIZE + 1) != GET2(cc, 1 + IMM2_SIZE + 1 + IMM2_SIZE))
1149+
locals_size += SSIZE_OF(sw);
1150+
}
11461151

11471152
return (current_locals_size >= locals_size) ? current_locals_size : locals_size;
11481153
}
@@ -8390,9 +8395,10 @@ int offset = 0;
83908395
struct sljit_label *label;
83918396
struct sljit_jump *zerolength;
83928397
struct sljit_jump *jump = NULL;
8398+
jump_list *match_failed = NULL;
83938399
PCRE2_SPTR ccbegin = cc;
83948400
int min = 0, max = 0;
8395-
BOOL minimize;
8401+
BOOL minimize, exact;
83968402

83978403
PUSH_BACKTRACK(sizeof(ref_iterator_backtrack), cc, NULL);
83988404

@@ -8415,38 +8421,137 @@ type = cc[1 + IMM2_SIZE];
84158421
SLJIT_COMPILE_ASSERT((OP_CRSTAR & 0x1) == 0, crstar_opcode_must_be_even);
84168422
/* Update ref_update_local_size() when this changes. */
84178423
SLJIT_ASSERT(local_start + 2 * SSIZE_OF(sw) <= (int)LOCAL0 + common->locals_size);
8418-
minimize = (type & 0x1) != 0;
8424+
minimize = FALSE;
8425+
exact = FALSE;
84198426
switch(type)
84208427
{
8421-
case OP_CRSTAR:
84228428
case OP_CRMINSTAR:
8429+
minimize = TRUE;
8430+
PCRE2_FALLTHROUGH /* Fall through */
8431+
case OP_CRSTAR:
8432+
case OP_CRPOSSTAR:
84238433
min = 0;
84248434
max = 0;
84258435
cc += 1 + IMM2_SIZE + 1;
84268436
break;
8427-
case OP_CRPLUS:
8437+
84288438
case OP_CRMINPLUS:
8439+
minimize = TRUE;
8440+
PCRE2_FALLTHROUGH /* Fall through */
8441+
case OP_CRPLUS:
8442+
case OP_CRPOSPLUS:
84298443
min = 1;
84308444
max = 0;
84318445
cc += 1 + IMM2_SIZE + 1;
84328446
break;
8433-
case OP_CRQUERY:
8447+
84348448
case OP_CRMINQUERY:
8449+
minimize = TRUE;
8450+
PCRE2_FALLTHROUGH /* Fall through */
8451+
case OP_CRQUERY:
8452+
case OP_CRPOSQUERY:
84358453
min = 0;
84368454
max = 1;
84378455
cc += 1 + IMM2_SIZE + 1;
84388456
break;
8439-
case OP_CRRANGE:
8457+
84408458
case OP_CRMINRANGE:
8459+
minimize = TRUE;
8460+
PCRE2_FALLTHROUGH /* Fall through */
8461+
case OP_CRRANGE:
8462+
case OP_CRPOSRANGE:
84418463
min = GET2(cc, 1 + IMM2_SIZE + 1);
84428464
max = GET2(cc, 1 + IMM2_SIZE + 1 + IMM2_SIZE);
8465+
SLJIT_ASSERT(min > 1 || max > 1);
8466+
if (min == max)
8467+
exact = TRUE;
84438468
cc += 1 + IMM2_SIZE + 1 + 2 * IMM2_SIZE;
84448469
break;
84458470
default:
84468471
SLJIT_UNREACHABLE();
84478472
break;
84488473
}
84498474

8475+
if (type >= OP_CRPOSSTAR || exact)
8476+
{
8477+
BACKTRACK_AS(ref_iterator_backtrack)->possessive_or_exact = TRUE;
8478+
if (ref)
8479+
{
8480+
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset));
8481+
if (min > 0 && !common->unset_backref)
8482+
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(1)));
8483+
zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset + 1));
8484+
}
8485+
else
8486+
{
8487+
compile_dnref_search(common, ccbegin, min > 0 ? &backtrack->own_backtracks : NULL);
8488+
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0);
8489+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start + SSIZE_OF(sw), TMP2, 0);
8490+
zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
8491+
}
8492+
8493+
if (exact)
8494+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, SLJIT_IMM, min);
8495+
else if (type != OP_CRPOSSTAR)
8496+
{
8497+
/* STR_PTR is NULL before reaching min. */
8498+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, min > 0 ? SLJIT_IMM : STR_PTR, 0);
8499+
if (type == OP_CRPOSRANGE)
8500+
{
8501+
SLJIT_ASSERT(local_start + 3 * SSIZE_OF(sw) <= (int)LOCAL0 + common->locals_size);
8502+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start + 2 * SSIZE_OF(sw), SLJIT_IMM, 0);
8503+
}
8504+
}
8505+
8506+
label = LABEL();
8507+
if (!ref)
8508+
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), local_start + SSIZE_OF(sw));
8509+
8510+
if (type == OP_CRPOSSTAR)
8511+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, STR_PTR, 0);
8512+
8513+
compile_ref_matchingpath(common, ccbegin, exact ? &backtrack->own_backtracks : &match_failed, FALSE, FALSE);
8514+
8515+
if (type == OP_CRPOSSTAR)
8516+
JUMPTO(SLJIT_JUMP, label);
8517+
else if (type == OP_CRPOSPLUS || type == OP_CRPOSQUERY)
8518+
{
8519+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, STR_PTR, 0);
8520+
if (type == OP_CRPOSPLUS)
8521+
JUMPTO(SLJIT_JUMP, label);
8522+
}
8523+
else if (exact)
8524+
{
8525+
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_MEM1(SLJIT_SP), local_start, SLJIT_MEM1(SLJIT_SP), local_start, SLJIT_IMM, 1);
8526+
JUMPTO(SLJIT_NOT_ZERO, label);
8527+
}
8528+
else
8529+
{
8530+
OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_SP), local_start + 2 * SSIZE_OF(sw), SLJIT_IMM, 1);
8531+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start + 2 * SSIZE_OF(sw), TMP1, 0);
8532+
if (min > 0)
8533+
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, min, label);
8534+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, STR_PTR, 0);
8535+
if (max == 0)
8536+
JUMPTO(SLJIT_JUMP, label);
8537+
else
8538+
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, max, label);
8539+
}
8540+
8541+
if (!exact)
8542+
{
8543+
set_jumps(match_failed, LABEL());
8544+
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), local_start);
8545+
8546+
if (type != OP_CRPOSSTAR)
8547+
add_jump(compiler, &backtrack->own_backtracks, CMP(SLJIT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0));
8548+
}
8549+
8550+
JUMPHERE(zerolength);
8551+
count_match(common);
8552+
return cc;
8553+
}
8554+
84508555
if (!minimize)
84518556
{
84528557
if (min == 0)
@@ -8458,6 +8563,7 @@ if (!minimize)
84588563
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), SLJIT_IMM, 0);
84598564
/* Temporary release of STR_PTR. */
84608565
OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw));
8566+
84618567
/* Handles both invalid and empty cases. Since the minimum repeat
84628568
is zero, the invalid case is basically the same as an empty case. */
84638569
if (ref)
@@ -8469,6 +8575,7 @@ if (!minimize)
84698575
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start + SSIZE_OF(sw), TMP2, 0);
84708576
zerolength = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_MEM1(TMP2), sizeof(sljit_sw));
84718577
}
8578+
84728579
/* Restore if not zero length. */
84738580
OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw));
84748581
}
@@ -8477,6 +8584,7 @@ if (!minimize)
84778584
allocate_stack(common, 1);
84788585
if (ref)
84798586
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(offset));
8587+
84808588
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), SLJIT_IMM, 0);
84818589

84828590
if (ref)
@@ -8504,11 +8612,11 @@ if (!minimize)
85048612

85058613
if (min > 1 || max > 1)
85068614
{
8507-
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), local_start);
8508-
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
8615+
OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_SP), local_start, SLJIT_IMM, 1);
85098616
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), local_start, TMP1, 0);
85108617
if (min > 1)
85118618
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, min, label);
8619+
85128620
if (max > 1)
85138621
{
85148622
jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, max);
@@ -12139,6 +12247,12 @@ PCRE2_UCHAR type;
1213912247

1214012248
type = cc[PRIV(OP_lengths)[*cc]];
1214112249

12250+
if (CURRENT_AS(ref_iterator_backtrack)->possessive_or_exact)
12251+
{
12252+
set_jumps(current->own_backtracks, LABEL());
12253+
return;
12254+
}
12255+
1214212256
if ((type & 0x1) == 0)
1214312257
{
1214412258
/* Maximize case. */

src/pcre2_match.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5284,6 +5284,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
52845284
case OP_CRMINPLUS:
52855285
case OP_CRQUERY:
52865286
case OP_CRMINQUERY:
5287+
case OP_CRPOSSTAR:
5288+
case OP_CRPOSPLUS:
5289+
case OP_CRPOSQUERY:
52875290
fc = *Fecode++ - OP_CRSTAR;
52885291
Lmin = rep_min[fc];
52895292
Lmax = rep_max[fc];
@@ -5292,6 +5295,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
52925295

52935296
case OP_CRRANGE:
52945297
case OP_CRMINRANGE:
5298+
case OP_CRPOSRANGE:
52955299
Lmin = GET2(Fecode, 1);
52965300
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
52975301
reptype = rep_typ[*Fecode - OP_CRSTAR];
@@ -5403,6 +5407,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
54035407
Feptr += slength;
54045408
}
54055409

5410+
/* No recursion if the repeat type is possessive. */
5411+
if (reptype == REPTYPE_POS) break;
5412+
54065413
/* If the length matched for each repetition is the same as the length of
54075414
the captured group, we can easily work backwards. This is the normal
54085415
case. However, in caseless UTF-8 mode there are pairs of case-equivalent

testdata/testinput2

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7640,6 +7640,47 @@ a)"xI
76407640

76417641
# --------------
76427642

7643+
# Test backreferences with possessive repeats
7644+
7645+
/(..)\1*+/B
7646+
ab
7647+
abc
7648+
abababc
7649+
7650+
/(..)\1++/B
7651+
abc
7652+
ababc
7653+
aababab
7654+
aabababc
7655+
7656+
/(..)\1?+/B
7657+
aa
7658+
aac
7659+
ababab
7660+
7661+
# The + is intentionally missing,
7662+
# exact repeat is always possessive
7663+
/(..)\1{3}/B
7664+
abababac
7665+
abababab
7666+
ababababc
7667+
7668+
/(..)\1{2,}+/B
7669+
ababac
7670+
ababab
7671+
abababab
7672+
ababababab
7673+
abababababc
7674+
7675+
/(..)\1{3,5}+/B
7676+
abababac
7677+
abababab
7678+
ababababab
7679+
abababababab
7680+
ababababababab
7681+
7682+
# --------------
7683+
76437684
# Attempt at full coverage of the substitution buffer-management code - not
76447685
# just covering each line in each macro, but covering each instantiation of each
76457686
# line in those macros.

0 commit comments

Comments
 (0)