Skip to content

Commit 0e6b1ec

Browse files
committed
Enumerate constant strings from bounded regex quantifications (a?, a{n}, a{n,m})
- In `RegexGroupParser::walkGroupAst()`, a `#quantification` node previously always discarded the accumulated literals (`onlyLiterals(null)`), so any quantified atom degraded a constant-string group to `non-falsy-string`.
- Add `getQuantifiedLiterals()`: for a bounded quantification over constant literals, it walks the quantified atom standalone, enumerates the repetitions via `repeatLiterals()` (`a?` => `'a'|''`, `a{2}` => `'aa'`, `a{1,2}` => `'a'|'aa'`), and cross-combines the result with the literals accumulated so far. Unbounded quantifiers (`*`, `+`, `{n,}`) and non-literal atoms keep returning `null`, so the group stays non-constant.
- Reset `inOptionalQuantification` while walking the quantified atom, so that a multi-token concatenation inside an optional group (e.g. `(a(bc)?d)`) keeps accumulating its literals instead of being nulled.
- Add a named `LITERALS_LIMIT` constant to bail out to a plain string type instead of exploding on deeply nested optional/bounded quantifications.
- Update the `preg_match_shapes.php` expectations that now infer precise constant unions (`(a|bc?)` => `'a'|'b'|'bc'`, `(a(b)?)` => `'a'|'ab'`, etc.).
1 parent b9cc732 commit 0e6b1ec

3 files changed

Lines changed: 173 additions & 6 deletions

File tree

src/Type/Regex/RegexGroupParser.php

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
use PHPStan\Type\TypeCombinator;
2424
use PHPStan\Type\UnionType;
2525
use function array_key_exists;
26+
use function array_pop;
2627
use function array_values;
2728
use function count;
2829
use function in_array;
@@ -44,6 +45,10 @@ final class RegexGroupParser
4445
'J', // rare modifier too complicated to support
4546
];
4647

48+
// upper bound on the number of constant string literals enumerated from a group,
49+
// to avoid combinatorial explosion from nested optional/bounded quantifications
50+
private const LITERALS_LIMIT = 100;
51+
4752
private static ?Parser $parser = null;
4853

4954
/** @var array<string, ?TreeNode> */
@@ -473,6 +478,7 @@ private function walkGroupAst(
473478
): RegexGroupWalkResult
474479
{
475480
$children = $ast->getChildren();
481+
$quantifiedLiterals = null;
476482

477483
if (
478484
$ast->getId() === '#concatenation'
@@ -506,7 +512,7 @@ private function walkGroupAst(
506512
}
507513
}
508514
} elseif ($ast->getId() === '#quantification') {
509-
[$min] = $this->getQuantificationRange($ast);
515+
[$min, $max] = $this->getQuantificationRange($ast);
510516

511517
if ($min === 0) {
512518
$walkResult = $walkResult->inOptionalQuantification(true);
@@ -521,6 +527,10 @@ private function walkGroupAst(
521527
}
522528
}
523529

530+
// "a?" yields 'a'|'', "a{1,2}" yields 'a'|'aa', etc. so a bounded quantification
531+
// over constant literals can be combined with the surrounding literals
532+
$quantifiedLiterals = $this->getQuantifiedLiterals($ast, $min, $max, $inClass, $patternModifiers, $walkResult);
533+
524534
$walkResult = $walkResult->onlyLiterals(null);
525535
} elseif ($ast->getId() === '#class' && $walkResult->getOnlyLiterals() !== null) {
526536
$inClass = true;
@@ -626,9 +636,108 @@ private function walkGroupAst(
626636
);
627637
}
628638

639+
if ($ast->getId() === '#quantification') {
640+
// the bottom walk above nulls the literals via the quantifier token,
641+
// so restore the literals enumerated up-front for bounded quantifications
642+
$walkResult = $walkResult->onlyLiterals($quantifiedLiterals);
643+
}
644+
629645
return $walkResult;
630646
}
631647

648+
/**
 * Enumerate the constant strings a bounded quantification (like "a?", "a{2}", "a{1,3}")
 * produces, each one appended to every literal accumulated so far.
 *
 * Returns null when the result cannot be enumerated:
 *  - the literals accumulated so far are already unknown (null),
 *  - $min or $max is null (presumably an unbounded or unparsable range),
 *  - the quantified atom does not resolve to constant literals,
 *  - the atom contributes no literal at all while a prefix already exists,
 *  - or more than LITERALS_LIMIT combinations would be produced.
 *
 * @param TreeNode $ast the "#quantification" node; its last child is dropped as the quantifier token
 * @param RegexGroupWalkResult $walkResult walk state before the quantification; it is not modified
 * @return array<string>|null
 */
private function getQuantifiedLiterals(TreeNode $ast, ?int $min, ?int $max, bool $inClass, string $patternModifiers, RegexGroupWalkResult $walkResult): ?array
{
	$prefixLiterals = $walkResult->getOnlyLiterals();
	if ($prefixLiterals === null || $min === null || $max === null) {
		return null;
	}

	// walk the quantified atom standalone (everything but the trailing quantifier token);
	// the atom itself is not optional, so reset the flag to let concatenations accumulate literals
	$atomChildren = $ast->getChildren();
	array_pop($atomChildren);

	$atomResult = $walkResult->onlyLiterals([])->inOptionalQuantification(false);
	foreach ($atomChildren as $atomChild) {
		$atomResult = $this->walkGroupAst($atomChild, $inClass, $patternModifiers, $atomResult);
	}

	$atomLiterals = $atomResult->getOnlyLiterals();
	if ($atomLiterals === null) {
		return null;
	}

	$repeatedLiterals = $this->repeatLiterals($atomLiterals, $min, $max);
	if ($repeatedLiterals === null) {
		return null;
	}

	if ($repeatedLiterals === [] && $prefixLiterals !== []) {
		// An empty list means "no literals accumulated yet" (see the $prefixLiterals === [] branch
		// below). Returning it here would silently throw away the existing prefix and let the
		// following literals start from scratch, so give up on a constant type instead.
		return null;
	}

	$newLiterals = [];
	foreach ($repeatedLiterals as $repeatedLiteral) {
		if ($prefixLiterals === []) {
			// nothing in front of the quantification: the repetitions are the literals
			$newLiterals[] = $repeatedLiteral;
		} else {
			foreach ($prefixLiterals as $prefixLiteral) {
				$newLiterals[] = $prefixLiteral . $repeatedLiteral;
			}
		}

		// checked once per repetition, so the list may briefly exceed the limit by one prefix batch
		if (count($newLiterals) > self::LITERALS_LIMIT) {
			return null;
		}
	}

	return $newLiterals;
}
699+
700+
/**
 * Build every distinct string obtained by concatenating between $min and $max
 * items picked from $literals (e.g. ['a'] with 1..2 gives ['a', 'aa']).
 *
 * The empty string is part of the result only when $min is 0. Null is returned as soon as
 * either the strings of the current length or the distinct results collected so far
 * exceed LITERALS_LIMIT.
 *
 * @param array<string> $literals
 * @return array<string>|null
 */
private function repeatLiterals(array $literals, int $min, int $max): ?array
{
	// keyed by the string itself so duplicates collapse; zero repetitions contribute ''
	$unique = $min === 0 ? ['' => ''] : [];

	// all concatenations of exactly $length literals, starting from the empty concatenation
	$ofCurrentLength = [''];
	$length = 0;
	while (++$length <= $max) {
		$extended = [];
		foreach ($ofCurrentLength as $head) {
			foreach ($literals as $literal) {
				$extended[] = $head . $literal;
			}

			if (count($extended) > self::LITERALS_LIMIT) {
				return null;
			}
		}
		$ofCurrentLength = $extended;

		// repetition counts below the minimum only serve as prefixes for longer ones
		if ($length >= $min) {
			foreach ($ofCurrentLength as $candidate) {
				$unique[$candidate] = $candidate;
			}

			if (count($unique) > self::LITERALS_LIMIT) {
				return null;
			}
		}
	}

	// return the stored values (not the keys) as a list, so numeric-looking strings stay strings
	return array_values($unique);
}
740+
632741
private function isMaybeEmptyNode(TreeNode $node, string $patternModifiers, bool &$isNonFalsy): bool
633742
{
634743
if ($node->getId() === '#quantification') {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
<?php declare(strict_types = 1);
2+
3+
namespace Bug14820;
4+
5+
use function PHPStan\Testing\assertType;
6+
7+
/**
 * Type-inference fixture: every preg_match() below uses a single outer capturing group that
 * contains an optional or counted quantifier, and the assertType() call pins the array shape
 * expected for $m inside the matching branch.
 */
function doFoo(string $s): void
8+
{
9+
// a single optional literal yields the literal or the empty string,
10+
// combined with the surrounding literals
11+
if (preg_match('~(ab?)~', $s, $m)) {
12+
assertType("array{non-falsy-string, 'a'|'ab'}", $m);
13+
}
14+

15+
// optional literal in the middle: literals on both sides are kept
if (preg_match('~(ab?c)~', $s, $m)) {
16+
assertType("array{non-falsy-string, 'abc'|'ac'}", $m);
17+
}
18+

19+
// optional in front
20+
if (preg_match('~(a?bc)~', $s, $m)) {
21+
assertType("array{non-falsy-string, 'abc'|'bc'}", $m);
22+
}
23+

24+
// two optionals combine into the full cross-product
// (including the empty string, so the whole match is only a plain string)
25+
if (preg_match('~(a?b?)~', $s, $m)) {
26+
assertType("array{string, ''|'a'|'ab'|'b'}", $m);
27+
}
28+

29+
// optional over a (sub) group of literals
// (the nested group itself becomes an optional key 2)
30+
if (preg_match('~(a(bc)?d)~', $s, $m)) {
31+
assertType("array{0: non-falsy-string, 1: 'abcd'|'ad', 2?: 'bc'}", $m);
32+
}
33+

34+
// optional over an alternation
35+
if (preg_match('~(a(b|c)?d)~', $s, $m)) {
36+
assertType("array{0: non-falsy-string, 1: 'abd'|'acd'|'ad', 2?: 'b'|'c'}", $m);
37+
}
38+

39+
// the classic colour/color example
40+
if (preg_match('~(colou?r)~', $s, $m)) {
41+
assertType("array{non-falsy-string, 'color'|'colour'}", $m);
42+
}
43+

44+
// exactly-n repetition of a literal
45+
if (preg_match('~(ab{2}c)~', $s, $m)) {
46+
assertType("array{non-falsy-string, 'abbc'}", $m);
47+
}
48+

49+
// n-to-m repetition of a literal
50+
if (preg_match('~(ab{1,2}c)~', $s, $m)) {
51+
assertType("array{non-falsy-string, 'abbc'|'abc'}", $m);
52+
}
53+

54+
// unbounded repetition stays non-constant
55+
if (preg_match('~(ab*c)~', $s, $m)) {
56+
assertType('array{non-falsy-string, non-falsy-string}', $m);
57+
}
58+
}

tests/PHPStan/Analyser/nsrt/preg_match_shapes.php

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,10 +236,10 @@ function doFoo(string $row): void
236236
assertType("array{non-falsy-string, 'ab', 'b'}", $matches);
237237
}
238238
if (preg_match('~^(a(b)?)$~', $row, $matches) === 1) {
239-
assertType("array{0: non-falsy-string, 1: non-falsy-string, 2?: 'b'}", $matches);
239+
assertType("array{0: non-falsy-string, 1: 'a'|'ab', 2?: 'b'}", $matches);
240240
}
241241
if (preg_match('~^(a(b)?)?$~', $row, $matches) === 1) {
242-
assertType("list{0: string, 1?: non-falsy-string, 2?: 'b'}", $matches);
242+
assertType("list{0: string, 1?: 'a'|'ab', 2?: 'b'}", $matches);
243243
}
244244
}
245245

@@ -612,19 +612,19 @@ function (string $s): void {
612612

613613
function (string $s): void {
614614
if (preg_match('/Price: (a|bc?)/', $s, $matches)) {
615-
assertType("array{non-falsy-string, non-falsy-string}", $matches);
615+
assertType("array{non-falsy-string, 'a'|'b'|'bc'}", $matches);
616616
}
617617
};
618618

619619
function (string $s): void {
620620
if (preg_match('/Price: (?<named>a|bc?)/', $s, $matches)) {
621-
assertType("array{0: non-falsy-string, named: non-falsy-string, 1: non-falsy-string}", $matches);
621+
assertType("array{0: non-falsy-string, named: 'a'|'b'|'bc', 1: 'a'|'b'|'bc'}", $matches);
622622
}
623623
};
624624

625625
function (string $s): void {
626626
if (preg_match('/Price: (a|0c?)/', $s, $matches)) {
627-
assertType("array{non-falsy-string, non-empty-string}", $matches);
627+
assertType("array{non-falsy-string, '0'|'0c'|'a'}", $matches);
628628
}
629629
};
630630

0 commit comments

Comments
 (0)