Skip to content

Commit 13e2298

Browse files
committed
Narrow regex digit sequences to numeric-string instead of decimal-int-string when a leading zero is possible
- `RegexGroupParser` previously narrowed any pure digit sequence (e.g. `\d+`, `[0-9]+`, `\d{2}`) to `decimal-int-string`, but those match leading-zero strings like "007" (and `-?\d+` matches "-0"), which stay strings as array keys and are therefore not canonical decimal integers. - Track, while walking the regex group AST, whether a leading zero can be followed by more digits or whether a negative zero is possible (`RegexGroupWalkResult::isDecimalIntegerLeadingZeroSafe()`). A digit position is analysed for whether it can be "0", whether it is mandatory, and whether it may repeat; classes are analysed as a single character position via `getClassDecimalInfo()`. - Only emit `AccessoryDecimalIntegerStringType` when the match is guaranteed canonical (single digit `\d`, leading non-zero digit class like `[1-9]\d*`, literal "0", decimal alternations like `(?:0|-?[1-9][0-9]*)`); otherwise emit `AccessoryNumericStringType`. This applies to both the subject base type and capturing-group types. - Updated the existing regex shape tests whose `\d+`/`\d{n}`/`-?[0-9]+` patterns now correctly infer `numeric-string`.
1 parent df6acee commit 13e2298

12 files changed

Lines changed: 614 additions & 139 deletions

src/Type/Regex/RegexGroupParser.php

Lines changed: 229 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
use PHPStan\Type\Accessory\AccessoryDecimalIntegerStringType;
1717
use PHPStan\Type\Accessory\AccessoryNonEmptyStringType;
1818
use PHPStan\Type\Accessory\AccessoryNonFalsyStringType;
19+
use PHPStan\Type\Accessory\AccessoryNumericStringType;
1920
use PHPStan\Type\Constant\ConstantStringType;
2021
use PHPStan\Type\IntersectionType;
2122
use PHPStan\Type\StringType;
@@ -25,6 +26,7 @@
2526
use function array_key_exists;
2627
use function array_values;
2728
use function count;
29+
use function ctype_digit;
2830
use function in_array;
2931
use function is_int;
3032
use function preg_replace;
@@ -129,8 +131,11 @@ public function parseGroups(string $regex): ?RegexAstWalkResult
129131
$subjectAsGroupResult->isDecimalInteger()->yes()
130132
&& $this->regexExpressionHelper->isAnchoredPattern($regex)
131133
) {
134+
$accessory = $subjectAsGroupResult->isDecimalIntegerLeadingZeroSafe()
135+
? new AccessoryDecimalIntegerStringType()
136+
: new AccessoryNumericStringType();
132137
$astWalkResult = $astWalkResult->withSubjectBaseType(
133-
new IntersectionType([new StringType(), new AccessoryDecimalIntegerStringType()]),
138+
new IntersectionType([new StringType(), $accessory]),
134139
);
135140
} elseif ($subjectAsGroupResult->isNonFalsy()->yes()) {
136141
$astWalkResult = $astWalkResult->withSubjectBaseType(
@@ -427,15 +432,21 @@ private function createGroupType(TreeNode $group, bool $maybeConstant, string $p
427432
}
428433

429434
if ($walkResult->isDecimalInteger()->yes()) {
435+
// a series of digits beginning with "0" (e.g. "007") or a "-0" is not a canonical
436+
// decimal integer string, but it is still a numeric string
437+
$accessory = $walkResult->isDecimalIntegerLeadingZeroSafe()
438+
? new AccessoryDecimalIntegerStringType()
439+
: new AccessoryNumericStringType();
440+
430441
if ($walkResult->isNonFalsy()->yes()) {
431442
return new IntersectionType([
432443
new StringType(),
433-
new AccessoryDecimalIntegerStringType(),
444+
$accessory,
434445
new AccessoryNonFalsyStringType(),
435446
]);
436447
}
437448

438-
$result = new IntersectionType([new StringType(), new AccessoryDecimalIntegerStringType()]);
449+
$result = new IntersectionType([new StringType(), $accessory]);
439450
if (!$walkResult->isNonEmpty()->yes()) {
440451
return new UnionType([new ConstantStringType(''), $result]);
441452
}
@@ -517,7 +528,7 @@ private function walkGroupAst(
517528
}
518529
}
519530
} elseif ($ast->getId() === '#quantification') {
520-
[$min] = $this->getQuantificationRange($ast);
531+
[$min, $max] = $this->getQuantificationRange($ast);
521532

522533
if ($min === 0) {
523534
$walkResult = $walkResult->inOptionalQuantification(true);
@@ -532,39 +543,77 @@ private function walkGroupAst(
532543
}
533544
}
534545

535-
$walkResult = $walkResult->onlyLiterals(null);
536-
} elseif ($ast->getId() === '#class' && $walkResult->getOnlyLiterals() !== null) {
546+
// signal the quantified atom whether it may appear more than once
547+
// (so a leading zero may be followed by more digits) and whether it
548+
// is optional, which is consumed when the atom is processed.
549+
$walkResult = $walkResult
550+
->decimalAtomRepeats($max === null || $max >= 2)
551+
->decimalAtomOptional($min === 0)
552+
->onlyLiterals(null);
553+
} elseif (in_array($ast->getId(), ['#class', '#negativeclass'], true)) {
537554
$inClass = true;
538555

539-
$newLiterals = [];
540-
foreach ($children as $child) {
541-
$oldLiterals = $walkResult->getOnlyLiterals();
556+
[$atomRepeats, $atomOptional, $walkResult] = $this->consumeDecimalAtomQuantification($walkResult);
542557

543-
$this->getLiteralValue($child, $oldLiterals, true, $patternModifiers, true);
544-
foreach ($oldLiterals ?? [] as $oldLiteral) {
545-
$newLiterals[] = $oldLiteral;
558+
[$classAllDigit, $classCanBeZero] = $ast->getId() === '#class'
559+
? $this->getClassDecimalInfo($ast)
560+
: [false, false];
561+
562+
if ($classAllDigit) {
563+
if ($walkResult->isDecimalInteger()->maybe()) {
564+
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createYes());
565+
}
566+
$walkResult = $this->applyDecimalDigitPosition($walkResult, $classCanBeZero, !$atomOptional, $atomRepeats);
567+
} else {
568+
// [^0-9] should not parse as decimal-int-string, and [^list-everything-but-numbers] is
569+
// technically doable but really silly compared to just \d so we can safely assume the string
570+
// is not a decimal integer for negative classes (and classes containing non-digits).
571+
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createNo());
572+
}
573+
574+
if ($ast->getId() === '#class' && $walkResult->getOnlyLiterals() !== null) {
575+
$newLiterals = [];
576+
foreach ($children as $child) {
577+
$oldLiterals = $walkResult->getOnlyLiterals();
578+
579+
$this->getLiteralValue($child, $oldLiterals, true, $patternModifiers, true);
580+
foreach ($oldLiterals ?? [] as $oldLiteral) {
581+
$newLiterals[] = $oldLiteral;
582+
}
546583
}
584+
$walkResult = $walkResult->onlyLiterals($newLiterals);
585+
} else {
586+
$walkResult = $walkResult->onlyLiterals(null);
547587
}
548-
$walkResult = $walkResult->onlyLiterals($newLiterals);
549588
} elseif ($ast->getId() === 'token') {
550589
$onlyLiterals = $walkResult->getOnlyLiterals();
551590
$literalValue = $this->getLiteralValue($ast, $onlyLiterals, !$inClass, $patternModifiers, false);
552591
$walkResult = $walkResult->onlyLiterals($onlyLiterals);
553592

554593
if ($literalValue !== null) {
555-
if (Strings::match($literalValue, '/^\d+$/') !== null) {
556-
if ($walkResult->isDecimalInteger()->maybe()) {
557-
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createYes());
594+
if (!$inClass && $literalValue !== '') {
595+
[$atomRepeats, $atomOptional, $walkResult] = $this->consumeDecimalAtomQuantification($walkResult);
596+
597+
if (Strings::match($literalValue, '/^\d+$/') !== null) {
598+
if ($walkResult->isDecimalInteger()->maybe()) {
599+
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createYes());
600+
}
601+
$walkResult = $this->applyDecimalDigitPosition(
602+
$walkResult,
603+
$literalValue[0] === '0',
604+
!$atomOptional,
605+
$atomRepeats || strlen($literalValue) > 1,
606+
);
607+
} elseif (
608+
$literalValue === '-'
609+
&& $walkResult->isDecimalInteger()->maybe()
610+
&& !$walkResult->hasSeenDecimalIntegerSign()
611+
) {
612+
// a single leading minus sign keeps the string a decimal integer (e.g. "-1")
613+
$walkResult = $walkResult->seenDecimalIntegerSign(true);
614+
} else {
615+
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createNo());
558616
}
559-
} elseif (
560-
$literalValue === '-'
561-
&& $walkResult->isDecimalInteger()->maybe()
562-
&& !$walkResult->hasSeenDecimalIntegerSign()
563-
) {
564-
// a single leading minus sign keeps the string a decimal integer (e.g. "-1")
565-
$walkResult = $walkResult->seenDecimalIntegerSign(true);
566-
} elseif ($literalValue !== '') {
567-
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createNo());
568617
}
569618

570619
if (!$walkResult->isInOptionalQuantification() && $literalValue !== '') {
@@ -584,6 +633,10 @@ private function walkGroupAst(
584633
$nonEmpty = TrinaryLogic::createYes();
585634
$nonFalsy = TrinaryLogic::createYes();
586635
$decimalInteger = TrinaryLogic::createYes();
636+
$branchBad = false;
637+
$branchLeadCanBeZero = false;
638+
$branchResolved = true;
639+
$branchSeenDigit = false;
587640
foreach ($children as $child) {
588641
$childResult = $this->walkGroupAst(
589642
$child,
@@ -593,12 +646,20 @@ private function walkGroupAst(
593646
->nonEmpty(TrinaryLogic::createMaybe())
594647
->nonFalsy(TrinaryLogic::createMaybe())
595648
->decimalInteger(TrinaryLogic::createMaybe())
596-
->seenDecimalIntegerSign(false),
649+
->seenDecimalIntegerSign(false)
650+
->decimalLeadingResolved(false)
651+
->decimalSeenDigit(false)
652+
->decimalLeadCanBeZero(false)
653+
->decimalBad(false),
597654
);
598655

599656
$nonEmpty = $nonEmpty->and($childResult->isNonEmpty());
600657
$nonFalsy = $nonFalsy->and($childResult->isNonFalsy());
601658
$decimalInteger = $decimalInteger->and($childResult->isDecimalInteger());
659+
$branchBad = $branchBad || !$childResult->isDecimalIntegerLeadingZeroSafe();
660+
$branchLeadCanBeZero = $branchLeadCanBeZero || $childResult->isDecimalLeadCanBeZero();
661+
$branchResolved = $branchResolved && $childResult->isDecimalLeadingResolved();
662+
$branchSeenDigit = $branchSeenDigit || $childResult->hasDecimalSeenDigit();
602663

603664
if ($newLiterals === null) {
604665
continue;
@@ -613,18 +674,26 @@ private function walkGroupAst(
613674
}
614675
}
615676

677+
// the alternation is a single conceptual digit position: it is unsafe if any
678+
// branch is internally unsafe, or if a preceding zero-able lead now gets more digits
679+
$mergedBad = $walkResult->isDecimalBad()
680+
|| $branchBad
681+
|| ($walkResult->hasDecimalSeenDigit() && $walkResult->isDecimalLeadCanBeZero() && $branchSeenDigit);
682+
$mergedLeadCanBeZero = $walkResult->isDecimalLeadingResolved()
683+
? $walkResult->isDecimalLeadCanBeZero()
684+
: ($walkResult->isDecimalLeadCanBeZero() || $branchLeadCanBeZero);
685+
616686
return $walkResult
617687
->onlyLiterals($newLiterals)
618688
->nonEmpty($walkResult->isNonEmpty()->or($nonEmpty))
619689
->nonFalsy($walkResult->isNonFalsy()->or($nonFalsy))
620-
->decimalInteger(TrinaryLogic::maxMin($walkResult->isDecimalInteger(), $decimalInteger));
621-
}
622-
623-
// [^0-9] should not parse as decimal-int-string, and [^list-everything-but-numbers] is technically
624-
// doable but really silly compared to just \d so we can safely assume the string is not a decimal
625-
// integer for negative classes
626-
if ($ast->getId() === '#negativeclass') {
627-
$walkResult = $walkResult->decimalInteger(TrinaryLogic::createNo());
690+
->decimalInteger(TrinaryLogic::maxMin($walkResult->isDecimalInteger(), $decimalInteger))
691+
->decimalLeadingResolved($walkResult->isDecimalLeadingResolved() || $branchResolved)
692+
->decimalSeenDigit($walkResult->hasDecimalSeenDigit() || $branchSeenDigit)
693+
->decimalLeadCanBeZero($mergedLeadCanBeZero)
694+
->decimalBad($mergedBad)
695+
->decimalAtomRepeats(false)
696+
->decimalAtomOptional(false);
628697
}
629698

630699
foreach ($children as $child) {
@@ -639,6 +708,132 @@ private function walkGroupAst(
639708
return $walkResult;
640709
}
641710

711+
/**
712+
* Reads and clears the transient quantification flags set on the walk result
713+
* for the next digit-producing atom.
714+
*
715+
* @return array{bool, bool, RegexGroupWalkResult} [repeats, optional, walkResult]
716+
*/
717+
private function consumeDecimalAtomQuantification(RegexGroupWalkResult $walkResult): array
718+
{
719+
return [
720+
$walkResult->isDecimalAtomRepeats(),
721+
$walkResult->isDecimalAtomOptional(),
722+
$walkResult->decimalAtomRepeats(false)->decimalAtomOptional(false),
723+
];
724+
}
725+
726+
/**
727+
* Tracks one digit character position to detect whether a leading zero can be
728+
* followed by more digits (which would not be a canonical decimal integer).
729+
*
730+
* @param bool $canBeZero whether this digit can be "0"
731+
* @param bool $mandatory whether this digit is always present (not optional)
732+
* @param bool $repeats whether this digit may appear more than once in a row
733+
*/
734+
private function applyDecimalDigitPosition(RegexGroupWalkResult $walkResult, bool $canBeZero, bool $mandatory, bool $repeats): RegexGroupWalkResult
735+
{
736+
$leadingResolved = $walkResult->isDecimalLeadingResolved();
737+
$leadCanBeZero = $walkResult->isDecimalLeadCanBeZero();
738+
$bad = $walkResult->isDecimalBad();
739+
740+
// a digit appears after another digit position: if the lead can be a zero,
741+
// the value is a leading-zero string like "00"
742+
if ($walkResult->hasDecimalSeenDigit() && $leadCanBeZero) {
743+
$bad = true;
744+
}
745+
746+
// while the leading digit is not pinned down yet (only optional digits seen
747+
// so far), this digit may be the leading one
748+
if (!$leadingResolved && $canBeZero) {
749+
$leadCanBeZero = true;
750+
}
751+
752+
// a single quantified digit repeated produces a leading-zero string like "00"
753+
if ($repeats && $leadCanBeZero) {
754+
$bad = true;
755+
}
756+
757+
if ($mandatory) {
758+
$leadingResolved = true;
759+
}
760+
761+
return $walkResult
762+
->decimalSeenDigit(true)
763+
->decimalLeadingResolved($leadingResolved)
764+
->decimalLeadCanBeZero($leadCanBeZero)
765+
->decimalBad($bad);
766+
}
767+
768+
/**
769+
* @return array{bool, bool} [allDigit, canBeZero]
770+
*/
771+
private function getClassDecimalInfo(TreeNode $classNode): array
772+
{
773+
$allDigit = true;
774+
$canBeZero = false;
775+
776+
foreach ($classNode->getChildren() as $child) {
777+
if ($child->getId() === '#range') {
778+
$bounds = $child->getChildren();
779+
$from = $this->getClassBoundChar($bounds[0] ?? null);
780+
$to = $this->getClassBoundChar($bounds[1] ?? null);
781+
782+
if ($from === null || $to === null || $from > $to || ctype_digit($from) === false || ctype_digit($to) === false) {
783+
$allDigit = false;
784+
} elseif ($from <= '0' && '0' <= $to) {
785+
$canBeZero = true;
786+
}
787+
788+
continue;
789+
}
790+
791+
if ($child->getId() === 'token') {
792+
$token = $child->getValueToken();
793+
$value = $child->getValueValue();
794+
795+
if ($token === 'character_type' && $value === '\d') {
796+
$canBeZero = true;
797+
continue;
798+
}
799+
800+
if ($token === 'posix_class' && $value === '[:digit:]') {
801+
$canBeZero = true;
802+
continue;
803+
}
804+
805+
if (
806+
in_array($token, ['literal', 'range', 'class_', '_class'], true)
807+
&& strlen($value) === 1
808+
&& ctype_digit($value)
809+
) {
810+
if ($value === '0') {
811+
$canBeZero = true;
812+
}
813+
continue;
814+
}
815+
}
816+
817+
$allDigit = false;
818+
}
819+
820+
return [$allDigit, $canBeZero];
821+
}
822+
823+
private function getClassBoundChar(?TreeNode $node): ?string
824+
{
825+
if ($node === null || $node->getId() !== 'token') {
826+
return null;
827+
}
828+
829+
$value = $node->getValueValue();
830+
if (strlen($value) > 1 && $value[0] === '\\') {
831+
$value = substr($value, 1) ?: '';
832+
}
833+
834+
return strlen($value) === 1 ? $value : null;
835+
}
836+
642837
private function isMaybeEmptyNode(TreeNode $node, string $patternModifiers, bool &$isNonFalsy, bool &$isNonDecimal): bool
643838
{
644839
if ($node->getId() === '#quantification') {

0 commit comments

Comments
 (0)