Skip to content

Commit 6297149

Browse files
committed
Refactor the Lexer to use the new Dialect API
When the Lexer is configured with a dialect provider, an invalid language tag in a parsed file now causes a failure instead of silently falling back to English keywords, matching the cucumber behavior. When the Lexer is configured with a KeywordsInterface, the existing silent fallback to English is preserved (as the Keywords implementation does that internally).
1 parent 365f330 commit 6297149

File tree

5 files changed

+185
-95
lines changed

5 files changed

+185
-95
lines changed

src/Lexer.php

Lines changed: 177 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
namespace Behat\Gherkin;
1212

13+
use Behat\Gherkin\Dialect\DialectProviderInterface;
14+
use Behat\Gherkin\Dialect\GherkinDialect;
15+
use Behat\Gherkin\Dialect\KeywordsDialectProvider;
1316
use Behat\Gherkin\Exception\LexerException;
1417
use Behat\Gherkin\Keywords\KeywordsInterface;
1518
use LogicException;
@@ -31,7 +34,8 @@ class Lexer
3134
* @see https://github.com/cucumber/gherkin/blob/679a87e21263699c15ea635159c6cda60f64af3b/php/src/StringGherkinLine.php#L14
3235
*/
3336
private const CELL_PATTERN = '/(?<!\\\\)(?:\\\\{2})*\K\\|/u';
34-
private string $language;
37+
private readonly DialectProviderInterface $dialectProvider;
38+
private GherkinDialect $currentDialect;
3539
/**
3640
* @var list<string>
3741
*/
@@ -42,13 +46,11 @@ class Lexer
4246
private int $lineNumber;
4347
private bool $eos;
4448
/**
45-
* @var array<string, string>
46-
*/
47-
private array $keywordsCache = [];
48-
/**
49-
* @var array<string, list<string>>
49+
* A cache of keyword types associated with each keyword.
50+
*
51+
* @var array<string, non-empty-list<string>>|null
5052
*/
51-
private array $stepKeywordTypesCache = [];
53+
private ?array $stepKeywordTypesCache = null;
5254
/**
5355
* @phpstan-var list<TToken>
5456
*/
@@ -67,8 +69,14 @@ class Lexer
6769
private ?string $pyStringDelimiter = null;
6870

6971
public function __construct(
70-
private readonly KeywordsInterface $keywords,
72+
DialectProviderInterface|KeywordsInterface $dialectProvider,
7173
) {
74+
if ($dialectProvider instanceof KeywordsInterface) {
75+
// TODO trigger deprecation
76+
$dialectProvider = new KeywordsDialectProvider($dialectProvider);
77+
}
78+
79+
$this->dialectProvider = $dialectProvider;
7280
}
7381

7482
/**
@@ -108,9 +116,21 @@ public function analyse($input, $language = 'en')
108116
$this->allowMultilineArguments = false;
109117
$this->allowSteps = false;
110118

111-
$this->setLanguage($language);
119+
if (\func_num_args() > 1) {
120+
// @codeCoverageIgnoreStart
121+
\assert($language !== '');
122+
// TODO trigger deprecation (the Parser does not use this code path)
123+
$this->setLanguage($language);
124+
// @codeCoverageIgnoreEnd
125+
} else {
126+
$this->currentDialect = $this->dialectProvider->getDefaultDialect();
127+
$this->stepKeywordTypesCache = null;
128+
}
112129
}
113130

131+
/**
132+
* @param non-empty-string $language
133+
*/
114134
private function setLanguage(string $language): void
115135
{
116136
if (($this->stashedToken !== null) || ($this->deferredObjects !== [])) {
@@ -129,9 +149,8 @@ private function setLanguage(string $language): void
129149
// @codeCoverageIgnoreEnd
130150
}
131151

132-
$this->keywords->setLanguage($this->language = $language);
133-
$this->keywordsCache = [];
134-
$this->stepKeywordTypesCache = [];
152+
$this->currentDialect = $this->dialectProvider->getDialect($language);
153+
$this->stepKeywordTypesCache = null;
135154
}
136155

137156
/**
@@ -141,7 +160,7 @@ private function setLanguage(string $language): void
141160
*/
142161
public function getLanguage()
143162
{
144-
return $this->language;
163+
return $this->currentDialect->getLanguage();
145164
}
146165

147166
/**
@@ -343,9 +362,12 @@ protected function scanInput($regex, $type)
343362
* @return array|null
344363
*
345364
* @phpstan-return TToken|null
365+
*
366+
* @deprecated
346367
*/
347368
protected function scanInputForKeywords($keywords, $type)
348369
{
370+
// @codeCoverageIgnoreStart
349371
if (!preg_match('/^(\s*)(' . $keywords . '):\s*(.*)/u', $this->line, $matches)) {
350372
return null;
351373
}
@@ -375,6 +397,33 @@ protected function scanInputForKeywords($keywords, $type)
375397
}
376398

377399
return $token;
400+
// @codeCoverageIgnoreEnd
401+
}
402+
403+
/**
404+
* @param list<string> $keywords
405+
*
406+
* @phpstan-return TToken|null
407+
*/
408+
private function scanTitleLine(array $keywords, string $type): ?array
409+
{
410+
$trimmedLine = $this->getTrimmedLine();
411+
412+
foreach ($keywords as $keyword) {
413+
if (str_starts_with($trimmedLine, $keyword . ':')) {
414+
$title = trim(mb_substr($trimmedLine, mb_strlen($keyword) + 1));
415+
416+
$token = $this->takeToken($type, $title);
417+
$token['keyword'] = $keyword;
418+
$token['indent'] = mb_strlen($this->line, 'utf8') - mb_strlen(ltrim($this->line), 'utf8');
419+
420+
$this->consumeLine();
421+
422+
return $token;
423+
}
424+
}
425+
426+
return null;
378427
}
379428

380429
/**
@@ -394,33 +443,36 @@ protected function scanEOS()
394443
}
395444

396445
/**
397-
* Returns keywords for provided type.
446+
* Returns a regex matching the keywords for the provided type.
398447
*
399448
* @param string $type Keyword type
400449
*
401450
* @return string
451+
*
452+
* @deprecated
402453
*/
403454
protected function getKeywords($type)
404455
{
405-
if (!isset($this->keywordsCache[$type])) {
406-
$getter = 'get' . $type . 'Keywords';
407-
$keywords = $this->keywords->$getter();
408-
409-
if ($type === 'Step') {
410-
$padded = [];
411-
foreach (explode('|', $keywords) as $keyword) {
412-
$padded[] = str_contains($keyword, '<')
413-
? preg_quote(mb_substr($keyword, 0, -1, 'utf8'), '/') . '\s*'
414-
: preg_quote($keyword, '/') . '\s+';
415-
}
416-
417-
$keywords = implode('|', $padded);
418-
}
456+
// @codeCoverageIgnoreStart
457+
$keywords = match ($type) {
458+
'Feature' => $this->currentDialect->getFeatureKeywords(),
459+
'Background' => $this->currentDialect->getBackgroundKeywords(),
460+
'Scenario' => $this->currentDialect->getScenarioKeywords(),
461+
'Outline' => $this->currentDialect->getScenarioOutlineKeywords(),
462+
'Examples' => $this->currentDialect->getExamplesKeywords(),
463+
'Step' => $this->currentDialect->getStepKeywords(),
464+
'Given' => $this->currentDialect->getGivenKeywords(),
465+
'When' => $this->currentDialect->getWhenKeywords(),
466+
'Then' => $this->currentDialect->getThenKeywords(),
467+
'And' => $this->currentDialect->getAndKeywords(),
468+
'But' => $this->currentDialect->getButKeywords(),
469+
default => throw new \InvalidArgumentException(sprintf('Unknown keyword type "%s"', $type)),
470+
};
419471

420-
$this->keywordsCache[$type] = $keywords;
421-
}
472+
$keywordsRegex = implode('|', array_map(fn ($keyword) => preg_quote($keyword, '/'), $keywords));
422473

423-
return $this->keywordsCache[$type];
474+
return $keywordsRegex;
475+
// @codeCoverageIgnoreEnd
424476
}
425477

426478
/**
@@ -437,7 +489,17 @@ protected function scanFeature()
437489
return null;
438490
}
439491

440-
return $this->scanInputForKeywords($this->getKeywords('Feature'), 'Feature');
492+
$token = $this->scanTitleLine($this->currentDialect->getFeatureKeywords(), 'Feature');
493+
494+
if ($token === null) {
495+
return null;
496+
}
497+
498+
$this->allowFeature = false;
499+
$this->allowLanguageTag = false;
500+
$this->allowMultilineArguments = false;
501+
502+
return $token;
441503
}
442504

443505
/**
@@ -449,7 +511,15 @@ protected function scanFeature()
449511
*/
450512
protected function scanBackground()
451513
{
452-
return $this->scanInputForKeywords($this->getKeywords('Background'), 'Background');
514+
$token = $this->scanTitleLine($this->currentDialect->getBackgroundKeywords(), 'Background');
515+
516+
if ($token === null) {
517+
return null;
518+
}
519+
520+
$this->allowSteps = true;
521+
522+
return $token;
453523
}
454524

455525
/**
@@ -461,7 +531,16 @@ protected function scanBackground()
461531
*/
462532
protected function scanScenario()
463533
{
464-
return $this->scanInputForKeywords($this->getKeywords('Scenario'), 'Scenario');
534+
$token = $this->scanTitleLine($this->currentDialect->getScenarioKeywords(), 'Scenario');
535+
536+
if ($token === null) {
537+
return null;
538+
}
539+
540+
$this->allowMultilineArguments = false;
541+
$this->allowSteps = true;
542+
543+
return $token;
465544
}
466545

467546
/**
@@ -473,7 +552,16 @@ protected function scanScenario()
473552
*/
474553
protected function scanOutline()
475554
{
476-
return $this->scanInputForKeywords($this->getKeywords('Outline'), 'Outline');
555+
$token = $this->scanTitleLine($this->currentDialect->getScenarioOutlineKeywords(), 'Outline');
556+
557+
if ($token === null) {
558+
return null;
559+
}
560+
561+
$this->allowMultilineArguments = false;
562+
$this->allowSteps = true;
563+
564+
return $token;
477565
}
478566

479567
/**
@@ -485,7 +573,15 @@ protected function scanOutline()
485573
*/
486574
protected function scanExamples()
487575
{
488-
return $this->scanInputForKeywords($this->getKeywords('Examples'), 'Examples');
576+
$token = $this->scanTitleLine($this->currentDialect->getExamplesKeywords(), 'Examples');
577+
578+
if ($token === null) {
579+
return null;
580+
}
581+
582+
$this->allowMultilineArguments = true;
583+
584+
return $token;
489585
}
490586

491587
/**
@@ -501,15 +597,27 @@ protected function scanStep()
501597
return null;
502598
}
503599

504-
$keywords = $this->getKeywords('Step');
505-
if (!preg_match('/^\s*(' . $keywords . ')([^\s].*)/u', $this->line, $matches)) {
600+
$trimmedLine = $this->getTrimmedLine();
601+
$matchedKeyword = null;
602+
603+
foreach ($this->currentDialect->getStepKeywords() as $keyword) {
604+
if (str_starts_with($trimmedLine, $keyword)) {
605+
$matchedKeyword = $keyword;
606+
break;
607+
}
608+
}
609+
610+
if ($matchedKeyword === null) {
506611
return null;
507612
}
508613

509-
$keyword = trim($matches[1]);
510-
$token = $this->takeToken('Step', $keyword);
511-
$token['keyword_type'] = $this->getStepKeywordType($keyword);
512-
$token['text'] = $matches[2];
614+
$text = ltrim(mb_substr($trimmedLine, mb_strlen($matchedKeyword)));
615+
616+
// cucumber/gherkin reports the keyword text with its final space when it is part of it, but we keep trimming it for BC reasons
617+
// TODO remove the trimming when opting for the cucumber-compatible parsing in the future
618+
$token = $this->takeToken('Step', trim($matchedKeyword));
619+
$token['keyword_type'] = $this->getStepKeywordType($matchedKeyword);
620+
$token['text'] = $text;
513621

514622
$this->consumeLine();
515623
$this->allowMultilineArguments = true;
@@ -667,6 +775,7 @@ protected function scanLanguage()
667775

668776
if ($token) {
669777
\assert(\is_string($token['value']));
778+
\assert($token['value'] !== ''); // the regex can only match a non-empty value.
670779
$this->allowLanguageTag = false;
671780
$this->setLanguage($token['value']);
672781
}
@@ -736,32 +845,39 @@ protected function scanText()
736845
* Returns step type keyword (Given, When, Then, etc.).
737846
*
738847
* @param string $native Step keyword in provided language
739-
*
740-
* @return string
741848
*/
742-
private function getStepKeywordType($native)
849+
private function getStepKeywordType(string $native): string
743850
{
744-
// Consider "*" as a AND keyword so that it is normalized to the previous step type
745-
if ($native === '*') {
746-
return 'And';
851+
if ($this->stepKeywordTypesCache === null) {
852+
$this->stepKeywordTypesCache = [];
853+
$this->addStepKeywordTypes($this->currentDialect->getGivenKeywords(), 'Given');
854+
$this->addStepKeywordTypes($this->currentDialect->getWhenKeywords(), 'When');
855+
$this->addStepKeywordTypes($this->currentDialect->getThenKeywords(), 'Then');
856+
$this->addStepKeywordTypes($this->currentDialect->getAndKeywords(), 'And');
857+
$this->addStepKeywordTypes($this->currentDialect->getButKeywords(), 'But');
747858
}
748859

749-
if (empty($this->stepKeywordTypesCache)) {
750-
$this->stepKeywordTypesCache = [
751-
'Given' => explode('|', $this->keywords->getGivenKeywords()),
752-
'When' => explode('|', $this->keywords->getWhenKeywords()),
753-
'Then' => explode('|', $this->keywords->getThenKeywords()),
754-
'And' => explode('|', $this->keywords->getAndKeywords()),
755-
'But' => explode('|', $this->keywords->getButKeywords()),
756-
];
860+
if (!isset($this->stepKeywordTypesCache[$native])) { // should not happen when the native keyword belongs to the dialect
861+
return 'Given'; // cucumber/gherkin has an UNKNOWN type, but we don't have it.
757862
}
758863

759-
foreach ($this->stepKeywordTypesCache as $type => $keywords) {
760-
if (in_array($native, $keywords, true) || in_array($native . '<', $keywords, true)) {
761-
return $type;
762-
}
864+
if (\count($this->stepKeywordTypesCache[$native]) === 1) {
865+
return $this->stepKeywordTypesCache[$native][0];
763866
}
764867

765-
return 'Given';
868+
// Consider ambiguous keywords as AND keywords so that they are normalized to the previous step type.
869+
// This happens in English for the `* ` keyword for instance.
870+
// cucumber/gherkin returns that as an UNKNOWN type, but we don't have it.
871+
return 'And';
872+
}
873+
874+
/**
875+
* @param list<string> $keywords
876+
*/
877+
private function addStepKeywordTypes(array $keywords, string $type): void
878+
{
879+
foreach ($keywords as $keyword) {
880+
$this->stepKeywordTypesCache[$keyword][] = $type;
881+
}
766882
}
767883
}

src/Parser.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public function parse($input, $file = null)
6969
$this->tags = [];
7070

7171
try {
72-
$this->lexer->analyse($this->input, 'en');
72+
$this->lexer->analyse($this->input);
7373
} catch (LexerException $e) {
7474
throw new ParserException(
7575
sprintf('Lexer exception "%s" thrown for file %s', $e->getMessage(), $file),

0 commit comments

Comments
 (0)