Skip to content

Commit ce8169d

Browse files
committed
Refactor the Lexer to use the new Dialect API
1 parent 2b3bfca commit ce8169d

File tree

5 files changed

+189
-81
lines changed

5 files changed

+189
-81
lines changed

src/Lexer.php

Lines changed: 181 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010

1111
namespace Behat\Gherkin;
1212

13+
use Behat\Gherkin\Dialect\DialectProviderInterface;
14+
use Behat\Gherkin\Dialect\GherkinDialect;
15+
use Behat\Gherkin\Dialect\KeywordsDialectProvider;
1316
use Behat\Gherkin\Exception\LexerException;
17+
use Behat\Gherkin\Exception\NoSuchLanguageException;
1418
use Behat\Gherkin\Keywords\KeywordsInterface;
1519
use LogicException;
1620

@@ -31,7 +35,8 @@ class Lexer
3135
* @see https://github.com/cucumber/gherkin/blob/679a87e21263699c15ea635159c6cda60f64af3b/php/src/StringGherkinLine.php#L14
3236
*/
3337
private const CELL_PATTERN = '/(?<!\\\\)(?:\\\\{2})*\K\\|/u';
34-
private string $language;
38+
private readonly DialectProviderInterface $dialectProvider;
39+
private GherkinDialect $currentDialect;
3540
/**
3641
* @var list<string>
3742
*/
@@ -42,13 +47,11 @@ class Lexer
4247
private int $lineNumber;
4348
private bool $eos;
4449
/**
45-
* @var array<string, string>
46-
*/
47-
private array $keywordsCache = [];
48-
/**
49-
* @var array<string, list<string>>
50+
* A cache of keyword types associated with each keyword.
51+
*
52+
* @var array<string, non-empty-list<string>>|null
5053
*/
51-
private array $stepKeywordTypesCache = [];
54+
private ?array $stepKeywordTypesCache = null;
5255
/**
5356
* @phpstan-var list<TToken>
5457
*/
@@ -67,8 +70,14 @@ class Lexer
6770
private ?string $pyStringDelimiter = null;
6871

6972
/**
 * Accepts either a dialect provider or a legacy keywords object.
 *
 * A KeywordsInterface instance is wrapped in a KeywordsDialectProvider, so the
 * lexer always stores a DialectProviderInterface.
 */
public function __construct(
    DialectProviderInterface|KeywordsInterface $dialectProvider,
) {
    // TODO trigger deprecation (when a KeywordsInterface is passed)
    $this->dialectProvider = $dialectProvider instanceof KeywordsInterface
        ? new KeywordsDialectProvider($dialectProvider)
        : $dialectProvider;
}
7382

7483
/**
@@ -108,9 +117,21 @@ public function analyse(string $input, string $language = 'en')
108117
$this->allowMultilineArguments = false;
109118
$this->allowSteps = false;
110119

111-
$this->setLanguage($language);
120+
if (\func_num_args() > 1) {
121+
// @codeCoverageIgnoreStart
122+
\assert($language !== '');
123+
// TODO trigger deprecation (the Parser does not use this code path)
124+
$this->setLanguage($language);
125+
// @codeCoverageIgnoreEnd
126+
} else {
127+
$this->currentDialect = $this->dialectProvider->getDefaultDialect();
128+
$this->stepKeywordTypesCache = null;
129+
}
112130
}
113131

132+
/**
133+
* @param non-empty-string $language
134+
*/
114135
private function setLanguage(string $language): void
115136
{
116137
if (($this->stashedToken !== null) || ($this->deferredObjects !== [])) {
@@ -129,9 +150,12 @@ private function setLanguage(string $language): void
129150
// @codeCoverageIgnoreEnd
130151
}
131152

132-
$this->keywords->setLanguage($this->language = $language);
133-
$this->keywordsCache = [];
134-
$this->stepKeywordTypesCache = [];
153+
try {
154+
$this->currentDialect = $this->dialectProvider->getDialect($language);
155+
} catch (NoSuchLanguageException) {
156+
// TODO rethrow the exception when introducing the compatibility mode for invalid languages.
157+
}
158+
$this->stepKeywordTypesCache = null;
135159
}
136160

137161
/**
@@ -141,7 +165,7 @@ private function setLanguage(string $language): void
141165
*/
142166
public function getLanguage()
{
    // The language is no longer stored on the lexer; it is read from the active dialect.
    $dialect = $this->currentDialect;

    return $dialect->getLanguage();
}
146170

147171
/**
@@ -343,9 +367,12 @@ protected function scanInput(string $regex, string $type)
343367
* @return array|null
344368
*
345369
* @phpstan-return TToken|null
370+
*
371+
* @deprecated
346372
*/
347373
protected function scanInputForKeywords(string $keywords, string $type)
348374
{
375+
// @codeCoverageIgnoreStart
349376
if (!preg_match('/^(\s*)(' . $keywords . '):\s*(.*)/u', $this->line, $matches)) {
350377
return null;
351378
}
@@ -375,6 +402,33 @@ protected function scanInputForKeywords(string $keywords, string $type)
375402
}
376403

377404
return $token;
405+
// @codeCoverageIgnoreEnd
406+
}
407+
408+
/**
 * Scans the current line for a "<keyword>:" title line.
 *
 * Keywords are tried in the order given; the first one that, immediately followed by a
 * colon, prefixes the trimmed line wins. On a match the line is consumed.
 *
 * @param list<string> $keywords candidate keywords, without the trailing colon
 * @param string $type token type handed to takeToken()
 *
 * @phpstan-return TToken|null the token with its "keyword" and "indent" entries set, or null when no keyword matches
 */
private function scanTitleLine(array $keywords, string $type): ?array
{
    $trimmedLine = $this->getTrimmedLine();

    foreach ($keywords as $keyword) {
        if (!str_starts_with($trimmedLine, $keyword . ':')) {
            continue;
        }

        // Drop the keyword and its colon; the rest is the title. The encoding is passed explicitly
        // (as in the indent computation below) so the slice does not depend on mb_internal_encoding().
        $title = trim(mb_substr($trimmedLine, mb_strlen($keyword, 'utf8') + 1, null, 'utf8'));

        $token = $this->takeToken($type, $title);
        $token['keyword'] = $keyword;
        // Indent is the number of leading characters that ltrim() strips from the raw line.
        $token['indent'] = mb_strlen($this->line, 'utf8') - mb_strlen(ltrim($this->line), 'utf8');

        $this->consumeLine();

        return $token;
    }

    return null;
}
379433

380434
/**
@@ -394,33 +448,36 @@ protected function scanEOS()
394448
}
395449

396450
/**
 * Builds a regex alternation of the current dialect's keywords for a keyword type.
 *
 * Each keyword is escaped with preg_quote() (with "/" as the delimiter) and the
 * escaped keywords are joined with "|".
 *
 * @param string $type Keyword type
 *
 * @return string
 *
 * @throws \InvalidArgumentException when $type is not one of the handled keyword types
 *
 * @deprecated
 */
protected function getKeywords(string $type)
{
    // @codeCoverageIgnoreStart
    $keywords = match ($type) {
        'Feature' => $this->currentDialect->getFeatureKeywords(),
        'Background' => $this->currentDialect->getBackgroundKeywords(),
        'Scenario' => $this->currentDialect->getScenarioKeywords(),
        'Outline' => $this->currentDialect->getScenarioOutlineKeywords(),
        'Examples' => $this->currentDialect->getExamplesKeywords(),
        'Step' => $this->currentDialect->getStepKeywords(),
        'Given' => $this->currentDialect->getGivenKeywords(),
        'When' => $this->currentDialect->getWhenKeywords(),
        'Then' => $this->currentDialect->getThenKeywords(),
        'And' => $this->currentDialect->getAndKeywords(),
        'But' => $this->currentDialect->getButKeywords(),
        default => throw new \InvalidArgumentException(sprintf('Unknown keyword type "%s"', $type)),
    };

    // Escape every keyword so it can be embedded in a "/"-delimited pattern.
    $escaped = [];
    foreach ($keywords as $keyword) {
        $escaped[] = preg_quote($keyword, '/');
    }

    return implode('|', $escaped);
    // @codeCoverageIgnoreEnd
}
425482

426483
/**
@@ -437,7 +494,17 @@ protected function scanFeature()
437494
return null;
438495
}
439496

440-
return $this->scanInputForKeywords($this->getKeywords('Feature'), 'Feature');
497+
$token = $this->scanTitleLine($this->currentDialect->getFeatureKeywords(), 'Feature');
498+
499+
if ($token === null) {
500+
return null;
501+
}
502+
503+
$this->allowFeature = false;
504+
$this->allowLanguageTag = false;
505+
$this->allowMultilineArguments = false;
506+
507+
return $token;
441508
}
442509

443510
/**
@@ -449,7 +516,15 @@ protected function scanFeature()
449516
*/
450517
protected function scanBackground()
{
    $token = $this->scanTitleLine($this->currentDialect->getBackgroundKeywords(), 'Background');

    if ($token !== null) {
        // Once a Background title line has been matched, step lines are accepted.
        $this->allowSteps = true;
    }

    return $token;
}
454529

455530
/**
@@ -461,7 +536,16 @@ protected function scanBackground()
461536
*/
462537
protected function scanScenario()
{
    $token = $this->scanTitleLine($this->currentDialect->getScenarioKeywords(), 'Scenario');

    if ($token !== null) {
        // A matched Scenario title line enables steps and disables multiline arguments.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
466550

467551
/**
@@ -473,7 +557,16 @@ protected function scanScenario()
473557
*/
474558
protected function scanOutline()
{
    $token = $this->scanTitleLine($this->currentDialect->getScenarioOutlineKeywords(), 'Outline');

    if ($token !== null) {
        // A matched Outline title line enables steps and disables multiline arguments.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
478571

479572
/**
@@ -485,7 +578,15 @@ protected function scanOutline()
485578
*/
486579
protected function scanExamples()
{
    $token = $this->scanTitleLine($this->currentDialect->getExamplesKeywords(), 'Examples');

    if ($token !== null) {
        // A matched Examples title line enables multiline arguments.
        $this->allowMultilineArguments = true;
    }

    return $token;
}
490591

491592
/**
@@ -501,15 +602,27 @@ protected function scanStep()
501602
return null;
502603
}
503604

504-
$keywords = $this->getKeywords('Step');
505-
if (!preg_match('/^\s*(' . $keywords . ')([^\s].*)/u', $this->line, $matches)) {
605+
$trimmedLine = $this->getTrimmedLine();
606+
$matchedKeyword = null;
607+
608+
foreach ($this->currentDialect->getStepKeywords() as $keyword) {
609+
if (str_starts_with($trimmedLine, $keyword)) {
610+
$matchedKeyword = $keyword;
611+
break;
612+
}
613+
}
614+
615+
if ($matchedKeyword === null) {
506616
return null;
507617
}
508618

509-
$keyword = trim($matches[1]);
510-
$token = $this->takeToken('Step', $keyword);
511-
$token['keyword_type'] = $this->getStepKeywordType($keyword);
512-
$token['text'] = $matches[2];
619+
$text = ltrim(mb_substr($trimmedLine, mb_strlen($matchedKeyword)));
620+
621+
// cucumber/gherkin reports the keyword text with its final space when it is part of it, but we keep trimming it for BC reasons
622+
// TODO remove the trimming when opting for the cucumber-compatible parsing in the future
623+
$token = $this->takeToken('Step', trim($matchedKeyword));
624+
$token['keyword_type'] = $this->getStepKeywordType($matchedKeyword);
625+
$token['text'] = $text;
513626

514627
$this->consumeLine();
515628
$this->allowMultilineArguments = true;
@@ -665,6 +778,7 @@ protected function scanLanguage()
665778

666779
if ($token) {
667780
\assert(\is_string($token['value']));
781+
\assert($token['value'] !== ''); // the regex can only match a non-empty value.
668782
$this->allowLanguageTag = false;
669783
$this->setLanguage($token['value']);
670784
}
@@ -737,27 +851,36 @@ protected function scanText()
737851
*/
738852
private function getStepKeywordType(string $native): string
{
    // Build the keyword => types map on first use; it is reset to null whenever the dialect changes.
    if ($this->stepKeywordTypesCache === null) {
        $this->stepKeywordTypesCache = [];
        $dialect = $this->currentDialect;

        $this->addStepKeywordTypes($dialect->getGivenKeywords(), 'Given');
        $this->addStepKeywordTypes($dialect->getWhenKeywords(), 'When');
        $this->addStepKeywordTypes($dialect->getThenKeywords(), 'Then');
        $this->addStepKeywordTypes($dialect->getAndKeywords(), 'And');
        $this->addStepKeywordTypes($dialect->getButKeywords(), 'But');
    }

    $types = $this->stepKeywordTypesCache[$native] ?? null;

    if ($types === null) {
        // Should not happen when the native keyword belongs to the dialect.
        // cucumber/gherkin has an UNKNOWN type, but we don't have it.
        return 'Given';
    }

    // A keyword listed under several types is ambiguous: treat it as an AND keyword so that it is
    // normalized to the previous step type. This happens in English for the `* ` keyword for instance.
    // cucumber/gherkin returns that as an UNKNOWN type, but we don't have it.
    return \count($types) === 1 ? $types[0] : 'And';
}
876+
877+
/**
 * Appends $type to the list of step types recorded in the cache for each given keyword.
 *
 * @param list<string> $keywords
 */
private function addStepKeywordTypes(array $keywords, string $type): void
{
    foreach ($keywords as $native) {
        // Start an empty list the first time a keyword is seen, then append the type.
        $this->stepKeywordTypesCache[$native] ??= [];
        $this->stepKeywordTypesCache[$native][] = $type;
    }
}
763886
}

src/Parser.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public function parse(string $input, ?string $file = null)
6969
$this->tags = [];
7070

7171
try {
72-
$this->lexer->analyse($this->input, 'en');
72+
$this->lexer->analyse($this->input);
7373
} catch (LexerException $e) {
7474
throw new ParserException(
7575
sprintf('Lexer exception "%s" thrown for file %s', $e->getMessage(), $file),

0 commit comments

Comments
 (0)