1010
1111namespace Behat \Gherkin ;
1212
13+ use Behat \Gherkin \Dialect \DialectProviderInterface ;
14+ use Behat \Gherkin \Dialect \GherkinDialect ;
15+ use Behat \Gherkin \Dialect \KeywordsDialectProvider ;
1316use Behat \Gherkin \Exception \LexerException ;
1417use Behat \Gherkin \Keywords \KeywordsInterface ;
1518use LogicException ;
@@ -31,7 +34,8 @@ class Lexer
3134 * @see https://github.com/cucumber/gherkin/blob/679a87e21263699c15ea635159c6cda60f64af3b/php/src/StringGherkinLine.php#L14
3235 */
3336 private const CELL_PATTERN = '/(?<! \\\\)(?: \\\\{2})*\K \\|/u ' ;
34- private string $ language ;
37+ private readonly DialectProviderInterface $ dialectProvider ;
38+ private GherkinDialect $ currentDialect ;
3539 /**
3640 * @var list<string>
3741 */
@@ -42,13 +46,11 @@ class Lexer
4246 private int $ lineNumber ;
4347 private bool $ eos ;
4448 /**
45- * @var array<string, string>
46- */
47- private array $ keywordsCache = [];
48- /**
49- * @var array<string, list<string>>
49+ * A cache of keyword types associated with each keyword.
50+ *
51+ * @var array<string, non-empty-list<string>>|null
5052 */
51- private array $ stepKeywordTypesCache = [] ;
53+ private ? array $ stepKeywordTypesCache = null ;
5254 /**
5355 * @phpstan-var list<TToken>
5456 */
@@ -67,8 +69,14 @@ class Lexer
6769 private ?string $ pyStringDelimiter = null ;
6870
/**
 * Accepts either a dialect provider or a legacy keywords object.
 *
 * A KeywordsInterface instance is wrapped in a KeywordsDialectProvider so that
 * the lexer only ever stores a DialectProviderInterface.
 */
public function __construct(
    DialectProviderInterface|KeywordsInterface $dialectProvider,
) {
    // TODO trigger deprecation when a KeywordsInterface is passed
    $this->dialectProvider = $dialectProvider instanceof KeywordsInterface
        ? new KeywordsDialectProvider($dialectProvider)
        : $dialectProvider;
}
7381
7482 /**
@@ -108,9 +116,21 @@ public function analyse($input, $language = 'en')
108116 $ this ->allowMultilineArguments = false ;
109117 $ this ->allowSteps = false ;
110118
111- $ this ->setLanguage ($ language );
119+ if (\func_num_args () > 1 ) {
120+ // @codeCoverageIgnoreStart
121+ \assert ($ language !== '' );
122+ // TODO trigger deprecation (the Parser does not use this code path)
123+ $ this ->setLanguage ($ language );
124+ // @codeCoverageIgnoreEnd
125+ } else {
126+ $ this ->currentDialect = $ this ->dialectProvider ->getDefaultDialect ();
127+ $ this ->stepKeywordTypesCache = null ;
128+ }
112129 }
113130
131+ /**
132+ * @param non-empty-string $language
133+ */
114134 private function setLanguage (string $ language ): void
115135 {
116136 if (($ this ->stashedToken !== null ) || ($ this ->deferredObjects !== [])) {
@@ -129,9 +149,8 @@ private function setLanguage(string $language): void
129149 // @codeCoverageIgnoreEnd
130150 }
131151
132- $ this ->keywords ->setLanguage ($ this ->language = $ language );
133- $ this ->keywordsCache = [];
134- $ this ->stepKeywordTypesCache = [];
152+ $ this ->currentDialect = $ this ->dialectProvider ->getDialect ($ language );
153+ $ this ->stepKeywordTypesCache = null ;
135154 }
136155
137156 /**
@@ -141,7 +160,7 @@ private function setLanguage(string $language): void
141160 */
public function getLanguage()
{
    // The language is no longer stored on the lexer; it is read from the active dialect.
    $dialect = $this->currentDialect;

    return $dialect->getLanguage();
}
146165
147166 /**
@@ -343,9 +362,12 @@ protected function scanInput($regex, $type)
343362 * @return array|null
344363 *
345364 * @phpstan-return TToken|null
365+ *
366+ * @deprecated
346367 */
347368 protected function scanInputForKeywords ($ keywords , $ type )
348369 {
370+ // @codeCoverageIgnoreStart
349371 if (!preg_match ('/^(\s*)( ' . $ keywords . '):\s*(.*)/u ' , $ this ->line , $ matches )) {
350372 return null ;
351373 }
@@ -375,6 +397,33 @@ protected function scanInputForKeywords($keywords, $type)
375397 }
376398
377399 return $ token ;
400+ // @codeCoverageIgnoreEnd
401+ }
402+
/**
 * Scans the current line for a "<keyword>:<title>" header.
 *
 * Keywords are tried in the given order; the first one that prefixes the trimmed
 * line and is immediately followed by a colon wins. On a match, a token of the
 * given type is taken with the trimmed title, annotated with the matched keyword
 * and the indentation of the raw line, and the line is consumed.
 *
 * @param list<string> $keywords
 *
 * @phpstan-return TToken|null
 */
private function scanTitleLine(array $keywords, string $type): ?array
{
    $trimmedLine = $this->getTrimmedLine();

    foreach ($keywords as $keyword) {
        $prefix = $keyword . ':';

        if (!str_starts_with($trimmedLine, $prefix)) {
            continue;
        }

        // str_starts_with() matched a byte prefix, so cutting at its byte length is exact
        // and does not depend on the configured mb_internal_encoding().
        $title = trim(substr($trimmedLine, \strlen($prefix)));

        $token = $this->takeToken($type, $title);
        $token['keyword'] = $keyword;
        // Indentation is the number of leading whitespace characters of the raw line.
        $token['indent'] = mb_strlen($this->line, 'utf8') - mb_strlen(ltrim($this->line), 'utf8');

        $this->consumeLine();

        return $token;
    }

    return null;
}
379428
380429 /**
@@ -394,33 +443,36 @@ protected function scanEOS()
394443 }
395444
/**
 * Builds a regex alternation out of the current dialect's keywords for a type.
 *
 * Each keyword is escaped for use inside a "/"-delimited pattern and the
 * results are joined with "|".
 *
 * @param string $type Keyword type
 *
 * @return string
 *
 * @deprecated
 */
protected function getKeywords($type)
{
    // @codeCoverageIgnoreStart
    $keywords = match ($type) {
        'Feature' => $this->currentDialect->getFeatureKeywords(),
        'Background' => $this->currentDialect->getBackgroundKeywords(),
        'Scenario' => $this->currentDialect->getScenarioKeywords(),
        'Outline' => $this->currentDialect->getScenarioOutlineKeywords(),
        'Examples' => $this->currentDialect->getExamplesKeywords(),
        'Step' => $this->currentDialect->getStepKeywords(),
        'Given' => $this->currentDialect->getGivenKeywords(),
        'When' => $this->currentDialect->getWhenKeywords(),
        'Then' => $this->currentDialect->getThenKeywords(),
        'And' => $this->currentDialect->getAndKeywords(),
        'But' => $this->currentDialect->getButKeywords(),
        default => throw new \InvalidArgumentException(sprintf('Unknown keyword type "%s"', $type)),
    };

    $quoted = [];
    foreach ($keywords as $keyword) {
        $quoted[] = preg_quote($keyword, '/');
    }

    return implode('|', $quoted);
    // @codeCoverageIgnoreEnd
}
425477
426478 /**
@@ -437,7 +489,17 @@ protected function scanFeature()
437489 return null ;
438490 }
439491
440- return $ this ->scanInputForKeywords ($ this ->getKeywords ('Feature ' ), 'Feature ' );
492+ $ token = $ this ->scanTitleLine ($ this ->currentDialect ->getFeatureKeywords (), 'Feature ' );
493+
494+ if ($ token === null ) {
495+ return null ;
496+ }
497+
498+ $ this ->allowFeature = false ;
499+ $ this ->allowLanguageTag = false ;
500+ $ this ->allowMultilineArguments = false ;
501+
502+ return $ token ;
441503 }
442504
443505 /**
@@ -449,7 +511,15 @@ protected function scanFeature()
449511 */
protected function scanBackground()
{
    $keywords = $this->currentDialect->getBackgroundKeywords();
    $token = $this->scanTitleLine($keywords, 'Background');

    if ($token !== null) {
        // Steps may follow a background header.
        $this->allowSteps = true;
    }

    return $token;
}
454524
455525 /**
@@ -461,7 +531,16 @@ protected function scanBackground()
461531 */
protected function scanScenario()
{
    $keywords = $this->currentDialect->getScenarioKeywords();
    $token = $this->scanTitleLine($keywords, 'Scenario');

    if ($token !== null) {
        // A scenario header opens a step section; multiline arguments are disallowed until a step is seen.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
466545
467546 /**
@@ -473,7 +552,16 @@ protected function scanScenario()
473552 */
protected function scanOutline()
{
    $keywords = $this->currentDialect->getScenarioOutlineKeywords();
    $token = $this->scanTitleLine($keywords, 'Outline');

    if ($token !== null) {
        // Same state transition as a plain scenario header.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
478566
479567 /**
@@ -485,7 +573,15 @@ protected function scanOutline()
485573 */
protected function scanExamples()
{
    $keywords = $this->currentDialect->getExamplesKeywords();
    $token = $this->scanTitleLine($keywords, 'Examples');

    if ($token !== null) {
        // Multiline arguments are allowed again after an examples header.
        $this->allowMultilineArguments = true;
    }

    return $token;
}
490586
491587 /**
@@ -501,15 +597,27 @@ protected function scanStep()
501597 return null ;
502598 }
503599
504- $ keywords = $ this ->getKeywords ('Step ' );
505- if (!preg_match ('/^\s*( ' . $ keywords . ')([^\s].*)/u ' , $ this ->line , $ matches )) {
600+ $ trimmedLine = $ this ->getTrimmedLine ();
601+ $ matchedKeyword = null ;
602+
603+ foreach ($ this ->currentDialect ->getStepKeywords () as $ keyword ) {
604+ if (str_starts_with ($ trimmedLine , $ keyword )) {
605+ $ matchedKeyword = $ keyword ;
606+ break ;
607+ }
608+ }
609+
610+ if ($ matchedKeyword === null ) {
506611 return null ;
507612 }
508613
509- $ keyword = trim ($ matches [1 ]);
510- $ token = $ this ->takeToken ('Step ' , $ keyword );
511- $ token ['keyword_type ' ] = $ this ->getStepKeywordType ($ keyword );
512- $ token ['text ' ] = $ matches [2 ];
614+ $ text = ltrim (mb_substr ($ trimmedLine , mb_strlen ($ matchedKeyword )));
615+
616+ // cucumber/gherkin reports the keyword text with its final space when it is part of it, but we keep trimming it for BC reasons
617+ // TODO remove the trimming when opting for the cucumber-compatible parsing in the future
618+ $ token = $ this ->takeToken ('Step ' , trim ($ matchedKeyword ));
619+ $ token ['keyword_type ' ] = $ this ->getStepKeywordType ($ matchedKeyword );
620+ $ token ['text ' ] = $ text ;
513621
514622 $ this ->consumeLine ();
515623 $ this ->allowMultilineArguments = true ;
@@ -667,6 +775,7 @@ protected function scanLanguage()
667775
668776 if ($ token ) {
669777 \assert (\is_string ($ token ['value ' ]));
778+ \assert ($ token ['value ' ] !== '' ); // the regex can only match a non-empty value.
670779 $ this ->allowLanguageTag = false ;
671780 $ this ->setLanguage ($ token ['value ' ]);
672781 }
@@ -736,32 +845,39 @@ protected function scanText()
736845 * Returns step type keyword (Given, When, Then, etc.).
737846 *
738847 * @param string $native Step keyword in provided language
739- *
740- * @return string
741848 */
private function getStepKeywordType(string $native): string
{
    // Lazily build the keyword => types map for the current dialect.
    if ($this->stepKeywordTypesCache === null) {
        $this->stepKeywordTypesCache = [];
        $dialect = $this->currentDialect;

        $this->addStepKeywordTypes($dialect->getGivenKeywords(), 'Given');
        $this->addStepKeywordTypes($dialect->getWhenKeywords(), 'When');
        $this->addStepKeywordTypes($dialect->getThenKeywords(), 'Then');
        $this->addStepKeywordTypes($dialect->getAndKeywords(), 'And');
        $this->addStepKeywordTypes($dialect->getButKeywords(), 'But');
    }

    $types = $this->stepKeywordTypesCache[$native] ?? null;

    if ($types === null) {
        // Should not happen when the native keyword belongs to the dialect.
        // cucumber/gherkin has an UNKNOWN type, but we don't have it.
        return 'Given';
    }

    // A keyword registered under several types is ambiguous: report it as AND so that it is
    // normalized to the previous step type. This happens in English for the `* ` keyword for instance.
    // cucumber/gherkin returns that as an UNKNOWN type, but we don't have it.
    return \count($types) === 1 ? $types[0] : 'And';
}
873+
/**
 * Registers the given type for each of the keywords in the step keyword types cache.
 *
 * A type is recorded at most once per keyword, so that a keyword repeated inside a
 * single list is not mistaken for a keyword shared between several types.
 *
 * @param list<string> $keywords
 */
private function addStepKeywordTypes(array $keywords, string $type): void
{
    foreach ($keywords as $keyword) {
        $knownTypes = $this->stepKeywordTypesCache[$keyword] ?? [];

        if (\in_array($type, $knownTypes, true)) {
            continue;
        }

        $this->stepKeywordTypesCache[$keyword][] = $type;
    }
}
767883}
0 commit comments