1010
1111namespace Behat \Gherkin ;
1212
13+ use Behat \Gherkin \Dialect \DialectProviderInterface ;
14+ use Behat \Gherkin \Dialect \GherkinDialect ;
15+ use Behat \Gherkin \Dialect \KeywordsDialectProvider ;
1316use Behat \Gherkin \Exception \LexerException ;
17+ use Behat \Gherkin \Exception \NoSuchLanguageException ;
1418use Behat \Gherkin \Keywords \KeywordsInterface ;
1519use LogicException ;
1620
@@ -31,7 +35,8 @@ class Lexer
3135 * @see https://github.com/cucumber/gherkin/blob/679a87e21263699c15ea635159c6cda60f64af3b/php/src/StringGherkinLine.php#L14
3236 */
3337 private const CELL_PATTERN = '/(?<! \\\\)(?: \\\\{2})*\K \\|/u ' ;
34- private string $ language ;
38+ private readonly DialectProviderInterface $ dialectProvider ;
39+ private GherkinDialect $ currentDialect ;
3540 /**
3641 * @var list<string>
3742 */
@@ -42,13 +47,11 @@ class Lexer
4247 private int $ lineNumber ;
4348 private bool $ eos ;
4449 /**
45- * @var array<string, string>
46- */
47- private array $ keywordsCache = [];
48- /**
49- * @var array<string, list<string>>
50+ * A cache of keyword types associated with each keyword.
51+ *
52+ * @var array<string, non-empty-list<string>>|null
5053 */
51- private array $ stepKeywordTypesCache = [] ;
54+ private ? array $ stepKeywordTypesCache = null ;
5255 /**
5356 * @phpstan-var list<TToken>
5457 */
@@ -67,8 +70,14 @@ class Lexer
6770 private ?string $ pyStringDelimiter = null ;
6871
/**
 * @param DialectProviderInterface|KeywordsInterface $dialectProvider Source of dialects; a legacy
 *                                                                    KeywordsInterface instance is wrapped
 *                                                                    in a KeywordsDialectProvider
 */
public function __construct(
    DialectProviderInterface|KeywordsInterface $dialectProvider,
) {
    // TODO trigger deprecation (for the KeywordsInterface branch)
    $this->dialectProvider = $dialectProvider instanceof KeywordsInterface
        ? new KeywordsDialectProvider($dialectProvider)
        : $dialectProvider;
}
7382
7483 /**
@@ -108,9 +117,21 @@ public function analyse(string $input, string $language = 'en')
108117 $ this ->allowMultilineArguments = false ;
109118 $ this ->allowSteps = false ;
110119
111- $ this ->setLanguage ($ language );
120+ if (\func_num_args () > 1 ) {
121+ // @codeCoverageIgnoreStart
122+ \assert ($ language !== '' );
123+ // TODO trigger deprecation (the Parser does not use this code path)
124+ $ this ->setLanguage ($ language );
125+ // @codeCoverageIgnoreEnd
126+ } else {
127+ $ this ->currentDialect = $ this ->dialectProvider ->getDefaultDialect ();
128+ $ this ->stepKeywordTypesCache = null ;
129+ }
112130 }
113131
132+ /**
133+ * @param non-empty-string $language
134+ */
114135 private function setLanguage (string $ language ): void
115136 {
116137 if (($ this ->stashedToken !== null ) || ($ this ->deferredObjects !== [])) {
@@ -129,9 +150,12 @@ private function setLanguage(string $language): void
129150 // @codeCoverageIgnoreEnd
130151 }
131152
132- $ this ->keywords ->setLanguage ($ this ->language = $ language );
133- $ this ->keywordsCache = [];
134- $ this ->stepKeywordTypesCache = [];
153+ try {
154+ $ this ->currentDialect = $ this ->dialectProvider ->getDialect ($ language );
155+ } catch (NoSuchLanguageException ) {
156+ // TODO rethrow the exception when introducing the compatibility mode for invalid languages.
157+ }
158+ $ this ->stepKeywordTypesCache = null ;
135159 }
136160
137161 /**
@@ -141,7 +165,7 @@ private function setLanguage(string $language): void
141165 */
public function getLanguage()
{
    // The language is read from the dialect currently in use rather than stored on the lexer.
    $dialect = $this->currentDialect;

    return $dialect->getLanguage();
}
146170
147171 /**
@@ -343,9 +367,12 @@ protected function scanInput(string $regex, string $type)
343367 * @return array|null
344368 *
345369 * @phpstan-return TToken|null
370+ *
371+ * @deprecated
346372 */
347373 protected function scanInputForKeywords (string $ keywords , string $ type )
348374 {
375+ // @codeCoverageIgnoreStart
349376 if (!preg_match ('/^(\s*)( ' . $ keywords . '):\s*(.*)/u ' , $ this ->line , $ matches )) {
350377 return null ;
351378 }
@@ -375,6 +402,33 @@ protected function scanInputForKeywords(string $keywords, string $type)
375402 }
376403
377404 return $ token ;
405+ // @codeCoverageIgnoreEnd
406+ }
407+
/**
 * Scans the current line for a title line of the form "<keyword>: <title>".
 *
 * Keywords are tried in the given order; the first one that prefixes the trimmed line
 * (immediately followed by a colon) produces the token, after which consumeLine() is called.
 *
 * @param list<string> $keywords Candidate keywords
 * @param string       $type     Token type passed to takeToken()
 *
 * @phpstan-return TToken|null Token with "keyword" and "indent" entries, or null when no keyword matches
 */
private function scanTitleLine(array $keywords, string $type): ?array
{
    $trimmedLine = $this->getTrimmedLine();

    foreach ($keywords as $keyword) {
        if (!str_starts_with($trimmedLine, $keyword . ':')) {
            continue;
        }

        // Pin the encoding explicitly (as the indent computation below does) so that skipping
        // "<keyword>:" does not depend on the process-wide mb_internal_encoding() setting.
        $title = trim(mb_substr($trimmedLine, mb_strlen($keyword, 'utf8') + 1, null, 'utf8'));

        $token = $this->takeToken($type, $title);
        $token['keyword'] = $keyword;
        // Indent = number of leading characters that ltrim() strips from the raw line.
        $token['indent'] = mb_strlen($this->line, 'utf8') - mb_strlen(ltrim($this->line), 'utf8');

        $this->consumeLine();

        return $token;
    }

    return null;
}
379433
380434 /**
@@ -394,33 +448,36 @@ protected function scanEOS()
394448 }
395449
/**
 * Builds a regex alternation of the current dialect's keywords for the given type.
 *
 * Every keyword is escaped with preg_quote() for use between "/" delimiters and the
 * escaped keywords are joined with "|".
 *
 * @param string $type Keyword type
 *
 * @return string
 *
 * @throws \InvalidArgumentException when $type is not one of the types listed in the match below
 *
 * @deprecated
 */
protected function getKeywords(string $type)
{
    // @codeCoverageIgnoreStart
    $keywords = match ($type) {
        'Feature' => $this->currentDialect->getFeatureKeywords(),
        'Background' => $this->currentDialect->getBackgroundKeywords(),
        'Scenario' => $this->currentDialect->getScenarioKeywords(),
        'Outline' => $this->currentDialect->getScenarioOutlineKeywords(),
        'Examples' => $this->currentDialect->getExamplesKeywords(),
        'Step' => $this->currentDialect->getStepKeywords(),
        'Given' => $this->currentDialect->getGivenKeywords(),
        'When' => $this->currentDialect->getWhenKeywords(),
        'Then' => $this->currentDialect->getThenKeywords(),
        'And' => $this->currentDialect->getAndKeywords(),
        'But' => $this->currentDialect->getButKeywords(),
        default => throw new \InvalidArgumentException(sprintf('Unknown keyword type "%s"', $type)),
    };

    $quoted = [];
    foreach ($keywords as $keyword) {
        $quoted[] = preg_quote($keyword, '/');
    }

    return implode('|', $quoted);
    // @codeCoverageIgnoreEnd
}
425482
426483 /**
@@ -437,7 +494,17 @@ protected function scanFeature()
437494 return null ;
438495 }
439496
440- return $ this ->scanInputForKeywords ($ this ->getKeywords ('Feature ' ), 'Feature ' );
497+ $ token = $ this ->scanTitleLine ($ this ->currentDialect ->getFeatureKeywords (), 'Feature ' );
498+
499+ if ($ token === null ) {
500+ return null ;
501+ }
502+
503+ $ this ->allowFeature = false ;
504+ $ this ->allowLanguageTag = false ;
505+ $ this ->allowMultilineArguments = false ;
506+
507+ return $ token ;
441508 }
442509
443510 /**
@@ -449,7 +516,15 @@ protected function scanFeature()
449516 */
protected function scanBackground()
{
    $token = $this->scanTitleLine($this->currentDialect->getBackgroundKeywords(), 'Background');

    if ($token !== null) {
        // Steps may follow once a Background title line has been matched.
        $this->allowSteps = true;
    }

    return $token;
}
454529
455530 /**
@@ -461,7 +536,16 @@ protected function scanBackground()
461536 */
protected function scanScenario()
{
    $token = $this->scanTitleLine($this->currentDialect->getScenarioKeywords(), 'Scenario');

    if ($token !== null) {
        // A matched Scenario title line disallows multiline arguments and allows steps.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
466550
467551 /**
@@ -473,7 +557,16 @@ protected function scanScenario()
473557 */
protected function scanOutline()
{
    $token = $this->scanTitleLine($this->currentDialect->getScenarioOutlineKeywords(), 'Outline');

    if ($token !== null) {
        // A matched Outline title line disallows multiline arguments and allows steps.
        $this->allowMultilineArguments = false;
        $this->allowSteps = true;
    }

    return $token;
}
478571
479572 /**
@@ -485,7 +578,15 @@ protected function scanOutline()
485578 */
protected function scanExamples()
{
    $token = $this->scanTitleLine($this->currentDialect->getExamplesKeywords(), 'Examples');

    if ($token !== null) {
        // Multiline arguments are allowed again after an Examples title line.
        $this->allowMultilineArguments = true;
    }

    return $token;
}
490591
491592 /**
@@ -501,15 +602,27 @@ protected function scanStep()
501602 return null ;
502603 }
503604
504- $ keywords = $ this ->getKeywords ('Step ' );
505- if (!preg_match ('/^\s*( ' . $ keywords . ')([^\s].*)/u ' , $ this ->line , $ matches )) {
605+ $ trimmedLine = $ this ->getTrimmedLine ();
606+ $ matchedKeyword = null ;
607+
608+ foreach ($ this ->currentDialect ->getStepKeywords () as $ keyword ) {
609+ if (str_starts_with ($ trimmedLine , $ keyword )) {
610+ $ matchedKeyword = $ keyword ;
611+ break ;
612+ }
613+ }
614+
615+ if ($ matchedKeyword === null ) {
506616 return null ;
507617 }
508618
509- $ keyword = trim ($ matches [1 ]);
510- $ token = $ this ->takeToken ('Step ' , $ keyword );
511- $ token ['keyword_type ' ] = $ this ->getStepKeywordType ($ keyword );
512- $ token ['text ' ] = $ matches [2 ];
619+ $ text = ltrim (mb_substr ($ trimmedLine , mb_strlen ($ matchedKeyword )));
620+
621+ // cucumber/gherkin reports the keyword text with its final space when it is part of it, but we keep trimming it for BC reasons
622+ // TODO remove the trimming when opting for the cucumber-compatible parsing in the future
623+ $ token = $ this ->takeToken ('Step ' , trim ($ matchedKeyword ));
624+ $ token ['keyword_type ' ] = $ this ->getStepKeywordType ($ matchedKeyword );
625+ $ token ['text ' ] = $ text ;
513626
514627 $ this ->consumeLine ();
515628 $ this ->allowMultilineArguments = true ;
@@ -665,6 +778,7 @@ protected function scanLanguage()
665778
666779 if ($ token ) {
667780 \assert (\is_string ($ token ['value ' ]));
781+ \assert ($ token ['value ' ] !== '' ); // the regex can only match a non-empty value.
668782 $ this ->allowLanguageTag = false ;
669783 $ this ->setLanguage ($ token ['value ' ]);
670784 }
@@ -737,27 +851,36 @@ protected function scanText()
737851 */
private function getStepKeywordType(string $native): string
{
    // Lazily build the keyword => types map for the current dialect; it is reset to null
    // whenever the dialect changes.
    if ($this->stepKeywordTypesCache === null) {
        $this->stepKeywordTypesCache = [];
        $this->addStepKeywordTypes($this->currentDialect->getGivenKeywords(), 'Given');
        $this->addStepKeywordTypes($this->currentDialect->getWhenKeywords(), 'When');
        $this->addStepKeywordTypes($this->currentDialect->getThenKeywords(), 'Then');
        $this->addStepKeywordTypes($this->currentDialect->getAndKeywords(), 'And');
        $this->addStepKeywordTypes($this->currentDialect->getButKeywords(), 'But');
    }

    $types = $this->stepKeywordTypesCache[$native] ?? null;

    if ($types === null) {
        // Should not happen when the native keyword belongs to the dialect.
        // cucumber/gherkin has an UNKNOWN type for this case, but we don't have it.
        return 'Given';
    }

    // A keyword registered under more than one type is ambiguous (e.g. `* ` in English).
    // It is reported as AND so that it is normalized to the previous step type;
    // cucumber/gherkin returns UNKNOWN here, which we don't have.
    return \count($types) === 1 ? $types[0] : 'And';
}
876+
/**
 * Registers $type as a keyword type for each of the given keywords in the step keyword types cache.
 *
 * @param list<string> $keywords
 */
private function addStepKeywordTypes(array $keywords, string $type): void
{
    foreach ($keywords as $keyword) {
        // Append to the types already recorded for this keyword, if any.
        $types = $this->stepKeywordTypesCache[$keyword] ?? [];
        $types[] = $type;
        $this->stepKeywordTypesCache[$keyword] = $types;
    }
}
763886}
0 commit comments