Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved parser performance #372

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ This document is formatted according to the principles of [Keep A CHANGELOG](htt
- [.NET] Fix NuGet package generation
- [c] Optimise error handling for empty datatable rows
- [Perl] Optimise error handling for unclosed DocStrings
- [Java] Improve parsing time ([#361](https://github.com/cucumber/gherkin/issues/361))

## [31.0.0] - 2025-01-29
### Added
Expand Down
15 changes: 9 additions & 6 deletions java/gherkin-java.razor
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,16 @@ import static java.util.Arrays.asList;

class @Model.ParserClassName<T> {
enum TokenType {
None,
None(RuleType.None),
@foreach(var rule in Model.RuleSet.TokenRules)
{<text> @rule.Name.Replace("#", ""),
{<text> @(rule.Name.Replace("#", ""))(RuleType.@(rule.Name.Replace("#", "_"))),
</text>} ;

final RuleType ruleType;

// Binds each token type to the rule type passed in; the value is kept in the final ruleType field.
TokenType(RuleType ruleType) {
this.ruleType = ruleType;
}
}

enum RuleType {
Expand All @@ -59,9 +65,6 @@ class @Model.ParserClassName<T> {
{<text> @rule.Name.Replace("#", "_"), // @rule.ToString(true)
</text>} ;

static RuleType cast(TokenType tokenType) {
return RuleType.values()[tokenType.ordinal()];
}
}

private final Builder<T> builder;
Expand Down Expand Up @@ -189,7 +192,7 @@ class @Model.ParserClassName<T> {
}

private Token readToken(ParserContext context) {
return context.tokenQueue.size() > 0 ? context.tokenQueue.remove() : context.tokenScanner.read();
return context.tokenQueue.isEmpty() ? context.tokenScanner.read() : context.tokenQueue.remove();
}

@foreach(var rule in Model.RuleSet.TokenRules)
Expand Down
5 changes: 2 additions & 3 deletions java/src/main/java/io/cucumber/gherkin/AstNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ public <T> List<T> getItems(RuleType ruleType) {
}

public Token getToken(TokenType tokenType) {
RuleType ruleType = RuleType.cast(tokenType);
return getSingle(ruleType, new Token(null, null));
return getSingle(tokenType.ruleType, new Token(null, null));
}

public List<Token> getTokens(TokenType tokenType) {
return getItems(RuleType.cast(tokenType));
return getItems(tokenType.ruleType);
}
}
23 changes: 14 additions & 9 deletions java/src/main/java/io/cucumber/gherkin/EncodingParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Optional;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand All @@ -18,6 +19,7 @@ class EncodingParser {
private static final Pattern COMMENT_OR_EMPTY_LINE_PATTERN = Pattern.compile("^\\s*#|^\\s*$");
private static final Pattern ENCODING_PATTERN = Pattern.compile("^\\s*#\\s*encoding\\s*:\\s*([0-9a-zA-Z\\-]+)",
CASE_INSENSITIVE);
private static final Pattern PATTERN_NEW_LINE = Pattern.compile("[\\n\\r]");

static String readWithEncodingFromSource(byte[] source) {
byte[] bomFreeSource = removeByteOrderMarker(source);
Expand All @@ -41,15 +43,18 @@ private static byte[] removeByteOrderMarker(byte[] source) {
}

private static Optional<Charset> parseEncodingPragma(String source) {
for (String line : source.split("[\\n\\r]")) {
if (!COMMENT_OR_EMPTY_LINE_PATTERN.matcher(line).find()) {
return Optional.empty();
}
Matcher matcher = ENCODING_PATTERN.matcher(line);
if (matcher.find()) {
String charSetName = matcher.group(1).toUpperCase(ROOT);
Charset charset = Charset.forName(charSetName);
return Optional.of(charset);
try (Scanner scanner = new Scanner(source)) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
if (!COMMENT_OR_EMPTY_LINE_PATTERN.matcher(line).find()) {
return Optional.empty();
}
Matcher matcher = ENCODING_PATTERN.matcher(line);
if (matcher.find()) {
String charSetName = matcher.group(1).toUpperCase(ROOT);
Charset charset = Charset.forName(charSetName);
return Optional.of(charset);
}
}
}
return Optional.empty();
Expand Down
13 changes: 7 additions & 6 deletions java/src/main/java/io/cucumber/gherkin/GherkinDialect.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import static java.util.Collections.unmodifiableList;
import static java.util.Objects.requireNonNull;
Expand Down Expand Up @@ -45,23 +45,24 @@ public final class GherkinDialect {
this.andKeywords = requireNonNull(andKeywords);
this.butKeywords = requireNonNull(butKeywords);

List<String> stepKeywords = new ArrayList<>();
List<String> stepKeywords = new ArrayList<>(givenKeywords.size() + whenKeywords.size() +
thenKeywords.size() + andKeywords.size() + butKeywords.size());
stepKeywords.addAll(givenKeywords);
stepKeywords.addAll(whenKeywords);
stepKeywords.addAll(thenKeywords);
stepKeywords.addAll(whenKeywords);
stepKeywords.addAll(andKeywords);
stepKeywords.addAll(butKeywords);
this.stepKeywords = unmodifiableList(stepKeywords);
this.stepKeywords = unmodifiableList(stepKeywords.stream().distinct().collect(Collectors.toList()));

Map<String, List<StepKeywordType>> stepKeywordsTypes = new HashMap<>();
addStepKeywordsTypes(stepKeywordsTypes, getGivenKeywords(), StepKeywordType.CONTEXT);
addStepKeywordsTypes(stepKeywordsTypes, getWhenKeywords(), StepKeywordType.ACTION);
addStepKeywordsTypes(stepKeywordsTypes, getThenKeywords(), StepKeywordType.OUTCOME);

List<String> conjunctionKeywords = new ArrayList<>();
List<String> conjunctionKeywords = new ArrayList<>(andKeywords.size() + butKeywords.size());
conjunctionKeywords.addAll(getAndKeywords());
conjunctionKeywords.addAll(getButKeywords());
addStepKeywordsTypes(stepKeywordsTypes, conjunctionKeywords, StepKeywordType.CONJUNCTION);
addStepKeywordsTypes(stepKeywordsTypes, conjunctionKeywords.stream().distinct().collect(Collectors.toList()), StepKeywordType.CONJUNCTION);
this.stepKeywordsTypes = stepKeywordsTypes;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@

public final class GherkinDialectProvider {

private final String defaultDialectName;
private final GherkinDialect defaultDialect;

public GherkinDialectProvider(String defaultDialectName) {
this.defaultDialectName = requireNonNull(defaultDialectName);
this.defaultDialect = getDialect(defaultDialectName)
.orElseThrow(() -> new ParserException.NoSuchLanguageException(defaultDialectName, null));
}

/**
 * Creates a provider that uses {@code "en"} as the default dialect name by
 * delegating to {@link #GherkinDialectProvider(String)}.
 */
public GherkinDialectProvider() {
this("en");
}

public GherkinDialect getDefaultDialect() {
return getDialect(defaultDialectName).orElseThrow(() -> new ParserException.NoSuchLanguageException(defaultDialectName, null));
return defaultDialect;
}

public Optional<GherkinDialect> getDialect(String language) {
Expand Down
38 changes: 22 additions & 16 deletions java/src/main/java/io/cucumber/gherkin/GherkinDocumentBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static io.cucumber.gherkin.Parser.Builder;
import static io.cucumber.gherkin.Parser.RuleType;
import static io.cucumber.gherkin.Parser.TokenType;

class GherkinDocumentBuilder implements Builder<GherkinDocument> {
public static final Pattern PATTERN_SPACES = Pattern.compile("\\s*");
private final List<Comment> comments = new ArrayList<>();
private final IdGenerator idGenerator;
private String uri;
Expand All @@ -53,12 +55,11 @@ private AstNode currentNode() {

@Override
public void build(Token token) {
RuleType ruleType = RuleType.cast(token.matchedType);
if (token.matchedType == TokenType.Comment) {
Comment comment = new Comment(getLocation(token, 0), token.matchedText);
Comment comment = new Comment(getLocation(token), token.matchedText);
comments.add(comment);
} else {
currentNode().add(ruleType, token);
currentNode().add(token.matchedType.ruleType, token);
}
}

Expand All @@ -79,7 +80,7 @@ private Object getTransformedNode(AstNode node) {
case Step: {
Token stepLine = node.getToken(TokenType.StepLine);
return new Step(
getLocation(stepLine, 0),
getLocation(stepLine),
stepLine.matchedKeyword,
stepLine.keywordType,
stepLine.matchedText,
Expand All @@ -90,7 +91,7 @@ private Object getTransformedNode(AstNode node) {
}
case DocString: {
Token separatorToken = node.getTokens(TokenType.DocStringSeparator).get(0);
String mediaType = separatorToken.matchedText.length() > 0 ? separatorToken.matchedText : null;
String mediaType = separatorToken.matchedText.isEmpty() ? null : separatorToken.matchedText;
List<Token> lineTokens = node.getTokens(TokenType.Other);
StringBuilder content = new StringBuilder();
boolean newLine = false;
Expand All @@ -101,7 +102,7 @@ private Object getTransformedNode(AstNode node) {
}

return new DocString(
getLocation(separatorToken, 0),
getLocation(separatorToken),
mediaType,
content.toString(),
separatorToken.matchedKeyword
Expand All @@ -114,7 +115,7 @@ private Object getTransformedNode(AstNode node) {
case Background: {
Token backgroundLine = node.getToken(TokenType.BackgroundLine);
return new Background(
getLocation(backgroundLine, 0),
getLocation(backgroundLine),
backgroundLine.matchedKeyword,
backgroundLine.matchedText,
getDescription(node),
Expand All @@ -127,7 +128,7 @@ private Object getTransformedNode(AstNode node) {
Token scenarioLine = scenarioNode.getToken(TokenType.ScenarioLine);

return new Scenario(
getLocation(scenarioLine, 0),
getLocation(scenarioLine),
getTags(node),
scenarioLine.matchedKeyword,
scenarioLine.matchedText,
Expand All @@ -145,7 +146,7 @@ private Object getTransformedNode(AstNode node) {
List<TableRow> tableBody = rows != null && !rows.isEmpty() ? rows.subList(1, rows.size()) : Collections.emptyList();

return new Examples(
getLocation(examplesLine, 0),
getLocation(examplesLine),
getTags(node),
examplesLine.matchedKeyword,
examplesLine.matchedText,
Expand All @@ -163,7 +164,7 @@ private Object getTransformedNode(AstNode node) {
List<Token> lineTokens = node.getTokens(TokenType.Other);
// Trim trailing empty lines
int end = lineTokens.size();
while (end > 0 && lineTokens.get(end - 1).matchedText.matches("\\s*")) {
while (end > 0 && PATTERN_SPACES.matcher(lineTokens.get(end - 1).matchedText).matches()) {
end--;
}
lineTokens = lineTokens.subList(0, end);
Expand Down Expand Up @@ -196,7 +197,7 @@ private Object getTransformedNode(AstNode node) {
String language = featureLine.matchedGherkinDialect.getLanguage();

return new Feature(
getLocation(featureLine, 0),
getLocation(featureLine),
tags,
language,
featureLine.matchedKeyword,
Expand Down Expand Up @@ -224,7 +225,7 @@ private Object getTransformedNode(AstNode node) {
}

return new Rule(
getLocation(ruleLine, 0),
getLocation(ruleLine),
tags,
ruleLine.matchedKeyword,
ruleLine.matchedText,
Expand All @@ -244,10 +245,11 @@ private Object getTransformedNode(AstNode node) {
}

private List<TableRow> getTableRows(AstNode node) {
List<TableRow> rows = new ArrayList<>();
List<Token> tokens = node.getTokens(TokenType.TableRow);
List<TableRow> rows = new ArrayList<>(tokens.size());

for (Token token : node.getTokens(TokenType.TableRow)) {
TableRow tableRow = new TableRow(getLocation(token, 0), getCells(token), idGenerator.newId());
for (Token token : tokens) {
TableRow tableRow = new TableRow(getLocation(token), getCells(token), idGenerator.newId());
rows.add(tableRow);
}
ensureCellCount(rows);
Expand All @@ -271,7 +273,7 @@ private void ensureCellCount(List<TableRow> rows) {
}

private List<TableCell> getCells(Token token) {
List<TableCell> cells = new ArrayList<>();
List<TableCell> cells = new ArrayList<>(token.matchedItems.size());
for (GherkinLineSpan cellItem : token.matchedItems) {
TableCell tableCell = new TableCell(
getLocation(token, cellItem.column),
Expand All @@ -291,6 +293,10 @@ private io.cucumber.messages.types.Location getLocation(Token token, int column)
return new io.cucumber.messages.types.Location((long) token.location.getLine(), (long) column);
}

/**
 * Converts the token's own location into a messages {@code Location},
 * using the line and column recorded on {@code token.location}.
 *
 * @param token the token whose location is converted
 * @return a new messages location with the token's line and column
 */
private io.cucumber.messages.types.Location getLocation(Token token) {
    final long line = (long) token.location.getLine();
    final long column = (long) token.location.getColumn();
    return new io.cucumber.messages.types.Location(line, column);
}

/**
 * Looks up the single {@code RuleType.Description} item of the given node.
 * An empty string is supplied as the second argument to {@code getSingle}
 * (presumably the fallback when the node has no description; confirm against AstNode).
 *
 * @param node the AST node to read the description from
 * @return the value returned by {@code getSingle}
 */
private String getDescription(AstNode node) {
    final String fallback = "";
    return node.getSingle(RuleType.Description, fallback);
}
Expand Down
Loading