Commit 6fb0ebb

Add tests of lexer dialects (sqlparse, sqlfluff) (#1313)
This PR adds tests of our ports of the sqlparse and sqlfluff lexer dialects. The test files contain SQL queries extracted from the test suites of sqlparse and sqlfluff. The expected outputs (tokens) were collected by running those queries through the original sqlparse and sqlfluff lexers. This gives very good test coverage (>10k SQL queries, including many edge-case/"strange"/"tricky" queries) and lets us be almost certain that our implementation behaves identically to the original sqlparse/sqlfluff implementations. The test files are stored in a separate repository (added to Quesma as a Git submodule) because they are large (~10 MB).
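As a rough illustration of that extraction step (a hypothetical sketch, not the script actually used; the real extraction lives in the testdata repository, and its output format is described by testutils.LoadParsedTestcases below), the expected tokens for a single query can be obtained from sqlparse like this:

    # Hypothetical sketch: dump the tokens produced by the original sqlparse lexer
    import sqlparse

    query = "SELECT * FROM tabela"
    for token in sqlparse.parse(query)[0].flatten():
        # each leaf token has a type (e.g. Token.Keyword.DML) and its raw text
        print(str(token.ttype), repr(token.value))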
1 parent 6b71cac commit 6fb0ebb

File tree

6 files changed: +151 −2 lines changed


.github/workflows/pipeline.yml

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          submodules: 'true'
 
       - name: Tune GitHub-hosted runner network
         uses: smorimoto/tune-github-hosted-runner-network@v1
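Note that submodules: 'true' only affects the CI checkout. For a local checkout the test data still has to be fetched explicitly with the standard Git submodule commands (illustrative, nothing repo-specific):

    git clone --recurse-submodules <repository-url>
    # or, in an existing clone:
    git submodule update --init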

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+[submodule "platform/parsers/sql/testdata"]
+    path = platform/parsers/sql/testdata
+    url = https://github.com/avelanarius/quesma-testdata-wip.git

platform/parsers/sql/lexer/dialect_sqlparse/rule_test.go

Lines changed: 39 additions & 1 deletion
@@ -4,9 +4,11 @@
 package dialect_sqlparse
 
 import (
-    "github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
     "testing"
 
+    "github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+    "github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
+
     "github.com/stretchr/testify/assert"
 )
 
@@ -38,6 +40,42 @@ func TestSimpleSelect(t *testing.T) {
     assert.Equal(t, "tabela", tokens[6].RawValue)
 }
 
+func TestSqlparsedTestcases(t *testing.T) {
+    testfiles := []string{
+        "../../testdata/testdata/dialect_sqlparse/parsed-sqlparse-testcases.txt",
+        "../../testdata/testdata/dialect_sqlparse/parsed-sqlfluff-all-testcases.txt",
+    }
+
+    for _, testfile := range testfiles {
+        t.Run(testfile, func(t *testing.T) {
+            testcases := testutils.LoadParsedTestcases(testfile)
+            for _, testcase := range testcases {
+                t.Run(testcase.Query, func(t *testing.T) {
+                    tokens := core.Lex(testcase.Query, SqlparseRules)
+                    assert.Equal(t, len(testcase.ExpectedTokens), len(tokens))
+
+                    commonLength := min(len(testcase.ExpectedTokens), len(tokens))
+
+                    for i := 0; i < commonLength; i++ {
+                        assert.Equalf(t, testcase.ExpectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
+                        assert.Equalf(t, testcase.ExpectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
+                    }
+
+                    if t.Failed() {
+                        for i := 0; i < commonLength; i++ {
+                            if testcase.ExpectedTokens[i].TokenType != tokens[i].Type.Name || testcase.ExpectedTokens[i].TokenValue != tokens[i].RawValue {
+                                t.Logf("Mismatch token at position %d: %s(%s). Got: %s(%s)", i, testcase.ExpectedTokens[i].TokenType, testcase.ExpectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+                            } else {
+                                t.Logf("Expected token at position %d: %s(%s). Got: %s(%s)", i, testcase.ExpectedTokens[i].TokenType, testcase.ExpectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+                            }
+                        }
+                    }
+                })
+            }
+        })
+    }
+}
+
 func FuzzLex(f *testing.F) {
     f.Add("SELECT * FROM tabela")
     f.Add("SELECT id, name, email FROM customers WHERE age > 21")

platform/parsers/sql/lexer/dialects_sqlfluff/ansi/rule_test.go

Lines changed: 41 additions & 1 deletion
@@ -4,11 +4,51 @@
 package ansi
 
 import (
+    "testing"
+
     "github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/core"
+    "github.com/QuesmaOrg/quesma/platform/parsers/sql/lexer/testutils"
     "github.com/stretchr/testify/assert"
-    "testing"
 )
 
+func TestSqlfluffAnsiTestcases(t *testing.T) {
+    testfiles := []string{
+        "../../../testdata/testdata/dialects_sqlfluff/parsed-sqlfluff-ansi-testcases.txt",
+        "../../../testdata/testdata/dialects_sqlfluff/parsed-sqlparse-testcases.txt",
+    }
+
+    for _, testfile := range testfiles {
+        t.Run(testfile, func(t *testing.T) {
+            testcases := testutils.LoadParsedTestcases(testfile)
+            for _, testcase := range testcases {
+                t.Run(testcase.Query, func(t *testing.T) {
+                    expectedTokens := testcase.ExpectedTokens[:len(testcase.ExpectedTokens)-1] // remove the last token, which is an EOF token
+
+                    tokens := core.Lex(testcase.Query, SqlfluffAnsiRules)
+                    assert.Equal(t, len(expectedTokens), len(tokens))
+
+                    commonLength := min(len(expectedTokens), len(tokens))
+
+                    for i := 0; i < commonLength; i++ {
+                        assert.Equalf(t, expectedTokens[i].TokenType, tokens[i].Type.Name, "Token type at position %d", i)
+                        assert.Equalf(t, expectedTokens[i].TokenValue, tokens[i].RawValue, "Token value at position %d", i)
+                    }
+
+                    if t.Failed() {
+                        for i := 0; i < commonLength; i++ {
+                            if expectedTokens[i].TokenType != tokens[i].Type.Name || expectedTokens[i].TokenValue != tokens[i].RawValue {
+                                t.Logf("Mismatch token at position %d: %s(%s). Got: %s(%s)", i, expectedTokens[i].TokenType, expectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+                            } else {
+                                t.Logf("Expected token at position %d: %s(%s). Got: %s(%s)", i, expectedTokens[i].TokenType, expectedTokens[i].TokenValue, tokens[i].Type.Name, tokens[i].RawValue)
+                            }
+                        }
+                    }
+                })
+            }
+        })
+    }
+}
+
 func FuzzLex(f *testing.F) {
     f.Add("SELECT * FROM tabela")
     f.Add("SELECT id, name, email FROM customers WHERE age > 21")
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+// Copyright Quesma, licensed under the Elastic License 2.0.
+// SPDX-License-Identifier: Elastic-2.0
+
+package testutils
+
+import (
+    "bytes"
+    "os"
+)
+
+type ParsedTestcase struct {
+    Query          string
+    ExpectedTokens []ExpectedToken
+}
+
+type ExpectedToken struct {
+    TokenType  string
+    TokenValue string
+}
+
+// Loads a list of test queries and their expected tokens (extracted from existing parsers).
+// The structure of the file is as follows:
+//
+// [QUERY1]
+// <end_of_query/>
+// [TOKEN_TYPE_1]
+// [TOKEN_VALUE_1]
+// <end_of_token/>
+// [TOKEN_TYPE_2]
+// [TOKEN_VALUE_2]
+// <end_of_token/>
+// ...
+// <end_of_tokens/>
+// [QUERY2]
+// ...
+func LoadParsedTestcases(filename string) []ParsedTestcase {
+    contents, err := os.ReadFile(filename)
+    if err != nil {
+        panic(err)
+    }
+
+    testcases := bytes.Split(contents, []byte("\n<end_of_tokens/>\n"))
+    testcases = testcases[:len(testcases)-1]
+
+    var parsedTestcases []ParsedTestcase
+    for _, testcase := range testcases {
+        endOfQuerySplit := bytes.Split(testcase, []byte("\n<end_of_query/>\n"))
+
+        query := string(endOfQuerySplit[0])
+
+        tokens := bytes.Split(endOfQuerySplit[1], []byte("\n<end_of_token/>\n"))
+        tokens = tokens[:len(tokens)-1]
+
+        var expectedTokens []ExpectedToken
+        for _, tokenDescription := range tokens {
+            tokenDescriptionSplit := bytes.SplitN(tokenDescription, []byte("\n"), 2)
+            tokenType := string(tokenDescriptionSplit[0])
+            tokenValue := string(tokenDescriptionSplit[1])
+            expectedTokens = append(expectedTokens, ExpectedToken{tokenType, tokenValue})
+        }
+
+        parsedTestcases = append(parsedTestcases, ParsedTestcase{query, expectedTokens})
+    }
+    return parsedTestcases
+}
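For illustration, a single testcase entry following the structure documented in the comment above could look like this (the query and token type names are illustrative, not taken from the actual testdata files; the value of the whitespace token is a single space character):

    SELECT 1
    <end_of_query/>
    Token.Keyword.DML
    SELECT
    <end_of_token/>
    Token.Text.Whitespace
     
    <end_of_token/>
    Token.Literal.Number.Integer
    1
    <end_of_token/>
    <end_of_tokens/>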

platform/parsers/sql/testdata

Submodule testdata added at e7995a7
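With the submodule checked out, the new tests run under the standard Go tooling, for example with go test ./... from the module that contains the lexer packages (the exact package path depends on the checkout layout).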
