Skip to content

Commit f621586

Browse files
Ajit Pratap Singh
authored and committed
feat(#249): Dialect mode engine (Phase 1) - core plumbing
- Add Dialect field and SetDialect/NewWithDialect to Tokenizer
- Add WithDialect ParserOption and Dialect() getter to Parser
- Add ParseWithDialect/ValidateWithDialect convenience functions
- Wire --dialect flag through CLI validator to tokenizer and parser
- Add comprehensive dialect tests (flow, MySQL/PostgreSQL keywords, defaults)
- Backward compatible: default behavior unchanged (PostgreSQL dialect)
1 parent 1ae108d commit f621586

File tree

6 files changed

+268
-3
lines changed

6 files changed

+268
-3
lines changed

cmd/gosqlx/cmd/validate.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/ajitpratap0/GoSQLX/cmd/gosqlx/internal/config"
1010
"github.com/ajitpratap0/GoSQLX/cmd/gosqlx/internal/output"
11+
"github.com/ajitpratap0/GoSQLX/pkg/sql/keywords"
1112
"github.com/ajitpratap0/GoSQLX/pkg/sql/parser"
1213
)
1314

@@ -269,7 +270,12 @@ func validateFromStdin(cmd *cobra.Command) error {
269270
// validateInlineSQL validates inline SQL passed as a command argument.
270271
// Uses the fast-path Validate() which skips full AST construction (#274).
271272
func validateInlineSQL(cmd *cobra.Command, sql string) error {
272-
err := parser.Validate(sql)
273+
var err error
274+
if validateDialect != "" {
275+
err = parser.ValidateWithDialect(sql, keywords.SQLDialect(validateDialect))
276+
} else {
277+
err = parser.Validate(sql)
278+
}
273279
if err != nil {
274280
if !validateQuiet {
275281
fmt.Fprintf(cmd.ErrOrStderr(), "✗ Invalid SQL: %v\n", err)

cmd/gosqlx/cmd/validator.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/ajitpratap0/GoSQLX/cmd/gosqlx/internal/config"
1212
"github.com/ajitpratap0/GoSQLX/cmd/gosqlx/internal/output"
1313
"github.com/ajitpratap0/GoSQLX/pkg/sql/ast"
14+
"github.com/ajitpratap0/GoSQLX/pkg/sql/keywords"
1415
"github.com/ajitpratap0/GoSQLX/pkg/sql/parser"
1516
"github.com/ajitpratap0/GoSQLX/pkg/sql/tokenizer"
1617
)
@@ -235,10 +236,15 @@ func (v *Validator) validateFile(filename string) output.FileValidationResult {
235236
return result // Empty inputs are considered valid
236237
}
237238

238-
// Use pooled tokenizer for performance
239+
// Use pooled tokenizer for performance with dialect support
239240
tkz := tokenizer.GetTokenizer()
240241
defer tokenizer.PutTokenizer(tkz)
241242

243+
// Configure dialect if specified
244+
if v.Opts.Dialect != "" {
245+
tkz.SetDialect(keywords.SQLDialect(v.Opts.Dialect))
246+
}
247+
242248
// Tokenize
243249
tokens, err := tkz.Tokenize(data)
244250
if err != nil {
@@ -254,7 +260,7 @@ func (v *Validator) validateFile(filename string) output.FileValidationResult {
254260
// Convert TokenWithSpan to Token using centralized converter
255261

256262
// Parse to validate syntax with proper error handling for memory management
257-
p := parser.NewParser()
263+
p := parser.NewParser(parser.WithDialect(v.Opts.Dialect))
258264
astObj, err := p.ParseFromModelTokens(tokens)
259265
if err != nil {
260266
result.Error = fmt.Errorf("parsing failed: %w", err)

pkg/sql/parser/dialect_test.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package parser
2+
3+
import (
4+
"testing"
5+
6+
"github.com/ajitpratap0/GoSQLX/pkg/sql/keywords"
7+
"github.com/ajitpratap0/GoSQLX/pkg/sql/tokenizer"
8+
)
9+
10+
func TestParserWithDialectOption(t *testing.T) {
11+
p := NewParser(WithDialect("mysql"))
12+
if p.Dialect() != "mysql" {
13+
t.Errorf("expected mysql, got %s", p.Dialect())
14+
}
15+
16+
p2 := NewParser()
17+
if p2.Dialect() != "postgresql" {
18+
t.Errorf("expected postgresql default, got %s", p2.Dialect())
19+
}
20+
}
21+
22+
func TestTokenizerWithDialect(t *testing.T) {
23+
tkz, err := tokenizer.NewWithDialect(keywords.DialectMySQL)
24+
if err != nil {
25+
t.Fatal(err)
26+
}
27+
if tkz.Dialect() != keywords.DialectMySQL {
28+
t.Errorf("expected mysql, got %s", tkz.Dialect())
29+
}
30+
}
31+
32+
func TestTokenizerSetDialect(t *testing.T) {
33+
tkz := tokenizer.GetTokenizer()
34+
defer tokenizer.PutTokenizer(tkz)
35+
36+
tkz.SetDialect(keywords.DialectMySQL)
37+
if tkz.Dialect() != keywords.DialectMySQL {
38+
t.Errorf("expected mysql, got %s", tkz.Dialect())
39+
}
40+
}
41+
42+
func TestTokenizerDefaultDialect(t *testing.T) {
43+
tkz, err := tokenizer.NewWithDialect("")
44+
if err != nil {
45+
t.Fatal(err)
46+
}
47+
if tkz.Dialect() != keywords.DialectPostgreSQL {
48+
t.Errorf("expected postgresql default, got %s", tkz.Dialect())
49+
}
50+
}
51+
52+
func TestParseWithDialect(t *testing.T) {
53+
// Basic SQL should parse with any dialect
54+
for _, dialect := range []keywords.SQLDialect{
55+
keywords.DialectPostgreSQL,
56+
keywords.DialectMySQL,
57+
keywords.DialectSQLServer,
58+
} {
59+
t.Run(string(dialect), func(t *testing.T) {
60+
ast, err := ParseWithDialect("SELECT 1", dialect)
61+
if err != nil {
62+
t.Fatalf("ParseWithDialect(%s) failed: %v", dialect, err)
63+
}
64+
if ast == nil {
65+
t.Fatal("expected non-nil AST")
66+
}
67+
})
68+
}
69+
}
70+
71+
func TestValidateWithDialect(t *testing.T) {
72+
err := ValidateWithDialect("SELECT * FROM users WHERE id = 1", keywords.DialectMySQL)
73+
if err != nil {
74+
t.Fatalf("ValidateWithDialect(mysql) failed: %v", err)
75+
}
76+
77+
err = ValidateWithDialect("SELECT * FROM users WHERE id = 1", keywords.DialectPostgreSQL)
78+
if err != nil {
79+
t.Fatalf("ValidateWithDialect(postgresql) failed: %v", err)
80+
}
81+
}
82+
83+
func TestDefaultBehaviorUnchanged(t *testing.T) {
84+
// Validate() without dialect should still work (backward compatibility)
85+
err := Validate("SELECT * FROM users")
86+
if err != nil {
87+
t.Fatalf("Validate() failed: %v", err)
88+
}
89+
90+
ast, err := ParseBytes([]byte("SELECT 1"))
91+
if err != nil {
92+
t.Fatalf("ParseBytes() failed: %v", err)
93+
}
94+
if ast == nil {
95+
t.Fatal("expected non-nil AST")
96+
}
97+
}
98+
99+
func TestMySQLKeywordsRecognized(t *testing.T) {
100+
// UNSIGNED is a MySQL-specific keyword; tokenizer should recognize it
101+
tkz, err := tokenizer.NewWithDialect(keywords.DialectMySQL)
102+
if err != nil {
103+
t.Fatal(err)
104+
}
105+
106+
tokens, err := tkz.Tokenize([]byte("SELECT UNSIGNED"))
107+
if err != nil {
108+
t.Fatalf("tokenize failed: %v", err)
109+
}
110+
111+
// Should have at least 2 tokens (SELECT, UNSIGNED)
112+
if len(tokens) < 2 {
113+
t.Fatalf("expected at least 2 tokens, got %d", len(tokens))
114+
}
115+
}
116+
117+
func TestPostgreSQLKeywordsRecognized(t *testing.T) {
118+
tkz, err := tokenizer.NewWithDialect(keywords.DialectPostgreSQL)
119+
if err != nil {
120+
t.Fatal(err)
121+
}
122+
123+
tokens, err := tkz.Tokenize([]byte("SELECT ILIKE"))
124+
if err != nil {
125+
t.Fatalf("tokenize failed: %v", err)
126+
}
127+
128+
if len(tokens) < 2 {
129+
t.Fatalf("expected at least 2 tokens, got %d", len(tokens))
130+
}
131+
}

pkg/sql/parser/parser.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,24 @@ func WithStrictMode() ParserOption {
208208
}
209209
}
210210

211+
// WithDialect sets the SQL dialect for dialect-aware parsing.
212+
// Supported values: "postgresql", "mysql", "sqlserver", "oracle", "sqlite", etc.
213+
// If not set, defaults to "postgresql" for backward compatibility.
214+
func WithDialect(dialect string) ParserOption {
215+
return func(p *Parser) {
216+
p.dialect = dialect
217+
}
218+
}
219+
220+
// Dialect returns the SQL dialect configured for this parser.
221+
// Returns "postgresql" if no dialect was explicitly set.
222+
func (p *Parser) Dialect() string {
223+
if p.dialect == "" {
224+
return "postgresql"
225+
}
226+
return p.dialect
227+
}
228+
211229
type Parser struct {
212230
tokens []token.Token
213231
currentPos int
@@ -216,6 +234,7 @@ type Parser struct {
216234
ctx context.Context // Optional context for cancellation support
217235
positions []TokenPosition // Position mapping for error reporting
218236
strict bool // Strict mode rejects empty statements
237+
dialect string // SQL dialect for dialect-aware parsing (default: "postgresql")
219238
}
220239

221240
// Parse parses a token stream into an Abstract Syntax Tree (AST).

pkg/sql/parser/validate.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
goerrors "github.com/ajitpratap0/GoSQLX/pkg/errors"
99
"github.com/ajitpratap0/GoSQLX/pkg/models"
1010
"github.com/ajitpratap0/GoSQLX/pkg/sql/ast"
11+
"github.com/ajitpratap0/GoSQLX/pkg/sql/keywords"
1112
"github.com/ajitpratap0/GoSQLX/pkg/sql/token"
1213
"github.com/ajitpratap0/GoSQLX/pkg/sql/tokenizer"
1314
)
@@ -119,3 +120,72 @@ func ParseBytesWithTokens(input []byte) (*ast.AST, []token.Token, error) {
119120

120121
return astResult, converted, nil
121122
}
123+
124+
// ValidateWithDialect checks whether the given SQL string is syntactically valid
125+
// using the specified SQL dialect for keyword recognition.
126+
func ValidateWithDialect(sql string, dialect keywords.SQLDialect) error {
127+
return ValidateBytesWithDialect([]byte(sql), dialect)
128+
}
129+
130+
// ValidateBytesWithDialect is like ValidateWithDialect but accepts []byte.
131+
func ValidateBytesWithDialect(input []byte, dialect keywords.SQLDialect) error {
132+
if len(trimBytes(input)) == 0 {
133+
return nil
134+
}
135+
136+
tkz, err := tokenizer.NewWithDialect(dialect)
137+
if err != nil {
138+
return fmt.Errorf("tokenizer initialization: %w", err)
139+
}
140+
141+
tokens, err := tkz.Tokenize(input)
142+
if err != nil {
143+
return fmt.Errorf("tokenization error: %w", err)
144+
}
145+
146+
if len(tokens) == 0 {
147+
return nil
148+
}
149+
150+
p := NewParser(WithDialect(string(dialect)))
151+
defer p.Release()
152+
153+
converted, convErr := convertModelTokens(tokens)
154+
if convErr != nil {
155+
return fmt.Errorf("token conversion failed: %w", convErr)
156+
}
157+
158+
astResult, parseErr := p.Parse(converted)
159+
if parseErr != nil {
160+
return parseErr
161+
}
162+
ast.ReleaseAST(astResult)
163+
return nil
164+
}
165+
166+
// ParseWithDialect parses SQL using the specified dialect for keyword recognition.
167+
// This is a convenience function combining dialect-aware tokenization and parsing.
168+
func ParseWithDialect(sql string, dialect keywords.SQLDialect) (*ast.AST, error) {
169+
return ParseBytesWithDialect([]byte(sql), dialect)
170+
}
171+
172+
// ParseBytesWithDialect is like ParseWithDialect but accepts []byte.
173+
func ParseBytesWithDialect(input []byte, dialect keywords.SQLDialect) (*ast.AST, error) {
174+
tkz, err := tokenizer.NewWithDialect(dialect)
175+
if err != nil {
176+
return nil, fmt.Errorf("tokenizer initialization: %w", err)
177+
}
178+
179+
tokens, err := tkz.Tokenize(input)
180+
if err != nil {
181+
return nil, fmt.Errorf("tokenization error: %w", err)
182+
}
183+
184+
if len(tokens) == 0 {
185+
return nil, goerrors.IncompleteStatementError(models.Location{}, "")
186+
}
187+
188+
p := NewParser(WithDialect(string(dialect)))
189+
190+
return p.ParseFromModelTokens(tokens)
191+
}

pkg/sql/tokenizer/tokenizer.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ type Tokenizer struct {
244244
lineStarts []int // Byte offsets of line starts (for position tracking)
245245
line int // Current line number (1-based)
246246
keywords *keywords.Keywords // Keyword classifier for token type determination
247+
dialect keywords.SQLDialect // SQL dialect for dialect-specific keyword recognition
247248
logger *slog.Logger // Optional structured logger for verbose tracing
248249
Comments []models.Comment // Comments captured during tokenization
249250
}
@@ -267,11 +268,43 @@ func New() (*Tokenizer, error) {
267268
kw := keywords.NewKeywords()
268269
return &Tokenizer{
269270
keywords: kw,
271+
dialect: keywords.DialectPostgreSQL,
270272
pos: NewPosition(1, 0),
271273
lineStarts: []int{0},
272274
}, nil
273275
}
274276

277+
// NewWithDialect creates a new Tokenizer configured for the given SQL dialect.
278+
// Dialect-specific keywords are recognized based on the dialect parameter.
279+
// If dialect is empty or unknown, defaults to DialectPostgreSQL.
280+
func NewWithDialect(dialect keywords.SQLDialect) (*Tokenizer, error) {
281+
if dialect == "" || dialect == keywords.DialectUnknown {
282+
dialect = keywords.DialectPostgreSQL
283+
}
284+
kw := keywords.New(dialect, true)
285+
return &Tokenizer{
286+
keywords: kw,
287+
dialect: dialect,
288+
pos: NewPosition(1, 0),
289+
lineStarts: []int{0},
290+
}, nil
291+
}
292+
293+
// Dialect returns the SQL dialect configured for this tokenizer.
294+
func (t *Tokenizer) Dialect() keywords.SQLDialect {
295+
return t.dialect
296+
}
297+
298+
// SetDialect reconfigures the tokenizer for a different SQL dialect.
299+
// This rebuilds the keyword set to include dialect-specific keywords.
300+
func (t *Tokenizer) SetDialect(dialect keywords.SQLDialect) {
301+
if dialect == "" || dialect == keywords.DialectUnknown {
302+
dialect = keywords.DialectPostgreSQL
303+
}
304+
t.dialect = dialect
305+
t.keywords = keywords.New(dialect, true)
306+
}
307+
275308
// NewWithKeywords initializes a Tokenizer with a custom keyword classifier.
276309
// This allows you to customize keyword recognition for specific SQL dialects
277310
// or to add custom keywords.

0 commit comments

Comments
 (0)