Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.

Commit ca8ec4a

Browse files
authored
Lucene enhancements (#921)
This Lucene parser seemed to work fine for a subset of Lucene. Now I had to extend that subset a bit, and I'm afraid some unusual (and usually incorrect) queries might not be handled 100% correctly, although I'd have to think for a while to come up with an example. But all tests and the customer's dashboards work fine, so I'd merge it. I have an idea for how to support full Lucene; I'm about 90% sure it would work, and I'll try to implement it very soon. I also tried 2 other open-source Lucene parsers, and they both fail on ~20-30 of our 60 tests, so if ours passes all of them, it's more likely fine than not.
1 parent 898d937 commit ca8ec4a

File tree

4 files changed

+40
-12
lines changed

4 files changed

+40
-12
lines changed

quesma/queryparser/lucene/expression.go

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,25 @@ package lucene
55
import (
66
"quesma/logger"
77
"quesma/model"
8-
//wc "quesma/queryparser/where_clause"
98
)
109

10+
var invalidStatement = model.NewLiteral("false")
11+
1112
func (p *luceneParser) BuildWhereStatement() model.Expr {
1213
for len(p.tokens) > 0 {
1314
p.WhereStatement = p.buildWhereStatement(true)
1415
}
1516
if p.WhereStatement == nil {
16-
p.WhereStatement = model.NewLiteral("true")
17+
return model.NewLiteral("true")
1718
}
1819
return p.WhereStatement
1920
}
2021

21-
// LeafStatement is a smallest part of a query that can be translated into SQL,
22+
// LeafStatement is the smallest part of a query that can be translated into SQL,
2223
// e.g. "title:abc", or "abc", or "title:(abc OR def)".
2324
func newLeafStatement(fieldNames []string, value value) model.Expr {
2425
if len(fieldNames) == 0 {
25-
return model.NewLiteral("false")
26+
return invalidStatement
2627
}
2728

2829
var newStatement model.Expr
@@ -38,16 +39,19 @@ func newLeafStatement(fieldNames []string, value value) model.Expr {
3839
return newStatement
3940
}
4041

41-
var invalidStatement = model.NewLiteral("false")
42-
4342
// buildWhereStatement builds a WHERE statement from the tokens.
4443
// During parsing, we only keep one expression, because we're combining leafExpressions into
4544
// a tree of expressions. We keep the lastExpression to combine it with the next one.
4645
// E.g. "title:abc AND text:def" is parsed into andExpression(title:abc, text:def).
4746
func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
47+
if len(p.tokens) == 0 {
48+
return invalidStatement
49+
}
50+
4851
tok := p.tokens[0]
4952
p.tokens = p.tokens[1:]
5053
var currentStatement model.Expr
54+
5155
switch currentToken := tok.(type) {
5256
case fieldNameToken:
5357
if len(p.tokens) <= 1 {
@@ -79,15 +83,29 @@ func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
7983
case notToken:
8084
latterExp := p.buildWhereStatement(false)
8185
currentStatement = model.NewPrefixExpr("NOT", []model.Expr{latterExp})
86+
case existsToken:
87+
fieldName, ok := p.buildValue([]value{}, 0).(termValue)
88+
if !ok {
89+
logger.Error().Msgf("buildExpression: invalid expression, unexpected token: %#v, tokens: %v", currentToken, p.tokens)
90+
return invalidStatement
91+
}
92+
currentStatement = model.NewInfixExpr(model.NewColumnRef(fieldName.term), " IS NOT ", model.NewLiteral("NULL"))
8293
case leftParenthesisToken:
83-
currentStatement = newLeafStatement(p.defaultFieldNames, p.buildValue([]value{}, 1))
94+
currentStatement = model.NewParenExpr(p.buildWhereStatement(false))
95+
case rightParenthesisToken:
96+
if p.WhereStatement == nil {
97+
return invalidStatement
98+
}
99+
return p.WhereStatement
84100
default:
85101
logger.Error().Msgf("buildExpression: invalid expression, unexpected token: %#v, tokens: %v", currentToken, p.tokens)
86102
return invalidStatement
87103
}
104+
88105
if !addDefaultOperator || p.WhereStatement == nil {
89106
return currentStatement
90107
}
108+
91109
switch stmt := currentStatement.(type) {
92110
case model.PrefixExpr:
93111
if stmt.Op == "NOT" {

quesma/queryparser/lucene/lucene_parser.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
// - escaped " inside quoted fieldnames, so e.g.
2525
// * "a\"b" - not supported
2626
// * abc"def - supported
27-
// - +, -, &&, ||, ! operators. But AND, OR, NOT are fully supported and they seem equivalent.
27+
// - +, -, &&, || operators. But AND, OR, NOT are fully supported and they seem equivalent.
2828

2929
// Date ranges are only in format YYYY-MM-DD, as in docs there are no other examples. That can be changed if needed.
3030

@@ -68,6 +68,8 @@ var specialOperators = map[string]token{
6868
"AND ": andToken{},
6969
"OR ": orToken{},
7070
"NOT ": notToken{},
71+
"!": notToken{},
72+
"_exists_:": existsToken{},
7173
string(leftParenthesis): leftParenthesisToken{},
7274
string(rightParenthesis): rightParenthesisToken{},
7375
}

quesma/queryparser/lucene/lucene_parser_test.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,15 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
4545
{`"jakarta apache" AND "Apache Lucene"`, `(("title" = 'jakarta apache' OR "text" = 'jakarta apache') AND ("title" = 'Apache Lucene' OR "text" = 'Apache Lucene'))`},
4646
{`NOT status:"jakarta apache"`, `NOT ("status" = 'jakarta apache')`},
4747
{`"jakarta apache" NOT "Apache Lucene"`, `(("title" = 'jakarta apache' OR "text" = 'jakarta apache') AND NOT (("title" = 'Apache Lucene' OR "text" = 'Apache Lucene')))`},
48-
{`(jakarta OR apache) AND website`, `((("title" = 'jakarta' OR "title" = 'apache') OR ("text" = 'jakarta' OR "text" = 'apache')) AND ("title" = 'website' OR "text" = 'website'))`},
48+
{`(jakarta OR apache) AND website`, `(((("title" = 'jakarta' OR "text" = 'jakarta')) OR ("title" = 'apache' OR "text" = 'apache')) AND ("title" = 'website' OR "text" = 'website'))`},
4949
{`title:(return "pink panther")`, `("title" = 'return' OR "title" = 'pink panther')`},
5050
{`status:(active OR pending) title:(full text search)^2`, `(("status" = 'active' OR "status" = 'pending') OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
5151
{`status:(active OR NOT (pending AND in-progress)) title:(full text search)^2`, `(("status" = 'active' OR NOT (("status" = 'pending' AND "status" = 'in-progress'))) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
5252
{`status:(NOT active OR NOT (pending AND in-progress)) title:(full text search)^2`, `((NOT ("status" = 'active') OR NOT (("status" = 'pending' AND "status" = 'in-progress'))) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
5353
{`status:(active OR (pending AND in-progress)) title:(full text search)^2`, `(("status" = 'active' OR ("status" = 'pending' AND "status" = 'in-progress')) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
5454
{`status:((a OR (b AND c)) AND d)`, `(("status" = 'a' OR ("status" = 'b' AND "status" = 'c')) AND "status" = 'd')`},
5555
{`title:(return [Aida TO Carmen])`, `("title" = 'return' OR ("title" >= 'Aida' AND "title" <= 'Carmen'))`},
56-
{`host.name:(NOT active OR NOT (pending OR in-progress)) (full text search)^2`, `((NOT ("host.name" = 'active') OR NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR ((("title" = 'full' OR "title" = 'text') OR "title" = 'search') OR (("text" = 'full' OR "text" = 'text') OR "text" = 'search')))`},
56+
{`host.name:(NOT active OR NOT (pending OR in-progress)) (full text search)^2`, `((((NOT ("host.name" = 'active') OR NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR (("title" = 'full' OR "text" = 'full'))) OR ("title" = 'text' OR "text" = 'text')) OR ("title" = 'search' OR "text" = 'search'))`},
5757
{`host.name:(active AND NOT (pending OR in-progress)) hermes nemesis^2`, `((("host.name" = 'active' AND NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR ("title" = 'hermes' OR "text" = 'hermes')) OR ("title" = 'nemesis' OR "text" = 'nemesis'))`},
5858
{`dajhd \(%&RY#WFDG`, `(("title" = 'dajhd' OR "text" = 'dajhd') OR ("title" = '(%&RY#WFDG' OR "text" = '(%&RY#WFDG'))`},
5959
// tests for wildcards
@@ -64,6 +64,12 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
6464
{`title:abc\*`, `"title" = 'abc*'`},
6565
{`title:abc*\*`, `"title" ILIKE 'abc%*'`},
6666
{`ab\+c`, `("title" = 'ab+c' OR "text" = 'ab+c')`},
67+
{`!db.str:FAIL`, `NOT ("db.str" = 'FAIL')`},
68+
{`_exists_:title`, `"title" IS NOT NULL`},
69+
{`!_exists_:title`, `NOT ("title" IS NOT NULL)`},
70+
{"db.str:*weaver*", `"db.str" ILIKE '%weaver%'`},
71+
{"(db.str:*weaver*)", `("db.str" ILIKE '%weaver%')`},
72+
{"(a.type:*ab* OR a.type:*Ab*)", `(("a.type" ILIKE '%ab%') OR "a.type" ILIKE '%Ab%')`},
6773
}
6874
var randomQueriesWithPossiblyIncorrectInput = []struct {
6975
query string
@@ -72,7 +78,7 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
7278
{``, `true`},
7379
{` `, `true`},
7480
{` 2 `, `("title" = '2' OR "text" = '2')`},
75-
{` 2df$ ! `, `(("title" = '2df$' OR "text" = '2df$') OR ("title" = '!' OR "text" = '!'))`},
81+
{` 2df$ ! `, `(("title" = '2df$' OR "text" = '2df$') AND NOT (false))`}, // TODO: this should probably just be "false"
7682
{`title:`, `false`},
7783
{`title: abc`, `"title" = 'abc'`},
7884
{`title[`, `("title" = 'title[' OR "text" = 'title[')`},
@@ -82,7 +88,7 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
8288
{` title `, `("title" = 'title' OR "text" = 'title')`},
8389
{` title : (+a -b c)`, `(("title" = '+a' OR "title" = '-b') OR "title" = 'c')`}, // we don't support '+', '-' operators, but in that case the answer seems good enough + nothing crashes
8490
{`title:()`, `false`},
85-
{`() a`, `((false OR false) OR ("title" = 'a' OR "text" = 'a'))`}, // a bit weird, but 'false OR false' is OK as I think nothing should match '()'
91+
{`() a`, `((false) OR ("title" = 'a' OR "text" = 'a'))`}, // a bit weird, but '(false)' is OK as I think nothing should match '()'
8692
}
8793

8894
currentSchema := schema.Schema{

quesma/queryparser/lucene/token.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ type andToken struct{}
2222

2323
type notToken struct{}
2424

25+
type existsToken struct{}
26+
2527
type leftParenthesisToken struct{}
2628

2729
type rightParenthesisToken struct{}

0 commit comments

Comments
 (0)