Lucene enhancements (#921)

trzysiek · web-flow · commit ca8ec4aabfef · 2024-10-28T21:28:14.000Z
This Lucene parser worked fine for a subset of Lucene syntax. I have now
had to extend that subset a bit (the `!` operator, `_exists_:`, and
parenthesized sub-expressions), and I'm afraid some unusual (and usually
invalid) queries might not be handled 100% correctly, although I'd have
to think for a while to come up with an example.

But all tests and the customer's dashboards work fine, so I'd merge it.
I have an idea for how to support full Lucene syntax, and I'm about 90%
sure it would work. I'll try it very soon.

I tried 2 other open-source Lucene parsers, and they both fail on ~20-30
of our 60 tests, so if ours passes all of them, it's more likely fine
than not.
diff --git a/quesma/queryparser/lucene/expression.go b/quesma/queryparser/lucene/expression.go
@@ -5,24 +5,25 @@ package lucene
 import (
 	"quesma/logger"
 	"quesma/model"
-	//wc "quesma/queryparser/where_clause"
 )
 
+var invalidStatement = model.NewLiteral("false")
+
 func (p *luceneParser) BuildWhereStatement() model.Expr {
 	for len(p.tokens) > 0 {
 		p.WhereStatement = p.buildWhereStatement(true)
 	}
 	if p.WhereStatement == nil {
-		p.WhereStatement = model.NewLiteral("true")
+		return model.NewLiteral("true")
 	}
 	return p.WhereStatement
 }
 
-// LeafStatement is a smallest part of a query that can be translated into SQL,
+// LeafStatement is the smallest part of a query that can be translated into SQL,
 // e.g. "title:abc", or "abc", or "title:(abc OR def)".
 func newLeafStatement(fieldNames []string, value value) model.Expr {
 	if len(fieldNames) == 0 {
-		return model.NewLiteral("false")
+		return invalidStatement
 	}
 
 	var newStatement model.Expr
@@ -38,16 +39,19 @@ func newLeafStatement(fieldNames []string, value value) model.Expr {
 	return newStatement
 }
 
-var invalidStatement = model.NewLiteral("false")
-
 // buildWhereStatement builds a WHERE statement from the tokens.
 // During parsing, we only keep one expression, because we're combining leafExpressions into
 // a tree of expressions. We keep the lastExpression to combine it with the next one.
 // E.g. "title:abc AND text:def" is parsed into andExpression(title:abc, text:def)".
 func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
+	if len(p.tokens) == 0 {
+		return invalidStatement
+	}
+
 	tok := p.tokens[0]
 	p.tokens = p.tokens[1:]
 	var currentStatement model.Expr
+
 	switch currentToken := tok.(type) {
 	case fieldNameToken:
 		if len(p.tokens) <= 1 {
@@ -79,15 +83,29 @@ func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
 	case notToken:
 		latterExp := p.buildWhereStatement(false)
 		currentStatement = model.NewPrefixExpr("NOT", []model.Expr{latterExp})
+	case existsToken:
+		fieldName, ok := p.buildValue([]value{}, 0).(termValue)
+		if !ok {
+			logger.Error().Msgf("buildExpression: invalid expression, unexpected token: %#v, tokens: %v", currentToken, p.tokens)
+			return invalidStatement
+		}
+		currentStatement = model.NewInfixExpr(model.NewColumnRef(fieldName.term), " IS NOT ", model.NewLiteral("NULL"))
 	case leftParenthesisToken:
-		currentStatement = newLeafStatement(p.defaultFieldNames, p.buildValue([]value{}, 1))
+		currentStatement = model.NewParenExpr(p.buildWhereStatement(false))
+	case rightParenthesisToken:
+		if p.WhereStatement == nil {
+			return invalidStatement
+		}
+		return p.WhereStatement
 	default:
 		logger.Error().Msgf("buildExpression: invalid expression, unexpected token: %#v, tokens: %v", currentToken, p.tokens)
 		return invalidStatement
 	}
+
 	if !addDefaultOperator || p.WhereStatement == nil {
 		return currentStatement
 	}
+
 	switch stmt := currentStatement.(type) {
 	case model.PrefixExpr:
 		if stmt.Op == "NOT" {
diff --git a/quesma/queryparser/lucene/lucene_parser.go b/quesma/queryparser/lucene/lucene_parser.go
@@ -24,7 +24,7 @@ import (
 // - escaped " inside quoted fieldnames, so e.g.
 //     * "a\"b" - not supported
 //     * abc"def - supported
-// - +, -, &&, ||, ! operators. But AND, OR, NOT are fully supported and they seem equivalent.
+// - +, -, &&, || operators. But AND, OR, NOT are fully supported and they seem equivalent.
 
 // Date ranges are only in format YYYY-MM-DD, as in docs there are no other examples. That can be changed if needed.
 
@@ -68,6 +68,8 @@ var specialOperators = map[string]token{
 	"AND ":                   andToken{},
 	"OR ":                    orToken{},
 	"NOT ":                   notToken{},
+	"!":                      notToken{},
+	"_exists_:":              existsToken{},
 	string(leftParenthesis):  leftParenthesisToken{},
 	string(rightParenthesis): rightParenthesisToken{},
 }
diff --git a/quesma/queryparser/lucene/lucene_parser_test.go b/quesma/queryparser/lucene/lucene_parser_test.go
@@ -45,15 +45,15 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
 		{`"jakarta apache" AND "Apache Lucene"`, `(("title" = 'jakarta apache' OR "text" = 'jakarta apache') AND ("title" = 'Apache Lucene' OR "text" = 'Apache Lucene'))`},
 		{`NOT status:"jakarta apache"`, `NOT ("status" = 'jakarta apache')`},
 		{`"jakarta apache" NOT "Apache Lucene"`, `(("title" = 'jakarta apache' OR "text" = 'jakarta apache') AND NOT (("title" = 'Apache Lucene' OR "text" = 'Apache Lucene')))`},
-		{`(jakarta OR apache) AND website`, `((("title" = 'jakarta' OR "title" = 'apache') OR ("text" = 'jakarta' OR "text" = 'apache')) AND ("title" = 'website' OR "text" = 'website'))`},
+		{`(jakarta OR apache) AND website`, `(((("title" = 'jakarta' OR "text" = 'jakarta')) OR ("title" = 'apache' OR "text" = 'apache')) AND ("title" = 'website' OR "text" = 'website'))`},
 		{`title:(return "pink panther")`, `("title" = 'return' OR "title" = 'pink panther')`},
 		{`status:(active OR pending) title:(full text search)^2`, `(("status" = 'active' OR "status" = 'pending') OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
 		{`status:(active OR NOT (pending AND in-progress)) title:(full text search)^2`, `(("status" = 'active' OR NOT (("status" = 'pending' AND "status" = 'in-progress'))) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
 		{`status:(NOT active OR NOT (pending AND in-progress)) title:(full text search)^2`, `((NOT ("status" = 'active') OR NOT (("status" = 'pending' AND "status" = 'in-progress'))) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
 		{`status:(active OR (pending AND in-progress)) title:(full text search)^2`, `(("status" = 'active' OR ("status" = 'pending' AND "status" = 'in-progress')) OR (("title" = 'full' OR "title" = 'text') OR "title" = 'search'))`},
 		{`status:((a OR (b AND c)) AND d)`, `(("status" = 'a' OR ("status" = 'b' AND "status" = 'c')) AND "status" = 'd')`},
 		{`title:(return [Aida TO Carmen])`, `("title" = 'return' OR ("title" >= 'Aida' AND "title" <= 'Carmen'))`},
-		{`host.name:(NOT active OR NOT (pending OR in-progress)) (full text search)^2`, `((NOT ("host.name" = 'active') OR NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR ((("title" = 'full' OR "title" = 'text') OR "title" = 'search') OR (("text" = 'full' OR "text" = 'text') OR "text" = 'search')))`},
+		{`host.name:(NOT active OR NOT (pending OR in-progress)) (full text search)^2`, `((((NOT ("host.name" = 'active') OR NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR (("title" = 'full' OR "text" = 'full'))) OR ("title" = 'text' OR "text" = 'text')) OR ("title" = 'search' OR "text" = 'search'))`},
 		{`host.name:(active AND NOT (pending OR in-progress)) hermes nemesis^2`, `((("host.name" = 'active' AND NOT (("host.name" = 'pending' OR "host.name" = 'in-progress'))) OR ("title" = 'hermes' OR "text" = 'hermes')) OR ("title" = 'nemesis' OR "text" = 'nemesis'))`},
 		{`dajhd \(%&RY#WFDG`, `(("title" = 'dajhd' OR "text" = 'dajhd') OR ("title" = '(%&RY#WFDG' OR "text" = '(%&RY#WFDG'))`},
 		// tests for wildcards
@@ -64,6 +64,12 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
 		{`title:abc\*`, `"title" = 'abc*'`},
 		{`title:abc*\*`, `"title" ILIKE 'abc%*'`},
 		{`ab\+c`, `("title" = 'ab+c' OR "text" = 'ab+c')`},
+		{`!db.str:FAIL`, `NOT ("db.str" = 'FAIL')`},
+		{`_exists_:title`, `"title" IS NOT NULL`},
+		{`!_exists_:title`, `NOT ("title" IS NOT NULL)`},
+		{"db.str:*weaver*", `"db.str" ILIKE '%weaver%'`},
+		{"(db.str:*weaver*)", `("db.str" ILIKE '%weaver%')`},
+		{"(a.type:*ab* OR a.type:*Ab*)", `(("a.type" ILIKE '%ab%') OR "a.type" ILIKE '%Ab%')`},
 	}
 	var randomQueriesWithPossiblyIncorrectInput = []struct {
 		query string
@@ -72,7 +78,7 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
 		{``, `true`},
 		{`          `, `true`},
 		{`  2 `, `("title" = '2' OR "text" = '2')`},
-		{`  2df$ ! `, `(("title" = '2df$' OR "text" = '2df$') OR ("title" = '!' OR "text" = '!'))`},
+		{`  2df$ ! `, `(("title" = '2df$' OR "text" = '2df$') AND NOT (false))`}, // TODO: this should probably just be "false"
 		{`title:`, `false`},
 		{`title: abc`, `"title" = 'abc'`},
 		{`title[`, `("title" = 'title[' OR "text" = 'title[')`},
@@ -82,7 +88,7 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
 		{`  title       `, `("title" = 'title' OR "text" = 'title')`},
 		{`  title : (+a -b c)`, `(("title" = '+a' OR "title" = '-b') OR "title" = 'c')`}, // we don't support '+', '-' operators, but in that case the answer seems good enough + nothing crashes
 		{`title:()`, `false`},
-		{`() a`, `((false OR false) OR ("title" = 'a' OR "text" = 'a'))`}, // a bit weird, but 'false OR false' is OK as I think nothing should match '()'
+		{`() a`, `((false) OR ("title" = 'a' OR "text" = 'a'))`}, // a bit weird, but '(false)' is OK as I think nothing should match '()'
 	}
 
 	currentSchema := schema.Schema{
diff --git a/quesma/queryparser/lucene/token.go b/quesma/queryparser/lucene/token.go
@@ -22,6 +22,8 @@ type andToken struct{}
 
 type notToken struct{}
 
+type existsToken struct{}
+
 type leftParenthesisToken struct{}
 
 type rightParenthesisToken struct{}