Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions platform/config/config_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@ package config
import (
"errors"
"fmt"
"github.com/QuesmaOrg/quesma/platform/util"
"github.com/hashicorp/go-multierror"
"github.com/knadh/koanf/parsers/json"
"github.com/knadh/koanf/v2"
"github.com/rs/zerolog"
"log"
"reflect"
"regexp"
"slices"
"strings"

"github.com/QuesmaOrg/quesma/platform/util"
"github.com/hashicorp/go-multierror"
"github.com/knadh/koanf/parsers/json"
"github.com/knadh/koanf/v2"
"github.com/rs/zerolog"
)

var DefaultLogLevel = zerolog.InfoLevel
Expand Down
2 changes: 2 additions & 0 deletions platform/parsers/elastic_query_dsl/lucene/expression.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
)
case termToken:
currentStatement = newLeafStatement(p.defaultFieldNames, newTermValue(currentToken.term))
case fuzzyToken:
currentStatement = newLeafStatement(p.defaultFieldNames, newFuzzyValue(currentToken.term, currentToken.distance))
case andToken:
return model.NewInfixExpr(p.WhereStatement, "AND", p.buildWhereStatement(false))
case orToken:
Expand Down
103 changes: 92 additions & 11 deletions platform/parsers/elastic_query_dsl/lucene/lucene_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ package lucene

import (
"context"
"github.com/QuesmaOrg/quesma/platform/logger"
"github.com/QuesmaOrg/quesma/platform/model"
"github.com/QuesmaOrg/quesma/platform/schema"
"math"
"slices"
"strconv"
"strings"
"unicode"

"github.com/QuesmaOrg/quesma/platform/logger"
"github.com/QuesmaOrg/quesma/platform/model"
"github.com/QuesmaOrg/quesma/platform/schema"
)

// Mainly based on this doc: https://lucene.apache.org/core/2_9_4/queryparsersyntax.html
Expand Down Expand Up @@ -80,7 +81,6 @@ func TranslateToSQL(ctx context.Context, query string, fields []string, currentS
}

func (p *luceneParser) translateToSQL(query string) model.Expr {
query = p.removeFuzzySearchOperator(query)
query = p.removeBoostingOperator(query)
p.tokenizeQuery(query)
if len(p.tokens) == 1 {
Expand Down Expand Up @@ -142,7 +142,31 @@ func (p *luceneParser) parseTerm(query string, closingBoundTerm bool) (token tok
case '"':
for i, r := range query[1:] {
if r == '"' {
return newTermToken(query[:i+2]), query[i+2:]
term := query[:i+2]
remainingQuery = query[i+2:]
// Check for fuzzy operator after quoted term (e.g., "term"~2)
if strings.HasPrefix(remainingQuery, string(fuzzyOperator)) {
// Parse fuzzy operator from remaining query
distanceEnd := 1 // Start after ~

// Find where distance ends (space, delimiter, etc.)
for distanceEnd < len(remainingQuery) {
r := remainingQuery[distanceEnd]
if r == ' ' || r == delimiterCharacter || r == rightParenthesis {
break
}
distanceEnd++
}

distanceStr := remainingQuery[1:distanceEnd] // Skip the ~
distance := p.parseFuzzyDistance(distanceStr)

// Remove quotes from term for fuzzy search
cleanTerm := term[1 : len(term)-1] // Remove quotes
logger.InfoWithCtx(p.ctx).Msgf("Parsed fuzzy term: %s with distance: %d", cleanTerm, distance)
return newFuzzyToken(cleanTerm, distance), remainingQuery[distanceEnd:]
}
return newTermToken(term), remainingQuery
}
}
logger.Error().Msgf("unterminated quoted term, query: %s", query)
Expand All @@ -152,11 +176,72 @@ func (p *luceneParser) parseTerm(query string, closingBoundTerm bool) (token tok
default:
for i, r := range query {
if r == ' ' || r == delimiterCharacter || r == rightParenthesis || (closingBoundTerm && (r == exclusiveRangeClosingCharacter || r == inclusiveRangeClosingCharacter)) {
return newTermToken(query[:i]), query[i:]
term := query[:i]
remainingQuery = query[i:]
// Check for fuzzy operator
if fuzzyTok, remaining := p.parseFuzzyIfPresent(term, remainingQuery); fuzzyTok != nil {
return fuzzyTok, remaining
}
return newTermToken(term), remainingQuery
}
}
return newTermToken(query), ""
// End of query reached
term := query
remainingQuery = ""
// Check for fuzzy operator
if fuzzyTok, remaining := p.parseFuzzyIfPresent(term, remainingQuery); fuzzyTok != nil {
return fuzzyTok, remaining
}
return newTermToken(term), remainingQuery
}
}

// parseFuzzyIfPresent inspects term for a trailing fuzzy operator (~) and,
// when one is present and usable, returns the corresponding fuzzy token.
// It returns a nil token (and the untouched remainingQuery) when term is
// not a fuzzy expression: no operator, an escaped operator, or no base term.
func (p *luceneParser) parseFuzzyIfPresent(term string, remainingQuery string) (token, string) {
	opIdx := strings.LastIndex(term, string(fuzzyOperator))
	switch {
	case opIdx == -1:
		// No fuzzy operator anywhere in the term.
		return nil, remainingQuery
	case opIdx > 0 && term[opIdx-1] == escapeCharacter:
		// Operator is escaped (\~): treat it as a literal character.
		return nil, remainingQuery
	case opIdx == 0:
		// Operator with nothing before it: no base term to fuzzy-match.
		return nil, remainingQuery
	}

	base := term[:opIdx]                                // everything before ~
	distance := p.parseFuzzyDistance(term[opIdx+1:])    // everything after ~
	return newFuzzyToken(base, distance), remainingQuery
}

// parseFuzzyDistance converts the text that follows a fuzzy operator (~)
// into an integer edit distance for fuzzy search.
//
// Rules:
//   - empty string (bare "term~")            -> 2 (default distance)
//   - fractional value in (0, 1), e.g. 0.8   -> 1 (minimum useful distance)
//   - any other non-negative number          -> truncated to int
//   - unparsable or negative input           -> 2 (default distance)
func (p *luceneParser) parseFuzzyDistance(distanceStr string) int {
	const defaultDistance = 2 // Elasticsearch-style default edit distance

	if distanceStr == "" {
		return defaultDistance
	}

	parsed, err := strconv.ParseFloat(distanceStr, 64)
	if err != nil || parsed < 0 {
		return defaultDistance
	}
	if parsed > 0 && parsed < 1.0 {
		// Fractional similarity like 0.8: map to the minimum edit distance.
		return 1
	}
	return int(parsed)
}

func (p *luceneParser) parseRange(query string) (token token, remainingQuery string) {
Expand Down Expand Up @@ -279,10 +364,6 @@ func (p *luceneParser) parseOneBound(query string, closingBound bool) (bound any
}
}

func (p *luceneParser) removeFuzzySearchOperator(query string) string {
return p.removeSpecialCharacter(query, fuzzyOperator)
}

// removeBoostingOperator strips the boosting operator (^) from the query by
// delegating to removeSpecialCharacter. Boost factors presumably affect only
// relevance scoring, which has no counterpart in the SQL translation — TODO confirm.
func (p *luceneParser) removeBoostingOperator(query string) string {
	return p.removeSpecialCharacter(query, boostingOperator)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ package lucene

import (
"context"
"testing"

"github.com/QuesmaOrg/quesma/platform/model"
"github.com/QuesmaOrg/quesma/platform/schema"
"github.com/QuesmaOrg/quesma/platform/util"
"testing"
)

func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
Expand All @@ -19,11 +20,13 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
}{
{`title:"The Right Way" AND text:go!!`, `("title" __quesma_match 'The Right Way' AND "text" __quesma_match 'go!!')`},
{`title:Do it right AND right`, `((("title" __quesma_match 'Do' OR ("title" __quesma_match 'it' OR "text" __quesma_match 'it')) OR ("title" __quesma_match 'right' OR "text" __quesma_match 'right')) AND ("title" __quesma_match 'right' OR "text" __quesma_match 'right'))`},
{`roam~`, `("title" __quesma_match 'roam' OR "text" __quesma_match 'roam')`},
{`roam~0.8`, `("title" __quesma_match 'roam' OR "text" __quesma_match 'roam')`},
{`roam~`, `(damerauLevenshteinDistance("title",'roam') <= 2 OR damerauLevenshteinDistance("text",'roam') <= 2)`},
{`query: roam~323`, `damerauLevenshteinDistance("query",'roam') <= 323`},
{`roam~0.8`, `(damerauLevenshteinDistance("title",'roam') <= 1 OR damerauLevenshteinDistance("text",'roam') <= 1)`},
{`query:google.cmo~1`, `damerauLevenshteinDistance("query",'google.cmo') <= 1`},
{`jakarta^4 apache`, `(("title" __quesma_match 'jakarta' OR "text" __quesma_match 'jakarta') OR ("title" __quesma_match 'apache' OR "text" __quesma_match 'apache'))`},
{`"jakarta apache"^10`, `("title" __quesma_match 'jakarta apache' OR "text" __quesma_match 'jakarta apache')`},
{`"jakarta apache"~10`, `("title" __quesma_match 'jakarta apache' OR "text" __quesma_match 'jakarta apache')`},
{`"jakarta apache"~10`, `(damerauLevenshteinDistance("title",'jakarta apache') <= 10 OR damerauLevenshteinDistance("text",'jakarta apache') <= 10)`},
{`mod_date:[2002-01-01 TO 2003-02-15]`, `("mod_date" >= '2002-01-01' AND "mod_date" <= '2003-02-15')`}, // 7
{`mod_date:[2002-01-01 TO 2003-02-15}`, `("mod_date" >= '2002-01-01' AND "mod_date" < '2003-02-15')`},
{`age:>10`, `"age" > '10'`},
Expand Down
12 changes: 12 additions & 0 deletions platform/parsers/elastic_query_dsl/lucene/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,15 @@ type termToken struct {
// newTermToken wraps a raw term string in a termToken.
func newTermToken(term string) termToken {
	return termToken{term: term}
}

// fuzzyToken represents a parsed fuzzy-search expression (e.g. roam~2):
// the term to match together with the maximum allowed edit distance.
type fuzzyToken struct {
	term     string
	distance int
}

// newFuzzyToken builds a fuzzyToken, substituting the default edit distance
// of 2 whenever the supplied distance is not positive.
func newFuzzyToken(term string, distance int) fuzzyToken {
	if distance > 0 {
		return fuzzyToken{term: term, distance: distance}
	}
	return fuzzyToken{term: term, distance: 2}
}
69 changes: 66 additions & 3 deletions platform/parsers/elastic_query_dsl/lucene/value.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ package lucene

import (
"fmt"
"github.com/QuesmaOrg/quesma/platform/logger"
"github.com/QuesmaOrg/quesma/platform/model"
"github.com/QuesmaOrg/quesma/platform/util"
"slices"
"strconv"
"strings"

"github.com/QuesmaOrg/quesma/platform/logger"
"github.com/QuesmaOrg/quesma/platform/model"
"github.com/QuesmaOrg/quesma/platform/util"
)

// value is a part of an expression, representing what we query for (expression without fields for which we query).
Expand Down Expand Up @@ -38,6 +39,15 @@ func newTermValue(term string) termValue {
return termValue{term: term}
}

// fuzzyValue is the value-layer counterpart of fuzzyToken: a search term
// plus the maximum edit distance used when rendering the SQL predicate.
type fuzzyValue struct {
	term     string
	distance int
}

// newFuzzyValue wraps term and distance into a fuzzyValue. The distance is
// stored as-is; validation/defaulting happens earlier, in newFuzzyToken.
func newFuzzyValue(term string, distance int) fuzzyValue {
	var v fuzzyValue
	v.term = term
	v.distance = distance
	return v
}

func (v termValue) toExpression(fieldName string) model.Expr {
termAsStringToClickhouse := v.transformSpecialCharacters()

Expand Down Expand Up @@ -91,6 +101,57 @@ func (v termValue) transformSpecialCharacters() (termFinal string) {
return returnTerm.String()
}

// toExpression renders the fuzzy value as a ClickHouse predicate of the form
//   damerauLevenshteinDistance("field", 'term') <= distance
// so a row matches when the field value is within the allowed edit distance
// of the search term.
func (v fuzzyValue) toExpression(fieldName string) model.Expr {
	// Apply the same special-character cleanup that regular terms get.
	termAsStringToClickhouse := v.transformSpecialCharacters()

	// NOTE(review): the parser strips double quotes from quoted fuzzy terms
	// before building the token, so this branch may be unreachable — confirm.
	// It also slices the transformed string by the original term's quoting,
	// which assumes the transformation kept the quote characters in place.
	if alreadyQuoted(v.term) {
		termAsStringToClickhouse = termAsStringToClickhouse[1 : len(termAsStringToClickhouse)-1]
	}
	// Ensure the term ends up single-quoted exactly once for SQL.
	if !util.IsSingleQuoted(termAsStringToClickhouse) {
		termAsStringToClickhouse = util.SingleQuote(termAsStringToClickhouse)
	}

	// Build: damerauLevenshteinDistance(field, search_term) <= distance
	fieldRef := model.NewColumnRef(fieldName)
	searchTerm := model.NewLiteralWithEscapeType(termAsStringToClickhouse, model.FullyEscaped)
	distanceLiteral := model.NewLiteral(strconv.Itoa(v.distance))

	fuzzyFunc := model.NewFunction("damerauLevenshteinDistance", fieldRef, searchTerm)

	return model.NewInfixExpr(fuzzyFunc, " <= ", distanceLiteral)
}

// transformSpecialCharacters rewrites the fuzzy term for ClickHouse: runes
// found in charTransformations are replaced by their mapped string, and an
// escape character followed by a special character collapses to just that
// special character. Every other rune passes through unchanged.
// NOTE(review): this mirrors termValue's transformation for regular terms.
func (v fuzzyValue) transformSpecialCharacters() string {
	runes := []rune(v.term)
	var out strings.Builder

	idx := 0
	for idx < len(runes) {
		current := runes[idx]

		// Mapped characters are translated first, before escape handling.
		if replacement, found := charTransformations[current]; found {
			out.WriteString(replacement)
			idx++
			continue
		}

		// An escape character directly before a special character swallows
		// itself and emits the special character verbatim.
		if idx+1 < len(runes) && current == escapeCharacter && slices.Contains(specialCharacters, runes[idx+1]) {
			out.WriteRune(runes[idx+1])
			idx += 2
			continue
		}

		out.WriteRune(current)
		idx++
	}
	return out.String()
}

type rangeValue struct {
lowerBound any // unbounded (nil) means no lower bound
upperBound any // unbounded (nil) means no upper bound
Expand Down Expand Up @@ -282,6 +343,8 @@ func (p *luceneParser) buildValue(stack []value, parenthesisLevel int) value {
stack = append(stack, newNotValue(p.buildValue([]value{}, 0)))
case termToken:
stack = append(stack, newTermValue(currentToken.term))
case fuzzyToken:
stack = append(stack, newFuzzyValue(currentToken.term, currentToken.distance))
case rangeToken:
stack = append(stack, currentToken.rangeValue)
default:
Expand Down
Loading