Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.

Commit 4438739

Browse files
rhrazdiltrzysiek
and authored
Fuzzy search support (#1509)
[Why] We're in the process of migrating data from elasticsearch to clickhouse. We have an alerting service reading the data from elasticsearch (https://github.com/jertel/elastalert2), with no clear substitution offering feature parity for Clickhouse. This is where quesma seems like a perfect fit. One of our alerts relies on fuzzy search (`~`), so I decided to try implementing it. I'd be happy for any feedback. This PR introduces an implementation of elasticsearch fuzzy search for the clickhouse database. I decided to use the `damerauLevenshteinDistance` clickhouse function, as its functionality is the closest to the elasticsearch `~` operator, based on my direct comparison. I did a basic E2E test of the implemented functionality in my local setup with clickhouse-server 24.5. --------- Signed-off-by: Radim Hrazdil <[email protected]> Co-authored-by: Krzysztof Kiewicz <[email protected]>
1 parent cd6fe36 commit 4438739

File tree

6 files changed

+185
-23
lines changed

6 files changed

+185
-23
lines changed

platform/config/config_v2.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@ package config
55
import (
66
"errors"
77
"fmt"
8-
"github.com/QuesmaOrg/quesma/platform/util"
9-
"github.com/hashicorp/go-multierror"
10-
"github.com/knadh/koanf/parsers/json"
11-
"github.com/knadh/koanf/v2"
12-
"github.com/rs/zerolog"
138
"log"
149
"reflect"
1510
"regexp"
1611
"slices"
1712
"strings"
13+
14+
"github.com/QuesmaOrg/quesma/platform/util"
15+
"github.com/hashicorp/go-multierror"
16+
"github.com/knadh/koanf/parsers/json"
17+
"github.com/knadh/koanf/v2"
18+
"github.com/rs/zerolog"
1819
)
1920

2021
var DefaultLogLevel = zerolog.InfoLevel

platform/parsers/elastic_query_dsl/lucene/expression.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ func (p *luceneParser) buildWhereStatement(addDefaultOperator bool) model.Expr {
7676
)
7777
case termToken:
7878
currentStatement = newLeafStatement(p.defaultFieldNames, newTermValue(currentToken.term))
79+
case fuzzyToken:
80+
currentStatement = newLeafStatement(p.defaultFieldNames, newFuzzyValue(currentToken.term, currentToken.distance))
7981
case andToken:
8082
return model.NewInfixExpr(p.WhereStatement, "AND", p.buildWhereStatement(false))
8183
case orToken:

platform/parsers/elastic_query_dsl/lucene/lucene_parser.go

Lines changed: 92 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ package lucene
44

55
import (
66
"context"
7-
"github.com/QuesmaOrg/quesma/platform/logger"
8-
"github.com/QuesmaOrg/quesma/platform/model"
9-
"github.com/QuesmaOrg/quesma/platform/schema"
107
"math"
118
"slices"
129
"strconv"
1310
"strings"
1411
"unicode"
12+
13+
"github.com/QuesmaOrg/quesma/platform/logger"
14+
"github.com/QuesmaOrg/quesma/platform/model"
15+
"github.com/QuesmaOrg/quesma/platform/schema"
1516
)
1617

1718
// Mainly based on this doc: https://lucene.apache.org/core/2_9_4/queryparsersyntax.html
@@ -80,7 +81,6 @@ func TranslateToSQL(ctx context.Context, query string, fields []string, currentS
8081
}
8182

8283
func (p *luceneParser) translateToSQL(query string) model.Expr {
83-
query = p.removeFuzzySearchOperator(query)
8484
query = p.removeBoostingOperator(query)
8585
p.tokenizeQuery(query)
8686
if len(p.tokens) == 1 {
@@ -142,7 +142,31 @@ func (p *luceneParser) parseTerm(query string, closingBoundTerm bool) (token tok
142142
case '"':
143143
for i, r := range query[1:] {
144144
if r == '"' {
145-
return newTermToken(query[:i+2]), query[i+2:]
145+
term := query[:i+2]
146+
remainingQuery = query[i+2:]
147+
// Check for fuzzy operator after quoted term (e.g., "term"~2)
148+
if strings.HasPrefix(remainingQuery, string(fuzzyOperator)) {
149+
// Parse fuzzy operator from remaining query
150+
distanceEnd := 1 // Start after ~
151+
152+
// Find where distance ends (space, delimiter, etc.)
153+
for distanceEnd < len(remainingQuery) {
154+
r := remainingQuery[distanceEnd]
155+
if r == ' ' || r == delimiterCharacter || r == rightParenthesis {
156+
break
157+
}
158+
distanceEnd++
159+
}
160+
161+
distanceStr := remainingQuery[1:distanceEnd] // Skip the ~
162+
distance := p.parseFuzzyDistance(distanceStr)
163+
164+
// Remove quotes from term for fuzzy search
165+
cleanTerm := term[1 : len(term)-1] // Remove quotes
166+
logger.InfoWithCtx(p.ctx).Msgf("Parsed fuzzy term: %s with distance: %d", cleanTerm, distance)
167+
return newFuzzyToken(cleanTerm, distance), remainingQuery[distanceEnd:]
168+
}
169+
return newTermToken(term), remainingQuery
146170
}
147171
}
148172
logger.Error().Msgf("unterminated quoted term, query: %s", query)
@@ -152,11 +176,72 @@ func (p *luceneParser) parseTerm(query string, closingBoundTerm bool) (token tok
152176
default:
153177
for i, r := range query {
154178
if r == ' ' || r == delimiterCharacter || r == rightParenthesis || (closingBoundTerm && (r == exclusiveRangeClosingCharacter || r == inclusiveRangeClosingCharacter)) {
155-
return newTermToken(query[:i]), query[i:]
179+
term := query[:i]
180+
remainingQuery = query[i:]
181+
// Check for fuzzy operator
182+
if fuzzyTok, remaining := p.parseFuzzyIfPresent(term, remainingQuery); fuzzyTok != nil {
183+
return fuzzyTok, remaining
184+
}
185+
return newTermToken(term), remainingQuery
156186
}
157187
}
158-
return newTermToken(query), ""
188+
// End of query reached
189+
term := query
190+
remainingQuery = ""
191+
// Check for fuzzy operator
192+
if fuzzyTok, remaining := p.parseFuzzyIfPresent(term, remainingQuery); fuzzyTok != nil {
193+
return fuzzyTok, remaining
194+
}
195+
return newTermToken(term), remainingQuery
196+
}
197+
}
198+
199+
// parseFuzzyIfPresent checks if the term contains fuzzy operator and parses it
200+
// Returns fuzzy token if fuzzy operator found, nil otherwise
201+
func (p *luceneParser) parseFuzzyIfPresent(term string, remainingQuery string) (token, string) {
202+
// Check if term contains fuzzy operator
203+
fuzzyIndex := strings.LastIndex(term, string(fuzzyOperator))
204+
if fuzzyIndex == -1 {
205+
return nil, remainingQuery
206+
}
207+
208+
// Check if it's escaped
209+
if fuzzyIndex > 0 && term[fuzzyIndex-1] == escapeCharacter {
210+
return nil, remainingQuery
211+
}
212+
213+
// Extract the base term (before ~)
214+
baseTerm := term[:fuzzyIndex]
215+
if baseTerm == "" {
216+
return nil, remainingQuery
217+
}
218+
219+
// Extract distance (after ~)
220+
distanceStr := term[fuzzyIndex+1:]
221+
distance := p.parseFuzzyDistance(distanceStr)
222+
223+
return newFuzzyToken(baseTerm, distance), remainingQuery
224+
}
225+
226+
// parseFuzzyDistance converts a distance string to an integer for fuzzy search.
227+
// Returns 2 as default if distanceStr is empty or invalid.
228+
// For fractional values like 0.8, returns 1 as minimum distance.
229+
func (p *luceneParser) parseFuzzyDistance(distanceStr string) int {
230+
if distanceStr == "" {
231+
return 2 // default fuzzy distance
232+
}
233+
234+
if parsedFloat, err := strconv.ParseFloat(distanceStr, 64); err == nil && parsedFloat >= 0 {
235+
// Convert float to int - Elasticsearch typically uses this for edit distance
236+
// For fractional values like 0.8, we'll use 1 as minimum distance
237+
if parsedFloat < 1.0 && parsedFloat > 0 {
238+
return 1
239+
} else {
240+
return int(parsedFloat)
241+
}
159242
}
243+
244+
return 2 // default if parsing fails
160245
}
161246

162247
func (p *luceneParser) parseRange(query string) (token token, remainingQuery string) {
@@ -279,10 +364,6 @@ func (p *luceneParser) parseOneBound(query string, closingBound bool) (bound any
279364
}
280365
}
281366

282-
func (p *luceneParser) removeFuzzySearchOperator(query string) string {
283-
return p.removeSpecialCharacter(query, fuzzyOperator)
284-
}
285-
286367
func (p *luceneParser) removeBoostingOperator(query string) string {
287368
return p.removeSpecialCharacter(query, boostingOperator)
288369
}

platform/parsers/elastic_query_dsl/lucene/lucene_parser_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@ package lucene
44

55
import (
66
"context"
7+
"testing"
8+
79
"github.com/QuesmaOrg/quesma/platform/model"
810
"github.com/QuesmaOrg/quesma/platform/schema"
911
"github.com/QuesmaOrg/quesma/platform/util"
10-
"testing"
1112
)
1213

1314
func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
@@ -19,11 +20,13 @@ func TestTranslatingLuceneQueriesToSQL(t *testing.T) {
1920
}{
2021
{`title:"The Right Way" AND text:go!!`, `("title" __quesma_match 'The Right Way' AND "text" __quesma_match 'go!!')`},
2122
{`title:Do it right AND right`, `((("title" __quesma_match 'Do' OR ("title" __quesma_match 'it' OR "text" __quesma_match 'it')) OR ("title" __quesma_match 'right' OR "text" __quesma_match 'right')) AND ("title" __quesma_match 'right' OR "text" __quesma_match 'right'))`},
22-
{`roam~`, `("title" __quesma_match 'roam' OR "text" __quesma_match 'roam')`},
23-
{`roam~0.8`, `("title" __quesma_match 'roam' OR "text" __quesma_match 'roam')`},
23+
{`roam~`, `(damerauLevenshteinDistance("title",'roam') <= 2 OR damerauLevenshteinDistance("text",'roam') <= 2)`},
24+
{`query: roam~323`, `damerauLevenshteinDistance("query",'roam') <= 323`},
25+
{`roam~0.8`, `(damerauLevenshteinDistance("title",'roam') <= 1 OR damerauLevenshteinDistance("text",'roam') <= 1)`},
26+
{`query:google.cmo~1`, `damerauLevenshteinDistance("query",'google.cmo') <= 1`},
2427
{`jakarta^4 apache`, `(("title" __quesma_match 'jakarta' OR "text" __quesma_match 'jakarta') OR ("title" __quesma_match 'apache' OR "text" __quesma_match 'apache'))`},
2528
{`"jakarta apache"^10`, `("title" __quesma_match 'jakarta apache' OR "text" __quesma_match 'jakarta apache')`},
26-
{`"jakarta apache"~10`, `("title" __quesma_match 'jakarta apache' OR "text" __quesma_match 'jakarta apache')`},
29+
{`"jakarta apache"~10`, `(damerauLevenshteinDistance("title",'jakarta apache') <= 10 OR damerauLevenshteinDistance("text",'jakarta apache') <= 10)`},
2730
{`mod_date:[2002-01-01 TO 2003-02-15]`, `("mod_date" >= '2002-01-01' AND "mod_date" <= '2003-02-15')`}, // 7
2831
{`mod_date:[2002-01-01 TO 2003-02-15}`, `("mod_date" >= '2002-01-01' AND "mod_date" < '2003-02-15')`},
2932
{`age:>10`, `"age" > '10'`},

platform/parsers/elastic_query_dsl/lucene/token.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,15 @@ type termToken struct {
5151
func newTermToken(term string) termToken {
5252
return termToken{term}
5353
}
54+
55+
type fuzzyToken struct {
56+
term string
57+
distance int
58+
}
59+
60+
func newFuzzyToken(term string, distance int) fuzzyToken {
61+
if distance <= 0 {
62+
distance = 2 // default fuzzy distance
63+
}
64+
return fuzzyToken{term: term, distance: distance}
65+
}

platform/parsers/elastic_query_dsl/lucene/value.go

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@ package lucene
44

55
import (
66
"fmt"
7-
"github.com/QuesmaOrg/quesma/platform/logger"
8-
"github.com/QuesmaOrg/quesma/platform/model"
9-
"github.com/QuesmaOrg/quesma/platform/util"
107
"slices"
118
"strconv"
129
"strings"
10+
11+
"github.com/QuesmaOrg/quesma/platform/logger"
12+
"github.com/QuesmaOrg/quesma/platform/model"
13+
"github.com/QuesmaOrg/quesma/platform/util"
1314
)
1415

1516
// value is a part of an expression, representing what we query for (expression without fields for which we query).
@@ -38,6 +39,15 @@ func newTermValue(term string) termValue {
3839
return termValue{term: term}
3940
}
4041

42+
type fuzzyValue struct {
43+
term string
44+
distance int
45+
}
46+
47+
func newFuzzyValue(term string, distance int) fuzzyValue {
48+
return fuzzyValue{term: term, distance: distance}
49+
}
50+
4151
func (v termValue) toExpression(fieldName string) model.Expr {
4252
termAsStringToClickhouse := v.transformSpecialCharacters()
4353

@@ -91,6 +101,57 @@ func (v termValue) transformSpecialCharacters() (termFinal string) {
91101
return returnTerm.String()
92102
}
93103

104+
func (v fuzzyValue) toExpression(fieldName string) model.Expr {
105+
// Clean the term like we do for regular terms
106+
termAsStringToClickhouse := v.transformSpecialCharacters()
107+
108+
if alreadyQuoted(v.term) {
109+
termAsStringToClickhouse = termAsStringToClickhouse[1 : len(termAsStringToClickhouse)-1]
110+
}
111+
if !util.IsSingleQuoted(termAsStringToClickhouse) {
112+
termAsStringToClickhouse = util.SingleQuote(termAsStringToClickhouse)
113+
}
114+
115+
// Use ClickHouse's damerauLevenshteinDistance function
116+
// Syntax: damerauLevenshteinDistance(field, search_term) <= distance
117+
fieldRef := model.NewColumnRef(fieldName)
118+
searchTerm := model.NewLiteralWithEscapeType(termAsStringToClickhouse, model.FullyEscaped)
119+
distanceLiteral := model.NewLiteral(strconv.Itoa(v.distance))
120+
121+
fuzzyFunc := model.NewFunction("damerauLevenshteinDistance", fieldRef, searchTerm)
122+
123+
return model.NewInfixExpr(fuzzyFunc, " <= ", distanceLiteral)
124+
}
125+
126+
// transformSpecialCharacters for fuzzy values - similar to termValue but for fuzzy terms
127+
func (v fuzzyValue) transformSpecialCharacters() (termFinal string) {
128+
strAsRunes := []rune(v.term)
129+
var returnTerm strings.Builder
130+
for i := 0; i < len(strAsRunes); i++ {
131+
curRune := strAsRunes[i]
132+
transformed, isTransformed := charTransformations[curRune]
133+
if isTransformed {
134+
returnTerm.WriteString(transformed)
135+
continue
136+
}
137+
138+
if i == len(strAsRunes)-1 {
139+
returnTerm.WriteRune(curRune)
140+
continue
141+
}
142+
143+
nextRune := strAsRunes[i+1]
144+
if curRune == escapeCharacter && slices.Contains(specialCharacters, nextRune) {
145+
// it's escaped, so we write nextRune instead of the original curRune
146+
returnTerm.WriteRune(nextRune)
147+
i++
148+
} else {
149+
returnTerm.WriteRune(curRune)
150+
}
151+
}
152+
return returnTerm.String()
153+
}
154+
94155
type rangeValue struct {
95156
lowerBound any // unbounded (nil) means no lower bound
96157
upperBound any // unbounded (nil) means no upper bound
@@ -282,6 +343,8 @@ func (p *luceneParser) buildValue(stack []value, parenthesisLevel int) value {
282343
stack = append(stack, newNotValue(p.buildValue([]value{}, 0)))
283344
case termToken:
284345
stack = append(stack, newTermValue(currentToken.term))
346+
case fuzzyToken:
347+
stack = append(stack, newFuzzyValue(currentToken.term, currentToken.distance))
285348
case rangeToken:
286349
stack = append(stack, currentToken.rangeValue)
287350
default:

0 commit comments

Comments
 (0)