
Commit 867d2fe

Fix Lucene query behavior (match operator) (#1426)
Lucene syntax is not expected to behave the way Quesma handles it. With default text fields, a query like `myText: getAcc` is supposed to perform a case-insensitive, exact match. This became apparent when playing with Grafana

![Grafana screenshot](https://github.com/user-attachments/assets/ce108b02-d8a1-494a-8d44-167ad8c4c492)

but it also applies to Kibana when switching to the Lucene query language.

![Kibana screenshot](https://github.com/user-attachments/assets/db374571-8552-4e2c-88f9-a1cec0342136)

We still do not fully implement Lucene syntax translation, but this definitely brings us closer by at least matching the default behavior.

## Useful notes for reproducing this situation 👇

<details>

```bash
## Create two indices to have comparison
curl -X PUT "http://localhost:8080/es" -H 'Content-Type: application/json' -d '
{
  "mappings": {
    "properties": {
      "myText": { "type": "text" },
      "@timestamp": { "type": "date" }
    }
  }
}'

curl -X PUT "http://localhost:8080/ch" -H 'Content-Type: application/json' -d '
{
  "mappings": {
    "properties": {
      "myText": { "type": "text" }
    }
  }
}'

## Example dataset, save into `data.ndjson`
{"index": {"_index": "es"}}
{"myText": "getAccomondationStatus", "@timestamp": "2025-05-13T10:00:00Z"}
{"index": {"_index": "es"}}
{"myText": "getAccomodationOffer", "@timestamp": "2025-05-13T10:01:00Z"}
{"index": {"_index": "es"}}
{"myText": "getAccomodation", "@timestamp": "2025-05-13T10:02:00Z"}
{"index": {"_index": "ch"}}
{"myText": "getAccomondationStatus", "@timestamp": "2025-05-13T10:00:00Z"}
{"index": {"_index": "ch"}}
{"myText": "getAccomodationOffer", "@timestamp": "2025-05-13T10:01:00Z"}
{"index": {"_index": "ch"}}
{"myText": "getAccomodation", "@timestamp": "2025-05-13T10:02:00Z"}
```

## Load data

```bash
curl -X POST "http://localhost:8080/_bulk" \
  -H "Content-Type: application/x-ndjson" \
  --data-binary "@data.ndjson"
```

</details>

Fixes: #1425
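To make the behavioral change concrete, here is a minimal, self-contained Go sketch (not Quesma code; the helper name `toILIKEPattern` is made up for illustration) contrasting the previous substring translation with the exact-match translation this commit introduces:

```go
package main

import "fmt"

// toILIKEPattern is a hypothetical helper illustrating the change: previously a
// Lucene term such as `getAcc` was wrapped in '%' wildcards (substring match);
// now the term is passed through unchanged, so ILIKE performs a case-insensitive
// exact match unless the query itself contains wildcards.
func toILIKEPattern(term string, wrapInPercents bool) string {
	if wrapInPercents {
		return "%" + term + "%" // old behavior: also matches getAccomodation, getAccomodationOffer, ...
	}
	return term // new behavior: matches only values equal to getAcc (case-insensitively)
}

func main() {
	fmt.Printf("old: \"myText\" ILIKE '%s'\n", toILIKEPattern("getAcc", true))
	fmt.Printf("new: \"myText\" ILIKE '%s'\n", toILIKEPattern("getAcc", false))
}
```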
1 parent c6db0b3 commit 867d2fe

File tree

8 files changed: +92 -93 lines changed


platform/frontend_connectors/schema_transformer.go

Lines changed: 4 additions & 2 deletions
@@ -1136,10 +1136,12 @@ func (s *SchemaCheckPass) applyMatchOperator(indexSchema schema.Schema, query *m
 	case schema.QuesmaTypeKeyword.Name:
 		return equal()
 	default:
-		// ILIKE '%%' has terrible performance, but semantically means "is not null", hence this transformation
-		if rhsValue == "%%" {
+		if rhsValue == "%%" { // ILIKE '%%' has terrible performance, but semantically means "is not null", hence this transformation
 			return model.NewInfixExpr(lhs, "IS", model.NewLiteral("NOT NULL"))
 		}
+		// we might investigate the potential performance gain of checking
+		// that if rhsValue doesn't contain '%' we could use '=' instead of 'ILIKE'
+		// *however* that'd require few tweaks in the parser
 		return ilike()
 	}
 }
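For context, the default branch above effectively maps a match on a text-typed field to either an `IS NOT NULL` check or an `ILIKE`. Below is a minimal sketch of that mapping, rendered with plain strings rather than Quesma's `model` expression tree, so the helper and its output are illustrative assumptions:

```go
package main

import "fmt"

// matchToSQL sketches the transformation performed by the default branch:
// a match against "%%" is semantically "is not null" (and ILIKE '%%' is slow),
// everything else stays a case-insensitive ILIKE. Illustration only; the real
// code returns model.Expr nodes instead of SQL strings.
func matchToSQL(lhs, rhsValue string) string {
	if rhsValue == "%%" {
		return fmt.Sprintf("%q IS NOT NULL", lhs)
	}
	return fmt.Sprintf("%q ILIKE '%s'", lhs, rhsValue)
}

func main() {
	fmt.Println(matchToSQL("myText", "%%"))     // "myText" IS NOT NULL
	fmt.Println(matchToSQL("myText", "getAcc")) // "myText" ILIKE 'getAcc'
}
```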

platform/parsers/elastic_query_dsl/lucene/lucene_parser_test.go

Lines changed: 47 additions & 47 deletions
Large diffs are not rendered by default.

platform/parsers/elastic_query_dsl/lucene/value.go

Lines changed: 0 additions & 3 deletions
@@ -49,9 +49,6 @@ func (v termValue) toExpression(fieldName string) model.Expr {
 	if alreadyQuoted(v.term) {
 		termAsStringToClickhouse = termAsStringToClickhouse[1 : len(termAsStringToClickhouse)-1]
 	}
-	if !util.IsSurroundedWithPercents(termAsStringToClickhouse) {
-		termAsStringToClickhouse = util.SurroundWithPercents(termAsStringToClickhouse)
-	}
 	if !util.IsSingleQuoted(termAsStringToClickhouse) {
 		termAsStringToClickhouse = util.SingleQuote(termAsStringToClickhouse)
 	}
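The net effect of the deletion above is that a Lucene term is only unquoted and single-quoted, never implicitly wrapped in '%'. A rough standalone approximation of the resulting behavior follows; the `util` helpers are approximated with `strings` operations, and the exact quoting rules are assumptions:

```go
package main

import (
	"fmt"
	"strings"
)

// termToClickhouseLiteral approximates what toExpression now does to the term:
// strip surrounding quotes if the term was already quoted, then single-quote it.
// Crucially, it no longer surrounds the term with '%' wildcards.
func termToClickhouseLiteral(term string) string {
	if len(term) >= 2 && strings.HasPrefix(term, `"`) && strings.HasSuffix(term, `"`) {
		term = term[1 : len(term)-1] // assumed meaning of alreadyQuoted plus the slice above
	}
	if !(strings.HasPrefix(term, "'") && strings.HasSuffix(term, "'")) {
		term = "'" + term + "'" // assumed behavior of util.IsSingleQuoted / util.SingleQuote
	}
	return term
}

func main() {
	fmt.Println(termToClickhouseLiteral(`getAcc`))   // 'getAcc' (previously '%getAcc%')
	fmt.Println(termToClickhouseLiteral(`"getAcc"`)) // 'getAcc'
}
```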

platform/testdata/clients/clover.go

Lines changed: 2 additions & 2 deletions
@@ -347,7 +347,7 @@ var CloverTests = []testdata.AggregationTestCase{
 ExpectedPancakeSQL: `
 SELECT count(*) AS "aggr__timeseries__count",
 countIf(true) AS "metric__timeseries__a2-denominator_col_0",
-countIf(NOT ("table.flower" __quesma_match '%clover%')) AS
+countIf(NOT ("table.flower" __quesma_match 'clover')) AS
 "metric__timeseries__a2-numerator_col_0"
 FROM __quesma_table_name
 WHERE ("@timestamp">=fromUnixTimestamp64Milli(1728640683723) AND "@timestamp"<=
@@ -1115,7 +1115,7 @@ var CloverTests = []testdata.AggregationTestCase{
 "aggr__q__time_buckets__key_0", count(*) AS "aggr__q__time_buckets__count",
 sumOrNull("count") AS "metric__q__time_buckets__sum(count)_col_0"
 FROM __quesma_table_name
-WHERE NOT ("str_field" __quesma_match '%CRASH%')
+WHERE NOT ("str_field" __quesma_match 'CRASH')
 GROUP BY toInt64((toUnixTimestamp64Milli("@timestamp")+timeZoneOffset(toTimezone
 ("@timestamp", 'Europe/Warsaw'))*1000) / 1800000) AS
 "aggr__q__time_buckets__key_0"

platform/testdata/kibana_sample_data_ecommerce.go

Lines changed: 4 additions & 4 deletions
@@ -858,11 +858,11 @@ var KibanaSampleDataEcommerce = []AggregationTestCase{
 SELECT toInt64((toUnixTimestamp64Milli("order_date")+timeZoneOffset(toTimezone(
 "order_date", 'Europe/Warsaw'))*1000) / 43200000) AS "aggr__0__key_0",
 count(*) AS "aggr__0__count",
-countIf(("products.product_name" __quesma_match '%%cocktail%' OR
-"__quesma_fulltext_field_name" __quesma_match '%dress%%')) AS
+countIf(("products.product_name" __quesma_match '%cocktail' OR
+"__quesma_fulltext_field_name" __quesma_match 'dress%')) AS
 "aggr__0__1-bucket__count",
-sumOrNullIf("taxful_total_price", ("products.product_name" __quesma_match '%%cocktail%'
-OR "__quesma_fulltext_field_name" __quesma_match '%dress%%')) AS
+sumOrNullIf("taxful_total_price", ("products.product_name" __quesma_match '%cocktail'
+OR "__quesma_fulltext_field_name" __quesma_match 'dress%')) AS
 "metric__0__1-bucket__1-metric_col_0"
 FROM __quesma_table_name
 WHERE ("order_date">=fromUnixTimestamp64Milli(1740234098238) AND "order_date"<=

platform/testdata/kibana_sample_data_flights.go

Lines changed: 19 additions & 19 deletions
@@ -800,8 +800,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%' AND
-"Cancelled" __quesma_match '%true%'))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true' AND
+"Cancelled" __quesma_match 'true'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS "aggr__1__2__key_0"
 ORDER BY "aggr__1__2__key_0" ASC`,
@@ -813,8 +813,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%'))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -832,8 +832,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%')))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__4_col_0", "top_metrics__1__2__4_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"
@@ -847,8 +847,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%'))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -866,8 +866,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%')))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__5_col_0", "top_metrics__1__2__5_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"
@@ -881,8 +881,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%'))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -900,8 +900,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%')))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__6_col_0", "top_metrics__1__2__6_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"
@@ -915,8 +915,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%'))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -934,8 +934,8 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match '%true%'
-AND "Cancelled" __quesma_match '%true%')))
+fromUnixTimestamp64Milli(1740835408853)) AND ("FlightDelay" __quesma_match 'true'
+AND "Cancelled" __quesma_match 'true')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__7_col_0", "top_metrics__1__2__7_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"
@@ -1638,7 +1638,7 @@ var KibanaSampleDataFlights = []AggregationTestCase{
 SELECT toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS "aggr__0__key_0",
 count(*) AS "aggr__0__count",
-countIf("FlightDelay" __quesma_match '%true%') AS "metric__0__1-bucket_col_0",
+countIf("FlightDelay" __quesma_match 'true') AS "metric__0__1-bucket_col_0",
 countIf("__quesma_fulltext_field_name" __quesma_match '%') AS "metric__0__2-bucket_col_0"
 FROM __quesma_table_name
 WHERE ("timestamp">=fromUnixTimestamp64Milli(1740230608853) AND "timestamp"<=

platform/testdata/kibana_sample_data_logs.go

Lines changed: 10 additions & 10 deletions
@@ -348,8 +348,8 @@ var KibanaSampleDataLogs = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740178800000) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match '%error%' AND
-"tags" __quesma_match '%security%'))
+fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match 'error' AND
+"tags" __quesma_match 'security'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS "aggr__1__2__key_0"
 ORDER BY "aggr__1__2__key_0" ASC`,
@@ -362,8 +362,8 @@ var KibanaSampleDataLogs = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740178800000) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match '%error%' AND
-"tags" __quesma_match '%security%'))
+fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match 'error' AND
+"tags" __quesma_match 'security'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -381,8 +381,8 @@ var KibanaSampleDataLogs = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740178800000) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match '%error%' AND
-"tags" __quesma_match '%security%')))
+fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match 'error' AND
+"tags" __quesma_match 'security')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__4_col_0", "top_metrics__1__2__4_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"
@@ -396,8 +396,8 @@ var KibanaSampleDataLogs = []AggregationTestCase{
 count(*) AS "aggr__1__2__count"
 FROM __quesma_table_name
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740178800000) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match '%error%' AND
-"tags" __quesma_match '%security%'))
+fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match 'error' AND
+"tags" __quesma_match 'security'))
 GROUP BY toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(
 toTimezone("timestamp", 'Europe/Warsaw'))*1000) / 10800000) AS
 "aggr__1__2__key_0"
@@ -415,8 +415,8 @@ var KibanaSampleDataLogs = []AggregationTestCase{
 toInt64((toUnixTimestamp64Milli("timestamp")+timeZoneOffset(toTimezone(
 "timestamp", 'Europe/Warsaw'))*1000) / 10800000))
 WHERE (("timestamp">=fromUnixTimestamp64Milli(1740178800000) AND "timestamp"<=
-fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match '%error%' AND
-"tags" __quesma_match '%security%')))
+fromUnixTimestamp64Milli(1740831278103)) AND ("tags" __quesma_match 'error' AND
+"tags" __quesma_match 'security')))
 SELECT "aggr__1__count", "aggr__1__2__key_0", "aggr__1__2__count",
 "top_metrics__1__2__5_col_0", "top_metrics__1__2__5_col_1", "top_hits_rank"
 FROM "quesma_top_hits_join"

platform/testdata/requests.go

Lines changed: 6 additions & 6 deletions
@@ -1252,9 +1252,9 @@ var TestsSearch = []SearchTestCase{
 },
 "track_total_hits": false
 }`,
-[]string{`"exception-list-agnostic.list_id" __quesma_match '%endpoint\_event\_filters%'`},
+[]string{`"exception-list-agnostic.list_id" __quesma_match 'endpoint\_event\_filters'`},
 model.ListAllFields,
-[]string{`SELECT "message" FROM ` + TableName + ` WHERE "exception-list-agnostic.list_id"='%endpoint\\_event\\_filters%'`},
+[]string{`SELECT "message" FROM ` + TableName + ` WHERE "exception-list-agnostic.list_id"='endpoint\\_event\\_filters'`},
 []string{},
 },
 { // [10]
@@ -1279,9 +1279,9 @@ var TestsSearch = []SearchTestCase{
 },
 "track_total_hits": false
 }`,
-[]string{fullTextFieldName + ` __quesma_match '%ingest-agent-policies%'`},
+[]string{fullTextFieldName + ` __quesma_match 'ingest-agent-policies'`},
 model.ListAllFields,
-[]string{`SELECT "message" FROM ` + TableName + ` WHERE ` + fullTextFieldName + ` ILIKE '%ingest-agent-policies%'`},
+[]string{`SELECT "message" FROM ` + TableName + ` WHERE ` + fullTextFieldName + ` ILIKE 'ingest-agent-policies'`},
 []string{},
 },
 { // [11]
@@ -1377,9 +1377,9 @@ var TestsSearch = []SearchTestCase{
 "track_total_hits": false,
 "size": 1
 }`,
-[]string{`"message" __quesma_match '%% logged%'`},
+[]string{`"message" __quesma_match '% logged'`},
 model.ListAllFields,
-[]string{`SELECT "message" FROM ` + TableName + ` WHERE "message" ILIKE '%% logged%'`},
+[]string{`SELECT "message" FROM ` + TableName + ` WHERE "message" ILIKE '% logged'`},
 []string{},
 },
 { // [16]

0 commit comments
