Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.

Commit 871ba58

Browse files
authored
Introduce unique ids (#1435)
An attempt to introduce unique object IDs in the ClickHouse realm. A returned document id (`_id` field) would have the following syntax: ``` {hex-encoded timestamp field}qqq{hex-encoded hash of the document} ``` Of course, the `{hex-encoded hash of the document}` lives only in Quesma memory: 1. It needs to be re-calculated when returning search hits 2. Quesma needs to filter hits based on it when rendering them Therefore, when fetching a document with a specific `_id`, we can first select the documents with a matching timestamp and then keep only the one whose source hash matches, returning the right entry. This fixes the issue where the JSON view of a single document could return a random object from the search hits, not necessarily the one clicked. <img width="1458" alt="image" src="https://github.com/user-attachments/assets/0b808dba-0255-4be0-ac5f-acb7a21569ba" /> **However** (and it's a pretty big however), the "surrounding document" view **cannot return the surrounding documents**. While the query doesn't error, it also doesn't return any documents. The experiment to get that working [is carried out in a separate PR](#1446), although it's not clear yet whether this way of doing things is going to guarantee 100% correctness.
1 parent 57e07b7 commit 871ba58

File tree

4 files changed

+109
-24
lines changed

4 files changed

+109
-24
lines changed

platform/model/typical_queries/hits.go

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ package typical_queries
44

55
import (
66
"context"
7+
"crypto/sha256"
8+
"encoding/hex"
9+
"encoding/json"
710
"fmt"
811
"github.com/QuesmaOrg/quesma/platform/clickhouse"
912
"github.com/QuesmaOrg/quesma/platform/common_table"
@@ -12,6 +15,7 @@ import (
1215
"github.com/QuesmaOrg/quesma/platform/model"
1316
"github.com/QuesmaOrg/quesma/platform/util"
1417
"reflect"
18+
"sort"
1519
"strconv"
1620
"strings"
1721
"time"
@@ -239,9 +243,9 @@ func (query Hits) computeIdForDocument(doc model.SearchHit, defaultID string) st
239243
if v, ok := doc.Fields[tsFieldName]; ok {
240244
if vv, okk := v[0].(time.Time); okk {
241245
// At database level we only compare timestamps with millisecond precision
242-
// However in search results we append `q` plus generated digits (we use q because it's not in hex)
243-
// so that kibana can iterate over documents in UI
244-
pseudoUniqueId = fmt.Sprintf("%xq%s", vv, defaultID)
246+
// However in search results we append `qqq` plus generated hash of the source to hex-encoded timestamp
247+
sourceHash := fmt.Sprintf("%x", ComputeHash(doc.Source))
248+
pseudoUniqueId = fmt.Sprintf("%xqqq%x", vv, sourceHash)
245249
} else {
246250
logger.WarnWithCtx(query.ctx).Msgf("failed to convert timestamp field [%v] to time.Time", v[0])
247251
return defaultID
@@ -250,6 +254,50 @@ func (query Hits) computeIdForDocument(doc model.SearchHit, defaultID string) st
250254
return pseudoUniqueId
251255
}
252256

257+
func ComputeHash(data json.RawMessage) string {
258+
var parsed interface{}
259+
if err := json.Unmarshal(data, &parsed); err != nil {
260+
hash := sha256.Sum256(data)
261+
return hex.EncodeToString(hash[:])
262+
}
263+
normalized := normalizeJSON(parsed)
264+
normalizedBytes, err := json.Marshal(normalized)
265+
if err != nil {
266+
hash := sha256.Sum256(data)
267+
return hex.EncodeToString(hash[:])
268+
}
269+
hash := sha256.Sum256(normalizedBytes)
270+
return hex.EncodeToString(hash[:])
271+
}
272+
273+
// normalizeJSON walks a decoded JSON value and returns a freshly built copy of it,
// prepared for hashing.
//
// Objects (map[string]interface{}) are rebuilt by visiting their keys in sorted
// order, arrays ([]interface{}) are rebuilt element by element keeping their
// order, and every other value is returned unchanged.
//
// NOTE(review): the rebuilt object is still a Go map, so the sorted visiting
// order is not retained in the result itself; the deterministic key order seen
// by the hash presumably comes from the JSON encoder - confirm before relying
// on this function alone for ordering.
func normalizeJSON(v interface{}) interface{} {
	if object, isObject := v.(map[string]interface{}); isObject {
		names := make([]string, 0, len(object))
		for name := range object {
			names = append(names, name)
		}
		sort.Strings(names)

		rebuilt := make(map[string]interface{})
		for i := range names {
			rebuilt[names[i]] = normalizeJSON(object[names[i]])
		}
		return rebuilt
	}

	if array, isArray := v.([]interface{}); isArray {
		rebuilt := make([]interface{}, len(array))
		for i := range array {
			rebuilt[i] = normalizeJSON(array[i])
		}
		return rebuilt
	}

	// Scalars (and nil) need no normalization.
	return v
}
300+
253301
func (query Hits) String() string {
254302
return fmt.Sprintf("hits(indexes: %v)", strings.Join(query.indexes, ", "))
255303
}

platform/parsers/elastic_query_dsl/query_parser.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ func NewEmptyHighlighter() model.Highlighter {
3434
}
3535

3636
const (
37-
defaultQueryResultSize = 10
38-
defaultTrackTotalHits = 10000
37+
defaultQueryResultSize = 10
38+
increasedResultSizeForIdQuery = 10000 // In `_id` query we had to fetch way more documents as these have to be filtered later on, when assembling the result
39+
defaultTrackTotalHits = 10000
40+
uuidSeparator = "qqq" // Document IDs (_id) fields in quesma ar
3941
)
4042

4143
func (cw *ClickhouseQueryTranslator) ParseQuery(body types.JSON) (*model.ExecutionPlan, error) {
@@ -322,6 +324,7 @@ func (cw *ClickhouseQueryTranslator) parseIds(queryMap QueryMap) model.SimpleQue
322324
return model.NewSimpleQueryInvalid()
323325
}
324326
ids := make([]string, 0, len(idsRaw))
327+
uniqueIds := make([]string, 0, len(idsRaw))
325328
for _, id := range idsRaw {
326329
if idAsString, ok := id.(string); ok {
327330
ids = append(ids, idAsString)
@@ -331,18 +334,19 @@ func (cw *ClickhouseQueryTranslator) parseIds(queryMap QueryMap) model.SimpleQue
331334
}
332335
}
333336

334-
// when our generated ID appears in query looks like this: `1d<TRUNCATED>0b8q1`
335-
// therefore we need to strip the hex part (before `q`) and convert it to decimal
336-
// then we can query at DB level
337+
// when our generated ID appears in query looks like this:
338+
// `<hex-encoded timestamp>qqq<hex-encoded source hash>`
339+
// Therefore we need to convert the hex-encoded timestamp to assemble the SQL query
337340
for i, id := range ids {
338-
idInHex := strings.Split(id, "q")[0]
341+
idInHex := strings.Split(id, uuidSeparator)[0]
339342
if idAsStr, err := hex.DecodeString(idInHex); err != nil {
340343
logger.Error().Msgf("error parsing document id %s: %v", id, err)
341344
return model.NewSimpleQueryInvalid()
342345
} else {
343346
tsWithoutTZ := strings.TrimSuffix(string(idAsStr), " +0000 UTC")
344347
ids[i] = fmt.Sprintf("'%s'", tsWithoutTZ)
345348
}
349+
uniqueIds = append(uniqueIds, id)
346350
}
347351

348352
var idToSql func(string) (model.Expr, error)
@@ -399,6 +403,7 @@ func (cw *ClickhouseQueryTranslator) parseIds(queryMap QueryMap) model.SimpleQue
399403
idsTuple := model.NewTupleExpr(idsAsExprs...)
400404
whereStmt = model.NewInfixExpr(model.NewColumnRef(timestampColumnName), " IN ", idsTuple)
401405
}
406+
cw.UniqueIDsUsedInTheQuery = uniqueIds // a crucial side effect here - queries against _id field requires special treatment
402407
return model.NewSimpleQuery(whereStmt, true)
403408
}
404409

@@ -1233,6 +1238,11 @@ func ResolveField(ctx context.Context, fieldName string, schemaInstance schema.S
12331238
}
12341239

12351240
func (cw *ClickhouseQueryTranslator) parseSize(queryMap QueryMap, defaultSize int) int {
1241+
if len(cw.UniqueIDsUsedInTheQuery) > 0 {
1242+
// If this is a unique ID query, we can't limit size at the SQL level,
1243+
// because we need all matching timestamps that later will be filtered out but looking at hashes computed on hits
1244+
return increasedResultSizeForIdQuery
1245+
}
12361246
sizeRaw, exists := queryMap["size"]
12371247
if !exists {
12381248
return defaultSize

platform/parsers/elastic_query_dsl/query_translator.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"github.com/QuesmaOrg/quesma/platform/parsers/elastic_query_dsl/query_util"
1212
"github.com/QuesmaOrg/quesma/platform/schema"
1313
"github.com/QuesmaOrg/quesma/platform/util"
14+
"slices"
15+
"strings"
1416
)
1517

1618
type JsonMap = map[string]interface{}
@@ -24,7 +26,9 @@ type ClickhouseQueryTranslator struct {
2426
Indexes []string
2527

2628
// TODO this will be removed
27-
Table *clickhouse.Table
29+
Table *clickhouse.Table
30+
UniqueIDsUsedInTheQuery []string // A list of UniqueIDs used in the query (via `_id` field), which has to be passed to the JSON response rendering stage.
31+
// That's because it has to be used both in the SQL query and during the response rendering, see https://github.com/QuesmaOrg/quesma/pull/1435 for more details.
2832
}
2933

3034
var completionStatusOK = func() *int { value := 200; return &value }()
@@ -131,7 +135,30 @@ func (cw *ClickhouseQueryTranslator) makeHits(queries []*model.Query, results []
131135
hitsPartOfResponse := hitsQuery.Type.TranslateSqlResponseToJson(hitsResultSet)
132136

133137
hitsResponse := hitsPartOfResponse["hits"].(model.SearchHits)
134-
return queriesWithoutHits, resultsWithoutHits, &hitsResponse
138+
hits := cw.FilterOutHitsIfThisIsIdQuery(hitsResponse)
139+
return queriesWithoutHits, resultsWithoutHits, &hits
140+
}
141+
142+
// FilterOutHitsIfThisIsIdQuery - If during parsing we have found that this is a query for _id,
143+
// we filter out hits that are not in the list of UniqueIDsUsedInTheQuery.
144+
// we only do this filtering based on the doc.Source hash comparison, ignoring the two first UUID parts.
145+
func (cw *ClickhouseQueryTranslator) FilterOutHitsIfThisIsIdQuery(hits model.SearchHits) model.SearchHits {
146+
if len(cw.UniqueIDsUsedInTheQuery) == 0 {
147+
return hits // not _id query, proceed as usual
148+
}
149+
hashesFromQuery := make([]string, 0, len(cw.UniqueIDsUsedInTheQuery))
150+
for _, id := range cw.UniqueIDsUsedInTheQuery {
151+
hashesFromQuery = append(hashesFromQuery, strings.Split(id, uuidSeparator)[1])
152+
}
153+
filteredHits := make([]model.SearchHit, 0, len(hits.Hits))
154+
for _, hit := range hits.Hits {
155+
hash := strings.Split(hit.ID, uuidSeparator)[1]
156+
if slices.Contains(hashesFromQuery, hash) {
157+
filteredHits = append(filteredHits, hit)
158+
}
159+
}
160+
hits.Hits = filteredHits
161+
return hits
135162
}
136163

137164
func (cw *ClickhouseQueryTranslator) makeTotalCount(queries []*model.Query, results [][]model.QueryResultRow) (queriesWithoutCount []*model.Query, resultsWithoutCount [][]model.QueryResultRow, total *model.Total) {

platform/testdata/requests.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2148,7 +2148,7 @@ var TestsSearch = []SearchTestCase{
21482148
},
21492149
{
21502150
"match_phrase": {
2151-
"_id": "323032342d30352d32342031333a33323a34372e333037202b3030303020555443q1"
2151+
"_id": "323032342d30352d32342031333a33323a34372e333037202b3030303020555443qqq111111111111111111111111111111111111111111111111111111111111"
21522152
}
21532153
}
21542154
]
@@ -2386,7 +2386,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
23862386
`{
23872387
"query": {
23882388
"ids": {
2389-
"values": ["323032342d31322d32312030373a32393a30332e333637202b3030303020555443q1"]
2389+
"values": ["323032342d31322d32312030373a32393a30332e333637202b3030303020555443qqq1111111111111111111111111111111111111111"]
23902390
}
23912391
},
23922392
"track_total_hits": false
@@ -2407,8 +2407,8 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24072407
"query": {
24082408
"ids": {
24092409
"values": [
2410-
"323032342d31322d32312030373a32393a30332e333637202b3030303020555443q1",
2411-
"323032342d31322d32312030373a32393a30322e393932202b3030303020555443q3"
2410+
"323032342d31322d32312030373a32393a30332e333637202b3030303020555443qqq111111111111111111111111111",
2411+
"323032342d31322d32312030373a32393a30322e393932202b3030303020555443qqq111111111111111111111111111"
24122412
]
24132413
}
24142414
},
@@ -2420,7 +2420,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24202420
`SELECT "message" ` +
24212421
`FROM ` + TableName + ` ` +
24222422
`WHERE "@timestamp" IN tuple(toDateTime64('2024-12-21 07:29:03.367',3), toDateTime64('2024-12-21 07:29:02.992',3)) ` +
2423-
`LIMIT 10`,
2423+
`LIMIT 10000`,
24242424
},
24252425
[]string{},
24262426
},
@@ -2429,7 +2429,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24292429
`{
24302430
"query": {
24312431
"ids": {
2432-
"values": ["323032342d31322d32312030373a32393a30332e333637303030303030q1"]
2432+
"values": ["323032342d31322d32312030373a32393a30332e333637303030303030qqq123qqq11111111111111111111111111111111111111111111111"]
24332433
}
24342434
},
24352435
"track_total_hits": false
@@ -2440,7 +2440,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24402440
`SELECT "message" ` +
24412441
`FROM ` + TableName + ` ` +
24422442
`WHERE "@timestamp" = toDateTime64('2024-12-21 07:29:03.367000000',9) ` +
2443-
`LIMIT 10`,
2443+
`LIMIT 10000`,
24442444
},
24452445
[]string{},
24462446
},
@@ -2449,7 +2449,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24492449
`{
24502450
"query": {
24512451
"ids": {
2452-
"values": ["323032342d31322d32312030373a32393a30332e313233343536373839q123"]
2452+
"values": ["323032342d31322d32312030373a32393a30332e313233343536373839qqq123qqq11111111111111111111111111111111111111111111111"]
24532453
}
24542454
},
24552455
"track_total_hits": false
@@ -2460,7 +2460,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24602460
`SELECT "message" ` +
24612461
`FROM ` + TableName + ` ` +
24622462
`WHERE "@timestamp" = toDateTime64('2024-12-21 07:29:03.123456789',9) ` +
2463-
`LIMIT 10`,
2463+
`LIMIT 10000`,
24642464
},
24652465
[]string{},
24662466
},
@@ -2469,7 +2469,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24692469
`{
24702470
"query": {
24712471
"ids": {
2472-
"values": ["323032342d31322d32312030373a32393a3033q1"]
2472+
"values": ["323032342d31322d32312030373a32393a3033qqq11111111111111111111111111111111111111111111111"]
24732473
}
24742474
},
24752475
"track_total_hits": false
@@ -2480,7 +2480,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24802480
`SELECT "message" ` +
24812481
`FROM ` + TableName + ` ` +
24822482
`WHERE "@timestamp" = toDateTime64('2024-12-21 07:29:03',0) ` +
2483-
`LIMIT 10`,
2483+
`LIMIT 10000`,
24842484
},
24852485
[]string{},
24862486
},
@@ -2489,7 +2489,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
24892489
`{
24902490
"query": {
24912491
"ids": {
2492-
"values": ["323032342d31322d32312030373a32393a30332e33q1"]
2492+
"values": ["323032342d31322d32312030373a32393a30332e33qqq11111111111111111111111111111111111111111111111"]
24932493
}
24942494
},
24952495
"track_total_hits": false
@@ -2500,7 +2500,7 @@ Men\\'s Clothing \\\\ %' LIMIT 10`},
25002500
`SELECT "message" ` +
25012501
`FROM ` + TableName + ` ` +
25022502
`WHERE "@timestamp" = toDateTime64('2024-12-21 07:29:03.3',1) ` +
2503-
`LIMIT 10`,
2503+
`LIMIT 10000`,
25042504
},
25052505
[]string{},
25062506
},

0 commit comments

Comments
 (0)