Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.

Commit 2527ae6

Browse files
authored
String column as keyword field (#1444)
Quesma exposes `String` columns as fields of type `text`; this is the default behaviour. When a user performs a full-text search, Quesma searches through all text fields, which can lead to massive queries. This PR adds a flag that changes the default type used for `String` columns:

```
flags:
  defaultStringColumnType: keyword
```
1 parent 871ba58 commit 2527ae6

File tree

7 files changed

+145
-4
lines changed

7 files changed

+145
-4
lines changed

cmd/experimental/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func main() {
165165

166166
virtualTableStorage := persistence.NewElasticJSONDatabase(cfg.Elasticsearch, common_table.VirtualTableElasticIndexName)
167167
tableDisco := clickhouse.NewTableDiscovery(&cfg, connectionPool, virtualTableStorage)
168-
schemaRegistry := schema.NewSchemaRegistry(clickhouse.TableDiscoveryTableProviderAdapter{TableDiscovery: tableDisco}, &cfg, clickhouse.SchemaTypeAdapter{})
168+
schemaRegistry := schema.NewSchemaRegistry(clickhouse.TableDiscoveryTableProviderAdapter{TableDiscovery: tableDisco}, &cfg, clickhouse.NewSchemaTypeAdapter(cfg.DefaultStringColumnType))
169169
schemaRegistry.Start()
170170

171171
im := elasticsearch.NewIndexManagement(cfg.Elasticsearch)

cmd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ func main() {
8888

8989
virtualTableStorage := persistence.NewElasticJSONDatabase(cfg.Elasticsearch, common_table.VirtualTableElasticIndexName)
9090
tableDisco := clickhouse.NewTableDiscovery(&cfg, connectionPool, virtualTableStorage)
91-
schemaRegistry := schema.NewSchemaRegistry(clickhouse.TableDiscoveryTableProviderAdapter{TableDiscovery: tableDisco}, &cfg, clickhouse.SchemaTypeAdapter{})
91+
schemaRegistry := schema.NewSchemaRegistry(clickhouse.TableDiscoveryTableProviderAdapter{TableDiscovery: tableDisco}, &cfg, clickhouse.NewSchemaTypeAdapter(cfg.DefaultStringColumnType))
9292
schemaRegistry.Start()
9393

9494
im := elasticsearch.NewIndexManagement(cfg.Elasticsearch)

platform/clickhouse/type_adapter.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,20 @@
33
package clickhouse
44

55
import (
6+
"github.com/QuesmaOrg/quesma/platform/logger"
67
"github.com/QuesmaOrg/quesma/platform/schema"
78
"strings"
89
)
910

1011
type SchemaTypeAdapter struct {
12+
defaultStringColumnType string
13+
}
14+
15+
func NewSchemaTypeAdapter(defaultType string) SchemaTypeAdapter {
16+
17+
return SchemaTypeAdapter{
18+
defaultStringColumnType: defaultType,
19+
}
1120
}
1221

1322
func (c SchemaTypeAdapter) Convert(s string) (schema.QuesmaType, bool) {
@@ -23,7 +32,18 @@ func (c SchemaTypeAdapter) Convert(s string) (schema.QuesmaType, bool) {
2332

2433
switch s {
2534
case "String":
26-
return schema.QuesmaTypeText, true
35+
switch c.defaultStringColumnType {
36+
37+
// empty is for testing purposes; in production it should always be set
38+
case "", "text":
39+
return schema.QuesmaTypeText, true
40+
case "keyword":
41+
return schema.QuesmaTypeKeyword, true
42+
default:
43+
logger.Error().Msgf("Unknown field type %s", c.defaultStringColumnType)
44+
return schema.QuesmaTypeUnknown, false
45+
}
46+
2747
case "LowCardinality(String)", "UUID", "FixedString":
2848
return schema.QuesmaTypeKeyword, true
2949
case "Int", "Int8", "Int16", "Int32", "Int64":

platform/config/config.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ type QuesmaConfiguration struct {
5757
DefaultIngestOptimizers map[string]OptimizerConfiguration
5858
DefaultQueryOptimizers map[string]OptimizerConfiguration
5959
MapFieldsDiscoveringEnabled bool
60+
DefaultStringColumnType string
6061
}
6162

6263
func NewQuesmaConfigurationIndexConfigOnly(indexConfig map[string]IndexConfiguration) QuesmaConfiguration {
@@ -269,7 +270,8 @@ Quesma Configuration:
269270
UseCommonTableForWildcard: %t,
270271
DefaultIngestTarget: %v,
271272
DefaultQueryTarget: %v,
272-
MapFieldsDiscoveringEnabled: %t
273+
MapFieldsDiscoveringEnabled: %t,
274+
DefaultStringColumnType: %s
273275
`,
274276
c.TransparentProxy,
275277
elasticUrl,
@@ -292,6 +294,7 @@ Quesma Configuration:
292294
c.DefaultIngestTarget,
293295
c.DefaultQueryTarget,
294296
c.MapFieldsDiscoveringEnabled,
297+
c.DefaultStringColumnType,
295298
)
296299
}
297300

platform/config/config_v2.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ type QuesmaNewConfiguration struct {
4949
Pipelines []Pipeline `koanf:"pipelines"`
5050
DisableTelemetry bool `koanf:"disableTelemetry"`
5151
MapFieldsDiscoveringEnabled bool `koanf:"mapFieldsDiscoveringEnabled"`
52+
DefaultStringToKeywordType bool `koanf:"defaultStringToKeywordType"`
53+
QuesmaFlags QuesmaFlags `koanf:"flags"`
54+
}
55+
56+
// QuesmaFlags holds all the configuration flags that affect global Quesma behavior.
57+
type QuesmaFlags struct {
58+
DefaultStringColumnType *string `koanf:"defaultStringColumnType"`
5259
}
5360

5461
type LoggingConfiguration struct {
@@ -592,6 +599,21 @@ func (c *QuesmaNewConfiguration) TranslateToLegacyConfig() QuesmaConfiguration {
592599

593600
conf.MapFieldsDiscoveringEnabled = c.MapFieldsDiscoveringEnabled
594601

602+
conf.DefaultStringColumnType = "text" // default value, can be overridden by the flag
603+
if c.QuesmaFlags.DefaultStringColumnType != nil {
604+
605+
switch *c.QuesmaFlags.DefaultStringColumnType {
606+
case "keyword":
607+
conf.DefaultStringColumnType = "keyword"
608+
case "text":
609+
conf.DefaultStringColumnType = "text"
610+
default:
611+
612+
errAcc = multierror.Append(errAcc, fmt.Errorf("defaultStringColumnType must be either 'keyword' or 'text', got '%s'", *c.QuesmaFlags.DefaultStringColumnType))
613+
614+
}
615+
}
616+
595617
conf.AutodiscoveryEnabled = false
596618
conf.Connectors = make(map[string]RelationalDbConfiguration)
597619
relDBConn, connType, relationalDBErr := c.getRelationalDBConf()

platform/config/config_v2_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,3 +359,27 @@ func TestPartitionBy(t *testing.T) {
359359

360360
assert.Equal(t, Hourly, legacyConf.DefaultPartitioningStrategy)
361361
}
362+
363+
func TestStringColumnIsTextDefaultBehavior(t *testing.T) {
364+
os.Setenv(configFileLocationEnvVar, "./test_configs/partition_by.yaml")
365+
cfg := LoadV2Config()
366+
if err := cfg.Validate(); err != nil {
367+
t.Fatalf("error validating config: %v", err)
368+
}
369+
legacyConf := cfg.TranslateToLegacyConfig()
370+
371+
assert.Equal(t, "text", legacyConf.DefaultStringColumnType)
372+
373+
}
374+
375+
func TestStringColumnIsKeyword(t *testing.T) {
376+
os.Setenv(configFileLocationEnvVar, "./test_configs/string_column_is_keyword_field.yaml")
377+
cfg := LoadV2Config()
378+
if err := cfg.Validate(); err != nil {
379+
t.Fatalf("error validating config: %v", err)
380+
}
381+
legacyConf := cfg.TranslateToLegacyConfig()
382+
383+
assert.Equal(t, "keyword", legacyConf.DefaultStringColumnType)
384+
385+
}
platform/config/test_configs/string_column_is_keyword_field.yaml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# TEST CONFIGURATION
2+
licenseKey: "cdd749a3-e777-11ee-bcf8-0242ac150004"
3+
4+
5+
flags:
6+
defaultStringColumnType: keyword
7+
8+
frontendConnectors:
9+
- name: elastic-ingest
10+
type: elasticsearch-fe-ingest
11+
config:
12+
listenPort: 8080
13+
- name: elastic-query
14+
type: elasticsearch-fe-query
15+
config:
16+
listenPort: 8080
17+
backendConnectors:
18+
- name: my-minimal-elasticsearch
19+
type: elasticsearch
20+
config:
21+
url: "http://localhost:9200"
22+
- name: my-clickhouse-data-source
23+
type: clickhouse-os
24+
config:
25+
url: "clickhouse://localhost:9000"
26+
ingestStatistics: true
27+
internalTelemetryUrl: "https://api.quesma.com/phone-home"
28+
logging:
29+
remoteUrl: "https://api.quesma.com/phone-home"
30+
path: "logs"
31+
level: "info"
32+
processors:
33+
- name: my-query-processor
34+
type: quesma-v1-processor-query
35+
config:
36+
indexes:
37+
example-index:
38+
target:
39+
- my-clickhouse-data-source
40+
kibana_sample_data_ecommerce:
41+
target:
42+
- my-clickhouse-data-source
43+
partitioningStrategy: daily
44+
"*":
45+
target:
46+
- my-minimal-elasticsearch
47+
partitioningStrategy: hourly
48+
- name: my-ingest-processor
49+
type: quesma-v1-processor-ingest
50+
config:
51+
indexes:
52+
example-index:
53+
target:
54+
- my-clickhouse-data-source
55+
kibana_sample_data_ecommerce:
56+
target:
57+
- my-clickhouse-data-source
58+
partitioningStrategy: daily
59+
"*":
60+
target:
61+
- my-minimal-elasticsearch
62+
partitioningStrategy: hourly
63+
pipelines:
64+
- name: my-pipeline-elasticsearch-query-clickhouse
65+
frontendConnectors: [ elastic-query ]
66+
processors: [ my-query-processor ]
67+
backendConnectors: [ my-minimal-elasticsearch, my-clickhouse-data-source ]
68+
- name: my-pipeline-elasticsearch-ingest-to-clickhouse
69+
frontendConnectors: [ elastic-ingest ]
70+
processors: [ my-ingest-processor ]
71+
backendConnectors: [ my-minimal-elasticsearch, my-clickhouse-data-source ]
72+

0 commit comments

Comments
 (0)