Skip to content

Commit ca83f71

Browse files
committed
MdC-Suche case sensitive; Angleichung Lemma-Satzsuche
1 parent 01ac706 commit ca83f71

File tree

3 files changed

+106
-42
lines changed

3 files changed

+106
-42
lines changed

src/main/java/tla/backend/es/model/parts/Transcription.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public class Transcription {
2222
@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
2323
private String unicode;
2424

25-
@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
25+
@Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")
2626
private String mdc;
2727

2828
}

src/main/resources/elasticsearch/settings/indices/lemma.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,10 @@
6464
"type": "mapping",
6565
"mappings": [
6666
".t.pl => .wt",
67+
".t:pl => .wt",
6768
".tpl => .wt",
6869
"t.du => .tj",
70+
"t:du => .tj",
6971
".tdu => .tj",
7072
",t,pl => ,wt",
7173
",tpl => ,wt",
@@ -111,6 +113,16 @@
111113
"transcription_suffix_filter"
112114
]
113115
},
116+
"mdc_analyzer": {
117+
"type": "custom",
118+
"tokenizer": "keyword",
119+
"char_filter": [
120+
"whitespaces_compressor",
121+
"transcription_special_signs_filter",
122+
"transcription_brackets_filter",
123+
"transcription_suffix_filter"
124+
]
125+
},
114126
"hieroglyph_analyzer": {
115127
"type": "custom",
116128
"tokenizer": "hieroglyph_tokenizer"

src/main/resources/elasticsearch/settings/indices/sentence.json

Lines changed: 93 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -25,25 +25,35 @@
2525
"^",
2626
"&",
2727
"(",
28-
")"
28+
")"
2929
]
3030
}
3131
},
32-
"filter": {
33-
"english_stemmer": {
34-
"type": "stemmer",
35-
"language": "german"
36-
},
37-
"german_stemmer": {
38-
"type": "stemmer",
39-
"language": "german"
40-
},
41-
"french_stemmer": {
42-
"type": "stemmer",
43-
"language": "german"
44-
}
45-
},
32+
"filter": {
33+
"english_stemmer": {
34+
"type": "stemmer",
35+
"language": "german"
36+
},
37+
"german_stemmer": {
38+
"type": "stemmer",
39+
"language": "german"
40+
},
41+
"french_stemmer": {
42+
"type": "stemmer",
43+
"language": "german"
44+
}
45+
},
4646
"char_filter": {
47+
"whitespaces_compressor": {
48+
"type": "pattern_replace",
49+
"pattern": "\\s+",
50+
"replacement": " "
51+
},
52+
"transcription_special_signs_filter": {
53+
"type": "pattern_replace",
54+
"pattern": "[\\*]",
55+
"replacement": ""
56+
},
4757
"transcription_brackets_filter": {
4858
"type": "pattern_replace",
4959
"pattern": "[\\[\\]\\(\\)?\\u2e2e\\u2e22\\u2e23\\u2329\\u232a]|\\{\\S*\\}",
@@ -52,6 +62,12 @@
5262
"transcription_suffix_filter": {
5363
"type": "mapping",
5464
"mappings": [
65+
".t.pl => .wt",
66+
".t:pl => .wt",
67+
".tpl => .wt",
68+
"t.du => .tj",
69+
"t:du => .tj",
70+
".tdu => .tj",
5571
",t,pl => ,wt",
5672
",tpl => ,wt",
5773
"t,du => ,tj",
@@ -60,13 +76,48 @@
6076
"pl => w",
6177
", => ."
6278
]
79+
},
80+
"transcription_unicode_normalizer": {
81+
"type": "mapping",
82+
"mappings": [
83+
"h\\u0331 => \\u1e96",
84+
"H\\u0331 => \\u1e96"
85+
]
86+
},
87+
"transcription_unicode_workaround": {
88+
"type": "mapping",
89+
"mappings": [
90+
"i\\u032f => i",
91+
"u\\u032f => u",
92+
"\\u0131\\u0357 => \\ua7bd",
93+
"I\\u0357 => \\ua7bd",
94+
"h\\u032d => \\u0125",
95+
"H\\u032d => \\u0125"
96+
]
6397
}
6498
},
6599
"analyzer": {
66100
"transcription_analyzer": {
67101
"type": "custom",
68-
"tokenizer": "whitespace",
102+
"tokenizer": "keyword",
103+
"filter": [
104+
"lowercase"
105+
],
106+
"char_filter": [
107+
"whitespaces_compressor",
108+
"transcription_unicode_normalizer",
109+
"transcription_unicode_workaround",
110+
"transcription_special_signs_filter",
111+
"transcription_brackets_filter",
112+
"transcription_suffix_filter"
113+
]
114+
},
115+
"mdc_analyzer": {
116+
"type": "custom",
117+
"tokenizer": "keyword",
69118
"char_filter": [
119+
"whitespaces_compressor",
120+
"transcription_special_signs_filter",
70121
"transcription_brackets_filter",
71122
"transcription_suffix_filter"
72123
]
@@ -75,31 +126,32 @@
75126
"type": "custom",
76127
"tokenizer": "hieroglyph_tokenizer"
77128
},
78-
"english_without_stopwords": {
79-
"type":"custom",
80-
"tokenizer": "standard",
81-
"filter": [
82-
"lowercase",
83-
"english_stemmer"
84-
]
85-
},
86-
"german_without_stopwords": {
87-
"type":"custom",
88-
"tokenizer": "standard",
89-
"filter": [
90-
"lowercase",
91-
"german_stemmer"
92-
]
93-
},
94-
"french_without_stopwords": {
95-
"type":"custom",
96-
"tokenizer": "standard",
97-
"filter": [
98-
"lowercase",
99-
"french_stemmer"
100-
]
101-
}
129+
"english_without_stopwords": {
130+
"type":"custom",
131+
"tokenizer": "standard",
132+
"filter": [
133+
"lowercase",
134+
"english_stemmer"
135+
]
136+
},
137+
"german_without_stopwords": {
138+
"type":"custom",
139+
"tokenizer": "standard",
140+
"filter": [
141+
"lowercase",
142+
"german_stemmer"
143+
]
144+
},
145+
"french_without_stopwords": {
146+
"type":"custom",
147+
"tokenizer": "standard",
148+
"filter": [
149+
"lowercase",
150+
"french_stemmer"
151+
]
152+
}
102153
}
103154
}
104155
}
105-
}
156+
}
157+

0 commit comments

Comments
 (0)