Skip to content

Commit 38c7239

Browse files
authored
MdC-Suche, Filteranpassung
2 parents 65f4a09 + ca83f71 commit 38c7239

File tree

4 files changed

+134
-66
lines changed

4 files changed

+134
-66
lines changed

src/main/java/tla/backend/es/model/parts/Transcription.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public class Transcription {
2222
@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
2323
private String unicode;
2424

25-
@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
25+
@Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")
2626
private String mdc;
2727

2828
}

src/main/java/tla/backend/es/query/LemmaSearchQueryBuilder.java

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,15 @@ public void setScript(List<Script> scripts) {
4949
}
5050

5151
public void setTranscription(TranscriptionSpec transcription) {
52-
System.out.println("Sent to query ");
52+
//System.out.println("Sent to query ");
5353
if (transcription.getText() != null) {
54-
//int posEnc=transcription.indexOf("|");
55-
//String encod=transcription.substring(posEnc+1);
56-
System.out.println("Sent to query "+transcription.getEnc()[0]);
54+
//System.out.println("Sent to query "+transcription.getEnc()[0]);
5755
if (transcription.getEnc()[0].equals("mdc")){
58-
59-
this.must(regexpQuery("transcription.mdc", maskRegExTranscription(transcription.getText())));
56+
this.must(regexpQuery("transcription.mdc", maskRegExTranscription(transcription.getText())));
6057
}
61-
else this.must(regexpQuery("transcription.unicode", maskRegExTranscription(transcription.getText())));
58+
else {
59+
this.must(regexpQuery("transcription.unicode", maskRegExTranscription(normalizeUnicode(transcription.getText()))));
60+
}
6261
// works with Unicode only?
6362
}
6463

@@ -79,12 +78,10 @@ public void setTranscription(TranscriptionSpec transcription) {
7978
}
8079
8180
}*/
82-
83-
public String maskRegExTranscription(String transcription) {
84-
if (transcription != null) {
85-
transcription = transcription.trim(); // cut whitespaces
86-
transcription = transcription.replaceAll("\\s+", " ");
8781

82+
83+
public String normalizeUnicode(String transcription) {
84+
if (transcription != null) {
8885
// case insensitive search (ES analyzer also indexes lowercase)
8986
transcription = transcription.toLowerCase();
9087
transcription = transcription.replace("h\u0331", "ẖ"); // no atomic char as capital, now lowercase
@@ -121,16 +118,24 @@ public String maskRegExTranscription(String transcription) {
121118
transcription = transcription.replace("u\u032f", "u"); // atomic workaround for ult.-inf.-u
122119
transcription = transcription.replace("\u0131\u0357", "\ua7bd"); // BTS yod => Egyptological Yod
123120
transcription = transcription.replace("h\u032d", "\u0125"); // atomic workaround for demotic h
121+
}
122+
return transcription;
123+
}
124+
125+
public String maskRegExTranscription(String transcription) {
126+
if (transcription != null) {
127+
transcription = transcription.trim(); // cut whitespaces
128+
transcription = transcription.replaceAll("\\s+", " ");
124129

125-
// Maskieren (nicht ignorieren)
130+
// Maskieren (nicht ignorieren)
126131
transcription = transcription.replace(".", "\\.");
127132
transcription = transcription.replace("-", "\\-");
128133
transcription = transcription.replace("+", "\\+");
129-
134+
130135
// treat "( )" as a marker for optional parts
131136
// transcription = transcription.replace("(", "");
132137
transcription = transcription.replace(")", ")?"); // ### to do: abfangen, wenn Klammern nicht ordentlich öffnen/schließen
133-
138+
134139
// ignorieren: query und ES-Indizierung
135140
transcription = transcription.replace("{", "");
136141
transcription = transcription.replace("}", "");
@@ -140,26 +145,25 @@ public String maskRegExTranscription(String transcription) {
140145
transcription = transcription.replace("〉", "");
141146
transcription = transcription.replace("⸮", "");
142147
// "?", "[" , and "]" are part of allowed RegEx syntax
143-
144-
// BTS wildcards (any sign)
148+
149+
// BTS wildcards (any sign)
145150
transcription = transcription.replace("§", "."); // "§" in legacyTLA
146151
transcription = transcription.replace("*", "."); // "*" new in newTLA
147-
148-
// treatment of right end
152+
153+
// treatment of right end
149154
if (transcription.endsWith("$")) { // "$": wirkliches String-Ende
150155
transcription = transcription.replace("$", ""); // remove "$" (all, just to be sure)
151156
} else {
152157
transcription = transcription + ".*"; // right: any signs may follow
153158
}
154159

155160
// treatment of left end
156-
if (transcription.startsWith("^")) { // "^": search at beginning of lemma transliteration
157-
transcription = transcription.replace("^", ""); // remove "^" (all, just to be sure)
161+
if (transcription.startsWith(">")) { // ">": search at beginning or in the middle of lemma transliteration
162+
transcription = transcription.replace(">", ""); // remove ">" (all, just to be sure)
163+
// find words in the middle too
164+
transcription = "(.+[\\- ])?" + transcription; // left: anything at beginning of lemma or after "-" or blank
158165
} else if (transcription.startsWith("\\-")) { // "-": search of non-first lemma
159166
transcription = "(.+)?" + transcription; // left: anything may occur before the '-'
160-
} else {
161-
// find words in the middle too
162-
transcription = "(.+[\\- ])?" + transcription; // left: anything at beginnig of lemma or after "-" or blank
163167
}
164168
}
165169
return transcription;

src/main/resources/elasticsearch/settings/indices/lemma.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,10 @@
6464
"type": "mapping",
6565
"mappings": [
6666
".t.pl => .wt",
67+
".t:pl => .wt",
6768
".tpl => .wt",
6869
"t.du => .tj",
70+
"t:du => .tj",
6971
".tdu => .tj",
7072
",t,pl => ,wt",
7173
",tpl => ,wt",
@@ -111,6 +113,16 @@
111113
"transcription_suffix_filter"
112114
]
113115
},
116+
"mdc_analyzer": {
117+
"type": "custom",
118+
"tokenizer": "keyword",
119+
"char_filter": [
120+
"whitespaces_compressor",
121+
"transcription_special_signs_filter",
122+
"transcription_brackets_filter",
123+
"transcription_suffix_filter"
124+
]
125+
},
114126
"hieroglyph_analyzer": {
115127
"type": "custom",
116128
"tokenizer": "hieroglyph_tokenizer"

src/main/resources/elasticsearch/settings/indices/sentence.json

Lines changed: 93 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -25,25 +25,35 @@
2525
"^",
2626
"&",
2727
"(",
28-
")"
28+
")"
2929
]
3030
}
3131
},
32-
"filter": {
33-
"english_stemmer": {
34-
"type": "stemmer",
35-
"language": "german"
36-
},
37-
"german_stemmer": {
38-
"type": "stemmer",
39-
"language": "german"
40-
},
41-
"french_stemmer": {
42-
"type": "stemmer",
43-
"language": "german"
44-
}
45-
},
32+
"filter": {
33+
"english_stemmer": {
34+
"type": "stemmer",
35+
"language": "german"
36+
},
37+
"german_stemmer": {
38+
"type": "stemmer",
39+
"language": "german"
40+
},
41+
"french_stemmer": {
42+
"type": "stemmer",
43+
"language": "german"
44+
}
45+
},
4646
"char_filter": {
47+
"whitespaces_compressor": {
48+
"type": "pattern_replace",
49+
"pattern": "\\s+",
50+
"replacement": " "
51+
},
52+
"transcription_special_signs_filter": {
53+
"type": "pattern_replace",
54+
"pattern": "[\\*]",
55+
"replacement": ""
56+
},
4757
"transcription_brackets_filter": {
4858
"type": "pattern_replace",
4959
"pattern": "[\\[\\]\\(\\)?\\u2e2e\\u2e22\\u2e23\\u2329\\u232a]|\\{\\S*\\}",
@@ -52,6 +62,12 @@
5262
"transcription_suffix_filter": {
5363
"type": "mapping",
5464
"mappings": [
65+
".t.pl => .wt",
66+
".t:pl => .wt",
67+
".tpl => .wt",
68+
"t.du => .tj",
69+
"t:du => .tj",
70+
".tdu => .tj",
5571
",t,pl => ,wt",
5672
",tpl => ,wt",
5773
"t,du => ,tj",
@@ -60,13 +76,48 @@
6076
"pl => w",
6177
", => ."
6278
]
79+
},
80+
"transcription_unicode_normalizer": {
81+
"type": "mapping",
82+
"mappings": [
83+
"h\\u0331 => \\u1e96",
84+
"H\\u0331 => \\u1e96"
85+
]
86+
},
87+
"transcription_unicode_workaround": {
88+
"type": "mapping",
89+
"mappings": [
90+
"i\\u032f => i",
91+
"u\\u032f => u",
92+
"\\u0131\\u0357 => \\ua7bd",
93+
"I\\u0357 => \\ua7bd",
94+
"h\\u032d => \\u0125",
95+
"H\\u032d => \\u0125"
96+
]
6397
}
6498
},
6599
"analyzer": {
66100
"transcription_analyzer": {
67101
"type": "custom",
68-
"tokenizer": "whitespace",
102+
"tokenizer": "keyword",
103+
"filter": [
104+
"lowercase"
105+
],
106+
"char_filter": [
107+
"whitespaces_compressor",
108+
"transcription_unicode_normalizer",
109+
"transcription_unicode_workaround",
110+
"transcription_special_signs_filter",
111+
"transcription_brackets_filter",
112+
"transcription_suffix_filter"
113+
]
114+
},
115+
"mdc_analyzer": {
116+
"type": "custom",
117+
"tokenizer": "keyword",
69118
"char_filter": [
119+
"whitespaces_compressor",
120+
"transcription_special_signs_filter",
70121
"transcription_brackets_filter",
71122
"transcription_suffix_filter"
72123
]
@@ -75,31 +126,32 @@
75126
"type": "custom",
76127
"tokenizer": "hieroglyph_tokenizer"
77128
},
78-
"english_without_stopwords": {
79-
"type":"custom",
80-
"tokenizer": "standard",
81-
"filter": [
82-
"lowercase",
83-
"english_stemmer"
84-
]
85-
},
86-
"german_without_stopwords": {
87-
"type":"custom",
88-
"tokenizer": "standard",
89-
"filter": [
90-
"lowercase",
91-
"german_stemmer"
92-
]
93-
},
94-
"french_without_stopwords": {
95-
"type":"custom",
96-
"tokenizer": "standard",
97-
"filter": [
98-
"lowercase",
99-
"french_stemmer"
100-
]
101-
}
129+
"english_without_stopwords": {
130+
"type":"custom",
131+
"tokenizer": "standard",
132+
"filter": [
133+
"lowercase",
134+
"english_stemmer"
135+
]
136+
},
137+
"german_without_stopwords": {
138+
"type":"custom",
139+
"tokenizer": "standard",
140+
"filter": [
141+
"lowercase",
142+
"german_stemmer"
143+
]
144+
},
145+
"french_without_stopwords": {
146+
"type":"custom",
147+
"tokenizer": "standard",
148+
"filter": [
149+
"lowercase",
150+
"french_stemmer"
151+
]
152+
}
102153
}
103154
}
104155
}
105-
}
156+
}
157+

0 commit comments

Comments
 (0)