MdC-Suche case sensitive; Angleichung Lemma-Satzsuche (MdC search is now case-sensitive; lemma and sentence search analyzers aligned)

dwerning · dwerning · commit ca83f7130725 · 2022-09-02T16:10:51.000+02:00
diff --git a/src/main/java/tla/backend/es/model/parts/Transcription.java b/src/main/java/tla/backend/es/model/parts/Transcription.java
@@ -22,7 +22,7 @@ public class Transcription {
     @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
     private String unicode;
 
-    @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
+    @Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")
     private String mdc;
 
 }
diff --git a/src/main/resources/elasticsearch/settings/indices/lemma.json b/src/main/resources/elasticsearch/settings/indices/lemma.json
@@ -64,8 +64,10 @@
           "type": "mapping",
           "mappings": [
             ".t.pl => .wt",
+            ".t:pl => .wt",
             ".tpl => .wt",
             "t.du => .tj",
+            "t:du => .tj",
             ".tdu => .tj",
             ",t,pl => ,wt",
             ",tpl => ,wt",
@@ -111,6 +113,16 @@
             "transcription_suffix_filter"
           ]
         },
+        "mdc_analyzer": {
+          "type": "custom",
+          "tokenizer": "keyword",
+          "char_filter": [
+            "whitespaces_compressor",
+            "transcription_special_signs_filter",
+            "transcription_brackets_filter",
+            "transcription_suffix_filter"
+          ]
+        },
         "hieroglyph_analyzer": {
           "type": "custom",
           "tokenizer": "hieroglyph_tokenizer"
diff --git a/src/main/resources/elasticsearch/settings/indices/sentence.json b/src/main/resources/elasticsearch/settings/indices/sentence.json
@@ -25,25 +25,35 @@
               "^",
               "&",
               "(",
-			  ")"
+              ")"
             ]
           }
         },
-         "filter": {
-           "english_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           },
-           "german_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           },
-           "french_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           }
-        },
+        "filter": {
+          "english_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          },
+          "german_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          },
+          "french_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          }
+  	  },
         "char_filter": {
+          "whitespaces_compressor": {
+            "type": "pattern_replace",
+            "pattern": "\\s+",
+            "replacement": " "
+          },
+          "transcription_special_signs_filter": {
+            "type": "pattern_replace",
+            "pattern": "[\\*]",
+            "replacement": ""
+          },
           "transcription_brackets_filter": {
             "type": "pattern_replace",
             "pattern": "[\\[\\]\\(\\)?\\u2e2e\\u2e22\\u2e23\\u2329\\u232a]|\\{\\S*\\}",
@@ -52,6 +62,12 @@
           "transcription_suffix_filter": {
             "type": "mapping",
             "mappings": [
+              ".t.pl => .wt",
+              ".t:pl => .wt",
+              ".tpl => .wt",
+              "t.du => .tj",
+              "t:du => .tj",
+              ".tdu => .tj",
               ",t,pl => ,wt",
               ",tpl => ,wt",
               "t,du => ,tj",
@@ -60,13 +76,48 @@
               "pl => w",
               ",  => ."
             ]
+          },
+          "transcription_unicode_normalizer": {
+            "type": "mapping",
+            "mappings": [
+              "h\\u0331 => \\u1e96",
+              "H\\u0331 => \\u1e96"
+            ]
+          },
+          "transcription_unicode_workaround": {
+            "type": "mapping",
+            "mappings": [
+              "i\\u032f => i",
+              "u\\u032f => u",
+              "\\u0131\\u0357 => \\ua7bd",
+              "I\\u0357 => \\ua7bd",
+              "h\\u032d => \\u0125",
+              "H\\u032d => \\u0125"
+            ]
           }
         },
         "analyzer": {
           "transcription_analyzer": {
             "type": "custom",
-            "tokenizer": "whitespace",
+            "tokenizer": "keyword",
+             "filter": [
+              "lowercase"
+              ],
+            "char_filter": [
+              "whitespaces_compressor",
+              "transcription_unicode_normalizer",
+              "transcription_unicode_workaround",
+              "transcription_special_signs_filter",
+              "transcription_brackets_filter",
+              "transcription_suffix_filter"
+            ]
+          },
+          "mdc_analyzer": {
+            "type": "custom",
+            "tokenizer": "keyword",
             "char_filter": [
+              "whitespaces_compressor",
+              "transcription_special_signs_filter",
               "transcription_brackets_filter",
               "transcription_suffix_filter"
             ]
@@ -75,31 +126,32 @@
             "type": "custom",
             "tokenizer": "hieroglyph_tokenizer"
           },
-           "english_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "english_stemmer"
-             ]
-           },
-           "german_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "german_stemmer"
-             ]
-           },
-           "french_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "french_stemmer"
-             ]
-           }
+          "english_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "english_stemmer"
+            ]
+          },
+          "german_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "german_stemmer"
+            ]
+          },
+          "french_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "french_stemmer"
+            ]
+          }
         }
       }
     }
-  }
+  }
+  

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ public class Transcription {`
`22`	`22`	`@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")`
`23`	`23`	`private String unicode;`
`24`	`24`
`25`		`- @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")`
	`25`	`+ @Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")`
`26`	`26`	`private String mdc;`
`27`	`27`
`28`	`28`	`}`