MdC-Suche, Filteranpassung

dwerning · web-flow · commit 38c7239c0f8e · 2022-09-02T16:17:35.000+02:00
diff --git a/src/main/java/tla/backend/es/model/parts/Transcription.java b/src/main/java/tla/backend/es/model/parts/Transcription.java
@@ -22,7 +22,7 @@ public class Transcription {
     @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
     private String unicode;
 
-    @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")
+    @Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")
     private String mdc;
 
 }
diff --git a/src/main/java/tla/backend/es/query/LemmaSearchQueryBuilder.java b/src/main/java/tla/backend/es/query/LemmaSearchQueryBuilder.java
@@ -49,16 +49,15 @@ public void setScript(List<Script> scripts) {
     }
 
     public void setTranscription(TranscriptionSpec transcription) {
-    	System.out.println("Sent to query ");
+    	  //System.out.println("Sent to query ");
         if (transcription.getText() != null) {
-        	//int posEnc=transcription.indexOf("|");
-        	//String encod=transcription.substring(posEnc+1);
-        	System.out.println("Sent to query "+transcription.getEnc()[0]);
+        	//System.out.println("Sent to query "+transcription.getEnc()[0]);
         	if (transcription.getEnc()[0].equals("mdc")){
-        		
-        		this.must(regexpQuery("transcription.mdc", maskRegExTranscription(transcription.getText())));
+					this.must(regexpQuery("transcription.mdc", maskRegExTranscription(transcription.getText())));
         	}
-        	else this.must(regexpQuery("transcription.unicode", maskRegExTranscription(transcription.getText())));
+        	else {
+					this.must(regexpQuery("transcription.unicode", maskRegExTranscription(normalizeUnicode(transcription.getText()))));
+			}
 			// works with Unicode only?
         }
         
@@ -79,12 +78,10 @@ public void setTranscription(TranscriptionSpec transcription) {
         }
        
     }*/
-    
-    public String maskRegExTranscription(String transcription) {
-        if (transcription != null) {
-			transcription = transcription.trim(); // cut whitespaces
-			transcription = transcription.replaceAll("\\s+", " ");
 
+
+	public String normalizeUnicode(String transcription) {
+        if (transcription != null) {
 			// case insensitive search (ES analyzer also indexes lowercase)
 			transcription = transcription.toLowerCase(); 
 			transcription = transcription.replace("h\u0331", "ẖ"); // no atomic char as capital, now lowercase
@@ -121,16 +118,24 @@ public String maskRegExTranscription(String transcription) {
 			transcription = transcription.replace("u\u032f", "u");  // atomic workaround for ult.-inf.-u
 			transcription = transcription.replace("\u0131\u0357", "\ua7bd");  // BTS yod => Egyptological Yod
 			transcription = transcription.replace("h\u032d", "\u0125"); // atomic workaround for demotic h
+       }
+		return transcription;
+	}
+	
+    public String maskRegExTranscription(String transcription) {
+        if (transcription != null) {
+			transcription = transcription.trim(); // cut whitespaces
+			transcription = transcription.replaceAll("\\s+", " ");
 
-			// Maskieren (nicht ignorieren)
+            // Maskieren (nicht ignorieren)
             transcription = transcription.replace(".", "\\."); 
             transcription = transcription.replace("-", "\\-"); 
             transcription = transcription.replace("+", "\\+"); 
-						
+            
             // treatment of "(  )" als Options-Marker
             // transcription = transcription.replace("(", ""); 
             transcription = transcription.replace(")", ")?"); // ### to do: abfangen, wenn Klammern nicht ordentlich öffnen/schließen															
-
+            
             // ignorieren: query und ES-Indizierung
             transcription = transcription.replace("{", ""); 
             transcription = transcription.replace("}", ""); 
@@ -140,26 +145,25 @@ public String maskRegExTranscription(String transcription) {
             transcription = transcription.replace("〉", ""); 
             transcription = transcription.replace("⸮", ""); 
             // "?", "[" , and "]" are part of allowed RegEx syntax
-			
-			// BTS wildcards (any sign)
+            
+            // BTS wildcards (any sign)
             transcription = transcription.replace("§", "."); // "§" in legacyTLA 
             transcription = transcription.replace("*", "."); // "*" new in newTLA 
-			
-			// treatment of right end
+            
+            // treatment of right end
 			if (transcription.endsWith("$")) { // "$": wirkliches String-Ende
 				transcription = transcription.replace("$", ""); // remove "$" (all, just to be sure)
 			} else {
 				transcription = transcription + ".*"; // right: any signs may follow
 			}
 			
 			// treatment of left end
-			if (transcription.startsWith("^")) { // "^": search at beginning of lemma transliteration
-				transcription = transcription.replace("^", ""); // remove "^" (all, just to be sure)
+			if (transcription.startsWith(">")) { // ">": search at beginning or in the middle of lemma transliteration
+				transcription = transcription.replace(">", ""); // remove ">" (all, just to be sure)
+				// find words in the middle too
+				transcription = "(.+[\\- ])?" + transcription; // left: anything at beginning of lemma or after "-" or blank
 			} else if (transcription.startsWith("\\-")) { // "-": search of non-first lemma
 				transcription = "(.+)?" + transcription; // left: anything may occur before the '-'
-			} else {
-				// find words in the middle too
-				transcription = "(.+[\\- ])?" + transcription; // left: anything at beginnig of lemma or after "-" or blank 
 			} 
        }
 		return transcription;
diff --git a/src/main/resources/elasticsearch/settings/indices/lemma.json b/src/main/resources/elasticsearch/settings/indices/lemma.json
@@ -64,8 +64,10 @@
           "type": "mapping",
           "mappings": [
             ".t.pl => .wt",
+            ".t:pl => .wt",
             ".tpl => .wt",
             "t.du => .tj",
+            "t:du => .tj",
             ".tdu => .tj",
             ",t,pl => ,wt",
             ",tpl => ,wt",
@@ -111,6 +113,16 @@
             "transcription_suffix_filter"
           ]
         },
+        "mdc_analyzer": {
+          "type": "custom",
+          "tokenizer": "keyword",
+          "char_filter": [
+            "whitespaces_compressor",
+            "transcription_special_signs_filter",
+            "transcription_brackets_filter",
+            "transcription_suffix_filter"
+          ]
+        },
         "hieroglyph_analyzer": {
           "type": "custom",
           "tokenizer": "hieroglyph_tokenizer"
diff --git a/src/main/resources/elasticsearch/settings/indices/sentence.json b/src/main/resources/elasticsearch/settings/indices/sentence.json
@@ -25,25 +25,35 @@
               "^",
               "&",
               "(",
-			  ")"
+              ")"
             ]
           }
         },
-         "filter": {
-           "english_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           },
-           "german_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           },
-           "french_stemmer": {
-             "type":       "stemmer",
-             "language":   "german"
-           }
-        },
+        "filter": {
+          "english_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          },
+          "german_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          },
+          "french_stemmer": {
+            "type":       "stemmer",
+            "language":   "german"
+          }
+  	  },
         "char_filter": {
+          "whitespaces_compressor": {
+            "type": "pattern_replace",
+            "pattern": "\\s+",
+            "replacement": " "
+          },
+          "transcription_special_signs_filter": {
+            "type": "pattern_replace",
+            "pattern": "[\\*]",
+            "replacement": ""
+          },
           "transcription_brackets_filter": {
             "type": "pattern_replace",
             "pattern": "[\\[\\]\\(\\)?\\u2e2e\\u2e22\\u2e23\\u2329\\u232a]|\\{\\S*\\}",
@@ -52,6 +62,12 @@
           "transcription_suffix_filter": {
             "type": "mapping",
             "mappings": [
+              ".t.pl => .wt",
+              ".t:pl => .wt",
+              ".tpl => .wt",
+              "t.du => .tj",
+              "t:du => .tj",
+              ".tdu => .tj",
               ",t,pl => ,wt",
               ",tpl => ,wt",
               "t,du => ,tj",
@@ -60,13 +76,48 @@
               "pl => w",
               ",  => ."
             ]
+          },
+          "transcription_unicode_normalizer": {
+            "type": "mapping",
+            "mappings": [
+              "h\\u0331 => \\u1e96",
+              "H\\u0331 => \\u1e96"
+            ]
+          },
+          "transcription_unicode_workaround": {
+            "type": "mapping",
+            "mappings": [
+              "i\\u032f => i",
+              "u\\u032f => u",
+              "\\u0131\\u0357 => \\ua7bd",
+              "I\\u0357 => \\ua7bd",
+              "h\\u032d => \\u0125",
+              "H\\u032d => \\u0125"
+            ]
           }
         },
         "analyzer": {
           "transcription_analyzer": {
             "type": "custom",
-            "tokenizer": "whitespace",
+            "tokenizer": "keyword",
+             "filter": [
+              "lowercase"
+              ],
+            "char_filter": [
+              "whitespaces_compressor",
+              "transcription_unicode_normalizer",
+              "transcription_unicode_workaround",
+              "transcription_special_signs_filter",
+              "transcription_brackets_filter",
+              "transcription_suffix_filter"
+            ]
+          },
+          "mdc_analyzer": {
+            "type": "custom",
+            "tokenizer": "keyword",
             "char_filter": [
+              "whitespaces_compressor",
+              "transcription_special_signs_filter",
               "transcription_brackets_filter",
               "transcription_suffix_filter"
             ]
@@ -75,31 +126,32 @@
             "type": "custom",
             "tokenizer": "hieroglyph_tokenizer"
           },
-           "english_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "english_stemmer"
-             ]
-           },
-           "german_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "german_stemmer"
-             ]
-           },
-           "french_without_stopwords": {
-             "type":"custom",
-             "tokenizer": "standard",
-             "filter": [
-               "lowercase",
-               "french_stemmer"
-             ]
-           }
+          "english_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "english_stemmer"
+            ]
+          },
+          "german_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "german_stemmer"
+            ]
+          },
+          "french_without_stopwords": {
+            "type":"custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "french_stemmer"
+            ]
+          }
         }
       }
     }
-  }
+  }
+  

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ public class Transcription {`
`22`	`22`	`@Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")`
`23`	`23`	`private String unicode;`
`24`	`24`
`25`		`- @Field(type = FieldType.Text, analyzer = "transcription_analyzer", searchAnalyzer = "transcription_analyzer")`
	`25`	`+ @Field(type = FieldType.Text, analyzer = "mdc_analyzer", searchAnalyzer = "mdc_analyzer")`
`26`	`26`	`private String mdc;`
`27`	`27`
`28`	`28`	`}`