@@ -49,16 +49,15 @@ public void setScript(List<Script> scripts) {
4949 }
5050
5151 public void setTranscription (TranscriptionSpec transcription ) {
52- System .out .println ("Sent to query " );
52+ // System.out.println("Sent to query ");
5353 if (transcription .getText () != null ) {
54- //int posEnc=transcription.indexOf("|");
55- //String encod=transcription.substring(posEnc+1);
56- System .out .println ("Sent to query " +transcription .getEnc ()[0 ]);
54+ //System.out.println("Sent to query "+transcription.getEnc()[0]);
5755 if (transcription .getEnc ()[0 ].equals ("mdc" )){
58-
59- this .must (regexpQuery ("transcription.mdc" , maskRegExTranscription (transcription .getText ())));
56+ this .must (regexpQuery ("transcription.mdc" , maskRegExTranscription (transcription .getText ())));
6057 }
61- else this .must (regexpQuery ("transcription.unicode" , maskRegExTranscription (transcription .getText ())));
58+ else {
59+ this .must (regexpQuery ("transcription.unicode" , maskRegExTranscription (normalizeUnicode (transcription .getText ()))));
60+ }
6261 // works with Unicode only?
6362 }
6463
@@ -79,12 +78,10 @@ public void setTranscription(TranscriptionSpec transcription) {
7978 }
8079
8180 }*/
82-
83- public String maskRegExTranscription (String transcription ) {
84- if (transcription != null ) {
85- transcription = transcription .trim (); // cut whitespaces
86- transcription = transcription .replaceAll ("\\ s+" , " " );
8781
82+
83+ public String normalizeUnicode (String transcription ) {
84+ if (transcription != null ) {
8885 // case insensitive search (ES analyzer also indexes lowercase)
8986 transcription = transcription .toLowerCase ();
9087 transcription = transcription .replace ("h\u0331 " , "ẖ" ); // no atomic char as capital, now lowercase
@@ -121,16 +118,24 @@ public String maskRegExTranscription(String transcription) {
121118 transcription = transcription .replace ("u\u032f " , "u" ); // atomic workaround for ult.-inf.-u
122119 transcription = transcription .replace ("\u0131 \u0357 " , "\ua7bd " ); // BTS yod => Egyptological Yod
123120 transcription = transcription .replace ("h\u032d " , "\u0125 " ); // atomic workaround for demotic h
121+ }
122+ return transcription ;
123+ }
124+
125+ public String maskRegExTranscription (String transcription ) {
126+ if (transcription != null ) {
127+ transcription = transcription .trim (); // cut whitespaces
128+ transcription = transcription .replaceAll ("\\ s+" , " " );
124129
125- // Maskieren (nicht ignorieren)
130+ // Mask (escape) these characters rather than ignoring them
126131 transcription = transcription .replace ("." , "\\ ." );
127132 transcription = transcription .replace ("-" , "\\ -" );
128133 transcription = transcription .replace ("+" , "\\ +" );
129-
134+
130135 // treatment of "( )" as a marker for optional parts
131136 // transcription = transcription.replace("(", "");
132137 transcription = transcription .replace (")" , ")?" ); // ### to do: abfangen, wenn Klammern nicht ordentlich öffnen/schließen
133-
138+
134139 // ignored: in the query and in ES indexing
135140 transcription = transcription .replace ("{" , "" );
136141 transcription = transcription .replace ("}" , "" );
@@ -140,26 +145,25 @@ public String maskRegExTranscription(String transcription) {
140145 transcription = transcription .replace ("〉" , "" );
141146 transcription = transcription .replace ("⸮" , "" );
142147 // "?", "[" , and "]" are part of allowed RegEx syntax
143-
144- // BTS wildcards (any sign)
148+
149+ // BTS wildcards (any sign)
145150 transcription = transcription .replace ("§" , "." ); // "§" in legacyTLA
146151 transcription = transcription .replace ("*" , "." ); // "*" new in newTLA
147-
148- // treatment of right end
152+
153+ // treatment of right end
149154 if (transcription .endsWith ("$" )) { // "$": wirkliches String-Ende
150155 transcription = transcription .replace ("$" , "" ); // remove "$" (all, just to be sure)
151156 } else {
152157 transcription = transcription + ".*" ; // right: any signs may follow
153158 }
154159
155160 // treatment of left end
156- if (transcription .startsWith ("^" )) { // "^": search at beginning of lemma transliteration
157- transcription = transcription .replace ("^" , "" ); // remove "^" (all, just to be sure)
161+ if (transcription .startsWith (">" )) { // ">": search at beginning or in the middle of lemma transliteration
162+ transcription = transcription .replace (">" , "" ); // remove ">" (all, just to be sure)
163+ // find words in the middle too
164+ transcription = "(.+[\\ - ])?" + transcription ; // left: anything at beginning of lemma or after "-" or blank
158165 } else if (transcription .startsWith ("\\ -" )) { // "-": search of non-first lemma
159166 transcription = "(.+)?" + transcription ; // left: anything may occur before the '-'
160- } else {
161- // find words in the middle too
162- transcription = "(.+[\\ - ])?" + transcription ; // left: anything at beginnig of lemma or after "-" or blank
163167 }
164168 }
165169 return transcription ;
0 commit comments