@@ -19,6 +19,7 @@
 import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
 import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
+import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
 import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
 import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
@@ -40,6 +41,7 @@
 import org.apache.lucene.analysis.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.CharFilterFactory;
 import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.util.ResourceLoader;
 import org.jetbrains.annotations.NotNull;
@@ -55,6 +57,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -97,7 +100,13 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
         NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
         if (defaultAnalyzer.exists()) {
             IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder();
-            Map<String, Object> analyzer = convertNodeState(defaultAnalyzer);
+            Map<String, Object> analyzer;
+            try {
+                analyzer = convertNodeState(defaultAnalyzer);
+            } catch (IOException e) {
+                LOG.warn("Cannot load the analyzer configuration; using an empty configuration", e);
+                analyzer = Map.of();
+            }
             String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
             if (builtIn == null) {
                 builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
 
             // content params, usually stop words
             for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) {
+                List<String> list;
                 try {
-                    analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION));
+                    list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION);
                 } catch (IOException e) {
-                    throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e);
+                    LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using an empty list", e);
+                    list = List.of();
                 }
+                analyzer.put(normalize(nodeEntry.getName()), list);
             }
 
             builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer)));
@@ -145,49 +157,93 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
 
     @NotNull
     private static TokenizerDefinition loadTokenizer(NodeState state) {
-        String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
-        Map<String, Object> args = convertNodeState(state);
+        String name;
+        Map<String, Object> args;
+        if (!state.exists()) {
+            LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
+            name = "Standard";
+            args = new HashMap<>();
+        } else {
+            name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
+            try {
+                args = convertNodeState(state);
+            } catch (IOException e) {
+                LOG.warn("Cannot load the tokenizer configuration; using an empty configuration", e);
+                args = new HashMap<>();
+            }
+        }
+        name = normalize(name);
+        if ("n_gram".equals(name)) {
+            // OAK-11568
+            // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
+            Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
+            Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
+            TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
+                    NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
+            return ngram;
+        }
         args.put(ANALYZER_TYPE, name);
         return new TokenizerDefinition(name, JsonData.of(args));
     }
 
+    private static Integer getIntegerSetting(Map<String, Object> args, String name, Integer defaultValue) {
+        Object value = args.getOrDefault(name, defaultValue);
+        if (!(value instanceof Integer)) {
+            LOG.warn("Setting {} value {} is not an integer; using default: {}", name, value, defaultValue);
+            return defaultValue;
+        }
+        return (Integer) value;
+    }
+
     private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
                                                               Function<String, Class<? extends AbstractAnalysisFactory>> lookup,
                                                               BiFunction<String, JsonData, FD> factory) {
         LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
         int i = 0;
-        //Need to read children in order
+        // Need to read children in order
         Tree tree = TreeFactory.createReadOnlyTree(state);
+
+        // Remember whether a "WordDelimiter" filter was configured,
+        // because it has to be removed if a synonym filter is configured as well
+        String wordDelimiterFilterKey = null;
         for (Tree t : tree.getChildren()) {
             NodeState child = state.getChildNode(t.getName());
 
             String name;
             List<String> content = null;
             List<ParameterTransformer> transformers;
+            boolean skipEntry = false;
            try {
-                Class<? extends AbstractAnalysisFactory> tff = lookup.apply(t.getName());
+                Class<? extends AbstractAnalysisFactory> analysisFactory = lookup.apply(t.getName());
 
                 List<String> unsupportedParameters =
                         UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
-                                .filter(k -> k.getKey().isAssignableFrom(tff))
+                                .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                                 .map(Map.Entry::getValue)
                                 .findFirst().orElseGet(Collections::emptyList);
                 Map<String, String> luceneArgs = StreamSupport.stream(child.getProperties().spliterator(), false)
                         .filter(ElasticCustomAnalyzer::isPropertySupported)
                         .filter(ps -> !unsupportedParameters.contains(ps.getName()))
                         .collect(Collectors.toMap(PropertyState::getName, ps -> ps.getValue(Type.STRING)));
 
-                AbstractAnalysisFactory luceneFactory = tff.getConstructor(Map.class).newInstance(luceneArgs);
+                AbstractAnalysisFactory luceneFactory = analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
                 if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
                     AbstractWordsFileFilterFactory wordsFF = ((AbstractWordsFileFilterFactory) luceneFactory);
                     // this will parse/load the content handling different formats, comments, etc
                     wordsFF.inform(new NodeStateResourceLoader(child));
                     content = wordsFF.getWords().stream().map(w -> new String(((char[]) w))).collect(Collectors.toList());
                 }
+                if (luceneFactory instanceof MappingCharFilterFactory) {
+                    MappingCharFilterFactory map = (MappingCharFilterFactory) luceneFactory;
+                    if (map.getOriginalArgs().isEmpty()) {
+                        skipEntry = true;
+                        LOG.warn("Empty CharFilter mapping: ignoring");
+                    }
+                }
 
-                name = normalize((String) tff.getField("NAME").get(null));
+                name = normalize((String) analysisFactory.getField("NAME").get(null));
                 transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
-                        .filter(k -> k.getKey().isAssignableFrom(tff))
+                        .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                         .map(Map.Entry::getValue)
                         .collect(Collectors.toList());
            } catch (Exception e) {
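
A side note on the n_gram branch above: when the index definition configures no sizes, getIntegerSetting falls back to the defaults, and the elasticsearch-java builder produces an ngram tokenizer with min_gram=2 and max_gram=3. A minimal standalone sketch, not part of the patch; the class name and printed output are illustrative, while the builder calls are the same ones the patch uses:

```java
import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;

import java.util.HashMap;
import java.util.Map;

public class NGramFallbackSketch {
    public static void main(String[] unused) {
        Map<String, Object> args = new HashMap<>(); // nothing configured in the index definition
        // same defaults the patch applies via getIntegerSetting
        int minGramSize = (int) args.getOrDefault("minGramSize", 2); // -> 2
        int maxGramSize = (int) args.getOrDefault("maxGramSize", 3); // -> 3
        TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
                NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
        // serializes to roughly: { "type": "ngram", "min_gram": 2, "max_gram": 3 }
        System.out.println(ngram);
    }
}
```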
@@ -201,6 +257,21 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
 
             Map<String, Object> args = convertNodeState(child, transformers, content);
 
+            if (name.equals("word_delimiter")) {
+                // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
+                // Elastic recommends using word_delimiter_graph instead of the word_delimiter filter,
+                // because the word_delimiter filter can produce invalid token graphs.
+                LOG.info("Replacing the word_delimiter filter with word_delimiter_graph");
+                name = "word_delimiter_graph";
+            }
+            if (name.equals("hyphenation_compound_word")) {
+                name = "hyphenation_decompounder";
+                String hyphenator = args.getOrDefault("hyphenator", "").toString();
+                LOG.info("Using the hyphenation_decompounder: " + hyphenator);
+                args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
+                args.put("word_list", List.of());
+            }
+
             // stemmer in elastic don't have language based configurations. They all stay under the stemmer config with
             // a language parameter
             if (name.endsWith("_stem")) {
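
For context, the hyphenation branch above remaps a Lucene-style hyphenation compound word configuration onto Elasticsearch's hyphenation_decompounder. Below is a hypothetical sketch of the resulting argument map; the hyphenator value is invented, and the "type" entry assumes ANALYZER_TYPE resolves to Elasticsearch's type key:

```java
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class HyphenationRewriteSketch {
    public static void main(String[] unused) {
        // hypothetical args as read from the index definition
        Map<String, Object> args = new LinkedHashMap<>(); // insertion order kept for readable output
        args.put("hyphenator", "de_DR.xml"); // hypothetical hyphenation pattern reference
        // the rewrite performed by the patch
        String name = "hyphenation_decompounder";
        args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
        args.put("word_list", List.of());
        args.put("type", name); // assuming ANALYZER_TYPE resolves to "type"
        System.out.println(args);
        // {hyphenator=de_DR.xml, hyphenation_patterns_path=analysis/hyphenation_patterns.xml,
        //  word_list=[], type=hyphenation_decompounder}
    }
}
```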
@@ -221,14 +292,31 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
             }
             args.put(ANALYZER_TYPE, name);
 
-            filters.put(name + "_" + i, factory.apply(name, JsonData.of(args)));
+            if (skipEntry) {
+                continue;
+            }
+            String key = name + "_" + i;
+            filters.put(key, factory.apply(name, JsonData.of(args)));
+            if (name.equals("word_delimiter_graph")) {
+                wordDelimiterFilterKey = key;
+            } else if (name.equals("synonym")) {
+                if (wordDelimiterFilterKey != null) {
+                    LOG.info("Removing the word delimiter filter because there is a synonym filter as well: " + wordDelimiterFilterKey);
+                    filters.remove(wordDelimiterFilterKey);
+                }
+            }
             i++;
         }
         return filters;
     }
 
     private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException {
-        Blob blob = ConfigUtil.getBlob(file, name);
+        Blob blob;
+        try {
+            blob = ConfigUtil.getBlob(file, name);
+        } catch (IllegalArgumentException | IllegalStateException e) {
+            throw new IOException("Could not load " + name, e);
+        }
         try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) {
             try (BufferedReader br = new BufferedReader(content)) {
                 return br.lines()
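
The word delimiter bookkeeping above relies on LinkedHashMap preserving insertion order: removing the remembered key drops only that filter while the rest of the chain keeps its sequence. This is presumably needed because Elasticsearch refuses to parse synonyms through graph-producing token filters such as word_delimiter_graph. A standalone sketch with illustrative filter keys:

```java
import java.util.LinkedHashMap;

public class FilterOrderSketch {
    public static void main(String[] unused) {
        LinkedHashMap<String, String> filters = new LinkedHashMap<>();
        filters.put("lowercase_0", "...");
        String wordDelimiterFilterKey = "word_delimiter_graph_1"; // remembered when inserted
        filters.put(wordDelimiterFilterKey, "...");
        filters.put("synonym_2", "...");        // a synonym filter appears later in the chain
        filters.remove(wordDelimiterFilterKey); // so the word delimiter entry is dropped
        System.out.println(filters.keySet());   // [lowercase_0, synonym_2], order preserved
    }
}
```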
@@ -264,11 +352,25 @@ private static String normalize(String value) {
         return name;
     }
 
-    private static Map<String, Object> convertNodeState(NodeState state) {
-        return convertNodeState(state, List.of(), List.of());
+    private static Map<String, Object> convertNodeState(NodeState state) throws IOException {
+        try {
+            return convertNodeState(state, List.of(), List.of());
+        } catch (IllegalStateException e) {
+            // convert the runtime exception back to a checked exception
+            throw new IOException("Cannot convert the analyzer configuration", e);
+        }
     }
 
-    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) {
+    /**
+     * Reads the analyzer configuration from a node state.
+     *
+     * @param state the node state to read from
+     * @param transformers the transformers that map Lucene parameters to their Elastic counterparts
+     * @param preloadedContent the content if it was already loaded, or null to load it from the node state
+     * @return the analyzer configuration parameters
+     * @throws IllegalStateException if the content cannot be loaded
+     */
+    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException {
         Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false)
                 .filter(ElasticCustomAnalyzer::isPropertySupported)
                 .collect(Collectors.toMap(PropertyState::getName, ps -> {
@@ -280,6 +382,8 @@ private static Map<String, Object> convertNodeState(NodeState state, List<Parame
                         return loadContent(state.getChildNode(v.trim()), v.trim(),
                                 CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
                     } catch (IOException e) {
+                        // convert the checked exception to a runtime exception,
+                        // because the stream API doesn't support checked exceptions
                        throw new IllegalStateException(e);
                    }
                }).collect(Collectors.toList()));
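
The comments above describe a round trip: loadContent throws a checked IOException, the stream lambda tunnels it as an unchecked IllegalStateException, and the single-argument convertNodeState converts it back at the method boundary. A minimal self-contained sketch of the pattern, with hypothetical names:

```java
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

public class ExceptionTunnelSketch {
    static String load(String name) throws IOException {
        if (name.isEmpty()) {
            throw new IOException("empty name");
        }
        return name.toUpperCase();
    }

    static List<String> loadAll(List<String> names) throws IOException {
        try {
            return names.stream()
                    .map(n -> {
                        try {
                            return load(n);
                        } catch (IOException e) {
                            // the stream API doesn't allow checked exceptions here
                            throw new IllegalStateException(e);
                        }
                    })
                    .collect(Collectors.toList());
        } catch (IllegalStateException e) {
            // restore the checked exception at the boundary
            throw new IOException("Cannot load", e);
        }
    }
}
```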