
OAK-11568 Elastic: improved compatibility for aggregation definitions #2193


Merged (6 commits) on Mar 31, 2025
@@ -16,9 +16,6 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;

import org.apache.commons.io.FilenameUtils;
import org.apache.jackrabbit.oak.api.jmx.CacheStatsMBean;
import org.apache.jackrabbit.oak.cache.CacheStats;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
import org.apache.jackrabbit.oak.plugins.index.AsyncIndexInfoService;
@@ -50,13 +47,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.ArrayList;
import java.util.Dictionary;
import java.util.Hashtable;
import java.util.List;

import static org.apache.commons.io.FileUtils.ONE_MB;
import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.registerMBean;
import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.scheduleWithFixedDelay;

@@ -130,8 +125,6 @@ public class ElasticIndexProviderService {

private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexProviderService.class);

private static final String REPOSITORY_HOME = "repository.home";

@Reference
private StatisticsProvider statisticsProvider;

@@ -149,11 +142,10 @@ public class ElasticIndexProviderService {

private ExtractedTextCache extractedTextCache;

private final List<ServiceRegistration> regs = new ArrayList<>();
private final List<ServiceRegistration<?>> regs = new ArrayList<>();
private final List<Registration> oakRegs = new ArrayList<>();

private Whiteboard whiteboard;
private File textExtractionDir;

private ElasticConnection elasticConnection;
private ElasticMetricHandler metricHandler;
@@ -200,7 +192,7 @@ private void activate(BundleContext bundleContext, Config config) {

@Deactivate
private void deactivate() {
for (ServiceRegistration reg : regs) {
for (ServiceRegistration<?> reg : regs) {
reg.unregister();
}

@@ -242,63 +234,6 @@ private void registerIndexEditor(BundleContext bundleContext) {
Dictionary<String, Object> props = new Hashtable<>();
props.put("type", ElasticIndexDefinition.TYPE_ELASTICSEARCH);
regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, props));
// oakRegs.add(registerMBean(whiteboard,
// TextExtractionStatsMBean.class,
// editorProvider.getExtractedTextCache().getStatsMBean(),
// TextExtractionStatsMBean.TYPE,
// "TextExtraction statistics"));
}

private void initializeExtractedTextCache(final Config config, StatisticsProvider statisticsProvider) {

extractedTextCache = new ExtractedTextCache(
config.extractedTextCacheSizeInMB() * ONE_MB,
config.extractedTextCacheExpiryInSecs(),
config.alwaysUsePreExtractedCache(),
textExtractionDir,
statisticsProvider);
if (extractedTextProvider != null) {
registerExtractedTextProvider(extractedTextProvider);
}
CacheStats stats = extractedTextCache.getCacheStats();
if (stats != null) {
oakRegs.add(registerMBean(whiteboard,
CacheStatsMBean.class, stats,
CacheStatsMBean.TYPE, stats.getName()));
LOG.info("Extracted text caching enabled with maxSize {} MB, expiry time {} secs",
config.extractedTextCacheSizeInMB(), config.extractedTextCacheExpiryInSecs());
}
}

private void initializeTextExtractionDir(BundleContext bundleContext, Config config) {
String textExtractionDir = config.localTextExtractionDir();
if (textExtractionDir.trim().isEmpty()) {
String repoHome = bundleContext.getProperty(REPOSITORY_HOME);
if (repoHome != null) {
textExtractionDir = FilenameUtils.concat(repoHome, "index");
}
}

if (textExtractionDir == null) {
throw new IllegalStateException(String.format("Text extraction directory cannot be determined as neither " +
"directory path [%s] nor repository home [%s] defined", PROP_LOCAL_TEXT_EXTRACTION_DIR, REPOSITORY_HOME));
}

this.textExtractionDir = new File(textExtractionDir);
}

private void registerExtractedTextProvider(PreExtractedTextProvider provider) {
if (extractedTextCache != null) {
if (provider != null) {
String usage = extractedTextCache.isAlwaysUsePreExtractedCache() ?
"always" : "only during reindexing phase";
LOG.info("Registering PreExtractedTextProvider {} with extracted text cache. " +
"It would be used {}", provider, usage);
} else {
LOG.info("Unregistering PreExtractedTextProvider with extracted text cache");
}
extractedTextCache.setExtractedTextProvider(provider);
}
}

private ElasticConnection getElasticConnection(Config contextConfig) {
@@ -158,7 +158,7 @@ private BulkIngester<String> initBulkIngester() {

private void checkFailures() throws IOException {
if (!suppressedErrorCauses.isEmpty()) {
IOException ioe = new IOException("Exception while indexing. See suppressed for details");
IOException ioe = new IOException("Exception while indexing " + indexName + ". See suppressed for details");
suppressedErrorCauses.stream().map(ec -> new IllegalStateException(ec.reason())).forEach(ioe::addSuppressed);
throw ioe;
}
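A minimal sketch of how a caller could surface the suppressed per-item causes attached above; the call site is hypothetical (checkFailures is private in this class), everything else is plain JDK and SLF4J:

try {
    checkFailures(); // hypothetical call site, e.g. after flushing a bulk request
} catch (IOException e) {
    LOG.error(e.getMessage());
    for (Throwable cause : e.getSuppressed()) { // one IllegalStateException per failed bulk item
        LOG.error("  suppressed: {}", cause.getMessage());
    }
}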
@@ -19,6 +19,7 @@
import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
@@ -40,6 +41,7 @@
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.jetbrains.annotations.NotNull;
@@ -55,6 +57,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -97,7 +100,13 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
if (defaultAnalyzer.exists()) {
IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder();
Map<String, Object> analyzer = convertNodeState(defaultAnalyzer);
Map<String, Object> analyzer;
try {
analyzer = convertNodeState(defaultAnalyzer);
} catch (IOException e) {
LOG.warn("Cannot load analyzer; using an empty configuration", e);
analyzer = Map.of();
}
String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
if (builtIn == null) {
builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@

// content params, usually stop words
for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) {
List<String> list;
try {
analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION));
list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION);
} catch (IOException e) {
throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e);
LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using empty list", e);
list = List.of();
}
analyzer.put(normalize(nodeEntry.getName()), list);
}

builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer)));
@@ -145,49 +157,93 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state

@NotNull
private static TokenizerDefinition loadTokenizer(NodeState state) {
String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
Map<String, Object> args = convertNodeState(state);
String name;
Map<String, Object> args;
if (!state.exists()) {
LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
name = "Standard";
args = new HashMap<String, Object>();
} else {
name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
try {
args = convertNodeState(state);
} catch (IOException e) {
LOG.warn("Cannot load tokenizer; using an empty configuration", e);
args = new HashMap<String, Object>();
}
}
name = normalize(name);
if ("n_gram".equals(name)) {
// OAK-11568
// https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
return ngram;
}
Comment on lines +176 to +184

Contributor: This is okay for now. We should structure it better to cover all the possible tokenizers (https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). This can go in a separate PR.

Member Author: Yes, I agree!

args.put(ANALYZER_TYPE, name);
return new TokenizerDefinition(name, JsonData.of(args));
}

private static Integer getIntegerSetting(Map<String, Object> args, String name, Integer defaultValue) {
Object value = args.getOrDefault(name, defaultValue);
if (!(value instanceof Integer)) {
LOG.warn("Setting {} value {} is not an integer; using default: {}", name, value, defaultValue);
return defaultValue;
}
return (Integer) value;
}
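To make the two branches of loadTokenizer concrete, here is a hedged sketch; the values and the "whitespace" fallback are assumptions rather than something taken from this PR, while the client calls mirror the ones used above:

Map<String, Object> ngramArgs = new HashMap<>();
ngramArgs.put("minGramSize", 2);
ngramArgs.put("maxGramSize", 3);
// non-integer values fall back to the defaults via getIntegerSetting
int minGram = getIntegerSetting(ngramArgs, "minGramSize", 2);
int maxGram = getIntegerSetting(ngramArgs, "maxGramSize", 3);
// the n_gram case goes through the typed builder of the Elasticsearch client
TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
        NGramTokenizer.of(n -> n.minGram(minGram).maxGram(maxGram))));

// any other tokenizer name falls through to the generic branch and is passed untyped,
// with its arguments serialized as a raw JSON object
Map<String, Object> otherArgs = new HashMap<>();
otherArgs.put(ANALYZER_TYPE, "whitespace");
TokenizerDefinition generic = new TokenizerDefinition("whitespace", JsonData.of(otherArgs));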

private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
Function<String, Class<? extends AbstractAnalysisFactory>> lookup,
BiFunction<String, JsonData, FD> factory) {
LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
int i = 0;
//Need to read children in order
// Need to read children in order
Tree tree = TreeFactory.createReadOnlyTree(state);

// We need to remember that a "WordDelimiter" was configured,
// because we have to remove it if a synonyms filter is configured as well
String wordDelimiterFilterKey = null;
for (Tree t : tree.getChildren()) {
NodeState child = state.getChildNode(t.getName());

String name;
List<String> content = null;
List<ParameterTransformer> transformers;
boolean skipEntry = false;
try {
Class<? extends AbstractAnalysisFactory> tff = lookup.apply(t.getName());
Class<? extends AbstractAnalysisFactory> analysisFactory = lookup.apply(t.getName());

List<String> unsupportedParameters =
UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
.filter(k -> k.getKey().isAssignableFrom(tff))
.filter(k -> k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.findFirst().orElseGet(Collections::emptyList);
Map<String, String> luceneArgs = StreamSupport.stream(child.getProperties().spliterator(), false)
.filter(ElasticCustomAnalyzer::isPropertySupported)
.filter(ps -> !unsupportedParameters.contains(ps.getName()))
.collect(Collectors.toMap(PropertyState::getName, ps -> ps.getValue(Type.STRING)));

AbstractAnalysisFactory luceneFactory = tff.getConstructor(Map.class).newInstance(luceneArgs);
AbstractAnalysisFactory luceneFactory = analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
AbstractWordsFileFilterFactory wordsFF = ((AbstractWordsFileFilterFactory) luceneFactory);
// this will parse/load the content handling different formats, comments, etc
wordsFF.inform(new NodeStateResourceLoader(child));
content = wordsFF.getWords().stream().map(w -> new String(((char[]) w))).collect(Collectors.toList());
}
if (luceneFactory instanceof MappingCharFilterFactory) {
MappingCharFilterFactory map = (MappingCharFilterFactory) luceneFactory;
if (map.getOriginalArgs().isEmpty()) {
skipEntry = true;
LOG.warn("Empty CharFilter mapping: ignoring");
}
}

name = normalize((String) tff.getField("NAME").get(null));
name = normalize((String) analysisFactory.getField("NAME").get(null));
transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
.filter(k -> k.getKey().isAssignableFrom(tff))
.filter(k -> k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.collect(Collectors.toList());
} catch (Exception e) {
@@ -201,6 +257,21 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,

Map<String, Object> args = convertNodeState(child, transformers, content);

if (name.equals("word_delimiter")) {
// https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
// We recommend using the word_delimiter_graph instead of the word_delimiter filter.
// The word_delimiter filter can produce invalid token graphs.
LOG.info("Replacing the word delimiter filter with the word delimiter graph");
name = "word_delimiter_graph";
}
if (name.equals("hyphenation_compound_word")) {
name = "hyphenation_decompounder";
String hyphenator = args.getOrDefault("hyphenator", "").toString();
LOG.info("Using the hyphenation_decompounder: " + hyphenator);
args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
Contributor: Should "analysis/hyphenation_patterns.xml" be installed in the Elastic nodes?

Member Author: I wanted to use a fixed name, so it is possible to configure it. Installing this would have to be done manually, and we need to document it.

args.put("word_list", List.of());
}

// stemmers in Elastic don't have language-based configurations. They all stay under the stemmer config with
// a language parameter
if (name.endsWith("_stem")) {
@@ -221,14 +292,31 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
}
args.put(ANALYZER_TYPE, name);

filters.put(name + "_" + i, factory.apply(name, JsonData.of(args)));
if (skipEntry) {
continue;
}
String key = name + "_" + i;
filters.put(key, factory.apply(name, JsonData.of(args)));
if (name.equals("word_delimiter_graph")) {
wordDelimiterFilterKey = key;
} else if (name.equals("synonym")) {
if (wordDelimiterFilterKey != null) {
LOG.info("Removing word delimiter because there is a synonyms filter as well: " + wordDelimiterFilterKey);
filters.remove(wordDelimiterFilterKey);
}
}
Comment on lines +295 to +307

Contributor: Another option could be the use of https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-multiplexer-tokenfilter.html. We can work on this in a separate PR.

Member Author: Yes, I also thought about that. I didn't find a very good documentation about it yet.

i++;
}
return filters;
}
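To summarize the renaming applied in this method, a small illustrative sketch follows; the filter names and positions are made up, not taken from a real index definition. Keys follow the "<elastic name>_<position>" pattern, WordDelimiter is upgraded to the graph variant, HyphenationCompoundWord becomes hyphenation_decompounder (which, per the review thread above, relies on analysis/hyphenation_patterns.xml being installed manually on the Elasticsearch nodes, presumably relative to their config directory), and a later synonym filter causes the word delimiter entry to be dropped again:

// illustrative names only
LinkedHashMap<String, String> filterNames = new LinkedHashMap<>();
filterNames.put("word_delimiter_graph_0", "word_delimiter_graph");         // from WordDelimiter
filterNames.put("hyphenation_decompounder_1", "hyphenation_decompounder"); // from HyphenationCompoundWord
filterNames.put("synonym_2", "synonym");                                   // from Synonym
filterNames.remove("word_delimiter_graph_0"); // removed because a synonym filter follows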

private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException {
Blob blob = ConfigUtil.getBlob(file, name);
Blob blob;
try {
blob = ConfigUtil.getBlob(file, name);
} catch (IllegalArgumentException | IllegalStateException e) {
throw new IOException("Could not load " + name, e);
}
try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) {
try (BufferedReader br = new BufferedReader(content)) {
return br.lines()
@@ -264,11 +352,25 @@ private static String normalize(String value) {
return name;
}

private static Map<String, Object> convertNodeState(NodeState state) {
return convertNodeState(state, List.of(), List.of());
private static Map<String, Object> convertNodeState(NodeState state) throws IOException {
try {
return convertNodeState(state, List.of(), List.of());
} catch (IllegalStateException e) {
// convert runtime exception back to checked exception
throw new IOException("Cannot convert", e);
}
}
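The checked/unchecked conversion above (and the IllegalStateException thrown inside the stream lambda further down) follows a common Java pattern. A generic, self-contained illustration of it, unrelated to Oak or Elasticsearch (uses java.nio.file and java.util.stream):

static List<String> readAll(List<Path> files) throws IOException {
    try {
        return files.stream()
                .map(p -> {
                    try {
                        return Files.readString(p);
                    } catch (IOException e) {
                        throw new IllegalStateException(e); // tunnel the checked exception out of the lambda
                    }
                })
                .collect(Collectors.toList());
    } catch (IllegalStateException e) {
        throw new IOException("Cannot read files", e); // convert back at the method boundary
    }
}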

private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) {
/**
 * Reads the analyzer configuration from a node state and converts it into a parameter map
 * for the Elasticsearch analyzer definition.
 *
 * @param state the node state holding the configuration
 * @param transformers transformations applied to the Lucene parameters before they are passed to Elasticsearch
 * @param preloadedContent content (e.g. word lists) that has already been loaded for this entry, if any
 * @return the analyzer parameters
 * @throws IllegalStateException if referenced content cannot be loaded
 */
private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException {
Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false)
.filter(ElasticCustomAnalyzer::isPropertySupported)
.collect(Collectors.toMap(PropertyState::getName, ps -> {
@@ -280,6 +382,8 @@ private static Map<String, Object> convertNodeState(NodeState state, List<Parame
return loadContent(state.getChildNode(v.trim()), v.trim(),
CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
} catch (IOException e) {
// convert the checked exception to a runtime exception,
// because the stream API doesn't support checked exceptions
throw new IllegalStateException(e);
}
}).collect(Collectors.toList()));