
OAK-11568 Elastic: improved compatibility for aggregation definitions #2193


Merged (6 commits) on Mar 31, 2025
@@ -16,9 +16,6 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;

import org.apache.commons.io.FilenameUtils;
import org.apache.jackrabbit.oak.api.jmx.CacheStatsMBean;
import org.apache.jackrabbit.oak.cache.CacheStats;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
import org.apache.jackrabbit.oak.plugins.index.AsyncIndexInfoService;
@@ -50,13 +47,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.ArrayList;
import java.util.Dictionary;
import java.util.Hashtable;
import java.util.List;

import static org.apache.commons.io.FileUtils.ONE_MB;
import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.registerMBean;
import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.scheduleWithFixedDelay;

@@ -130,8 +125,6 @@ public class ElasticIndexProviderService {

private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexProviderService.class);

private static final String REPOSITORY_HOME = "repository.home";

@Reference
private StatisticsProvider statisticsProvider;

@@ -149,11 +142,10 @@ public class ElasticIndexProviderService {

private ExtractedTextCache extractedTextCache;

private final List<ServiceRegistration> regs = new ArrayList<>();
private final List<ServiceRegistration<?>> regs = new ArrayList<>();
private final List<Registration> oakRegs = new ArrayList<>();

private Whiteboard whiteboard;
private File textExtractionDir;

private ElasticConnection elasticConnection;
private ElasticMetricHandler metricHandler;
@@ -200,7 +192,7 @@ private void activate(BundleContext bundleContext, Config config) {

@Deactivate
private void deactivate() {
for (ServiceRegistration reg : regs) {
for (ServiceRegistration<?> reg : regs) {
reg.unregister();
}

@@ -242,63 +234,6 @@ private void registerIndexEditor(BundleContext bundleContext) {
Dictionary<String, Object> props = new Hashtable<>();
props.put("type", ElasticIndexDefinition.TYPE_ELASTICSEARCH);
regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, props));
// oakRegs.add(registerMBean(whiteboard,
// TextExtractionStatsMBean.class,
// editorProvider.getExtractedTextCache().getStatsMBean(),
// TextExtractionStatsMBean.TYPE,
// "TextExtraction statistics"));
}

private void initializeExtractedTextCache(final Config config, StatisticsProvider statisticsProvider) {

extractedTextCache = new ExtractedTextCache(
config.extractedTextCacheSizeInMB() * ONE_MB,
config.extractedTextCacheExpiryInSecs(),
config.alwaysUsePreExtractedCache(),
textExtractionDir,
statisticsProvider);
if (extractedTextProvider != null) {
registerExtractedTextProvider(extractedTextProvider);
}
CacheStats stats = extractedTextCache.getCacheStats();
if (stats != null) {
oakRegs.add(registerMBean(whiteboard,
CacheStatsMBean.class, stats,
CacheStatsMBean.TYPE, stats.getName()));
LOG.info("Extracted text caching enabled with maxSize {} MB, expiry time {} secs",
config.extractedTextCacheSizeInMB(), config.extractedTextCacheExpiryInSecs());
}
}

private void initializeTextExtractionDir(BundleContext bundleContext, Config config) {
String textExtractionDir = config.localTextExtractionDir();
if (textExtractionDir.trim().isEmpty()) {
String repoHome = bundleContext.getProperty(REPOSITORY_HOME);
if (repoHome != null) {
textExtractionDir = FilenameUtils.concat(repoHome, "index");
}
}

if (textExtractionDir == null) {
throw new IllegalStateException(String.format("Text extraction directory cannot be determined as neither " +
"directory path [%s] nor repository home [%s] defined", PROP_LOCAL_TEXT_EXTRACTION_DIR, REPOSITORY_HOME));
}

this.textExtractionDir = new File(textExtractionDir);
}

private void registerExtractedTextProvider(PreExtractedTextProvider provider) {
if (extractedTextCache != null) {
if (provider != null) {
String usage = extractedTextCache.isAlwaysUsePreExtractedCache() ?
"always" : "only during reindexing phase";
LOG.info("Registering PreExtractedTextProvider {} with extracted text cache. " +
"It would be used {}", provider, usage);
} else {
LOG.info("Unregistering PreExtractedTextProvider with extracted text cache");
}
extractedTextCache.setExtractedTextProvider(provider);
}
}

private ElasticConnection getElasticConnection(Config contextConfig) {
@@ -158,7 +158,7 @@ private BulkIngester<String> initBulkIngester() {

private void checkFailures() throws IOException {
if (!suppressedErrorCauses.isEmpty()) {
IOException ioe = new IOException("Exception while indexing. See suppressed for details");
IOException ioe = new IOException("Exception while indexing " + indexName + ". See suppressed for details");
suppressedErrorCauses.stream().map(ec -> new IllegalStateException(ec.reason())).forEach(ioe::addSuppressed);
throw ioe;
}
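A minimal sketch of how a caller could surface the suppressed per-item causes attached above; the call site is hypothetical (checkFailures is private in this class), everything else is plain JDK and SLF4J:

try {
    checkFailures(); // hypothetical call site, e.g. after flushing a bulk request
} catch (IOException e) {
    LOG.error(e.getMessage());
    for (Throwable cause : e.getSuppressed()) { // one IllegalStateException per failed bulk item
        LOG.error("  suppressed: {}", cause.getMessage());
    }
}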
@@ -19,6 +19,7 @@
import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
@@ -40,6 +41,7 @@
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.jetbrains.annotations.NotNull;
@@ -55,6 +57,7 @@
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -97,7 +100,13 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
if (defaultAnalyzer.exists()) {
IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder();
Map<String, Object> analyzer = convertNodeState(defaultAnalyzer);
Map<String, Object> analyzer;
try {
analyzer = convertNodeState(defaultAnalyzer);
} catch (IOException e) {
LOG.warn("Cannot load analyzer; using an empty configuration", e);
analyzer = Map.of();
}
String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
if (builtIn == null) {
builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@

// content params, usually stop words
for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) {
List<String> list;
try {
analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION));
list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION);
} catch (IOException e) {
throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e);
LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using empty list", e);
list = List.of();
}
analyzer.put(normalize(nodeEntry.getName()), list);
}

builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer)));
@@ -145,49 +157,93 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state

@NotNull
private static TokenizerDefinition loadTokenizer(NodeState state) {
String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
Map<String, Object> args = convertNodeState(state);
String name;
Map<String, Object> args;
if (!state.exists()) {
LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
name = "Standard";
args = new HashMap<String, Object>();
} else {
name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
try {
args = convertNodeState(state);
} catch (IOException e) {
LOG.warn("Cannot load tokenizer; using an empty configuration", e);
args = new HashMap<String, Object>();
}
}
name = normalize(name);
if ("n_gram".equals(name)) {
// OAK-11568
// https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
return ngram;
}
Comment on lines +176 to +184

Contributor: This is okay for now. We should structure it better to cover all the possible tokenizers (https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). This can go in a separate PR.

Member Author: Yes, I agree!

args.put(ANALYZER_TYPE, name);
return new TokenizerDefinition(name, JsonData.of(args));
}

private static Integer getIntegerSetting(Map<String, Object> args, String name, Integer defaultValue) {
Object value = args.getOrDefault(name, defaultValue);
if (!(value instanceof Integer)) {
LOG.warn("Setting {} value {} is not an integer; using default: {}", name, value, defaultValue);
return defaultValue;
}
return (Integer) value;
}
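To make the two branches of loadTokenizer concrete, here is a hedged sketch; the values and the "whitespace" fallback are assumptions rather than something taken from this PR, while the client calls mirror the ones used above:

Map<String, Object> ngramArgs = new HashMap<>();
ngramArgs.put("minGramSize", 2);
ngramArgs.put("maxGramSize", 3);
// non-integer values fall back to the defaults via getIntegerSetting
int minGram = getIntegerSetting(ngramArgs, "minGramSize", 2);
int maxGram = getIntegerSetting(ngramArgs, "maxGramSize", 3);
// the n_gram case goes through the typed builder of the Elasticsearch client
TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
        NGramTokenizer.of(n -> n.minGram(minGram).maxGram(maxGram))));

// any other tokenizer name falls through to the generic branch and is passed untyped,
// with its arguments serialized as a raw JSON object
Map<String, Object> otherArgs = new HashMap<>();
otherArgs.put(ANALYZER_TYPE, "whitespace");
TokenizerDefinition generic = new TokenizerDefinition("whitespace", JsonData.of(otherArgs));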

private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
Function<String, Class<? extends AbstractAnalysisFactory>> lookup,
BiFunction<String, JsonData, FD> factory) {
LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
int i = 0;
//Need to read children in order
// Need to read children in order
Tree tree = TreeFactory.createReadOnlyTree(state);

// We need to remember that a "WordDelimiter" was configured,
// because we have to remove it if a synonyms filter is configured as well
String wordDelimiterFilterKey = null;
for (Tree t : tree.getChildren()) {
NodeState child = state.getChildNode(t.getName());

String name;
List<String> content = null;
List<ParameterTransformer> transformers;
boolean skipEntry = false;
try {
Class<? extends AbstractAnalysisFactory> tff = lookup.apply(t.getName());
Class<? extends AbstractAnalysisFactory> analysisFactory = lookup.apply(t.getName());

List<String> unsupportedParameters =
UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
.filter(k -> k.getKey().isAssignableFrom(tff))
.filter(k -> k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.findFirst().orElseGet(Collections::emptyList);
Map<String, String> luceneArgs = StreamSupport.stream(child.getProperties().spliterator(), false)
.filter(ElasticCustomAnalyzer::isPropertySupported)
.filter(ps -> !unsupportedParameters.contains(ps.getName()))
.collect(Collectors.toMap(PropertyState::getName, ps -> ps.getValue(Type.STRING)));

AbstractAnalysisFactory luceneFactory = tff.getConstructor(Map.class).newInstance(luceneArgs);
AbstractAnalysisFactory luceneFactory = analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
AbstractWordsFileFilterFactory wordsFF = ((AbstractWordsFileFilterFactory) luceneFactory);
// this will parse/load the content handling different formats, comments, etc
wordsFF.inform(new NodeStateResourceLoader(child));
content = wordsFF.getWords().stream().map(w -> new String(((char[]) w))).collect(Collectors.toList());
}
if (luceneFactory instanceof MappingCharFilterFactory) {
MappingCharFilterFactory map = (MappingCharFilterFactory) luceneFactory;
if (map.getOriginalArgs().isEmpty()) {
skipEntry = true;
LOG.warn("Empty CharFilter mapping: ignoring");
}
}

name = normalize((String) tff.getField("NAME").get(null));
name = normalize((String) analysisFactory.getField("NAME").get(null));
transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
.filter(k -> k.getKey().isAssignableFrom(tff))
.filter(k -> k.getKey().isAssignableFrom(analysisFactory))
.map(Map.Entry::getValue)
.collect(Collectors.toList());
} catch (Exception e) {
@@ -201,6 +257,21 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,

Map<String, Object> args = convertNodeState(child, transformers, content);

if (name.equals("word_delimiter")) {
// https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
// We recommend using the word_delimiter_graph instead of the word_delimiter filter.
// The word_delimiter filter can produce invalid token graphs.
LOG.info("Replacing the word delimiter filter with the word delimiter graph");
name = "word_delimiter_graph";
}
if (name.equals("hyphenation_compound_word")) {
name = "hyphenation_decompounder";
String hyphenator = args.getOrDefault("hyphenator", "").toString();
LOG.info("Using the hyphenation_decompounder: " + hyphenator);
args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
Contributor: Should "analysis/hyphenation_patterns.xml" be installed in the Elastic nodes?

Member Author: I wanted to use a fixed name, so it is possible to configure it. Installing this would have to be done manually, and we need to document it.

args.put("word_list", List.of());
}

// stemmers in Elastic don't have language-based configurations. They all stay under the stemmer config with
// a language parameter
if (name.endsWith("_stem")) {
@@ -221,14 +292,31 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
}
args.put(ANALYZER_TYPE, name);

filters.put(name + "_" + i, factory.apply(name, JsonData.of(args)));
if (skipEntry) {
continue;
}
String key = name + "_" + i;
filters.put(key, factory.apply(name, JsonData.of(args)));
if (name.equals("word_delimiter_graph")) {
wordDelimiterFilterKey = key;
} else if (name.equals("synonym")) {
if (wordDelimiterFilterKey != null) {
LOG.info("Removing word delimiter because there is a synonyms filter as well: " + wordDelimiterFilterKey);
filters.remove(wordDelimiterFilterKey);
}
}
Comment on lines +295 to +307

Contributor: Another option could be the use of https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-multiplexer-tokenfilter.html. We can work on this in a separate PR.

Member Author: Yes, I also thought about that. I didn't find a very good documentation about it yet.

i++;
}
return filters;
}
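To summarize the renaming applied in this method, a small illustrative sketch follows; the filter names and positions are made up, not taken from a real index definition. Keys follow the "<elastic name>_<position>" pattern, WordDelimiter is upgraded to the graph variant, HyphenationCompoundWord becomes hyphenation_decompounder (which, per the review thread above, relies on analysis/hyphenation_patterns.xml being installed manually on the Elasticsearch nodes, presumably relative to their config directory), and a later synonym filter causes the word delimiter entry to be dropped again:

// illustrative names only
LinkedHashMap<String, String> filterNames = new LinkedHashMap<>();
filterNames.put("word_delimiter_graph_0", "word_delimiter_graph");         // from WordDelimiter
filterNames.put("hyphenation_decompounder_1", "hyphenation_decompounder"); // from HyphenationCompoundWord
filterNames.put("synonym_2", "synonym");                                   // from Synonym
filterNames.remove("word_delimiter_graph_0"); // removed because a synonym filter follows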

private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException {
Blob blob = ConfigUtil.getBlob(file, name);
Blob blob;
try {
blob = ConfigUtil.getBlob(file, name);
} catch (IllegalArgumentException | IllegalStateException e) {
throw new IOException("Could not load " + name, e);
}
try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) {
try (BufferedReader br = new BufferedReader(content)) {
return br.lines()
@@ -264,11 +352,25 @@ private static String normalize(String value) {
return name;
}

private static Map<String, Object> convertNodeState(NodeState state) {
return convertNodeState(state, List.of(), List.of());
private static Map<String, Object> convertNodeState(NodeState state) throws IOException {
try {
return convertNodeState(state, List.of(), List.of());
} catch (IllegalStateException e) {
// convert runtime exception back to checked exception
throw new IOException("Cannot convert", e);
}
}
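The checked/unchecked conversion above (and the IllegalStateException thrown inside the stream lambda further down) follows a common Java pattern. A generic, self-contained illustration of it, unrelated to Oak or Elasticsearch (uses java.nio.file and java.util.stream):

static List<String> readAll(List<Path> files) throws IOException {
    try {
        return files.stream()
                .map(p -> {
                    try {
                        return Files.readString(p);
                    } catch (IOException e) {
                        throw new IllegalStateException(e); // tunnel the checked exception out of the lambda
                    }
                })
                .collect(Collectors.toList());
    } catch (IllegalStateException e) {
        throw new IOException("Cannot read files", e); // convert back at the method boundary
    }
}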

private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) {
/**
 * Reads the analyzer configuration from a node state and converts it into a parameter map
 * for the Elasticsearch analyzer definition.
 *
 * @param state the node state holding the configuration
 * @param transformers transformations applied to the Lucene parameters before they are passed to Elasticsearch
 * @param preloadedContent content (e.g. word lists) that has already been loaded for this entry, if any
 * @return the analyzer parameters
 * @throws IllegalStateException if referenced content cannot be loaded
 */
private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException {
Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false)
.filter(ElasticCustomAnalyzer::isPropertySupported)
.collect(Collectors.toMap(PropertyState::getName, ps -> {
@@ -280,6 +382,8 @@ private static Map<String, Object> convertNodeState(NodeState state, List<Parame
return loadContent(state.getChildNode(v.trim()), v.trim(),
CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
} catch (IOException e) {
// convert the checked exception to a runtime exception,
// because the stream API doesn't support checked exceptions
throw new IllegalStateException(e);
}
}).collect(Collectors.toList()));