
Commit 25df414

Authored Mar 31, 2025
OAK-11568 Elastic: improved compatibility for aggregation definitions (#2193)
1 parent 6ca71c7 commit 25df414
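
For context: the analyzer settings touched by this commit live under the index definition's analyzers/default node. A minimal sketch of such a definition against Oak's NodeBuilder API (the node layout follows Oak's conventions; the helper name and property values are illustrative, not part of this commit):

import org.apache.jackrabbit.oak.spi.state.NodeBuilder;

// Hypothetical definition exercising two cases handled below: a Lucene-style
// "NGram" tokenizer (mapped to Elastic's n_gram) and a "WordDelimiter" filter
// (mapped to word_delimiter_graph).
class AnalyzerDefinitionSketch {
    static void defineAnalyzer(NodeBuilder indexDefinition) {
        NodeBuilder analyzer = indexDefinition.child("analyzers").child("default");
        NodeBuilder tokenizer = analyzer.child("tokenizer");
        tokenizer.setProperty("name", "NGram");
        tokenizer.setProperty("minGramSize", "2");
        tokenizer.setProperty("maxGramSize", "3");
        analyzer.child("filters").child("WordDelimiter");
    }
}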

File tree: 13 files changed, +362 -93 lines changed

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexProviderService.java

+2-67
@@ -16,9 +16,6 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
-import org.apache.commons.io.FilenameUtils;
-import org.apache.jackrabbit.oak.api.jmx.CacheStatsMBean;
-import org.apache.jackrabbit.oak.cache.CacheStats;
 import org.apache.jackrabbit.oak.commons.IOUtils;
 import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
 import org.apache.jackrabbit.oak.plugins.index.AsyncIndexInfoService;
@@ -50,13 +47,11 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
 import java.util.ArrayList;
 import java.util.Dictionary;
 import java.util.Hashtable;
 import java.util.List;
 
-import static org.apache.commons.io.FileUtils.ONE_MB;
 import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.registerMBean;
 import static org.apache.jackrabbit.oak.spi.whiteboard.WhiteboardUtils.scheduleWithFixedDelay;
 
@@ -130,8 +125,6 @@ public class ElasticIndexProviderService {
 
     private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexProviderService.class);
 
-    private static final String REPOSITORY_HOME = "repository.home";
-
     @Reference
     private StatisticsProvider statisticsProvider;
 
@@ -149,11 +142,10 @@ public class ElasticIndexProviderService {
 
     private ExtractedTextCache extractedTextCache;
 
-    private final List<ServiceRegistration> regs = new ArrayList<>();
+    private final List<ServiceRegistration<?>> regs = new ArrayList<>();
     private final List<Registration> oakRegs = new ArrayList<>();
 
     private Whiteboard whiteboard;
-    private File textExtractionDir;
 
     private ElasticConnection elasticConnection;
     private ElasticMetricHandler metricHandler;
@@ -207,7 +199,7 @@ private void activate(BundleContext bundleContext, Config config) {
 
     @Deactivate
     private void deactivate() {
-        for (ServiceRegistration reg : regs) {
+        for (ServiceRegistration<?> reg : regs) {
             reg.unregister();
         }
 
@@ -245,63 +237,6 @@ private void registerIndexEditor(BundleContext bundleContext) {
         Dictionary<String, Object> props = new Hashtable<>();
         props.put("type", ElasticIndexDefinition.TYPE_ELASTICSEARCH);
         regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, props));
-//        oakRegs.add(registerMBean(whiteboard,
-//                TextExtractionStatsMBean.class,
-//                editorProvider.getExtractedTextCache().getStatsMBean(),
-//                TextExtractionStatsMBean.TYPE,
-//                "TextExtraction statistics"));
-    }
-
-    private void initializeExtractedTextCache(final Config config, StatisticsProvider statisticsProvider) {
-
-        extractedTextCache = new ExtractedTextCache(
-                config.extractedTextCacheSizeInMB() * ONE_MB,
-                config.extractedTextCacheExpiryInSecs(),
-                config.alwaysUsePreExtractedCache(),
-                textExtractionDir,
-                statisticsProvider);
-        if (extractedTextProvider != null) {
-            registerExtractedTextProvider(extractedTextProvider);
-        }
-        CacheStats stats = extractedTextCache.getCacheStats();
-        if (stats != null) {
-            oakRegs.add(registerMBean(whiteboard,
-                    CacheStatsMBean.class, stats,
-                    CacheStatsMBean.TYPE, stats.getName()));
-            LOG.info("Extracted text caching enabled with maxSize {} MB, expiry time {} secs",
-                    config.extractedTextCacheSizeInMB(), config.extractedTextCacheExpiryInSecs());
-        }
-    }
-
-    private void initializeTextExtractionDir(BundleContext bundleContext, Config config) {
-        String textExtractionDir = config.localTextExtractionDir();
-        if (textExtractionDir.trim().isEmpty()) {
-            String repoHome = bundleContext.getProperty(REPOSITORY_HOME);
-            if (repoHome != null) {
-                textExtractionDir = FilenameUtils.concat(repoHome, "index");
-            }
-        }
-
-        if (textExtractionDir == null) {
-            throw new IllegalStateException(String.format("Text extraction directory cannot be determined as neither " +
-                    "directory path [%s] nor repository home [%s] defined", PROP_LOCAL_TEXT_EXTRACTION_DIR, REPOSITORY_HOME));
-        }
-
-        this.textExtractionDir = new File(textExtractionDir);
-    }
-
-    private void registerExtractedTextProvider(PreExtractedTextProvider provider) {
-        if (extractedTextCache != null) {
-            if (provider != null) {
-                String usage = extractedTextCache.isAlwaysUsePreExtractedCache() ?
-                        "always" : "only during reindexing phase";
-                LOG.info("Registering PreExtractedTextProvider {} with extracted text cache. " +
-                        "It would be used {}", provider, usage);
-            } else {
-                LOG.info("Unregistering PreExtractedTextProvider with extracted text cache");
-            }
-            extractedTextCache.setExtractedTextProvider(provider);
-        }
     }
 
     private ElasticConnection getElasticConnection(Config contextConfig) {
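
Aside from dropping the text-extraction plumbing, the remaining edits in this file are generics hygiene: raw ServiceRegistration becomes ServiceRegistration<?>, which deactivate() can still unregister. A minimal self-contained sketch of the same pattern (the Runnable service type is illustrative):

import java.util.ArrayList;
import java.util.List;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceRegistration;

class RegistrationTrackingSketch {
    private final List<ServiceRegistration<?>> regs = new ArrayList<>();

    // register with a concrete type, track with a wildcard
    void activate(BundleContext ctx, Runnable service) {
        regs.add(ctx.registerService(Runnable.class, service, null));
    }

    // unregister() needs no type parameter, so the wildcard is enough
    void deactivate() {
        for (ServiceRegistration<?> reg : regs) {
            reg.unregister();
        }
        regs.clear();
    }
}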

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticBulkProcessorHandler.java

+1-1
@@ -158,7 +158,7 @@ private BulkIngester<String> initBulkIngester() {
 
     private void checkFailures() throws IOException {
         if (!suppressedErrorCauses.isEmpty()) {
-            IOException ioe = new IOException("Exception while indexing. See suppressed for details");
+            IOException ioe = new IOException("Exception while indexing " + indexName + ". See suppressed for details");
             suppressedErrorCauses.stream().map(ec -> new IllegalStateException(ec.reason())).forEach(ioe::addSuppressed);
             throw ioe;
         }
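
The only change above is adding the index name to the message; the surrounding pattern is plain JDK suppressed exceptions, where each bulk item failure travels with the single thrown exception. A self-contained sketch of that pattern (class name and failure reasons are made up):

import java.io.IOException;
import java.util.List;

public class SuppressedExceptionsSketch {
    public static void main(String[] args) {
        List<String> reasons = List.of("mapper_parsing_exception", "version_conflict_engine_exception");
        IOException ioe = new IOException("Exception while indexing my-index. See suppressed for details");
        reasons.stream().map(IllegalStateException::new).forEach(ioe::addSuppressed);
        for (Throwable cause : ioe.getSuppressed()) {
            System.out.println(cause.getMessage()); // each bulk failure is preserved
        }
    }
}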

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticCustomAnalyzer.java

+120-16
@@ -19,6 +19,7 @@
 import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
 import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
+import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
 import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
 import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
@@ -40,6 +41,7 @@
 import org.apache.lucene.analysis.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.CharFilterFactory;
 import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.util.ResourceLoader;
 import org.jetbrains.annotations.NotNull;
@@ -55,6 +57,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -97,7 +100,13 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
         NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
         if (defaultAnalyzer.exists()) {
             IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder();
-            Map<String, Object> analyzer = convertNodeState(defaultAnalyzer);
+            Map<String, Object> analyzer;
+            try {
+                analyzer = convertNodeState(defaultAnalyzer);
+            } catch (IOException e) {
+                LOG.warn("Cannot load analyzer; using an empty configuration", e);
+                analyzer = new HashMap<>(); // must stay mutable: content entries are added below
+            }
             String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
             if (builtIn == null) {
                 builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
 
             // content params, usually stop words
             for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) {
+                List<String> list;
                 try {
-                    analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION));
+                    list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION);
                 } catch (IOException e) {
-                    throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e);
+                    LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using an empty list", e);
+                    list = List.of();
                 }
+                analyzer.put(normalize(nodeEntry.getName()), list);
             }
 
             builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer)));
@@ -145,49 +157,93 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
 
     @NotNull
     private static TokenizerDefinition loadTokenizer(NodeState state) {
-        String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
-        Map<String, Object> args = convertNodeState(state);
+        String name;
+        Map<String, Object> args;
+        if (!state.exists()) {
+            LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
+            name = "Standard";
+            args = new HashMap<>();
+        } else {
+            name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
+            try {
+                args = convertNodeState(state);
+            } catch (IOException e) {
+                LOG.warn("Cannot load tokenizer; using an empty configuration", e);
+                args = new HashMap<>();
+            }
+        }
+        name = normalize(name);
+        if ("n_gram".equals(name)) {
+            // OAK-11568
+            // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
+            Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
+            Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
+            return TokenizerDefinition.of(t -> t.ngram(
+                    NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
+        }
         args.put(ANALYZER_TYPE, name);
         return new TokenizerDefinition(name, JsonData.of(args));
     }
 
+    private static Integer getIntegerSetting(Map<String, Object> args, String name, Integer defaultValue) {
+        Object value = args.getOrDefault(name, defaultValue);
+        if (!(value instanceof Integer)) {
+            LOG.warn("Setting {} value {} is not an integer; using default: {}", name, value, defaultValue);
+            return defaultValue;
+        }
+        return (Integer) value;
+    }
+
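
Taken in isolation, the new n_gram branch builds the same definition as this standalone sketch; the defaults of 2 and 3 match Elasticsearch's own min_gram/max_gram defaults (the class name is illustrative):

import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;

class NGramTokenizerSketch {
    // The typed variant used for n_gram, instead of the generic
    // name-plus-JsonData form used for all other tokenizers.
    static TokenizerDefinition defaultNGram() {
        return TokenizerDefinition.of(t -> t.ngram(
                NGramTokenizer.of(n -> n.minGram(2).maxGram(3))));
    }
}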
     private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
                                                               Function<String, Class<? extends AbstractAnalysisFactory>> lookup,
                                                               BiFunction<String, JsonData, FD> factory) {
         LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
         int i = 0;
-        //Need to read children in order
+        // Need to read children in order
         Tree tree = TreeFactory.createReadOnlyTree(state);
+
+        // We need to remember that a "WordDelimiter" was configured,
+        // because we have to remove it if a synonyms filter is configured as well
+        String wordDelimiterFilterKey = null;
         for (Tree t : tree.getChildren()) {
             NodeState child = state.getChildNode(t.getName());
 
             String name;
             List<String> content = null;
             List<ParameterTransformer> transformers;
+            boolean skipEntry = false;
             try {
-                Class<? extends AbstractAnalysisFactory> tff = lookup.apply(t.getName());
+                Class<? extends AbstractAnalysisFactory> analysisFactory = lookup.apply(t.getName());
 
                 List<String> unsupportedParameters =
                         UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
-                                .filter(k -> k.getKey().isAssignableFrom(tff))
+                                .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                                 .map(Map.Entry::getValue)
                                 .findFirst().orElseGet(Collections::emptyList);
                 Map<String, String> luceneArgs = StreamSupport.stream(child.getProperties().spliterator(), false)
                         .filter(ElasticCustomAnalyzer::isPropertySupported)
                         .filter(ps -> !unsupportedParameters.contains(ps.getName()))
                         .collect(Collectors.toMap(PropertyState::getName, ps -> ps.getValue(Type.STRING)));
 
-                AbstractAnalysisFactory luceneFactory = tff.getConstructor(Map.class).newInstance(luceneArgs);
+                AbstractAnalysisFactory luceneFactory = analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
                 if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
                     AbstractWordsFileFilterFactory wordsFF = ((AbstractWordsFileFilterFactory) luceneFactory);
                     // this will parse/load the content handling different formats, comments, etc
                     wordsFF.inform(new NodeStateResourceLoader(child));
                     content = wordsFF.getWords().stream().map(w -> new String(((char[]) w))).collect(Collectors.toList());
                 }
+                if (luceneFactory instanceof MappingCharFilterFactory) {
+                    MappingCharFilterFactory map = (MappingCharFilterFactory) luceneFactory;
+                    if (map.getOriginalArgs().isEmpty()) {
+                        skipEntry = true;
+                        LOG.warn("Empty CharFilter mapping: ignoring");
+                    }
+                }
 
-                name = normalize((String) tff.getField("NAME").get(null));
+                name = normalize((String) analysisFactory.getField("NAME").get(null));
                 transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
-                        .filter(k -> k.getKey().isAssignableFrom(tff))
+                        .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                         .map(Map.Entry::getValue)
                         .collect(Collectors.toList());
             } catch (Exception e) {
@@ -201,6 +257,21 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
 
             Map<String, Object> args = convertNodeState(child, transformers, content);
 
+            if (name.equals("word_delimiter")) {
+                // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
+                // "We recommend using the word_delimiter_graph instead of the word_delimiter filter.
+                // The word_delimiter filter can produce invalid token graphs."
+                LOG.info("Replacing the word delimiter filter with the word delimiter graph");
+                name = "word_delimiter_graph";
+            }
+            if (name.equals("hyphenation_compound_word")) {
+                name = "hyphenation_decompounder";
+                String hyphenator = args.getOrDefault("hyphenator", "").toString();
+                LOG.info("Using the hyphenation_decompounder: " + hyphenator);
+                args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
+                args.put("word_list", List.of());
+            }
+
             // stemmers in elastic don't have language-based configurations; they all stay under the stemmer config with
             // a language parameter
             if (name.endsWith("_stem")) {
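
Assuming TokenFilterDefinition exposes the same (name, JsonData) constructor the tokenizer path uses, the hyphenation branch above amounts to registering a filter shaped like this sketch (values are illustrative; the real arguments come from the node state):

import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
import co.elastic.clients.json.JsonData;
import java.util.List;
import java.util.Map;

class HyphenationFilterSketch {
    // Lucene's HyphenationCompoundWordTokenFilter settings are rewritten to
    // Elastic's hyphenation_decompounder with a fixed patterns path and an
    // empty word list.
    static TokenFilterDefinition hyphenationDecompounder() {
        Map<String, Object> args = Map.of(
                "type", "hyphenation_decompounder",
                "hyphenation_patterns_path", "analysis/hyphenation_patterns.xml",
                "word_list", List.of());
        return new TokenFilterDefinition("hyphenation_decompounder", JsonData.of(args));
    }
}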
@@ -221,14 +292,31 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
             }
             args.put(ANALYZER_TYPE, name);
 
-            filters.put(name + "_" + i, factory.apply(name, JsonData.of(args)));
+            if (skipEntry) {
+                continue;
+            }
+            String key = name + "_" + i;
+            filters.put(key, factory.apply(name, JsonData.of(args)));
+            if (name.equals("word_delimiter_graph")) {
+                wordDelimiterFilterKey = key;
+            } else if (name.equals("synonym")) {
+                if (wordDelimiterFilterKey != null) {
+                    LOG.info("Removing word delimiter because there is a synonyms filter as well: " + wordDelimiterFilterKey);
+                    filters.remove(wordDelimiterFilterKey);
+                }
+            }
             i++;
         }
         return filters;
     }
 
     private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException {
-        Blob blob = ConfigUtil.getBlob(file, name);
+        Blob blob;
+        try {
+            blob = ConfigUtil.getBlob(file, name);
+        } catch (IllegalArgumentException | IllegalStateException e) {
+            throw new IOException("Could not load " + name, e);
+        }
         try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) {
             try (BufferedReader br = new BufferedReader(content)) {
                 return br.lines()
@@ -264,11 +352,25 @@ private static String normalize(String value) {
         return name;
     }
 
-    private static Map<String, Object> convertNodeState(NodeState state) {
-        return convertNodeState(state, List.of(), List.of());
+    private static Map<String, Object> convertNodeState(NodeState state) throws IOException {
+        try {
+            return convertNodeState(state, List.of(), List.of());
+        } catch (IllegalStateException e) {
+            // convert the runtime exception back into a checked exception
+            throw new IOException("Cannot convert", e);
+        }
     }
 
-    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) {
+    /**
+     * Read an analyzer configuration from a node state.
+     *
+     * @param state the node state to read
+     * @param transformers the Lucene-to-Elastic parameter transformers to apply
+     * @param preloadedContent pre-loaded content for word-list filters, or {@code null}
+     * @return the configuration as a map of parameter names to values
+     * @throws IllegalStateException if referenced content cannot be loaded
+     */
+    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException {
         Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false)
                 .filter(ElasticCustomAnalyzer::isPropertySupported)
                 .collect(Collectors.toMap(PropertyState::getName, ps -> {
@@ -280,6 +382,8 @@ private static Map<String, Object> convertNodeState(NodeState state, List<Parame
                     return loadContent(state.getChildNode(v.trim()), v.trim(),
                             CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
                 } catch (IOException e) {
+                    // convert the checked exception to a runtime exception,
+                    // because the stream API doesn't support checked exceptions
                    throw new IllegalStateException(e);
                 }
             }).collect(Collectors.toList()));
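
The IllegalStateException/IOException round trip above is the usual workaround for checked exceptions inside Stream lambdas. A minimal self-contained illustration of the same shape (all names made up):

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

class CheckedExceptionsInStreamsSketch {
    static String load(String name) throws IOException {
        if (name.isEmpty()) {
            throw new IOException("empty name");
        }
        return name.toUpperCase();
    }

    // Stream.map lambdas may not throw checked exceptions, so the IOException
    // is wrapped unchecked inside the pipeline and unwrapped back into a
    // checked IOException at the method boundary.
    static List<String> loadAll(List<String> names) throws IOException {
        try {
            return names.stream().map(n -> {
                try {
                    return load(n);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                }
            }).collect(Collectors.toList());
        } catch (IllegalStateException e) {
            throw new IOException("Cannot load", e);
        }
    }
}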
(The diff for the remaining 10 changed files is not shown.)
