OAK-11568 Elastic: improved compatibility for analyzer definitions #2193
@@ -19,6 +19,7 @@
 import co.elastic.clients.elasticsearch._types.analysis.Analyzer;
 import co.elastic.clients.elasticsearch._types.analysis.CharFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.CustomAnalyzer;
+import co.elastic.clients.elasticsearch._types.analysis.NGramTokenizer;
 import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
 import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
 import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;

@@ -40,6 +41,7 @@
 import org.apache.lucene.analysis.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.CharFilterFactory;
 import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
 import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
 import org.apache.lucene.util.ResourceLoader;
 import org.jetbrains.annotations.NotNull;

@@ -55,6 +57,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -97,7 +100,13 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state
         NodeState defaultAnalyzer = state.getChildNode(FulltextIndexConstants.ANL_DEFAULT);
         if (defaultAnalyzer.exists()) {
             IndexSettingsAnalysis.Builder builder = new IndexSettingsAnalysis.Builder();
-            Map<String, Object> analyzer = convertNodeState(defaultAnalyzer);
+            Map<String, Object> analyzer;
+            try {
+                analyzer = convertNodeState(defaultAnalyzer);
+            } catch (IOException e) {
+                LOG.warn("Can not load analyzer; using an empty configuration", e);
+                analyzer = Map.of();
+            }
             String builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_CLASS);
             if (builtIn == null) {
                 builtIn = defaultAnalyzer.getString(FulltextIndexConstants.ANL_NAME);
@@ -107,11 +116,14 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state

             // content params, usually stop words
             for (ChildNodeEntry nodeEntry : defaultAnalyzer.getChildNodeEntries()) {
+                List<String> list;
                 try {
-                    analyzer.put(normalize(nodeEntry.getName()), loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION));
+                    list = loadContent(nodeEntry.getNodeState(), nodeEntry.getName(), NOOP_TRANSFORMATION);
                 } catch (IOException e) {
-                    throw new IllegalStateException("Unable to load content for node entry " + nodeEntry.getName(), e);
+                    LOG.warn("Unable to load analyzer content for entry '" + nodeEntry.getName() + "'; using empty list", e);
+                    list = List.of();
                 }
+                analyzer.put(normalize(nodeEntry.getName()), list);
             }

             builder.analyzer(analyzerName, new Analyzer(null, JsonData.of(analyzer)));
@@ -145,49 +157,93 @@ public static IndexSettingsAnalysis.Builder buildCustomAnalyzers(NodeState state

     @NotNull
     private static TokenizerDefinition loadTokenizer(NodeState state) {
-        String name = normalize(Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME)));
-        Map<String, Object> args = convertNodeState(state);
+        String name;
+        Map<String, Object> args;
+        if (!state.exists()) {
+            LOG.warn("No tokenizer specified; using the standard tokenizer with an empty configuration");
+            name = "Standard";
+            args = new HashMap<>();
+        } else {
+            name = Objects.requireNonNull(state.getString(FulltextIndexConstants.ANL_NAME));
+            try {
+                args = convertNodeState(state);
+            } catch (IOException e) {
+                LOG.warn("Can not load tokenizer; using an empty configuration", e);
+                args = new HashMap<>();
+            }
+        }
+        name = normalize(name);
+        if ("n_gram".equals(name)) {
+            // OAK-11568
+            // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
+            Integer minGramSize = getIntegerSetting(args, "minGramSize", 2);
+            Integer maxGramSize = getIntegerSetting(args, "maxGramSize", 3);
+            TokenizerDefinition ngram = TokenizerDefinition.of(t -> t.ngram(
+                    NGramTokenizer.of(n -> n.minGram(minGramSize).maxGram(maxGramSize))));
+            return ngram;
+        }
         args.put(ANALYZER_TYPE, name);
         return new TokenizerDefinition(name, JsonData.of(args));
     }

+    private static Integer getIntegerSetting(Map<String, Object> args, String name, Integer defaultValue) {
+        Object value = args.getOrDefault(name, defaultValue);
+        if (!(value instanceof Integer)) {
+            LOG.warn("Setting {} value {} is not an integer; using default: {}", name, value, defaultValue);
+            return defaultValue;
+        }
+        return (Integer) value;
+    }
+
     private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
                                                               Function<String, Class<? extends AbstractAnalysisFactory>> lookup,
                                                               BiFunction<String, JsonData, FD> factory) {
         LinkedHashMap<String, FD> filters = new LinkedHashMap<>();
         int i = 0;
-        //Need to read children in order
+        // Need to read children in order
         Tree tree = TreeFactory.createReadOnlyTree(state);

+        // We need to remember that a "WordDelimiter" was configured,
+        // because we have to remove it if a synonyms filter is configured as well
+        String wordDelimiterFilterKey = null;
         for (Tree t : tree.getChildren()) {
             NodeState child = state.getChildNode(t.getName());

             String name;
             List<String> content = null;
             List<ParameterTransformer> transformers;
+            boolean skipEntry = false;
             try {
-                Class<? extends AbstractAnalysisFactory> tff = lookup.apply(t.getName());
+                Class<? extends AbstractAnalysisFactory> analysisFactory = lookup.apply(t.getName());

                 List<String> unsupportedParameters =
                         UNSUPPORTED_LUCENE_PARAMETERS.entrySet().stream()
-                                .filter(k -> k.getKey().isAssignableFrom(tff))
+                                .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                                 .map(Map.Entry::getValue)
                                 .findFirst().orElseGet(Collections::emptyList);
                 Map<String, String> luceneArgs = StreamSupport.stream(child.getProperties().spliterator(), false)
                         .filter(ElasticCustomAnalyzer::isPropertySupported)
                         .filter(ps -> !unsupportedParameters.contains(ps.getName()))
                         .collect(Collectors.toMap(PropertyState::getName, ps -> ps.getValue(Type.STRING)));

-                AbstractAnalysisFactory luceneFactory = tff.getConstructor(Map.class).newInstance(luceneArgs);
+                AbstractAnalysisFactory luceneFactory = analysisFactory.getConstructor(Map.class).newInstance(luceneArgs);
                 if (luceneFactory instanceof AbstractWordsFileFilterFactory) {
                     AbstractWordsFileFilterFactory wordsFF = ((AbstractWordsFileFilterFactory) luceneFactory);
                     // this will parse/load the content handling different formats, comments, etc
                     wordsFF.inform(new NodeStateResourceLoader(child));
                     content = wordsFF.getWords().stream().map(w -> new String(((char[]) w))).collect(Collectors.toList());
                 }
+                if (luceneFactory instanceof MappingCharFilterFactory) {
+                    MappingCharFilterFactory map = (MappingCharFilterFactory) luceneFactory;
+                    if (map.getOriginalArgs().isEmpty()) {
+                        skipEntry = true;
+                        LOG.warn("Empty CharFilter mapping: ignoring");
+                    }
+                }

-                name = normalize((String) tff.getField("NAME").get(null));
+                name = normalize((String) analysisFactory.getField("NAME").get(null));
                 transformers = LUCENE_ELASTIC_TRANSFORMERS.entrySet().stream()
-                        .filter(k -> k.getKey().isAssignableFrom(tff))
+                        .filter(k -> k.getKey().isAssignableFrom(analysisFactory))
                         .map(Map.Entry::getValue)
                         .collect(Collectors.toList());
             } catch (Exception e) {
@@ -201,6 +257,21 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,

             Map<String, Object> args = convertNodeState(child, transformers, content);

+            if (name.equals("word_delimiter")) {
+                // https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-tokenfilter.html
+                // We recommend using the word_delimiter_graph instead of the word_delimiter filter.
+                // The word_delimiter filter can produce invalid token graphs.
+                LOG.info("Replacing the word delimiter filter with the word delimiter graph");
+                name = "word_delimiter_graph";
+            }
+            if (name.equals("hyphenation_compound_word")) {
+                name = "hyphenation_decompounder";
+                String hyphenator = args.getOrDefault("hyphenator", "").toString();
+                LOG.info("Using the hyphenation_decompounder: {}", hyphenator);
+                args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
+                args.put("word_list", List.of());
+            }
+

Review comment (on the hyphenation_patterns_path line): Should
Reply: I wanted to use a fixed name, so it is possible to configure it. Installing this would have to be done manually, and we need to document it.
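For illustration, the hyphenation branch above ends up building a filter definition equivalent to the following sketch. Assumptions: TokenFilterDefinition accepts the same (name, JsonData) constructor style that the PR uses for TokenizerDefinition, and the pattern file is installed manually, as discussed in the comment above.

    import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
    import co.elastic.clients.json.JsonData;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Sketch of the hyphenation_decompounder definition built by this branch.
    // The fixed path is the convention introduced in this PR; the pattern file
    // itself is not shipped with Elasticsearch and must be placed under the
    // node's config directory by hand.
    Map<String, Object> args = new HashMap<>();
    args.put("type", "hyphenation_decompounder");
    args.put("hyphenation_patterns_path", "analysis/hyphenation_patterns.xml");
    args.put("word_list", List.of()); // empty word list, mirroring the code above
    TokenFilterDefinition filter = new TokenFilterDefinition("hyphenation_decompounder", JsonData.of(args));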

             // stemmers in Elastic don't have language-based configurations; they all stay under the stemmer config
             // with a language parameter
             if (name.endsWith("_stem")) {
@@ -221,14 +292,31 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
             }
             args.put(ANALYZER_TYPE, name);

-            filters.put(name + "_" + i, factory.apply(name, JsonData.of(args)));
+            if (skipEntry) {
+                continue;
+            }
+            String key = name + "_" + i;
+            filters.put(key, factory.apply(name, JsonData.of(args)));
+            if (name.equals("word_delimiter_graph")) {
+                wordDelimiterFilterKey = key;
+            } else if (name.equals("synonym")) {
+                if (wordDelimiterFilterKey != null) {
+                    LOG.info("Removing word delimiter because there is a synonyms filter as well: " + wordDelimiterFilterKey);
+                    filters.remove(wordDelimiterFilterKey);
+                }
+            }
Comment on lines +295 to +307: Another option could be the use of https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-multiplexer-tokenfilter.html
Reply: Yes, I also thought about that. I didn't find very good documentation about it yet.
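For comparison, the multiplexer alternative mentioned above might look roughly like this (a hypothetical sketch, not part of this PR; the branch filter names "word_delimiter_graph_0" and "synonym_1" are illustrative):

    import co.elastic.clients.elasticsearch._types.analysis.TokenFilterDefinition;
    import co.elastic.clients.json.JsonData;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Hypothetical sketch: instead of removing the word delimiter when a synonym
    // filter is present, a multiplexer runs each listed filter (or comma-separated
    // filter chain) on a separate branch of the token stream.
    Map<String, Object> multiplexerArgs = new HashMap<>();
    multiplexerArgs.put("type", "multiplexer");
    multiplexerArgs.put("filters", List.of("word_delimiter_graph_0", "synonym_1"));
    TokenFilterDefinition multiplexer = new TokenFilterDefinition("multiplexer", JsonData.of(multiplexerArgs));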
             i++;
         }
         return filters;
     }

     private static List<String> loadContent(NodeState file, String name, ContentTransformer transformer) throws IOException {
-        Blob blob = ConfigUtil.getBlob(file, name);
+        Blob blob;
+        try {
+            blob = ConfigUtil.getBlob(file, name);
+        } catch (IllegalArgumentException | IllegalStateException e) {
+            throw new IOException("Could not load " + name, e);
+        }
         try (Reader content = new InputStreamReader(Objects.requireNonNull(blob).getNewStream(), StandardCharsets.UTF_8)) {
             try (BufferedReader br = new BufferedReader(content)) {
                 return br.lines()
@@ -264,11 +352,25 @@ private static String normalize(String value) {
         return name;
     }

-    private static Map<String, Object> convertNodeState(NodeState state) {
-        return convertNodeState(state, List.of(), List.of());
+    private static Map<String, Object> convertNodeState(NodeState state) throws IOException {
+        try {
+            return convertNodeState(state, List.of(), List.of());
+        } catch (IllegalStateException e) {
+            // convert the runtime exception back to a checked exception
+            throw new IOException("Can not convert the analyzer configuration", e);
+        }
     }

-    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) {
+    /**
+     * Read an analyzer configuration.
+     *
+     * @param state the node state to read the configuration from
+     * @param transformers transformations to apply to the Lucene parameters
+     * @param preloadedContent content that was already loaded, if available
+     * @return the configuration as a map
+     * @throws IllegalStateException if referenced content can not be loaded
+     */
+    private static Map<String, Object> convertNodeState(NodeState state, List<ParameterTransformer> transformers, List<String> preloadedContent) throws IllegalStateException {
         Map<String, Object> luceneParams = StreamSupport.stream(Spliterators.spliteratorUnknownSize(state.getProperties().iterator(), Spliterator.ORDERED), false)
                 .filter(ElasticCustomAnalyzer::isPropertySupported)
                 .collect(Collectors.toMap(PropertyState::getName, ps -> {

@@ -280,6 +382,8 @@ private static Map<String, Object> convertNodeState(NodeState state, List<Parame
                         return loadContent(state.getChildNode(v.trim()), v.trim(),
                                 CONTENT_TRANSFORMERS.getOrDefault(ps.getName(), NOOP_TRANSFORMATION)).stream();
                     } catch (IOException e) {
+                        // convert the checked exception to a runtime exception,
+                        // because the stream API doesn't support checked exceptions
                         throw new IllegalStateException(e);
                     }
                 }).collect(Collectors.toList()));
Review comment: This is okay for now. We should structure it better to cover all the possible tokenizers (https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html). This can go in a separate PR.
Reply: Yes, I agree!
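One possible shape for that follow-up (a hypothetical sketch, not part of this PR): a lookup table from normalized Lucene tokenizer names to Elasticsearch tokenizer builders, so each supported tokenizer gets an explicit mapping instead of a dedicated branch in loadTokenizer. The "whitespace" entry is illustrative only; getIntegerSetting is the helper added in this PR.

    import co.elastic.clients.elasticsearch._types.analysis.TokenizerDefinition;
    import java.util.Map;
    import java.util.function.Function;

    // Hypothetical sketch of a per-tokenizer mapping table: each entry turns the
    // converted Lucene arguments into a typed Elasticsearch tokenizer definition.
    static final Map<String, Function<Map<String, Object>, TokenizerDefinition>> TOKENIZER_BUILDERS = Map.of(
            "n_gram", args -> TokenizerDefinition.of(t -> t.ngram(n -> n
                    .minGram(getIntegerSetting(args, "minGramSize", 2))
                    .maxGram(getIntegerSetting(args, "maxGramSize", 3)))),
            "whitespace", args -> TokenizerDefinition.of(t -> t.whitespace(w -> w))
    );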