Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions oak-doc/src/site/markdown/query/lucene.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ Below is the canonical index definition structure
- queryPaths (string) multiple = ['/']
- excludedPaths (string) multiple
- maxFieldLength (long) = 10000
- maxTagLength (long) = 100
- refresh (boolean)
- useIfExists (string)
- blobSize (long) = 32768
Expand Down Expand Up @@ -233,6 +234,13 @@ selectionPolicy
[maxFieldLength][OAK-2469]
: Number of terms indexed per field. Defaults to 10000

[maxTagLength][OAK-12101]
: Optional integer property. Defaults to 100.
: Maximum length, in characters, of similarity tag and dynamic boost tag values to be indexed.
Tags with values longer than this limit are skipped during indexing; values whose length equals the limit are still indexed.
Set to -1 (or any negative value) to disable the length check entirely.
See [Dynamic Boost](#dynamic-boost) and [Search by similar feature vectors](#similar-fv) for details.

refresh
: Optional boolean property.
: Used to refresh the stored index definition. See [Effective Index Definition](#stored-index-definition)
Expand Down Expand Up @@ -1231,6 +1239,11 @@ with boost set to the confidence.
This is a replacement for the `IndexFieldProvider`.
See also [OAK-8971][OAK-8971].

Tag values that exceed the configured `maxTagLength` (default 100) are skipped during indexing.
This prevents unexpectedly long values from being indexed as dynamic boost tags.
The limit can be changed by setting the `maxTagLength` property on the index definition,
or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].


### <a name="native-query"></a>Native Query and Index Selection
`@deprecated Oak 1.46`
Expand Down Expand Up @@ -1702,6 +1715,11 @@ As a further improvement for the accuracy of similarity search results if nodes
holding text values that can be used as keywords or tags that well describe the feature vector contents, the
`similarityTags` configuration can be set to _true_ for such properties (see [OAK-8118](https://issues.apache.org/jira/browse/OAK-8118)).

Similarity tag values that exceed the configured `maxTagLength` (default 100) are skipped during indexing.
This prevents unexpectedly long values from being indexed as similarity tags.
The limit can be changed by setting the `maxTagLength` property on the index definition,
or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].

See also [OAK-7575](https://issues.apache.org/jira/browse/OAK-7575).


Expand Down Expand Up @@ -2231,6 +2249,7 @@ SELECT rep:facet(title) FROM [app:Asset] WHERE [title] IS NOT NULL
[OAK-7739]: https://issues.apache.org/jira/browse/OAK-7739
[OAK-8971]: https://issues.apache.org/jira/browse/OAK-8971
[OAK-9625]: https://issues.apache.org/jira/browse/OAK-9625
[OAK-12101]: https://issues.apache.org/jira/browse/OAK-12101
[luke]: https://code.google.com/p/luke/
[tika]: http://tika.apache.org/
[oak-console]: https://github.com/apache/jackrabbit-oak/tree/trunk/oak-run#console
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,8 @@ protected void indexNodeName(Document doc, String value) {
}

@Override
protected boolean indexSimilarityTag(Document doc, PropertyState property) {
doc.add(new TextField(FieldNames.SIMILARITY_TAGS, property.getValue(Type.STRING), Field.Store.YES));
protected boolean indexSimilarityTag(Document doc, String value) {
doc.add(new TextField(FieldNames.SIMILARITY_TAGS, value, Field.Store.YES));
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,27 @@

import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneIndexDefinitionBuilder;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.document.Document;
import org.junit.Test;

import java.util.List;

import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
import static org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState.EMPTY_NODE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

public class LuceneDocumentMakerTest {
private final NodeState root = INITIAL_CONTENT;
private final LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();

@Test
public void excludeSingleProperty() throws Exception{
LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();
builder.indexRule("nt:base")
.property("foo")
.propertyIndex()
Expand All @@ -63,4 +67,39 @@ public void excludeSingleProperty() throws Exception{
assertNull(docMaker.makeDocument(test.getNodeState()));
}

}
/**
 * Verifies that similarity tag values are indexed only while their length does not exceed
 * the {@code maxTagLength} configured on the index definition (10 in this test).
 */
@Test
public void similarityTagMaxLengthFiltering() throws Exception {
    LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();
    builder.indexRule("nt:base").property("jcr:primaryType").propertyIndex();
    builder.indexRule("nt:base").property("tag").similarityTags(true);
    builder.getBuilderTree().setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 10);

    LuceneIndexDefinition definition =
            LuceneIndexDefinition.newLuceneBuilder(root, builder.build(), "/foo").build();
    LuceneDocumentMaker maker = new LuceneDocumentMaker(
            definition, definition.getApplicableIndexingRule("nt:base"), "/x");

    // Values shorter than, or exactly at, the 10 character limit are stored as similarity tags.
    for (String accepted : new String[] {"short", "exactly10!"}) {
        NodeBuilder node = EMPTY_NODE.builder();
        node.setProperty("tag", accepted);
        Document indexed = maker.makeDocument(node.getNodeState());
        assertNotNull(indexed);
        assertEquals(accepted, indexed.get(FieldNames.SIMILARITY_TAGS));
    }

    // A value above the limit is expected to still yield a document, but without the tag field.
    NodeBuilder tooLong = EMPTY_NODE.builder();
    tooLong.setProperty("tag", "this is too long");
    Document skipped = maker.makeDocument(tooLong.getNodeState());
    assertNotNull(skipped);
    assertNull(skipped.get(FieldNames.SIMILARITY_TAGS));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,37 @@ public void dynamicBoostLiteShouldGiveLessRelevanceToTags() throws Exception {
List.of("/test/asset3", "/test/asset2"));
}

/**
 * Verifies that dynamic boost tags within the configured {@code maxTagLength} (10 in this test)
 * can be found through full-text queries, while a tag exceeding the limit yields no results.
 */
@Test
public void dynamicBoostMaxLengthFiltering() throws Exception {
    createAssetsIndexAndProperties(false, false, true, 10);

    Tree parent = createNodeWithType(root.getTree("/"), "test", JcrConstants.NT_UNSTRUCTURED, "");

    // asset1: two tags within the limit plus one that exceeds it
    Tree firstPredicted = createAssetNodeWithPredicted(parent, "asset1", "test");
    createPredictedTag(firstPredicted, "short", 0.9);
    createPredictedTag(firstPredicted, "exactly10!", 0.8);
    createPredictedTag(firstPredicted, "this is too long", 0.7);

    // asset2: only tags within the limit
    Tree secondPredicted = createAssetNodeWithPredicted(parent, "asset2", "test");
    createPredictedTag(secondPredicted, "short", 0.9);
    createPredictedTag(secondPredicted, "exactly10!", 0.8);

    root.commit();

    List<String> bothAssets = List.of("/test/asset1", "/test/asset2");
    assertEventually(() -> {
        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'short')", SQL2,
                bothAssets);
        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'exactly10!')", SQL2,
                bothAssets);

        // the over-long tag must not match any asset
        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'this is too long')", SQL2, List.of());
    });
}

@Override
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags) throws Exception {
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws Exception {
factory.queryTermsProvider = new FulltextQueryTermsProviderImpl();
super.createAssetsIndexAndProperties(lite, similarityTags);
super.createAssetsIndexAndProperties(lite, similarityTags, useInFullTextQuery, maxTagLength);
}

private String runIndexingTest(Class<?> loggerClass, boolean nameProperty) throws CommitFailedException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,9 @@ protected void indexNodeName(ElasticDocument doc, String value) {
}

@Override
protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property) {
String val = property.getValue(Type.STRING);
if (!val.isEmpty()) {
doc.addSimilarityTag(val);
protected boolean indexSimilarityTag(ElasticDocument doc, String value) {
if (!value.isEmpty()) {
doc.addSimilarityTag(value);
return true;
}
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,12 @@ public static IndexingMode from(String indexingMode) {
*/
String MAX_FIELD_LENGTH = "maxFieldLength";

/**
 * Name of the index definition property holding the maximum length of similarity and dynamic boost tag
 * values to be indexed. Tags longer than this value will be skipped.
 * Set to -1 (any negative value) to disable the length check entirely.
 */
String MAX_TAG_LENGTH = "maxTagLength";

/**
* whether use this property values for suggestions
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ public class IndexDefinition implements Aggregate.AggregateMapper {
*/
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;

/**
 * Default value for property {@link #maxTagLength}, applied when the index definition
 * does not set {@link FulltextIndexConstants#MAX_TAG_LENGTH}.
 */
public static final int DEFAULT_MAX_TAG_LENGTH = 100;

public static final int DEFAULT_MAX_EXTRACT_LENGTH = -10;

/**
Expand Down Expand Up @@ -274,6 +279,8 @@ public class IndexDefinition implements Aggregate.AggregateMapper {

private final int maxFieldLength;

private final int maxTagLength;

private final int maxExtractLength;

private final int suggesterUpdateFrequencyMinutes;
Expand Down Expand Up @@ -470,6 +477,7 @@ protected IndexDefinition(NodeState root, NodeState defn, IndexFormatVersion ver
}

this.maxFieldLength = getOptionalValue(defn, FulltextIndexConstants.MAX_FIELD_LENGTH, DEFAULT_MAX_FIELD_LENGTH);
this.maxTagLength = getOptionalValue(defn, FulltextIndexConstants.MAX_TAG_LENGTH, DEFAULT_MAX_TAG_LENGTH);
this.costPerEntry = getOptionalValue(defn, FulltextIndexConstants.COST_PER_ENTRY, getDefaultCostPerEntry(version));
this.costPerExecution = getOptionalValue(defn, FulltextIndexConstants.COST_PER_EXECUTION, 1.0);
this.hasCustomTikaConfig = getTikaConfigNode().exists();
Expand Down Expand Up @@ -690,6 +698,10 @@ public String[] getIndexTags() {
return indexSelectionPolicy;
}

/**
 * Returns the maximum tag length resolved for this index definition.
 *
 * @return the value held in the {@code maxTagLength} field
 */
public int getMaxTagLength() {
    return this.maxTagLength;
}

public int getMaxExtractLength() {
return maxExtractLength;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.commons.collections.IterableUtils;
import org.apache.jackrabbit.oak.commons.log.LogSilencer;
import org.apache.jackrabbit.oak.plugins.index.search.Aggregate;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
Expand Down Expand Up @@ -58,6 +59,9 @@
public abstract class FulltextDocumentMaker<D> implements DocumentMaker<D> {

private final Logger log = LoggerFactory.getLogger(getClass());

private static final LogSilencer LOG_SILENCER = new LogSilencer();

public static final String WARN_LOG_STRING_SIZE_THRESHOLD_KEY = "oak.repository.property.index.logWarnStringSizeThreshold";
private static final int DEFAULT_WARN_LOG_STRING_SIZE_THRESHOLD_VALUE = 102400;

Expand Down Expand Up @@ -343,7 +347,13 @@ private boolean indexProperty(String path,
dirty |= indexFacets(doc, property, pname, pd);
}
if (pd.similarityTags) {
dirty |= indexSimilarityTag(doc, property);
String value = property.getValue(Type.STRING);
if (isTagWithinLengthLimit(value)) {
dirty |= indexSimilarityTag(doc, value);
} else if (!LOG_SILENCER.silence(pname)) {
log.warn("[{}] Skipping similarity tag for property {}. Value length {} exceeds maximum allowed length",
getIndexName(), pname, value.length());
}
}

}
Expand Down Expand Up @@ -377,7 +387,7 @@ protected boolean isFulltextValuePersistedAtNode(PropertyDefinition pd) {
return true;
}

protected abstract boolean indexSimilarityTag(D doc, PropertyState property);
protected abstract boolean indexSimilarityTag(D doc, String value);

protected abstract void indexSimilarityBinaries(D doc, PropertyDefinition pd, Blob blob) throws IOException;

Expand Down Expand Up @@ -704,6 +714,13 @@ protected boolean indexDynamicBoost(D doc, String propertyName, String nodeName,
continue;
}
String dynaTagValue = p.getValue(Type.STRING);
if (!isTagWithinLengthLimit(dynaTagValue)) {
if (!LOG_SILENCER.silence(p.getName())) {
log.warn("[{}] Skipping dynamic boost tag for property {}. Value length {} exceeds maximum allowed length",
getIndexName(), p.getName(), dynaTagValue.length());
}
continue;
}
p = dynaTag.getProperty(DYNAMIC_BOOST_TAG_CONFIDENCE);
if (p == null) {
// here we don't log a warning, because possibly it will be added later
Expand Down Expand Up @@ -736,6 +753,11 @@ protected String getIndexName() {
return definition.getIndexName();
}

/**
 * Checks whether a tag value may be indexed under the configured maximum tag length.
 *
 * @param value the tag value to check
 * @return {@code true} when the configured limit is negative (check disabled) or the
 *         value's length does not exceed the limit; {@code false} otherwise
 */
private boolean isTagWithinLengthLimit(String value) {
    final int limit = definition.getMaxTagLength();
    if (limit < 0) {
        // a negative limit switches the length check off
        return true;
    }
    return value.length() <= limit;
}

/*
* Extracts the local name of the current node ignoring any namespace prefix
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,10 @@ protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTa
}

/**
 * Creates the assets index and its properties without an explicit maximum tag length.
 * Delegates to the four-argument overload, passing {@code null} as {@code maxTagLength}.
 */
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery) throws Exception {
    final Integer noMaxTagLength = null;
    createAssetsIndexAndProperties(lite, similarityTags, useInFullTextQuery, noMaxTagLength);
}

protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws Exception {
NodeTypeRegistry.register(root, new ByteArrayInputStream(ASSET_NODE_TYPE.getBytes()), "test nodeType");
Tree indexRuleProps = createIndex("dam:Asset", lite);

Expand All @@ -250,6 +254,11 @@ protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTa
predictedTags.setProperty("similarityTags", true);
}

if (maxTagLength != null) {
Tree indexDef = root.getTree("/oak:index/" + TEST_INDEX_NAME);
indexDef.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, maxTagLength);
}

root.commit();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,19 @@ public void customTikaMimeTypes() {
assertEquals("application/test-unmapped", defn.getTikaMappedMimeType("application/test-unmapped"));
}

/**
 * Verifies that {@code getMaxTagLength()} returns the default when the property is absent
 * and the configured value once {@code maxTagLength} is set on the definition.
 */
@Test
public void maxTagLength() {
    NodeBuilder definitionBuilder = newFTIndexDefinition(
            builder.child(INDEX_DEFINITIONS_NAME), "foo", "lucene", Set.of(TYPENAME_STRING));

    // no maxTagLength configured: the default applies
    IndexDefinition withDefault = new IndexDefinition(root, definitionBuilder.getNodeState(), "/foo");
    assertEquals(IndexDefinition.DEFAULT_MAX_TAG_LENGTH, withDefault.getMaxTagLength());

    // explicit value on the definition overrides the default
    definitionBuilder.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 50);
    IndexDefinition withOverride = new IndexDefinition(root, definitionBuilder.getNodeState(), "/foo");
    assertEquals(50, withOverride.getMaxTagLength());
}

@Test
public void maxExtractLength() {
NodeBuilder defnb = newFTIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME), "foo",
Expand Down
Loading