Skip to content

Commit 88cbec0

Browse files
antonhosgood and Anton Hosgood authored
OAK-12101 - Skip indexing of very long tags (#2768)
* feat: skip long similarity tags
* feat: add tests
* feat: log warning once per minute
* feat: skip long similarity tags for dynamic boosting
* fix: similarity tag constant naming
* feat: add option to disable filtering
* refactor: standardise naming with existing conventions
* feat: silence logs per property
* fix: comment
* feat: add dynamic boost test
* doc: add details to lucene.md
* fix: test name
---------
Co-authored-by: Anton Hosgood <ahosgood@adobe.com>
1 parent 7b77f2d commit 88cbec0

File tree

10 files changed

+158
-12
lines changed

10 files changed

+158
-12
lines changed

oak-doc/src/site/markdown/query/lucene.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ Below is the canonical index definition structure
154154
- queryPaths (string) multiple = ['/']
155155
- excludedPaths (string) multiple
156156
- maxFieldLength (long) = 10000
157+
- maxTagLength (long) = 100
157158
- refresh (boolean)
158159
- useIfExists (string)
159160
- blobSize (long) = 32768
@@ -233,6 +234,13 @@ selectionPolicy
233234
[maxFieldLength][OAK-2469]
234235
: Number of terms indexed per field. Defaults to 10000.
235236

237+
[maxTagLength][OAK-12101]
238+
: Optional integer property. Defaults to 100.
239+
: Maximum length, in characters, of similarity tag and dynamic boost tag values to be indexed.
240+
Tags with values longer than this limit are skipped during indexing.
241+
Set to -1 (or any negative value) to disable the length check entirely.
242+
See [Dynamic Boost](#dynamic-boost) and [Search by similar feature vectors](#similar-fv) for details.
243+
236244
refresh
237245
: Optional boolean property.
238246
: Used to refresh the stored index definition. See [Effective Index Definition](#stored-index-definition)
@@ -1231,6 +1239,11 @@ with boost set to the confidence.
12311239
This is a replacement for the `IndexFieldProvider`.
12321240
See also [OAK-8971][OAK-8971].
12331241

1242+
Tag values that exceed the configured `maxTagLength` (default 100) are skipped during indexing.
1243+
This prevents unexpectedly long values from being indexed as dynamic boost tags.
1244+
The limit can be changed by setting the `maxTagLength` property on the index definition,
1245+
or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].
1246+
12341247

12351248
### <a name="native-query"></a>Native Query and Index Selection
12361249
`@deprecated Oak 1.46`
@@ -1702,6 +1715,11 @@ As a further improvement for the accuracy of similarity search results if nodes
17021715
holding text values that can be used as keywords or tags that well describe the feature vector contents, the
17031716
`similarityTags` configuration can be set to _true_ for such properties (see [OAK-8118](https://issues.apache.org/jira/browse/OAK-8118)).
17041717

1718+
Similarity tag values that exceed the configured `maxTagLength` (default 100) are skipped during indexing.
1719+
This prevents unexpectedly long values from being indexed as similarity tags.
1720+
The limit can be changed by setting the `maxTagLength` property on the index definition,
1721+
or disabled entirely by setting it to -1. See [OAK-12101][OAK-12101].
1722+
17051723
See also [OAK-7575](https://issues.apache.org/jira/browse/OAK-7575).
17061724

17071725

@@ -2231,6 +2249,7 @@ SELECT rep:facet(title) FROM [app:Asset] WHERE [title] IS NOT NULL
22312249
[OAK-7739]: https://issues.apache.org/jira/browse/OAK-7739
22322250
[OAK-8971]: https://issues.apache.org/jira/browse/OAK-8971
22332251
[OAK-9625]: https://issues.apache.org/jira/browse/OAK-9625
2252+
[OAK-12101]: https://issues.apache.org/jira/browse/OAK-12101
22342253
[luke]: https://code.google.com/p/luke/
22352254
[tika]: http://tika.apache.org/
22362255
[oak-console]: https://github.com/apache/jackrabbit-oak/tree/trunk/oak-run#console

oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,8 @@ protected void indexNodeName(Document doc, String value) {
415415
}
416416

417417
@Override
418-
protected boolean indexSimilarityTag(Document doc, PropertyState property) {
419-
doc.add(new TextField(FieldNames.SIMILARITY_TAGS, property.getValue(Type.STRING), Field.Store.YES));
418+
protected boolean indexSimilarityTag(Document doc, String value) {
419+
doc.add(new TextField(FieldNames.SIMILARITY_TAGS, value, Field.Store.YES));
420420
return true;
421421
}
422422

oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMakerTest.java

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,27 @@
2121

2222
import org.apache.jackrabbit.oak.api.Type;
2323
import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneIndexDefinitionBuilder;
24+
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
25+
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
2426
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
2527
import org.apache.jackrabbit.oak.spi.state.NodeState;
28+
import org.apache.lucene.document.Document;
2629
import org.junit.Test;
2730

2831
import java.util.List;
2932

3033
import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT;
3134
import static org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState.EMPTY_NODE;
35+
import static org.junit.Assert.assertEquals;
3236
import static org.junit.Assert.assertNotNull;
3337
import static org.junit.Assert.assertNull;
3438

3539
public class LuceneDocumentMakerTest {
3640
private final NodeState root = INITIAL_CONTENT;
37-
private final LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();
3841

3942
@Test
4043
public void excludeSingleProperty() throws Exception{
44+
LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();
4145
builder.indexRule("nt:base")
4246
.property("foo")
4347
.propertyIndex()
@@ -63,4 +67,39 @@ public void excludeSingleProperty() throws Exception{
6367
assertNull(docMaker.makeDocument(test.getNodeState()));
6468
}
6569

66-
}
70+
@Test
71+
public void similarityTagMaxLengthFiltering() throws Exception{
72+
LuceneIndexDefinitionBuilder builder = new LuceneIndexDefinitionBuilder();
73+
builder.indexRule("nt:base")
74+
.property("jcr:primaryType")
75+
.propertyIndex();
76+
builder.indexRule("nt:base")
77+
.property("tag")
78+
.similarityTags(true);
79+
80+
builder.getBuilderTree().setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 10);
81+
82+
LuceneIndexDefinition defn = LuceneIndexDefinition.newLuceneBuilder(root, builder.build(), "/foo").build();
83+
LuceneDocumentMaker docMaker = new LuceneDocumentMaker(defn,
84+
defn.getApplicableIndexingRule("nt:base"), "/x");
85+
86+
NodeBuilder test = EMPTY_NODE.builder();
87+
test.setProperty("tag", "short");
88+
Document doc = docMaker.makeDocument(test.getNodeState());
89+
assertNotNull(doc);
90+
assertEquals("short", doc.get(FieldNames.SIMILARITY_TAGS));
91+
92+
test = EMPTY_NODE.builder();
93+
test.setProperty("tag", "exactly10!");
94+
doc = docMaker.makeDocument(test.getNodeState());
95+
assertNotNull(doc);
96+
assertEquals("exactly10!", doc.get(FieldNames.SIMILARITY_TAGS));
97+
98+
test = EMPTY_NODE.builder();
99+
test.setProperty("tag", "this is too long");
100+
doc = docMaker.makeDocument(test.getNodeState());
101+
assertNotNull(doc);
102+
assertNull(doc.get(FieldNames.SIMILARITY_TAGS));
103+
}
104+
105+
}

oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/LuceneDynamicBoostTest.java

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,37 @@ public void dynamicBoostLiteShouldGiveLessRelevanceToTags() throws Exception {
184184
List.of("/test/asset3", "/test/asset2"));
185185
}
186186

187+
@Test
188+
public void dynamicBoostMaxLengthFiltering() throws Exception {
189+
createAssetsIndexAndProperties(false, false, true, 10);
190+
191+
Tree testParent = createNodeWithType(root.getTree("/"), "test", JcrConstants.NT_UNSTRUCTURED, "");
192+
193+
Tree predicted1 = createAssetNodeWithPredicted(testParent, "asset1", "test");
194+
createPredictedTag(predicted1, "short", 0.9);
195+
createPredictedTag(predicted1, "exactly10!", 0.8);
196+
createPredictedTag(predicted1, "this is too long", 0.7);
197+
198+
Tree predicted2 = createAssetNodeWithPredicted(testParent, "asset2", "test");
199+
createPredictedTag(predicted2, "short", 0.9);
200+
createPredictedTag(predicted2, "exactly10!", 0.8);
201+
202+
root.commit();
203+
204+
assertEventually(() -> {
205+
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'short')", SQL2,
206+
List.of("/test/asset1", "/test/asset2"));
207+
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'exactly10!')", SQL2,
208+
List.of("/test/asset1", "/test/asset2"));
209+
210+
assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'this is too long')", SQL2, List.of());
211+
});
212+
}
213+
187214
@Override
188-
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags) throws Exception {
215+
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws Exception {
189216
factory.queryTermsProvider = new FulltextQueryTermsProviderImpl();
190-
super.createAssetsIndexAndProperties(lite, similarityTags);
217+
super.createAssetsIndexAndProperties(lite, similarityTags, useInFullTextQuery, maxTagLength);
191218
}
192219

193220
private String runIndexingTest(Class<?> loggerClass, boolean nameProperty) throws CommitFailedException {

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,10 +242,9 @@ protected void indexNodeName(ElasticDocument doc, String value) {
242242
}
243243

244244
@Override
245-
protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property) {
246-
String val = property.getValue(Type.STRING);
247-
if (!val.isEmpty()) {
248-
doc.addSimilarityTag(val);
245+
protected boolean indexSimilarityTag(ElasticDocument doc, String value) {
246+
if (!value.isEmpty()) {
247+
doc.addSimilarityTag(value);
249248
return true;
250249
}
251250
return false;

oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,12 @@ public static IndexingMode from(String indexingMode) {
251251
*/
252252
String MAX_FIELD_LENGTH = "maxFieldLength";
253253

254+
/**
255+
* Maximum length of similarity and dynamic boost tag values to be indexed. Tags longer than this value will be skipped.
256+
* Set to -1 to disable the length check entirely.
257+
*/
258+
String MAX_TAG_LENGTH = "maxTagLength";
259+
254260
/**
255261
* whether use this property values for suggestions
256262
*/

oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ public class IndexDefinition implements Aggregate.AggregateMapper {
146146
*/
147147
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
148148

149+
/**
150+
* Default value for property {@link #maxTagLength}.
151+
*/
152+
public static final int DEFAULT_MAX_TAG_LENGTH = 100;
153+
149154
public static final int DEFAULT_MAX_EXTRACT_LENGTH = -10;
150155

151156
/**
@@ -274,6 +279,8 @@ public class IndexDefinition implements Aggregate.AggregateMapper {
274279

275280
private final int maxFieldLength;
276281

282+
private final int maxTagLength;
283+
277284
private final int maxExtractLength;
278285

279286
private final int suggesterUpdateFrequencyMinutes;
@@ -470,6 +477,7 @@ protected IndexDefinition(NodeState root, NodeState defn, IndexFormatVersion ver
470477
}
471478

472479
this.maxFieldLength = getOptionalValue(defn, FulltextIndexConstants.MAX_FIELD_LENGTH, DEFAULT_MAX_FIELD_LENGTH);
480+
this.maxTagLength = getOptionalValue(defn, FulltextIndexConstants.MAX_TAG_LENGTH, DEFAULT_MAX_TAG_LENGTH);
473481
this.costPerEntry = getOptionalValue(defn, FulltextIndexConstants.COST_PER_ENTRY, getDefaultCostPerEntry(version));
474482
this.costPerExecution = getOptionalValue(defn, FulltextIndexConstants.COST_PER_EXECUTION, 1.0);
475483
this.hasCustomTikaConfig = getTikaConfigNode().exists();
@@ -690,6 +698,10 @@ public String[] getIndexTags() {
690698
return indexSelectionPolicy;
691699
}
692700

701+
public int getMaxTagLength() {
702+
return maxTagLength;
703+
}
704+
693705
public int getMaxExtractLength() {
694706
return maxExtractLength;
695707
}

oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/editor/FulltextDocumentMaker.java

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.jackrabbit.oak.api.Type;
2424
import org.apache.jackrabbit.oak.commons.PathUtils;
2525
import org.apache.jackrabbit.oak.commons.collections.IterableUtils;
26+
import org.apache.jackrabbit.oak.commons.log.LogSilencer;
2627
import org.apache.jackrabbit.oak.plugins.index.search.Aggregate;
2728
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
2829
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
@@ -58,6 +59,9 @@
5859
public abstract class FulltextDocumentMaker<D> implements DocumentMaker<D> {
5960

6061
private final Logger log = LoggerFactory.getLogger(getClass());
62+
63+
private static final LogSilencer LOG_SILENCER = new LogSilencer();
64+
6165
public static final String WARN_LOG_STRING_SIZE_THRESHOLD_KEY = "oak.repository.property.index.logWarnStringSizeThreshold";
6266
private static final int DEFAULT_WARN_LOG_STRING_SIZE_THRESHOLD_VALUE = 102400;
6367

@@ -343,7 +347,13 @@ private boolean indexProperty(String path,
343347
dirty |= indexFacets(doc, property, pname, pd);
344348
}
345349
if (pd.similarityTags) {
346-
dirty |= indexSimilarityTag(doc, property);
350+
String value = property.getValue(Type.STRING);
351+
if (isTagWithinLengthLimit(value)) {
352+
dirty |= indexSimilarityTag(doc, value);
353+
} else if (!LOG_SILENCER.silence(pname)) {
354+
log.warn("[{}] Skipping similarity tag for property {}. Value length {} exceeds maximum allowed length",
355+
getIndexName(), pname, value.length());
356+
}
347357
}
348358

349359
}
@@ -377,7 +387,7 @@ protected boolean isFulltextValuePersistedAtNode(PropertyDefinition pd) {
377387
return true;
378388
}
379389

380-
protected abstract boolean indexSimilarityTag(D doc, PropertyState property);
390+
protected abstract boolean indexSimilarityTag(D doc, String value);
381391

382392
protected abstract void indexSimilarityBinaries(D doc, PropertyDefinition pd, Blob blob) throws IOException;
383393

@@ -704,6 +714,13 @@ protected boolean indexDynamicBoost(D doc, String propertyName, String nodeName,
704714
continue;
705715
}
706716
String dynaTagValue = p.getValue(Type.STRING);
717+
if (!isTagWithinLengthLimit(dynaTagValue)) {
718+
if (!LOG_SILENCER.silence(p.getName())) {
719+
log.warn("[{}] Skipping dynamic boost tag for property {}. Value length {} exceeds maximum allowed length",
720+
getIndexName(), p.getName(), dynaTagValue.length());
721+
}
722+
continue;
723+
}
707724
p = dynaTag.getProperty(DYNAMIC_BOOST_TAG_CONFIDENCE);
708725
if (p == null) {
709726
// here we don't log a warning, because possibly it will be added later
@@ -736,6 +753,11 @@ protected String getIndexName() {
736753
return definition.getIndexName();
737754
}
738755

756+
private boolean isTagWithinLengthLimit(String value) {
757+
int maxLength = definition.getMaxTagLength();
758+
return maxLength < 0 || value.length() <= maxLength;
759+
}
760+
739761
/*
740762
* Extracts the local name of the current node ignoring any namespace prefix
741763
*/

oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,10 @@ protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTa
234234
}
235235

236236
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery) throws Exception {
237+
createAssetsIndexAndProperties(lite, similarityTags, useInFullTextQuery, null);
238+
}
239+
240+
protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTags, boolean useInFullTextQuery, Integer maxTagLength) throws Exception {
237241
NodeTypeRegistry.register(root, new ByteArrayInputStream(ASSET_NODE_TYPE.getBytes()), "test nodeType");
238242
Tree indexRuleProps = createIndex("dam:Asset", lite);
239243

@@ -250,6 +254,11 @@ protected void createAssetsIndexAndProperties(boolean lite, boolean similarityTa
250254
predictedTags.setProperty("similarityTags", true);
251255
}
252256

257+
if (maxTagLength != null) {
258+
Tree indexDef = root.getTree("/oak:index/" + TEST_INDEX_NAME);
259+
indexDef.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, maxTagLength);
260+
}
261+
253262
root.commit();
254263
}
255264

oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinitionTest.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,19 @@ public void customTikaMimeTypes() {
513513
assertEquals("application/test-unmapped", defn.getTikaMappedMimeType("application/test-unmapped"));
514514
}
515515

516+
@Test
517+
public void maxTagLength() {
518+
NodeBuilder defnb = newFTIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME), "foo",
519+
"lucene", Set.of(TYPENAME_STRING));
520+
IndexDefinition defn = new IndexDefinition(root, defnb.getNodeState(), "/foo");
521+
assertEquals(IndexDefinition.DEFAULT_MAX_TAG_LENGTH, defn.getMaxTagLength());
522+
523+
defnb.setProperty(FulltextIndexConstants.MAX_TAG_LENGTH, 50);
524+
525+
defn = new IndexDefinition(root, defnb.getNodeState(), "/foo");
526+
assertEquals(50, defn.getMaxTagLength());
527+
}
528+
516529
@Test
517530
public void maxExtractLength() {
518531
NodeBuilder defnb = newFTIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME), "foo",

0 commit comments

Comments
 (0)