elastic · markjhoy · Apr 12, 2025 · Apr 12, 2025 · Apr 15, 2025 · Apr 15, 2025
diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml
@@ -0,0 +1,6 @@
+pr: 126739
+summary: Update `sparse_vector` field mapping to include default setting for token
+  pruning
+area: Mapping
+type: enhancement
+issues: []
diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md
@@ -24,6 +24,28 @@ PUT my-index
 }
 ```
 
+You can also set the optional `index_options` to configure token pruning:
+
+```console
+PUT my-index
+{
+  "mappings": {
+    "properties": {
+      "text.tokens": {
+        "type": "sparse_vector",
+        "index_options": {
+          "prune": true,
+          "pruning_config": {
+            "tokens_freq_ratio_threshold": 5,
+            "tokens_weight_threshold": 0.4
+          }
+        }
+      }
+    }
+  }
+}
+```
+
 See [semantic search with ELSER](docs-content://solutions/search/semantic-search/semantic-search-elser-ingest-pipelines.md) for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.
 
 ## Parameters for `sparse_vector` fields [sparse-vectors-params]
@@ -36,6 +58,28 @@ The following parameters are accepted by `sparse_vector` fields:
     * Exclude the field from [_source](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#source-filtering).
     * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source).
 
+index_options
+:   (Optional, object) You can set index options for your `sparse_vector` field to determine whether tokens should be pruned, and to configure the parameters used for token pruning. If pruning options are not set in your `sparse_vector` query, Elasticsearch will use the default options configured for the field, if any.
+
+Parameters for `index_options` are:
+
+`prune` {applies_to}`stack: preview 9.1`
+:   (Optional, boolean) Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true.
+
+`pruning_config` {applies_to}`stack: preview 9.1`
+:   (Optional, object) Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `prune` is set to false but `pruning_config` is specified, an exception will occur.
+
+    Parameters for `pruning_config` include:
+
+    `tokens_freq_ratio_threshold` {applies_to}`stack: preview 9.1`
+    :   (Optional, integer) Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must be between 1 and 100. Default: `5`.
+
+    `tokens_weight_threshold` {applies_to}`stack: preview 9.1`
+    :   (Optional, float) Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`.
+
+    ::::{note}
+    The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen because they provided the best results in tests using ELSERv2.
+    ::::
 
 
 ## Multi-value sparse vectors [index-multi-value-sparse-vectors]

diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml
@@ -1,3 +1,15 @@
+
+---
+teardown:
+  # ensure indices are cleaned up after each test
+  # mainly for the sparse vector tests
+  - do:
+      indices.delete:
+        index: ["test1", "test2"]
+        ignore: 404
+  - do:
+      indices.refresh: { }
+
 ---
 "cluster stats test":
   - do:
@@ -358,6 +370,7 @@
   - requires:
       cluster_features: [ "gte_v8.15.0" ]
       reason: "sparse vector stats added in 8.15"
+
   - do:
       indices.create:
         index: test1

diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -190,6 +190,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion JOIN_ON_ALIASES_8_19 = def(8_841_0_42);
     public static final TransportVersion ILM_ADD_SKIP_SETTING_8_19 = def(8_841_0_43);
     public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED_8_19 = def(8_841_0_44);
+    public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19 = def(8_841_0_45);
     public static final TransportVersion V_9_0_0 = def(9_000_0_09);
     public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
     public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -284,6 +285,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion JOIN_ON_ALIASES = def(9_088_0_00);
     public static final TransportVersion ILM_ADD_SKIP_SETTING = def(9_089_0_00);
     public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED = def(9_090_0_00);
+    public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS = def(9_091_0_00);
 
     /*
      * STOP! READ THIS FIRST! No, really,

diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java
@@ -142,6 +142,7 @@ private static Version parseUnchecked(String version) {
     public static final IndexVersion DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ_BACKPORT_8_X = def(8_530_0_00, Version.LUCENE_9_12_1);
     public static final IndexVersion SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X = def(8_531_0_00, Version.LUCENE_9_12_1);
     public static final IndexVersion INDEX_INT_SORT_INT_TYPE_8_19 = def(8_532_0_00, Version.LUCENE_9_12_1);
+    public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X = def(8_533_0_00, Version.LUCENE_9_12_1);
     public static final IndexVersion UPGRADE_TO_LUCENE_10_0_0 = def(9_000_0_00, Version.LUCENE_10_0_0);
     public static final IndexVersion LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT = def(9_001_0_00, Version.LUCENE_10_0_0);
     public static final IndexVersion TIME_BASED_K_ORDERED_DOC_ID = def(9_002_0_00, Version.LUCENE_10_0_0);
@@ -171,6 +172,7 @@ private static Version parseUnchecked(String version) {
     public static final IndexVersion DEFAULT_TO_ACORN_HNSW_FILTER_HEURISTIC = def(9_026_0_00, Version.LUCENE_10_2_1);
     public static final IndexVersion SEQ_NO_WITHOUT_POINTS = def(9_027_0_00, Version.LUCENE_10_2_1);
     public static final IndexVersion INDEX_INT_SORT_INT_TYPE = def(9_028_0_00, Version.LUCENE_10_2_1);
+    public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_029_0_00, Version.LUCENE_10_2_1);
 
     /*
      * STOP! READ THIS FIRST! No, really,

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
@@ -22,6 +22,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.logging.DeprecationCategory;
 import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.features.NodeFeature;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -31,24 +34,35 @@
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperBuilderContext;
+import org.elasticsearch.index.mapper.MappingParserContext;
 import org.elasticsearch.index.mapper.SourceLoader;
 import org.elasticsearch.index.mapper.SourceValueFetcher;
 import org.elasticsearch.index.mapper.TextSearchInfo;
 import org.elasticsearch.index.mapper.ValueFetcher;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.fetch.StoredFieldsSpec;
 import org.elasticsearch.search.lookup.Source;
+import org.elasticsearch.xcontent.ConstructingObjectParser;
+import org.elasticsearch.xcontent.DeprecationHandler;
+import org.elasticsearch.xcontent.NamedXContentRegistry;
+import org.elasticsearch.xcontent.ParseField;
+import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentParser;
 import org.elasticsearch.xcontent.XContentParser.Token;
+import org.elasticsearch.xcontent.XContentType;
+import org.elasticsearch.xcontent.support.MapXContentParser;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.stream.Stream;
 
 import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
+import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;
 
 /**
  * A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse
@@ -57,6 +71,7 @@
 public class SparseVectorFieldMapper extends FieldMapper {
 
     public static final String CONTENT_TYPE = "sparse_vector";
+    public static final String SPARSE_VECTOR_INDEX_OPTIONS = "index_options";
 
     static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to "
         + "contain [sparse_vector] fields, but they cannot be indexed or searched.";
@@ -66,13 +81,26 @@ public class SparseVectorFieldMapper extends FieldMapper {
     static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
     static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;
 
+    static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
+
+    public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector.index_options_supported");
+
     private static SparseVectorFieldMapper toType(FieldMapper in) {
         return (SparseVectorFieldMapper) in;
     }
 
     public static class Builder extends FieldMapper.Builder {
         private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
         private final Parameter<Map<String, String>> meta = Parameter.metaParam();
+        private final Parameter<IndexOptions> indexOptions = new Parameter<>(
+            SPARSE_VECTOR_INDEX_OPTIONS,
+            true,
+            () -> null,
+            (n, c, o) -> parseIndexOptions(c, o),
+            m -> toType(m).fieldType().indexOptions,
+            XContentBuilder::field,
+            Objects::toString
+        ).acceptsNull();
 
         public Builder(String name) {
             super(name);
@@ -85,19 +113,54 @@ public Builder setStored(boolean value) {
 
         @Override
         protected Parameter<?>[] getParameters() {
-            return new Parameter<?>[] { stored, meta };
+            return new Parameter<?>[] { stored, meta, indexOptions };
         }
 
         @Override
         public SparseVectorFieldMapper build(MapperBuilderContext context) {
             return new SparseVectorFieldMapper(
                 leafName(),
-                new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
+                new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), indexOptions.getValue()),
                 builderParams(this, context)
             );
         }
     }
 
+    public IndexOptions getIndexOptions() { // delegates to the field type; null when no index_options were configured
+        return fieldType().getIndexOptions();
+    }
+
+    private static final ConstructingObjectParser<IndexOptions, Void> INDEX_OPTIONS_PARSER = new ConstructingObjectParser<>(
+        SPARSE_VECTOR_INDEX_OPTIONS,
+        args -> new IndexOptions((Boolean) args[0], (TokenPruningConfig) args[1])
+    );
+    // Declaration order below fixes the args[] positions above; both are optional, so either slot may be null.
+    static {
+        INDEX_OPTIONS_PARSER.declareBoolean(optionalConstructorArg(), IndexOptions.PRUNE_FIELD_NAME);
+        INDEX_OPTIONS_PARSER.declareObject(optionalConstructorArg(), TokenPruningConfig.PARSER, IndexOptions.PRUNING_CONFIG_FIELD_NAME);
+    }
+
+    /** Builds {@link IndexOptions} from the raw {@code index_options} mapping node; a {@code null} node yields {@code null}. */
+    private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) {
+        if (propNode == null) {
+            return null;
+        }
+        // Replay the node as a map through INDEX_OPTIONS_PARSER; try-with-resources closes the parser on every path.
+        Map<String, Object> indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS);
+        try (
+            XContentParser parser = new MapXContentParser(
+                NamedXContentRegistry.EMPTY,
+                DeprecationHandler.IGNORE_DEPRECATIONS,
+                indexOptionsMap,
+                XContentType.JSON
+            )
+        ) {
+            return INDEX_OPTIONS_PARSER.parse(parser, null);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
     public static final TypeParser PARSER = new TypeParser((n, c) -> {
         if (c.indexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
             deprecationLogger.warn(DeprecationCategory.MAPPINGS, "sparse_vector", ERROR_MESSAGE_7X);
@@ -109,9 +172,24 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {
     }, notInMultiFields(CONTENT_TYPE));
 
     public static final class SparseVectorFieldType extends MappedFieldType {
+        private final IndexOptions indexOptions;
 
         public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
+            this(name, isStored, meta, null);
+        }
+
+        public SparseVectorFieldType(
+            String name,
+            boolean isStored,
+            Map<String, String> meta,
+            @Nullable SparseVectorFieldMapper.IndexOptions indexOptions
+        ) {
             super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
+            this.indexOptions = indexOptions;
+        }
+
+        public IndexOptions getIndexOptions() {
+            return indexOptions;
         }
 
         @Override
@@ -364,4 +442,70 @@ public void reset() {
         }
     }
 
+    /** Field-level pruning defaults from {@code index_options}; {@code pruning_config} is only valid when {@code prune} is true. */
+    public static class IndexOptions implements ToXContent {
+        public static final ParseField PRUNE_FIELD_NAME = new ParseField("prune");
+        public static final ParseField PRUNING_CONFIG_FIELD_NAME = new ParseField("pruning_config");
+
+        final Boolean prune;
+        final TokenPruningConfig pruningConfig;
+
+        IndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) {
+            if (pruningConfig != null && (prune == null || prune == false)) { // a config without prune=true is rejected
+                throw new IllegalArgumentException(
+                    "["
+                        + SPARSE_VECTOR_INDEX_OPTIONS
+                        + "] field ["
+                        + PRUNING_CONFIG_FIELD_NAME.getPreferredName()
+                        + "] should only be set if ["
+                        + PRUNE_FIELD_NAME.getPreferredName()
+                        + "] is set to true"
+                );
+            }
+            this.prune = prune;
+            this.pruningConfig = pruningConfig;
+        }
+
+        public Boolean getPrune() {
+            return prune;
+        }
+
+        public TokenPruningConfig getPruningConfig() {
+            return pruningConfig;
+        }
+
+        @Override
+        public final boolean equals(Object other) {
+            if (other == this) {
+                return true;
+            }
+            if (other == null || getClass() != other.getClass()) {
+                return false;
+            }
+            IndexOptions that = (IndexOptions) other;
+            return Objects.equals(prune, that.prune) && Objects.equals(pruningConfig, that.pruningConfig);
+        }
+
+        @Override
+        public final int hashCode() {
+            return Objects.hash(prune, pruningConfig);
+        }
+
+        @Override
+        public String toString() { // the index_options Parameter stringifies values via Objects::toString
+            return "{prune=" + prune + ", pruning_config=" + pruningConfig + "}";
+        }
+
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            builder.startObject();
+            if (prune != null) {
+                builder.field(PRUNE_FIELD_NAME.getPreferredName(), prune);
+            }
+            if (pruningConfig != null) {
+                builder.field(PRUNING_CONFIG_FIELD_NAME.getPreferredName(), pruningConfig);
+            }
+            return builder.endObject();
+        }
+    }
 }