Log stack traces on data nodes before they are cleared for transport #125732
@@ -0,0 +1,5 @@
+pr: 125732
+summary: Log stack traces on data nodes before they are cleared for transport
+area: Search
+type: bug
+issues: []
@@ -156,6 +156,7 @@
import java.util.function.Supplier;

import static org.elasticsearch.TransportVersions.ERROR_TRACE_IN_TRANSPORT_HEADER;
import static org.elasticsearch.common.Strings.format;
import static org.elasticsearch.core.TimeValue.timeValueHours;
import static org.elasticsearch.core.TimeValue.timeValueMillis;
import static org.elasticsearch.core.TimeValue.timeValueMinutes;
@@ -538,12 +539,16 @@ protected void doClose() {
      * @param <T> the type of the response
      * @param listener the action listener to be wrapped
      * @param version channel version of the request
+     * @param nodeId id of the current node
+     * @param shardId id of the shard being searched
      * @param threadPool with context where to write the new header
      * @return the wrapped action listener
      */
     static <T> ActionListener<T> maybeWrapListenerForStackTrace(
         ActionListener<T> listener,
         TransportVersion version,
+        String nodeId,
+        ShardId shardId,
         ThreadPool threadPool
     ) {
         boolean header = true;
@@ -552,6 +557,16 @@ static <T> ActionListener<T> maybeWrapListenerForStackTrace(
         }
         if (header == false) {
             return listener.delegateResponse((l, e) -> {
+                org.apache.logging.log4j.util.Supplier<String> messageSupplier = () -> format(
+                    "[%s]%s: failed to execute search request",
+                    nodeId,
+                    shardId
+                );
+                if (ExceptionsHelper.status(e).getStatus() < 500 || ExceptionsHelper.isNodeOrShardUnavailableTypeException(e)) {

Review comment: good catch on the shard unavailable errors, good to replicate what we have in RestResponse. Would it be a good idea to leave a comment about the need to keep this aligned with RestResponse, for posterity?

Reply: Good idea - added.

+                    logger.debug(messageSupplier, e);
+                } else {
+                    logger.warn(messageSupplier, e);
+                }

Review comment: 😍

                 ExceptionsHelper.unwrapCausesAndSuppressed(e, err -> {
                     err.setStackTrace(EMPTY_STACK_TRACE_ARRAY);
                     return false;
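For readers following along, here is a minimal standalone sketch (plain JDK Throwable APIs only, hypothetical class name, not the PR's ExceptionsHelper code) of what the pre-existing clearing step above amounts to: blanking out the stack trace of the exception and of each of its causes and suppressed exceptions before the response is serialized for transport. A production version would also need to guard against cycles in the cause chain.

    // Hypothetical sketch: recursively clear stack traces so they are not
    // serialized over the wire. Not the PR code.
    final class StackTraceClearingSketch {
        private static final StackTraceElement[] EMPTY_STACK_TRACE_ARRAY = new StackTraceElement[0];

        static void clearStackTraces(Throwable t) {
            for (Throwable current = t; current != null; current = current.getCause()) {
                current.setStackTrace(EMPTY_STACK_TRACE_ARRAY);
                for (Throwable suppressed : current.getSuppressed()) {
                    clearStackTraces(suppressed);
                }
            }
        }
    }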
@@ -563,7 +578,13 @@ static <T> ActionListener<T> maybeWrapListenerForStackTrace(
     }

     public void executeDfsPhase(ShardSearchRequest request, SearchShardTask task, ActionListener<SearchPhaseResult> listener) {
-        listener = maybeWrapListenerForStackTrace(listener, request.getChannelVersion(), threadPool);
+        listener = maybeWrapListenerForStackTrace(
+            listener,
+            request.getChannelVersion(),
+            clusterService.localNode().getId(),
+            request.shardId(),
+            threadPool
+        );
         final IndexShard shard = getShard(request);
         rewriteAndFetchShardRequest(shard, request, listener.delegateFailure((l, rewritten) -> {
             // fork the execution in the search thread pool
@@ -607,7 +628,13 @@ public void executeQueryPhase(ShardSearchRequest request, CancellableTask task,
         rewriteAndFetchShardRequest(
             shard,
             request,
-            maybeWrapListenerForStackTrace(listener, request.getChannelVersion(), threadPool).delegateFailure((l, orig) -> {
+            maybeWrapListenerForStackTrace(
+                listener,
+                request.getChannelVersion(),
+                clusterService.localNode().getId(),
+                request.shardId(),
+                threadPool
+            ).delegateFailure((l, orig) -> {
                 // check if we can shortcut the query phase entirely.
                 if (orig.canReturnNullResponseIfMatchNoDocs()) {
                     assert orig.scroll() == null;
@@ -805,9 +832,15 @@ private SearchPhaseResult executeQueryPhase(ShardSearchRequest request, Cancella
     }

     public void executeRankFeaturePhase(RankFeatureShardRequest request, SearchShardTask task, ActionListener<RankFeatureResult> listener) {
-        listener = maybeWrapListenerForStackTrace(listener, request.getShardSearchRequest().getChannelVersion(), threadPool);
         final ReaderContext readerContext = findReaderContext(request.contextId(), request);
         final ShardSearchRequest shardSearchRequest = readerContext.getShardSearchRequest(request.getShardSearchRequest());
+        listener = maybeWrapListenerForStackTrace(
+            listener,
+            shardSearchRequest.getChannelVersion(),
+            clusterService.localNode().getId(),
+            shardSearchRequest.shardId(),
+            threadPool
+        );
         final Releasable markAsUsed = readerContext.markAsUsed(getKeepAlive(shardSearchRequest));
         runAsync(getExecutor(readerContext.indexShard()), () -> {
             try (SearchContext searchContext = createContext(readerContext, shardSearchRequest, task, ResultsType.RANK_FEATURE, false)) {
@@ -856,8 +889,14 @@ public void executeQueryPhase(
         ActionListener<ScrollQuerySearchResult> listener,
         TransportVersion version
     ) {
-        listener = maybeWrapListenerForStackTrace(listener, version, threadPool);
         final LegacyReaderContext readerContext = (LegacyReaderContext) findReaderContext(request.contextId(), request);
+        listener = maybeWrapListenerForStackTrace(
+            listener,
+            version,
+            clusterService.localNode().getId(),
+            readerContext.indexShard().shardId(),
+            threadPool
+        );
         final Releasable markAsUsed;
         try {
             markAsUsed = readerContext.markAsUsed(getScrollKeepAlive(request.scroll()));
@@ -905,9 +944,15 @@ public void executeQueryPhase(
         ActionListener<QuerySearchResult> listener,
         TransportVersion version
     ) {
-        listener = maybeWrapListenerForStackTrace(listener, version, threadPool);
         final ReaderContext readerContext = findReaderContext(request.contextId(), request.shardSearchRequest());
         final ShardSearchRequest shardSearchRequest = readerContext.getShardSearchRequest(request.shardSearchRequest());
+        listener = maybeWrapListenerForStackTrace(
+            listener,
+            version,
+            clusterService.localNode().getId(),
+            shardSearchRequest.shardId(),
+            threadPool
+        );
         final Releasable markAsUsed = readerContext.markAsUsed(getKeepAlive(shardSearchRequest));
         rewriteAndFetchShardRequest(readerContext.indexShard(), shardSearchRequest, listener.delegateFailure((l, rewritten) -> {
             // fork the execution in the search thread pool
Review comment: I do not see where we are logging the trace? It seems we are only logging that it's actually removed. I am thinking we should log the exception as a WARN in certain cases. However, I defer to @javanna here. He has much better context around what we are trying to achieve.
Reply: Passing e to the logger here outputs the stack trace. Maybe my log message isn't clear... do you think something like "search failed with exception:" would be better? Interesting point on WARN - do you think a WARN log per shard for the same underlying error is acceptable here? Luca and I decided it would be difficult to dedupe these at the moment. It might become easier with batched query execution.
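To make the point above concrete, here is a minimal standalone sketch (hypothetical class and identifiers, not the PR code) showing that Log4j's Supplier/Throwable overload both defers building the message until the level is enabled and appends the exception's full stack trace to the log entry:

    import org.apache.logging.log4j.LogManager;
    import org.apache.logging.log4j.Logger;
    import org.apache.logging.log4j.util.Supplier;

    public class ThrowableLoggingSketch {
        private static final Logger logger = LogManager.getLogger(ThrowableLoggingSketch.class);

        public static void main(String[] args) {
            // Hypothetical node and shard identifiers, for illustration only.
            String nodeId = "node-0";
            String shardId = "[my-index][0]";
            Exception e = new IllegalStateException("shard search failed");
            // The message is only built if the WARN level is enabled.
            Supplier<String> messageSupplier = () -> String.format("[%s]%s: failed to execute search request", nodeId, shardId);
            // Passing the Throwable as the final argument makes Log4j print its stack trace.
            logger.warn(messageSupplier, e);
        }
    }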
Review comment: Ah, I see. I think that this shouldn't be debug. Maybe debug for all exceptions, but we should log WARN for things that are 5xx, indicating the failure. I think adding a "Clearing stack trace before..." message is unnecessary. Logging isn't just for our debugging, but also for users. I am not sure indicating that the trace is being removed is overly useful here.
Reply: I've updated the log message to be clearer for users, and raised the level to WARN for 5XX. Thanks for the idea there - I think it makes sense for this new message to log at the same level as rest suppressed.
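For posterity, the level policy that was settled on can be summarized with a standalone sketch (the int status and boolean flag below are stand-ins for ExceptionsHelper.status(e) and the node/shard-unavailable check; this is not the PR code): failures below 500 and unavailable-shard failures are logged at DEBUG, while 5xx failures are logged at WARN, in line with how REST-layer suppressed-error logging treats them.

    import org.apache.logging.log4j.LogManager;
    import org.apache.logging.log4j.Logger;

    public class FailureLogLevelSketch {
        private static final Logger logger = LogManager.getLogger(FailureLogLevelSketch.class);

        // httpStatus and shardUnavailable stand in for ExceptionsHelper.status(e).getStatus()
        // and ExceptionsHelper.isNodeOrShardUnavailableTypeException(e) from the PR.
        static void logShardFailure(int httpStatus, boolean shardUnavailable, Exception e) {
            if (httpStatus < 500 || shardUnavailable) {
                // Client-driven or expected failures: keep the logs quiet.
                logger.debug("failed to execute search request", e);
            } else {
                // Server-side (5xx) failures: surface them at WARN.
                logger.warn("failed to execute search request", e);
            }
        }
    }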