Commit 6c1e9f4

MSQ TimeBoundary optimization. (#19012)
Implement two timeBoundary-style optimizations for MSQ:

1) Filter base inputs to include only the earliest (for min) or latest (for max) segments.
2) Use TimeBoundaryInspector when available.

Unlike the native query path, the SQL planner still emits a groupBy query or groupBy stage. The decisions about what to optimize happen at execution time. This makes the optimization simpler, as there is no need for a special timeBoundary query type that must be capable of operating over all kinds of data. It also simplifies planning.
1 parent e3b8e7c commit 6c1e9f4

7 files changed

Lines changed: 968 additions & 10 deletions


multi-stage-query/src/main/java/org/apache/druid/msq/querykit/BaseLeafStageProcessor.java

Lines changed: 12 additions & 2 deletions
@@ -223,6 +223,15 @@ protected abstract FrameProcessor<Object> makeProcessor(
       FrameContext providerThingy
   );
 
+  /**
+   * Filters the physical input slices before they are used to create a {@link ReadableInputQueue}.
+   * Subclasses can override this to reduce the set of segments that need to be read.
+   */
+  protected List<PhysicalInputSlice> filterBaseInput(final List<PhysicalInputSlice> slices)
+  {
+    return slices;
+  }
+
   /**
    * Read base inputs, where "base" is meant in the same sense as in {@link ExecutionVertex}: the primary datasource
    * that drives query processing.
@@ -231,7 +240,7 @@ protected abstract FrameProcessor<Object> makeProcessor(
    * segments. Once {@link ReadableInputQueue#nextInput()} or {@link ReadableInputQueue#start()} is called,
    * the queue must be closed when done being used.
    */
-  private static ReadableInputQueue makeBaseInputQueue(
+  private ReadableInputQueue makeBaseInputQueue(
       final List<InputSlice> inputSlices,
       final ExecutionContext context
   )
@@ -252,12 +261,13 @@ private static ReadableInputQueue makeBaseInputQueue(
       }
     }
 
+    final List<PhysicalInputSlice> filteredSlices = filterBaseInput(physicalInputSlices);
     final Integer segmentLoadAheadCount =
        MultiStageQueryContext.getSegmentLoadAheadCount(context.workOrder().getWorkerContext());
     return new ReadableInputQueue(
         stageDef.getId().getQueryId(),
         new StandardPartitionReader(context),
-        physicalInputSlices,
+        filteredSlices,
         segmentLoadAheadCount != null ? segmentLoadAheadCount : context.threadCount()
     );
   }

multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleFrameProcessor.java

Lines changed: 16 additions & 8 deletions
@@ -184,14 +184,22 @@ protected ReturnOrAwait<Unit> runWithSegment(final SegmentReferenceHolder segmen
         closer.register(() -> segmentHolder.getInputCounters().addFile(rowCount, 0));
       }
 
-      final Sequence<ResultRow> rowSequence =
-          groupingEngine.process(
-              query.withQuerySegmentSpec(new SpecificSegmentSpec(segmentHolder.getDescriptor())),
-              Objects.requireNonNull(segment.as(CursorFactory.class)),
-              segment.as(TimeBoundaryInspector.class),
-              bufferPool,
-              null
-          );
+      final TimeBoundaryInspector tbi = segment.as(TimeBoundaryInspector.class);
+
+      final Sequence<ResultRow> rowSequence;
+      if (GroupByTimeBoundaryUtils.canUseTimeBoundaryInspector(query, tbi, segmentHolder.getDescriptor())) {
+        // Resolve this query using the TimeBoundaryInspector, no need for a cursor.
+        rowSequence = Sequences.simple(List.of(GroupByTimeBoundaryUtils.computeTimeBoundaryResult(query, tbi)));
+      } else {
+        // Resolve this query using a cursor.
+        rowSequence = groupingEngine.process(
+            query.withQuerySegmentSpec(new SpecificSegmentSpec(segmentHolder.getDescriptor())),
+            Objects.requireNonNull(segment.as(CursorFactory.class)),
+            tbi,
+            bufferPool,
+            null
+        );
+      }
 
       resultYielder = Yielders.each(rowSequence);
     }

multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByPreShuffleStageProcessor.java

Lines changed: 63 additions & 0 deletions
@@ -28,10 +28,16 @@
 import org.apache.druid.frame.processor.FrameProcessor;
 import org.apache.druid.frame.write.FrameWriterFactory;
 import org.apache.druid.msq.exec.FrameContext;
+import org.apache.druid.msq.input.LoadableSegment;
+import org.apache.druid.msq.input.PhysicalInputSlice;
 import org.apache.druid.msq.querykit.BaseLeafStageProcessor;
 import org.apache.druid.msq.querykit.ReadableInput;
 import org.apache.druid.query.groupby.GroupByQuery;
 import org.apache.druid.segment.SegmentMapFunction;
+import org.joda.time.Interval;
+
+import java.util.ArrayList;
+import java.util.List;
 
 @JsonTypeName("groupByPreShuffle")
 public class GroupByPreShuffleStageProcessor extends BaseLeafStageProcessor
@@ -71,6 +77,63 @@ protected FrameProcessor<Object> makeProcessor(
     );
   }
 
+  @Override
+  protected List<PhysicalInputSlice> filterBaseInput(final List<PhysicalInputSlice> slices)
+  {
+    if (!GroupByTimeBoundaryUtils.isTimeBoundaryQuery(query)) {
+      return slices;
+    }
+
+    // This is a time-boundary style query (see GroupByTimeBoundaryUtils.isTimeBoundaryQuery).
+    // This means we can look at just the earliest (for min) and latest (for max) segments,
+    // ignoring the ones in the middle.
+    final boolean needsMin = GroupByTimeBoundaryUtils.needsMinTime(query);
+    final boolean needsMax = GroupByTimeBoundaryUtils.needsMaxTime(query);
+
+    final List<PhysicalInputSlice> filteredSlices = new ArrayList<>(slices.size());
+
+    for (final PhysicalInputSlice slice : slices) {
+      final List<LoadableSegment> segments = slice.getLoadableSegments();
+
+      if (segments.size() <= 1) {
+        filteredSlices.add(slice);
+        continue;
+      }
+
+      // Find the earliest and latest intervals by start time.
+      Interval minInterval = null;
+      Interval maxInterval = null;
+
+      for (final LoadableSegment segment : segments) {
+        final Interval interval = segment.descriptor().getInterval();
+        if (needsMin) {
+          if (minInterval == null || interval.getStart().isBefore(minInterval.getStart())) {
+            minInterval = interval;
+          }
+        }
+        if (needsMax) {
+          if (maxInterval == null || interval.getEnd().isAfter(maxInterval.getEnd())) {
+            maxInterval = interval;
+          }
+        }
+      }
+
+      // Keep only segments whose interval overlaps with the earliest or latest interval.
+      final List<LoadableSegment> kept = new ArrayList<>();
+      for (final LoadableSegment segment : segments) {
+        final Interval segmentInterval = segment.descriptor().getInterval();
+        if ((minInterval != null && segmentInterval.overlaps(minInterval))
+            || (maxInterval != null && segmentInterval.overlaps(maxInterval))) {
+          kept.add(segment);
+        }
+      }
+
+      filteredSlices.add(new PhysicalInputSlice(slice.getReadablePartitions(), kept, slice.getQueryableServers()));
+    }
+
+    return filteredSlices;
+  }
+
   @Override
   public boolean usesProcessingBuffers()
   {
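
To make the pruning rule above concrete, here is a minimal standalone sketch, not part of the commit: it applies the same earliest-start / latest-end overlap test to plain interval values. The class name, the prune method, and the example intervals are invented for illustration; only Joda-Time and Druid's Intervals utility are assumed.

import org.apache.druid.java.util.common.Intervals;
import org.joda.time.Interval;

import java.util.ArrayList;
import java.util.List;

// Illustration of the filterBaseInput pruning rule: keep only intervals that overlap the
// earliest-starting interval (when MIN(__time) is needed) or the latest-ending interval
// (when MAX(__time) is needed).
public class TimeBoundaryPruneSketch
{
  static List<Interval> prune(final List<Interval> intervals, final boolean needsMin, final boolean needsMax)
  {
    Interval minInterval = null;
    Interval maxInterval = null;

    // Find the earliest-starting and latest-ending intervals.
    for (final Interval interval : intervals) {
      if (needsMin && (minInterval == null || interval.getStart().isBefore(minInterval.getStart()))) {
        minInterval = interval;
      }
      if (needsMax && (maxInterval == null || interval.getEnd().isAfter(maxInterval.getEnd()))) {
        maxInterval = interval;
      }
    }

    // Keep only intervals that overlap one of those two.
    final List<Interval> kept = new ArrayList<>();
    for (final Interval interval : intervals) {
      if ((minInterval != null && interval.overlaps(minInterval))
          || (maxInterval != null && interval.overlaps(maxInterval))) {
        kept.add(interval);
      }
    }
    return kept;
  }

  public static void main(String[] args)
  {
    final List<Interval> segments = List.of(
        Intervals.of("2000-01-01/2000-02-01"),
        Intervals.of("2000-02-01/2000-03-01"),
        Intervals.of("2000-03-01/2000-04-01")
    );

    // With both MIN(__time) and MAX(__time) requested, only the first and last intervals survive.
    System.out.println(prune(segments, true, true));
  }
}

With both MIN(__time) and MAX(__time) requested, the middle segment is dropped; with only one of them requested, only the corresponding end of the timeline is kept.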

multi-stage-query/src/main/java/org/apache/druid/msq/querykit/groupby/GroupByTimeBoundaryUtils.java

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.querykit.groupby;
+
+import org.apache.druid.java.util.common.granularity.Granularities;
+import org.apache.druid.query.SegmentDescriptor;
+import org.apache.druid.query.aggregation.AggregatorFactory;
+import org.apache.druid.query.aggregation.LongMaxAggregatorFactory;
+import org.apache.druid.query.aggregation.LongMinAggregatorFactory;
+import org.apache.druid.query.aggregation.SimpleLongAggregatorFactory;
+import org.apache.druid.query.groupby.GroupByQuery;
+import org.apache.druid.query.groupby.ResultRow;
+import org.apache.druid.segment.TimeBoundaryInspector;
+import org.apache.druid.segment.column.ColumnHolder;
+
+import javax.annotation.Nullable;
+import java.util.List;
+
+/**
+ * Utility methods for detecting and optimizing GroupBy queries that are effectively time boundary queries:
+ * no dimensions, {@link Granularities#ALL}, and only {@code MIN(__time)} / {@code MAX(__time)} aggregators.
+ */
+public class GroupByTimeBoundaryUtils
+{
+  /**
+   * Returns true if the query is a "time boundary" GroupBy: no dimensions, {@link Granularities#ALL},
+   * no filter, at least one aggregator, and every aggregator is {@link LongMinAggregatorFactory} or
+   * {@link LongMaxAggregatorFactory} on {@link ColumnHolder#TIME_COLUMN_NAME}.
+   */
+  public static boolean isTimeBoundaryQuery(final GroupByQuery query)
+  {
+    if (!query.getDimensions().isEmpty()) {
+      return false;
+    }
+
+    if (!Granularities.ALL.equals(query.getGranularity())) {
+      return false;
+    }
+
+    if (query.getDimFilter() != null) {
+      return false;
+    }
+
+    final List<AggregatorFactory> aggregatorSpecs = query.getAggregatorSpecs();
+
+    if (aggregatorSpecs.isEmpty()) {
+      return false;
+    }
+
+    for (final AggregatorFactory agg : aggregatorSpecs) {
+      if (!isTimeBoundaryAggregator(agg)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Returns true if the query needs the minimum time (has at least one {@link LongMinAggregatorFactory}
+   * on {@link ColumnHolder#TIME_COLUMN_NAME}).
+   */
+  public static boolean needsMinTime(final GroupByQuery query)
+  {
+    for (final AggregatorFactory agg : query.getAggregatorSpecs()) {
+      if (isTimeBoundaryAggregator(agg) && agg instanceof LongMinAggregatorFactory) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Returns true if the query needs the maximum time (has at least one {@link LongMaxAggregatorFactory}
+   * on {@link ColumnHolder#TIME_COLUMN_NAME}).
+   */
+  public static boolean needsMaxTime(final GroupByQuery query)
+  {
+    for (final AggregatorFactory agg : query.getAggregatorSpecs()) {
+      if (isTimeBoundaryAggregator(agg) && agg instanceof LongMaxAggregatorFactory) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Returns true if the {@link TimeBoundaryInspector} can be used to answer the query without scanning data.
+   * Requires that the query is a time boundary query, the inspector is non-null and exact, and that the
+   * descriptor's interval fully contains the inspector's min/max interval.
+   */
+  public static boolean canUseTimeBoundaryInspector(
+      final GroupByQuery query,
+      @Nullable final TimeBoundaryInspector tbi,
+      final SegmentDescriptor descriptor
+  )
+  {
+    return isTimeBoundaryQuery(query)
+           && tbi != null
+           && tbi.isMinMaxExact()
+           && descriptor.getInterval().contains(tbi.getMinMaxInterval());
+  }
+
+  /**
+   * Constructs a {@link ResultRow} from the time boundary inspector, filling each aggregator position
+   * with the appropriate min or max time.
+   */
+  public static ResultRow computeTimeBoundaryResult(final GroupByQuery query, final TimeBoundaryInspector tbi)
+  {
+    final int size = query.getResultRowSizeWithoutPostAggregators();
+    final ResultRow row = ResultRow.create(size);
+    final int aggStart = query.getResultRowAggregatorStart();
+    final List<AggregatorFactory> aggregatorSpecs = query.getAggregatorSpecs();
+
+    for (int i = 0; i < aggregatorSpecs.size(); i++) {
+      final AggregatorFactory agg = aggregatorSpecs.get(i);
+
+      if (agg instanceof LongMinAggregatorFactory) {
+        row.set(aggStart + i, tbi.getMinTime().getMillis());
+      } else if (agg instanceof LongMaxAggregatorFactory) {
+        row.set(aggStart + i, tbi.getMaxTime().getMillis());
+      }
+    }
+
+    return row;
+  }
+
+  private static boolean isTimeBoundaryAggregator(final AggregatorFactory agg)
+  {
+    return (agg instanceof LongMinAggregatorFactory || agg instanceof LongMaxAggregatorFactory)
+           && ColumnHolder.TIME_COLUMN_NAME.equals(((SimpleLongAggregatorFactory) agg).getFieldName());
+  }
+}
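
A minimal usage sketch for these helpers, not part of the commit: it builds a groupBy of the qualifying shape (no dimensions, ALL granularity, no filter, only min/max aggregators on __time) and runs the detection checks. It assumes the usual GroupByQuery.Builder setters; the class name, datasource name, and aggregator output names are arbitrary.

import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.msq.querykit.groupby.GroupByTimeBoundaryUtils;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.LongMaxAggregatorFactory;
import org.apache.druid.query.aggregation.LongMinAggregatorFactory;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.segment.column.ColumnHolder;

import java.util.List;

public class TimeBoundaryDetectionSketch
{
  public static void main(String[] args)
  {
    // The same shape that "SELECT MIN(__time), MAX(__time) FROM foo" plans to:
    // only min/max aggregators on the __time column.
    final List<AggregatorFactory> aggregators = List.of(
        new LongMinAggregatorFactory("a0", ColumnHolder.TIME_COLUMN_NAME),
        new LongMaxAggregatorFactory("a1", ColumnHolder.TIME_COLUMN_NAME)
    );

    final GroupByQuery query = GroupByQuery.builder()
                                           .setDataSource("foo")
                                           .setInterval(Intervals.ETERNITY)
                                           .setGranularity(Granularities.ALL)
                                           .setAggregatorSpecs(aggregators)
                                           .build();

    // All three checks pass for this shape, so a worker may prune base inputs to the
    // earliest/latest segments and, where a segment exposes an exact TimeBoundaryInspector,
    // answer without opening a cursor.
    System.out.println(GroupByTimeBoundaryUtils.isTimeBoundaryQuery(query));
    System.out.println(GroupByTimeBoundaryUtils.needsMinTime(query));
    System.out.println(GroupByTimeBoundaryUtils.needsMaxTime(query));
  }
}

Because the checks run against the planned groupBy itself, no separate timeBoundary query type is needed; workers recognize this shape at execution time, which is the simplification the commit message describes.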

multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQSelectTest.java

Lines changed: 57 additions & 0 deletions
@@ -63,6 +63,8 @@
 import org.apache.druid.query.aggregation.CountAggregatorFactory;
 import org.apache.druid.query.aggregation.DoubleSumAggregatorFactory;
 import org.apache.druid.query.aggregation.FilteredAggregatorFactory;
+import org.apache.druid.query.aggregation.LongMaxAggregatorFactory;
+import org.apache.druid.query.aggregation.LongMinAggregatorFactory;
 import org.apache.druid.query.aggregation.cardinality.CardinalityAggregatorFactory;
 import org.apache.druid.query.aggregation.post.ArithmeticPostAggregator;
 import org.apache.druid.query.aggregation.post.FieldAccessPostAggregator;
@@ -75,6 +77,7 @@
 import org.apache.druid.query.ordering.StringComparators;
 import org.apache.druid.query.policy.Policy;
 import org.apache.druid.query.scan.ScanQuery;
+import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.RowSignature;
 import org.apache.druid.segment.join.JoinType;
@@ -2954,4 +2957,58 @@ public boolean isPageSizeLimited(String contextName)
   {
     return QUERY_RESULTS_WITH_DURABLE_STORAGE.equals(contextName);
   }
+
+  @MethodSource("data")
+  @ParameterizedTest(name = "{index}:with context {0}")
+  public void testTimeBoundaryGroupBy(String contextName, Map<String, Object> context)
+  {
+    final RowSignature rowSignature = RowSignature.builder()
+                                                  .add("EXPR$0", ColumnType.LONG)
+                                                  .add("EXPR$1", ColumnType.LONG)
+                                                  .build();
+
+    testSelectQuery()
+        .setSql("SELECT MIN(__time), MAX(__time) FROM foo")
+        .setExpectedMSQSpec(
+            LegacyMSQSpec.builder()
+                         .query(
+                             GroupByQuery.builder()
+                                         .setDataSource(CalciteTests.DATASOURCE1)
+                                         .setInterval(querySegmentSpec(Filtration.eternity()))
+                                         .setGranularity(Granularities.ALL)
+                                         .setAggregatorSpecs(
+                                             aggregators(
+                                                 new LongMinAggregatorFactory("a0", ColumnHolder.TIME_COLUMN_NAME),
+                                                 new LongMaxAggregatorFactory("a1", ColumnHolder.TIME_COLUMN_NAME)
+                                             )
+                                         )
+                                         .setContext(context)
+                                         .build()
+                         )
+                         .columnMappings(
+                             new ColumnMappings(
+                                 ImmutableList.of(
+                                     new ColumnMapping("a0", "EXPR$0"),
+                                     new ColumnMapping("a1", "EXPR$1")
+                                 )
+                             )
+                         )
+                         .tuningConfig(MSQTuningConfig.defaultConfig())
+                         .destination(isDurableStorageDestination(contextName, context)
+                                      ? DurableStorageMSQDestination.INSTANCE
+                                      : TaskReportMSQDestination.INSTANCE)
+                         .build()
+        )
+        .setExpectedRowSignature(rowSignature)
+        .setQueryContext(context)
+        .setExpectedResultRows(
+            ImmutableList.of(
+                new Object[]{
+                    DateTimes.of("2000-01-01").getMillis(),
+                    DateTimes.of("2001-01-03").getMillis()
+                }
+            )
+        )
+        .verifyResults();
+  }
 }
