
Commit 18c4566

Merge remote-tracking branch 'upstream/master' into processing-junit5-pt7

2 parents: 483d721 + b442161

50 files changed: 1497 additions & 198 deletions


docs/configuration/index.md

Lines changed: 2 additions & 1 deletion
@@ -749,6 +749,7 @@ These Coordinator static configurations can be defined in the `coordinator/runti
 |`druid.coordinator.startDelay`|The operation of the Coordinator works on the assumption that it has an up-to-date view of the state of the world when it runs, the current ZooKeeper interaction code, however, is written in a way that doesn’t allow the Coordinator to know for a fact that it’s done loading the current state of the world. This delay is a hack to give it enough time to believe that it has all the data.|`PT300S`|
 |`druid.coordinator.load.timeout`|The timeout duration for when the Coordinator assigns a segment to a Historical service.|`PT15M`|
 |`druid.coordinator.balancer.strategy`|The [balancing strategy](../design/coordinator.md#balancing-segments-in-a-tier) used by the Coordinator to distribute segments among the Historical servers in a tier. The `cost` strategy distributes segments by minimizing a cost function, `diskNormalized` weights these costs with the disk usage ratios of the servers and `random` distributes segments randomly.|`cost`|
+|`druid.coordinator.balancer.diskNormalized.moveCostSavingsThreshold`|Only used when `druid.coordinator.balancer.strategy` is `diskNormalized`. Minimum fractional cost reduction required before a segment is moved off a server that already holds it. A value of `0.05` requires the destination to be at least 5% cheaper than the source, which prevents oscillation between servers with similar disk utilization. Must be in `[0.0, 1.0)`; `0.0` disables the anti-oscillation discount.|`0.05`|
 |`druid.coordinator.loadqueuepeon.http.repeatDelay`|The start and repeat delay (in milliseconds) for the load queue peon, which manages the load/drop queue of segments for any server.|1 minute|
 |`druid.coordinator.loadqueuepeon.http.batchSize`|Number of segment load/drop requests to batch in one HTTP request. Note that it must be smaller than or equal to the `druid.segmentCache.numLoadingThreads` config on Historical service. If this value is not configured, the coordinator uses the value of the `numLoadingThreads` for the respective server.|`druid.segmentCache.numLoadingThreads`|
 |`druid.coordinator.asOverlord.enabled`|Boolean value for whether this Coordinator service should act like an Overlord as well. This configuration allows users to simplify a Druid cluster by not having to deploy any standalone Overlord services. If set to true, then Overlord console is available at `http://coordinator-host:port/console.html` and be sure to set `druid.coordinator.asOverlord.overlordService` also.|false|
@@ -2023,7 +2024,7 @@ log4j config to route these logs to different sources based on the feed of the e
 |`druid.emitter.logging.loggerClass`|The class used for logging.|`org.apache.druid.java.util.emitter.core.LoggingEmitter`|
 |`druid.emitter.logging.logLevel`|Choices: debug, info, warn, error. The log level at which messages are logged.|info|
 |`druid.emitter.logging.shouldFilterMetrics`|When true, only metrics listed in the allow list are emitted; non-metric events (e.g. alerts) are always emitted. When false, all events are logged (backward-compatible).|false|
-|`druid.emitter.logging.allowedMetricsPath`|Path to a JSON file whose keys are the allowed metric names. Only used when `shouldFilterMetrics` is true. If null or empty, the bundled classpath resource `defaultMetrics.json` is used. If a path is set but the file is missing, a warning is logged and the emitter falls back to the default classpath resource.|null|
+|`druid.emitter.logging.allowedMetricsPath`|Path to a JSON file whose keys are the allowed metric names. Only used when `shouldFilterMetrics` is true. If null or empty, the bundled classpath resource `loggingEmitterAllowedMetrics.json` is used. If a path is set but the file is missing, a warning is logged and the emitter falls back to the default classpath resource.|null|

 #### HTTP emitter module

docs/design/coordinator.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ But in a tier with several Historicals (or a low replication factor), segment re
 Thus, the Coordinator constantly monitors the set of segments present on each Historical in a tier and employs one of the following strategies to identify segments that may be moved from one Historical to another to retain balance.

 - `cost` (default): For a given segment in a tier, this strategy picks the server with the minimum "cost" of placing that segment. The cost is a function of the data interval of the segment and the data intervals of all the segments already present on the candidate server. In essence, this strategy tries to avoid placing segments with adjacent or overlapping data intervals on the same server. This is based on the premise that adjacent-interval segments are more likely to be used together in a query and placing them on the same server may lead to skewed CPU usages of Historicals.
-- `diskNormalized`: A derivative of the `cost` strategy that weights the cost of placing a segment on a server with the disk usage ratio of the server. There are known issues with this strategy and is not recommended for a production cluster.
+- `diskNormalized`: A derivative of the `cost` strategy that multiplies the cost of placing a segment on a server by the server's disk usage ratio (`diskUsed / maxSize`). This penalizes fuller servers and drives disk utilization to equalize across the tier, which is useful when historicals within a tier hold segments of widely varying sizes. To prevent oscillation when servers have similar utilization, a segment that is already placed on a server receives a cost discount; a move only fires when the destination saves at least `druid.coordinator.balancer.diskNormalized.moveCostSavingsThreshold` (default `0.05`, i.e. 5%) of the source's cost.
 - `random`: Distributes segments randomly across servers. This is an experimental strategy and is not recommended for a production cluster.

 All of the above strategies prioritize moving segments from the Historical with the least available disk space.
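The weighted-cost arithmetic and the anti-oscillation threshold described above can be sketched as follows. This is a simplified illustration of the documented behavior, not Druid's actual implementation; the method names are hypothetical.

```java
// Hypothetical sketch of the diskNormalized move decision: placement cost is
// weighted by the server's disk usage ratio, and a move only fires when the
// destination's weighted cost undercuts the source's by more than the
// configured savings threshold (default 0.05, i.e. 5%).
public class DiskNormalizedMoveSketch
{
  static double weightedCost(double rawCost, double diskUsed, double maxSize)
  {
    return rawCost * (diskUsed / maxSize);
  }

  static boolean shouldMove(double sourceCost, double destinationCost, double savingsThreshold)
  {
    // Destination must be cheaper than the source by at least the threshold
    // fraction of the source's cost; otherwise the segment stays put.
    return destinationCost < sourceCost * (1.0 - savingsThreshold);
  }

  public static void main(String[] args)
  {
    double source = weightedCost(10.0, 900.0, 1000.0);      // 9.0 on a 90%-full server
    double destination = weightedCost(10.0, 870.0, 1000.0); // 8.7 on an 87%-full server
    // 8.7 is only ~3.3% cheaper than 9.0, below the 5% threshold: no move.
    System.out.println(shouldMove(source, destination, 0.05)); // false
    destination = weightedCost(10.0, 500.0, 1000.0);         // 5.0 on a half-full server
    System.out.println(shouldMove(source, destination, 0.05)); // true
  }
}
```

With the threshold at `0.0`, the second server (8.7 vs 9.0) would have triggered a move, and the segment could ping-pong back once the utilizations crossed again; the 5% discount is what damps that oscillation.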

docs/ingestion/kafka-ingestion.md

Lines changed: 2 additions & 1 deletion
@@ -235,7 +235,8 @@ The following example shows a supervisor spec with idle configuration enabled:
 "enableTaskAutoScaler": true,
 "taskCountMax": 6,
 "taskCountMin": 2,
-"minTriggerScaleActionFrequencyMillis": 600000,
+"minScaleUpDelay": "PT10M",
+"minScaleDownDelay": "PT10M",
 "autoScalerStrategy": "lagBased",
 "lagCollectionIntervalMillis": 30000,
 "lagCollectionRangeMillis": 600000,
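The `"PT10M"` values in the updated spec are ISO-8601 duration strings; a quick sketch confirming they denote the same span as the old millisecond default of `600000`:

```java
import java.time.Duration;

// The new delay fields take ISO-8601 durations. "PT10M" is exactly the span
// the replaced minTriggerScaleActionFrequencyMillis expressed as 600000 ms.
public class DelayEquivalence
{
  public static void main(String[] args)
  {
    Duration scaleUpDelay = Duration.parse("PT10M");
    System.out.println(scaleUpDelay.toMillis()); // 600000
  }
}
```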

docs/ingestion/supervisor.md

Lines changed: 11 additions & 6 deletions
@@ -79,7 +79,9 @@ The following table outlines the configuration properties for `autoScalerConfig`
 |`taskCountMax`|The maximum number of ingestion tasks. Must be greater than or equal to `taskCountMin`. If `taskCountMax` is greater than the number of Kafka partitions or Kinesis shards, Druid sets the maximum number of reading tasks to the number of Kafka partitions or Kinesis shards and ignores `taskCountMax`.|Yes||
 |`taskCountMin`|The minimum number of ingestion tasks. When you enable the autoscaler, Druid computes the initial number of tasks to launch by checking the configs in the following order: `taskCountStart`, then `taskCount` (in `ioConfig`), then `taskCountMin`.|Yes||
 |`taskCountStart`|Optional config to specify the number of ingestion tasks to start with. When you enable the autoscaler, Druid computes the initial number of tasks to launch by checking the configs in the following order: `taskCountStart`, then `taskCount` (in `ioConfig`), then `taskCountMin`.|No|`taskCount` or `taskCountMin`|
-|`minTriggerScaleActionFrequencyMillis`|The minimum time interval between two scale actions.|No|600000|
+|`minScaleUpDelay`|Minimum cooldown duration between scale-up actions, specified as an ISO-8601 duration string. Falls back to `minTriggerScaleActionFrequencyMillis` if not set.|No||
+|`minScaleDownDelay`|Minimum cooldown duration between scale-down actions, specified as an ISO-8601 duration string. Falls back to `minTriggerScaleActionFrequencyMillis` if not set.|No||
+|`minTriggerScaleActionFrequencyMillis`|**Deprecated.** Use `minScaleUpDelay` and `minScaleDownDelay` instead. Minimum time interval in milliseconds between scale actions, used as the fallback when the Duration-based fields are not set.|No|600000|
 |`autoScalerStrategy`|The algorithm of autoscaler. Druid only supports the `lagBased` strategy. See [Autoscaler strategy](#autoscaler-strategy) for more information.|No|`lagBased`|
 |`stopTaskCountRatio`|A variable version of `ioConfig.stopTaskCount` with a valid range of (0.0, 1.0]. Allows the maximum number of stoppable tasks in steady state to be proportional to the number of tasks currently running.|No||
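The fallback behavior described in the table, where the deprecated millisecond field backstops the new Duration-based fields, can be sketched like this (hypothetical method name, not Druid's actual code):

```java
import java.time.Duration;

// Sketch of the documented fallback: use the ISO-8601 delay when configured,
// otherwise fall back to the deprecated minTriggerScaleActionFrequencyMillis.
public class CooldownFallbackSketch
{
  static Duration resolveDelay(Duration configured, long fallbackMillis)
  {
    return configured != null ? configured : Duration.ofMillis(fallbackMillis);
  }

  public static void main(String[] args)
  {
    // Explicit minScaleUpDelay wins over the legacy field.
    System.out.println(resolveDelay(Duration.parse("PT20M"), 600000L)); // PT20M
    // Unset: fall back to the legacy default of 600000 ms (10 minutes).
    System.out.println(resolveDelay(null, 600000L)); // PT10M
  }
}
```

This is why existing specs that only set `minTriggerScaleActionFrequencyMillis` keep working unchanged after the upgrade.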

@@ -161,7 +163,8 @@ The following example shows a supervisor spec with `lagBased` autoscaler:
 "enableTaskAutoScaler": true,
 "taskCountMax": 6,
 "taskCountMin": 2,
-"minTriggerScaleActionFrequencyMillis": 600000,
+"minScaleUpDelay": "PT10M",
+"minScaleDownDelay": "PT10M",
 "autoScalerStrategy": "lagBased",
 "lagCollectionIntervalMillis": 30000,
 "lagCollectionRangeMillis": 600000,
@@ -210,10 +213,11 @@ The following table outlines the configuration properties related to the `costBa
 |`idleWeight`|The weight of extracted poll idle value in cost function.|No|0.75|
 |`useTaskCountBoundaries`|Enables the bounded partitions-per-task window when selecting task counts.|No|`false`|
 |`highLagThreshold`|Average partition lag threshold that triggers burst scale-up when set to a value greater than `0`. Set to a negative value to disable burst scale-up.|No|-1|
-|`minScaleDownDelay`|Minimum duration between successful scale actions, specified as an ISO-8601 duration string.|No|`PT30M`|
+|`minScaleUpDelay`|Minimum cooldown duration after a scale-up action before the next scale-up is allowed, specified as an ISO-8601 duration string.|No||
+|`minScaleDownDelay`|Minimum cooldown duration after a scale-down action before the next scale-down is allowed, specified as an ISO-8601 duration string.|No|`PT30M`|
 |`scaleDownDuringTaskRolloverOnly`|Indicates whether task scaling down is limited to periods during task rollovers only.|No|`false`|

-The following example shows a supervisor spec with `lagBased` autoscaler:
+The following example shows a supervisor spec with `costBased` autoscaler:

 <details>
 <summary>Click to view the example</summary>
@@ -227,9 +231,10 @@ The following example shows a supervisor spec with `lagBased` autoscaler:
 "autoScalerStrategy": "costBased",
 "taskCountMin": 1,
 "taskCountMax": 10,
-"minTriggerScaleActionFrequencyMillis": 600000,
+"minScaleUpDelay": "PT10M",
+"minScaleDownDelay": "PT30M",
 "lagWeight": 0.1,
-"idleWeight": 0.9,
+"idleWeight": 0.9
 }
 }
 }

extensions-contrib/prometheus-emitter/src/main/resources/defaultMetrics.json

Lines changed: 1 addition & 0 deletions
@@ -97,6 +97,7 @@
 "kafka/consumer/recordsPerRequestAvg" : { "dimensions" : ["topic"], "type" : "gauge", "help": "Average records per fetch request as seen by the consumer of a Kafka indexing task (per topic)."},
 "kafka/consumer/outgoingBytes" : { "dimensions" : ["node_id"], "type" : "count", "help": "Bytes sent to Kafka brokers by the consumer of a Kafka indexing task (per node)."},
 "kafka/consumer/incomingBytes" : { "dimensions" : ["node_id"], "type" : "count", "help": "Bytes received from Kafka brokers by the consumer of a Kafka indexing task (per node)."},
+"kafka/consumer/pollIdleRatio" : { "dimensions" : [], "type" : "gauge", "help": "Average fraction of time the consumer of a Kafka indexing task spent idle (not in poll). 0 means never idle, 1 means always idle."},

 "ingest/count" : { "dimensions" : ["dataSource", "taskType"], "type" : "count", "help": "Count of 1 every time an ingestion job runs (includes compaction jobs). Aggregate using dimensions." },
 "ingest/segments/count" : { "dimensions" : ["dataSource", "taskType"], "type" : "count", "help": "Count of final segments created by job (includes tombstones)." },
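For intuition on the new gauge's range as stated in its help text, the ratio is just idle time over total observed time (a hypothetical helper, not the Kafka client's own computation):

```java
// Sketch of the pollIdleRatio semantics: fraction of the observation window
// the consumer spent idle. 0 means never idle, 1 means always idle.
public class PollIdleRatioSketch
{
  static double pollIdleRatio(long idleNanos, long totalNanos)
  {
    return (double) idleNanos / totalNanos;
  }

  public static void main(String[] args)
  {
    // Idle for 750 ms out of a 1 s window: ratio 0.75.
    System.out.println(pollIdleRatio(750_000_000L, 1_000_000_000L)); // 0.75
  }
}
```

A ratio near 1 suggests the task is starved for records (or over-provisioned), while a ratio near 0 suggests the consumer is saturated.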

extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaConsumerMonitor.java

Lines changed: 6 additions & 0 deletions
@@ -127,6 +127,12 @@ public class KafkaConsumerMonitor extends AbstractMonitor
         "kafka/consumer/incomingBytes",
         Set.of(CLIENT_ID_TAG, NODE_ID_TAG),
         KafkaConsumerMetric.MetricType.COUNTER
+    ),
+    new KafkaConsumerMetric(
+        POLL_IDLE_RATIO_METRIC_NAME,
+        "kafka/consumer/pollIdleRatio",
+        Set.of(CLIENT_ID_TAG),
+        KafkaConsumerMetric.MetricType.GAUGE
     )
 ).collect(Collectors.toMap(KafkaConsumerMetric::getKafkaMetricName, Function.identity()));

extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/KafkaRecordSupplierTest.java

Lines changed: 1 addition & 0 deletions
@@ -484,6 +484,7 @@ public void testPoll() throws InterruptedException, ExecutionException
 emitter.verifyEmitted("kafka/consumer/recordsPerRequestAvg", 1);
 emitter.verifyEmitted("kafka/consumer/incomingBytes", 2);
 emitter.verifyEmitted("kafka/consumer/outgoingBytes", 2);
+emitter.verifyEmitted("kafka/consumer/pollIdleRatio", 1);

 recordSupplier.close();
 Assert.assertFalse(monitor.monitor(emitter));

extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorIOConfigTest.java

Lines changed: 2 additions & 0 deletions
@@ -315,6 +315,8 @@ public void testAutoScalingConfigSerde() throws JsonProcessingException
 autoScalerConfig.put("scaleInStep", 1);
 autoScalerConfig.put("scaleOutStep", 2);
 autoScalerConfig.put("minTriggerScaleActionFrequencyMillis", 1200000);
+autoScalerConfig.put("minScaleUpDelay", "PT20M");
+autoScalerConfig.put("minScaleDownDelay", "PT20M");

 final Map<String, Object> consumerProperties = KafkaConsumerConfigs.getConsumerProperties();
 consumerProperties.put("bootstrap.servers", "localhost:8082");

indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java

Lines changed: 9 additions & 5 deletions
@@ -214,7 +214,8 @@ public Response specGetAll(
 Set<String> authorizedSupervisorIds = filterAuthorizedSupervisorIds(
     req,
     manager,
-    manager.getSupervisorIds()
+    manager.getSupervisorIds(),
+    AuthorizationUtils.DATASOURCE_READ_RA_GENERATOR
 );
 final boolean includeFull = full != null;
 final boolean includeState = state != null && state;
@@ -509,7 +510,8 @@ public Response terminateAll(@Context final HttpServletRequest req)
 Set<String> authorizedSupervisorIds = filterAuthorizedSupervisorIds(
     req,
     manager,
-    manager.getSupervisorIds()
+    manager.getSupervisorIds(),
+    AuthorizationUtils.DATASOURCE_WRITE_RA_GENERATOR
 );

 for (final String supervisorId : authorizedSupervisorIds) {
@@ -652,15 +654,16 @@ private Response asLeaderWithSupervisorManager(Function<SupervisorManager, Respo
 private Set<String> filterAuthorizedSupervisorIds(
     final HttpServletRequest req,
     SupervisorManager manager,
-    Collection<String> supervisorIds
+    Collection<String> supervisorIds,
+    Function<String, ResourceAction> authorizationFn
 )
 {
   Function<String, Iterable<ResourceAction>> raGenerator = supervisorId -> {
     Optional<SupervisorSpec> supervisorSpecOptional = manager.getSupervisorSpec(supervisorId);
     if (supervisorSpecOptional.isPresent()) {
       return Iterables.transform(
           supervisorSpecOptional.get().getDataSources(),
-          AuthorizationUtils.DATASOURCE_WRITE_RA_GENERATOR
+          authorizationFn
       );
     } else {
       return null;
@@ -710,7 +713,8 @@ private Response suspendOrResumeAll(final HttpServletRequest req, final boolean
 Set<String> authorizedSupervisorIds = filterAuthorizedSupervisorIds(
     req,
     manager,
-    manager.getSupervisorIds()
+    manager.getSupervisorIds(),
+    AuthorizationUtils.DATASOURCE_WRITE_RA_GENERATOR
 );

 for (final String supervisorId : authorizedSupervisorIds) {
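The pattern in this change, parameterizing the authorization filter by a resource-action generator so read-only endpoints check READ while mutating endpoints keep WRITE, can be sketched in miniature with hypothetical stand-in types (the real code uses Druid's `AuthorizationUtils` generators and `SupervisorSpec`):

```java
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

// Minimal sketch: the required action is no longer hard-coded to WRITE but
// supplied by the caller, mirroring the new authorizationFn parameter.
public class AuthFilterSketch
{
  enum Action { READ, WRITE }

  record ResourceAction(String dataSource, Action action) {}

  // supervisor id -> datasources it operates on (stand-in for SupervisorSpec#getDataSources)
  static final Map<String, List<String>> SUPERVISORS = Map.of(
      "sup-a", List.of("ds-a"),
      "sup-b", List.of("ds-b")
  );

  static Set<String> filterAuthorized(
      Set<ResourceAction> callerGrants,
      Function<String, ResourceAction> raGenerator
  )
  {
    return SUPERVISORS.entrySet().stream()
        .filter(e -> e.getValue().stream()
            .map(raGenerator)
            .allMatch(callerGrants::contains))
        .map(Map.Entry::getKey)
        .collect(Collectors.toSet());
  }

  public static void main(String[] args)
  {
    // Caller may read ds-a but write nothing.
    Set<ResourceAction> grants = Set.of(new ResourceAction("ds-a", Action.READ));

    // Listing supervisors now only needs READ: sup-a is visible.
    System.out.println(filterAuthorized(grants, ds -> new ResourceAction(ds, Action.READ)));
    // Terminating still needs WRITE: nothing is authorized.
    System.out.println(filterAuthorized(grants, ds -> new ResourceAction(ds, Action.WRITE)));
  }
}
```

Before this change, a caller with only read permission on a datasource could not even list its supervisor via `GET /specs`; passing `DATASOURCE_READ_RA_GENERATOR` for the read-only endpoint fixes that without loosening the write-gated endpoints.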
