|
33 | 33 | import org.apache.flink.configuration.Configuration;
|
34 | 34 | import org.apache.flink.configuration.MemorySize;
|
35 | 35 | import org.apache.flink.configuration.TaskManagerOptions;
|
| 36 | +import org.apache.flink.runtime.instance.SlotSharingGroupId; |
36 | 37 | import org.apache.flink.runtime.jobgraph.JobVertexID;
|
37 | 38 |
|
38 | 39 | import org.slf4j.Logger;
|
@@ -67,6 +68,9 @@ public class ScalingExecutor<KEY, Context extends JobAutoScalerContext<KEY>> {
|
67 | 68 | public static final String HEAP_USAGE_MESSAGE =
|
68 | 69 | "Heap Usage %s is above the allowed limit for scaling operations. Please adjust the available memory manually.";
|
69 | 70 |
|
| 71 | + public static final String RESOURCE_QUOTA_REACHED_MESSAGE = |
| 72 | + "Resource usage is above the allowed limit for scaling operations. Please adjust the resource quota manually."; |
| 73 | + |
70 | 74 | private static final Logger LOG = LoggerFactory.getLogger(ScalingExecutor.class);
|
71 | 75 |
|
72 | 76 | private final JobVertexScaler<KEY, Context> jobVertexScaler;
|
@@ -115,6 +119,17 @@ public boolean scaleResource(
|
115 | 119 | return false;
|
116 | 120 | }
|
117 | 121 |
|
| 122 | + if (resourceQuotaReached(conf, evaluatedMetrics, scalingSummaries, context)) { |
| 123 | + autoScalerEventHandler.handleEvent( |
| 124 | + context, |
| 125 | + AutoScalerEventHandler.Type.Warning, |
| 126 | + "ResourceQuotaReached", |
| 127 | + RESOURCE_QUOTA_REACHED_MESSAGE, |
| 128 | + null, |
| 129 | + conf.get(SCALING_EVENT_INTERVAL)); |
| 130 | + return false; |
| 131 | + } |
| 132 | + |
118 | 133 | updateRecommendedParallelism(evaluatedMetrics.getVertexMetrics(), scalingSummaries);
|
119 | 134 |
|
120 | 135 | if (checkIfBlockedAndTriggerScalingEvent(context, scalingSummaries, conf, now)) {
|
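The guard added above vetoes a scale-up and emits a deduplicated `Warning` event (rate-limited by `SCALING_EVENT_INTERVAL`) when the configured quota would be exceeded. As a minimal usage sketch, an assumption and not part of this diff: the option types are inferred from how `resourceQuotaReached` consumes them below (`CPU_QUOTA` as a `Double` of total CPU cores, `MEMORY_QUOTA` as a `MemorySize`); verify against the actual `AutoScalerOptions` definitions.

```java
// Hypothetical sketch, not part of this PR: enabling the quota guard.
// Option types are inferred from their usage in resourceQuotaReached.
Configuration conf = new Configuration();
conf.set(AutoScalerOptions.CPU_QUOTA, 16.0);                         // total CPU cores across all TMs
conf.set(AutoScalerOptions.MEMORY_QUOTA, MemorySize.parse("64 gb")); // total memory across all TMs
```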
@@ -199,6 +214,85 @@ protected static boolean allVerticesWithinUtilizationTarget(
|
199 | 214 | return true;
|
200 | 215 | }
|
201 | 216 |
|
| 217 | + protected static boolean resourceQuotaReached( |
| 218 | + Configuration conf, |
| 219 | + EvaluatedMetrics evaluatedMetrics, |
| 220 | + Map<JobVertexID, ScalingSummary> scalingSummaries, |
| 221 | + JobAutoScalerContext<?> ctx) { |
| 222 | + |
| 223 | + if (evaluatedMetrics.getJobTopology() == null |
| 224 | + || evaluatedMetrics.getJobTopology().getSlotSharingGroupMapping().isEmpty()) { |
| 225 | + return false; |
| 226 | + } |
| 227 | + |
| 228 | + var cpuQuota = conf.getOptional(AutoScalerOptions.CPU_QUOTA); |
| 229 | + var memoryQuota = conf.getOptional(AutoScalerOptions.MEMORY_QUOTA); |
| 230 | + var tmMemory = ctx.getTaskManagerMemory(); |
| 231 | + var tmCpu = ctx.getTaskManagerCpu(); |
| 232 | + |
| 233 | + if (cpuQuota.isPresent() || memoryQuota.isPresent()) { |
| 234 | + var currentSlotSharingGroupMaxParallelisms = new HashMap<SlotSharingGroupId, Integer>(); |
| 235 | + var newSlotSharingGroupMaxParallelisms = new HashMap<SlotSharingGroupId, Integer>(); |
| 236 | + for (var e : |
| 237 | + evaluatedMetrics.getJobTopology().getSlotSharingGroupMapping().entrySet()) { |
| 238 | + int currentMaxParallelism = |
| 239 | + e.getValue().stream() |
| 240 | + .filter(scalingSummaries::containsKey) |
| 241 | + .mapToInt(v -> scalingSummaries.get(v).getCurrentParallelism()) |
| 242 | + .max() |
| 243 | + .orElse(0); |
| 244 | + currentSlotSharingGroupMaxParallelisms.put(e.getKey(), currentMaxParallelism); |
| 245 | + int newMaxParallelism = |
| 246 | + e.getValue().stream() |
| 247 | + .filter(scalingSummaries::containsKey) |
| 248 | + .mapToInt(v -> scalingSummaries.get(v).getNewParallelism()) |
| 249 | + .max() |
| 250 | + .orElse(0); |
| 251 | + newSlotSharingGroupMaxParallelisms.put(e.getKey(), newMaxParallelism); |
| 252 | + } |
| 253 | + |
| 254 | + var numSlotsPerTm = conf.get(TaskManagerOptions.NUM_TASK_SLOTS); |
| 255 | + var currentTotalSlots = |
| 256 | + currentSlotSharingGroupMaxParallelisms.values().stream() |
| 257 | + .mapToInt(Integer::intValue) |
| 258 | + .sum(); |
| 259 | + var currentNumTms = (currentTotalSlots + numSlotsPerTm - 1) / numSlotsPerTm; // round up: a partly filled TM still counts |
| 260 | + var newTotalSlots = |
| 261 | + newSlotSharingGroupMaxParallelisms.values().stream() |
| 262 | + .mapToInt(Integer::intValue) |
| 263 | + .sum(); |
| 264 | + var newNumTms = (newTotalSlots + numSlotsPerTm - 1) / numSlotsPerTm; // round up |
| 265 | + |
| 266 | + if (newNumTms <= currentNumTms) { |
| 267 | + LOG.debug( |
| 268 | + "Skipping quota check: the new resource allocation is not larger than the current one"); |
| 269 | + return false; |
| 270 | + } |
| 271 | + |
| 272 | + if (cpuQuota.isPresent() && tmCpu.isPresent()) { |
| 273 | + LOG.debug("CPU resource quota is {}, checking limits", cpuQuota.get()); |
| 274 | + double totalCPU = tmCpu.get() * newNumTms; |
| 275 | + if (totalCPU > cpuQuota.get()) { |
| 276 | + LOG.info("CPU resource quota reached with value: {}", totalCPU); |
| 277 | + return true; |
| 278 | + } |
| 279 | + } |
| 280 | + |
| 281 | + if (memoryQuota.isPresent() && tmMemory.isPresent()) { |
| 282 | + LOG.debug("Memory resource quota is {}, checking limits", memoryQuota.get()); |
| 283 | + long totalMemory = tmMemory.get().getBytes() * newNumTms; |
| 284 | + if (totalMemory > memoryQuota.get().getBytes()) { |
| 285 | + LOG.info( |
| 286 | + "Memory resource quota reached with value: {}", |
| 287 | + new MemorySize(totalMemory)); |
| 288 | + return true; |
| 289 | + } |
| 290 | + } |
| 291 | + } |
| 292 | + |
| 293 | + return false; |
| 294 | + } |
| 295 | + |
202 | 296 | @VisibleForTesting
|
203 | 297 | Map<JobVertexID, ScalingSummary> computeScalingSummary(
|
204 | 298 | Context context,
|
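To make the arithmetic in `resourceQuotaReached` concrete: each slot sharing group needs as many slots as its maximum parallelism, the per-group maxima are summed into a total slot count, converted into a TaskManager count via the slots-per-TM setting, and that TM count is multiplied by the per-TM CPU/memory and compared against the quotas. A self-contained sketch with assumed numbers (every input is hypothetical, not taken from the PR):

```java
import org.apache.flink.configuration.MemorySize;

// Standalone illustration of the quota math; all inputs are assumed values.
public class QuotaMathSketch {
    public static void main(String[] args) {
        int[] newMaxParallelisms = {4, 8, 8}; // max parallelism per slot sharing group
        int numSlotsPerTm = 4;                // taskmanager.numberOfTaskSlots
        double tmCpu = 2.0;                   // CPU cores per TaskManager
        MemorySize tmMemory = MemorySize.parse("4 gb");

        // Each slot sharing group occupies as many slots as its max parallelism.
        int newTotalSlots = 0;
        for (int p : newMaxParallelisms) {
            newTotalSlots += p; // 4 + 8 + 8 = 20
        }
        // Round up: a partially filled TaskManager still has to be provisioned.
        int newNumTms = (newTotalSlots + numSlotsPerTm - 1) / numSlotsPerTm; // 5

        double totalCpu = tmCpu * newNumTms;                      // 10.0 cores
        long totalMemoryBytes = tmMemory.getBytes() * newNumTms;  // 20 GiB

        double cpuQuota = 8.0; // assumed quota; exceeded in this example
        System.out.println("totalCpu=" + totalCpu + ", reached=" + (totalCpu > cpuQuota));
        System.out.println("totalMemory=" + new MemorySize(totalMemoryBytes));
    }
}
```

With these numbers the CPU check trips (10 cores against a quota of 8), so the method would return `true` and the proposed rescale would be blocked.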