@@ -20,8 +20,10 @@
20 | 20 | import org.apache.flink.annotation.VisibleForTesting;
21 | 21 | import org.apache.flink.autoscaler.config.AutoScalerOptions;
22 | 22 | import org.apache.flink.autoscaler.event.AutoScalerEventHandler;
| 23 | +import org.apache.flink.autoscaler.metrics.EvaluatedMetrics; |
23 | 24 | import org.apache.flink.autoscaler.metrics.EvaluatedScalingMetric;
24 | 25 | import org.apache.flink.autoscaler.metrics.ScalingMetric;
| 26 | +import org.apache.flink.autoscaler.topology.JobTopology; |
25 | 27 | import org.apache.flink.autoscaler.topology.ShipStrategy;
26 | 28 | import org.apache.flink.autoscaler.utils.AutoScalerUtils;
27 | 29 | import org.apache.flink.configuration.Configuration;
@@ -36,9 +38,11 @@
36 | 38 | import java.time.Instant;
37 | 39 | import java.time.ZoneId;
38 | 40 | import java.util.Collection;
| 41 | +import java.util.List; |
39 | 42 | import java.util.Map;
40 | 43 | import java.util.SortedMap;
41 | 44 |
| 45 | +import static org.apache.flink.autoscaler.config.AutoScalerOptions.BOTTLENECK_PROPAGATION_SCALE_DOWN_ENABLED; |
42 | 46 | import static org.apache.flink.autoscaler.config.AutoScalerOptions.MAX_SCALE_DOWN_FACTOR;
43 | 47 | import static org.apache.flink.autoscaler.config.AutoScalerOptions.MAX_SCALE_UP_FACTOR;
44 | 48 | import static org.apache.flink.autoscaler.config.AutoScalerOptions.SCALE_UP_GRACE_PERIOD;
@@ -49,6 +53,7 @@
49 | 53 | import static org.apache.flink.autoscaler.metrics.ScalingMetric.EXPECTED_PROCESSING_RATE;
50 | 54 | import static org.apache.flink.autoscaler.metrics.ScalingMetric.MAX_PARALLELISM;
51 | 55 | import static org.apache.flink.autoscaler.metrics.ScalingMetric.PARALLELISM;
| 56 | +import static org.apache.flink.autoscaler.metrics.ScalingMetric.TARGET_DATA_RATE; |
52 | 57 | import static org.apache.flink.autoscaler.metrics.ScalingMetric.TRUE_PROCESSING_RATE;
53 | 58 | import static org.apache.flink.autoscaler.topology.ShipStrategy.HASH;
54 | 59 |
@@ -150,6 +155,94 @@ public int computeScaleTargetParallelism(
150 | 155 | return newParallelism;
151 | 156 | }
152 | 157 |
| 158 | + public boolean propagateBackpropScaleFactor( |
| 159 | + Configuration conf, |
| 160 | + JobVertexID vertex, |
| 161 | + JobTopology topology, |
| 162 | + EvaluatedMetrics evaluatedMetrics, |
| 163 | + Map<JobVertexID, Double> backpropScaleFactors, |
| 164 | + List<String> excludedVertices) { |
| 165 | + |
| 166 | + double averageTrueProcessingRate = |
| 167 | + evaluatedMetrics |
| 168 | + .getVertexMetrics() |
| 169 | + .get(vertex) |
| 170 | + .get(TRUE_PROCESSING_RATE) |
| 171 | + .getAverage(); |
| 172 | + if (Double.isNaN(averageTrueProcessingRate)) { |
| 173 | + return false; |
| 174 | + } |
| 175 | + |
| 176 | + double minScaleFactor = 1 - conf.get(MAX_SCALE_DOWN_FACTOR); |
| 177 | + double maxScaleFactor = 1 + conf.get(MAX_SCALE_UP_FACTOR); |
| 178 | + |
| 179 | + double processingRateCapacity = |
| 180 | + evaluatedMetrics.getVertexMetrics().get(vertex).get(TARGET_DATA_RATE).getAverage(); |
| 181 | + |
| 182 | + if (Double.isNaN(processingRateCapacity)) { |
| 183 | + return false; |
| 184 | + } |
| 185 | + |
| 186 | +        // if scale-down propagation is disabled, the backpropagated factor may not |
| 187 | +        // drop below the vertex's own (default) scale factor |
| 188 | +        if (!conf.get(BOTTLENECK_PROPAGATION_SCALE_DOWN_ENABLED)) { |
| 189 | + double scaleFactor = processingRateCapacity / averageTrueProcessingRate; |
| 190 | + scaleFactor = Math.max(scaleFactor, minScaleFactor); |
| 191 | + minScaleFactor = Math.min(1.0, scaleFactor); |
| 192 | + } |
| 193 | + |
| 194 | +        // apply the scale factor already backpropagated to this vertex from downstream |
| 195 | + double currentBackPropFactor = backpropScaleFactors.getOrDefault(vertex, 1.0); |
| 196 | + processingRateCapacity *= currentBackPropFactor; |
| 197 | + |
| 198 | + double targetScaleFactor = processingRateCapacity / averageTrueProcessingRate; |
| 199 | + |
| 200 | + if (excludedVertices.contains(vertex.toHexString())) { |
| 201 | + LOG.debug( |
| 202 | + "Vertex {} is excluded from scaling. Target scale factor is 1.0", |
| 203 | + vertex.toHexString()); |
| 204 | + targetScaleFactor = 1.0; |
| 205 | + } |
| 206 | + |
| 207 | + if (targetScaleFactor < minScaleFactor) { |
| 208 | + LOG.debug( |
| 209 | + "Computed scale factor of {} for {} is capped by maximum scale down factor to {}", |
| 210 | + targetScaleFactor, |
| 211 | + vertex, |
| 212 | + minScaleFactor); |
| 213 | + targetScaleFactor = minScaleFactor; |
| 214 | + } |
| 215 | + if (maxScaleFactor < targetScaleFactor) { |
| 216 | + LOG.debug( |
| 217 | + "Computed scale factor of {} for {} is capped by maximum scale up factor to {}", |
| 218 | + targetScaleFactor, |
| 219 | + vertex, |
| 220 | + maxScaleFactor); |
| 221 | + targetScaleFactor = maxScaleFactor; |
| 222 | + } |
| 223 | + |
| 224 | +        // the maximum rate this vertex can process under the capped target scale factor |
| 225 | + double limitedProcessingCapacity = targetScaleFactor * averageTrueProcessingRate; |
| 226 | + |
| 227 | + double adjustedProcessingRateCapacity = |
| 228 | + AutoScalerUtils.getInPlaceTargetProcessingCapacity( |
| 229 | + evaluatedMetrics, topology, vertex, backpropScaleFactors); |
| 230 | + if (Double.isNaN(adjustedProcessingRateCapacity)) { |
| 231 | + return false; |
| 232 | + } |
| 233 | + |
| 234 | +        // if the rate arriving from upstream exceeds what this vertex can process, |
| 235 | +        // backpropagate the reduction factor to its inputs |
| 236 | + if (limitedProcessingCapacity < adjustedProcessingRateCapacity) { |
| 237 | + double adjustFactor = limitedProcessingCapacity / adjustedProcessingRateCapacity; |
| 238 | + for (var input : topology.getVertexInfos().get(vertex).getInputs().keySet()) { |
| 239 | + double factor = backpropScaleFactors.getOrDefault(input, 1.0); |
| 240 | + backpropScaleFactors.put(input, factor * adjustFactor); |
| 241 | + } |
| 242 | + } |
| 243 | + return true; |
| 244 | + } |
| 245 | + |
153 | 246 | private boolean blockScalingBasedOnPastActions(
154 | 247 | Context context,
155 | 248 | JobVertexID vertex,
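For context on how this fits together: propagateBackpropScaleFactor handles a single vertex and returns false when its rate metrics are NaN, so a caller has to thread backpropScaleFactors through the graph from sinks to sources. A minimal sketch of such a driver follows; the method name computeBackpropScaleFactors and the loop structure are illustrative assumptions rather than part of this change, and it assumes JobTopology#getVerticesInTopologicalOrder() lists sources first.

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical driver method, sketched against the types used in the diff above.
Map<JobVertexID, Double> computeBackpropScaleFactors(
        Configuration conf,
        JobTopology topology,
        EvaluatedMetrics evaluatedMetrics,
        List<String> excludedVertices) {
    Map<JobVertexID, Double> backpropScaleFactors = new HashMap<>();
    // Visit sinks before their inputs: a vertex must see the factors its
    // downstream consumers recorded before it propagates them further upstream.
    List<JobVertexID> reverseOrder =
            new ArrayList<>(topology.getVerticesInTopologicalOrder());
    Collections.reverse(reverseOrder);
    for (JobVertexID vertex : reverseOrder) {
        // A false return means the vertex had no usable rate metrics (NaN);
        // propagation simply stops at that vertex.
        propagateBackpropScaleFactor(
                conf, vertex, topology, evaluatedMetrics,
                backpropScaleFactors, excludedVertices);
    }
    return backpropScaleFactors;
}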
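To make the clamping arithmetic concrete: suppose a vertex's TRUE_PROCESSING_RATE averages 1000 rec/s, its TARGET_DATA_RATE is 2000 rec/s, no factor has been backpropagated to it yet (1.0), and MAX_SCALE_DOWN_FACTOR / MAX_SCALE_UP_FACTOR are 0.6 / 0.5. Then targetScaleFactor = 2000 / 1000 = 2.0, which the upper bound clips to maxScaleFactor = 1.5, so limitedProcessingCapacity = 1.5 * 1000 = 1500 rec/s. If getInPlaceTargetProcessingCapacity reports that upstream would deliver 2000 rec/s, the adjust factor 1500 / 2000 = 0.75 is multiplied into each input's entry in backpropScaleFactors, shrinking the upstream targets until this bottleneck can keep up.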