Skip to content

Commit b54c21f

Browse files
committed
update latency test case to include some metrics (to check what we specific memory we need)
Signed-off-by: see-quick <maros.orsak159@gmail.com>
1 parent e7f11bf commit b54c21f

3 files changed

Lines changed: 39 additions & 16 deletions

File tree

development-docs/systemtests/io.strimzi.systemtest.performance.UserOperatorScalabilityPerformance.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222

2323
| Step | Action | Result |
2424
| - | - | - |
25-
| 1. | Deploy Kafka cluster with User Operator configured with more resources to handle load and also non-default `STRIMZI_WORK_QUEUE_SIZE` set to 2048. | Kafka cluster with User Operator is deployed and ready. |
26-
| 2. | For each configured load level (1000, 1500, 2000 existing users), create N KafkaUsers to establish the load. | N KafkaUsers are created and ready, establishing baseline load on the User Operator. |
25+
| 1. | Deploy Kafka cluster with User Operator configured with more resources to handle load and also non-default `STRIMZI_WORK_QUEUE_SIZE` set to 4096. | Kafka cluster with User Operator is deployed and ready. |
26+
| 2. | For each configured load level (1000, 2000, 3000 existing users), create N KafkaUsers to establish the load. | N KafkaUsers are created and ready, establishing baseline load on the User Operator. |
2727
| 3. | Perform 100 individual user modifications sequentially, measuring the latency of each modification. | Each modification latency is recorded independently. |
2828
| 4. | Calculate latency statistics: min, max, average, P50, P95, and P99 percentiles from the 100 measurements. | Statistical analysis shows how single-user modification latency degrades as system load (number of existing users) increases. |
2929
| 5. | Clean up all users and persist latency metrics to user-operator report directory. | Namespace is cleaned, latency data is saved showing how responsiveness changes at different load levels. |

systemtest/src/main/java/io/strimzi/systemtest/performance/gather/schedulers/BaseMetricsCollectionScheduler.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,14 @@ public void executeMetricsCollection() {
109109
* @param unit the time unit of the initial delay and interval.
110110
*/
111111
public void startCollecting(long initialDelay, long interval, TimeUnit unit) {
112+
// Recreate scheduler if it has been shut down
113+
if (this.scheduler.isShutdown()) {
114+
LOGGER.debug("Scheduler was shut down, creating a new one.");
115+
this.scheduler = Executors.newSingleThreadScheduledExecutor();
116+
// Clear metrics from previous collection cycle when restarting
117+
this.metricsStore.clear();
118+
}
119+
112120
// Capture the context in the thread where startCollecting is called
113121
final ExtensionContext currentContext = KubeResourceManager.get().getTestContext();
114122

systemtest/src/test/java/io/strimzi/systemtest/performance/UserOperatorScalabilityPerformance.java

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,12 @@
1313
import io.strimzi.api.kafka.model.user.KafkaUser;
1414
import io.strimzi.systemtest.AbstractST;
1515
import io.strimzi.systemtest.Environment;
16+
import io.strimzi.systemtest.TestConstants;
1617
import io.strimzi.systemtest.annotations.IsolatedTest;
1718
import io.strimzi.systemtest.docs.TestDocsLabels;
19+
import io.strimzi.systemtest.metrics.UserOperatorMetricsComponent;
20+
import io.strimzi.systemtest.performance.gather.collectors.UserOperatorMetricsCollector;
21+
import io.strimzi.systemtest.performance.gather.schedulers.UserOperatorMetricsCollectionScheduler;
1822
import io.strimzi.systemtest.performance.report.UserOperatorPerformanceReporter;
1923
import io.strimzi.systemtest.performance.report.parser.UserOperatorMetricsParser;
2024
import io.strimzi.systemtest.performance.utils.UserOperatorPerformanceUtils;
@@ -24,6 +28,7 @@
2428
import io.strimzi.systemtest.storage.TestStorage;
2529
import io.strimzi.systemtest.templates.crd.KafkaNodePoolTemplates;
2630
import io.strimzi.systemtest.templates.crd.KafkaTemplates;
31+
import io.strimzi.systemtest.templates.specific.ScraperTemplates;
2732
import io.strimzi.systemtest.utils.kafkaUtils.KafkaUserUtils;
2833
import org.apache.logging.log4j.LogManager;
2934
import org.apache.logging.log4j.Logger;
@@ -151,8 +156,8 @@ void testScalability() {
151156
@TestDoc(
152157
description = @Desc("This test measures user modification latency statistics under different load levels by performing multiple user modifications to understand how response time scales with system load."),
153158
steps = {
154-
@Step(value = "Deploy Kafka cluster with User Operator configured with more resources to handle load and also non-default `STRIMZI_WORK_QUEUE_SIZE` set to 2048.", expected = "Kafka cluster with User Operator is deployed and ready."),
155-
@Step(value = "For each configured load level (1000, 1500, 2000 existing users), create N KafkaUsers to establish the load.", expected = "N KafkaUsers are created and ready, establishing baseline load on the User Operator."),
159+
@Step(value = "Deploy Kafka cluster with User Operator configured with more resources to handle load and also non-default `STRIMZI_WORK_QUEUE_SIZE` set to 4096.", expected = "Kafka cluster with User Operator is deployed and ready."),
160+
@Step(value = "For each configured load level (1000, 2000, 3000 existing users), create N KafkaUsers to establish the load.", expected = "N KafkaUsers are created and ready, establishing baseline load on the User Operator."),
156161
@Step(value = "Perform 100 individual user modifications sequentially, measuring the latency of each modification.", expected = "Each modification latency is recorded independently."),
157162
@Step(value = "Calculate latency statistics: min, max, average, P50, P95, and P99 percentiles from the 100 measurements.", expected = "Statistical analysis shows how single-user modification latency degrades as system load (number of existing users) increases."),
158163
@Step(value = "Clean up all users and persist latency metrics to user-operator report directory.", expected = "Namespace is cleaned, latency data is saved showing how responsiveness changes at different load levels.")
@@ -165,28 +170,22 @@ void testScalability() {
165170
@Tag(SCALABILITY)
166171
void testLatencyUnderLoad() {
167172
final TestStorage testStorage = new TestStorage(KubeResourceManager.get().getTestContext());
168-
final List<Integer> loadLevels = List.of(1000, 1500, 2000);
173+
final List<Integer> loadLevels = List.of(1000, 2000, 3000);
169174
final int numberOfModifications = 100;
170175
// default configuration of UO
171176
final int maxBatchSize = 100;
172177
final int maxBatchLingerMs = 100;
173178
// but maxWorkQueueSize must be a bit higher than default because we Queue will be `FULL`
174-
final int maxWorkQueueSize = 2048;
179+
final int maxWorkQueueSize = 4096;
175180

176181
KubeResourceManager.get().createResourceWithWait(
177-
KafkaNodePoolTemplates.brokerPoolPersistentStorage(testStorage.getNamespaceName(), testStorage.getBrokerPoolName(), testStorage.getClusterName(), 3)
178-
.editSpec()
179-
.withNewPersistentClaimStorage()
180-
.withSize("5Gi")
181-
.withDeleteClaim(true)
182-
.endPersistentClaimStorage()
183-
.endSpec()
184-
.build(),
182+
KafkaNodePoolTemplates.brokerPoolPersistentStorage(testStorage.getNamespaceName(), testStorage.getBrokerPoolName(), testStorage.getClusterName(), 3).build(),
185183
KafkaNodePoolTemplates.controllerPoolPersistentStorage(testStorage.getNamespaceName(), testStorage.getControllerPoolName(), testStorage.getClusterName(), 3).build()
186184
);
187185

188186
KubeResourceManager.get().createResourceWithWait(
189-
KafkaTemplates.kafka(testStorage.getNamespaceName(), testStorage.getClusterName(), 3)
187+
KafkaTemplates.kafkaMetricsConfigMap(testStorage.getNamespaceName(), testStorage.getClusterName()),
188+
KafkaTemplates.kafkaWithMetrics(testStorage.getNamespaceName(), testStorage.getClusterName(), 3)
190189
.editSpec()
191190
.editKafka()
192191
.withNewKafkaAuthorizationSimple()
@@ -214,11 +213,23 @@ void testLatencyUnderLoad() {
214213
.endTemplate()
215214
.endEntityOperator()
216215
.endSpec()
217-
.build()
216+
.build(),
217+
ScraperTemplates.scraperPod(testStorage.getNamespaceName(), testStorage.getScraperName()).build()
218218
);
219219

220+
testStorage.addToTestStorage(TestConstants.SCRAPER_POD_KEY,
221+
KubeResourceManager.get().kubeClient().listPodsByPrefixInName(testStorage.getNamespaceName(), testStorage.getScraperName()).get(0).getMetadata().getName());
222+
220223
loadLevels.forEach(numberOfExistingUsers -> {
221224
LatencyMetrics latencyMetrics = null;
225+
final UserOperatorMetricsCollector userOperatorCollector = new UserOperatorMetricsCollector.Builder()
226+
.withScraperPodName(testStorage.getScraperPodName())
227+
.withNamespaceName(testStorage.getNamespaceName())
228+
.withComponent(UserOperatorMetricsComponent.create(testStorage.getNamespaceName(), testStorage.getClusterName()))
229+
.build();
230+
231+
final UserOperatorMetricsCollectionScheduler userOperatorMetricsGatherer = UserOperatorMetricsCollectionScheduler.getInstance(userOperatorCollector, "strimzi.io/cluster=" + testStorage.getClusterName());
232+
userOperatorMetricsGatherer.startCollecting();
222233
try {
223234
LOGGER.info("Measuring single-user modification latency with {} existing users in the system", numberOfExistingUsers);
224235
latencyMetrics = UserOperatorPerformanceUtils.measureLatencyUnderLoad(testStorage, numberOfExistingUsers, numberOfModifications);
@@ -245,6 +256,10 @@ void testLatencyUnderLoad() {
245256
performanceAttributes.put(PerformanceConstants.OPERATOR_OUT_P95_LATENCY, latencyMetrics.p95());
246257
performanceAttributes.put(PerformanceConstants.OPERATOR_OUT_P99_LATENCY, latencyMetrics.p99());
247258

259+
userOperatorMetricsGatherer.stopCollecting();
260+
261+
performanceAttributes.put(PerformanceConstants.METRICS_HISTORY, userOperatorMetricsGatherer.getMetricsStore()); // Map of metrics history
262+
248263
try {
249264
this.userOperatorPerformanceReporter.logPerformanceData(testStorage, performanceAttributes, REPORT_DIRECTORY + "/" + PerformanceConstants.GENERAL_LATENCY_USE_CASE, TimeHolder.getActualTime(), Environment.PERFORMANCE_DIR);
250265
} catch (IOException e) {

0 commit comments

Comments
 (0)