|
4 | 4 | import asyncio |
5 | 5 | import datetime |
6 | 6 | import enum |
| 7 | +import logging |
7 | 8 | from concurrent.futures import ThreadPoolExecutor |
8 | 9 | from functools import reduce |
9 | 10 | from typing import Any, Optional, TypedDict |
|
18 | 19 | from robusta_krr.core.models.config import settings |
19 | 20 | from robusta_krr.core.models.objects import K8sObjectData |
20 | 21 |
|
| 22 | +logger = logging.getLogger("krr") |
| 23 | + |
| 24 | +# Maximum number of data points to request from Prometheus |
| 25 | +# Using 10,000 as a safety margin below the typical 11,000 hard limit |
| 26 | +MAX_PROMETHEUS_POINTS = 10_000 |
| 27 | + |
21 | 28 |
|
22 | 29 | class PrometheusSeries(TypedDict): |
23 | 30 | metric: dict[str, Any] |
@@ -117,6 +124,42 @@ def _step_to_string(self, step: datetime.timedelta) -> str: |
117 | 124 | return f"{int(step.total_seconds()) // (60 * 60 * 24)}d" |
118 | 125 | return f"{int(step.total_seconds()) // 60}m" |
119 | 126 |
|
| 127 | + def _calculate_safe_step(self, period: datetime.timedelta, step: datetime.timedelta) -> datetime.timedelta: |
| 128 | + """ |
| 129 | + Calculate a step size that won't exceed Prometheus's maximum resolution limit. |
| 130 | +
|
| 131 | + If the number of data points (period / step) would exceed MAX_PROMETHEUS_POINTS, |
| 132 | + this function returns an increased step size that keeps the point count under the limit. |
| 133 | +
|
| 134 | + Args: |
| 135 | + period: The time period for the query. |
| 136 | + step: The originally requested step size. |
| 137 | +
|
| 138 | + Returns: |
| 139 | + A step size that keeps the number of data points under MAX_PROMETHEUS_POINTS. |
| 140 | + """ |
| 141 | + period_seconds = period.total_seconds() |
| 142 | + step_seconds = step.total_seconds() |
| 143 | + |
| 144 | + # Calculate expected number of points |
| 145 | + expected_points = period_seconds / step_seconds |
| 146 | + |
| 147 | + if expected_points <= MAX_PROMETHEUS_POINTS: |
| 148 | + return step |
| 149 | + |
| 150 | + # Calculate the minimum step size needed to stay under the limit |
| 151 | + min_step_seconds = period_seconds / MAX_PROMETHEUS_POINTS |
| 152 | + |
| 153 | + # Round up to the nearest second to ensure we're under the limit |
| 154 | + adjusted_step_seconds = int(min_step_seconds) + 1 |
| 155 | + |
| 156 | + logger.debug( |
| 157 | + f"Adjusting step from {step_seconds}s to {adjusted_step_seconds}s to avoid exceeding " |
| 158 | + f"Prometheus max resolution ({expected_points:.0f} points -> {period_seconds / adjusted_step_seconds:.0f} points)" |
| 159 | + ) |
| 160 | + |
| 161 | + return datetime.timedelta(seconds=adjusted_step_seconds) |
| 162 | + |
120 | 163 | @retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5)) |
121 | 164 | def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusSeries]: |
122 | 165 | if data.type == QueryType.QueryRange: |
@@ -168,6 +211,10 @@ async def load_data( |
168 | 211 | ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics. |
169 | 212 | """ |
170 | 213 |
|
| 214 | + # For range queries, adjust step size if needed to avoid exceeding Prometheus limits |
| 215 | + if self.query_type == QueryType.QueryRange: |
| 216 | + step = self._calculate_safe_step(period, step) |
| 217 | + |
171 | 218 | step_str = f"{round(step.total_seconds())}s" |
172 | 219 | duration_str = self._step_to_string(period) |
173 | 220 |
|
|
0 commit comments