Skip to content

Commit 4428036

Browse files
committed
fix: auto-adjust step size to prevent exceeding Prometheus max resolution
When querying Prometheus with long history durations and small step sizes, the number of data points can exceed Prometheus's maximum resolution of 11,000 points per timeseries. This fix automatically increases the step size when the calculated number of points would exceed 10,000 (using 10,000 as a safety margin below the 11,000 hard limit). Fixes #490
1 parent 3acc116 commit 4428036

File tree

1 file changed

+47
-0
lines changed
  • robusta_krr/core/integrations/prometheus/metrics

1 file changed

+47
-0
lines changed

robusta_krr/core/integrations/prometheus/metrics/base.py

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import asyncio
55
import datetime
66
import enum
7+
import logging
78
from concurrent.futures import ThreadPoolExecutor
89
from functools import reduce
910
from typing import Any, Optional, TypedDict
@@ -18,6 +19,12 @@
1819
from robusta_krr.core.models.config import settings
1920
from robusta_krr.core.models.objects import K8sObjectData
2021

22+
logger = logging.getLogger("krr")
23+
24+
# Maximum number of data points to request from Prometheus
25+
# Using 10,000 as a safety margin below the typical 11,000 hard limit
26+
MAX_PROMETHEUS_POINTS = 10_000
27+
2128

2229
class PrometheusSeries(TypedDict):
2330
metric: dict[str, Any]
@@ -117,6 +124,42 @@ def _step_to_string(self, step: datetime.timedelta) -> str:
117124
return f"{int(step.total_seconds()) // (60 * 60 * 24)}d"
118125
return f"{int(step.total_seconds()) // 60}m"
119126

127+
def _calculate_safe_step(self, period: datetime.timedelta, step: datetime.timedelta) -> datetime.timedelta:
    """
    Return a step size that keeps a range query within MAX_PROMETHEUS_POINTS data points.

    The expected point count is estimated as ``period / step``. When that estimate is
    within the limit, ``step`` is returned untouched. Otherwise the smallest whole-second
    step for which ``period / step <= MAX_PROMETHEUS_POINTS`` is returned.

    Args:
        period: The time period for the query.
        step: The originally requested step size.

    Returns:
        ``step`` itself when no adjustment is needed (or when it is not positive),
        otherwise a larger step expressed in whole seconds.
    """
    period_seconds = period.total_seconds()
    step_seconds = step.total_seconds()

    # A zero or negative step has no meaningful point count, and a zero step would
    # raise ZeroDivisionError below. Pass it through unchanged; validating the step
    # is left to the caller / Prometheus.
    if step_seconds <= 0:
        return step

    # Estimated number of points the query would return.
    expected_points = period_seconds / step_seconds
    if expected_points <= MAX_PROMETHEUS_POINTS:
        return step

    # Smallest (possibly fractional) step that stays within the limit.
    min_step_seconds = period_seconds / MAX_PROMETHEUS_POINTS

    # Take a true ceiling: only bump to the next second when there is a fractional
    # part, so an exact multiple is not pushed one second further than necessary.
    adjusted_step_seconds = int(min_step_seconds)
    if adjusted_step_seconds < min_step_seconds:
        adjusted_step_seconds += 1

    # Lazy %-formatting: the message is only rendered when debug logging is enabled.
    logger.debug(
        "Adjusting step from %ss to %ss to avoid exceeding "
        "Prometheus max resolution (%.0f points -> %.0f points)",
        step_seconds,
        adjusted_step_seconds,
        expected_points,
        period_seconds / adjusted_step_seconds,
    )

    return datetime.timedelta(seconds=adjusted_step_seconds)
162+
120163
@retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5))
121164
def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusSeries]:
122165
if data.type == QueryType.QueryRange:
@@ -168,6 +211,10 @@ async def load_data(
168211
ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics.
169212
"""
170213

214+
# For range queries, adjust step size if needed to avoid exceeding Prometheus limits
215+
if self.query_type == QueryType.QueryRange:
216+
step = self._calculate_safe_step(period, step)
217+
171218
step_str = f"{round(step.total_seconds())}s"
172219
duration_str = self._step_to_string(period)
173220

0 commit comments

Comments
 (0)