Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 28 additions & 16 deletions src/integrations/prefect-kubernetes/prefect_kubernetes/observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

events_client: EventsClient | None = None
orchestration_client: PrefectClient | None = None
_startup_event_semaphore: asyncio.Semaphore | None = None


@kopf.on.startup()
Expand All @@ -56,8 +57,12 @@ async def initialize_clients(logger: kopf.Logger, **kwargs: Any):
logger.info("Initializing clients")
global events_client
global orchestration_client
global _startup_event_semaphore
orchestration_client = await get_client().__aenter__()
events_client = await get_events_client().__aenter__()
_startup_event_semaphore = asyncio.Semaphore(
settings.observer.startup_event_concurrency
)
logger.info("Clients successfully initialized")


Expand Down Expand Up @@ -124,26 +129,33 @@ async def _replicate_pod_event( # pyright: ignore[reportUnusedFunction]
if event_type is None:
if orchestration_client is None:
raise RuntimeError("Orchestration client not initialized")

# Use the Kubernetes event timestamp for the filter to avoid "Query time range is too large" error
event_filter = EventFilter(
event=EventNameFilter(name=[f"prefect.kubernetes.pod.{phase.lower()}"]),
resource=EventResourceFilter(
id=[f"prefect.kubernetes.pod.{uid}"],
),
occurred=EventOccurredFilter(
since=(
k8s_created_time
if k8s_created_time
else (datetime.now(timezone.utc) - timedelta(hours=1))
)
),
)
if _startup_event_semaphore is None:
raise RuntimeError("Startup event semaphore not initialized")

# Use semaphore to limit concurrent API calls during startup to prevent
# overwhelming the API server when there are many existing pods/jobs
async with _startup_event_semaphore:
# Use the Kubernetes event timestamp for the filter to avoid "Query time range is too large" error
event_filter = EventFilter(
event=EventNameFilter(name=[f"prefect.kubernetes.pod.{phase.lower()}"]),
resource=EventResourceFilter(
id=[f"prefect.kubernetes.pod.{uid}"],
),
occurred=EventOccurredFilter(
since=(
k8s_created_time
if k8s_created_time
else (datetime.now(timezone.utc) - timedelta(hours=1))
)
),
)

response = await orchestration_client.request(
"POST",
"/events/filter",
json=dict(filter=event_filter.model_dump(exclude_unset=True, mode="json")),
json=dict(
Comment on lines 153 to +156
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P2] Keep the semaphore held during the API request

The startup semaphore is acquired only while building `event_filter`, but the actual `orchestration_client.request(...)` call happens after the `async with` block has exited (see the request at lines 153-156). As a result, concurrent startup events still send API calls without any throttling, so the intended protection against overloading the API when many existing pods/jobs are present does not actually take effect. Consider keeping the request inside the semaphore scope.

Useful? React with 👍 / 👎.

filter=event_filter.model_dump(exclude_unset=True, mode="json")
),
)
# If the event already exists, we don't need to emit a new one.
if response.json()["events"]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ class KubernetesObserverSettings(PrefectBaseSettings):
"should be provided in the format `key=value`.",
)

startup_event_concurrency: int = Field(
default=5,
description="Maximum number of concurrent API calls when checking for "
"duplicate events during observer startup. This helps prevent overloading "
"the API server when there are many existing pods/jobs in the cluster.",
)


class KubernetesWorkerSettings(PrefectBaseSettings):
model_config = build_settings_config(("integrations", "kubernetes", "worker"))
Expand Down
Loading