diff --git a/quixstreams/__init__.py b/quixstreams/__init__.py
index 6149e2d70..2e1af11f7 100644
--- a/quixstreams/__init__.py
+++ b/quixstreams/__init__.py
@@ -5,4 +5,4 @@
__all__ = ["Application", "message_context", "MessageContext", "State"]
-__version__ = "3.23.2"
+__version__ = "3.23.3"
diff --git a/quixstreams/app.py b/quixstreams/app.py
index 30d71ab4e..d551bf9ce 100644
--- a/quixstreams/app.py
+++ b/quixstreams/app.py
@@ -152,6 +152,7 @@ def __init__(
topic_create_timeout: float = 60,
processing_guarantee: ProcessingGuarantee = "at-least-once",
max_partition_buffer_size: int = 10000,
+ broker_availability_timeout: float = 120.0,
):
"""
:param broker_address: Connection settings for Kafka.
@@ -220,6 +221,12 @@ def __init__(
It is a soft limit, and the actual number of buffered messages can be up to x2 higher.
Lower value decreases the memory use, but increases the latency.
Default - `10000`.
+ :param broker_availability_timeout: timeout in seconds. If all Kafka brokers
+ are unavailable for longer than this, the Application will raise a
+ ``KafkaBrokerUnavailableError`` to allow the orchestrator to restart
+ the application with fresh connections.
+ Set to ``0`` to disable the check.
+ Default - ``120.0``s (2 minutes).
***Error Handlers***
To handle errors, `Application` accepts callbacks triggered when
@@ -343,6 +350,12 @@ def __init__(
self._on_message_processed = on_message_processed
self._on_processing_error = on_processing_error or default_on_processing_error
+ if broker_availability_timeout < 0:
+ raise ValueError(
+ f"broker_availability_timeout must be >= 0, "
+ f"got {broker_availability_timeout}"
+ )
+ self._broker_availability_timeout = broker_availability_timeout
self._consumer = self._get_internal_consumer(
on_error=on_consumer_error,
@@ -361,6 +374,7 @@ def __init__(
recovery_manager = RecoveryManager(
consumer=self._consumer,
topic_manager=self._topic_manager,
+ broker_availability_timeout=self._broker_availability_timeout,
)
self._state_manager = StateStoreManager(
@@ -761,6 +775,7 @@ def add_source(self, source: BaseSource, topic: Optional[Topic] = None) -> Topic
extra_config_overrides=consumer_extra_config_overrides
),
self._get_topic_manager(),
+ broker_availability_timeout=self._broker_availability_timeout,
)
return topic
@@ -926,6 +941,13 @@ def _run_dataframe(self, sink: Optional[VoidExecutor] = None):
processing_context.commit_checkpoint()
consumer.resume_backpressured()
source_manager.raise_for_error()
+ if self._broker_availability_timeout:
+ self._producer.raise_if_broker_unavailable(
+ self._broker_availability_timeout
+ )
+ self._consumer.raise_if_broker_unavailable(
+ self._broker_availability_timeout
+ )
printer.print()
run_tracker.update_status()
@@ -940,6 +962,13 @@ def _run_sources(self):
source_manager.start_sources()
while run_tracker.running and source_manager.is_alive():
source_manager.raise_for_error()
+ if self._broker_availability_timeout:
+ self._producer.raise_if_broker_unavailable(
+ self._broker_availability_timeout
+ )
+ self._consumer.raise_if_broker_unavailable(
+ self._broker_availability_timeout
+ )
run_tracker.update_status()
time.sleep(1)
self.stop()
@@ -1002,6 +1031,8 @@ def _process_message(self, dataframe_composed):
topic=topic_name, partition=partition, offset=offset
)
self._run_tracker.set_message_consumed(True)
+ self._producer._broker_available() # noqa: SLF001
+ self._consumer._broker_available() # noqa: SLF001
if self._on_message_processed is not None:
self._on_message_processed(topic_name, partition, offset)
diff --git a/quixstreams/internal_producer.py b/quixstreams/internal_producer.py
index 42a0d461b..69834ab03 100644
--- a/quixstreams/internal_producer.py
+++ b/quixstreams/internal_producer.py
@@ -170,6 +170,14 @@ def poll(self, timeout: float = 0):
return
raise
+ def _broker_available(self):
+ """Reset the broker unavailability tracker on the underlying Producer."""
+ self._producer._broker_available() # noqa: SLF001
+
+ def raise_if_broker_unavailable(self, timeout: float):
+ """Raise if all brokers have been unavailable for longer than ``timeout`` seconds."""
+ self._producer.raise_if_broker_unavailable(timeout)
+
def produce(
self,
topic: str,
@@ -203,6 +211,8 @@ def _on_delivery(self, err: Optional[KafkaError], msg: Message):
topic, partition, offset = msg.topic(), msg.partition(), msg.offset()
if err is None:
self._tp_offsets[(topic, partition)] = offset
+ # Successful delivery confirms broker is reachable
+ self._producer._broker_available() # noqa: SLF001
else:
self._error = err
diff --git a/quixstreams/kafka/consumer.py b/quixstreams/kafka/consumer.py
index 0f69b190f..9eb291b27 100644
--- a/quixstreams/kafka/consumer.py
+++ b/quixstreams/kafka/consumer.py
@@ -1,5 +1,6 @@
import functools
import logging
+import time
import typing
from typing import Any, Callable, List, Optional, Tuple, Union, cast
@@ -20,7 +21,7 @@
)
from .configuration import ConnectionConfig
-from .exceptions import KafkaConsumerException
+from .exceptions import KafkaBrokerUnavailableError, KafkaConsumerException
__all__ = (
"BaseConsumer",
@@ -124,6 +125,13 @@ def __init__(
if isinstance(broker_address, str):
broker_address = ConnectionConfig(bootstrap_servers=broker_address)
+ self._broker_unavailable_since: Optional[float] = None
+
+ # Wrap the user-provided (or default) error callback so that broker
+ # availability tracking always runs, regardless of custom callbacks.
+ self._user_error_cb = error_callback
+ error_callback = self._error_cb
+
self._consumer_config = {
# previous Quix Streams defaults
"enable.auto.offset.store": False,
@@ -143,6 +151,53 @@ def __init__(
}
self._inner_consumer: Optional[ConfluentConsumer] = None
+ def _error_cb(self, error: KafkaError):
+ """Instance-level error callback that tracks broker availability
+ and delegates to the user-provided (or default) error callback."""
+ error_code = error.code()
+ if error_code == KafkaError._ALL_BROKERS_DOWN: # noqa: SLF001
+ if self._broker_unavailable_since is None:
+ self._broker_unavailable_since = time.monotonic()
+ self._user_error_cb(error)
+
+ def _broker_available(self):
+ """Reset the broker unavailability tracker."""
+ self._broker_unavailable_since = None
+
+ def raise_if_broker_unavailable(self, timeout: float):
+ """Raise if all brokers have been unavailable for longer than ``timeout`` seconds.
+
+ Before raising, performs an active metadata probe to confirm the brokers
+ are truly unreachable (avoids false positives for idle applications).
+
+ :param timeout: seconds of continuous unavailability before raising.
+ :raises KafkaBrokerUnavailableError: if the timeout has been exceeded.
+ """
+ if self._broker_unavailable_since is not None:
+ elapsed = time.monotonic() - self._broker_unavailable_since
+ if elapsed >= timeout:
+ try:
+ self._consumer.list_topics(timeout=5.0)
+ self._broker_unavailable_since = None
+ logger.info(
+ "Consumer broker availability probe succeeded after %.0fs; "
+ "resetting unavailability timer.",
+ elapsed,
+ )
+ return
+ except Exception:
+ logger.debug(
+ "Consumer broker availability probe failed", exc_info=True
+ )
+ raise KafkaBrokerUnavailableError(
+ f"All Kafka brokers have been unavailable for "
+ f"{elapsed:.0f}s (timeout={timeout:.0f}s). "
+ f"The application cannot recover automatically; "
+ f"restarting is required. "
+ f"Adjust via Application(broker_availability_timeout=...) "
+ f"or set to 0 to disable."
+ )
+
def poll(
self, timeout: Optional[float] = None
) -> Optional[RawConfluentKafkaMessageProto]:
diff --git a/quixstreams/kafka/exceptions.py b/quixstreams/kafka/exceptions.py
index 1bffd6bf4..6002c3a3e 100644
--- a/quixstreams/kafka/exceptions.py
+++ b/quixstreams/kafka/exceptions.py
@@ -33,3 +33,6 @@ class KafkaProducerDeliveryError(BaseKafkaException): ...
class InvalidProducerConfigError(QuixException): ...
+
+
+class KafkaBrokerUnavailableError(QuixException): ...
diff --git a/quixstreams/kafka/producer.py b/quixstreams/kafka/producer.py
index 4d9940a36..fb12c76c6 100644
--- a/quixstreams/kafka/producer.py
+++ b/quixstreams/kafka/producer.py
@@ -1,5 +1,6 @@
import functools
import logging
+import time
import uuid
from typing import Callable, List, Optional, Union
@@ -23,16 +24,13 @@
"PRODUCER_POLL_TIMEOUT",
)
-from .exceptions import InvalidProducerConfigError
+from .exceptions import InvalidProducerConfigError, KafkaBrokerUnavailableError
DeliveryCallback = Callable[[Optional[KafkaError], Message], None]
logger = logging.getLogger(__name__)
-IGNORED_KAFKA_ERRORS = (
- # This error seems to be thrown despite brokers being available.
- # Seems linked to `connections.max.idle.ms`.
- KafkaError._ALL_BROKERS_DOWN, # noqa: SLF001
+_SILENTLY_IGNORED_KAFKA_ERRORS = (
# Broker handle destroyed - common/typical behavior, often seen via AdminClient
KafkaError._DESTROY, # noqa: SLF001
)
@@ -43,9 +41,12 @@
def _default_error_cb(error: KafkaError):
error_code = error.code()
- if error_code in IGNORED_KAFKA_ERRORS:
+ if error_code in _SILENTLY_IGNORED_KAFKA_ERRORS:
logger.debug(error.str())
return
+ if error_code == KafkaError._ALL_BROKERS_DOWN: # noqa: SLF001
+ logger.warning(error.str())
+ return
logger.error(f'Kafka producer error: {error.str()} code="{error_code}"')
@@ -96,6 +97,13 @@ def __init__(
if isinstance(broker_address, str):
broker_address = ConnectionConfig(bootstrap_servers=broker_address)
+ self._broker_unavailable_since: Optional[float] = None
+
+ # Wrap the user-provided (or default) error callback so that broker
+ # availability tracking always runs, regardless of custom callbacks.
+ self._user_error_cb = error_callback
+ error_callback = self._error_cb
+
self._producer_config = {
# previous Quix Streams defaults
"partitioner": "murmur2",
@@ -235,6 +243,55 @@ def _producer(self) -> ConfluentProducer:
self._inner_producer.init_transactions()
return self._inner_producer
+ def _error_cb(self, error: KafkaError):
+ """Instance-level error callback that tracks broker availability
+ and delegates to the user-provided (or default) error callback."""
+ error_code = error.code()
+ if error_code == KafkaError._ALL_BROKERS_DOWN: # noqa: SLF001
+ if self._broker_unavailable_since is None:
+ self._broker_unavailable_since = time.monotonic()
+ self._user_error_cb(error)
+
+ def _broker_available(self):
+ """Reset the broker unavailability tracker."""
+ self._broker_unavailable_since = None
+
+ def raise_if_broker_unavailable(self, timeout: float):
+ """Raise if all brokers have been unavailable for longer than ``timeout`` seconds.
+
+ Before raising, performs an active metadata probe to confirm the brokers
+ are truly unreachable (avoids false positives for idle applications).
+
+ :param timeout: seconds of continuous unavailability before raising.
+ :raises KafkaBrokerUnavailableError: if the timeout has been exceeded.
+ """
+ if self._broker_unavailable_since is not None:
+ elapsed = time.monotonic() - self._broker_unavailable_since
+ if elapsed >= timeout:
+ # Active probe: try a lightweight metadata request before raising.
+ # This avoids false positives when brokers recovered but no
+ # messages flowed to reset the timer (idle apps).
+ try:
+ self._producer.list_topics(timeout=5.0)
+ # Probe succeeded — brokers are actually reachable.
+ self._broker_unavailable_since = None
+ logger.info(
+ "Broker availability probe succeeded after %.0fs; "
+ "resetting unavailability timer.",
+ elapsed,
+ )
+ return
+ except Exception:
+ logger.debug("Broker availability probe failed", exc_info=True)
+ raise KafkaBrokerUnavailableError(
+ f"All Kafka brokers have been unavailable for "
+ f"{elapsed:.0f}s (timeout={timeout:.0f}s). "
+ f"The application cannot recover automatically; "
+ f"restarting is required. "
+ f"Adjust via Application(broker_availability_timeout=...) "
+ f"or set to 0 to disable."
+ )
+
def __len__(self):
return len(self._producer)
diff --git a/quixstreams/sources/base/manager.py b/quixstreams/sources/base/manager.py
index 517232077..4da3aca3b 100644
--- a/quixstreams/sources/base/manager.py
+++ b/quixstreams/sources/base/manager.py
@@ -45,6 +45,7 @@ def __init__(
producer: InternalProducer,
consumer: InternalConsumer,
topic_manager: TopicManager,
+ broker_availability_timeout: float = 0,
) -> None:
super().__init__()
self.topic = topic
@@ -58,6 +59,7 @@ def __init__(
self._consumer = consumer
self._producer = producer
+ self._broker_availability_timeout = broker_availability_timeout
# copy parent process log level to the child process
self._loglevel = logging.getLogger(LOGGER_NAME).level
@@ -130,6 +132,7 @@ def _recover_state(self, source: StatefulSource) -> StorePartition:
recovery_manager = RecoveryManager(
consumer=self._consumer,
topic_manager=self._topic_manager,
+ broker_availability_timeout=self._broker_availability_timeout,
)
state_manager = StateStoreManager(
@@ -264,6 +267,7 @@ def register(
producer,
consumer,
topic_manager,
+ broker_availability_timeout: float = 0,
) -> SourceProcess:
"""
Register a new source in the manager.
@@ -281,6 +285,7 @@ def register(
producer=producer,
consumer=consumer,
topic_manager=topic_manager,
+ broker_availability_timeout=broker_availability_timeout,
)
self.processes.append(process)
return process
diff --git a/quixstreams/state/recovery.py b/quixstreams/state/recovery.py
index bf26ddf7c..c07d616a0 100644
--- a/quixstreams/state/recovery.py
+++ b/quixstreams/state/recovery.py
@@ -350,10 +350,16 @@ class RecoveryManager:
# At 10-second progress logging intervals, 60 attempts = ~10 minutes
MAX_INVALID_OFFSET_ATTEMPTS = 60
- def __init__(self, consumer: BaseConsumer, topic_manager: TopicManager):
+ def __init__(
+ self,
+ consumer: BaseConsumer,
+ topic_manager: TopicManager,
+ broker_availability_timeout: float = 0,
+ ):
self._running = False
self._consumer = consumer
self._topic_manager = topic_manager
+ self._broker_availability_timeout = broker_availability_timeout
self._recovery_partitions: Dict[int, Dict[str, RecoveryPartition]] = {}
self._last_progress_logged_time = time.monotonic()
# Cache position results to avoid double calls in same iteration
@@ -608,6 +614,11 @@ def _recovery_loop(self) -> None:
msg = raise_for_msg_error(msg)
rp = self._recovery_partitions[msg.partition()][msg.topic()]
rp.recover_from_changelog_message(changelog_message=msg)
+ self._consumer._broker_available() # noqa: SLF001
+ if self._broker_availability_timeout:
+ self._consumer.raise_if_broker_unavailable(
+ self._broker_availability_timeout
+ )
def _get_position_with_cache(self, rp: RecoveryPartition) -> ConfluentPartition:
"""
diff --git a/tests/test_quixstreams/test_kafka/test_broker_availability.py b/tests/test_quixstreams/test_kafka/test_broker_availability.py
new file mode 100644
index 000000000..ff8eb5c70
--- /dev/null
+++ b/tests/test_quixstreams/test_kafka/test_broker_availability.py
@@ -0,0 +1,572 @@
+import time
+import uuid
+from unittest.mock import patch
+
+import pytest
+from confluent_kafka import KafkaError
+
+from quixstreams.kafka.exceptions import KafkaBrokerUnavailableError
+from quixstreams.kafka.producer import Producer
+
+
+class TestProducerBrokerAvailability:
+ """Tests for Producer detecting prolonged broker unavailability."""
+
+ def _simulate_all_brokers_down(self, producer: Producer):
+ """Trigger the producer's error callback with _ALL_BROKERS_DOWN."""
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ producer._error_cb(error)
+
+ def test_all_brokers_down_sets_timestamp(self):
+ """When _ALL_BROKERS_DOWN fires, the producer should record the timestamp."""
+ producer = Producer(broker_address="localhost:9092")
+ assert producer._broker_unavailable_since is None
+
+ self._simulate_all_brokers_down(producer)
+ assert producer._broker_unavailable_since is not None
+
+ def test_all_brokers_down_keeps_first_timestamp(self):
+ """Repeated _ALL_BROKERS_DOWN should keep the original timestamp, not update it."""
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+ first_ts = producer._broker_unavailable_since
+
+ with patch("time.monotonic", return_value=200.0):
+ self._simulate_all_brokers_down(producer)
+ assert producer._broker_unavailable_since == first_ts
+
+ def test_raise_if_broker_unavailable_raises_after_timeout(self):
+ """Should raise after _ALL_BROKERS_DOWN has persisted beyond the timeout
+ and the active metadata probe also fails."""
+ from unittest.mock import MagicMock
+
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+
+ # Install a fake inner producer so the metadata probe fails
+ fake = MagicMock()
+ fake.list_topics.side_effect = Exception("brokers down")
+ producer._inner_producer = fake
+
+ with patch("time.monotonic", return_value=280.0):
+ with pytest.raises(
+ KafkaBrokerUnavailableError,
+ match="broker_availability_timeout",
+ ):
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+ def test_raise_if_broker_unavailable_no_raise_before_timeout(self):
+ """Should NOT raise if timeout hasn't elapsed yet."""
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+
+ with patch("time.monotonic", return_value=150.0):
+ # 50s elapsed, timeout is 120s — should not raise
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+ def test_raise_if_broker_unavailable_no_raise_when_brokers_ok(self):
+ """Should NOT raise if no _ALL_BROKERS_DOWN was ever reported."""
+ producer = Producer(broker_address="localhost:9092")
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+ def test_broker_available_resets_timestamp(self):
+        """Calling _broker_available() should reset the unavailable timestamp."""
+ producer = Producer(broker_address="localhost:9092")
+
+ self._simulate_all_brokers_down(producer)
+ assert producer._broker_unavailable_since is not None
+
+ producer._broker_available()
+ assert producer._broker_unavailable_since is None
+
+ def test_broker_available_prevents_raise_after_reset(self):
+        """After a reset via _broker_available(), the timeout check should not raise."""
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+
+ # Broker comes back
+ producer._broker_available()
+
+ # Even with enough elapsed time, should not raise because reset happened
+ with patch("time.monotonic", return_value=300.0):
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+ def test_error_cb_still_handles_other_errors(self):
+ """Non-ALL_BROKERS_DOWN errors should still be logged normally,
+ not treated as broker unavailability."""
+ producer = Producer(broker_address="localhost:9092")
+
+ # Simulate a different error (e.g., _TRANSPORT)
+ error = KafkaError(KafkaError._TRANSPORT)
+ producer._error_cb(error)
+
+ # Should not set broker unavailable timestamp
+ assert producer._broker_unavailable_since is None
+
+ def test_error_cb_destroy_does_not_set_unavailable(self):
+ """_DESTROY errors should be ignored like before, not setting unavailable."""
+ producer = Producer(broker_address="localhost:9092")
+
+ error = KafkaError(KafkaError._DESTROY)
+ producer._error_cb(error)
+
+ assert producer._broker_unavailable_since is None
+
+ def test_active_probe_resets_timer_when_brokers_recover(self):
+ """If the metadata probe succeeds, the timer should be reset instead of raising."""
+ from unittest.mock import MagicMock
+
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+
+ # Install a fake inner producer where the probe succeeds
+ fake = MagicMock()
+ fake.list_topics.return_value = MagicMock() # success
+ producer._inner_producer = fake
+
+ with patch("time.monotonic", return_value=280.0):
+ # Should NOT raise because probe succeeds
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+ # Timer should have been reset
+ assert producer._broker_unavailable_since is None
+
+ def test_custom_error_callback_still_tracks_brokers(self):
+ """A custom error_callback should still get broker availability tracking."""
+ calls = []
+
+ def my_error_cb(error):
+ calls.append(error.code())
+
+ producer = Producer(broker_address="localhost:9092", error_callback=my_error_cb)
+
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ producer._error_cb(error)
+
+ # Custom callback was called
+ assert KafkaError._ALL_BROKERS_DOWN in calls
+ # AND tracking was set
+ assert producer._broker_unavailable_since is not None
+
+ def test_error_message_includes_parameter_name(self):
+ """The error message should mention broker_availability_timeout for discoverability."""
+ from unittest.mock import MagicMock
+
+ producer = Producer(broker_address="localhost:9092")
+
+ with patch("time.monotonic", return_value=100.0):
+ self._simulate_all_brokers_down(producer)
+
+ fake = MagicMock()
+ fake.list_topics.side_effect = Exception("down")
+ producer._inner_producer = fake
+
+ with patch("time.monotonic", return_value=280.0):
+ with pytest.raises(
+ KafkaBrokerUnavailableError,
+ match="broker_availability_timeout.*set to 0 to disable",
+ ):
+ producer.raise_if_broker_unavailable(timeout=120.0)
+
+
+class TestAppBrokerAvailability:
+ """Tests for Application raising on prolonged broker unavailability.
+
+ These are unit tests that verify the wiring between Application and
+ Producer's broker availability tracking, without requiring Kafka.
+ """
+
+ def _make_app(self, **kwargs):
+ """Create an Application with mocked Kafka connections."""
+ from quixstreams.app import Application
+
+ defaults = dict(
+ broker_address="localhost:9092",
+ consumer_group="test-group",
+ auto_create_topics=False,
+ use_changelog_topics=False,
+ )
+ defaults.update(kwargs)
+ return Application(**defaults)
+
+ def test_app_raises_when_brokers_unavailable_beyond_timeout(self):
+ """The app should raise KafkaBrokerUnavailableError when brokers have been
+ down for longer than broker_availability_timeout."""
+ from unittest.mock import MagicMock
+ from unittest.mock import patch as mock_patch
+
+ from quixstreams.internal_consumer import InternalConsumer
+ from quixstreams.models.topics.admin import TopicAdmin
+ from quixstreams.models.topics.topic import TopicConfig
+
+ app = self._make_app(broker_availability_timeout=0.1)
+
+ # Simulate _ALL_BROKERS_DOWN in the past
+ app._producer._producer._broker_unavailable_since = time.monotonic() - 10.0
+
+ # Make the active metadata probe fail (brokers truly down)
+ fake_confluent = MagicMock()
+ fake_confluent.list_topics.side_effect = Exception("brokers down")
+ app._producer._producer._inner_producer = fake_confluent
+
+ def fake_inspect(topic_names, timeout=None):
+ return {
+ n: TopicConfig(num_partitions=1, replication_factor=1)
+ for n in topic_names
+ }
+
+ with mock_patch.object(TopicAdmin, "inspect_topics", side_effect=fake_inspect):
+ topic = app.topic(str(uuid.uuid4()))
+ app.dataframe(topic)
+
+ def mock_poll_row(self_consumer, *args, **kwargs):
+ return None
+
+ with mock_patch.object(InternalConsumer, "poll_row", mock_poll_row):
+ with mock_patch.object(InternalConsumer, "_subscribe"):
+ with pytest.raises(KafkaBrokerUnavailableError):
+ app.run()
+
+ def test_app_does_not_raise_when_check_disabled(self):
+ """With broker_availability_timeout=0, the app should NOT check
+ broker availability."""
+ from unittest.mock import patch as mock_patch
+
+ from quixstreams.internal_consumer import InternalConsumer
+ from quixstreams.models.topics.admin import TopicAdmin
+ from quixstreams.models.topics.topic import TopicConfig
+
+ app = self._make_app(broker_availability_timeout=0)
+
+ # Simulate _ALL_BROKERS_DOWN in the past
+ app._producer._producer._broker_unavailable_since = time.monotonic() - 9999.0
+
+ def fake_inspect(topic_names, timeout=None):
+ return {
+ n: TopicConfig(num_partitions=1, replication_factor=1)
+ for n in topic_names
+ }
+
+ with mock_patch.object(TopicAdmin, "inspect_topics", side_effect=fake_inspect):
+ topic = app.topic(str(uuid.uuid4()))
+ app.dataframe(topic)
+
+ poll_count = 0
+
+ def mock_poll_row(self_consumer, *args, **kwargs):
+ nonlocal poll_count
+ poll_count += 1
+ if poll_count >= 3:
+ app.stop()
+ return None
+
+ with mock_patch.object(InternalConsumer, "poll_row", mock_poll_row):
+ with mock_patch.object(InternalConsumer, "_subscribe"):
+ # Should NOT raise — just run and stop normally
+ app.run()
+
+ def test_internal_producer_broker_available_passthrough(self):
+ """InternalProducer._broker_available() should reset the tracker
+ on the underlying Producer."""
+ app = self._make_app()
+
+ # Set the tracker as if brokers were down
+ app._producer._producer._broker_unavailable_since = time.monotonic() - 10.0
+
+ # Call via InternalProducer passthrough
+ app._producer._broker_available()
+
+ assert app._producer._producer._broker_unavailable_since is None
+
+ def test_default_timeout_is_120_seconds(self):
+ """The default broker_availability_timeout should be 120s."""
+ app = self._make_app()
+ assert app._broker_availability_timeout == 120.0
+
+ def test_negative_timeout_raises_value_error(self):
+ """A negative broker_availability_timeout should raise ValueError."""
+ with pytest.raises(
+ ValueError, match="broker_availability_timeout must be >= 0"
+ ):
+ self._make_app(broker_availability_timeout=-1)
+
+
+class TestConsumerBrokerAvailability:
+ """Tests for BaseConsumer detecting prolonged broker unavailability."""
+
+ def test_all_brokers_down_sets_timestamp(self):
+ """When _ALL_BROKERS_DOWN fires, the consumer should record the timestamp."""
+ from quixstreams.kafka.consumer import BaseConsumer
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+ assert consumer._broker_unavailable_since is None
+
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ consumer._error_cb(error)
+ assert consumer._broker_unavailable_since is not None
+
+ def test_broker_available_resets_timestamp(self):
+ """_broker_available() should reset the unavailable timestamp."""
+ from quixstreams.kafka.consumer import BaseConsumer
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ consumer._error_cb(error)
+ assert consumer._broker_unavailable_since is not None
+
+ consumer._broker_available()
+ assert consumer._broker_unavailable_since is None
+
+ def test_raise_if_broker_unavailable_raises_after_timeout(self):
+ """Should raise after timeout with failed probe."""
+ from unittest.mock import MagicMock
+
+ from quixstreams.kafka.consumer import BaseConsumer
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+
+ with patch("time.monotonic", return_value=100.0):
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ consumer._error_cb(error)
+
+ fake = MagicMock()
+ fake.list_topics.side_effect = Exception("down")
+ consumer._inner_consumer = fake
+
+ with patch("time.monotonic", return_value=280.0):
+ with pytest.raises(
+ KafkaBrokerUnavailableError,
+ match="broker_availability_timeout",
+ ):
+ consumer.raise_if_broker_unavailable(timeout=120.0)
+
+ def test_active_probe_resets_timer_when_brokers_recover(self):
+ """If the metadata probe succeeds, the timer should reset."""
+ from unittest.mock import MagicMock
+
+ from quixstreams.kafka.consumer import BaseConsumer
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+
+ with patch("time.monotonic", return_value=100.0):
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ consumer._error_cb(error)
+
+ fake = MagicMock()
+ fake.list_topics.return_value = MagicMock()
+ consumer._inner_consumer = fake
+
+ with patch("time.monotonic", return_value=280.0):
+ consumer.raise_if_broker_unavailable(timeout=120.0)
+
+ assert consumer._broker_unavailable_since is None
+
+ def test_custom_error_callback_still_tracks_brokers(self):
+ """Custom error callback should still get broker tracking."""
+ from quixstreams.kafka.consumer import BaseConsumer
+
+ calls = []
+
+ def my_error_cb(error):
+ calls.append(error.code())
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ error_callback=my_error_cb,
+ )
+
+ error = KafkaError(KafkaError._ALL_BROKERS_DOWN)
+ consumer._error_cb(error)
+
+ assert KafkaError._ALL_BROKERS_DOWN in calls
+ assert consumer._broker_unavailable_since is not None
+
+
+class TestRecoveryBrokerAvailability:
+ """Tests for broker availability checks during state recovery."""
+
+ def test_recovery_loop_raises_when_broker_unavailable(self):
+ """The recovery loop should raise KafkaBrokerUnavailableError when
+ brokers have been down longer than the configured timeout."""
+ from unittest.mock import MagicMock
+ from unittest.mock import patch as mock_patch
+
+ from quixstreams.kafka.consumer import BaseConsumer
+ from quixstreams.models.topics import TopicManager
+ from quixstreams.state.recovery import RecoveryManager
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+
+ topic_manager = MagicMock(spec=TopicManager)
+ recovery_manager = RecoveryManager(
+ consumer=consumer,
+ topic_manager=topic_manager,
+ broker_availability_timeout=0.1,
+ )
+
+ # Simulate broker down for a long time
+ consumer._broker_unavailable_since = time.monotonic() - 10.0
+
+ # Make the active metadata probe fail
+ fake_confluent = MagicMock()
+ fake_confluent.list_topics.side_effect = Exception("brokers down")
+ fake_confluent.poll.return_value = None
+ consumer._inner_consumer = fake_confluent
+
+ # Make recovery loop think it has work (so it enters the while loop)
+ fake_rp = MagicMock()
+ recovery_manager._recovery_partitions = {0: {"store": fake_rp}}
+ recovery_manager._running = True
+
+ # Mock _update_recovery_status to isolate the broker check behavior
+ with mock_patch.object(recovery_manager, "_update_recovery_status"):
+ with pytest.raises(KafkaBrokerUnavailableError):
+ recovery_manager._recovery_loop()
+
+ def test_recovery_loop_no_check_when_timeout_disabled(self):
+ """With broker_availability_timeout=0 (disabled), recovery loop should
+ NOT check broker availability."""
+ from unittest.mock import MagicMock
+ from unittest.mock import patch as mock_patch
+
+ from quixstreams.kafka.consumer import BaseConsumer
+ from quixstreams.models.topics import TopicManager
+ from quixstreams.state.recovery import RecoveryManager
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+
+ topic_manager = MagicMock(spec=TopicManager)
+ recovery_manager = RecoveryManager(
+ consumer=consumer,
+ topic_manager=topic_manager,
+ broker_availability_timeout=0,
+ )
+
+ # Simulate broker down for a long time
+ consumer._broker_unavailable_since = time.monotonic() - 9999.0
+
+ # Make the active metadata probe fail
+ fake_confluent = MagicMock()
+ fake_confluent.list_topics.side_effect = Exception("brokers down")
+ fake_confluent.poll.return_value = None
+ consumer._inner_consumer = fake_confluent
+
+ # Make recovery loop think it has work
+ fake_rp = MagicMock()
+ recovery_manager._recovery_partitions = {0: {"store": fake_rp}}
+ recovery_manager._running = True
+
+ poll_count = 0
+
+ def stop_after_3(*args, **kwargs):
+ nonlocal poll_count
+ poll_count += 1
+ if poll_count >= 3:
+ recovery_manager._running = False
+ return None
+
+ fake_confluent.poll.side_effect = stop_after_3
+
+ # Mock _update_recovery_status to isolate the broker check behavior
+ with mock_patch.object(recovery_manager, "_update_recovery_status"):
+ # Should NOT raise — just loop and stop
+ recovery_manager._recovery_loop()
+ assert poll_count >= 3
+
+ def test_recovery_loop_resets_timer_on_changelog_message(self):
+ """Successfully consuming a changelog message during recovery should
+ reset the broker unavailability timer, preventing false positives."""
+ from unittest.mock import MagicMock
+ from unittest.mock import patch as mock_patch
+
+ from quixstreams.kafka.consumer import BaseConsumer
+ from quixstreams.models.topics import TopicManager
+ from quixstreams.state.recovery import RecoveryManager
+
+ consumer = BaseConsumer(
+ broker_address="localhost:9092",
+ consumer_group="test",
+ auto_offset_reset="latest",
+ )
+
+ topic_manager = MagicMock(spec=TopicManager)
+ recovery_manager = RecoveryManager(
+ consumer=consumer,
+ topic_manager=topic_manager,
+ broker_availability_timeout=0.1,
+ )
+
+ # Simulate broker was down (but has since recovered — messages are flowing)
+ consumer._broker_unavailable_since = time.monotonic() - 10.0
+
+ # Make the metadata probe fail — only changelog consumption should reset
+ fake_confluent = MagicMock()
+ fake_confluent.list_topics.side_effect = Exception("brokers down")
+ consumer._inner_consumer = fake_confluent
+
+ # Set up recovery partition
+ fake_rp = MagicMock()
+ fake_rp.changelog_name = "test-changelog"
+ fake_rp.partition_num = 0
+ recovery_manager._recovery_partitions = {0: {"test-changelog": fake_rp}}
+ recovery_manager._running = True
+
+ poll_count = 0
+
+ def poll_then_stop(*args, **kwargs):
+ nonlocal poll_count
+ poll_count += 1
+ if poll_count == 1:
+ # First poll: return a fake changelog message
+ msg = MagicMock()
+ msg.error.return_value = None
+ msg.partition.return_value = 0
+ msg.topic.return_value = "test-changelog"
+ return msg
+ # Second poll: stop the loop
+ recovery_manager._running = False
+ return None
+
+ fake_confluent.poll.side_effect = poll_then_stop
+
+ with mock_patch.object(recovery_manager, "_update_recovery_status"):
+ recovery_manager._recovery_loop()
+
+ # Timer should have been reset by the successful changelog consumption
+ assert consumer._broker_unavailable_since is None