diff --git a/osbenchmark/benchmark.py b/osbenchmark/benchmark.py index 8727e22a..4d81e97b 100644 --- a/osbenchmark/benchmark.py +++ b/osbenchmark/benchmark.py @@ -652,6 +652,14 @@ def add_workload_source(subparser): help="Run a load test on your cluster, up to a certain QPS value (default: 0)", default=0 ) + test_execution_parser.add_argument( + "--redline-test", + help="Run a redline test on your cluster, up to a certain QPS value (default: 1000)", + nargs='?', + const=1000, # Value to use when flag is present but no value given + default=0, # Value to use when flag is not present + type=int + ) ############################################################################### # @@ -939,6 +947,7 @@ def configure_test(arg_parser, args, cfg): opts.csv_to_list(args.load_worker_coordinator_hosts)) cfg.add(config.Scope.applicationOverride, "workload", "test.mode.enabled", args.test_mode) cfg.add(config.Scope.applicationOverride, "workload", "load.test.clients", int(args.load_test_qps)) + cfg.add(config.Scope.applicationOverride, "workload", "redline.test", int(args.redline_test)) cfg.add(config.Scope.applicationOverride, "workload", "latency.percentiles", args.latency_percentiles) cfg.add(config.Scope.applicationOverride, "workload", "throughput.percentiles", args.throughput_percentiles) cfg.add(config.Scope.applicationOverride, "workload", "randomization.enabled", args.randomization_enabled) diff --git a/osbenchmark/worker_coordinator/__init__.py b/osbenchmark/worker_coordinator/__init__.py index bf5f716c..97c57be7 100644 --- a/osbenchmark/worker_coordinator/__init__.py +++ b/osbenchmark/worker_coordinator/__init__.py @@ -25,6 +25,8 @@ # expose only the minimum API from .worker_coordinator import ( WorkerCoordinatorActor, + FeedbackActor, + StartFeedbackActor, PrepareBenchmark, PreparationComplete, StartBenchmark, diff --git a/osbenchmark/worker_coordinator/worker_coordinator.py b/osbenchmark/worker_coordinator/worker_coordinator.py index a16a54e3..f9374c32 100644 --- a/osbenchmark/worker_coordinator/worker_coordinator.py +++ b/osbenchmark/worker_coordinator/worker_coordinator.py @@ -32,9 +32,10 @@ import math import multiprocessing import queue +import sys import threading from dataclasses import dataclass -from typing import Callable +from typing import Callable, List, Dict, Any import time from enum import Enum @@ -128,7 +129,7 @@ class StartWorker: Starts a worker. """ - def __init__(self, worker_id, config, workload, client_allocations): + def __init__(self, worker_id, config, workload, client_allocations, feedback_actor=None, error_queue=None, queue_lock=None): """ :param worker_id: Unique (numeric) id of the worker. :param config: OSB internal configuration object. 
@@ -139,6 +140,9 @@ def __init__(self, worker_id, config, workload, client_allocations): self.config = config self.workload = workload self.client_allocations = client_allocations + self.feedback_actor = feedback_actor + self.error_queue = error_queue + self.queue_lock = queue_lock class Drive: @@ -198,6 +202,215 @@ def __init__(self, metrics, next_task_scheduled_in): self.metrics = metrics self.next_task_scheduled_in = next_task_scheduled_in +class ClusterErrorMessage: + """Message sent from the client when a request fails during load testing""" + def __init__(self, client_id, request_metadata): + self.client_id = client_id + self.request_metadata = request_metadata + +class SharedClientStateMessage: + """Message sent from the Worker to the FeedbackActor to share client state dictionaries""" + def __init__(self, worker_id, worker_clients_map): + self.worker_id = worker_id + self.worker_clients_map = worker_clients_map + +class EnableFeedbackScaling: + pass + +class DisableFeedbackScaling: + pass + +class FeedbackState(Enum): + """Various states for the FeedbackActor""" + NEUTRAL = "neutral" + SCALING_DOWN = "scaling_down" + SLEEP = "sleep" + SCALING_UP = "scaling_up" + DISABLED = "disabled" + +class StartFeedbackActor: + def __init__(self, feedback_actor, error_queue=None, queue_lock=None, total_workers=None): + self.feedback_actor = feedback_actor + self.total_workers = total_workers + self.error_queue = error_queue + self.queue_lock = queue_lock + +class FeedbackActor(actor.BenchmarkActor): + POST_SCALEDOWN_SECONDS = 30 + STARTUP_TIMEOUT = 30 + WAKEUP_INTERVAL = 1 + + def __init__(self) -> None: + super().__init__() + self.logger = logging.getLogger(__name__) + self.state = FeedbackState.NEUTRAL + self.shared_client_states = {} + self.expected_worker_count = 0 + self.workers_reported = 0 + self.total_client_count = 0 + self.total_active_client_count = 0 # must be tracked for scaling up/down + self.sleep_start_time = time.perf_counter() + self.last_error_time = time.perf_counter() - FeedbackActor.POST_SCALEDOWN_SECONDS + self.last_scaleup_time = time.perf_counter() - FeedbackActor.POST_SCALEDOWN_SECONDS + # These will be passed in via StartFeedbackActor: + self.error_queue = None + self.queue_lock = None + + def receiveMsg_SharedClientStateMessage(self, msg, sender) -> None: + """ + Receives a worker's client state dictionary. + """ + try: + # Store only the 'data' portion; error reporting is done via the error queue. + self.shared_client_states[msg.worker_id] = msg.worker_clients_map['data'] + self.total_client_count += len(msg.worker_clients_map['data']) + self.workers_reported += 1 + if self.workers_reported == self.expected_worker_count: + self.wakeupAfter(datetime.timedelta(seconds=FeedbackActor.WAKEUP_INTERVAL)) + except Exception as e: + self.logger.error("Error processing client states: %s", e) + + def receiveMsg_StartFeedbackActor(self, msg, sender) -> None: + """ + Initializes the FeedbackActor with expected worker count, error queue, and queue lock. + """ + self.expected_worker_count = msg.total_workers + self.error_queue = msg.error_queue + self.queue_lock = msg.queue_lock + self.startup_time = time.perf_counter() + self.wakeupAfter(datetime.timedelta(seconds=FeedbackActor.WAKEUP_INTERVAL)) + + def receiveMsg_WakeupMessage(self, msg, sender) -> None: + # Check state and re-schedule wakeups. 
+ self.handle_state() + self.wakeupAfter(datetime.timedelta(seconds=FeedbackActor.WAKEUP_INTERVAL)) + + def receiveUnrecognizedMessage(self, msg, sender) -> None: + self.logger.info("Received unrecognized message: %s", msg) + + def receiveMsg_EnableFeedbackScaling(self, msg, sender): + self.logger.info("FeedbackActor: scaling enabled.") + self.state = FeedbackState.SCALING_UP + + def receiveMsg_DisableFeedbackScaling(self, msg, sender): + self.logger.info("FeedbackActor: scaling disabled.") + self.state = FeedbackState.DISABLED + + def receiveMsg_ActorExitRequest(self, msg, sender): + self.logger.info("FeedbackActor received ActorExitRequest and will shutdown") + if hasattr(self, 'shared_client_states'): + self.shared_client_states.clear() + + def check_for_errors(self) -> List[Dict[str, Any]]: + """Poll the error queue for errors.""" + errors = [] + try: + while True: + error = self.error_queue.get_nowait() + errors.append(error) + except queue.Empty: + pass # queue is empty + return errors + + def clear_queue(self) -> None: + """Clear any remaining items from the error queue.""" + while True: + try: + self.error_queue.get_nowait() + except queue.Empty: + break + + def handle_state(self) -> None: + current_time = time.perf_counter() + errors = self.check_for_errors() + + sys.stdout.write("\x1b[s") # Save cursor position + sys.stdout.write("\x1b[1B") # Move cursor down 1 line + sys.stdout.write("\r\x1b[2K") # Clear the line + sys.stdout.write(f"[Redline] Active clients: {self.total_active_client_count}") + sys.stdout.write("\x1b[u") # Restore cursor position + sys.stdout.flush() + + if self.state == FeedbackState.DISABLED: + return + + if self.state == FeedbackState.SLEEP: + if current_time - self.sleep_start_time >= self.POST_SCALEDOWN_SECONDS: + self.logger.info("Sleep period complete, returning to NEUTRAL state") + self.clear_queue() + self.state = FeedbackState.NEUTRAL + self.sleep_start_time = current_time + + elif errors: + self.logger.info("Error messages detected, scaling down...") + self.state = FeedbackState.SCALING_DOWN + with self.queue_lock: # Block producers while scaling down. + self.scale_down() + self.logger.info("Clients scaled down. Active clients: %d", self.total_active_client_count) + self.last_error_time = current_time + + elif self.state == FeedbackState.NEUTRAL: + if (current_time - self.last_error_time >= self.POST_SCALEDOWN_SECONDS and + current_time - self.last_scaleup_time >= self.WAKEUP_INTERVAL): + self.logger.info("No errors in the last %d seconds, scaling up", self.POST_SCALEDOWN_SECONDS) + self.state = FeedbackState.SCALING_UP + + if self.state == FeedbackState.SCALING_UP: + self.logger.info("Scaling up...") + self.scale_up() + self.logger.info("Clients scaled up. 
Active clients: %d", self.total_active_client_count) + self.state = FeedbackState.NEUTRAL + + def scale_down(self, scale_down_percentage=0.10) -> None: + try: + clients_to_pause = int(self.total_active_client_count * scale_down_percentage) + clients_paused = 0 + active_clients_by_worker = {} + for worker_id, client_states in self.shared_client_states.items(): + active_clients = [(client_id, status) for client_id, status in client_states.items() if status] + if active_clients: + active_clients_by_worker[worker_id] = active_clients + while clients_paused < clients_to_pause and active_clients_by_worker: + for worker_id in list(active_clients_by_worker.keys()): + if clients_paused >= clients_to_pause: + break + if not active_clients_by_worker[worker_id]: + del active_clients_by_worker[worker_id] + continue + client_id, _ = active_clients_by_worker[worker_id].pop(0) + self.shared_client_states[worker_id][client_id] = False + clients_paused += 1 + self.total_active_client_count -= 1 + self.logger.info("Scaling down complete. Paused %d clients", clients_paused) + finally: + self.state = FeedbackState.SLEEP + self.clear_queue() + self.sleep_start_time = self.last_scaleup_time = time.perf_counter() + + def scale_up(self, n_clients=5) -> None: + try: + clients_activated = 0 + while clients_activated < n_clients: + inactive_clients_by_worker = {} + for worker_id, client_states in self.shared_client_states.items(): + inactive_clients = [(client_id, status) for client_id, status in client_states.items() if not status] + if inactive_clients: + inactive_clients_by_worker[worker_id] = inactive_clients + for worker_id in inactive_clients_by_worker: + if clients_activated >= n_clients: + break + if inactive_clients_by_worker[worker_id]: + client_id, _ = inactive_clients_by_worker[worker_id][0] + self.shared_client_states[worker_id][client_id] = True + self.total_active_client_count += 1 + clients_activated += 1 + self.logger.info("Unpaused client %d on worker %d", client_id, worker_id) + if clients_activated < n_clients: + self.logger.info("Not enough inactive clients to activate. Activated %d clients", clients_activated) + break + finally: + self.last_scaleup_time = time.perf_counter() + self.state = FeedbackState.NEUTRAL class WorkerCoordinatorActor(actor.BenchmarkActor): RESET_RELATIVE_TIME_MARKER = "reset_relative_time" @@ -218,6 +431,7 @@ def __init__(self): self.status = "init" self.post_process_timer = 0 self.cluster_details = None + self.feedback_actor = None def receiveMsg_PoisonMessage(self, poisonmsg, sender): self.logger.error("Main worker_coordinator received a fatal indication from load generator (%s). 
Shutting down.", poisonmsg.details) @@ -249,6 +463,9 @@ def receiveMsg_ChildActorExited(self, msg, sender): self.send(self.start_sender, actor.BenchmarkFailure("Worker [{}] has exited prematurely.".format(worker_index))) else: self.logger.info("A workload preparator has exited.") + + def receiveMsg_StartFeedbackActor(self, msg, sender): + self.feedback_actor = msg.feedback_actor def receiveUnrecognizedMessage(self, msg, sender): self.logger.info("Main worker_coordinator received unknown message [%s] (ignoring).", str(msg)) @@ -293,8 +510,19 @@ def receiveMsg_WakeupMessage(self, msg, sender): def create_client(self, host): return self.createActor(Worker, targetActorRequirements=self._requirements(host)) - def start_worker(self, worker_coordinator, worker_id, cfg, workload, allocations): - self.send(worker_coordinator, StartWorker(worker_id, cfg, workload, allocations)) + def start_worker(self, worker_coordinator, worker_id, cfg, workload, allocations, error_queue=None, queue_lock=None): + self.send(worker_coordinator, StartWorker(worker_id, cfg, workload, allocations, self.feedback_actor, error_queue, queue_lock)) + + def start_feedbackActor(self, total_workers): + self.send( + self.feedback_actor, + StartFeedbackActor( + feedback_actor=self.feedback_actor, + total_workers=total_workers, + error_queue=self.coordinator.error_queue, + queue_lock=self.coordinator.queue_lock + ) + ) def drive_at(self, worker_coordinator, client_start_timestamp): self.send(worker_coordinator, Drive(client_start_timestamp)) @@ -532,6 +760,9 @@ def __init__(self, target, config, os_client_factory_class=client.OsClientFactor self.workers = [] # which client ids are assigned to which workers? self.clients_per_worker = {} + self.manager = multiprocessing.Manager() + self.error_queue = None + self.queue_lock = self.manager.Lock() self.progress_results_publisher = console.progress() self.progress_counter = 0 @@ -657,15 +888,17 @@ def start_benchmark(self): self.logger.info("Attaching cluster-level telemetry devices.") self.telemetry.on_benchmark_start() self.logger.info("Cluster-level telemetry devices are now attached.") - # if load testing is enabled, modify the client + throughput number for the task(s) - # target throughput + clients will then be equal to the qps passed in through --load-test - load_test_clients = self.config.opts("workload", "load.test.clients", mandatory=False) + # if redline testing or load testing is enabled, modify the client + throughput number for the task(s) + # target throughput + clients will then be equal to the qps passed in through --redline-test or --load-test + load_test_clients = self.config.opts("workload", "redline.test", mandatory=False) or self.config.opts("workload", "load.test.clients", mandatory=False) if load_test_clients: + self.target.feedback_actor = self.target.createActor(FeedbackActor) + self.error_queue = self.manager.Queue(maxsize=1000) for task in self.test_procedure.schedule: for subtask in task: subtask.clients = load_test_clients subtask.params["target-throughput"] = load_test_clients - self.logger.info("Load test mode enabled - set client count to %d", load_test_clients) + self.logger.info("Load test mode enabled - set max client count to %d", load_test_clients) allocator = Allocator(self.test_procedure.schedule) self.allocations = allocator.allocations self.number_of_steps = len(allocator.join_points) - 1 @@ -679,11 +912,15 @@ def start_benchmark(self): worker_assignments = calculate_worker_assignments(self.load_worker_coordinator_hosts, allocator.clients) 
worker_id = 0 + # redline testing: keep track of the total number of workers + # and report this to the feedbackActor before starting a redline test + total_workers = 0 for assignment in worker_assignments: host = assignment["host"] for clients in assignment["workers"]: # don't assign workers without any clients if len(clients) > 0: + total_workers += 1 self.logger.info("Allocating worker [%d] on [%s] with [%d] clients.", worker_id, host, len(clients)) worker = self.target.create_client(host) @@ -691,10 +928,16 @@ def start_benchmark(self): for client_id in clients: client_allocations.add(client_id, self.allocations[client_id]) self.clients_per_worker[client_id] = worker_id - self.target.start_worker(worker, worker_id, self.config, self.workload, client_allocations) + self.target.start_worker(worker, worker_id, self.config, self.workload, client_allocations, self.error_queue, self.queue_lock) self.workers.append(worker) worker_id += 1 + # redline testing: once all workers have been given their assignments, we can start the feedbackActor. + # we pass along the total number of workers, so it can have some context on how many dictionaries it should have + # BEFORE starting the redline test + if self.config.opts("workload", "redline.test", mandatory=False): + self.target.start_feedbackActor(total_workers) + self.update_progress_message() def joinpoint_reached(self, worker_id, worker_local_timestamp, task_allocations): @@ -1076,6 +1319,8 @@ def __init__(self): self.start_driving = False self.wakeup_interval = Worker.WAKEUP_INTERVAL_SECONDS self.sample_queue_size = None + self.shared_states = None + self.feedback_actor = None @actor.no_retry("worker") # pylint: disable=no-value-for-parameter def receiveMsg_StartWorker(self, msg, sender): @@ -1090,12 +1335,36 @@ def receiveMsg_StartWorker(self, msg, sender): self.client_allocations = msg.client_allocations self.current_task_index = 0 self.cancel.clear() + self.feedback_actor = msg.feedback_actor + self.error_queue = msg.error_queue + self.queue_lock = msg.queue_lock # we need to wake up more often in test mode if self.config.opts("workload", "test.mode.enabled"): self.wakeup_interval = 0.5 runner.register_default_runners() if self.workload.has_plugins: workload.load_workload_plugins(self.config, self.workload.name, runner.register_runner, scheduler.register_scheduler) + + # if redline testing is enabled, let's create shared state dictionaries + # these will be shared between the FeedbackActor, as well as clients inside AsyncExecutor + # and will tell clients whether they should be sending requests or not + if self.config.opts("workload", "redline.test", mandatory=False): + self.manager = multiprocessing.Manager() + self.shared_states = { + "worker_id": msg.worker_id, + "data": self.manager.dict() + } + # now let's add the clients assigned to this worker, from the allocations already calculated for us + for allocation in msg.client_allocations.allocations: + client_id = allocation["client_id"] + self.shared_states["data"][client_id] = False + # now let's send this over to the FeedbackActor + shared_state_message = SharedClientStateMessage( + worker_id=msg.worker_id, + worker_clients_map=self.shared_states + ) + self.send(self.feedback_actor, shared_state_message) + self.drive() @actor.no_retry("worker") # pylint: disable=no-value-for-parameter @@ -1198,6 +1467,7 @@ def drive(self): if self.at_joinpoint(): self.logger.info("Worker[%d] reached join point at index [%d].", self.worker_id, self.current_task_index) + self.send(self.feedback_actor, 
DisableFeedbackScaling()) # clients that don't execute tasks don't need to care about waiting if self.executor_future is not None: self.executor_future.result() @@ -1214,10 +1484,11 @@ def drive(self): self.logger.info("Worker[%d] skips tasks at index [%d] because it has been asked to complete all " "tasks until next join point.", self.worker_id, self.current_task_index) else: + self.send(self.feedback_actor, EnableFeedbackScaling()) self.logger.info("Worker[%d] is executing tasks at index [%d].", self.worker_id, self.current_task_index) self.sampler = Sampler(start_timestamp=time.perf_counter(), buffer_size=self.sample_queue_size) executor = AsyncIoAdapter(self.config, self.workload, task_allocations, self.sampler, - self.cancel, self.complete, self.on_error) + self.cancel, self.complete, self.on_error, self.send, self.shared_states, self.feedback_actor, self.error_queue, self.queue_lock) self.executor_future = self.pool.submit(executor) self.wakeupAfter(datetime.timedelta(seconds=self.wakeup_interval)) @@ -1486,7 +1757,7 @@ def map_task_throughput(self, current_samples): class AsyncIoAdapter: - def __init__(self, cfg, workload, task_allocations, sampler, cancel, complete, abort_on_error): + def __init__(self, cfg, workload, task_allocations, sampler, cancel, complete, abort_on_error, send_fn, shared_states=None, feedback_actor=None, error_queue=None, queue_lock=None): self.cfg = cfg self.workload = workload self.task_allocations = task_allocations @@ -1498,6 +1769,11 @@ def __init__(self, cfg, workload, task_allocations, sampler, cancel, complete, a self.assertions_enabled = self.cfg.opts("worker_coordinator", "assertions") self.debug_event_loop = self.cfg.opts("system", "async.debug", mandatory=False, default_value=False) self.logger = logging.getLogger(__name__) + self.shared_states = shared_states + self.send_fn = send_fn + self.feedback_actor = feedback_actor + self.error_queue = error_queue + self.queue_lock = queue_lock def __call__(self, *args, **kwargs): # only possible in Python 3.7+ (has introduced get_running_loop) @@ -1553,7 +1829,7 @@ def os_clients(all_hosts, all_client_options): schedule = schedule_for(task_allocation, params_per_task[task]) async_executor = AsyncExecutor( client_id, task, schedule, opensearch, self.sampler, self.cancel, self.complete, - task.error_behavior(self.abort_on_error), self.cfg) + task.error_behavior(self.abort_on_error), self.send_fn, self.cfg, self.shared_states, self.feedback_actor, self.error_queue, self.queue_lock) final_executor = AsyncProfiler(async_executor) if self.profiling_enabled else async_executor aws.append(final_executor()) run_start = time.perf_counter() @@ -1605,7 +1881,7 @@ async def __call__(self, *args, **kwargs): class AsyncExecutor: - def __init__(self, client_id, task, schedule, opensearch, sampler, cancel, complete, on_error, config=None): + def __init__(self, client_id, task, schedule, opensearch, sampler, cancel, complete, on_error, send_fn, config=None, shared_states=None, feedback_actor=None, error_queue=None, queue_lock=None): """ Executes tasks according to the schedule for a given operation. 
@@ -1628,8 +1904,16 @@ def __init__(self, client_id, task, schedule, opensearch, sampler, cancel, compl self.on_error = on_error self.logger = logging.getLogger(__name__) self.cfg = config + self.send = send_fn + self.shared_states = shared_states + self.feedback_actor = feedback_actor + self.error_queue = error_queue + self.queue_lock = queue_lock + self.redline_enabled = self.cfg.opts("workload", "redline.test", mandatory=False) async def __call__(self, *args, **kwargs): + has_run_any_requests = False + was_paused = False task_completes_parent = self.task.completes_parent total_start = time.perf_counter() # lazily initialize the schedule @@ -1651,6 +1935,27 @@ async def __call__(self, *args, **kwargs): if self.cancel.is_set(): self.logger.info("User cancelled execution.") break + + # redline testing: check whether this client should be running + # if redline testing is not enabled, there won't be a dictionary shared to this client, + # so we evaluate to a truthy value by default, allowing it to run normally in a regular benchmark + client_state = (self.shared_states or {}).get('data', {}).get(self.client_id, True) + if client_state and was_paused: + now = time.perf_counter() + total_start = now - expected_scheduled_time + self.logger.debug(f"Client {self.client_id} resumed. Adjusted total_start to prevent request burst.") + was_paused = False + elif not client_state: + was_paused = True + processing_end = time.perf_counter() + total_ops = 0 + total_ops_unit = "ops" + request_meta_data = {"success": False, "skipped": True} + self.schedule_handle.after_request(processing_end, total_ops, total_ops_unit, request_meta_data) + if self.complete.is_set(): + break + continue + absolute_expected_schedule_time = total_start + expected_scheduled_time throughput_throttled = expected_scheduled_time > 0 if throughput_throttled: @@ -1661,6 +1966,7 @@ async def __call__(self, *args, **kwargs): absolute_processing_start = time.time() processing_start = time.perf_counter() self.schedule_handle.before_request(processing_start) + has_run_any_requests = True async with self.opensearch["default"].new_request_context() as request_context: # add num_clients to the parameter so that vector search runner can skip calculating recall # if num_clients > cpu_count(). @@ -1670,11 +1976,21 @@ async def __call__(self, *args, **kwargs): default_value=multiprocessing.cpu_count())) params.update({"num_clients": self.task.clients, "num_cores": available_cores}) - total_ops, total_ops_unit, request_meta_data = await execute_single(runner, self.opensearch, params, self.on_error) + total_ops, total_ops_unit, request_meta_data = await execute_single(runner, self.opensearch, params, self.on_error, self.redline_enabled) request_start = request_context.request_start request_end = request_context.request_end client_request_start = request_context.client_request_start client_request_end = request_context.client_request_end + # redline testing: send any bad requests to the FeedbackActor + if not request_meta_data["success"] and not request_meta_data.get("skipped", False): + if self.error_queue is not None: + self.logger.error("Real error detected in client %s. 
Notifying FeedbackActor...", self.client_id) + error_info = { + "client_id": self.client_id, + "task": str(self.task), + "error_details": request_meta_data + } + self.report_error(error_info) processing_end = time.perf_counter() service_time = request_end - request_start @@ -1712,10 +2028,11 @@ async def __call__(self, *args, **kwargs): else: progress = percent_completed - self.sampler.add(self.task, self.client_id, sample_type, request_meta_data, - absolute_processing_start, request_start, - latency, service_time, client_processing_time, processing_time, throughput, total_ops, total_ops_unit, - time_period, progress, request_meta_data.pop("dependent_timing", None)) + if client_state: + self.sampler.add(self.task, self.client_id, sample_type, request_meta_data, + absolute_processing_start, request_start, + latency, service_time, client_processing_time, processing_time, throughput, total_ops, total_ops_unit, + time_period, progress, request_meta_data.pop("dependent_timing", None)) if completed: self.logger.info("Task [%s] is considered completed due to external event.", self.task) @@ -1725,16 +2042,22 @@ async def __call__(self, *args, **kwargs): raise exceptions.BenchmarkError(f"Cannot run task [{self.task}]: {e}") from None finally: # Actively set it if this task completes its parent - if task_completes_parent: + if task_completes_parent or not has_run_any_requests: self.logger.info("Task [%s] completes parent. Client id [%s] is finished executing it and signals completion.", self.task, self.client_id) self.complete.set() + def report_error(self, error_info): + if self.error_queue is not None: + try: + self.error_queue.put_nowait(error_info) + except queue.Full: + self.logger.warning("Error queue full; dropping error from client %s", self.client_id) request_context_holder = client.RequestContextHolder() -async def execute_single(runner, opensearch, params, on_error): +async def execute_single(runner, opensearch, params, on_error, redline_enabled): """ Invokes the given runner once and provides the runner's return value in a uniform structure. @@ -1799,7 +2122,8 @@ async def execute_single(runner, opensearch, params, on_error): description = request_meta_data.get("error-description") if description: msg += ", Description: %s" % description - console.error(msg) + if not redline_enabled: + console.error(msg) raise exceptions.BenchmarkAssertionError(msg) if 'error-description' in request_meta_data: @@ -1807,12 +2131,14 @@ async def execute_single(runner, opensearch, params, on_error): error_metadata = json.loads(request_meta_data["error-description"]) # parse error-description metadata opensearch_operation_error = parse_error(error_metadata) - console.error(opensearch_operation_error.get_error_message()) + if not redline_enabled: + console.error(opensearch_operation_error.get_error_message()) except Exception as e: # error-description is not a valid json so we just print it - console.error(request_meta_data["error-description"]) - - logging.getLogger(__name__).error(request_meta_data["error-description"]) + if not redline_enabled: + console.error(request_meta_data["error-description"]) + if not redline_enabled: + logging.getLogger(__name__).error(request_meta_data["error-description"]) return total_ops, total_ops_unit, request_meta_data
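
The FeedbackActor introduced above boils down to a small control policy: every client starts paused; each healthy one-second wakeup (WAKEUP_INTERVAL) unpauses five clients (scale_up); any error drained from the shared error queue pauses 10% of the currently active clients (scale_down) and puts the actor to sleep for POST_SCALEDOWN_SECONDS (30 s) before scaling resumes. The following is a condensed, single-process sketch of that policy with the Thespian actors, multiprocessing.Manager dictionaries, and the queue lock stripped out; RedlinePolicy, tick(), and errors_seen are illustrative names only and are not part of the patch.

    # Condensed sketch of the redline feedback policy implemented by FeedbackActor.
    # Wall-clock waits are replaced with discrete one-second ticks so the numbers
    # (10% scale-down, 5 clients per scale-up, 30 s cooldown) can be seen in isolation.
    from enum import Enum


    class State(Enum):
        NEUTRAL = "neutral"
        SLEEP = "sleep"


    class RedlinePolicy:
        POST_SCALEDOWN_SECONDS = 30   # cooldown after a scale-down
        SCALE_DOWN_PERCENTAGE = 0.10  # pause 10% of active clients on error
        SCALE_UP_STEP = 5             # unpause 5 clients per healthy tick

        def __init__(self, total_clients):
            # Mirrors the shared per-client state: all clients start paused.
            self.client_active = {client_id: False for client_id in range(total_clients)}
            self.state = State.NEUTRAL
            self.cooldown_remaining = 0

        @property
        def active_clients(self):
            return sum(self.client_active.values())

        def tick(self, errors_seen):
            """One wakeup (roughly one second in the real actor)."""
            if self.state is State.SLEEP:
                # Errors observed during the cooldown are ignored, matching clear_queue().
                self.cooldown_remaining -= 1
                if self.cooldown_remaining <= 0:
                    self.state = State.NEUTRAL
                return
            if errors_seen:
                to_pause = int(self.active_clients * self.SCALE_DOWN_PERCENTAGE)
                for client_id, active in list(self.client_active.items()):
                    if to_pause == 0:
                        break
                    if active:
                        self.client_active[client_id] = False
                        to_pause -= 1
                self.state = State.SLEEP
                self.cooldown_remaining = self.POST_SCALEDOWN_SECONDS
            else:
                to_start = self.SCALE_UP_STEP
                for client_id, active in list(self.client_active.items()):
                    if to_start == 0:
                        break
                    if not active:
                        self.client_active[client_id] = True
                        to_start -= 1


    if __name__ == "__main__":
        policy = RedlinePolicy(total_clients=100)
        for second in range(50):
            policy.tick(errors_seen=(second == 12))  # inject a single error at t=12s
            print(f"t={second:02d}s state={policy.state.value:7s} active={policy.active_clients}")

Running the sketch shows the active-client count ramping by five per tick, dropping by 10% when the injected error arrives at t=12 s, holding flat through the 30-tick cooldown, and then ramping again.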
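A second detail worth calling out is how paused clients behave. AsyncExecutor only consults the shared dictionary if one was handed to it, so a regular (non-redline) benchmark falls through to True and is unaffected; a paused client records a skipped, unsuccessful sample and keeps polling, and when it is unpaused the schedule origin is rebased so it does not fire a burst of catch-up requests. A minimal sketch of that gate, assuming a plain dict in place of the Manager dict and a stub send_request callable (run_schedule is illustrative, not the executor's real loop):

    import time


    def should_run(shared_states, client_id):
        # Same lookup as in AsyncExecutor.__call__: a missing dictionary or a missing
        # key evaluates to True, so ordinary benchmarks are unaffected.
        return (shared_states or {}).get("data", {}).get(client_id, True)


    def run_schedule(shared_states, client_id, schedule, send_request):
        # schedule yields the expected offset (in seconds) of each request from the start.
        total_start = time.perf_counter()
        was_paused = False
        for expected_scheduled_time in schedule:
            if should_run(shared_states, client_id):
                if was_paused:
                    # Rebase the start time so a freshly resumed client does not
                    # issue a burst of requests to "catch up" on its paused period.
                    total_start = time.perf_counter() - expected_scheduled_time
                    was_paused = False
                absolute_expected_schedule_time = total_start + expected_scheduled_time
                time.sleep(max(0.0, absolute_expected_schedule_time - time.perf_counter()))
                send_request()
            else:
                # Paused client: the real executor records a sample with
                # {"success": False, "skipped": True} and re-checks on the next pass;
                # the short sleep here just keeps the sketch from busy-looping.
                was_paused = True
                time.sleep(0.1)


    # No shared state (redline disabled): the client always runs.
    assert should_run(None, client_id=3) is True
    # Redline enabled and the FeedbackActor has paused client 3 on this worker.
    assert should_run({"data": {3: False}}, client_id=3) is False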
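Usage-wise, the new flag mirrors the existing load-test option: passing --redline-test with no value runs a redline test with a ceiling of 1000 QPS (the argparse const), --redline-test <n> sets the ceiling to n, and omitting the flag (default 0) leaves redline testing disabled. When it is enabled, start_benchmark sets every subtask's client count and target-throughput to that ceiling, but all clients begin paused and the FeedbackActor decides how many are active at any moment; request errors are routed to the FeedbackActor's queue rather than echoed through console.error, and progress is surfaced through the single "[Redline] Active clients: N" status line.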