Skip to content

Commit 4d96c35

Browse files
authored
Changes to support ramp-up feature (#725)
Signed-off-by: Rishabh Singh <[email protected]>
1 parent 1417225 commit 4d96c35

File tree

8 files changed

+828
-365
lines changed

8 files changed

+828
-365
lines changed

.pylintrc

+1-1
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ indent-after-paren=4
360360
indent-string=' '
361361

362362
# Maximum number of characters on a single line.
363-
max-line-length=140
363+
max-line-length=180
364364

365365
# Maximum number of lines in a module.
366366
max-module-lines=1000

osbenchmark/resources/workload-schema.json

+15
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
"type": "integer",
2929
"minimum": 1
3030
},
31+
"ramp-up-time-period": {
32+
"type": "integer",
33+
"minimum": 0,
34+
"description": "Defines the time period in seconds to gradually increase the number of clients."
35+
},
3136
"warmup-time-period": {
3237
"type": "integer",
3338
"minimum": 0,
@@ -75,6 +80,11 @@
7580
"minimum": 1,
7681
"description": "Defines the number of times to run the operation."
7782
},
83+
"ramp-up-time-period": {
84+
"type": "integer",
85+
"minimum": 0,
86+
"description": "Defines the time period in seconds to gradually increase the number of clients."
87+
},
7888
"warmup-time-period": {
7989
"type": "integer",
8090
"minimum": 0,
@@ -146,6 +156,11 @@
146156
"minimum": 1,
147157
"description": "Defines the number of times to run the operation."
148158
},
159+
"ramp-up-time-period": {
160+
"type": "integer",
161+
"minimum": 0,
162+
"description": "Defines the time period in seconds to gradually increase the number of clients."
163+
},
149164
"warmup-time-period": {
150165
"type": "integer",
151166
"minimum": 0,

osbenchmark/worker_coordinator/worker_coordinator.py

+58-22
Original file line numberDiff line numberDiff line change
@@ -1522,7 +1522,7 @@ def os_clients(all_hosts, all_client_options):
15221522
#
15231523
# Now we need to ensure that we start partitioning parameters correctly in both cases. And that means we
15241524
# need to start from (client) index 0 in both cases instead of 0 for indexA and 4 for indexB.
1525-
schedule = schedule_for(task, task_allocation.client_index_in_task, params_per_task[task])
1525+
schedule = schedule_for(task_allocation, params_per_task[task])
15261526
async_executor = AsyncExecutor(
15271527
client_id, task, schedule, opensearch, self.sampler, self.cancel, self.complete,
15281528
task.error_behavior(self.abort_on_error), self.cfg)
@@ -1607,6 +1607,15 @@ async def __call__(self, *args, **kwargs):
16071607
# lazily initialize the schedule
16081608
self.logger.debug("Initializing schedule for client id [%s].", self.client_id)
16091609
schedule = self.schedule_handle()
1610+
self.schedule_handle.start()
1611+
rampup_wait_time = self.schedule_handle.ramp_up_wait_time
1612+
if rampup_wait_time:
1613+
self.logger.info("client id [%s] waiting [%.2f]s for ramp-up.", self.client_id, rampup_wait_time)
1614+
await asyncio.sleep(rampup_wait_time)
1615+
1616+
if rampup_wait_time:
1617+
console.println(f" Client id {self.client_id} is running now.")
1618+
16101619
self.logger.debug("Entering main loop for client id [%s].", self.client_id)
16111620
# noinspection PyBroadException
16121621
try:
@@ -1806,18 +1815,28 @@ def __repr__(self, *args, **kwargs):
18061815

18071816

18081817
class TaskAllocation:
1809-
def __init__(self, task, client_index_in_task):
1818+
def __init__(self, task, client_index_in_task, global_client_index, total_clients):
1819+
"""
1820+
:param task: The current task which is always a leaf task.
1821+
:param client_index_in_task: The task-specific index for the allocated client.
1822+
:param global_client_index: The globally unique index for the allocated client across
1823+
all concurrently executed tasks.
1824+
:param total_clients: The total number of clients executing tasks concurrently.
1825+
"""
18101826
self.task = task
18111827
self.client_index_in_task = client_index_in_task
1828+
self.global_client_index = global_client_index
1829+
self.total_clients = total_clients
18121830

18131831
def __hash__(self):
1814-
return hash(self.task) ^ hash(self.client_index_in_task)
1832+
return hash(self.task) ^ hash(self.global_client_index)
18151833

18161834
def __eq__(self, other):
1817-
return isinstance(other, type(self)) and self.task == other.task and self.client_index_in_task == other.client_index_in_task
1835+
return isinstance(other, type(self)) and self.task == other.task and self.global_client_index == other.global_client_index
18181836

18191837
def __repr__(self, *args, **kwargs):
1820-
return "TaskAllocation [%d/%d] for %s" % (self.client_index_in_task, self.task.clients, self.task)
1838+
return f"TaskAllocation [{self.client_index_in_task}/{self.task.clients}] for {self.task} " \
1839+
f"and [{self.global_client_index}/{self.total_clients}] in total"
18211840

18221841

18231842
class Allocator:
@@ -1858,12 +1877,16 @@ def allocations(self):
18581877
clients_executing_completing_task = []
18591878
for sub_task in task:
18601879
for client_index in range(start_client_index, start_client_index + sub_task.clients):
1861-
# this is the actual client that will execute the task. It may differ from the logical one in case we over-commit (i.e.
1862-
# more tasks than actually available clients)
18631880
physical_client_index = client_index % max_clients
18641881
if sub_task.completes_parent:
18651882
clients_executing_completing_task.append(physical_client_index)
1866-
allocations[physical_client_index].append(TaskAllocation(sub_task, client_index - start_client_index))
1883+
ta = TaskAllocation(task = sub_task,
1884+
client_index_in_task = client_index - start_client_index,
1885+
global_client_index=client_index,
1886+
# if task represents a parallel structure this is the total number of clients
1887+
# executing sub-tasks concurrently.
1888+
total_clients=task.clients)
1889+
allocations[physical_client_index].append(ta)
18671890
start_client_index += sub_task.clients
18681891

18691892
# uneven distribution between tasks and clients, e.g. there are 5 (parallel) tasks but only 2 clients. Then, one of them
@@ -1941,7 +1964,7 @@ def clients(self):
19411964

19421965
# Runs a concrete schedule on one worker client
19431966
# Needs to determine the runners and concrete iterations per client.
1944-
def schedule_for(task, client_index, parameter_source):
1967+
def schedule_for(task_allocation, parameter_source):
19451968
"""
19461969
Calculates a client's schedule for a given task.
19471970
@@ -1951,15 +1974,17 @@ def schedule_for(task, client_index, parameter_source):
19511974
:return: A generator for the operations the given client needs to perform for this task.
19521975
"""
19531976
logger = logging.getLogger(__name__)
1977+
task = task_allocation.task
19541978
op = task.operation
1955-
num_clients = task.clients
19561979
sched = scheduler.scheduler_for(task)
1980+
1981+
client_index = task_allocation.client_index_in_task
19571982
# guard all logging statements with the client index and only emit them for the first client. This information is
19581983
# repetitive and may cause issues in thespian with many clients (an excessive number of actor messages is sent).
19591984
if client_index == 0:
19601985
logger.info("Choosing [%s] for [%s].", sched, task)
19611986
runner_for_op = runner.runner_for(op.type)
1962-
params_for_op = parameter_source.partition(client_index, num_clients)
1987+
params_for_op = parameter_source.partition(client_index, task.clients)
19631988
if hasattr(sched, "parameter_source"):
19641989
if client_index == 0:
19651990
logger.debug("Setting parameter source [%s] for scheduler [%s]", params_for_op, sched)
@@ -1992,7 +2017,7 @@ def schedule_for(task, client_index, parameter_source):
19922017
else:
19932018
logger.info("%s schedule will determine when the schedule for [%s] terminates.", str(loop_control), task.name)
19942019

1995-
return ScheduleHandle(task.name, sched, loop_control, runner_for_op, params_for_op)
2020+
return ScheduleHandle(task_allocation, sched, loop_control, runner_for_op, params_for_op)
19962021

19972022

19982023
def requires_time_period_schedule(task, task_runner, params):
@@ -2009,27 +2034,40 @@ def requires_time_period_schedule(task, task_runner, params):
20092034

20102035

20112036
class ScheduleHandle:
2012-
def __init__(self, task_name, sched, task_progress_control, runner, params):
2037+
def __init__(self, task_allocation, sched, task_progress_control, runner, params):
20132038
"""
20142039
Creates a generator that will yield individual task invocations for the provided schedule.
20152040
2016-
:param task_name: The name of the task for which the schedule is generated.
2041+
:param task_allocation: The task allocation for which the schedule is generated.
20172042
:param sched: The scheduler for this task.
20182043
:param task_progress_control: Controls how and how often this generator will loop.
20192044
:param runner: The runner for a given operation.
20202045
:param params: The parameter source for a given operation.
20212046
:return: A generator for the corresponding parameters.
20222047
"""
2023-
self.task_name = task_name
2048+
self.task_allocation = task_allocation
20242049
self.sched = sched
20252050
self.task_progress_control = task_progress_control
20262051
self.runner = runner
20272052
self.params = params
20282053
# TODO: Can we offload the parameter source execution to a different thread / process? Is this too heavy-weight?
2029-
#from concurrent.futures import ThreadPoolExecutor
2030-
#import asyncio
2031-
#self.io_pool_exc = ThreadPoolExecutor(max_workers=1)
2032-
#self.loop = asyncio.get_event_loop()
2054+
# from concurrent.futures import ThreadPoolExecutor
2055+
# import asyncio
2056+
# self.io_pool_exc = ThreadPoolExecutor(max_workers=1)
2057+
# self.loop = asyncio.get_event_loop()
2058+
@property
2059+
def ramp_up_wait_time(self):
2060+
"""
2061+
:return: the number of seconds to wait until this client should start so load can gradually ramp up.
2062+
"""
2063+
ramp_up_time_period = self.task_allocation.task.ramp_up_time_period
2064+
if ramp_up_time_period:
2065+
return ramp_up_time_period * (self.task_allocation.global_client_index / self.task_allocation.total_clients)
2066+
else:
2067+
return 0
2068+
2069+
def start(self):
2070+
self.task_progress_control.start()
20332071

20342072
def before_request(self, now):
20352073
self.sched.before_request(now)
@@ -2041,20 +2079,18 @@ async def __call__(self):
20412079
next_scheduled = 0
20422080
if self.task_progress_control.infinite:
20432081
param_source_knows_progress = hasattr(self.params, "percent_completed")
2044-
self.task_progress_control.start()
20452082
while True:
20462083
try:
20472084
next_scheduled = self.sched.next(next_scheduled)
20482085
# does not contribute at all to completion. Hence, we cannot define completion.
20492086
percent_completed = self.params.percent_completed if param_source_knows_progress else None
2050-
#current_params = await self.loop.run_in_executor(self.io_pool_exc, self.params.params)
2087+
# current_params = await self.loop.run_in_executor(self.io_pool_exc, self.params.params)
20512088
yield (next_scheduled, self.task_progress_control.sample_type, percent_completed, self.runner,
20522089
self.params.params())
20532090
self.task_progress_control.next()
20542091
except StopIteration:
20552092
return
20562093
else:
2057-
self.task_progress_control.start()
20582094
while not self.task_progress_control.completed:
20592095
try:
20602096
next_scheduled = self.sched.next(next_scheduled)

osbenchmark/workload/loader.py

+31-4
Original file line numberDiff line numberDiff line change
@@ -1765,14 +1765,24 @@ def parse_parallel(self, ops_spec, ops, test_procedure_name):
17651765
default_iterations = self._r(ops_spec, "iterations", error_ctx="parallel", mandatory=False)
17661766
default_warmup_time_period = self._r(ops_spec, "warmup-time-period", error_ctx="parallel", mandatory=False)
17671767
default_time_period = self._r(ops_spec, "time-period", error_ctx="parallel", mandatory=False)
1768+
default_ramp_up_time_period = self._r(ops_spec, "ramp-up-time-period", error_ctx="parallel", mandatory=False)
17681769
clients = self._r(ops_spec, "clients", error_ctx="parallel", mandatory=False)
17691770
completed_by = self._r(ops_spec, "completed-by", error_ctx="parallel", mandatory=False)
17701771

17711772
# now descend to each operation
17721773
tasks = []
17731774
for task in self._r(ops_spec, "tasks", error_ctx="parallel"):
17741775
tasks.append(self.parse_task(task, ops, test_procedure_name, default_warmup_iterations, default_iterations,
1775-
default_warmup_time_period, default_time_period, completed_by))
1776+
default_warmup_time_period, default_time_period, default_ramp_up_time_period, completed_by))
1777+
1778+
for task in tasks:
1779+
if task.ramp_up_time_period != default_ramp_up_time_period:
1780+
if default_ramp_up_time_period is None:
1781+
self._error(f"task '{task.name}' in 'parallel' element of test-procedure '{test_procedure_name}' specifies "
1782+
f"a ramp-up-time-period but it is only allowed on the 'parallel' element.")
1783+
else:
1784+
self._error(f"task '{task.name}' specifies a different ramp-up-time-period than its enclosing "
1785+
f"'parallel' element in test-procedure '{test_procedure_name}'.")
17761786
if completed_by:
17771787
completion_task = None
17781788
for task in tasks:
@@ -1788,7 +1798,8 @@ def parse_parallel(self, ops_spec, ops, test_procedure_name):
17881798
return workload.Parallel(tasks, clients)
17891799

17901800
def parse_task(self, task_spec, ops, test_procedure_name, default_warmup_iterations=None, default_iterations=None,
1791-
default_warmup_time_period=None, default_time_period=None, completed_by_name=None):
1801+
default_warmup_time_period=None, default_time_period=None, default_ramp_up_time_period=None,
1802+
completed_by_name=None):
17921803

17931804
op_spec = task_spec["operation"]
17941805
if isinstance(op_spec, str) and op_spec in ops:
@@ -1811,6 +1822,8 @@ def parse_task(self, task_spec, ops, test_procedure_name, default_warmup_iterati
18111822
default_value=default_warmup_time_period),
18121823
time_period=self._r(task_spec, "time-period", error_ctx=op.name, mandatory=False,
18131824
default_value=default_time_period),
1825+
ramp_up_time_period=self._r(task_spec, "ramp-up-time-period", error_ctx=op.name,
1826+
mandatory=False, default_value=default_ramp_up_time_period),
18141827
clients=self._r(task_spec, "clients", error_ctx=op.name, mandatory=False, default_value=1),
18151828
completes_parent=(task_name == completed_by_name),
18161829
schedule=schedule,
@@ -1819,11 +1832,25 @@ def parse_task(self, task_spec, ops, test_procedure_name, default_warmup_iterati
18191832
if task.warmup_iterations is not None and task.time_period is not None:
18201833
self._error(
18211834
"Operation '%s' in test_procedure '%s' defines '%d' warmup iterations and a time period of '%d' seconds. Please do not "
1822-
"mix time periods and iterations." % (op.name, test_procedure_name, task.warmup_iterations, task.time_period))
1835+
"mix time periods and iterations." % (op.name, test_procedure_name, task.warmup_iterations, task.time_period))
18231836
elif task.warmup_time_period is not None and task.iterations is not None:
18241837
self._error(
18251838
"Operation '%s' in test_procedure '%s' defines a warmup time period of '%d' seconds and '%d' iterations. Please do not "
1826-
"mix time periods and iterations." % (op.name, test_procedure_name, task.warmup_time_period, task.iterations))
1839+
"mix time periods and iterations." % (op.name, test_procedure_name, task.warmup_time_period, task.iterations))
1840+
1841+
if (task.warmup_iterations is not None or task.iterations is not None) and task.ramp_up_time_period is not None:
1842+
self._error(f"Operation '{op.name}' in test_procedure '{test_procedure_name}' defines a ramp-up time period of "
1843+
f"{task.ramp_up_time_period} seconds as well as {task.warmup_iterations} warmup iterations and "
1844+
f"{task.iterations} iterations but mixing time periods and iterations is not allowed.")
1845+
1846+
if task.ramp_up_time_period is not None:
1847+
if task.warmup_time_period is None:
1848+
self._error(f"Operation '{op.name}' in test_procedure '{test_procedure_name}' defines a ramp-up time period of "
1849+
f"{task.ramp_up_time_period} seconds but no warmup-time-period.")
1850+
elif task.warmup_time_period < task.ramp_up_time_period:
1851+
self._error(f"The warmup-time-period of operation '{op.name}' in test_procedure '{test_procedure_name}' is "
1852+
f"{task.warmup_time_period} seconds but must be greater than or equal to the "
1853+
f"ramp-up-time-period of {task.ramp_up_time_period} seconds.")
18271854

18281855
return task
18291856

0 commit comments

Comments
 (0)