-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathscheduler.py
More file actions
262 lines (225 loc) · 11.3 KB
/
scheduler.py
File metadata and controls
262 lines (225 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import asyncio
import functools
import logging
import zmq.asyncio
from scaler.config.defaults import CLEANUP_INTERVAL_SECONDS, STATUS_REPORT_INTERVAL_SECONDS
from scaler.config.section.scheduler import SchedulerConfig
from scaler.config.types.zmq import ZMQConfig, ZMQType
from scaler.io.async_connector import ZMQAsyncConnector
from scaler.io.mixins import AsyncBinder, AsyncConnector, AsyncObjectStorageConnector
from scaler.io.utility import create_async_binder, create_async_object_storage_connector
from scaler.io.ymq.ymq import YMQException
from scaler.protocol.python.common import ObjectStorageAddress
from scaler.protocol.python.message import (
ClientDisconnect,
ClientHeartbeat,
DisconnectRequest,
GraphTask,
InformationRequest,
ObjectInstruction,
Task,
TaskCancel,
TaskCancelConfirm,
TaskLog,
TaskResult,
WorkerDisconnectNotification,
WorkerHeartbeat,
)
from scaler.protocol.python.mixins import Message
from scaler.scheduler.controllers.balance_controller import VanillaBalanceController
from scaler.scheduler.controllers.client_controller import VanillaClientController
from scaler.scheduler.controllers.config_controller import VanillaConfigController
from scaler.scheduler.controllers.graph_controller import VanillaGraphTaskController
from scaler.scheduler.controllers.information_controller import VanillaInformationController
from scaler.scheduler.controllers.object_controller import VanillaObjectController
from scaler.scheduler.controllers.scaling_policies.utility import create_scaling_controller
from scaler.scheduler.controllers.task_controller import VanillaTaskController
from scaler.scheduler.controllers.worker_controller import VanillaWorkerController
from scaler.utility.event_loop import create_async_loop_routine
from scaler.utility.exceptions import ClientShutdownException, ObjectStorageException
from scaler.utility.identifiers import ClientID, WorkerID
class Scheduler:
    """Wire the scheduler's sockets and controllers together and route incoming messages.

    The constructor creates the scheduler binder, the object storage connector and the
    monitor publisher socket, instantiates every controller and registers them with each
    other. ``get_loops`` then runs all routines concurrently and releases the sockets once
    they stop.
    """

    def __init__(self, config: SchedulerConfig):
        """Create the sockets and controllers described by ``config``.

        Args:
            config: scheduler configuration. ``scheduler_address`` must be a TCP address.
                When ``object_storage_address`` is ``None`` the object storage address
                defaults to the scheduler host at ``scheduler port + 1``; when
                ``monitor_address`` is ``None`` the monitor address defaults to the
                scheduler host at ``scheduler port + 2``.

        Raises:
            TypeError: if ``config.scheduler_address`` is not of type ``ZMQType.tcp``.
        """
        self._config_controller = VanillaConfigController(config)

        if config.scheduler_address.type != ZMQType.tcp:
            # Implicit string concatenation keeps the message free of stray indentation
            # that a backslash continuation inside the literal would embed.
            raise TypeError(
                f"{self.__class__.__name__}: scheduler address must be tcp type: "
                f"{config.scheduler_address.to_address()}"
            )

        # Resolve the object storage address, falling back to scheduler port + 1.
        if config.object_storage_address is None:
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.scheduler_address.host, port=config.scheduler_address.port + 1
            )
        else:
            object_storage_address = ObjectStorageAddress.new_msg(
                host=config.object_storage_address.host, port=config.object_storage_address.port
            )
        self._config_controller.update_config("object_storage_address", object_storage_address)

        # Resolve the monitor address, falling back to scheduler port + 2.
        if config.monitor_address is None:
            monitor_address = ZMQConfig(
                type=ZMQType.tcp, host=config.scheduler_address.host, port=config.scheduler_address.port + 2
            )
        else:
            monitor_address = config.monitor_address
        self._config_controller.update_config("monitor_address", monitor_address)

        self._context = zmq.asyncio.Context(io_threads=config.worker_io_threads)

        self._binder: AsyncBinder = create_async_binder(
            self._context, name="scheduler", address=config.scheduler_address
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler address {config.scheduler_address}")

        # The connector is only created here; the actual connection is made later in
        # connect_to_storage().
        self._connector_storage: AsyncObjectStorageConnector = create_async_object_storage_connector()
        logging.info(f"{self.__class__.__name__}: connect to object storage server {object_storage_address!r}")

        self._binder_monitor: AsyncConnector = ZMQAsyncConnector(
            context=self._context,
            name="scheduler_monitor",
            socket_type=zmq.PUB,
            address=monitor_address,
            bind_or_connect="bind",
            callback=None,
            identity=None,
        )
        logging.info(f"{self.__class__.__name__}: listen to scheduler monitor address {monitor_address.to_address()}")

        # NOTE(review): presumably allocate_policy is an enum whose value is a policy
        # class that is instantiated here -- confirm against SchedulerConfig.
        self._task_allocate_policy = config.allocate_policy.value()

        self._client_manager = VanillaClientController(config_controller=self._config_controller)
        self._object_controller = VanillaObjectController(config_controller=self._config_controller)
        self._graph_controller = VanillaGraphTaskController(config_controller=self._config_controller)
        self._task_controller = VanillaTaskController(config_controller=self._config_controller)
        self._worker_controller = VanillaWorkerController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._balance_controller = VanillaBalanceController(
            config_controller=self._config_controller, task_allocate_policy=self._task_allocate_policy
        )
        self._information_controller = VanillaInformationController(config_controller=self._config_controller)
        self._scaling_controller = create_scaling_controller(
            config.scaling_controller_strategy, config.adapter_webhook_urls
        )

        # register
        self._binder.register(self.on_receive_message)
        self._client_manager.register(
            self._binder, self._binder_monitor, self._object_controller, self._task_controller, self._worker_controller
        )
        self._object_controller.register(
            self._binder, self._binder_monitor, self._connector_storage, self._client_manager, self._worker_controller
        )
        self._graph_controller.register(
            self._binder,
            self._binder_monitor,
            self._connector_storage,
            self._client_manager,
            self._task_controller,
            self._object_controller,
        )
        self._task_controller.register(
            self._binder,
            self._binder_monitor,
            self._client_manager,
            self._object_controller,
            self._worker_controller,
            self._graph_controller,
        )
        self._worker_controller.register(self._binder, self._binder_monitor, self._task_controller)
        self._balance_controller.register(self._binder, self._binder_monitor, self._task_controller)
        self._information_controller.register_managers(
            self._binder_monitor,
            self._binder,
            self._client_manager,
            self._object_controller,
            self._task_controller,
            self._worker_controller,
            self._scaling_controller,
        )

    async def connect_to_storage(self):
        """Connect the object storage connector to the address stored in the config controller."""
        object_storage_address = self._config_controller.get_config("object_storage_address")
        await self._connector_storage.connect(object_storage_address.host, object_storage_address.port)

    async def on_receive_message(self, source: bytes, message: Message):
        """Dispatch one message received on the scheduler binder to the responsible controller.

        Args:
            source: raw identity of the sender; wrapped in ``ClientID`` or ``WorkerID``
                where the receiving controller expects one.
            message: the decoded message. A message whose type matches none of the
                branches below is reported through ``logging.error``.
        """
        # =====================================================================================
        # client manager
        if isinstance(message, ClientHeartbeat):
            await self._client_manager.on_heartbeat(ClientID(source), message)
            return

        # scheduler receives client shutdown request from upstream
        if isinstance(message, ClientDisconnect):
            await self._client_manager.on_client_disconnect(ClientID(source), message)
            return

        # =====================================================================================
        # graph manager
        if isinstance(message, GraphTask):
            await self._graph_controller.on_graph_task(ClientID(source), message)
            return

        # =====================================================================================
        # task manager
        if isinstance(message, Task):
            await self._task_controller.on_task_new(message)
            return

        if isinstance(message, TaskCancel):
            # Cancels for sub-tasks of a graph go through the graph controller.
            if self._graph_controller.is_graph_subtask(message.task_id):
                await self._graph_controller.on_graph_task_cancel(message)
            else:
                await self._task_controller.on_task_cancel(ClientID(source), message)
            return

        if isinstance(message, TaskCancelConfirm):
            await self._task_controller.on_task_cancel_confirm(message)
            return

        if isinstance(message, TaskResult):
            await self._task_controller.on_task_result(message)
            return

        if isinstance(message, TaskLog):
            # Forward the log to the client that owns the task; drop it when no client is known.
            client = self._client_manager.get_client_id(message.task_id)
            if client is not None:
                await self._binder.send(client, message)
            return

        # =====================================================================================
        # worker manager
        if isinstance(message, WorkerHeartbeat):
            await self._worker_controller.on_heartbeat(WorkerID(source), message)
            return

        # scheduler receives worker disconnect request from downstream
        if isinstance(message, DisconnectRequest):
            await self._worker_controller.on_disconnect(WorkerID(source), message)
            return

        if isinstance(message, WorkerDisconnectNotification):
            await self._worker_controller.on_disconnect_notification(WorkerID(source), message)
            return

        # =====================================================================================
        # object manager
        if isinstance(message, ObjectInstruction):
            await self._object_controller.on_object_instruction(source, message)
            return

        # =====================================================================================
        # information manager
        if isinstance(message, InformationRequest):
            await self._information_controller.on_request(message)
            # Without this return a handled request would fall through and be
            # reported as an unknown message below.
            return

        logging.error(f"{self.__class__.__name__}: unknown message from {source=}: {message}")

    async def get_loops(self):
        """Connect to object storage, run every routine concurrently, then release the sockets.

        The binder, storage connector and graph controller routines are scheduled with
        an interval of 0, the balance controller with the configured
        ``load_balance_seconds``, the client/object/worker controllers with
        ``CLEANUP_INTERVAL_SECONDS`` and the information controller with
        ``STATUS_REPORT_INTERVAL_SECONDS``.

        ``asyncio.CancelledError``, ``ClientShutdownException``, ``YMQException`` and
        ``ObjectStorageException`` raised by the routines end the run without
        propagating; any other exception propagates after the sockets are destroyed.
        """
        await self.connect_to_storage()

        loops = [
            create_async_loop_routine(self._binder.routine, 0),
            create_async_loop_routine(self._connector_storage.routine, 0),
            create_async_loop_routine(self._graph_controller.routine, 0),
            create_async_loop_routine(
                self._balance_controller.routine, self._config_controller.get_config("load_balance_seconds")
            ),
            create_async_loop_routine(self._client_manager.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._object_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._worker_controller.routine, CLEANUP_INTERVAL_SECONDS),
            create_async_loop_routine(self._information_controller.routine, STATUS_REPORT_INTERVAL_SECONDS),
        ]

        try:
            await asyncio.gather(*loops)
        except asyncio.CancelledError:
            pass
        except ClientShutdownException as e:
            logging.info(f"{self.__class__.__name__}: {e}")
        except (YMQException, ObjectStorageException):
            # NOTE(review): these are swallowed silently, as in the original code;
            # presumably they signal a normal shutdown path -- confirm.
            pass
        finally:
            # Release the sockets even when an unexpected exception propagates.
            self._binder.destroy()
            self._binder_monitor.destroy()
            self._connector_storage.destroy()
@functools.wraps(Scheduler)
async def scheduler_main(*args, **kwargs):
    # Coroutine entry point: all positional and keyword arguments are forwarded
    # unchanged to the Scheduler constructor, then the scheduler's loops are
    # awaited until they finish.
    # functools.wraps copies Scheduler's metadata (name, docstring, ...) onto this
    # function, so a docstring written here would be overwritten.
    await Scheduler(*args, **kwargs).get_loops()