#!/usr/bin/python
# Source: mem_aware_dl_comm_schedule/bytetask.py at master (GitHub: Yeosangho/mem_aware_dl_comm_schedule)

from __future__ import absolute_import
from abc import ABCMeta, abstractmethod
from six import with_metaclass
from threading import Thread, Lock, Condition
import logging


class ByteTask(with_metaclass(ABCMeta)):
    """A unified communication operation (e.g., push, pull, allreduce) abstraction for ALL ML frameworks and
    communication methods.

    Subclasses must implement ``_prepare``, ``_do``, ``_tensor_size`` and ``_partition_tensor``. The remaining
    underscore-prefixed hooks are optional and have logging-only or no-op defaults.

    A task can be split once (see ``partition``) into children tasks. A partitioned parent delegates ``prepare``,
    ``do`` and ``wait_until_finish`` to its children, and it is marked finished when every child has reported in
    through ``partition_done``.
    """

    def __init__(self, name, tensor, op,
                 priority=0, comm=None, parent=None, max_partition=1000,
                 add_notify_finish_trigger=False, immediate=False, step=0, partition_index=-1, rank=0,
                 logger=None, **kwargs):
        """ByteTask initialization.

        Args:
            name: The name of task, typically using the key index.
            tensor: The tensors of the communication task.
            op: The type of the communication, e.g., allreduce, push, pull.
            priority: The priority of the task. Small number indicates higher priority.
            comm: The communication object, e.g., kvstore.
            parent: The parent ByteTask object, if have.
            max_partition: The max number of children tasks.
            add_notify_finish_trigger: True if the ML engine runs the task in a blocking way.
            immediate: True if it is an immediate task, i.e., no priority scheduling is required.
            step: Which training step the task belongs to.
            partition_index: If the task is a child task, it has a unique index.
            rank: The rank of worker.
            logger: The logging object.
            kwargs: Other necessary arguments.
        """
        self.name = str(name)
        self.op = op
        if parent is None:
            # A top-level priority is scaled by max_partition; partition() then gives child i the
            # priority (parent priority + i), so children stay ordered inside the parent's slot.
            self.priority = int(priority) * max_partition
        else:
            # Children receive an already-scaled priority from partition().
            self.priority = int(priority)
        self.parent = parent
        # The extra keyword arguments are kept under both names; partition() forwards self.kwargs to children.
        self.kwargs = kwargs
        self._kwargs = kwargs
        self.children = None
        self.partition_count = 1
        self.done_count = 0
        # Guards done_count, which is updated from partition_done().
        self._done_count_lock = Lock()

        # Can be a single tensor or a list of tensors
        self._tensor = tensor
        self._comm = comm

        # Task states: posted -> ready -> started -> finished
        self._ready = False
        self._started = False

        # wait_until_finish() blocks on this condition until notify_finish() has set _finished.
        self._finished = False
        self._lock_before_finish = Condition(Lock())

        # Wait_until_finish() thread
        self._thread = None

        # Callback for Core calling prepare()
        self._prepare_callback = None
        self._prepare_callback_context = None

        # Callback for Core calling do()
        self._do_callback = None
        self._do_callback_context = None

        self._add_notify_finish_trigger = add_notify_finish_trigger
        self._immediate = immediate
        self._partition_index = partition_index
        self._step = step
        self._rank = rank

        # Logger
        if logger is None:
            self._logger = logging.getLogger("ByteScheduler")
        else:
            self._logger = logger

        # Human-readable identity used as the prefix of every log and assertion message.
        self.desc = "rank {} step {} task {} {}".format(self._rank, self._step, self.op, self.name)
        self._additional_init()

        self._logger.debug(
            "{} was created".format(self.desc))
        return

    def prepare(self, callback=None, callback_context=None):
        """Prepare a task with dependencies, callback and callback_context are for Core.

        A partitioned task forwards the call to each child; otherwise the subclass hook ``_prepare()`` runs.

        Args:
            callback: a function that runs once the task is ready.
            callback_context: context of callback function, typically Core.
        """
        self._prepare_callback = callback
        self._prepare_callback_context = callback_context
        if self.children is not None:
            for child in self.children:
                child.prepare(callback, callback_context)
        else:
            self._prepare()

        self._logger.debug(
            "{} prepare() started".format(self.desc))
        return

    def do(self, callback=None, callback_context=None):
        """Run a task, callback and callback_context are for Core.

        Core calls this interface to let engines and the underlying communication stacks send the tensor.
        A partitioned task starts every child that has not been started yet; otherwise the subclass hook
        ``_do()`` runs and, if requested at construction, a background finish trigger is installed.

        Args:
            callback: a function that runs once the task is finished.
            callback_context: context of callback function, typically Core.

        """
        self._do_callback = callback
        self._do_callback_context = callback_context

        if self.children is not None:
            for child in self.children:
                if not child.is_started():
                    child.do(callback, callback_context)
        else:
            self._do()
            if self._add_notify_finish_trigger:
                self.add_notify_finish_trigger()

        self._started = True
        if self.parent is not None:
            # Starting any child marks the parent as started as well.
            self.parent._started = True

        self._logger.debug(
            "{} do() started".format(self.desc))
        return

    def add_notify_finish_trigger(self):
        """In the case that underlying engine only provides blocking wait but does not provide a callback.

        Starts a thread that runs the blocking ``_wait_until_finish()`` hook and then calls ``notify_finish()``.
        """
        def background(task):
            task._wait_until_finish()
            task.notify_finish()

        self._thread = Thread(target=background, args=[self])
        self._thread.start()

    def wait_until_finish(self):
        """Block until the task is finished.

        Returns immediately if the task has already finished. A partitioned task waits for each unfinished child
        in turn.
        """
        if self.children is not None:
            for child in self.children:
                if not child.is_finished():
                    child.wait_until_finish()
        else:
            with self._lock_before_finish:
                # Re-check the flag around every wait: notify_finish() may already have run before we got
                # here (a bare wait() would then block forever), and wait() may also wake spuriously.
                while not self._finished:
                    self._lock_before_finish.wait()
        return

    def immediate_do(self, callback=None, callback_context=None):
        """Called without priority scheduling.

        Args:
            callback: a function that runs once the task is finished.
            callback_context: context of callback function, typically Core.

        Returns:
            Whatever the ``_immediate_do()`` hook returns.
        """
        self._do_callback = callback
        self._do_callback_context = callback_context
        return self._immediate_do()

    def is_immediate(self):
        """Check if the task is an immediate task, return True if the task does not need priority scheduling.
        Returns:
            A boolean value indicating whether it is an immediate task.
        """
        return self._immediate

    def is_ready(self):
        """Check if the task is ready for scheduling, i.e., the tensors of the task are ready for communication.
        Returns:
            A boolean value indicating whether the task is ready for scheduling.
        """
        return self._ready

    def is_started(self):
        """Check if a task is already started.
        Returns:
            A boolean value indicating whether the task is started.
        """
        return self._started

    def is_finished(self):
        """Check if a task is finished.
        Returns:
            A boolean value indicating whether the task is finished.
        """
        return self._finished

    def notify_ready(self):
        """Notify Core that the task is ready for scheduling.

        Most ML frameworks are asynchronous -- when a communication operation is posted to the engine, possibly the
        tensor has not been computed or ready to be sent. So the ML engine needs to notify Core about a tensor being
        ready, so that Core can actually begin scheduling it.
        """
        self._ready = True
        self._logger.debug(
            "{} is ready".format(self.desc))
        if self._prepare_callback is not None:
            self._prepare_callback(self, self._prepare_callback_context)
        return

    def notify_finish(self):
        """Notify Core that the task is already finished.

        Once the communication of a tensor (all-reduce, push or pull) has been finished, the framework engine must
        notify Core about this, so that Core can continue scheduling more tasks.

        A child reports to its parent through ``partition_done()``; a task without a parent calls the
        ``_notify_upper_layer_finish()`` hook instead. The ``do()`` callback then runs, and only after it returns
        is the task marked finished and every thread blocked in ``wait_until_finish()`` released.

        Returns:
            True for a task without a parent; for a child, whether this was the last outstanding partition of
            its parent.
        """
        if self.parent is not None:
            parent_finish = self.parent.partition_done()
        else:
            parent_finish = True
            self._notify_upper_layer_finish()

        if self._do_callback is not None:
            self._do_callback(self, self._do_callback_context)

        with self._lock_before_finish:
            # Publish the flag under the condition's lock and wake every waiter, not just one.
            self._finished = True
            self._lock_before_finish.notify_all()

        self._logger.debug(
            "{} has finished".format(self.desc))
        return parent_finish

    def partition_done(self):
        """Called on a parent task by a child's notify_finish() when that partition is finished.

        When the last partition reports in, the parent's own ``notify_finish()`` is invoked (while the counter
        lock is still held).

        Returns:
            A boolean value indicating whether all task partitions are finished.
        """
        assert self.children is not None, "{} unexpected partition_done()".format(self.desc)
        # "with" guarantees the lock is released even if notify_finish() or a callback raises.
        with self._done_count_lock:
            self.done_count += 1
            self._logger.debug(
                "{} done_count = {}".format(self.desc, self.done_count))
            finish = self.done_count == self.partition_count
            if finish:
                self.notify_finish()
        return finish

    def tensor_size(self):
        """Get the tensor size, i.e., the number of parameters of one tensor of the task
        Returns:
            An integer scalar with the size of a tensor of the task.
        """
        return self._tensor_size()

    def partition(self, size=None):
        """Partition a task into small ones.

        Core calls this interface to partition a task and its tensor into one or multiple tasks with tensors no larger
        than size. All popular frameworks provide zero-copy APIs for partitioning a tensor.

        Args:
            size: an integer indicating the max size of each partition (handled by ``_partition_tensor``) or a
            tuple/list of integers indicating the size of each partition (handled by ``_partition_tensor_v2``).

        Returns:
            A list of partitioned children tasks

        Raises:
            AssertionError: if the task was already partitioned, is itself a partition, or size is not an
                int, tuple or list.
            NotImplementedError: if the partitioning hook returned None (e.g. the default
                ``_partition_tensor_v2``).
        """
        # A task can only be partitioned once
        assert self.children is None, "{} has been partitioned before".format(self.desc)

        # For now, we only support one layer of partitioning
        assert self.parent is None, "{} is already a partition".format(self.desc)

        # Check size not None
        assert size is not None and isinstance(size, (int, tuple, list)), \
            "{} got unclear partition request".format(self.desc)

        # Partition tasks evenly or based on a sequence of explicit sizes
        if isinstance(size, (list, tuple)):
            tensor_partitions = self._partition_tensor_v2(size)
        else:
            tensor_partitions = self._partition_tensor(size)

        # Fail with a clear message, and before touching self.children, when the hook produced nothing.
        if tensor_partitions is None:
            raise NotImplementedError(
                "{} partitioning hook returned None for size {!r}".format(self.desc, size))

        self.children = []
        for index, tensor_partition in enumerate(tensor_partitions):
            self.children.append(
                self.__class__(
                    self.name + "_" + str(index),
                    tensor_partition,
                    self.op,
                    priority=(self.priority + index),
                    comm=self._comm,
                    parent=self,
                    add_notify_finish_trigger=self._add_notify_finish_trigger,
                    immediate=self._immediate,
                    step=self._step,
                    partition_index=index,
                    rank=self._rank,
                    logger=self._logger,
                    **self.kwargs
                )
            )

        self.partition_count = len(tensor_partitions)
        self._logger.debug(
            "{} is partitioned into {}".format(self.desc, len(tensor_partitions)))
        return self.children

    @abstractmethod
    def _prepare(self):
        """Must be overridden -- required by Core.

        If a task is partitioned, only the children will call this function. The implementation depends on the underlying
        engine scheduler.

        No return value, non-blocking.
        """
        pass

    @abstractmethod
    def _do(self):
        """Must be overridden -- required by Core.

        If a task is partitioned, only the children will call this. This function depends on the underlying
        communication method.

        No return value, non-blocking.
        """
        pass

    @abstractmethod
    def _tensor_size(self):
        """Must be overridden -- required by Core.

        This function depends on the data structure of self._tensor.

        Must return an integer, i.e., the size of self._tensor.
        """
        pass

    @abstractmethod
    def _partition_tensor(self, size):
        """Must be overridden -- required by Core.

        If a task is partitioned, only the parent will call this. This function depends on the data structure of
        self._tensor.

        Must return a list of (partitioned) tensors.
        """
        pass

    def _partition_tensor_v2(self, size):
        """Optional -- required by allreduce.

        If a task is partitioned, only the parent will call this. This function depends on the data structure of
        self._tensor.

        Must return a list of (partitioned) tensors. This default returns None, which partition() reports as
        NotImplementedError.
        """
        return

    def _additional_init(self):
        """Optional -- may be required by framework.

        If we have any framework-specific work to do during __init__. This function will be called at the end of __init__.

        No return value, blocking.
        """
        self._logger.debug(
            "{} _additional_init is not implemented".format(self.desc))
        return

    def _wait_until_finish(self):
        """Optional -- may be required by framework.

        If a task is partitioned, only the children will call this. This function depends on the underlying
        communication method.

        No return value, blocking.
        """
        self._logger.debug(
            "{} _wait_until_finish is not implemented".format(self.desc))
        return

    def _notify_upper_layer_finish(self):
        """Optional -- may be required by framework.

        If a task is partitioned, only the parent will call this. This function depends on the upper layer framework.

        No return value.
        """
        self._logger.debug(
            "{} _notify_upper_layer_finish is not implemented".format(self.desc))
        return

    def _immediate_do(self):
        """Optional -- may benefit certain framework.

        If a task is partitioned, only the children will call this. This function lets Core bypass all scheduling for
        this task.

        No return value, non-blocking.
        """
        pass