-
Notifications
You must be signed in to change notification settings - Fork 26
Integration #54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Integration #54
Changes from all commits
1720324
954cc80
4b55c2d
09058d3
a912b40
e472cde
3b3530b
39a15c9
0fcb5bb
89ec0a8
6d7cf33
0d0ce41
a2305b1
65f3649
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ | |
from tensorflow.python.ops import array_ops | ||
from tensorflow.python.util import tf_contextlib | ||
|
||
from autodist.cluster import Cluster, SSHCluster | ||
from autodist.cluster import Cluster, SSHCluster, ADAPTDLCluster | ||
from autodist.const import ENV | ||
from autodist.coordinator import Coordinator | ||
from autodist.graph_item import GraphItem | ||
|
@@ -39,7 +39,8 @@ | |
|
||
IS_AUTODIST_WORKER = bool(ENV.AUTODIST_WORKER.val) | ||
IS_AUTODIST_CHIEF = not IS_AUTODIST_WORKER | ||
|
||
IS_ADAPTDL = bool(ENV.ADAPTDL.val) | ||
logging.info(f"is chief: {IS_AUTODIST_CHIEF}, is from adaptdl: {IS_ADAPTDL}") | ||
_DEFAULT_AUTODIST = {} | ||
|
||
|
||
|
@@ -74,7 +75,10 @@ def __init__(self, resource_spec_file, strategy_builder=None): | |
self._remapper = None | ||
self._built = None # Ref to the built GraphDef | ||
|
||
self._cluster: Cluster = SSHCluster(self._resource_spec) # which can be also defined with strategy | ||
if IS_ADAPTDL: | ||
self._cluster: Cluster = ADAPTDLCluster(self._resource_spec) | ||
else: | ||
self._cluster: Cluster = SSHCluster(self._resource_spec) # which can be also defined with strategy | ||
self._coordinator: Coordinator | ||
|
||
@tf_contextlib.contextmanager | ||
|
@@ -97,12 +101,18 @@ def build_strategy(self): | |
""" | ||
return self._strategy_builder.build(self._original_graph_item, self._resource_spec) | ||
|
||
def _build_or_load_strategy(self): | ||
def _build_or_load_strategy(self, load=False): | ||
self._original_graph_item.prepare() | ||
if IS_AUTODIST_CHIEF: | ||
s = self.build_strategy() | ||
s.serialize() | ||
else: | ||
# At AdaptDL mode, when the worker pass through this before | ||
# the chief has created the strategy, this should returns | ||
# nothing. Later, when the chief has created the strategy, | ||
# it can load it. | ||
if IS_ADAPTDL and not load: | ||
DachengLi1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return None | ||
strategy_id = ENV.AUTODIST_STRATEGY_ID.val | ||
assert strategy_id | ||
s = base.Strategy.deserialize(strategy_id) | ||
|
@@ -119,12 +129,22 @@ def _compile_strategy(self, strategy): | |
|
||
def _setup(self, strategy): | ||
"""Prepare for the execution.""" | ||
if IS_AUTODIST_CHIEF: | ||
# we should only have one single coordinator for one single AutoDist() instance scope, | ||
# even though we could have multiple strategies. | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start() | ||
self._coordinator.launch_clients() | ||
if not IS_ADAPTDL: | ||
if IS_AUTODIST_CHIEF: | ||
# we should only have one single coordinator for one single AutoDist() instance scope, | ||
# even though we could have multiple strategies. | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start() | ||
self._coordinator.launch_clients() | ||
else: | ||
if IS_AUTODIST_CHIEF: | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be better if we create different Coordinator classes based on the cluster mode? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good suggestion. I tried a similar format as you suggested, but I think the current version is more readable in autodist.py, though more lengthy. It's easy to maintain this way, since autodist.py is the first file to look at. |
||
self._cluster.start_chief() | ||
self._coordinator.launch_clients_chief() | ||
else: | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start_worker() | ||
self._coordinator.launch_clients_worker() | ||
logging.info('Current PID {} belongs to address {}'.format(os.getpid(), self._cluster.get_local_address())) | ||
|
||
|
||
|
@@ -139,6 +159,8 @@ def _initialize_graph(self): | |
def _build(self): | ||
strategy = self._build_or_load_strategy() | ||
self._setup(strategy) # Put it before transforming to allow multiple works to transform concurrently | ||
if IS_ADAPTDL and not IS_AUTODIST_CHIEF: | ||
strategy = self._build_or_load_strategy(load=True) | ||
compiled_strategy = self._compile_strategy(strategy) | ||
graph_transformer = GraphTransformer( | ||
compiled_strategy=compiled_strategy, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still not quite sure about the purpose of `load` and what this comment means. In L162–L163, `load` is always true when `IS_ADAPTDL` is true. Could you explain more?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is kind of subtle. Previously, the AutoDist chief ran first and generated the strategy; it would spawn worker instances after it built the strategy, set up the cluster, etc. Now every instance runs through _build, and thus calls _build_or_load_strategy. The first time, the worker gets None from this function; the second time, the worker gets the strategy from the chief. This is because Kubernetes launches instances in parallel. The second time the worker calls the load, it is guaranteed that the chief has already generated the strategy, because there are several collective calls in between, which are blocking.