-
Notifications
You must be signed in to change notification settings - Fork 26
Integration #54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Integration #54
Changes from all commits
1720324
954cc80
4b55c2d
09058d3
a912b40
e472cde
3b3530b
39a15c9
0fcb5bb
89ec0a8
6d7cf33
0d0ce41
a2305b1
65f3649
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ | |
from tensorflow.python.ops import array_ops | ||
from tensorflow.python.util import tf_contextlib | ||
|
||
from autodist.cluster import Cluster, SSHCluster | ||
from autodist.cluster import Cluster, SSHCluster, ADAPTDLCluster | ||
from autodist.const import ENV | ||
from autodist.coordinator import Coordinator | ||
from autodist.graph_item import GraphItem | ||
|
@@ -39,7 +39,8 @@ | |
|
||
IS_AUTODIST_WORKER = bool(ENV.AUTODIST_WORKER.val) | ||
IS_AUTODIST_CHIEF = not IS_AUTODIST_WORKER | ||
|
||
IS_ADAPTDL = bool(ENV.ADAPTDL.val) | ||
logging.info(f"is chief: {IS_AUTODIST_CHIEF}, is from adaptdl: {IS_ADAPTDL}") | ||
_DEFAULT_AUTODIST = {} | ||
|
||
|
||
|
@@ -74,7 +75,10 @@ def __init__(self, resource_spec_file, strategy_builder=None): | |
self._remapper = None | ||
self._built = None # Ref to the built GraphDef | ||
|
||
self._cluster: Cluster = SSHCluster(self._resource_spec) # which can be also defined with strategy | ||
if IS_ADAPTDL: | ||
self._cluster: Cluster = ADAPTDLCluster(self._resource_spec) | ||
else: | ||
self._cluster: Cluster = SSHCluster(self._resource_spec) # which can be also defined with strategy | ||
self._coordinator: Coordinator | ||
|
||
@tf_contextlib.contextmanager | ||
|
@@ -97,12 +101,18 @@ def build_strategy(self): | |
""" | ||
return self._strategy_builder.build(self._original_graph_item, self._resource_spec) | ||
|
||
def _build_or_load_strategy(self): | ||
def _build_or_load_strategy(self, load=False): | ||
self._original_graph_item.prepare() | ||
if IS_AUTODIST_CHIEF: | ||
s = self.build_strategy() | ||
s.serialize() | ||
else: | ||
# At AdaptDL mode, when the worker pass through this before | ||
# the chief has created the strategy, this should returns | ||
# nothing. Later, when the chief has created the strategy, | ||
# it can load it. | ||
if IS_ADAPTDL and not load: | ||
DachengLi1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return None | ||
strategy_id = ENV.AUTODIST_STRATEGY_ID.val | ||
assert strategy_id | ||
s = base.Strategy.deserialize(strategy_id) | ||
|
@@ -119,12 +129,22 @@ def _compile_strategy(self, strategy): | |
|
||
def _setup(self, strategy): | ||
"""Prepare for the execution.""" | ||
if IS_AUTODIST_CHIEF: | ||
# we should only have one single coordinator for one single AutoDist() instance scope, | ||
# even though we could have multiple strategies. | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start() | ||
self._coordinator.launch_clients() | ||
if not IS_ADAPTDL: | ||
if IS_AUTODIST_CHIEF: | ||
# we should only have one single coordinator for one single AutoDist() instance scope, | ||
# even though we could have multiple strategies. | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start() | ||
self._coordinator.launch_clients() | ||
else: | ||
if IS_AUTODIST_CHIEF: | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be better if we create different Coordinator classes based on the cluster mode? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good suggestion. I tried a similar format as you suggested, but I think the current version is more readable in autodist.py, though more lengthy. It's easy to maintain this way, since autodist.py is the first file to look at. |
||
self._cluster.start_chief() | ||
self._coordinator.launch_clients_chief() | ||
else: | ||
self._coordinator = Coordinator(strategy=strategy, cluster=self._cluster) | ||
self._cluster.start_worker() | ||
self._coordinator.launch_clients_worker() | ||
logging.info('Current PID {} belongs to address {}'.format(os.getpid(), self._cluster.get_local_address())) | ||
|
||
|
||
|
@@ -139,6 +159,8 @@ def _initialize_graph(self): | |
def _build(self): | ||
strategy = self._build_or_load_strategy() | ||
self._setup(strategy) # Put it before transforming to allow multiple works to transform concurrently | ||
if IS_ADAPTDL and not IS_AUTODIST_CHIEF: | ||
strategy = self._build_or_load_strategy(load=True) | ||
compiled_strategy = self._compile_strategy(strategy) | ||
graph_transformer = GraphTransformer( | ||
compiled_strategy=compiled_strategy, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still not quite sure about the purpose of `load` and what this comment means. In L162–L163, `load` is always true when `IS_ADAPTDL` is true. Could you explain more?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is kind of subtle. Previously, the AutoDist chief ran first and generated the strategy; it would spawn worker instances after it built the strategy, set up the cluster, etc. Now every instance runs through _build, and thus calls _build_or_load_strategy. The first time, the worker gets None from this function; the second time, the worker gets the strategy from the chief. This is because Kubernetes launches instances in parallel. The second time the worker calls the load, it is guaranteed that the chief has already generated the strategy, because there are several collective calls in between, which are blocking.