
Commit 40012de

hekaisheng authored and Xuye (Chris) Qin committed
Implement MarsDataset to integrate with PyTorch (#937)
1 parent 4a602b5 commit 40012de

19 files changed, +527 -64 lines changed

mars/api.py  +8 -9

@@ -95,11 +95,11 @@ def has_session(self, session_id):
         return self.session_manager.has_session(session_id)
 
     def submit_graph(self, session_id, serialized_graph, graph_key, target,
-                     compose=True, wait=True):
+                     names=None, compose=True, wait=True):
         session_uid = SessionActor.gen_uid(session_id)
         session_ref = self.get_actor_ref(session_uid)
         session_ref.submit_tileable_graph(
-            serialized_graph, graph_key, target, compose=compose, _tell=not wait)
+            serialized_graph, graph_key, target, names=names, compose=compose, _tell=not wait)
 
     def create_mutable_tensor(self, session_id, name, shape, dtype, *args, **kwargs):
         session_uid = SessionActor.gen_uid(session_id)
@@ -176,11 +176,11 @@ def wait_graph_finish(self, session_id, graph_key, timeout=None):
     def fetch_data(self, session_id, graph_key, tileable_key, index_obj=None, compressions=None):
         graph_uid = GraphActor.gen_uid(session_id, graph_key)
         graph_ref = self.get_actor_ref(graph_uid)
-        nsplits, chunk_indexes = graph_ref.get_tileable_meta(tileable_key)
-
+        nsplits, chunk_keys, chunk_indexes = graph_ref.get_tileable_metas([tileable_key])[0]
+        chunk_index_to_key = dict((index, key) for index, key in zip(chunk_indexes, chunk_keys))
         if not index_obj:
             chunk_results = dict((idx, self.fetch_chunk_data(session_id, k)) for
-                                 idx, k in chunk_indexes.items())
+                                 idx, k in zip(chunk_indexes, chunk_keys))
         else:
             chunk_results = dict()
             indexes = dict()
@@ -194,7 +194,7 @@ def fetch_data(self, session_id, graph_key, tileable_key, index_obj=None, compre
             # `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array
             # index, `arr[np.array(seq)]`, which will result either in an error or a different result.
             slice_obj = tuple(indexes[axis][chunk_idx] for axis, chunk_idx in enumerate(chunk_index))
-            chunk_key = chunk_indexes[chunk_index]
+            chunk_key = chunk_index_to_key[chunk_index]
             chunk_results[chunk_index] = self.fetch_chunk_data(session_id, chunk_key, slice_obj)
 
         chunk_results = [(idx, dataserializer.loads(f.result())) for
@@ -210,8 +210,7 @@ def fetch_chunk_data(self, session_id, chunk_key, index_obj=None):
         endpoints = self.chunk_meta_client.get_workers(session_id, chunk_key)
         sender_ref = self.actor_client.actor_ref(ResultSenderActor.default_uid(),
                                                  address=random.choice(endpoints))
-        future = sender_ref.fetch_data(session_id, chunk_key, index_obj, _wait=False)
-        return future
+        return sender_ref.fetch_data(session_id, chunk_key, index_obj, _wait=False)
 
     def delete_data(self, session_id, graph_key, tileable_key, wait=False):
         graph_uid = GraphActor.gen_uid(session_id, graph_key)
@@ -223,4 +222,4 @@ def get_tileable_nsplits(self, session_id, graph_key, tileable_key):
         graph_uid = GraphActor.gen_uid(session_id, graph_key)
         graph_ref = self.get_actor_ref(graph_uid)
 
-        return graph_ref.get_tileable_meta(tileable_key)[0]
+        return graph_ref.get_tileable_metas([tileable_key], filter_fields=['nsplits'])[0][0]
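The fetch path above now goes through `get_tileable_metas`, which returns one `(nsplits, chunk_keys, chunk_indexes)` tuple per requested tileable, whereas the old `get_tileable_meta` returned a mapping from chunk index to chunk key for a single tileable. A minimal sketch of consuming the new shape, assuming `graph_ref` and `tileable_key` as in `fetch_data` above:

```python
# Sketch only: `graph_ref` and `tileable_key` are assumed to exist, as in fetch_data() above.
nsplits, chunk_keys, chunk_indexes = graph_ref.get_tileable_metas([tileable_key])[0]

# nsplits describes the chunk layout per axis, e.g. ((5, 5), (10,)) for a 10x10
# tensor split into two chunks along axis 0; summing each axis gives the shape.
shape = tuple(sum(splits) for splits in nsplits)

# chunk_keys and chunk_indexes are parallel sequences; zipping them rebuilds the
# index -> key mapping that the old API returned directly.
chunk_index_to_key = dict(zip(chunk_indexes, chunk_keys))
```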

mars/context.py  +115 -16

@@ -14,7 +14,8 @@
 
 import sys
 import threading
-from collections import namedtuple
+import random
+from collections import namedtuple, defaultdict
 from enum import Enum
 from typing import List
 
@@ -70,11 +71,12 @@ def h(*args, **kwargs):
     # Meta relative
     # ---------------
 
-    def get_chunk_metas(self, chunk_keys):
+    def get_chunk_metas(self, chunk_keys, filter_fields=None):
         """
         Get chunk metas according to the given chunk keys.
 
         :param chunk_keys: chunk keys
+        :param filter_fields: filter the fields in meta
         :return: List of chunk metas
         """
         raise NotImplementedError
@@ -188,7 +190,9 @@ def get_local_address(self):
     def get_ncores(self):
         return self._ncores
 
-    def get_chunk_metas(self, chunk_keys):
+    def get_chunk_metas(self, chunk_keys, filter_fields=None):
+        if filter_fields is not None:  # pragma: no cover
+            raise NotImplementedError("Local context doesn't support filter fields now")
         metas = []
         for chunk_key in chunk_keys:
             chunk_data = self.get(chunk_key)
@@ -219,17 +223,29 @@ def get_chunk_results(self, chunk_keys: List[str]) -> List:
 
 
 class DistributedContext(ContextBase):
-    def __init__(self, cluster_info, session_id, addr, chunk_meta_client,
-                 resource_actor_ref, actor_ctx, **kw):
-        self._cluster_info = cluster_info
-        is_distributed = cluster_info.is_distributed()
+    def __init__(self, scheduler_address, session_id, actor_ctx=None, **kw):
+        from .worker.api import WorkerAPI
+        from .scheduler.api import MetaAPI
+        from .scheduler.resource import ResourceActor
+        from .scheduler.utils import SchedulerClusterInfoActor
+        from .actors import new_client
+
+        self._session_id = session_id
+        self._scheduler_address = scheduler_address
+        self._worker_api = WorkerAPI()
+        self._meta_api = MetaAPI(actor_ctx=actor_ctx, scheduler_endpoint=scheduler_address)
+
+        self._running_mode = None
+        self._actor_ctx = actor_ctx or new_client()
+        self._cluster_info = self._actor_ctx.actor_ref(
+            SchedulerClusterInfoActor.default_uid(), address=scheduler_address)
+        is_distributed = self._cluster_info.is_distributed()
         self._running_mode = RunningMode.local_cluster \
             if not is_distributed else RunningMode.distributed
-        self._session_id = session_id
-        self._address = addr
-        self._chunk_meta_client = chunk_meta_client
-        self._resource_actor_ref = resource_actor_ref
-        self._actor_ctx = actor_ctx
+        self._address = kw.pop('address', None)
+        self._resource_actor_ref = self._actor_ctx.actor_ref(
+            ResourceActor.default_uid(), address=scheduler_address)
+
         self._extra_info = kw
 
     @property
@@ -252,10 +268,6 @@ def get_local_address(self):
     def get_ncores(self):
         return self._extra_info.get('n_cpu')
 
-    def get_chunk_metas(self, chunk_keys):
-        return self._chunk_meta_client.batch_get_chunk_meta(
-            self._session_id, chunk_keys)
-
     def get_chunk_results(self, chunk_keys: List[str]) -> List:
         from .serialize import dataserializer
         from .worker.transfer import ResultSenderActor
@@ -269,6 +281,93 @@ def get_chunk_results(self, chunk_keys: List[str]) -> List:
             dataserializer.loads(sender_ref.fetch_data(self._session_id, chunk_key)))
         return results
 
+    # Meta API
+    def get_tileable_metas(self, tileable_keys, filter_fields: List[str]=None) -> List:
+        return self._meta_api.get_tileable_metas(self._session_id, tileable_keys, filter_fields)
+
+    def get_chunk_metas(self, chunk_keys, filter_fields: List[str] = None) -> List:
+        return self._meta_api.get_chunk_metas(self._session_id, chunk_keys, filter_fields)
+
+    def get_tileable_key_by_name(self, name: str):
+        return self._meta_api.get_tileable_key_by_name(self._session_id, name)
+
+    # Worker API
+    def get_chunks_data(self, worker: str, chunk_keys: List[str], indexes: List=None,
+                        compression_types: List[str]=None):
+        return self._worker_api.get_chunks_data(self._session_id, worker, chunk_keys, indexes=indexes,
+                                                compression_types=compression_types)
+
+    # Fetch tileable data by tileable keys and indexes.
+    def get_tileable_data(self, tileable_key: str, indexes: List=None,
+                          compression_types: List[str]=None):
+        from .serialize import dataserializer
+        from .utils import merge_chunks
+        from .tensor.core import TENSOR_TYPE
+        from .tensor.datasource import empty
+        from .tensor.indexing.getitem import TensorIndexTilesHandler
+
+        nsplits, chunk_keys, chunk_indexes = self.get_tileable_metas([tileable_key])[0]
+        chunk_idx_to_keys = dict(zip(chunk_indexes, chunk_keys))
+        chunk_keys_to_idx = dict(zip(chunk_keys, chunk_indexes))
+        endpoints = self.get_chunk_metas(chunk_keys, filter_fields=['workers'])
+        chunk_keys_to_worker = dict((chunk_key, random.choice(es[0])) for es, chunk_key in zip(endpoints, chunk_keys))
+
+        chunk_workers = defaultdict(list)
+        [chunk_workers[e].append(chunk_key) for chunk_key, e in chunk_keys_to_worker.items()]
+
+        chunk_results = dict()
+        if not indexes:
+            datas = []
+            for endpoint, chunks in chunk_workers.items():
+                datas.append(self.get_chunks_data(endpoint, chunks, compression_types=compression_types))
+            datas = [d.result() for d in datas]
+            for (endpoint, chunks), d in zip(chunk_workers.items(), datas):
+                d = [dataserializer.loads(db) for db in d]
+                chunk_results.update(dict(zip([chunk_keys_to_idx[k] for k in chunks], d)))
+        else:
+            # TODO: make a common util to handle indexes
+            if any(isinstance(ind, TENSOR_TYPE) for ind in indexes):
+                raise TypeError("Doesn't support indexing by tensors")
+            # Reuse the getitem logic to get each chunk's indexes
+            tileable_shape = tuple(sum(s) for s in nsplits)
+            empty_tileable = empty(tileable_shape, chunk_size=nsplits)._inplace_tile()
+            indexed = empty_tileable[tuple(indexes)]
+            index_handler = TensorIndexTilesHandler(indexed.op)
+            index_handler._extract_indexes_info()
+            index_handler._preprocess_fancy_indexes()
+            index_handler._process_fancy_indexes()
+            index_handler._process_in_tensor()
+
+            result_chunks = dict()
+            for c in index_handler._out_chunks:
+                result_chunks[chunk_idx_to_keys[c.inputs[0].index]] = [c.index, c.op.indexes]
+
+            chunk_datas = dict()
+            for endpoint, chunks in chunk_workers.items():
+                to_fetch_keys = []
+                to_fetch_indexes = []
+                to_fetch_idx = []
+                for r_chunk, (chunk_index, index_obj) in result_chunks.items():
+                    if r_chunk in chunks:
+                        to_fetch_keys.append(r_chunk)
+                        to_fetch_indexes.append(index_obj)
+                        to_fetch_idx.append(chunk_index)
+                if to_fetch_keys:
+                    datas = self.get_chunks_data(endpoint, to_fetch_keys, indexes=to_fetch_indexes,
+                                                 compression_types=compression_types)
+                    chunk_datas[tuple(to_fetch_idx)] = datas
+            chunk_datas = dict((k, v.result()) for k, v in chunk_datas.items())
+            for idx, d in chunk_datas.items():
+                d = [dataserializer.loads(db) for db in d]
+                chunk_results.update(dict(zip(idx, d)))
+
+        chunk_results = [(k, v) for k, v in chunk_results.items()]
+        if len(chunk_results) == 1:
+            ret = chunk_results[0][1]
+        else:
+            ret = merge_chunks(chunk_results)
+        return ret
+
 
 class DistributedDictContext(DistributedContext, dict):
     def __init__(self, *args, **kwargs):
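With this change, a `DistributedContext` can be constructed from just a scheduler endpoint and a session id, and it exposes name-based lookup plus whole-tileable fetching. A minimal sketch of using it outside PyTorch, assuming a running Mars cluster; the endpoint, session id, and the name 'data' are placeholders:

```python
from mars.context import DistributedContext

# Placeholders: point these at a real scheduler and an existing session in
# which a tileable was stored under the name 'data'.
ctx = DistributedContext(scheduler_address='localhost:7103',
                         session_id='<session-id>')

# Resolve the user-visible name to a tileable key, then pull the first ten
# rows back as a single numpy array (chunks are fetched from their workers
# and merged by get_tileable_data).
key = ctx.get_tileable_key_by_name('data')
head = ctx.get_tileable_data(key, indexes=[slice(0, 10)])
```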

mars/learn/contrib/pytorch/__init__.py  +1

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from .run_script import run_pytorch_script
+from .dataset import MarsDataset
 
 
 def register_op():
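This re-export means user code can import the dataset from the contrib package directly rather than from the submodule. A one-line sketch:

```python
# The package-level import is the new re-export added above.
from mars.learn.contrib.pytorch import MarsDataset
```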

mars/learn/contrib/pytorch/dataset.py  +54 (new file)

@@ -0,0 +1,54 @@
+# Copyright 1999-2020 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+try:
+    from torch.utils.data import Dataset
+except ImportError:  # pragma: no cover
+    Dataset = object
+
+from ....context import get_context, DistributedContext
+from ....tensor.indexing.core import process_index
+from ....tensor.fetch import TensorFetch
+from ....utils import require_not_none
+
+
+@require_not_none(Dataset)
+class MarsDataset(Dataset):
+    def __init__(self, *names):
+        self._context = get_context()
+
+        tensors = []
+        for name in names:
+            tileable_key = self._context.get_tileable_key_by_name(name)
+            nsplits = self._context.get_tileable_metas([tileable_key], filter_fields=['nsplits'])[0][0]
+            shape = tuple(sum(s) for s in nsplits)
+            tensors.append(TensorFetch().new_tensor([], shape=shape, _key=tileable_key))
+        self.tensors = tensors
+
+    def __len__(self):
+        return self.tensors[0].shape[0]
+
+    def __getitem__(self, item):
+        indexes = process_index(self.tensors[0].ndim, item)
+        return tuple(self._context.get_tileable_data(t.key, indexes) for t in self.tensors)
+
+
+def enter_mars_context():
+    scheduler = os.environ['MARS_SCHEDULER']
+    session_id = os.environ['MARS_SESSION']
+    return DistributedContext(scheduler_address=scheduler, session_id=session_id)
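`MarsDataset` resolves each name to a tileable key at construction time and fetches rows on demand in `__getitem__`, so it can be dropped into a standard `DataLoader`. A minimal sketch, assuming a process where `MARS_SCHEDULER` and `MARS_SESSION` are set (as `run_script.py` below arranges) and tensors stored under the placeholder names 'data' and 'labels':

```python
from torch.utils.data import DataLoader
from mars.learn.contrib.pytorch import MarsDataset
from mars.learn.contrib.pytorch.dataset import enter_mars_context

# enter_mars_context() rebuilds a DistributedContext from the MARS_SCHEDULER /
# MARS_SESSION environment variables; 'data' and 'labels' are placeholder
# names for tileables already stored in that session.
with enter_mars_context():
    dataset = MarsDataset('data', 'labels')

    # Each item is a tuple (data_row, labels_row) fetched via
    # get_tileable_data; the default collate_fn stacks them into tensors.
    loader = DataLoader(dataset, batch_size=32, shuffle=False)
    for batch_data, batch_labels in loader:
        pass  # feed the batch to a model here
```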

mars/learn/contrib/pytorch/run_script.py  +6

@@ -125,6 +125,12 @@ def execute(cls, ctx, op):
         env['MASTER_ADDR'] = str(op.master_addr)
         env['RANK'] = str(op.rank)
         env['WORLD_SIZE'] = str(op.world_size)
+
+        # set mars envs
+        if ctx.running_mode != RunningMode.local:
+            env['MARS_SCHEDULER'] = str(ctx._scheduler_address)
+            env['MARS_SESSION'] = str(ctx._session_id)
+
         # exec pytorch code in a new process
         process = subprocess.Popen(
             [sys.executable, filename] + op.command_args, env=env)
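The two environment variables are only injected when the operand is not running in purely local mode, and they are exactly what `enter_mars_context()` in `dataset.py` reads back inside the spawned training script. A hedged driver-side sketch; the `n_workers` and `command_argv` arguments follow how Mars' PyTorch tests invoke `run_pytorch_script`, the script path is a placeholder, and the named tensors consumed by `MarsDataset('data', 'labels')` are assumed to already exist in the current session (the client-side naming mechanism is not part of this excerpt):

```python
from mars.learn.contrib.pytorch import run_pytorch_script

# 'torch_script.py' is a placeholder for a script like the sample below.
# One training process is spawned per worker; on a cluster, the execute()
# hook above injects MARS_SCHEDULER and MARS_SESSION into each subprocess
# so the script's enter_mars_context() can reconnect to this session.
run_pytorch_script('torch_script.py', n_workers=2, command_argv=['multiple'])
```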
New file (filename not shown in this view)  +68

@@ -0,0 +1,68 @@
+# Copyright 1999-2020 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+
+
+def get_model():
+    import torch.nn as nn
+    return nn.Sequential(
+        nn.Linear(32, 64),
+        nn.ReLU(),
+        nn.Linear(64, 64),
+        nn.ReLU(),
+        nn.Linear(64, 10),
+        nn.Softmax(),
+    )
+
+
+def main():
+    import torch.nn as nn
+    import torch.distributed as dist
+    import torch.optim as optim
+    import torch.utils.data
+    from mars.learn.contrib.pytorch.dataset import MarsDataset, enter_mars_context
+
+    dist.init_process_group(backend='gloo')
+    torch.manual_seed(42)
+
+    with enter_mars_context():
+        train_dataset = MarsDataset('data', 'labels')
+
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                   batch_size=32,
+                                                   shuffle=False,
+                                                   sampler=train_sampler)
+
+        model = nn.parallel.DistributedDataParallel(get_model())
+        optimizer = optim.SGD(model.parameters(),
+                              lr=0.01, momentum=0.5)
+        criterion = nn.BCELoss()
+
+        for _ in range(2):
+            # 2 epochs
+            for _, (batch_data, batch_labels) in enumerate(train_loader):
+                outputs = model(batch_data)
+                loss = criterion(outputs.squeeze(), batch_labels)
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+
+if __name__ == "__main__":
+    assert len(sys.argv) == 2
+    assert sys.argv[1] == 'multiple'
+    main()
