
Commit 46acc59

[BACKPORT] Fix crash when storing data inside Docker containers (#1429) (#1432)
1 parent 29e2b00 commit 46acc59

28 files changed, +274 -147 lines

.github/workflows/ci.yml

+2-2
@@ -83,7 +83,7 @@ jobs:
             conda install -n test --quiet --yes -c pytorch python=$PYTHON faiss-cpu
           fi
           if [[ $UNAME == "linux" ]] && [[ ! "$PYTHON" =~ "3.8" ]]; then
-            pip install tensorflow
+            pip install tensorflow\<2.3.0
             pip install torch torchvision
             pip install tsfresh
           fi
@@ -112,7 +112,7 @@ jobs:
       run: |
         source ./.github/workflows/reload-env.sh
         # stop the build if there are Python syntax errors or undefined names
-        flake8 mars --count --select=E9,E111,E225,E302,E303,E901,E999,F7,F63,F82,F401,F821,F822,F823,F841,W291,W292,W391 --show-source --statistics
+        flake8 mars --count --select=E9,E111,E225,E302,E303,E901,E999,F7,F63,F82,F401,F821,F822,F823,F841,W291,W292,W391,W605 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 mars --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
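The new W605 check flags invalid escape sequences in string literals, which Python treats as deprecated; the TensorFlow pin keeps CI on releases below 2.3.0 (the backslash merely escapes `<` from the shell). A minimal illustration of what W605 catches (snippet is illustrative, not from the Mars codebase):

    import re

    # W605: '\d' is an invalid escape sequence in a normal string literal
    bad = re.compile('\d+')

    # fix: a raw string passes the backslash through to the regex engine
    good = re.compile(r'\d+')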

mars/dataframe/core.py

+5
@@ -105,11 +105,16 @@ class Index(IndexBase):
     class RangeIndex(IndexBase):
         _name = AnyField('name')
         _slice = SliceField('slice')
+        _dtype = DataTypeField('dtype')
 
         @property
         def slice(self):
             return self._slice
 
+        @property
+        def dtype(self):
+            return getattr(self, '_dtype', np.dtype(np.intc))
+
         def to_pandas(self):
             slc = self._slice
             return pd.RangeIndex(slc.start, slc.stop, slc.step,
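The getattr fallback keeps deserialization backward compatible: index metadata serialized by an older Mars release carries no _dtype attribute, so reading it back must not raise. A minimal sketch of the pattern (class name and constructor are illustrative):

    import numpy as np

    class RangeIndexMeta:
        def __init__(self, dtype=None):
            if dtype is not None:
                self._dtype = dtype  # only present in newly serialized metadata

        @property
        def dtype(self):
            # metadata from older versions falls back to a default dtype
            return getattr(self, '_dtype', np.dtype(np.intc))

    assert RangeIndexMeta().dtype == np.dtype(np.intc)
    assert RangeIndexMeta(np.dtype(np.int64)).dtype == np.dtype(np.int64)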

mars/dataframe/sort/psrs.py

+11-3
@@ -16,9 +16,10 @@
 import pandas as pd
 
 from ... import opcodes as OperandDef
+from ...context import RunningMode
 from ...utils import lazy_import, get_shuffle_input_keys_idxes
 from ...operands import OperandStage
-from ...serialize import ValueType, Int32Field, ListField, StringField, BoolField
+from ...serialize import Int32Field, ListField, StringField, BoolField
 from ...tensor.base.psrs import PSRSOperandMixin
 from ..utils import standardize_range_index
 from ..operands import DataFrameOperandMixin, DataFrameOperand, DataFrameShuffleProxy, \
@@ -254,7 +255,7 @@ class DataFramePSRSChunkOperand(DataFrameOperand):
     _sort_type = StringField('sort_type')
 
     _axis = Int32Field('axis')
-    _by = ListField('by', ValueType.string)
+    _by = ListField('by')
     _ascending = BoolField('ascending')
     _inplace = BoolField('inplace')
     _kind = StringField('kind')
@@ -381,7 +382,7 @@ class DataFramePSRSShuffle(DataFrameMapReduceOperand, DataFrameOperandMixin):
 
     # for shuffle map
     _axis = Int32Field('axis')
-    _by = ListField('by', ValueType.string)
+    _by = ListField('by')
     _ascending = BoolField('ascending')
     _inplace = BoolField('inplace')
     _na_position = StringField('na_position')
@@ -459,6 +460,7 @@ def _execute_dataframe_map(cls, ctx, op):
             poses = records.searchsorted(p_records, side='right')
         else:
             poses = len(records) - records[::-1].searchsorted(p_records, side='right')
+        del records, p_records
 
         poses = (None,) + tuple(poses) + (None,)
         for i in range(op.n_partition):
@@ -529,6 +531,12 @@ def _execute_reduce(cls, ctx, op):
         raw_inputs = [ctx[(input_key, op.shuffle_key)] for input_key in input_keys]
         xdf = pd if isinstance(raw_inputs[0], (pd.DataFrame, pd.Series)) else cudf
         concat_values = xdf.concat(raw_inputs, axis=op.axis)
+
+        del raw_inputs[:]
+        if getattr(ctx, 'running_mode', None) == RunningMode.distributed:
+            for input_key in input_keys:
+                ctx.pop((input_key, op.shuffle_key), None)
+
         if op.sort_type == 'sort_values':
            ctx[op.outputs[0].key] = execute_sort_values(concat_values, op)
         else:
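The del statements are the memory-relevant part of this file: once the shuffle inputs are concatenated, every extra reference is dropped (and, in distributed mode, evicted from the storage context) before the expensive sort runs, lowering the reduce step's peak footprint, which matters in Docker containers where shared memory is tightly capped. A standalone sketch of the pattern, with ctx modeled as a plain dict:

    import pandas as pd

    def reduce_concat(ctx, input_keys, shuffle_key):
        # gather the mapper outputs destined for this reducer
        raw_inputs = [ctx[(k, shuffle_key)] for k in input_keys]
        concat_values = pd.concat(raw_inputs, axis=0)

        # drop the local references so chunks are not kept alive twice ...
        del raw_inputs[:]
        # ... and evict them from the storage context as well
        for k in input_keys:
            ctx.pop((k, shuffle_key), None)

        return concat_values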

mars/dataframe/utils.py

+1
@@ -237,6 +237,7 @@ def _serialize_range_index(index):
             '_max_val_close': False,
             '_key': key or _tokenize_index(index, *args),
             '_name': index.name,
+            '_dtype': index.dtype,
         }
     else:
         properties = _extract_property(index, IndexValue.RangeIndex, False)

mars/learn/cluster/_k_means_common.py

+1-1
@@ -288,7 +288,7 @@ def execute(cls, ctx, op):
         ctx[op.outputs[1].key] = out_weight_in_clusters
 
 
-def _relocate_empty_clusters(X ,sample_weight, centers_old, centers_new,
+def _relocate_empty_clusters(X, sample_weight, centers_old, centers_new,
                              weight_in_clusters, labels, to_run=None,
                              session=None, run_kwargs=None):
     to_run = to_run or list()

mars/learn/cluster/_kmeans.py

+1-1
@@ -482,7 +482,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random'} or tensor of shape \
+    init : {'k-means++', 'k-means||', 'random'} or tensor of shape \
         (n_clusters, n_features), default='k-means||'
         Method for initialization, defaults to 'k-means||':
 
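The docstring now lists all three accepted initializations. A usage sketch (data and parameter values illustrative):

    import mars.tensor as mt
    from mars.learn.cluster import KMeans

    X = mt.random.rand(1000, 8, chunk_size=250)
    # 'k-means||' is the scalable default; 'k-means++' and 'random' also work
    km = KMeans(n_clusters=5, init='k-means||').fit(X)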

mars/learn/tests/integrated/base.py

+3-2
@@ -46,11 +46,12 @@ def start_distributed_env(self, *args, **kwargs):
                 self._start_distributed_env(*args, **kwargs)
                 break
             except ProcessRequirementUnmetError:
+                self.terminate_processes()
                 fail_count += 1
-                if fail_count >= 3:
+                if fail_count >= 10:
                     raise
+                time.sleep(5)
                 logger.error('Failed to start service, retrying')
-                self.terminate_processes()
 
     def _start_distributed_env(self, n_workers=2):
         scheduler_port = self.scheduler_port = str(get_next_port())
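The retry loop now cleans up before counting the failure, allows 10 attempts instead of 3, and sleeps between attempts so ports and half-started processes can be released. The same shape as a generic sketch (exception type and helpers are stand-ins):

    import time
    import logging

    logger = logging.getLogger(__name__)

    def start_with_retries(start, cleanup, max_failures=10, delay=5):
        fail_count = 0
        while True:
            try:
                start()
                break
            except RuntimeError:  # stand-in for ProcessRequirementUnmetError
                cleanup()         # tear down half-started processes first
                fail_count += 1
                if fail_count >= max_failures:
                    raise         # budget exhausted, surface the error
                time.sleep(delay)
                logger.error('Failed to start service, retrying')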

mars/operands.py

+15-14
@@ -27,13 +27,12 @@
 from .serialize import SerializableMetaclass, ValueType, ProviderType, IdentityField, \
     ListField, DictField, Int32Field, BoolField, StringField
 from .tiles import NotSupportTile
-from .utils import AttributeDict, to_str, calc_data_size, is_eager_mode, is_object_dtype
+from .utils import AttributeDict, to_str, calc_data_size, is_eager_mode, calc_object_overhead
 
 
 operand_type_to_oprand_cls = {}
 OP_TYPE_KEY = '_op_type_'
 OP_MODULE_KEY = '_op_module_'
-OBJECT_FIELD_OVERHEAD = 50
 T = TypeVar('T')
 
 
@@ -373,27 +372,26 @@ def execute(cls, ctx, op):
 
     @classmethod
     def estimate_size(cls, ctx, op):
-        from .dataframe.core import DATAFRAME_CHUNK_TYPE, SERIES_CHUNK_TYPE, INDEX_CHUNK_TYPE
-
         exec_size = 0
         outputs = op.outputs
         if all(not c.is_sparse() and hasattr(c, 'nbytes') and not np.isnan(c.nbytes) for c in outputs):
             for out in outputs:
                 ctx[out.key] = (out.nbytes, out.nbytes)
 
+        all_overhead = 0
         for inp in op.inputs or ():
             try:
+                if isinstance(inp.op, FetchShuffle):
+                    keys_and_shapes = inp.extra_params.get('_shapes', dict()).items()
+                else:
+                    keys_and_shapes = [(inp.key, getattr(inp, 'shape', None))]
+
                 # execution size of a specific data chunk may be
                 # larger than stored type due to objects
-                obj_overhead = n_strings = 0
-                if getattr(inp, 'shape', None) and not np.isnan(inp.shape[0]):
-                    if isinstance(inp, DATAFRAME_CHUNK_TYPE) and inp.dtypes is not None:
-                        n_strings = len([dt for dt in inp.dtypes if is_object_dtype(dt)])
-                    elif isinstance(inp, (INDEX_CHUNK_TYPE, SERIES_CHUNK_TYPE)) and inp.dtype is not None:
-                        n_strings = 1 if is_object_dtype(inp.dtype) else 0
-                    obj_overhead += n_strings * inp.shape[0] * OBJECT_FIELD_OVERHEAD
-
-                exec_size += ctx[inp.key][0] + obj_overhead
+                for key, shape in keys_and_shapes:
+                    overhead = calc_object_overhead(inp, shape)
+                    all_overhead += overhead
+                    exec_size += ctx[key][0] + overhead
             except KeyError:
                 if not op.sparse:
                     inp_size = calc_data_size(inp)
@@ -405,7 +403,10 @@ def estimate_size(cls, ctx, op):
         chunk_sizes = dict()
         for out in outputs:
             try:
-                chunk_size = calc_data_size(out) if not out.is_sparse() else exec_size
+                if not out.is_sparse():
+                    chunk_size = calc_data_size(out) + all_overhead // len(outputs)
+                else:
+                    chunk_size = exec_size
                 if np.isnan(chunk_size):
                     raise TypeError
                 chunk_sizes[out.key] = chunk_size
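The inline overhead arithmetic moved into a reusable helper, calc_object_overhead, so the same estimate can be applied per shuffle chunk using the shapes carried in FetchShuffle metadata, and the estimated overhead now also inflates the output size estimates instead of being dropped. Reconstructed from the deleted lines, the helper plausibly looks like this (a sketch under that assumption, not the actual implementation):

    import numpy as np

    OBJECT_FIELD_OVERHEAD = 50  # bytes per object field, as in the removed constant

    def calc_object_overhead(chunk, shape):
        # estimate extra bytes from object-dtype (e.g. string) fields,
        # which nbytes-style size estimates do not account for
        if not shape or np.isnan(shape[0]):
            return 0
        dtypes = getattr(chunk, 'dtypes', None)
        if dtypes is not None:  # DataFrame chunk: count object columns
            n_strings = len([dt for dt in dtypes if dt == np.dtype(object)])
        elif getattr(chunk, 'dtype', None) == np.dtype(object):
            n_strings = 1       # Series or Index chunk holding objects
        else:
            n_strings = 0
        return n_strings * shape[0] * OBJECT_FIELD_OVERHEAD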

mars/scheduler/operands/base.py

+1-1
@@ -236,7 +236,7 @@ def stop_operand(self, state=OperandState.CANCELLING):
     def add_running_predecessor(self, op_key, worker):
         self._running_preds.add(op_key)
 
-    def add_finished_predecessor(self, op_key, worker, output_sizes=None):
+    def add_finished_predecessor(self, op_key, worker, output_sizes=None, output_shapes=None):
         self._finish_preds.add(op_key)
 
     def add_finished_successor(self, op_key, worker):

mars/scheduler/operands/common.py

+8-4
@@ -54,6 +54,7 @@ def __init__(self, session_id, graph_id, op_key, op_info, worker=None, allocated
         self._pred_workers = set()
 
         self._data_sizes = None
+        self._data_shapes = None
 
         self._input_worker_scores = dict()
         self._worker_scores = dict()
@@ -118,13 +119,14 @@ def add_running_predecessor(self, op_key, worker):
         self.update_demand_depths(self._info.get('optimize', {}).get('depth', 0))
 
     @log_unhandled
-    def add_finished_predecessor(self, op_key, worker, output_sizes=None):
+    def add_finished_predecessor(self, op_key, worker, output_sizes=None, output_shapes=None):
         """
         This function shall return whether current node is ready. The return values will
         be collected by the predecessor to judge if a node with lower-priority can be
         scheduled.
         """
-        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes)
+        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes,
+                                         output_shapes=output_shapes)
         if all(k in self._finish_preds for k in self._pred_keys):
             # all predecessors done, the operand can be executed now
             if self.state == OperandState.UNSCHEDULED:
@@ -411,14 +413,15 @@ def _on_running(self):
                 self._op_key, self.worker, _tell=True, _wait=False)
 
         @log_unhandled
-        def _acceptor(data_sizes):
+        def _acceptor(data_sizes, data_shapes):
            self._allocated = False
            if not self._is_worker_alive():
                 return
             self._resource_ref.deallocate_resource(
                 self._session_id, self._op_key, self.worker, _tell=True, _wait=False)
 
             self._data_sizes = data_sizes
+            self._data_shapes = data_shapes
             self._io_meta['data_targets'] = list(data_sizes)
             self.start_operand(OperandState.FINISHED)
 
@@ -482,7 +485,8 @@ def _on_finished(self):
         # record if successors can be executed
         for out_key in self._succ_keys:
             succ_futures.append(self._get_operand_actor(out_key).add_finished_predecessor(
-                self._op_key, self.worker, output_sizes=self._data_sizes, _wait=False))
+                self._op_key, self.worker, output_sizes=self._data_sizes,
+                output_shapes=self._data_shapes, _wait=False))
 
         pred_futures = []
         for in_key in self._pred_keys:

mars/scheduler/operands/shuffle.py

+5-3
@@ -41,8 +41,9 @@ def __init__(self, session_id, graph_id, op_key, op_info, **kwargs):
         self._mapper_op_to_chunk = dict()
         self._reducer_to_mapper = defaultdict(dict)
 
-    def add_finished_predecessor(self, op_key, worker, output_sizes=None):
-        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes)
+    def add_finished_predecessor(self, op_key, worker, output_sizes=None, output_shapes=None):
+        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes,
+                                         output_shapes=output_shapes)
 
         from ..chunkmeta import WorkerMeta
         chunk_key = next(iter(output_sizes.keys()))[0]
@@ -62,7 +63,8 @@ def add_finished_predecessor(self, op_key, worker, output_sizes=None):
         for (chunk_key, shuffle_key), data_size in output_sizes.items() or ():
             succ_op_key = shuffle_keys_to_op[shuffle_key]
             meta = self._reducer_to_mapper[succ_op_key][op_key] = \
-                WorkerMeta(chunk_size=data_size, workers=(worker,))
+                WorkerMeta(chunk_size=data_size, workers=(worker,),
+                           chunk_shape=output_shapes.get((chunk_key, shuffle_key)))
             reducer_worker = reducer_workers.get(succ_op_key)
             if reducer_worker and reducer_worker != worker:
                 data_to_addresses[(chunk_key, shuffle_key)] = [reducer_worker]
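Recording shapes next to sizes lets the scheduler estimate object overhead for shuffle outputs it never materializes. Assuming WorkerMeta is a simple record (a stand-in sketch, not the actual definition in mars.scheduler.chunkmeta):

    from collections import namedtuple

    # stand-in for mars.scheduler.chunkmeta.WorkerMeta
    WorkerMeta = namedtuple('WorkerMeta', ['chunk_size', 'chunk_shape', 'workers'])

    meta = WorkerMeta(chunk_size=4096,       # bytes reported by the mapper
                      chunk_shape=(512, 4),  # rows x cols, drives overhead estimates
                      workers=('worker-1',))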

mars/scheduler/operands/successors_exclusive.py

+4-3
@@ -31,11 +31,12 @@ def __init__(self, session_id, graph_id, op_key, op_info, **kwargs):
         self._finished_sucessors = set()
         self._is_successor_running = False
 
-    def add_finished_predecessor(self, op_key, worker, output_sizes=None):
-        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes)
+    def add_finished_predecessor(self, op_key, worker, output_sizes=None, output_shapes=None):
+        super().add_finished_predecessor(op_key, worker, output_sizes=output_sizes,
+                                         output_shapes=output_shapes)
 
         from ..chunkmeta import WorkerMeta
-        data_meta = {k: WorkerMeta(chunk_size=v, workers=(worker,))
+        data_meta = {k: WorkerMeta(chunk_size=v, workers=(worker,), chunk_shape=output_shapes.get(k))
                      for k, v in output_sizes.items()}
         sucessor_op_key = self._predecessors_to_sucessors[op_key]
         self._ready_successors_queue.append((sucessor_op_key, data_meta))

mars/scheduler/operands/tests/test_common_exec.py

+2-2
@@ -87,9 +87,9 @@ def actual_exec(self, session_id, graph_key):
         for tk in rec.data_targets:
             for n in key_to_chunks[tk]:
                 self.chunk_meta.add_worker(session_id, n.key, 'localhost:12345')
-        self._results[graph_key] = ((dict(),), dict())
+        self._results[graph_key] = ((dict(),), dict(), dict())
         for cb in rec.finish_callbacks:
-            self.tell_promise(cb, {})
+            self.tell_promise(cb, {}, {})
         rec.finish_callbacks = []
 
     @log_unhandled

mars/scheduler/tests/integrated/base.py

+3-2
@@ -108,11 +108,12 @@ def start_processes(self, *args, **kwargs):
                 self._start_processes(*args, **kwargs)
                 break
             except ProcessRequirementUnmetError:
+                self.terminate_processes()
                 fail_count += 1
-                if fail_count >= 3:
+                if fail_count >= 10:
                     raise
+                time.sleep(5)
                 logger.error('Failed to start service, retrying')
-                self.terminate_processes()
 
     def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False, modules=None,
                          log_scheduler=True, log_worker=True, env=None, scheduler_args=None,

mars/scheduler/tests/integrated/test_normal_execution.py

+7
@@ -135,6 +135,13 @@ def testMainDataFrameWithoutEtcd(self):
         result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
         pd.testing.assert_frame_equal(result, raw1 + raw2)
 
+        raw1 = pd.DataFrame(np.random.rand(10, 10))
+        raw1[0] = raw1[0].apply(str)
+        df1 = md.DataFrame(raw1, chunk_size=5)
+        r = df1.sort_values(0)
+        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
+        pd.testing.assert_frame_equal(result, raw1.sort_values(0))
+
         s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
         series1 = md.Series(s1, chunk_size=6)
         result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)

mars/serialize/protos/indexvalue.proto

+1
@@ -24,6 +24,7 @@ message IndexValue {
     message RangeIndex {
         Value name = 1;
         Value slice = 2;
+        Value dtype = 3;
         // public fields
         string key = 51;
         bool is_monotonic_increasing = 52;
