Skip to content

Commit 96704f4

Browse files
authored
[BACKPORT] Implements {DataFrame,Series}.set_axis (#1950) (#1951)
1 parent 9d9fbea commit 96704f4

File tree

13 files changed: 393 additions (+393) and 168 deletions (-168).

.github/workflows/os-compat-ci.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,10 +57,10 @@ jobs:
5757
shell: bash
5858
run: |
5959
source ./.github/workflows/reload-env.sh
60-
export CYTHON_TRACE=1
6160
for cf in `ls .coveragerc*`; do
61+
sed -i.bak "s/plugins *= *Cython\.Coverage//g" $cf;
6262
sed -i.bak -e '/*\.pxd/ a\
63-
\ \ \ \ *.py \
63+
\ \ \ \ *.pyx \
6464
' $cf
6565
done
6666
retry python setup.py build_ext -i -j 2

docs/source/reference/dataframe/frame.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,7 @@ Reindexing / selection / label manipulation
151151
DataFrame.rename
152152
DataFrame.rename_axis
153153
DataFrame.reset_index
154+
DataFrame.set_axis
154155
DataFrame.set_index
155156
DataFrame.tail
156157

docs/source/reference/dataframe/series.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,7 @@ Reindexing / selection / label manipulation
148148
Series.rename
149149
Series.rename_axis
150150
Series.reset_index
151+
Series.set_axis
151152
Series.tail
152153

153154
Missing data handling

mars/dataframe/base/astype.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@
2020
from ...serialize import AnyField, StringField, ListField
2121
from ...utils import recursive_tile
2222
from ...tensor.base import sort
23-
from ..utils import build_empty_df, build_empty_series
2423
from ..core import DATAFRAME_TYPE, SERIES_TYPE
2524
from ..operands import DataFrameOperand, DataFrameOperandMixin
25+
from ..utils import build_empty_df, build_empty_series, parse_index
2626

2727

2828
class DataFrameAstype(DataFrameOperand, DataFrameOperandMixin):
@@ -197,8 +197,10 @@ def __call__(self, df):
197197
return self.new_series([df], shape=df.shape, dtype=dtype,
198198
name=df.name, index_value=df.index_value)
199199
else:
200+
new_index = df.index_value.to_pandas().astype(self.dtype_values)
201+
new_index_value = parse_index(new_index, store_data=df.index_value.has_value())
200202
return self.new_index([df], shape=df.shape, dtype=dtype,
201-
name=df.name, index_value=df.index_value)
203+
name=df.name, index_value=new_index_value)
202204

203205

204206
def astype(df, dtype, copy=True, errors='raise'):

mars/dataframe/core.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -546,7 +546,7 @@ def to_tensor(self, dtype=None, extract_multi_index=False):
546546

547547

548548
class Index(HasShapeTileableEnity, _ToPandasMixin):
549-
__slots__ = '_df_or_series', '_axis'
549+
__slots__ = '_df_or_series', '_parent_key', '_axis'
550550
_allow_data_type_ = (IndexData,)
551551

552552
def __new__(cls, data: Union[pd.Index, IndexData], **_):
@@ -572,6 +572,7 @@ def _get_df_or_series(self):
572572

573573
def _set_df_or_series(self, df_or_series, axis):
574574
self._df_or_series = weakref.ref(df_or_series)
575+
self._parent_key = df_or_series.key
575576
self._axis = axis
576577

577578
@property
@@ -581,7 +582,7 @@ def name(self):
581582
@name.setter
582583
def name(self, value):
583584
df_or_series = self._get_df_or_series()
584-
if df_or_series is not None:
585+
if df_or_series is not None and df_or_series.key == self._parent_key:
585586
df_or_series.rename_axis(value, axis=self._axis, inplace=True)
586587
self.data = df_or_series.axes[self._axis].data
587588
else:
@@ -962,6 +963,10 @@ def index(self):
962963
idx._set_df_or_series(self, 0)
963964
return idx
964965

966+
@index.setter
967+
def index(self, new_index):
968+
self.set_axis(new_index, axis=0, inplace=True)
969+
965970
@property
966971
def name(self):
967972
return self._data.name
@@ -1265,7 +1270,7 @@ def index(self):
12651270
def columns(self):
12661271
from .datasource.index import from_pandas as from_pandas_index
12671272

1268-
return from_pandas_index(self.dtypes.index)
1273+
return from_pandas_index(self.dtypes.index, store_data=True)
12691274

12701275
@property
12711276
def axes(self):
@@ -1423,6 +1428,10 @@ def index(self):
14231428
idx._set_df_or_series(self, 0)
14241429
return idx
14251430

1431+
@index.setter
1432+
def index(self, new_index):
1433+
self.set_axis(new_index, axis=0, inplace=True)
1434+
14261435
@property
14271436
def columns(self):
14281437
col = self._data.columns
@@ -1431,11 +1440,7 @@ def columns(self):
14311440

14321441
@columns.setter
14331442
def columns(self, new_columns):
1434-
from .indexing.set_label import DataFrameSetLabel
1435-
1436-
op = DataFrameSetLabel(axis=1, value=new_columns)
1437-
new_df = op(self)
1438-
self.data = new_df.data
1443+
self.set_axis(new_columns, axis=1, inplace=True)
14391444

14401445
def keys(self):
14411446
"""

mars/dataframe/datasource/index.py

Lines changed: 20 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
from ... import opcodes as OperandDef
2121
from ...config import options
2222
from ...core import OutputType
23-
from ...serialize import IndexField, DataTypeField, KeyField
23+
from ...serialize import IndexField, DataTypeField, KeyField, BoolField
2424
from ...tensor.utils import get_chunk_slices
2525
from ..operands import DataFrameOperand, DataFrameOperandMixin
2626
from ..utils import parse_index, decide_series_chunk_size
@@ -36,11 +36,12 @@ class IndexDataSource(DataFrameOperand, DataFrameOperandMixin):
3636
_input = KeyField('input')
3737
_data = IndexField('data')
3838
_dtype = DataTypeField('dtype')
39+
_store_data = BoolField('store_data')
3940

4041
def __init__(self, input=None, data=None, dtype=None, gpu=None, # pylint: disable=redefined-builtin
41-
sparse=None, **kw):
42-
super().__init__(_input=input, _data=data, _dtype=dtype, _gpu=gpu,
43-
_sparse=sparse, _output_types=[OutputType.index], **kw)
42+
store_data=None, sparse=None, **kw):
43+
super().__init__(_input=input, _data=data, _dtype=dtype, _gpu=gpu, _sparse=sparse,
44+
_store_data=store_data, _output_types=[OutputType.index], **kw)
4445

4546
@property
4647
def input(self):
@@ -54,6 +55,10 @@ def data(self):
5455
def dtype(self):
5556
return self._dtype
5657

58+
@property
59+
def store_data(self):
60+
return self._store_data
61+
5762
def _set_inputs(self, inputs):
5863
super()._set_inputs(inputs)
5964
if inputs is not None and len(inputs) > 0:
@@ -66,7 +71,7 @@ def __call__(self, shape=None, chunk_size=None, inp=None, name=None,
6671
name = name if name is not None else self._data.name
6772
names = names if names is not None else self._data.names
6873
return self.new_index(None, shape=shape, dtype=self._dtype,
69-
index_value=parse_index(self._data),
74+
index_value=parse_index(self._data, store_data=self.store_data),
7075
name=name, names=names, raw_chunk_size=chunk_size)
7176
elif hasattr(inp, 'index_value'):
7277
# get index from Mars DataFrame, Series or Index
@@ -75,8 +80,8 @@ def __call__(self, shape=None, chunk_size=None, inp=None, name=None,
7580
if inp.index_value.has_value():
7681
self._data = data = inp.index_value.to_pandas()
7782
return self.new_index(None, shape=(inp.shape[0],), dtype=data.dtype,
78-
index_value=parse_index(data), name=name,
79-
names=names, raw_chunk_size=chunk_size)
83+
index_value=parse_index(data, store_data=self.store_data),
84+
name=name, names=names, raw_chunk_size=chunk_size)
8085
else:
8186
if self._dtype is None:
8287
self._dtype = inp.index_value.to_pandas().dtype
@@ -92,7 +97,7 @@ def __call__(self, shape=None, chunk_size=None, inp=None, name=None,
9297
if self._dtype is None:
9398
self._dtype = pd_index.dtype
9499
return self.new_index([inp], shape=inp.shape, dtype=self._dtype,
95-
index_value=parse_index(pd_index, inp),
100+
index_value=parse_index(pd_index, inp, store_data=self.store_data),
96101
name=name, names=names)
97102

98103
@classmethod
@@ -111,9 +116,9 @@ def _tile_from_pandas(cls, op):
111116
chunk_op = op.copy().reset_key()
112117
slc = get_chunk_slices(chunk_size, chunk_index)
113118
chunk_op._data = chunk_data = raw_index[slc]
114-
out_chunk = chunk_op.new_chunk(None, shape=chunk_shape, dtype=index.dtype,
115-
index=chunk_index, name=index.name,
116-
index_value=parse_index(chunk_data))
119+
out_chunk = chunk_op.new_chunk(
120+
None, shape=chunk_shape, dtype=index.dtype, index=chunk_index,
121+
name=index.name, index_value=parse_index(chunk_data, store_data=op.store_data))
117122
out_chunks.append(out_chunk)
118123

119124
new_op = op.copy()
@@ -165,7 +170,7 @@ def _tile_from_tensor(cls, op):
165170
out_chunks = []
166171
for c in inp.chunks:
167172
chunk_op = op.copy().reset_key()
168-
index_value = parse_index(out.index_value.to_pandas(), c)
173+
index_value = parse_index(out.index_value.to_pandas(), c, store_data=op.store_data)
169174
out_chunk = chunk_op.new_chunk([c], shape=c.shape,
170175
dtype=out.dtype, index=c.index,
171176
index_value=index_value,
@@ -206,8 +211,9 @@ def execute(cls, ctx, op):
206211
ctx[out.key] = pd.Index(inp, dtype=dtype, name=out.name)
207212

208213

209-
def from_pandas(data, chunk_size=None, gpu=False, sparse=False):
210-
op = IndexDataSource(data=data, gpu=gpu, sparse=sparse, dtype=data.dtype)
214+
def from_pandas(data, chunk_size=None, gpu=False, sparse=False, store_data=False):
215+
op = IndexDataSource(data=data, gpu=gpu, sparse=sparse, dtype=data.dtype,
216+
store_data=store_data)
211217
return op(shape=data.shape, chunk_size=chunk_size)
212218

213219

mars/dataframe/indexing/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ def _install():
2929
from .setitem import dataframe_setitem
3030
from .reindex import reindex
3131
from .where import mask, where
32+
from .set_axis import df_set_axis, series_set_axis
3233

3334
for cls in DATAFRAME_TYPE + SERIES_TYPE:
3435
setattr(cls, 'iloc', cache_readonly(iloc))
@@ -49,20 +50,18 @@ def _install():
4950
setattr(cls, 'insert', df_insert)
5051
setattr(cls, 'reset_index', df_reset_index)
5152
setattr(cls, 'rename', df_rename)
53+
setattr(cls, 'set_axis', df_set_axis)
5254

5355
for cls in SERIES_TYPE:
5456
setattr(cls, '__getitem__', series_getitem)
5557
setattr(cls, 'reset_index', series_reset_index)
5658
setattr(cls, 'rename', series_rename)
59+
setattr(cls, 'set_axis', series_set_axis)
5760

5861
for cls in INDEX_TYPE:
5962
setattr(cls, 'rename', index_rename)
6063
setattr(cls, 'set_names', index_set_names)
6164

62-
# make sure operand is registered
63-
from .set_label import DataFrameSetLabel
64-
del DataFrameSetLabel
65-
6665

6766
_install()
6867
del _install

0 commit comments

Comments
 (0)