Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
>>> import pandas as pd
>>> import numpy as np
>>> ix1 = pd.MultiIndex.from_arrays([[np.nan, 81, 81, 82, 82], [np.nan, np.nan, np.nan, np.nan, np.nan], pd.to_datetime([np.nan, '2018-06-01', '2018-07-01', '2018-07-01', '2018-08-01'])], names=['foo', 'bar', 'date'])
>>> ix1
MultiIndex([( nan, nan, 'NaT'),
(81.0, nan, '2018-06-01'),
(81.0, nan, '2018-07-01'),
(82.0, nan, '2018-07-01'),
(82.0, nan, '2018-08-01')],
names=['foo', 'bar', 'date'])
>>>
>>> s1 = pd.Series([np.nan, 25.058969, 22.519751, 20.847981, 21.625236], index=ix1)
>>> s1
foo bar date
NaN NaN NaT NaN
81 NaN 2018-06-01 25.058969
2018-07-01 22.519751
82 NaN 2018-07-01 20.847981
2018-08-01 21.625236
dtype: float64
>>>
>>> ix2 = pd.Index([81, 82, 83, 84, 85, 86, 87], name='foo')
>>> ix2
Index([81, 82, 83, 84, 85, 86, 87], dtype='int64', name='foo')
>>>
>>> s2 = pd.Series([28.2800, 25.2500, 22.2200, 16.7660, 14.0087, 14.9480, 29.2900], ix2)
>>>
>>> s2
foo
81 28.2800
82 25.2500
83 22.2200
84 16.7660
85 14.0087
86 14.9480
87 29.2900
dtype: float64
>>>
>>>
>>> s1 - s2
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File ".venv/lib64/python3.11/site-packages/pandas/core/ops/common.py", line 76, in new_method
return method(self, other)
^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/arraylike.py", line 194, in __sub__
return self._arith_method(other, operator.sub)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 5814, in _arith_method
self, other = self._align_for_op(other)
^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 5844, in _align_for_op
left, right = left.align(right, copy=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/generic.py", line 10091, in align
left, _right, join_index = self._align_series(
^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/generic.py", line 10213, in _align_series
left = self._reindex_indexer(join_index, lidx, copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 4782, in _reindex_indexer
return self._constructor(new_values, index=new_index, copy=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".venv/lib64/python3.11/site-packages/pandas/core/series.py", line 503, in __init__
com.require_length_match(data, index)
File ".venv/lib64/python3.11/site-packages/pandas/core/common.py", line 561, in require_length_match
raise ValueError(
ValueError: Length of values (5) does not match length of index (4)
>>> s1.iloc[1:] - s2 # proactively removing the all-nan index row allows the operation to succeed without error
foo bar date
81 NaN 2018-06-01 -3.221031
2018-07-01 -5.760249
82 NaN 2018-07-01 -4.402019
2018-08-01 -3.624764
dtype: float64
Issue Description
It is possible to carry out arithmetic operations on two Series with "mixed" indexes (here a MultiIndex and a single-level Index) when at least one level is shared. However, in my case (`s1 - s2`), `s1` contains an all-NaN index row, and the operation raises `ValueError: Length of values (5) does not match length of index (4)`.
I found that this could be an error in how the two Series are aligned:
class Series(...):
...
def _align_series(...):
...
if not axis:
# equal
if self.index.equals(other.index):
join_index, lidx, ridx = None, None, None
else:
join_index, lidx, ridx = self.index.join(
other.index, how=join, level=level, return_indexers=True
)
## At this point, `join_index` is invalid as it contains different length codes:
## join_index.codes == FrozenList([[0, 0, 1, 1], [-1, -1, -1, -1, -1], [-1, 0, 1, 1, 2]])
## which returns (4 items in level index 0)
## join_index.get_level_values(0) == Index([81.0, 81.0, 82.0, 82.0], dtype='float64', name='foo')
## while (5 items in level index 1 and 2)
## join_index.get_level_values(1) == Index([nan, nan, nan, nan, nan], dtype='float64', name='bar')
## join_index.get_level_values(2) == DatetimeIndex(['NaT', '2018-06-01', '2018-07-01', '2018-07-01', '2018-08-01'], dtype='datetime64[ns]', name='date', freq=None)
##
if is_series:
## The invalid `join_index` is picked up by `._reindex_indexer(...)` as `len(join_index)` == 4 and `len(new_values) == 5` which causes the `ValueError`
left = self._reindex_indexer(join_index, lidx, copy)
...
def _reindex_indexer(...):
...
new_values = algorithms.take_nd(
self._values, indexer, allow_fill=True, fill_value=None
)
return self._constructor(new_values, index=new_index, copy=False) ## <- raises the ValueError
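I traced the origin of the mismatched code lengths to `Index._join_level` in `pandas/core/indexes/base.py`, which filters out missing values (code `-1`) from the joined level's codes only, so that level ends up with fewer codes than the other levels of the new index: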
class Index(...):
def _join_level(...):
...
else:
left_lev_indexer = ensure_platform_int(left_lev_indexer)
rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
old_codes = left.codes[level]
## This will ignore missing values (nan) without ever inserting
## those values back into the index, ultimately leading to
## different length codes
taker = old_codes[old_codes != -1]
new_lev_codes = rev_indexer.take(taker)
new_codes = list(left.codes)
new_codes[level] = new_lev_codes
new_levels = list(left.levels)
new_levels[level] = new_level
if keep_order: # just drop missing values. o.w. keep order
left_indexer = np.arange(len(left), dtype=np.intp)
left_indexer = cast(np.ndarray, left_indexer)
mask = new_lev_codes != -1
if not mask.all():
new_codes = [lab[mask] for lab in new_codes]
left_indexer = left_indexer[mask]
...
join_index = MultiIndex(
levels=new_levels,
codes=new_codes,
names=left.names,
verify_integrity=False,
)
This is all possible because `verify_integrity` is set to `False` (and not passed down). If I set `verify_integrity=True`, the `join_index = MultiIndex(...)` call fails much earlier with `ValueError: Length of levels and codes must match. NOTE: this index is in an inconsistent state.`
class MultiIndex(...):
def __new__(...):
...
# result._set_codes(codes, copy=copy, validate=False)
result._set_codes(codes, copy=copy, validate=False, verify_integrity=True)
...
I tried to fix this by changing `taker = old_codes[old_codes != -1]` to `taker = old_codes`. This avoids the initial `ValueError` (tested only for my case). If I also comment out the `-1` handling, I get the expected behaviour:
# if not mask.all():
# new_codes = [lab[mask] for lab in new_codes]
# left_indexer = left_indexer[mask]
Expected Behavior
>>> s1
foo bar date
NaN NaN NaT NaN
81 NaN 2018-06-01 25.058969
2018-07-01 22.519751
82 NaN 2018-07-01 20.847981
2018-08-01 21.625236
dtype: float64
>>> s2
foo
81 28.2800
82 25.2500
83 22.2200
84 16.7660
85 14.0087
86 14.9480
87 29.2900
dtype: float64
>>> s1 - s2
foo bar date
NaN NaN NaT NaN
81 NaN 2018-06-01 -3.221031
2018-07-01 -5.760249
82 NaN 2018-07-01 -4.402019
2018-08-01 -3.624764
Installed Versions
INSTALLED VERSIONS
------------------
commit : e86ed377639948c64c429059127bcf5b359ab6be
python : 3.11.11.final.0
python-bits : 64
OS : Linux
OS-release : 6.12.11-200.fc41.x86_64
Version : #1 SMP PREEMPT_DYNAMIC Fri Jan 24 04:59:58 UTC 2025
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : en_AU.UTF-8
LOCALE : en_AU.UTF-8
pandas : 2.1.1
numpy : 1.24.3
pytz : 2020.4
dateutil : 2.8.2
setuptools : 69.2.0
pip : 24.2
Cython : 0.29.34
pytest : 7.3.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 0.9.6
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.9.6
jinja2 : 2.11.2
IPython : None
pandas_datareader : None
bs4 : None
bottleneck : 1.3.5
dataframe-api-compat: None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : 2.8.4
odfpy : None
openpyxl : 3.1.2
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.10.1
sqlalchemy : 1.3.23
tables : 3.8.0
tabulate : None
xarray : None
xlrd : 2.0.1
zstandard : None
tzdata : 2024.2
qtpy : None
pyqt5 : None
Also happens with pandas==2.2.3