From 9bc2b6674733426714c9a446617709ef2b885159 Mon Sep 17 00:00:00 2001 From: Farsidetfs Date: Mon, 12 May 2025 22:35:10 +0000 Subject: [PATCH 1/2] BUG: Raise MergeError when suffixes result in duplicate column names (GH#61402) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/merge.py | 7 +++++-- pandas/tests/reshape/merge/test_merge.py | 9 +++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..6823ef65c2e37 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -847,6 +847,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 68d61da0cf7dd..ccaaa91a2d84a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3062,13 +3062,16 @@ def renamer(x, suffix: str | None): if not llabels.is_unique: # Only warn when duplicates are caused because of suffixes, already duplicated # columns in origin should not warn - dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + dups.extend(llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()) if not rlabels.is_unique: dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + # Suffix addition creates duplicate to pre-existing column name + dups.extend(llabels.intersection(right.difference(to_rename)).tolist()) + dups.extend(rlabels.intersection(left.difference(to_rename)).tolist()) if dups: raise MergeError( f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " - f"not allowed.", + "not allowed.", ) return llabels, rlabels diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0f67aebd85ec..f3418ad047afe 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3060,3 +3060,12 @@ def test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("suffixes", [("_dup", ""), ("", "_dup")]) +def test_merge_for_suffix_collisions(suffixes): + # GH#61402 + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, df2, on="col1", suffixes=suffixes) From 04a4d07b7ef727d6cbf8071dea5fada44f306bcc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 19:02:05 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8eae774e276f7..c0980bdb39cb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -851,8 +851,8 @@ Reshaping - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) -- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) Sparse ^^^^^^