fix(python): Suggest str.contains for string containment in map_elements

wtn · claude · wtn · commit 88fb82903fe7 · 2025-12-04T22:40:18.000-06:00
Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.github/scripts/test_bytecode_parser.py b/.github/scripts/test_bytecode_parser.py
@@ -21,7 +21,10 @@
 import pytest
 from polars._utils.udfs import BytecodeParser
 from tests.unit.operations.map.test_inefficient_map_warning import (
+    MY_COLLECTION,
     MY_DICT,
+    MY_STRING,
+    MY_SUBSTRING,
     NOOP_TEST_CASES,
     TEST_CASES,
 )
@@ -52,7 +55,10 @@ def test_bytecode_parser_expression_in_ipython(
         "from datetime import datetime; "
         "import numpy as np; "
         "import json; "
-        f"MY_DICT = {MY_DICT};"
+        f"MY_DICT = {MY_DICT}; "
+        f"MY_COLLECTION = {MY_COLLECTION}; "
+        f"MY_STRING = {repr(MY_STRING)}; "
+        f"MY_SUBSTRING = {repr(MY_SUBSTRING)}; "
         f'bytecode_parser = BytecodeParser({func}, map_target="expr");'
         f'print(bytecode_parser.to_expression("{col}"));'
     )
diff --git a/py-polars/src/polars/_utils/udfs.py b/py-polars/src/polars/_utils/udfs.py
@@ -701,12 +701,46 @@ def _expr(self, value: StackEntry, col: str, param_name: str, depth: int) -> str
                     not_ = "" if op == "is" else "not_"
                     return f"{e1}.is_{not_}null()"
                 elif op in ("in", "not in"):
-                    not_ = "" if op == "in" else "~"
-                    return (
-                        f"{not_}({e1}.is_in({e2}))"
-                        if " " in e1
-                        else f"{not_}{e1}.is_in({e2})"
+                    e2_stripped = e2.lstrip()
+                    is_collection_literal = e2_stripped.startswith(
+                        ("(", "[", "{", "frozenset(")
                     )
+
+                    is_collection_variable = False
+                    if not is_collection_literal and not e2.startswith(
+                        ("pl.col(", "'")
+                    ):
+                        if not self._caller_variables:
+                            self._caller_variables = _get_all_caller_variables()
+                        var_value = self._caller_variables.get(e2)
+                        if isinstance(var_value, (list, tuple, set, frozenset, dict)):
+                            is_collection_variable = True
+
+                    if is_collection_literal or is_collection_variable:
+                        not_ = "" if op == "in" else "~"
+                        return (
+                            f"{not_}({e1}.is_in({e2}))"
+                            if " " in e1
+                            else f"{not_}{e1}.is_in({e2})"
+                        )
+                    else:
+                        e2_is_col = e2.startswith("pl.col(")
+                        e1_is_col = e1.startswith("pl.col(")
+
+                        if e2_is_col:
+                            needle = f"pl.lit({e1})" if not e1_is_col else e1
+                            haystack = e2
+                        else:
+                            needle = e1
+                            haystack = f"pl.lit({e2})"
+
+                        contains_expr = (
+                            f"{haystack}.str.contains({needle}, literal=True)"
+                        )
+
+                        if op == "not in":
+                            return f"~{contains_expr}"
+                        return contains_expr
                 elif op == "replace_strict":
                     if not self._caller_variables:
                         self._caller_variables = _get_all_caller_variables()
diff --git a/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py b/py-polars/tests/unit/operations/map/test_inefficient_map_warning.py
@@ -21,6 +21,9 @@
 MY_CONSTANT = 3
 MY_DICT = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}
 MY_LIST = [1, 2, 3]
+MY_STRING = "qwerty"
+MY_SUBSTRING = "we"
+MY_COLLECTION = [2, 3, 4]
 
 # column_name, function, expected_suggestion
 TEST_CASES = [
@@ -67,12 +70,68 @@
     ),
     ("a", "lambda x: x in (2, 3, 4)", 'pl.col("a").is_in((2, 3, 4))', None),
     ("a", "lambda x: x not in (2, 3, 4)", '~pl.col("a").is_in((2, 3, 4))', None),
+    ("a", "lambda x: x in MY_COLLECTION", 'pl.col("a").is_in(MY_COLLECTION)', None),
+    ("a", "lambda x: x in MY_DICT", 'pl.col("a").is_in(MY_DICT)', None),
+    (
+        "a",
+        "lambda x: (x + 1) in (1, 2, 3)",
+        '((pl.col("a") + 1).is_in((1, 2, 3)))',
+        None,
+    ),
     (
         "a",
         "lambda x: x in (1, 2, 3, 4, 3) and x % 2 == 0 and x > 0",
         'pl.col("a").is_in((1, 2, 3, 4, 3)) & ((pl.col("a") % 2) == 0) & (pl.col("a") > 0)',
         None,
     ),
+    # ---------------------------------------------
+    # string containment with 'in' operator
+    # ---------------------------------------------
+    (
+        "b",
+        "lambda x: x in MY_STRING",
+        'pl.lit(MY_STRING).str.contains(pl.col("b"), literal=True)',
+        None,
+    ),
+    (
+        "b",
+        "lambda x: MY_SUBSTRING in x",
+        'pl.col("b").str.contains(pl.lit(MY_SUBSTRING), literal=True)',
+        None,
+    ),
+    (
+        "b",
+        'lambda x: "A" in x',
+        "pl.col(\"b\").str.contains(pl.lit('A'), literal=True)",
+        None,
+    ),
+    (
+        "b",
+        "lambda x: x not in MY_STRING",
+        '~pl.lit(MY_STRING).str.contains(pl.col("b"), literal=True)',
+        None,
+    ),
+    (
+        "b",
+        "lambda x: x in x",
+        'pl.col("b").str.contains(pl.col("b"), literal=True)',
+        None,
+    ),
+    (
+        "b",
+        'lambda x: "test" in x',
+        "pl.col(\"b\").str.contains(pl.lit('test'), literal=True)",
+        None,
+    ),
+    (
+        "b",
+        'lambda x: x not in "hello"',
+        "~pl.lit('hello').str.contains(pl.col(\"b\"), literal=True)",
+        None,
+    ),
+    # ---------------------------------------------
+    # constants
+    # ---------------------------------------------
     ("a", "lambda x: MY_CONSTANT + x", 'MY_CONSTANT + pl.col("a")', None),
     (
         "a",
@@ -310,6 +369,9 @@
     "MY_CONSTANT": MY_CONSTANT,
     "MY_DICT": MY_DICT,
     "MY_LIST": MY_LIST,
+    "MY_STRING": MY_STRING,
+    "MY_SUBSTRING": MY_SUBSTRING,
+    "MY_COLLECTION": MY_COLLECTION,
     "cosh": cosh,
     "datetime": datetime,
     "dt": dt,
@@ -601,3 +663,79 @@ def plus(value: int, amount: int) -> int:
     df = pl.DataFrame(data)
     # should not warn
     _ = df["a"].map_elements(partial(plus, amount=1))
+
+
+@pytest.mark.filterwarnings(
+    "ignore:.*:polars.exceptions.PolarsInefficientMapWarning",
+    "ignore:.*:polars.exceptions.MapWithoutReturnDtypeWarning",
+)
+@pytest.mark.parametrize(
+    "pattern",
+    [
+        ".",  # regex: matches any character
+        "^",  # regex: start of string
+        "$",  # regex: end of string
+        "[0]",  # regex: character class
+        "a|b",  # regex: alternation
+        "a+",  # regex: one or more
+        "a?",  # regex: zero or one
+    ],
+)
+def test_string_containment_regex_metacharacters_17182(pattern: str) -> None:
+    """The suggested str.contains must use literal matching, not regex."""
+    df = pl.DataFrame({"b": [f"x{pattern}y", "xyz", pattern, "hello"]})
+
+    # What the lambda actually does (literal matching)
+    result_lambda = df.select(
+        pl.col("b").map_elements(
+            lambda x: x.find(pattern) >= 0,  # equivalent to `pattern in x`
+            return_dtype=pl.Boolean,
+        )
+    )
+
+    # Get the suggested expression from BytecodeParser
+    func = lambda x: pattern in x  # noqa: E731
+    parser = BytecodeParser(func, map_target="expr")
+    suggested = parser.to_expression("b")
+
+    # The suggested expression should produce the same results as the lambda
+    result_suggested = df.select(eval(suggested, {"pl": pl, "pattern": pattern}))
+
+    assert_frame_equal(result_lambda, result_suggested)
+
+
+@pytest.mark.filterwarnings(
+    "ignore:.*:polars.exceptions.PolarsInefficientMapWarning",
+    "ignore:.*:polars.exceptions.MapWithoutReturnDtypeWarning",
+)
+@pytest.mark.parametrize(
+    "pattern",
+    [
+        "*",  # regex: zero or more (invalid without preceding expr)
+        "(",  # regex: unclosed group
+        ")",  # regex: unopened group
+        "[",  # regex: unclosed character class
+    ],
+)
+def test_string_containment_invalid_regex_17182(pattern: str) -> None:
+    """Patterns that are invalid regex but valid for Python's `in` operator."""
+    df = pl.DataFrame({"b": [f"x{pattern}y", "xyz", pattern, "hello"]})
+
+    # What the lambda actually does (literal matching)
+    result_lambda = df.select(
+        pl.col("b").map_elements(
+            lambda x: x.find(pattern) >= 0,  # equivalent to `pattern in x`
+            return_dtype=pl.Boolean,
+        )
+    )
+
+    # Get the suggested expression from BytecodeParser
+    func = lambda x: pattern in x  # noqa: E731
+    parser = BytecodeParser(func, map_target="expr")
+    suggested = parser.to_expression("b")
+
+    # The suggested expression must work without regex errors
+    # and produce the same results as the lambda
+    result_suggested = df.select(eval(suggested, {"pl": pl, "pattern": pattern}))
+
+    assert_frame_equal(result_lambda, result_suggested)