tobymao · georgesittas · Sep 27, 2024 · Sep 11, 2024 · Sep 26, 2024
diff --git a/sqlglot/dialects/spark2.py b/sqlglot/dialects/spark2.py
@@ -13,14 +13,19 @@
     unit_to_str,
 )
 from sqlglot.dialects.hive import Hive
-from sqlglot.helper import seq_get
+from sqlglot.helper import seq_get, ensure_list
 from sqlglot.transforms import (
     preprocess,
     remove_unique_constraints,
     ctas_with_tmp_tables_to_create_tmp_view,
     move_schema_columns_to_partitioned_by,
 )
 
+if t.TYPE_CHECKING:
+    from sqlglot._typing import E
+
+    from sqlglot.optimizer.annotate_types import TypeAnnotator
+
 
 def _map_sql(self: Spark2.Generator, expression: exp.Map) -> str:
     keys = expression.args.get("keys")
@@ -110,10 +115,48 @@ def temporary_storage_provider(expression: exp.Expression) -> exp.Expression:
     return expression
 
 
+def _annotate_by_similar_args(
+    self: TypeAnnotator, expression: E, *args: str, target_type: exp.DataType | exp.DataType.Type
+) -> E:
+    """
+    Annotates the args of `expression`, then types it from the args named in `args`:
+    - If any of them is of target_type, the expr is inferred as target_type
+    - Else UNKNOWN if any of them is UNKNOWN, otherwise the type of the last one examined
+    """
+    self._annotate_args(expression)
+
+    expressions: t.List[exp.Expression] = []
+    for arg in args:
+        arg_expr = expression.args.get(arg)
+        expressions.extend(expr for expr in ensure_list(arg_expr) if expr)  # skip falsy items
+
+    last_datatype = None  # stays None when no arg expression was collected
+
+    has_unknown = False
+    for expr in expressions:
+        if expr.is_type(exp.DataType.Type.UNKNOWN):
+            has_unknown = True  # keep scanning: a later target_type arg still wins
+        elif expr.is_type(target_type):
+            has_unknown = False  # target_type overrides any UNKNOWN seen earlier
+            last_datatype = target_type
+            break
+        else:
+            last_datatype = expr.type  # tentative result; later args may replace it
+
+    self._set_type(expression, exp.DataType.Type.UNKNOWN if has_unknown else last_datatype)
+    return expression
+
+
 class Spark2(Hive):
     ANNOTATORS = {
         **Hive.ANNOTATORS,
         exp.Substring: lambda self, e: self._annotate_by_args(e, "this"),
+        exp.Concat: lambda self, e: _annotate_by_similar_args(
+            self, e, "expressions", target_type=exp.DataType.Type.TEXT
+        ),
+        exp.Pad: lambda self, e: _annotate_by_similar_args(
+            self, e, "this", "fill_pattern", target_type=exp.DataType.Type.TEXT
+        ),
     }
 
     class Parser(Hive.Parser):

diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
@@ -1378,25 +1378,41 @@ def gen_expr(depth: int) -> exp.Expression:
         self.assertEqual(18, normalization_distance(gen_expr(3), max_=100))
         self.assertEqual(110, normalization_distance(gen_expr(10), max_=100))
 
-    def test_custom_annotators(self):
+    def test_spark_annotators(self):
+        """Check the types annotated for Spark's SUBSTRING, CONCAT, LPAD and RPAD calls."""
+
+        spark_schema = {"tbl": {"bin_col": "BINARY", "str_col": "STRING"}}
+
+        from sqlglot.dialects import Dialect  # used by the helper below
+
+        def _assert_func_return_type(func: str, dialect: str, target_type: str):
+            ast = parse_one(f"SELECT {func} FROM tbl", read=dialect)  # single projection
+            annotators = Dialect.get_or_raise(dialect).ANNOTATORS  # the dialect's own rules
+            annotated = annotate_types(ast, annotators=annotators, schema=spark_schema)
+
+            self.assertEqual(
+                annotated.expressions[0].type.sql(dialect),  # inferred type of func
+                exp.DataType.build(target_type).sql(dialect),  # expected type
+            )
+
+        str_col, bin_col = "tbl.str_col", "tbl.bin_col"
+
         # In Spark hierarchy, SUBSTRING result type is dependent on input expr type
         for dialect in ("spark2", "spark", "databricks"):
-            for expr_type_pair in (
-                ("col", "STRING"),
-                ("col", "BINARY"),
-                ("'str_literal'", "STRING"),
-                ("CAST('str_literal' AS BINARY)", "BINARY"),
-            ):
-                with self.subTest(
-                    f"Testing {dialect}'s SUBSTRING() result type for {expr_type_pair}"
-                ):
-                    expr, type = expr_type_pair
-                    ast = parse_one(f"SELECT substring({expr}, 2, 3) AS x FROM tbl", read=dialect)
-
-                    subst_type = (
-                        optimizer.optimize(ast, schema={"tbl": {"col": type}}, dialect=dialect)
-                        .expressions[0]
-                        .type
-                    )
-
-                    self.assertEqual(subst_type.sql(dialect), exp.DataType.build(type).sql(dialect))
+            _assert_func_return_type(f"SUBSTRING({str_col}, 0, 0)", dialect, "STRING")
+            _assert_func_return_type(f"SUBSTRING({bin_col}, 0, 0)", dialect, "BINARY")
+
+            _assert_func_return_type(f"CONCAT({bin_col}, {bin_col})", dialect, "BINARY")
+            _assert_func_return_type(f"CONCAT({bin_col}, {str_col})", dialect, "STRING")
+            _assert_func_return_type(f"CONCAT({str_col}, {bin_col})", dialect, "STRING")
+            _assert_func_return_type(f"CONCAT({str_col}, {str_col})", dialect, "STRING")
+
+            _assert_func_return_type(f"CONCAT({str_col}, foo)", dialect, "STRING")
+            _assert_func_return_type(f"CONCAT({bin_col}, bar)", dialect, "UNKNOWN")
+            _assert_func_return_type("CONCAT(foo, bar)", dialect, "UNKNOWN")  # not in schema
+
+            for func in ("LPAD", "RPAD"):  # presumably both parse to exp.Pad; confirm
+                _assert_func_return_type(f"{func}({bin_col}, 1, {bin_col})", dialect, "BINARY")
+                _assert_func_return_type(f"{func}({bin_col}, 1, {str_col})", dialect, "STRING")
+                _assert_func_return_type(f"{func}({str_col}, 1, {bin_col})", dialect, "STRING")
+                _assert_func_return_type(f"{func}({str_col}, 1, {str_col})", dialect, "STRING")