tobymao · georgesittas · Sep 27, 2024 · Sep 11, 2024 · Sep 26, 2024
diff --git a/sqlglot/dialects/spark2.py b/sqlglot/dialects/spark2.py
@@ -13,14 +13,19 @@
     unit_to_str,
 )
 from sqlglot.dialects.hive import Hive
-from sqlglot.helper import seq_get
+from sqlglot.helper import seq_get, ensure_list
 from sqlglot.transforms import (
     preprocess,
     remove_unique_constraints,
     ctas_with_tmp_tables_to_create_tmp_view,
     move_schema_columns_to_partitioned_by,
 )
 
+if t.TYPE_CHECKING:
+    from sqlglot._typing import E
+
+    from sqlglot.optimizer.annotate_types import TypeAnnotator
+
 
 def _map_sql(self: Spark2.Generator, expression: exp.Map) -> str:
     keys = expression.args.get("keys")
@@ -110,10 +115,41 @@ def temporary_storage_provider(expression: exp.Expression) -> exp.Expression:
     return expression
 
 
+def _annotate_by_same_args(
+    self: TypeAnnotator, expression: E, *args: str, target_type: exp.DataType | exp.DataType.Type
+) -> E:
+    """
+    Annotates @expression with the type shared by all of its present @args values;
+    if they differ, or none are present / typed, falls back to @target_type
+    """
+    self._annotate_args(expression)
+
+    expressions: t.List[exp.Expression] = []
+    for arg in args:
+        arg_expr = expression.args.get(arg)
+        expressions.extend(expr for expr in ensure_list(arg_expr) if expr)
+
+    last_datatype = expressions[0].type if expressions else None
+
+    for expr in expressions:
+        if not expr.is_type(last_datatype):
+            last_datatype = None
+            break
+
+    self._set_type(expression, last_datatype or target_type)
+    return expression
+
+
 class Spark2(Hive):
     ANNOTATORS = {
         **Hive.ANNOTATORS,
         exp.Substring: lambda self, e: self._annotate_by_args(e, "this"),
+        exp.Concat: lambda self, e: _annotate_by_same_args(
+            self, e, "expressions", target_type=exp.DataType.Type.TEXT
+        ),
+        exp.Pad: lambda self, e: _annotate_by_same_args(
+            self, e, "this", "fill_pattern", target_type=exp.DataType.Type.TEXT
+        ),
     }
 
     class Parser(Hive.Parser):

diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
@@ -1378,25 +1378,31 @@ def gen_expr(depth: int) -> exp.Expression:
         self.assertEqual(18, normalization_distance(gen_expr(3), max_=100))
         self.assertEqual(110, normalization_distance(gen_expr(10), max_=100))
 
-    def test_custom_annotators(self):
+    def test_spark_annotators(self):
+        """Test Spark annotators, mainly built-in string/binary functions"""
+
+        schema = {"tbl": {"bin_col": "BINARY", "str_col": "STRING"}}
+
+        def _assert_func_return_type(func: str, dialect: str, target_type: str):
+            ast = parse_one(f"SELECT {func} FROM tbl", read=dialect)
+            optimized = optimizer.optimize(ast, schema=schema, dialect=dialect)
+            self.assertEqual(
+                optimized.expressions[0].type.sql(dialect),
+                exp.DataType.build(target_type).sql(dialect),
+            )
+
         # In Spark hierarchy, SUBSTRING result type is dependent on input expr type
         for dialect in ("spark2", "spark", "databricks"):
-            for expr_type_pair in (
-                ("col", "STRING"),
-                ("col", "BINARY"),
-                ("'str_literal'", "STRING"),
-                ("CAST('str_literal' AS BINARY)", "BINARY"),
-            ):
-                with self.subTest(
-                    f"Testing {dialect}'s SUBSTRING() result type for {expr_type_pair}"
-                ):
-                    expr, type = expr_type_pair
-                    ast = parse_one(f"SELECT substring({expr}, 2, 3) AS x FROM tbl", read=dialect)
-
-                    subst_type = (
-                        optimizer.optimize(ast, schema={"tbl": {"col": type}}, dialect=dialect)
-                        .expressions[0]
-                        .type
-                    )
-
-                    self.assertEqual(subst_type.sql(dialect), exp.DataType.build(type).sql(dialect))
+            _assert_func_return_type("SUBSTRING(str_col, 0, 0)", dialect, "STRING")
+            _assert_func_return_type("SUBSTRING(bin_col, 0, 0)", dialect, "BINARY")
+
+            _assert_func_return_type("CONCAT(bin_col, bin_col)", dialect, "BINARY")
+            _assert_func_return_type("CONCAT(bin_col, str_col)", dialect, "STRING")
+            _assert_func_return_type("CONCAT(str_col, bin_col)", dialect, "STRING")
+            _assert_func_return_type("CONCAT(str_col, str_col)", dialect, "STRING")
+
+            for func in ("LPAD", "RPAD"):
+                _assert_func_return_type(f"{func}(bin_col, 1, bin_col)", dialect, "BINARY")
+                _assert_func_return_type(f"{func}(bin_col, 1, str_col)", dialect, "STRING")
+                _assert_func_return_type(f"{func}(str_col, 1, bin_col)", dialect, "STRING")
+                _assert_func_return_type(f"{func}(str_col, 1, str_col)", dialect, "STRING")