refactor!: Improve typing annotations for planner, schema, serde, and transforms modules (#7579)

OutSquareCapital · Copilot · georgesittas · georgesittas · commit 6944a6c3ff5a · 2026-05-05T14:13:03.000+03:00
* refactor: improve planner typing annotations

* refactor: improve planner, schema, and serde annotations

Co-authored-by: Copilot <copilot@github.com>

* refactor: added lazy annotations import to time module

Co-authored-by: Copilot <copilot@github.com>

* refactor: improved transforms annotations

Co-authored-by: Copilot <copilot@github.com>

* fix: make the `StackVal` type alias compatible with Python 3.9

Co-authored-by: Copilot <copilot@github.com>

* fix: since the AST is mutated in `transforms::eliminate_qualify`, we do need to collect the iterator into a list first

Co-authored-by: Copilot <copilot@github.com>

* fix: mypyc is buggy with `object::__module__` access, so we can't narrow the `Expr` path in `serde::dump` to a precise type

* refactor: revert `type` -> `isinstance` usage in `serde::dump` function body

* refactor: revert `nodes` list type in `serde::load`

* refactor: ignore `node` type in `serde::load` body to avoid errors

* refactor: Apply suggestions from code review

Co-authored-by: Jo <46752250+georgesittas@users.noreply.github.com>

* fix: use `Any` for the key type of the trie mapping in `time::format_time`

* refactor: move comment of joins_ons in `transforms::eliminate_join_marks` above the line

* refactor: change the function body of `transforms::move_schema_columns_to_partitioned_by` into something more type safe

* fix: revert instance checks in `serde::dump`

* fix: revert type hint of `node` in `serde::load`

* refactor: make the `_sql_handler` variable in `transforms::preprocess::_to_sql` a Protocol

---------

Co-authored-by: Copilot <copilot@github.com>
Co-authored-by: Jo <46752250+georgesittas@users.noreply.github.com>
diff --git a/sqlglot/planner.py b/sqlglot/planner.py
@@ -11,8 +11,8 @@
 
 class Plan:
     def __init__(self, expression: exp.Expr) -> None:
-        self.expression = expression.copy()
-        self.root = Step.from_expression(self.expression)
+        self.expression: exp.Expr = expression.copy()
+        self.root: Step = Step.from_expression(self.expression)
         self._dag: dict[Step, set[Step]] = {}
 
     @property
@@ -93,10 +93,10 @@ def from_expression(cls, expression: exp.Expr, ctes: dict[str, Step] | None = No
         """
         ctes = ctes or {}
         expression = expression.unnest()
-        with_ = expression.args.get("with_")
+        with_: exp.With | None = expression.args.get("with_")
 
         # CTEs break the mold of scope and introduce themselves to all in the context.
-        if with_:
+        if with_ is not None:
             ctes = ctes.copy()
             for cte in with_.expressions:
                 step = Step.from_expression(cte.this, ctes)
@@ -112,23 +112,22 @@ def from_expression(cls, expression: exp.Expr, ctes: dict[str, Step] | None = No
         else:
             step = Scan()
 
-        joins = expression.args.get("joins")
+        joins: list[exp.Join] | None = expression.args.get("joins")
 
-        if joins:
+        if joins is not None:
             join = Join.from_joins(joins, ctes)
             join.name = step.name
             join.source_name = step.name
             join.add_dependency(step)
             step = join
-
-        projections: list[
-            exp.Expr
-        ] = []  # final selects in this chain of steps representing a select
-        operands = {}  # intermediate computations of agg funcs eg x + 1 in SUM(x + 1)
-        aggregations = {}
+        # final selects in this chain of steps representing a select
+        projections: list[exp.Expr] = []
+        # intermediate computations of agg funcs eg x + 1 in SUM(x + 1)
+        operands: dict[exp.Expr, str] = {}
+        aggregations: dict[exp.Expr, None] = {}
         next_operand_name = name_sequence("_a_")
 
-        def extract_agg_operands(expression):
+        def extract_agg_operands(expression: exp.Expr) -> bool:
             agg_funcs = tuple(expression.find_all(exp.AggFunc))
             if agg_funcs:
                 aggregations[expression] = None
@@ -144,7 +143,7 @@ def extract_agg_operands(expression):
 
             return bool(agg_funcs)
 
-        def set_ops_and_aggs(step):
+        def set_ops_and_aggs(step) -> None:
             step.operands = tuple(alias(operand, alias_) for operand, alias_ in operands.items())
             step.aggregations = list(aggregations)
 
@@ -155,21 +154,21 @@ def set_ops_and_aggs(step):
             else:
                 projections.append(e)
 
-        where = expression.args.get("where")
+        where: exp.Where | None = expression.args.get("where")
 
-        if where:
+        if where is not None:
             step.condition = where.this
 
-        group = expression.args.get("group")
+        group: exp.Group | None = expression.args.get("group")
 
-        if group or aggregations:
+        if group is not None or aggregations:
             aggregate = Aggregate()
             aggregate.source = step.name
             aggregate.name = step.name
 
-            having = expression.args.get("having")
+            having: exp.Having | None = expression.args.get("having")
 
-            if having:
+            if having is not None:
                 if extract_agg_operands(exp.alias_(having.this, "_h", quoted=True)):
                     aggregate.condition = exp.column("_h", step.name, quoted=True)
                 else:
@@ -205,10 +204,10 @@ def set_ops_and_aggs(step):
         else:
             aggregate = None
 
-        order = expression.args.get("order")
+        order: exp.Order | None = expression.args.get("order")
 
-        if order:
-            if aggregate and isinstance(step, Aggregate):
+        if order is not None:
+            if aggregate is not None and isinstance(step, Aggregate):
                 for i, ordered in enumerate(order.expressions):
                     if extract_agg_operands(exp.alias_(ordered.this, f"_o_{i}", quoted=True)):
                         ordered.this.replace(exp.column(f"_o_{i}", step.name, quoted=True))
@@ -234,9 +233,9 @@ def set_ops_and_aggs(step):
             distinct.add_dependency(step)
             step = distinct
 
-        limit = expression.args.get("limit")
+        limit: exp.Limit | None = expression.args.get("limit")
 
-        if limit:
+        if limit is not None:
             step.limit = int(limit.text("expression"))
 
         return step
@@ -304,7 +303,7 @@ def _to_s(self, _indent: str) -> list[str]:
 class Scan(Step):
     @classmethod
     def from_expression(cls, expression: exp.Expr, ctes: dict[str, Step] | None = None) -> Step:
-        table = expression
+        table: exp.Expr = expression
         alias_ = expression.alias_or_name
 
         if isinstance(expression, exp.Subquery):
@@ -356,7 +355,7 @@ def _to_s(self, indent: str) -> list[str]:
         lines = [f"{indent}Source: {self.source_name or self.name}"]
         for name, join in self.joins.items():
             lines.append(f"{indent}{name}: {join['side'] or 'INNER'}")
-            join_key = ", ".join(str(key) for key in t.cast(list, join.get("join_key") or []))
+            join_key = ", ".join(str(key) for key in t.cast(list[str], join.get("join_key") or []))
             if join_key:
                 lines.append(f"{indent}Key: {join_key}")
             if join.get("condition"):
@@ -396,7 +395,7 @@ def _to_s(self, indent: str) -> list[str]:
 class Sort(Step):
     def __init__(self) -> None:
         super().__init__()
-        self.key = None
+        self.key: list[exp.Expr] | None = None
 
     def _to_s(self, indent: str) -> list[str]:
         lines = [f"{indent}Key:"]
@@ -408,18 +407,12 @@ def _to_s(self, indent: str) -> list[str]:
 
 
 class SetOperation(Step):
-    def __init__(
-        self,
-        op: type[exp.Expr],
-        left: str | None,
-        right: str | None,
-        distinct: bool = False,
-    ) -> None:
+    def __init__(self, op: type[exp.Expr], left: str, right: str, distinct: bool = False) -> None:
         super().__init__()
-        self.op = op
-        self.left = left
-        self.right = right
-        self.distinct = distinct
+        self.op: type[exp.Expr] = op
+        self.left: str = left
+        self.right: str = right
+        self.distinct: bool = distinct
 
     @classmethod
     def from_expression(
@@ -442,15 +435,15 @@ def from_expression(
         step.add_dependency(left)
         step.add_dependency(right)
 
-        limit = expression.args.get("limit")
+        limit: exp.Limit | None = expression.args.get("limit")
 
-        if limit:
+        if limit is not None:
             step.limit = int(limit.text("expression"))
 
         return step
 
     def _to_s(self, indent: str) -> list[str]:
-        lines = []
+        lines: list[str] = []
         if self.distinct:
             lines.append(f"{indent}Distinct: {self.distinct}")
         return lines
diff --git a/sqlglot/schema.py b/sqlglot/schema.py
@@ -18,7 +18,7 @@
     from collections.abc import Sequence
     from typing_extensions import Unpack
 
-    ColumnMapping = t.Union[dict, str, list]
+    ColumnMapping = t.Union[dict[str, t.Any], str, list[str]]
 
 
 @trait
@@ -344,7 +344,7 @@ def from_mapping_schema(cls, mapping_schema: MappingSchema) -> MappingSchema:
     def find(
         self, table: exp.Table, raise_on_missing: bool = True, ensure_data_types: bool = False
     ) -> t.Any | None:
-        schema = super().find(
+        schema: dict[str, object] | None = super().find(
             table, raise_on_missing=raise_on_missing, ensure_data_types=ensure_data_types
         )
         if ensure_data_types and isinstance(schema, dict):
@@ -417,7 +417,7 @@ def column_names(
     ) -> list[str]:
         normalized_table = self._normalize_table(table, dialect=dialect, normalize=normalize)
 
-        schema = self.find(normalized_table)
+        schema: dict[str, object] | None = self.find(normalized_table)
         if schema is None:
             return []
 
@@ -440,7 +440,7 @@ def get_column_type(
             column if isinstance(column, str) else column.this, dialect=dialect, normalize=normalize
         )
 
-        table_schema = self.find(normalized_table, raise_on_missing=False)
+        table_schema: dict[str, object] | None = self.find(normalized_table, raise_on_missing=False)
         if table_schema:
             column_type = table_schema.get(normalized_column_name)
 
@@ -500,7 +500,7 @@ def has_column(
             column if isinstance(column, str) else column.this, dialect=dialect, normalize=normalize
         )
 
-        table_schema = self.find(normalized_table, raise_on_missing=False)
+        table_schema: dict[str, object] | None = self.find(normalized_table, raise_on_missing=False)
         return normalized_column_name in table_schema if table_schema else False
 
     def _normalize(self, schema: dict[str, object]) -> dict[str, object]:
@@ -708,7 +708,7 @@ def ensure_schema(
     return MappingSchema(schema, **kwargs)
 
 
-def ensure_column_mapping(mapping: ColumnMapping | None) -> dict:
+def ensure_column_mapping(mapping: ColumnMapping | None) -> dict[str, t.Any]:
     if mapping is None:
         return {}
     elif isinstance(mapping, dict):
diff --git a/sqlglot/serde.py b/sqlglot/serde.py
@@ -3,6 +3,10 @@
 import typing as t
 
 from sqlglot import expressions as exp
+from types import ModuleType
+
+
+StackVal = tuple[t.Any, t.Optional[int], t.Optional[str], bool]
 
 
 INDEX = "i"
@@ -21,8 +25,8 @@ def dump(expression: exp.Expr) -> list[dict[str, t.Any]]:
     Dump an Expr into a JSON serializable List.
     """
     i = 0
-    payloads = []
-    stack: list[tuple[t.Any, int | None, str | None, bool]] = [(expression, None, None, False)]
+    payloads: list[dict[str, t.Any]] = []
+    stack: list[StackVal] = [(expression, None, None, False)]
 
     while stack:
         node, index, arg_key, is_array = stack.pop()
@@ -90,8 +94,8 @@ def load(
             node = payload[VALUE]
 
         nodes.append(node)
-        parent = nodes[payload[INDEX]]
-        arg_key = payload[ARG_KEY]
+        parent: exp.Expr = nodes[payload[INDEX]]
+        arg_key: str = payload[ARG_KEY]
 
         if payload.get(IS_ARRAY):
             parent.append(arg_key, node)
@@ -102,11 +106,11 @@ def load(
 
 
 def _load(payload: dict[str, t.Any]) -> exp.Expr | exp.DType:
-    class_name = payload[CLASS]
+    class_name: str = payload[CLASS]
 
     if class_name == DATA_TYPE:
         return exp.DType(payload[VALUE])
-
+    module: ModuleType
     if "." in class_name:
         module_path, class_name = class_name.rsplit(".", maxsplit=1)
         module = __import__(module_path, fromlist=[class_name])
diff --git a/sqlglot/time.py b/sqlglot/time.py
@@ -1,14 +1,15 @@
-import typing as t
+from __future__ import annotations
 import datetime
+import typing as t
 
 # The generic time format is based on python time.strftime.
 # https://docs.python.org/3/library/time.html#time.strftime
 from sqlglot.trie import TrieResult, in_trie, new_trie
 
 
 def format_time(
-    string: str, mapping: dict[str, str], trie: t.Optional[dict] = None
-) -> t.Optional[str]:
+    string: str, mapping: dict[str, str], trie: dict[t.Any, t.Any] | None = None
+) -> str | None:
     """
     Converts a time string given a mapping.
 
@@ -31,7 +32,7 @@ def format_time(
     size = len(string)
     trie = trie or new_trie(mapping)
     current = trie
-    chunks = []
+    chunks: list[str] = []
     sym = None
 
     while end <= size:
@@ -61,7 +62,7 @@ def format_time(
     return "".join(mapping.get(chars, chars) for chars in chunks)
 
 
-TIMEZONES = {
+TIMEZONES: set[str] = {
     tz.lower()
     for tz in (
         "Africa/Abidjan",
diff --git a/sqlglot/transforms.py b/sqlglot/transforms.py