Feat(table_diff): Add option for case-insensitive schema comparisons

erindru · erindru · commit a93b7e90983c · 2025-06-05T21:37:54.000Z
diff --git a/sqlmesh/cli/main.py b/sqlmesh/cli/main.py
@@ -938,6 +938,11 @@ def create_external_models(obj: Context, **kwargs: t.Any) -> None:
     multiple=True,
     help="Specify one or more models to data diff. Use wildcards to diff multiple models. Ex: '*' (all models with applied plan diffs), 'demo.model+' (this and downstream models), 'git:feature_branch' (models with direct modifications in this branch only)",
 )
+@click.option(
+    "--schema-diff-ignore-case",
+    is_flag=True,
+    help="If set, when performing a schema diff the case of column names is ignored when matching between the two schemas. For example, 'col_a' in the source schema and 'COL_A' in the target schema will be treated as the same column.",
+)
 @click.pass_obj
 @error_handler
 @cli_analytics
diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py
@@ -1673,6 +1673,7 @@ def table_diff(
         skip_grain_check: bool = False,
         warn_grain_check: bool = False,
         temp_schema: t.Optional[str] = None,
+        schema_diff_ignore_case: bool = False,
     ) -> t.List[TableDiff]:
         """Show a diff between two tables.
 
@@ -1796,6 +1797,7 @@ def table_diff(
                             show=show,
                             temp_schema=temp_schema,
                             skip_grain_check=skip_grain_check,
+                            schema_diff_ignore_case=schema_diff_ignore_case,
                         ),
                         tasks_num=tasks_num,
                     )
@@ -1821,6 +1823,7 @@ def table_diff(
                     on=on,
                     skip_columns=skip_columns,
                     where=where,
+                    schema_diff_ignore_case=schema_diff_ignore_case,
                 )
             ]
 
@@ -1845,6 +1848,7 @@ def _model_diff(
         show: bool = True,
         temp_schema: t.Optional[str] = None,
         skip_grain_check: bool = False,
+        schema_diff_ignore_case: bool = False,
     ) -> TableDiff:
         self.console.start_table_diff_model_progress(model.name)
 
@@ -1860,6 +1864,7 @@ def _model_diff(
             target=target,
             source_alias=source_alias,
             target_alias=target_alias,
+            schema_diff_ignore_case=schema_diff_ignore_case,
         )
 
         if show:
@@ -1883,6 +1888,7 @@ def _table_diff(
         model: t.Optional[Model] = None,
         skip_columns: t.Optional[t.List[str]] = None,
         where: t.Optional[str | exp.Condition] = None,
+        schema_diff_ignore_case: bool = False,
     ) -> TableDiff:
         if not on:
             raise SQLMeshError(
@@ -1902,6 +1908,7 @@ def _table_diff(
             decimals=decimals,
             model_name=model.name if model else None,
             model_dialect=model.dialect if model else None,
+            schema_diff_ignore_case=schema_diff_ignore_case,
         )
 
     @python_api_analytics
diff --git a/sqlmesh/core/table_diff.py b/sqlmesh/core/table_diff.py
@@ -36,29 +36,95 @@ class SchemaDiff(PydanticModel, frozen=True):
     source_alias: t.Optional[str] = None
     target_alias: t.Optional[str] = None
     model_name: t.Optional[str] = None
+    ignore_case: bool = False
+
+    @property
+    def _normalized_source_schema(self) -> t.Dict[str, exp.DataType]:
+        """Source schema, with column names lowercased when `ignore_case` is set."""
+        return (
+            self._lowercase_schema_names(self.source_schema)
+            if self.ignore_case
+            else self.source_schema
+        )
+
+    @property
+    def _normalized_target_schema(self) -> t.Dict[str, exp.DataType]:
+        """Target schema, with column names lowercased when `ignore_case` is set."""
+        return (
+            self._lowercase_schema_names(self.target_schema)
+            if self.ignore_case
+            else self.target_schema
+        )
+
+    def _lowercase_schema_names(
+        self, schema: t.Dict[str, exp.DataType]
+    ) -> t.Dict[str, exp.DataType]:
+        """Return a copy of `schema` with every column name lowercased."""
+        # `dtype` rather than `t` so the `typing` alias is not shadowed
+        return {c.lower(): dtype for c, dtype in schema.items()}
+
+    def _original_column_name(
+        self, maybe_lowercased_column_name: str, schema: t.Dict[str, exp.DataType]
+    ) -> str:
+        """Map a name used for matching back to its original casing in `schema`.
+
+        When `ignore_case` is off, names are never lowercased so the input is returned as-is.
+        Otherwise the first column in `schema` whose lowercased name matches is returned.
+        """
+        if not self.ignore_case:
+            return maybe_lowercased_column_name
+
+        return next(c for c in schema if c.lower() == maybe_lowercased_column_name)
 
     @property
     def added(self) -> t.List[t.Tuple[str, exp.DataType]]:
         """Added columns."""
-        return [(c, t) for c, t in self.target_schema.items() if c not in self.source_schema]
+        # Build the normalized source schema once rather than once per target column
+        normalized_source = self._normalized_source_schema
+        return [
+            (self._original_column_name(c, self.target_schema), dtype)
+            for c, dtype in self._normalized_target_schema.items()
+            if c not in normalized_source
+        ]
 
     @property
     def removed(self) -> t.List[t.Tuple[str, exp.DataType]]:
         """Removed columns."""
-        return [(c, t) for c, t in self.source_schema.items() if c not in self.target_schema]
+        # Build the normalized target schema once rather than once per source column
+        normalized_target = self._normalized_target_schema
+        return [
+            (self._original_column_name(c, self.source_schema), dtype)
+            for c, dtype in self._normalized_source_schema.items()
+            if c not in normalized_target
+        ]
 
     @property
     def modified(self) -> t.Dict[str, t.Tuple[exp.DataType, exp.DataType]]:
         """Columns with modified types."""
         modified = {}
-        for column in self.source_schema.keys() & self.target_schema.keys():
-            source_type = self.source_schema[column]
-            target_type = self.target_schema[column]
+        # Build the normalized schemas once rather than on every loop iteration
+        normalized_source = self._normalized_source_schema
+        normalized_target = self._normalized_target_schema
+        for column in normalized_source.keys() & normalized_target.keys():
+            source_type = normalized_source[column]
+            target_type = normalized_target[column]
 
             if source_type != target_type:
                 modified[column] = (source_type, target_type)
+
+        if self.ignore_case:
+            # Report modified columns using the casing from the source schema
+            modified = {
+                self._original_column_name(c, self.source_schema): dt for c, dt in modified.items()
+            }
+
         return modified
 
+    @property
+    def has_changes(self) -> bool:
+        """Whether the schema has any added, removed or modified columns."""
+        return bool(self.added or self.removed or self.modified)
+
 
 class RowDiff(PydanticModel, frozen=True):
     """Summary statistics and a sample dataframe."""
@@ -183,6 +249,7 @@ def __init__(
         model_name: t.Optional[str] = None,
         model_dialect: t.Optional[str] = None,
         decimals: int = 3,
+        schema_diff_ignore_case: bool = False,
     ):
         if not isinstance(adapter, RowDiffMixin):
             raise ValueError(f"Engine {adapter} doesnt support RowDiff")
@@ -198,6 +265,7 @@ def __init__(
         self.model_name = model_name
         self.model_dialect = model_dialect
         self.decimals = decimals
+        self.schema_diff_ignore_case = schema_diff_ignore_case
 
         # Support environment aliases for diff output improvement in certain cases
         self.source_alias = source_alias
@@ -282,6 +350,7 @@ def schema_diff(self) -> SchemaDiff:
             source_alias=self.source_alias,
             target_alias=self.target_alias,
             model_name=self.model_name,
+            ignore_case=self.schema_diff_ignore_case,
         )
 
     def row_diff(
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
@@ -1,4 +1,5 @@
 import logging
+import string
 from contextlib import contextmanager
 from os import getcwd, path, remove
 from pathlib import Path
@@ -1759,3 +1760,34 @@ def test_ignore_warnings(runner: CliRunner, tmp_path: Path) -> None:
     )
     assert result.exit_code == 0
     assert audit_warning not in result.output
+
+
+def test_table_diff_schema_diff_ignore_case(runner: CliRunner, tmp_path: Path):
+    from sqlmesh.core.engine_adapter import DuckDBEngineAdapter
+
+    create_example_project(tmp_path)
+
+    ctx = Context(paths=tmp_path)
+    assert isinstance(ctx.engine_adapter, DuckDBEngineAdapter)
+
+    ctx.engine_adapter.execute('create table t1 (id int, "naME" varchar)')
+    ctx.engine_adapter.execute('create table t2 (id int, "name" varchar)')
+
+    # default behavior (case sensitive): "naME" is reported as removed and "name" as added
+    result = runner.invoke(
+        cli,
+        ["--paths", str(tmp_path), "table_diff", "t1:t2", "-o", "id"],
+    )
+    assert result.exit_code == 0
+    stripped_output = "".join((x for x in result.output if x in string.printable))
+    assert "Added Columns:\n    name (TEXT)" in stripped_output
+    assert "Removed Columns:\n     naME (TEXT)" in stripped_output
+
+    # ignore case: "naME" and "name" are matched, so the output reports that the schemas match
+    result = runner.invoke(
+        cli,
+        ["--paths", str(tmp_path), "table_diff", "t1:t2", "-o", "id", "--schema-diff-ignore-case"],
+    )
+    assert result.exit_code == 0
+    stripped_output = "".join((x for x in result.output if x in string.printable))
+    assert "Schema Diff Between 'T1' and 'T2':\n Schemas match" in stripped_output
diff --git a/tests/core/test_table_diff.py b/tests/core/test_table_diff.py
@@ -11,7 +11,7 @@
 from sqlmesh.core.context import Context
 from sqlmesh.core.config import AutoCategorizationMode, CategorizerConfig, DuckDBConnectionConfig
 from sqlmesh.core.model import SqlModel, load_sql_based_model
-from sqlmesh.core.table_diff import TableDiff
+from sqlmesh.core.table_diff import TableDiff, SchemaDiff
 import numpy as np  # noqa: TID253
 from sqlmesh.utils.errors import SQLMeshError
 
@@ -944,3 +944,91 @@ def test_data_diff_multiple_models_lacking_grain(sushi_context_fixed_date, capsy
     assert row_diff1.t_sample.shape == (0, 2)
     assert row_diff1.joined_sample.shape == (2, 3)
     assert row_diff1.sample.shape == (2, 4)
+
+
+def test_schema_diff_ignore_case():
+    # no changes: column names differ only by case and the types are identical
+    table_a = {"COL_A": exp.DataType.build("varchar"), "cOl_b": exp.DataType.build("int")}
+    table_b = {"col_a": exp.DataType.build("varchar"), "COL_b": exp.DataType.build("int")}
+
+    diff = SchemaDiff(
+        source="table_a",
+        source_schema=table_a,
+        target="table_b",
+        target_schema=table_b,
+        ignore_case=True,
+    )
+
+    assert not diff.has_changes
+
+    # added in target: "cOL__C" has no case-insensitive match in the source
+    table_a = {"COL_A": exp.DataType.build("varchar"), "cOl_b": exp.DataType.build("int")}
+    table_b = {
+        "col_a": exp.DataType.build("varchar"),
+        "COL_b": exp.DataType.build("int"),
+        "cOL__C": exp.DataType.build("date"),
+    }
+
+    diff = SchemaDiff(
+        source="table_a",
+        source_schema=table_a,
+        target="table_b",
+        target_schema=table_b,
+        ignore_case=True,
+    )
+
+    assert diff.has_changes
+    assert len(diff.added) == 1
+    assert diff.added[0] == (
+        "cOL__C",
+        exp.DataType.build("date"),
+    )  # notice: case preserved on output
+    assert not diff.removed
+    assert not diff.modified
+
+    # removed from source: "cOL_fo0" has no case-insensitive match in the target
+    table_a = {
+        "cOL_fo0": exp.DataType.build("float"),
+        "COL_A": exp.DataType.build("varchar"),
+        "cOl_b": exp.DataType.build("int"),
+    }
+    table_b = {"col_a": exp.DataType.build("varchar"), "COL_b": exp.DataType.build("int")}
+
+    diff = SchemaDiff(
+        source="table_a",
+        source_schema=table_a,
+        target="table_b",
+        target_schema=table_b,
+        ignore_case=True,
+    )
+
+    assert diff.has_changes
+    assert not diff.added
+    assert len(diff.removed) == 1
+    assert diff.removed[0] == (
+        "cOL_fo0",
+        exp.DataType.build("float"),
+    )  # notice: case preserved on output
+    assert not diff.modified
+
+    # column type change: "CoL_A" and "col_a" match by name but varchar != date
+    table_a = {"CoL_A": exp.DataType.build("varchar"), "cOl_b": exp.DataType.build("int")}
+    table_b = {"col_a": exp.DataType.build("date"), "COL_b": exp.DataType.build("int")}
+
+    diff = SchemaDiff(
+        source="table_a",
+        source_schema=table_a,
+        target="table_b",
+        target_schema=table_b,
+        ignore_case=True,
+    )
+
+    assert diff.has_changes
+    assert not diff.added
+    assert not diff.removed
+    assert diff.modified == {
+        "CoL_A": (
+            exp.DataType.build("varchar"),
+            exp.DataType.build("date"),
+        )  # notice: source casing used on output
+    }