
Commit d905941

Fix!: Allow python models to emit DataFrames with a different column order (#4348)

1 parent c7571fe commit d905941

File tree: 11 files changed (+245 −41 lines)


pyproject.toml

Lines changed: 3 additions & 4 deletions
@@ -40,9 +40,9 @@ athena = ["PyAthena[Pandas]"]
 azuresql = ["pymssql"]
 bigquery = [
     "google-cloud-bigquery[pandas]",
-    "google-cloud-bigquery-storage"
+    "google-cloud-bigquery-storage",
+    "bigframes>=1.32.0"
 ]
-bigframes = ["bigframes>=1.32.0"]
 clickhouse = ["clickhouse-connect"]
 databricks = ["databricks-sql-connector[pyarrow]"]
 dev = [
@@ -107,8 +107,7 @@ slack = ["slack_sdk"]
 snowflake = [
     "cryptography",
     "snowflake-connector-python[pandas,secure-local-storage]",
-    # as at 2024-08-05, snowflake-snowpark-python is only available up to Python 3.11
-    "snowflake-snowpark-python; python_version<'3.12'",
+    "snowflake-snowpark-python",
 ]
 trino = ["trino"]
 web = [

sqlmesh/core/engine_adapter/base.py

Lines changed: 5 additions & 0 deletions
@@ -246,7 +246,12 @@ def _df_to_source_queries(
         assert isinstance(df, pd.DataFrame)
         num_rows = len(df.index)
         batch_size = sys.maxsize if batch_size == 0 else batch_size
+
+        # we need to ensure that the order of the columns in columns_to_types matches the order of the values;
+        # they can differ if a user specifies columns() on a python model in a different order than what's in the DataFrames emitted by that model
+        df = df[list(columns_to_types)]
         values = list(df.itertuples(index=False, name=None))
+
         return [
             SourceQuery(
                 query_factory=partial(
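
For context, a minimal standalone sketch (not part of the commit; the columns_to_types dict and sample row below are made up) of what the added reorder buys: once the DataFrame is reindexed to the declared column order, the tuples produced by itertuples line up with the columns the model advertised.

import pandas as pd

# hypothetical model declaration order, e.g. columns={"id": "int", "name": "varchar"}
columns_to_types = {"id": "int", "name": "varchar"}

# the model's DataFrame emits the same columns, but in a different order
df = pd.DataFrame([{"name": "foo", "id": 1}])

# reorder the columns to match columns_to_types before flattening into value tuples
df = df[list(columns_to_types)]
values = list(df.itertuples(index=False, name=None))
print(values)  # [(1, 'foo')] -- "id" first, matching the declared order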

sqlmesh/core/engine_adapter/mssql.py

Lines changed: 5 additions & 2 deletions
@@ -218,10 +218,13 @@ def query_factory() -> Query:
             # as later calls.
             if not self.table_exists(temp_table):
                 columns_to_types_create = columns_to_types.copy()
-                self._convert_df_datetime(df, columns_to_types_create)
+                ordered_df = df[
+                    list(columns_to_types_create)
+                ]  # reorder DataFrame so it matches columns_to_types
+                self._convert_df_datetime(ordered_df, columns_to_types_create)
                 self.create_table(temp_table, columns_to_types_create)
                 rows: t.List[t.Tuple[t.Any, ...]] = list(
-                    df.replace({np.nan: None}).itertuples(index=False, name=None)  # type: ignore
+                    ordered_df.replace({np.nan: None}).itertuples(index=False, name=None)  # type: ignore
                 )
                 conn = self._connection_pool.get()
                 conn.bulk_copy(temp_table.sql(dialect=self.dialect), rows)
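
A quick aside on the replace({np.nan: None}) call this hunk keeps: pandas encodes missing values as NaN, which drivers generally do not treat as SQL NULL, so the rows are converted before being handed to bulk_copy. A small sketch with made-up data, independent of the adapter:

import numpy as np
import pandas as pd

# hypothetical frame whose column order already matches columns_to_types
ordered_df = pd.DataFrame({"id": [1, 2], "name": ["foo", np.nan]})

# NaN is a float sentinel, not SQL NULL; map it to None before building the row tuples
rows = list(ordered_df.replace({np.nan: None}).itertuples(index=False, name=None))
print(rows)  # [(1, 'foo'), (2, None)]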

sqlmesh/core/engine_adapter/snowflake.py

Lines changed: 19 additions & 6 deletions
@@ -288,8 +288,25 @@ def _df_to_source_queries(
         is_snowpark_dataframe = snowpark and isinstance(df, snowpark.dataframe.DataFrame)

         def query_factory() -> Query:
+            # The catalog needs to be normalized before being passed to Snowflake's library functions because they
+            # just wrap whatever they are given in quotes without checking if it's already quoted
+            database = (
+                normalize_identifiers(temp_table.catalog, dialect=self.dialect)
+                if temp_table.catalog
+                else None
+            )
+
             if is_snowpark_dataframe:
-                df.createOrReplaceTempView(temp_table.sql(dialect=self.dialect, identify=True))  # type: ignore
+                temp_table.set("catalog", database)
+                df_renamed = df.rename(
+                    {
+                        col: exp.to_identifier(col).sql(dialect=self.dialect, identify=True)
+                        for col in columns_to_types
+                    }
+                )  # type: ignore
+                df_renamed.createOrReplaceTempView(
+                    temp_table.sql(dialect=self.dialect, identify=True)
+                )  # type: ignore
             elif isinstance(df, pd.DataFrame):
                 from snowflake.connector.pandas_tools import write_pandas

@@ -325,11 +342,7 @@ def query_factory() -> Query:
                     df,
                     temp_table.name,
                     schema=temp_table.db or None,
-                    database=normalize_identifiers(temp_table.catalog, dialect=self.dialect).sql(
-                        dialect=self.dialect
-                    )
-                    if temp_table.catalog
-                    else None,
+                    database=database.sql(dialect=self.dialect) if database else None,
                     chunk_size=self.DEFAULT_BATCH_SIZE,
                     overwrite=True,
                     table_type="temp",
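
The rename above exists because Snowflake resolves unquoted identifiers case-insensitively (folding them to upper case); quoting each column name via sqlglot keeps the case the model declared. A minimal sqlglot-only sketch (the column names and catalog are illustrative, not taken from the commit):

from sqlglot import exp
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

columns_to_types = {"id": "int", "name": "varchar"}  # hypothetical model columns

# quote each column name the way the adapter does before registering the temp view
renames = {
    col: exp.to_identifier(col).sql(dialect="snowflake", identify=True)
    for col in columns_to_types
}
print(renames)  # {'id': '"id"', 'name': '"name"'}

# an unquoted catalog name is normalized to Snowflake's canonical (upper) case first,
# so downstream quoting does not freeze the wrong case
catalog = normalize_identifiers(exp.to_identifier("my_db"), dialect="snowflake")
print(catalog.sql(dialect="snowflake"))  # MY_DB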

sqlmesh/core/engine_adapter/spark.py

Lines changed: 6 additions & 0 deletions
@@ -279,10 +279,16 @@ def _ensure_pyspark_df(
     ) -> PySparkDataFrame:
         pyspark_df = self.try_get_pyspark_df(generic_df)
         if pyspark_df:
+            if columns_to_types:
+                # ensure Spark dataframe column order matches columns_to_types
+                pyspark_df = pyspark_df.select(*columns_to_types)
             return pyspark_df
         df = self.try_get_pandas_df(generic_df)
         if df is None:
             raise SQLMeshError("Ensure PySpark DF can only be run on a PySpark or Pandas DataFrame")
+        if columns_to_types:
+            # ensure Pandas dataframe column order matches columns_to_types
+            df = df[list(columns_to_types)]
         kwargs = (
             dict(schema=self.sqlglot_to_spark_types(columns_to_types)) if columns_to_types else {}
         )
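
On the PySpark path the same idea is expressed as a projection; a minimal sketch assuming a local SparkSession (column names illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

columns_to_types = {"id": "int", "name": "varchar"}  # hypothetical model columns

# DataFrame emitted with the columns in a different order than declared
pyspark_df = spark.createDataFrame([("foo", 1)], schema=["name", "id"])

# select(*columns_to_types) projects the columns back into the declared order
pyspark_df = pyspark_df.select(*columns_to_types)
print(pyspark_df.columns)  # ['id', 'name']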

tests/core/engine_adapter/integration/__init__.py

Lines changed: 10 additions & 11 deletions
@@ -359,7 +359,7 @@ def get_table_comment(
            FROM pg_class c
            INNER JOIN pg_description d ON c.oid = d.objoid AND d.objsubid = 0
            INNER JOIN pg_namespace n ON c.relnamespace = n.oid
-           WHERE
+           WHERE 
            c.relname = '{table_name}'
            AND n.nspname= '{schema_name}'
            AND c.relkind = '{"v" if table_kind == "VIEW" else "r"}'
@@ -465,12 +465,12 @@ def get_column_comments(
            INNER JOIN pg_namespace n ON c.relnamespace = n.oid
            INNER JOIN pg_attribute a ON c.oid = a.attrelid
            INNER JOIN pg_description d
-               ON
+               ON 
                a.attnum = d.objsubid
                AND d.objoid = c.oid
            WHERE
            n.nspname = '{schema_name}'
-           AND c.relname = '{table_name}'
+           AND c.relname = '{table_name}' 
            AND c.relkind = '{"v" if table_kind == "VIEW" else "r"}'
            ;
            """
@@ -494,6 +494,7 @@ def create_context(
         self,
         config_mutator: t.Optional[t.Callable[[str, Config], None]] = None,
         path: t.Optional[pathlib.Path] = None,
+        ephemeral_state_connection: bool = True,
     ) -> Context:
         private_sqlmesh_dir = pathlib.Path(pathlib.Path().home(), ".sqlmesh")
         config = load_config_from_paths(
@@ -509,14 +510,12 @@
             config.gateways = {self.gateway: config.gateways[self.gateway]}

         gateway_config = config.gateways[self.gateway]
-        if (
-            (sc := gateway_config.state_connection)
-            and (conn := gateway_config.connection)
-            and sc.type_ == "duckdb"
-        ):
-            # if duckdb is being used as the state connection, set concurrent_tasks=1 on the main connection
-            # to prevent duckdb from being accessed from multiple threads and getting deadlocked
-            conn.concurrent_tasks = 1
+        if ephemeral_state_connection:
+            # Override whatever state connection has been configured on the integration test config to use in-memory DuckDB instead
+            # This is so tests that initialize a SQLMesh context can run concurrently without clobbering each other's state
+            from sqlmesh.core.config.connection import DuckDBConnectionConfig
+
+            gateway_config.state_connection = DuckDBConnectionConfig()

         if "athena" in self.gateway:
             conn = gateway_config.connection
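
The design choice here, per the comments in the hunk, is that every test context gets a throwaway state store so concurrent tests cannot clobber each other; a hedged sketch of what the override amounts to (assuming a DuckDBConnectionConfig with no database path means an in-memory DuckDB, as the comment implies):

from sqlmesh.core.config.connection import DuckDBConnectionConfig

# no `database` file given, so DuckDB runs in-memory: state is private to this process
# and is discarded when the test context is torn down
state_connection = DuckDBConnectionConfig()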

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 83 additions & 1 deletion
@@ -2721,7 +2721,9 @@ def _use_warehouse_as_state_connection(gateway_name: str, config: Config):

     config.gateways[gateway_name].state_schema = test_schema

-    sqlmesh_context = ctx.create_context(config_mutator=_use_warehouse_as_state_connection)
+    sqlmesh_context = ctx.create_context(
+        config_mutator=_use_warehouse_as_state_connection, ephemeral_state_connection=False
+    )
     assert sqlmesh_context.config.get_state_schema(ctx.gateway) == test_schema

     state_sync = (
@@ -2732,3 +2734,83 @@ def _use_warehouse_as_state_connection(gateway_name: str, config: Config):

     # will throw if one of the migrations produces an error, which can happen if we forget to take quoting or normalization into account
     sqlmesh_context.migrate()
+
+
+def test_python_model_column_order(ctx: TestContext, tmp_path: pathlib.Path):
+    if ctx.test_type == "pyspark" and ctx.dialect in ("spark", "databricks"):
+        # don't skip
+        pass
+    elif ctx.test_type != "df":
+        pytest.skip("python model column order test only needs to be run once per db")
+
+    schema = ctx.add_test_suffix(TEST_SCHEMA)
+
+    (tmp_path / "models").mkdir()
+
+    # note: this model deliberately defines the columns in the @model definition to be in a different order than what
+    # is returned by the DataFrame within the model
+    model_path = tmp_path / "models" / "python_model.py"
+    if ctx.test_type == "pyspark":
+        # python model that emits a PySpark dataframe
+        model_path.write_text(
+            """
+from pyspark.sql import DataFrame, Row
+import typing as t
+from sqlmesh import ExecutionContext, model
+
+@model(
+    "TEST_SCHEMA.model",
+    columns={
+        "id": "int",
+        "name": "varchar"
+    }
+)
+def execute(
+    context: ExecutionContext,
+    **kwargs: t.Any,
+) -> DataFrame:
+    return context.spark.createDataFrame([
+        Row(name="foo", id=1)
+    ])
+""".replace("TEST_SCHEMA", schema)
+        )
+    else:
+        # python model that emits a Pandas DataFrame
+        model_path.write_text(
+            """
+import pandas as pd
+import typing as t
+from sqlmesh import ExecutionContext, model
+
+@model(
+    "TEST_SCHEMA.model",
+    columns={
+        "id": "int",
+        "name": "varchar"
+    }
+)
+def execute(
+    context: ExecutionContext,
+    **kwargs: t.Any,
+) -> pd.DataFrame:
+    return pd.DataFrame([
+        {"name": "foo", "id": 1}
+    ])
+""".replace("TEST_SCHEMA", schema)
+        )
+
+    sqlmesh_ctx = ctx.create_context(path=tmp_path)
+
+    assert len(sqlmesh_ctx.models) == 1
+
+    plan = sqlmesh_ctx.plan(auto_apply=True)
+    assert len(plan.new_snapshots) == 1
+
+    engine_adapter = sqlmesh_ctx.engine_adapter
+
+    query = exp.select("*").from_(
+        exp.to_table(f"{schema}.model", dialect=ctx.dialect), dialect=ctx.dialect
+    )
+    df = engine_adapter.fetchdf(query, quote_identifiers=True)
+    assert len(df) == 1
+    assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_bigquery.py

Lines changed: 47 additions & 0 deletions
@@ -433,3 +433,50 @@ def test_table_diff_table_name_matches_column_name(ctx: TestContext):

     assert row_diff.stats["join_count"] == 1
     assert row_diff.full_match_count == 1
+
+
+def test_bigframe_python_model_column_order(ctx: TestContext, tmp_path: Path):
+    model_name = ctx.table("TEST")
+
+    (tmp_path / "models").mkdir()
+
+    # note: this model deliberately defines the columns in the @model definition to be in a different order than what
+    # is returned by the DataFrame within the model
+    model_path = tmp_path / "models" / "python_model.py"
+
+    # python model that emits a BigFrame dataframe
+    model_path.write_text(
+        """
+from bigframes.pandas import DataFrame
+import typing as t
+from sqlmesh import ExecutionContext, model
+
+@model(
+    'MODEL_NAME',
+    columns={
+        "id": "int",
+        "name": "varchar"
+    },
+    dialect="bigquery"
+)
+def execute(
+    context: ExecutionContext,
+    **kwargs: t.Any,
+) -> DataFrame:
+    return DataFrame({'name': ['foo'], 'id': [1]}, session=context.bigframe)
+""".replace("MODEL_NAME", model_name.sql(dialect="bigquery"))
+    )
+
+    sqlmesh_ctx = ctx.create_context(path=tmp_path)
+
+    assert len(sqlmesh_ctx.models) == 1
+
+    plan = sqlmesh_ctx.plan(auto_apply=True)
+    assert len(plan.new_snapshots) == 1
+
+    engine_adapter = sqlmesh_ctx.engine_adapter
+
+    query = exp.select("*").from_(model_name)
+    df = engine_adapter.fetchdf(query, quote_identifiers=True)
+    assert len(df) == 1
+    assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_snowflake.py

Lines changed: 47 additions & 0 deletions
@@ -1,6 +1,7 @@
 import typing as t
 import pytest
 from sqlglot import exp
+from pathlib import Path
 from sqlglot.optimizer.qualify_columns import quote_identifiers
 from sqlglot.helper import seq_get
 from sqlmesh.core.engine_adapter import SnowflakeEngineAdapter
@@ -210,3 +211,49 @@ def test_create_iceberg_table(ctx: TestContext, engine_adapter: SnowflakeEngineAdapter):
     result = sqlmesh.plan(auto_apply=True)

     assert len(result.new_snapshots) == 2
+
+
+def test_snowpark_python_model_column_order(ctx: TestContext, tmp_path: Path):
+    model_name = ctx.table("TEST")
+
+    (tmp_path / "models").mkdir()
+
+    # note: this model deliberately defines the columns in the @model definition to be in a different order than what
+    # is returned by the DataFrame within the model
+    model_path = tmp_path / "models" / "python_model.py"
+
+    # python model that emits a Snowpark DataFrame
+    model_path.write_text(
+        """
+from snowflake.snowpark.dataframe import DataFrame
+import typing as t
+from sqlmesh import ExecutionContext, model
+
+@model(
+    'MODEL_NAME',
+    columns={
+        "id": "int",
+        "name": "varchar"
+    }
+)
+def execute(
+    context: ExecutionContext,
+    **kwargs: t.Any,
+) -> DataFrame:
+    return context.snowpark.create_dataframe([["foo", 1]], schema=["name", "id"])
+""".replace("MODEL_NAME", model_name.sql(dialect="snowflake"))
+    )
+
+    sqlmesh_ctx = ctx.create_context(path=tmp_path)
+
+    assert len(sqlmesh_ctx.models) == 1
+
+    plan = sqlmesh_ctx.plan(auto_apply=True)
+    assert len(plan.new_snapshots) == 1
+
+    engine_adapter = sqlmesh_ctx.engine_adapter
+
+    query = exp.select("*").from_(plan.environment.snapshots[0].fully_qualified_table)
+    df = engine_adapter.fetchdf(query, quote_identifiers=True)
+    assert len(df) == 1
+    assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}
