Skip to content

Commit 047e160

Browse files
Add integration test
1 parent d53805c commit 047e160

File tree

1 file changed

+109
-0
lines changed

1 file changed

+109
-0
lines changed

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1897,6 +1897,115 @@ def _mutate_config(current_gateway_name: str, config: Config):
18971897
ctx.cleanup(context)
18981898

18991899

1900+
def test_incremental_by_unique_key_model_when_matched(ctx: TestContext):
    """Verify INCREMENTAL_BY_UNIQUE_KEY honours a custom ``when_matched`` clause.

    Seeds two days of data in which items 1 and 2 reappear on day 2 with new
    names and values. The model's ``when_matched`` clause updates only
    ``value`` and ``event_date`` on a key match, so after both batches run the
    day-1 ``name`` must be preserved while ``value``/``event_date`` reflect
    day 2. ``batch_size 1`` forces per-day batches so the second batch
    actually exercises the MERGE path.
    """
    if not ctx.supports_merge:
        pytest.skip(f"{ctx.dialect} on {ctx.gateway} doesn't support merge")

    # DuckDB and some other engines use logical_merge which doesn't support when_matched
    if ctx.dialect not in ["bigquery", "databricks", "postgres", "snowflake", "spark"]:
        pytest.skip(f"{ctx.dialect} doesn't support native MERGE with when_matched clause")

    context = ctx.create_context()
    schema = ctx.schema(TEST_SCHEMA)

    # Create seed data with multiple days
    seed_query = ctx.input_data(
        pd.DataFrame(
            [
                [1, "item_a", 100, "2020-01-01"],
                [2, "item_b", 200, "2020-01-01"],
                [1, "item_a_changed", 150, "2020-01-02"],  # Same item_id, different name and value
                [2, "item_b_changed", 250, "2020-01-02"],  # Same item_id, different name and value
                [3, "item_c", 300, "2020-01-02"],  # New item on day 2
            ],
            columns=["item_id", "name", "value", "event_date"],
        ),
        columns_to_types={
            "item_id": exp.DataType.build("integer"),
            "name": exp.DataType.build("text"),
            "value": exp.DataType.build("integer"),
            "event_date": exp.DataType.build("date"),
        },
    )
    context.upsert_model(
        create_sql_model(name=f"{schema}.seed_model", query=seed_query, kind="FULL")
    )

    table_format = ""
    if ctx.dialect == "athena":
        # INCREMENTAL_BY_UNIQUE_KEY uses MERGE which is only supported in Athena on Iceberg tables
        table_format = "table_format iceberg,"

    # Create model with when_matched clause that only updates the value column
    # BUT keeps the existing name column unchanged.
    # batch_size=1 is so that we trigger merge on second batch and verify behaviour of when_matched
    context.upsert_model(
        load_sql_based_model(
            d.parse(
                f"""MODEL (
                    name {schema}.test_model_when_matched,
                    kind INCREMENTAL_BY_UNIQUE_KEY (
                        unique_key item_id,
                        batch_size 1,
                        when_matched WHEN MATCHED THEN UPDATE SET target.value = source.value, target.event_date = source.event_date
                    ),
                    {table_format}
                    start '2020-01-01',
                    end '2020-01-02',
                    cron '@daily'
                );

                select item_id, name, value, event_date
                from {schema}.seed_model
                where event_date between @start_date and @end_date""",
            )
        )
    )

    try:
        # Initial plan to create the model and run it
        context.plan(auto_apply=True, no_prompts=True)

        test_model = context.get_model(f"{schema}.test_model_when_matched")

        # Verify that the model has the when_matched clause configured
        assert test_model.kind.when_matched is not None

        actual_df = (
            ctx.get_current_data(test_model.fqn).sort_values(by="item_id").reset_index(drop=True)
        )

        # Expected results after batch processing:
        # - Day 1: Items 1 and 2 are inserted (first insert)
        # - Day 2: Items 1 and 2 are merged (when_matched clause preserves names
        #   but updates values/dates); item 3 is inserted as new
        expected_df = (
            pd.DataFrame(
                [
                    [1, "item_a", 150, "2020-01-02"],  # name from day 1, value and date from day 2
                    [2, "item_b", 250, "2020-01-02"],  # name from day 1, value and date from day 2
                    [3, "item_c", 300, "2020-01-02"],  # new item from day 2
                ],
                columns=["item_id", "name", "value", "event_date"],
            )
            .sort_values(by="item_id")
            .reset_index(drop=True)
        )

        # Convert date columns to string for comparison — engines return
        # dialect-specific date types that would otherwise fail dtype checks
        actual_df["event_date"] = actual_df["event_date"].astype(str)
        expected_df["event_date"] = expected_df["event_date"].astype(str)

        pd.testing.assert_frame_equal(
            actual_df,
            expected_df,
            check_dtype=False,
        )

    finally:
        ctx.cleanup(context)
2007+
2008+
19002009
def test_managed_model_upstream_forward_only(ctx: TestContext):
19012010
"""
19022011
This scenario goes as follows:

0 commit comments

Comments
 (0)