
Commit 55c56d4

Debug and fix acceptance runs (#2223)
* Fix recon tests after the `sqlglot` upgrade
* The pipe operator is now used in hash queries, so the test fixtures were updated to expect the same (see the sketch below)
* Use different Spark schemas for different `reconcile` tests to get rid of Spark write errors about an already-existing schema
* Fix the e2e recon test; we didn't see the issues before because it did not run in the CI environment
* Delete the `pytest` modifier that excluded `reconcile` tests and run all integration tests in acceptance CI
1 parent 25312ad commit 55c56d4
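
A minimal illustration of the pipe-operator bullet above. The column and table names are borrowed from the diamonds fixtures elsewhere in this commit; the actual SQL is produced by the reconcile query builder and is not shown here, so treat the strings as a sketch of the change in shape only.

```python
# Illustrative only: after the sqlglot upgrade the generated hash queries
# concatenate columns with the SQL pipe operator (||) instead of CONCAT(...),
# which is the shape the updated test fixtures now expect.
before = "SELECT SHA2(CONCAT(carat, cut, color, clarity), 256) AS hash_value FROM diamonds"
after = "SELECT SHA2(carat || cut || color || clarity, 256) AS hash_value FROM diamonds"
```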

File tree

9 files changed: +226 −153 lines changed


.codegen.json

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@
   "toolchain": {
     "required": ["hatch"],
     "pre_setup": ["hatch env create"],
-    "prepend_path": ".venv/bin"
+    "prepend_path": ".venv/bin",
+    "acceptance_path": "tests/integration"
   }
 }

.github/scripts/setup_spark_remote.sh

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ else
   fi
 fi
 
+rm -rf "${HOME}"/spark/"${spark}"/spark-warehouse
+echo "Cleared old spark warehouse default directory"
 
 cd "${spark}" || exit 1
 ## check spark remote is running,if not start the spark remote
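
Why the cleanup matters, as a hedged sketch: with a local Spark session, managed tables written via `saveAsTable()` land under the default `spark-warehouse` directory, and those files outlive the session even though the in-memory catalog does not, so a later run re-creating an object with the same name can fail because its location already exists. The session settings below are assumptions for illustration, not taken from the setup script.

```python
from pyspark.sql import SparkSession

# Local session; managed tables are written into ./spark-warehouse by default.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.createDataFrame([(1,)], ["id"]).write.saveAsTable("leftover")
# If the spark-warehouse directory is not wiped between CI runs, a later
# saveAsTable("leftover") can fail because the table location still exists.
```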

.github/workflows/acceptance.yml

Lines changed: 7 additions & 1 deletion
@@ -28,7 +28,7 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v6
         with:
-          fetch-depth: 0
+          fetch-depth: 1
 
       - name: Install Python
         uses: actions/setup-python@v6
@@ -45,6 +45,12 @@ jobs:
           chmod +x $GITHUB_WORKSPACE/.github/scripts/setup_mssql_odbc.sh
           $GITHUB_WORKSPACE/.github/scripts/setup_mssql_odbc.sh
 
+      # TODO: Migrate tests to use Databricks clusters instead of Spark local mode
+      - name: Setup spark
+        run: |
+          chmod +x $GITHUB_WORKSPACE/.github/scripts/setup_spark_remote.sh
+          $GITHUB_WORKSPACE/.github/scripts/setup_spark_remote.sh
+
       - name: Run integration tests
         uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.4
         with:
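
For context, a hedged sketch of what the new step provides to the integration tests: `setup_spark_remote.sh` starts a local Spark Connect server, and a PySpark session can attach to it remotely. The URL below assumes the Spark Connect default port; the workflow and script do not state it here.

```python
from pyspark.sql import SparkSession

# Attach to the locally running Spark Connect server started by the setup step.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.sql("SELECT 1 AS ok").show()
```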

src/databricks/labs/lakebridge/reconcile/recon_capture.py

Lines changed: 4 additions & 2 deletions
@@ -95,7 +95,7 @@ def generate_final_reconcile_output(
     metadata_config: ReconcileMetadataConfig = ReconcileMetadataConfig(),
     local_test_run: bool = False,
 ) -> ReconcileOutput:
-    _db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+    _db_prefix = metadata_config.schema if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
     recon_df = spark.sql(
         f"""
         SELECT
@@ -237,7 +237,9 @@ def __init__(
         self.source_dialect = source_dialect
         self.ws = ws
         self.spark = spark
-        self._db_prefix = "default" if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+        self._db_prefix = (
+            metadata_config.schema if local_test_run else f"{metadata_config.catalog}.{metadata_config.schema}"
+        )
 
     def _generate_recon_main_id(
         self,
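
A small sketch of why `"default"` was wrong once each test gets its own schema. The helper below is a rewrite for illustration, not the code in `recon_capture.py`, and the default catalog/schema names are placeholders: the prefix qualifies the recon metadata tables in the generated SQL, so local test runs must resolve to the per-test schema created by the fixtures.

```python
from dataclasses import dataclass


@dataclass
class MetadataConfig:  # stand-in for ReconcileMetadataConfig; defaults are placeholders
    catalog: str = "some_catalog"
    schema: str = "some_schema"


def db_prefix(cfg: MetadataConfig, local_test_run: bool) -> str:
    # Local runs use the bare per-test schema; everything else is catalog-qualified.
    return cfg.schema if local_test_run else f"{cfg.catalog}.{cfg.schema}"


assert db_prefix(MetadataConfig(schema="recon_schema_ab12cd"), local_test_run=True) == "recon_schema_ab12cd"
assert db_prefix(MetadataConfig(), local_test_run=False) == "some_catalog.some_schema"
```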

tests/integration/conftest.py

Lines changed: 0 additions & 17 deletions
@@ -1,4 +1,3 @@
-import os
 import logging
 from urllib.parse import urlparse
 
@@ -30,22 +29,6 @@ def get_logger():
     return logger
 
 
-def pytest_collection_modifyitems(config, items):
-    if os.getenv('TEST_ENV') != 'ACCEPTANCE':
-        return
-    selected_items = []
-    deselected_items = []
-    # Add only specific tests to run from acceptance.yml
-    inclusions = {'assessments', 'connections', 'config', 'discovery', 'helpers', 'transpile'}
-    for item in items:
-        if any(f"tests/integration/{inclusion}" in str(item.fspath) for inclusion in inclusions):
-            selected_items.append(item)
-        else:
-            deselected_items.append(item)
-    items[:] = selected_items
-    config.hook.pytest_deselected(items=deselected_items)
-
-
 @pytest.fixture(scope="session")
 def mock_spark() -> SparkSession:
     """

tests/integration/reconcile/conftest.py

Lines changed: 41 additions & 3 deletions
@@ -1,14 +1,24 @@
 import logging
+import uuid
+from collections.abc import Generator
 
 import pytest
 
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.errors.platform import PermissionDenied
 from databricks.sdk.service.catalog import TableInfo, SchemaInfo
+
+from databricks.labs.lakebridge.config import ReconcileMetadataConfig
 from tests.integration.debug_envgetter import TestEnvGetter
 
 logger = logging.getLogger(__name__)
 
+DIAMONDS_COLUMNS = [
+    ("carat", "DOUBLE"),
+    ("cut", "STRING"),
+    ("color", "STRING"),
+    ("clarity", "STRING"),
+]
 DIAMONDS_ROWS_SQL = """
 INSERT INTO {catalog}.{schema}.{table} (carat, cut, color, clarity) VALUES
 (0.23, 'Ideal', 'E', 'SI2'),
@@ -24,6 +34,7 @@
 def recon_catalog(make_catalog) -> str:
     try:
         catalog = make_catalog().name
+        logger.info(f"Created catalog {catalog} for recon tests")
     except PermissionDenied as e:
         logger.warning("Could not create catalog for recon tests, using 'sandbox' instead", exc_info=e)
         catalog = "sandbox"
@@ -34,14 +45,20 @@ def recon_catalog(make_catalog) -> str:
 @pytest.fixture
 def recon_schema(recon_catalog, make_schema) -> SchemaInfo:
     from_schema = make_schema(catalog_name=recon_catalog)
+    logger.info(f"Created schema {from_schema.name} in catalog {recon_catalog} for recon tests")
 
     return from_schema
 
 
 @pytest.fixture
 def recon_tables(ws: WorkspaceClient, recon_schema: SchemaInfo, make_table) -> tuple[TableInfo, TableInfo]:
-    src_table = make_table(catalog_name=recon_schema.catalog_name, schema_name=recon_schema.name)
-    tgt_table = make_table(catalog_name=recon_schema.catalog_name, schema_name=recon_schema.name)
+    src_table = make_table(
+        catalog_name=recon_schema.catalog_name, schema_name=recon_schema.name, columns=DIAMONDS_COLUMNS
+    )
+    tgt_table = make_table(
+        catalog_name=recon_schema.catalog_name, schema_name=recon_schema.name, columns=DIAMONDS_COLUMNS
+    )
+    logger.info(f"Created recon tables {src_table.name}, {tgt_table.name} in schema {recon_schema.name}")
 
     test_env = TestEnvGetter(True)
     warehouse = test_env.get("TEST_DEFAULT_WAREHOUSE_ID")
@@ -52,11 +69,32 @@ def recon_tables(ws: WorkspaceClient, recon_schema: SchemaInfo, make_table) -> t
             schema=recon_schema.name,
             table=tbl.name,
         )
-        ws.statement_execution.execute_statement(
+        exc_response = ws.statement_execution.execute_statement(
            warehouse_id=warehouse,
            catalog=recon_schema.catalog_name,
            schema=recon_schema.name,
            statement=sql,
        )
+        logger.info(f"Inserted data into table {tbl.name} and got response {exc_response.status}")
 
     return src_table, tgt_table
+
+
+@pytest.fixture
+def recon_metadata(mock_spark, report_tables_schema) -> Generator[ReconcileMetadataConfig, None, None]:
+    rand = uuid.uuid4().hex
+    schema = f"recon_schema_{rand}"
+    mock_spark.sql(f"CREATE SCHEMA {schema}")
+    main_schema, metrics_schema, details_schema = report_tables_schema
+
+    mock_spark.createDataFrame(data=[], schema=main_schema).write.saveAsTable(f"{schema}.MAIN")
+    mock_spark.createDataFrame(data=[], schema=metrics_schema).write.saveAsTable(f"{schema}.METRICS")
+    mock_spark.createDataFrame(data=[], schema=details_schema).write.saveAsTable(f"{schema}.DETAILS")
+
+    yield ReconcileMetadataConfig(
+        catalog=f"recon_catalog_{rand}",
+        schema=schema,
+        volume=f"recon_volume_{rand}",
+    )
+
+    mock_spark.sql(f"DROP SCHEMA {schema} CASCADE")
