
Commit 75e8873

Merge branch 'main' into docs/remove-outdated-teradata-mention

2 parents b8027cf + b9ccebe

6 files changed: +120 −35 lines


docs/lakebridge/docs/installation.mdx

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ databricks configure --host <host> --profile <profile_name>
 
 **Version Required:** Python 3.10 or newer; verify that the installed version is in the supported versions (3.10–3.13).
 
-- **Windows** - Install python from [here](https://www.python.org/downloads/). Your Windows computer will need a shell environment ([GitBash](https://www.git-scm.com/downloads) or [WSL](https://learn.microsoft.com/en-us/windows/wsl/about))
+- **Windows** - Install python from [here](https://www.python.org/downloads/).
 - **MacOS/Unix** - Use [brew](https://formulae.brew.sh/formula/[email protected]) to install python in macOS/Unix machines
 
 **Check Python version on Windows, macOS, and Unix:**
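For reference, the supported-range check these docs describe can also be expressed directly in Python (a minimal sketch; the shell commands on the docs page are not part of this hunk):

# Sketch: verify the interpreter falls inside the supported 3.10-3.13 range.
import sys

if not ((3, 10) <= sys.version_info[:2] <= (3, 13)):
    raise SystemExit(f"Unsupported Python {sys.version.split()[0]}; supported versions are 3.10-3.13.")
print(f"Python {sys.version.split()[0]} is supported.")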

src/databricks/labs/lakebridge/assessments/pipeline.py

Lines changed: 30 additions & 23 deletions
@@ -1,20 +1,19 @@
-from pathlib import Path
-from subprocess import run, CalledProcessError, Popen, PIPE, STDOUT, DEVNULL
-from dataclasses import dataclass
-from enum import Enum
-
-import sys
+import json
+import logging
 import os
+import sys
 import venv
 import tempfile
-import json
-import logging
-import yaml
-import duckdb
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from subprocess import CalledProcessError, DEVNULL, PIPE, Popen, STDOUT, run
 
-from databricks.labs.lakebridge.connections.credential_manager import cred_file
+import duckdb
+import yaml
 
 from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig, Step
+from databricks.labs.lakebridge.connections.credential_manager import cred_file
 from databricks.labs.lakebridge.connections.database_manager import DatabaseManager, FetchResult
 
 logger = logging.getLogger(__name__)
@@ -235,21 +234,29 @@ def _run_python_script(venv_exec_cmd, script_path, db_path, credential_config):
     def _save_to_db(self, result: FetchResult, step_name: str, mode: str):
         db_path = str(self.db_path_prefix / DB_NAME)
 
-        with duckdb.connect(db_path) as conn:
-            # TODO: Add support for figuring out data types from SQLALCHEMY result object result.cursor.description is not reliable
-            schema = ' STRING, '.join(result.columns) + ' STRING'
+        # Check row count and log appropriately and skip data insertion if 0 rows
+        if not result.rows:
+            logging.warning(
+                f"Query for step '{step_name}' returned 0 rows. Skipping table creation and data insertion."
+            )
+            return
+
+        row_count = len(result.rows)
+        logging.info(f"Query for step '{step_name}' returned {row_count} rows.")
+        # TODO: Add support for figuring out data types from SQLALCHEMY result object result.cursor.description is not reliable
+        _result_frame = result.to_df().astype(str)
 
-            # Handle write modes
+        with duckdb.connect(db_path) as conn:
+            # DuckDB can access _result_frame from the local scope automatically.
             if mode == 'overwrite':
-                conn.execute(f"CREATE OR REPLACE TABLE {step_name} ({schema})")
+                statement = f"CREATE OR REPLACE TABLE {step_name} AS SELECT * FROM _result_frame"
             elif mode == 'append' and step_name not in conn.get_table_names(""):
-                conn.execute(f"CREATE TABLE {step_name} ({schema})")
-
-            # Batch insert using prepared statements
-            placeholders = ', '.join(['?' for _ in result.columns])
-            insert_query = f"INSERT INTO {step_name} VALUES ({placeholders})"
-
-            conn.executemany(insert_query, result.rows)
+                statement = f"CREATE TABLE {step_name} AS SELECT * FROM _result_frame"
+            else:
+                statement = f"INSERT INTO {step_name} SELECT * FROM _result_frame"
+            logging.debug(f"Inserting {row_count} rows: {statement}")
+            conn.execute(statement)
+            logging.info(f"Successfully inserted {row_count} rows into table '{step_name}'.")
 
     @staticmethod
     def _create_dir(dir_path: Path):
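The rewritten `_save_to_db` leans on DuckDB's replacement scans: a pandas DataFrame bound to a local Python variable (here `_result_frame`) can be referenced by name inside SQL, so `CREATE TABLE ... AS SELECT * FROM _result_frame` ingests the frame without prepared statements. A minimal standalone sketch of that mechanism (the table name `demo_step` and the sample data are illustrative, not from the PR):

# Sketch: DuckDB replacement scan over a local pandas DataFrame.
import duckdb
import pandas as pd

_result_frame = pd.DataFrame({"col1": ["a", "b"], "col2": ["1", "2"]})  # stand-in result set

with duckdb.connect() as conn:  # in-memory database, for illustration
    # DuckDB resolves _result_frame by scanning the enclosing Python scope.
    conn.execute("CREATE OR REPLACE TABLE demo_step AS SELECT * FROM _result_frame")
    print(conn.execute("SELECT COUNT(*) FROM demo_step").fetchone())  # -> (2,)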

src/databricks/labs/lakebridge/connections/database_manager.py

Lines changed: 8 additions & 0 deletions
@@ -4,6 +4,8 @@
 from typing import Any
 from collections.abc import Sequence, Set
 
+import pandas as pd
+
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Engine, URL
 from sqlalchemy.engine.row import Row
@@ -19,6 +21,12 @@ class FetchResult:
     columns: Set[str]
     rows: Sequence[Row[Any]]
 
+    def to_df(self) -> pd.DataFrame:
+        """Create a pandas dataframe based on these results."""
+        # Row emulates a named tuple, which Pandas understands natively. So the columns are safely inferred unless
+        # we have an empty result-set.
+        return pd.DataFrame(data=self.rows) if self.rows else pd.DataFrame(columns=list(self.columns))
+
 
 class DatabaseConnector(ABC):
     @abstractmethod
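The new `to_df` relies on pandas inferring column names from named-tuple-like rows; SQLAlchemy's `Row` presents itself that way, which is why only the empty case needs the explicit `columns=` fallback. A small sketch of that inference, using `collections.namedtuple` as a stand-in for `Row` (field names are illustrative):

# Sketch: pandas infers column names from named-tuple rows.
from collections import namedtuple

import pandas as pd

Row = namedtuple("Row", ["name", "qty"])  # stand-in for sqlalchemy.engine.row.Row
rows = [Row("widget", 3), Row("gadget", 5)]

df = pd.DataFrame(data=rows)                    # columns inferred from the tuple fields
empty = pd.DataFrame(columns=["name", "qty"])   # no rows: columns must be given explicitly
print(list(df.columns), df.shape, empty.shape)  # -> ['name', 'qty'] (2, 2) (0, 2)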

tests/integration/assessments/test_pipeline.py

Lines changed: 65 additions & 11 deletions
@@ -1,10 +1,16 @@
 from collections.abc import Callable
 from pathlib import Path
+from logging import Logger
 from typing import TypeAlias
 import duckdb
 import pytest
 
-from databricks.labs.lakebridge.assessments.pipeline import PipelineClass, DB_NAME, StepExecutionStatus
+from databricks.labs.lakebridge.assessments.pipeline import (
+    PipelineClass,
+    DB_NAME,
+    StepExecutionStatus,
+    StepExecutionResult,
+)
 from databricks.labs.lakebridge.assessments.profiler import Profiler
 
 from databricks.labs.lakebridge.assessments.profiler_config import Step, PipelineConfig
@@ -45,7 +51,20 @@ def python_failure_config(pipeline_configuration_loader: _Loader) -> PipelineCon
     return pipeline_configuration_loader(Path("pipeline_config_python_failure.yml"))
 
 
-def test_run_pipeline(sandbox_sqlserver, pipeline_config, get_logger):
+@pytest.fixture(scope="module")
+def empty_result_config() -> PipelineConfig:
+    prefix = Path(__file__).parent
+    config_path = f"{prefix}/../../resources/assessments/pipeline_config_empty_result.yml"
+    config: PipelineConfig = PipelineClass.load_config_from_yaml(config_path)
+    updated_steps = [step.copy(extract_source=f"{prefix}/../../{step.extract_source}") for step in config.steps]
+    return config.copy(steps=updated_steps)
+
+
+def test_run_pipeline(
+    sandbox_sqlserver: DatabaseManager,
+    pipeline_config: PipelineConfig,
+    get_logger: Logger,
+) -> None:
     pipeline = PipelineClass(config=pipeline_config, executor=sandbox_sqlserver)
     results = pipeline.execute()
 
@@ -56,10 +75,14 @@ def test_run_pipeline(sandbox_sqlserver, pipeline_config, get_logger):
         StepExecutionStatus.SKIPPED,
     ), f"Step {result.step_name} failed with status {result.status}"
 
-    assert verify_output(get_logger, pipeline_config.extract_folder)
+    assert verify_output(get_logger, Path(pipeline_config.extract_folder))
 
 
-def test_run_sql_failure_pipeline(sandbox_sqlserver, sql_failure_config, get_logger):
+def test_run_sql_failure_pipeline(
+    sandbox_sqlserver: DatabaseManager,
+    sql_failure_config: PipelineConfig,
+    get_logger: Logger,
+) -> None:
     pipeline = PipelineClass(config=sql_failure_config, executor=sandbox_sqlserver)
     with pytest.raises(RuntimeError) as e:
         pipeline.execute()
@@ -68,7 +91,11 @@ def test_run_sql_failure_pipeline(sandbox_sqlserver, sql_failure_config, get_log
     assert "Pipeline execution failed due to errors in steps: invalid_sql_step" in str(e.value)
 
 
-def test_run_python_failure_pipeline(sandbox_sqlserver, python_failure_config, get_logger):
+def test_run_python_failure_pipeline(
+    sandbox_sqlserver: DatabaseManager,
+    python_failure_config: PipelineConfig,
+    get_logger: Logger,
+) -> None:
     pipeline = PipelineClass(config=python_failure_config, executor=sandbox_sqlserver)
     with pytest.raises(RuntimeError) as e:
         pipeline.execute()
@@ -77,7 +104,11 @@ def test_run_python_failure_pipeline(sandbox_sqlserver, python_failure_config, g
     assert "Pipeline execution failed due to errors in steps: invalid_python_step" in str(e.value)
 
 
-def test_run_python_dep_failure_pipeline(sandbox_sqlserver, pipeline_dep_failure_config, get_logger):
+def test_run_python_dep_failure_pipeline(
+    sandbox_sqlserver: DatabaseManager,
+    pipeline_dep_failure_config: PipelineConfig,
+    get_logger: Logger,
+):
     pipeline = PipelineClass(config=pipeline_dep_failure_config, executor=sandbox_sqlserver)
     with pytest.raises(RuntimeError) as e:
         pipeline.execute()
@@ -101,16 +132,16 @@ def test_skipped_steps(sandbox_sqlserver: DatabaseManager, pipeline_config: Pipe
         assert result.error_message is None, "Skipped steps should not have error messages"
 
 
-def verify_output(get_logger, path):
+def verify_output(get_logger: Logger, path: Path):
     conn = duckdb.connect(str(Path(path)) + "/" + DB_NAME)
 
     expected_tables = ["usage", "inventory", "random_data"]
     logger = get_logger
     for table in expected_tables:
         try:
             result = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
-            logger.info(f"Count for {table}: {result[0]}")
-            if result[0] == 0:
+            logger.info(f"Count for {table}: {result}")
+            if result is None or result[0] == 0:
                 logger.debug(f"Table {table} is empty")
                 return False
         except duckdb.CatalogException:
@@ -122,7 +153,7 @@ def verify_output(get_logger, path):
     return True
 
 
-def test_pipeline_config_comments():
+def test_pipeline_config_comments() -> None:
     pipeline_w_comments = PipelineConfig(
         name="warehouse_profiler",
         version="1.0",
@@ -136,7 +167,7 @@ def test_pipeline_config_comments():
     assert pipeline_wo_comments.comment is None
 
 
-def test_pipeline_step_comments():
+def test_pipeline_step_comments() -> None:
     step_w_comment = Step(
         name="step_w_comment",
         type="sql",
@@ -156,3 +187,26 @@ def test_pipeline_step_comments():
     )
     assert step_w_comment.comment == "This is a step comment."
     assert step_wo_comment.comment is None
+
+
+def test_run_empty_result_pipeline(
+    sandbox_sqlserver: DatabaseManager,
+    empty_result_config: PipelineConfig,
+    get_logger: Logger,
+) -> None:
+    pipeline = PipelineClass(config=empty_result_config, executor=sandbox_sqlserver)
+    results = pipeline.execute()
+
+    # Verify step completed successfully despite empty results
+    assert len(results) == 1
+    assert results == [
+        StepExecutionResult(step_name="empty_result_step", status=StepExecutionStatus.COMPLETE, error_message=None)
+    ]
+
+    # Verify that no table was created (processing was skipped for empty resultset)
+    with duckdb.connect(str(Path(empty_result_config.extract_folder)) + "/" + DB_NAME) as conn:
+        tables = conn.execute("SHOW TABLES").fetchall()
+        table_names = [table[0] for table in tables]
+
+        # Table should NOT be created when resultset is empty
+        assert "empty_result_step" not in table_names, "Empty resultset should skip table creation"
tests/resources/assessments/empty_resultset.sql

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+-- Query that returns valid schema but 0 rows
+SELECT
+    'test' as col1,
+    'test' as col2,
+    'test' as col3
+WHERE 1 = 0
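The `WHERE 1 = 0` predicate yields zero rows while still defining column names, which is exactly the case the new `_save_to_db` guard and the `to_df` columns fallback handle. A small sketch of the behavior (run against DuckDB for convenience; the pipeline step itself executes against the SQL Server sandbox):

# Sketch: a WHERE 1 = 0 query returns zero rows but a fully defined schema.
import duckdb

with duckdb.connect() as conn:
    cur = conn.execute("SELECT 'test' AS col1, 'test' AS col2 WHERE 1 = 0")
    print([col[0] for col in cur.description])  # -> ['col1', 'col2']
    print(cur.fetchall())                       # -> []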
tests/resources/assessments/pipeline_config_empty_result.yml

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+name: test_empty_result_pipeline
+version: 1.0
+extract_folder: /tmp/lakebridge_test_empty_result
+steps:
+  - name: empty_result_step
+    type: sql
+    extract_source: resources/assessments/empty_resultset.sql
+    mode: overwrite
+    frequency: once
+    flag: active
