Skip to content

Commit 9f20d55

Browse files
Introduced Profiler Skeleton (#2021)
<!-- REMOVE IRRELEVANT COMMENTS BEFORE CREATING A PULL REQUEST --> ## Changes <!-- Summary of your changes that are easy to understand. Add screenshots when necessary, they're helpful to illustrate the before and after state --> ### What does this PR do? - Introduces the Profiler Skeleton for the Lakebridge project. - Adds initial Profiler class with supporting utilities and constants. - Implements core logic for profiling supported source technologies with placeholder support for MSSQL and Synapse. - Sets up the structure for profiling pipelines, including config file handling and extraction logic. ### Relevant implementation details ### Caveats/things to watch out for when reviewing: ### Linked issues <!-- DOC: Link issue with a keyword: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved. See https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword --> Resolves #.. ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [ ] modified existing command: `databricks labs lakebridge ...` - [ ] ... +add your own ### Tests <!-- How is this tested? Please see the checklist below and also describe any other relevant tests --> - [x] manually tested - [ ] added unit tests - [ ] added integration tests --------- Co-authored-by: Guenia Izquierdo <[email protected]>
1 parent 1c0d8df commit 9f20d55

File tree

4 files changed

+165
-3
lines changed

4 files changed

+165
-3
lines changed

src/databricks/labs/lakebridge/assessments/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
PRODUCT_NAME,
33
PRODUCT_PATH_PREFIX,
44
PROFILER_SOURCE_SYSTEM,
5-
PLATFORM_TO_SOURCE_TECHNOLOGY,
5+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG,
66
CONNECTOR_REQUIRED,
77
)
88

99
__all__ = [
1010
"PRODUCT_NAME",
1111
"PRODUCT_PATH_PREFIX",
1212
"PROFILER_SOURCE_SYSTEM",
13-
"PLATFORM_TO_SOURCE_TECHNOLOGY",
13+
"PLATFORM_TO_SOURCE_TECHNOLOGY_CFG",
1414
"CONNECTOR_REQUIRED",
1515
]

src/databricks/labs/lakebridge/assessments/_constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
PRODUCT_NAME = "lakebridge"
44
PRODUCT_PATH_PREFIX = Path.home() / ".databricks" / "labs" / PRODUCT_NAME / "lib"
55

6-
PLATFORM_TO_SOURCE_TECHNOLOGY = {
6+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG = {
77
"synapse": "src/databricks/labs/lakebridge/resources/assessments/synapse/pipeline_config.yml",
88
}
99

1010
# TODO: switch this to PLATFORM_TO_SOURCE_TECHNOLOGY_CFG.keys() once all platforms are supported
1111
PROFILER_SOURCE_SYSTEM = ["mssql", "synapse"]
1212

13+
1314
# This flag indicates whether a connector is required for the source system when the pipeline is triggered
1415
# For example in the case of synapse no connector is required and the python scripts
1516
# manage the connection by directly reading the credentials files
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import logging
2+
from pathlib import Path
3+
4+
from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
5+
from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig
6+
from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
7+
from databricks.labs.lakebridge.connections.credential_manager import (
8+
create_credential_manager,
9+
)
10+
from databricks.labs.lakebridge.connections.env_getter import EnvGetter
11+
from databricks.labs.lakebridge.assessments import (
12+
PRODUCT_NAME,
13+
PRODUCT_PATH_PREFIX,
14+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG,
15+
CONNECTOR_REQUIRED,
16+
)
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
class Profiler:
    """Profiles a supported source technology by running its assessment pipeline.

    Instances are normally built via :meth:`create`, which resolves the
    platform's bundled pipeline configuration (when one is registered in
    ``PLATFORM_TO_SOURCE_TECHNOLOGY_CFG``) relative to the product install
    prefix. A pre-built :class:`PipelineConfig` can also be injected directly,
    either at construction time or per-call via :meth:`profile`.
    """

    def __init__(self, platform: str, pipeline_configs: PipelineConfig | None = None):
        """Store the source platform name and an optional pipeline configuration.

        :param platform: Source system name (e.g. ``"synapse"``).
        :param pipeline_configs: Optional pre-loaded pipeline configuration;
            when ``None``, one must be supplied to :meth:`profile`.
        """
        self._platform = platform
        self._pipeline_config = pipeline_configs

    @classmethod
    def create(cls, platform: str) -> "Profiler":
        """Build a profiler, loading the platform's bundled pipeline config when one exists.

        Platforms without a registered config (see ``supported_platforms``)
        still get a profiler, but callers must pass a config to ``profile``.
        """
        pipeline_config_path = PLATFORM_TO_SOURCE_TECHNOLOGY_CFG.get(platform)
        pipeline_config = None
        if pipeline_config_path:
            pipeline_config_absolute_path = cls._locate_config(pipeline_config_path)
            pipeline_config = cls.path_modifier(config_file=pipeline_config_absolute_path)
        return cls(platform, pipeline_config)

    @classmethod
    def supported_platforms(cls) -> list[str]:
        """Return the platforms that ship a bundled pipeline configuration."""
        return list(PLATFORM_TO_SOURCE_TECHNOLOGY_CFG)

    @staticmethod
    def path_modifier(*, config_file: str | Path, path_prefix: Path = PRODUCT_PATH_PREFIX) -> PipelineConfig:
        """Load a pipeline config and prefix every step's ``extract_source`` with *path_prefix*.

        :param config_file: Path to the pipeline YAML configuration.
        :param path_prefix: Root the relative extract scripts are resolved against;
            defaults to the product install prefix.
        :return: The loaded configuration with absolute extract-source paths.
        """
        # TODO: make this also work for a developer-mode (editable) install
        config = PipelineClass.load_config_from_yaml(config_file)
        for step in config.steps:
            step.extract_source = f"{path_prefix}/{step.extract_source}"
        return config

    def profile(
        self,
        *,
        extractor: DatabaseManager | None = None,
        pipeline_config: PipelineConfig | None = None,
    ) -> None:
        """Run the profiling pipeline for this profiler's platform.

        :param extractor: Optional database connection manager; when ``None``
            and the platform requires a connector, one is created from stored
            credentials.
        :param pipeline_config: Optional per-call config override; falls back
            to the config supplied at construction time.
        :raises ValueError: If no pipeline configuration is available.
        :raises FileNotFoundError: If the pipeline's configuration file is missing.
        :raises RuntimeError: If pipeline execution fails for any other reason.
        """
        platform = self._platform.lower()
        if not pipeline_config:
            if not self._pipeline_config:
                raise ValueError(f"Cannot Proceed without a valid pipeline configuration for {platform}")
            pipeline_config = self._pipeline_config
        self._execute(platform, pipeline_config, extractor)

    @staticmethod
    def _setup_extractor(platform: str) -> DatabaseManager | None:
        """Create a database connection manager for *platform*, or ``None`` when not needed.

        Some platforms (e.g. synapse) manage their own connections inside the
        extraction scripts, so no connector is required.
        """
        if not CONNECTOR_REQUIRED[platform]:
            return None
        cred_manager = create_credential_manager(PRODUCT_NAME, EnvGetter())
        connect_config = cred_manager.get_credentials(platform)
        return DatabaseManager(platform, connect_config)

    def _execute(
        self,
        platform: str,
        pipeline_config: PipelineConfig,
        extractor: DatabaseManager | None = None,
    ) -> None:
        """Execute the pipeline, wrapping unexpected failures in ``RuntimeError``."""
        try:
            if extractor is None:
                extractor = Profiler._setup_extractor(platform)

            result = PipelineClass(pipeline_config, extractor).execute()
            # Lazy %-style args avoid building the message unless the record is emitted.
            logger.info(
                "Profile execution has completed successfully for %s for more info check: %s.", platform, result
            )
        except FileNotFoundError as e:
            logger.error("Configuration file not found for source %s: %s", platform, e)
            raise FileNotFoundError(f"Configuration file not found for source {platform}: {e}") from e
        except Exception as e:
            logger.error("Error executing pipeline for source %s: %s", platform, e)
            raise RuntimeError(f"Pipeline execution failed for source {platform} : {e}") from e

    @staticmethod
    def _locate_config(config_path: str | Path) -> Path:
        """Resolve *config_path* under the product install prefix, failing fast if absent."""
        config_file = PRODUCT_PATH_PREFIX / config_path
        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_file}")
        return config_file
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from pathlib import Path
2+
3+
import shutil
4+
import tempfile
5+
import yaml
6+
import pytest
7+
8+
from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
9+
from databricks.labs.lakebridge.assessments.profiler import Profiler
10+
11+
12+
def test_supported_source_technologies() -> None:
    """The profiler reports its supported source technologies as a list."""
    platforms = Profiler("synapse", None).supported_platforms()
    assert isinstance(platforms, list)
    assert "synapse" in platforms
18+
19+
20+
def test_profile_missing_platform_config() -> None:
    """Profiling without any pipeline configuration raises ValueError."""
    # Construct outside the raises block: only the call under test should be
    # inside the context manager, so a constructor failure surfaces as a
    # test error rather than a false pass.
    profiler = Profiler("synapse", None)
    with pytest.raises(ValueError, match="Cannot Proceed without a valid pipeline configuration for synapse"):
        profiler.profile()
25+
26+
27+
def test_profile_execution() -> None:
    """A real pipeline configuration executes end-to-end and produces the extract DB."""
    repo_root = Path(__file__).parent / "../../../"
    cfg_path = repo_root / "tests/resources/assessments/pipeline_config_main.yml"
    profiler = Profiler("synapse")
    pipeline_config = profiler.path_modifier(config_file=cfg_path, path_prefix=repo_root)
    profiler.profile(pipeline_config=pipeline_config)
    assert Path("/tmp/profiler_main/profiler_extract.db").exists(), "Profiler extract database should be created"
35+
36+
37+
def test_profile_execution_with_invalid_config() -> None:
    """Pointing the profiler at a missing configuration file raises FileNotFoundError."""
    repo_root = Path(__file__).parent / "../../../"
    # Pure path arithmetic cannot raise, so it can live outside the block;
    # loading and executing the missing config is what must fail.
    missing_cfg = repo_root / "tests/resources/assessments/invalid_pipeline_config.yml"
    profiler = Profiler("synapse")
    with pytest.raises(FileNotFoundError):
        pipeline_config = profiler.path_modifier(config_file=missing_cfg, path_prefix=repo_root)
        profiler.profile(pipeline_config=pipeline_config)
48+
49+
50+
def test_profile_execution_config_override() -> None:
    """A rewritten config with absolute script paths overrides the bundled one."""
    with tempfile.TemporaryDirectory() as temp_dir:
        workdir = Path(temp_dir)
        resources = Path(__file__).parent / ".." / ".." / "resources" / "assessments"

        # Copy the extraction script into the temp directory so the rewritten
        # config can reference it by absolute path.
        script_src = resources / "db_extract.py"
        script_dest = workdir / script_src.name
        shutil.copy(script_src, script_dest)

        # Rewrite each step's extract_source to the copied script and save the
        # modified config next to it.
        config_src = resources / "pipeline_config_absolute.yml"
        with open(config_src, 'r', encoding="utf-8") as fh:
            config_data = yaml.safe_load(fh)
        for step in config_data['steps']:
            step['extract_source'] = str(script_dest)
        config_dest = workdir / config_src.name
        with open(config_dest, 'w', encoding="utf-8") as fh:
            yaml.safe_dump(config_data, fh)

        profiler = Profiler("synapse")
        pipeline_config = PipelineClass.load_config_from_yaml(config_dest)
        profiler.profile(pipeline_config=pipeline_config)
        assert Path(
            "/tmp/profiler_absolute/profiler_extract.db"
        ).exists(), "Profiler extract database should be created"

0 commit comments

Comments
 (0)