Skip to content

Commit 9f20d55

Browse files
Introduced Profiler Skeleton (#2021)
<!-- REMOVE IRRELEVANT COMMENTS BEFORE CREATING A PULL REQUEST --> ## Changes <!-- Summary of your changes that are easy to understand. Add screenshots when necessary, they're helpful to illustrate the before and after state --> ### What does this PR do? - Introduces the Profiler Skeleton for the Lakebridge project. - Adds initial Profiler class with supporting utilities and constants. - Implements core logic for profiling supported source technologies with placeholder support for MSSQL and Synapse. - Sets up the structure for profiling pipelines, including config file handling and extraction logic. ### Relevant implementation details ### Caveats/things to watch out for when reviewing: ### Linked issues <!-- DOC: Link issue with a keyword: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved. See https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword --> Resolves #.. ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [ ] modified existing command: `databricks labs lakebridge ...` - [ ] ... +add your own ### Tests <!-- How is this tested? Please see the checklist below and also describe any other relevant tests --> - [x] manually tested - [ ] added unit tests - [ ] added integration tests --------- Co-authored-by: Guenia Izquierdo <[email protected]>
1 parent 1c0d8df commit 9f20d55

File tree

4 files changed

+165
-3
lines changed

4 files changed

+165
-3
lines changed

src/databricks/labs/lakebridge/assessments/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
PRODUCT_NAME,
33
PRODUCT_PATH_PREFIX,
44
PROFILER_SOURCE_SYSTEM,
5-
PLATFORM_TO_SOURCE_TECHNOLOGY,
5+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG,
66
CONNECTOR_REQUIRED,
77
)
88

99
__all__ = [
1010
"PRODUCT_NAME",
1111
"PRODUCT_PATH_PREFIX",
1212
"PROFILER_SOURCE_SYSTEM",
13-
"PLATFORM_TO_SOURCE_TECHNOLOGY",
13+
"PLATFORM_TO_SOURCE_TECHNOLOGY_CFG",
1414
"CONNECTOR_REQUIRED",
1515
]

src/databricks/labs/lakebridge/assessments/_constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
PRODUCT_NAME = "lakebridge"
44
PRODUCT_PATH_PREFIX = Path.home() / ".databricks" / "labs" / PRODUCT_NAME / "lib"
55

6-
PLATFORM_TO_SOURCE_TECHNOLOGY = {
6+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG = {
77
"synapse": "src/databricks/labs/lakebridge/resources/assessments/synapse/pipeline_config.yml",
88
}
99

1010
# TODO: switch this to PLATFORM_TO_SOURCE_TECHNOLOGY_CFG.keys() once all platforms are supported
1111
PROFILER_SOURCE_SYSTEM = ["mssql", "synapse"]
1212

13+
1314
# This flag indicates whether a connector is required for the source system when the pipeline is triggered
1415
# For example in the case of synapse no connector is required and the python scripts
1516
# manage the connection by directly reading the credentials files
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import logging
2+
from pathlib import Path
3+
4+
from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
5+
from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig
6+
from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
7+
from databricks.labs.lakebridge.connections.credential_manager import (
8+
create_credential_manager,
9+
)
10+
from databricks.labs.lakebridge.connections.env_getter import EnvGetter
11+
from databricks.labs.lakebridge.assessments import (
12+
PRODUCT_NAME,
13+
PRODUCT_PATH_PREFIX,
14+
PLATFORM_TO_SOURCE_TECHNOLOGY_CFG,
15+
CONNECTOR_REQUIRED,
16+
)
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
class Profiler:
    """Profiles a supported source technology by running its assessment pipeline.

    Instances are normally built via :meth:`create`, which resolves the
    platform's bundled pipeline configuration (when one is registered in
    ``PLATFORM_TO_SOURCE_TECHNOLOGY_CFG``) relative to the product install
    prefix. A pre-built :class:`PipelineConfig` can also be injected directly,
    either at construction time or per-call via :meth:`profile`.
    """

    def __init__(self, platform: str, pipeline_configs: PipelineConfig | None = None):
        """Store the source platform name and an optional pipeline configuration.

        :param platform: Source system name (e.g. ``"synapse"``).
        :param pipeline_configs: Optional pre-loaded pipeline configuration;
            when ``None``, one must be supplied to :meth:`profile`.
        """
        self._platform = platform
        self._pipeline_config = pipeline_configs

    @classmethod
    def create(cls, platform: str) -> "Profiler":
        """Build a profiler, loading the platform's bundled pipeline config when one exists.

        Platforms without a registered config (see ``supported_platforms``)
        still get a profiler, but callers must pass a config to ``profile``.
        """
        pipeline_config_path = PLATFORM_TO_SOURCE_TECHNOLOGY_CFG.get(platform)
        pipeline_config = None
        if pipeline_config_path:
            pipeline_config_absolute_path = cls._locate_config(pipeline_config_path)
            pipeline_config = cls.path_modifier(config_file=pipeline_config_absolute_path)
        return cls(platform, pipeline_config)

    @classmethod
    def supported_platforms(cls) -> list[str]:
        """Return the platforms that ship a bundled pipeline configuration."""
        return list(PLATFORM_TO_SOURCE_TECHNOLOGY_CFG)

    @staticmethod
    def path_modifier(*, config_file: str | Path, path_prefix: Path = PRODUCT_PATH_PREFIX) -> PipelineConfig:
        """Load a pipeline config and prefix every step's ``extract_source`` with *path_prefix*.

        :param config_file: Path to the pipeline YAML configuration.
        :param path_prefix: Root the relative extract scripts are resolved against;
            defaults to the product install prefix.
        :return: The loaded configuration with absolute extract-source paths.
        """
        # TODO: make this also work for a developer-mode (editable) install
        config = PipelineClass.load_config_from_yaml(config_file)
        for step in config.steps:
            step.extract_source = f"{path_prefix}/{step.extract_source}"
        return config

    def profile(
        self,
        *,
        extractor: DatabaseManager | None = None,
        pipeline_config: PipelineConfig | None = None,
    ) -> None:
        """Run the profiling pipeline for this profiler's platform.

        :param extractor: Optional database connection manager; when ``None``
            and the platform requires a connector, one is created from stored
            credentials.
        :param pipeline_config: Optional per-call config override; falls back
            to the config supplied at construction time.
        :raises ValueError: If no pipeline configuration is available.
        :raises FileNotFoundError: If the pipeline's configuration file is missing.
        :raises RuntimeError: If pipeline execution fails for any other reason.
        """
        platform = self._platform.lower()
        if not pipeline_config:
            if not self._pipeline_config:
                raise ValueError(f"Cannot Proceed without a valid pipeline configuration for {platform}")
            pipeline_config = self._pipeline_config
        self._execute(platform, pipeline_config, extractor)

    @staticmethod
    def _setup_extractor(platform: str) -> DatabaseManager | None:
        """Create a database connection manager for *platform*, or ``None`` when not needed.

        Some platforms (e.g. synapse) manage their own connections inside the
        extraction scripts, so no connector is required.
        """
        if not CONNECTOR_REQUIRED[platform]:
            return None
        cred_manager = create_credential_manager(PRODUCT_NAME, EnvGetter())
        connect_config = cred_manager.get_credentials(platform)
        return DatabaseManager(platform, connect_config)

    def _execute(
        self,
        platform: str,
        pipeline_config: PipelineConfig,
        extractor: DatabaseManager | None = None,
    ) -> None:
        """Execute the pipeline, wrapping unexpected failures in ``RuntimeError``."""
        try:
            if extractor is None:
                extractor = Profiler._setup_extractor(platform)

            result = PipelineClass(pipeline_config, extractor).execute()
            # Lazy %-style args avoid building the message unless the record is emitted.
            logger.info(
                "Profile execution has completed successfully for %s for more info check: %s.", platform, result
            )
        except FileNotFoundError as e:
            logger.error("Configuration file not found for source %s: %s", platform, e)
            raise FileNotFoundError(f"Configuration file not found for source {platform}: {e}") from e
        except Exception as e:
            logger.error("Error executing pipeline for source %s: %s", platform, e)
            raise RuntimeError(f"Pipeline execution failed for source {platform} : {e}") from e

    @staticmethod
    def _locate_config(config_path: str | Path) -> Path:
        """Resolve *config_path* under the product install prefix, failing fast if absent."""
        config_file = PRODUCT_PATH_PREFIX / config_path
        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_file}")
        return config_file
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from pathlib import Path
2+
3+
import shutil
4+
import tempfile
5+
import yaml
6+
import pytest
7+
8+
from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
9+
from databricks.labs.lakebridge.assessments.profiler import Profiler
10+
11+
12+
def test_supported_source_technologies() -> None:
    """The profiler reports its supported source technologies as a list."""
    platforms = Profiler("synapse", None).supported_platforms()
    assert isinstance(platforms, list)
    assert "synapse" in platforms
18+
19+
20+
def test_profile_missing_platform_config() -> None:
    """Profiling without any pipeline configuration raises ValueError."""
    # Construct outside the raises block: only the call under test should be
    # inside the context manager, so a constructor failure surfaces as a
    # test error rather than a false pass.
    profiler = Profiler("synapse", None)
    with pytest.raises(ValueError, match="Cannot Proceed without a valid pipeline configuration for synapse"):
        profiler.profile()
25+
26+
27+
def test_profile_execution() -> None:
    """A real pipeline configuration executes end-to-end and produces the extract DB."""
    repo_root = Path(__file__).parent / "../../../"
    cfg_path = repo_root / "tests/resources/assessments/pipeline_config_main.yml"
    profiler = Profiler("synapse")
    pipeline_config = profiler.path_modifier(config_file=cfg_path, path_prefix=repo_root)
    profiler.profile(pipeline_config=pipeline_config)
    assert Path("/tmp/profiler_main/profiler_extract.db").exists(), "Profiler extract database should be created"
35+
36+
37+
def test_profile_execution_with_invalid_config() -> None:
    """Pointing the profiler at a missing configuration file raises FileNotFoundError."""
    repo_root = Path(__file__).parent / "../../../"
    # Pure path arithmetic cannot raise, so it can live outside the block;
    # loading and executing the missing config is what must fail.
    missing_cfg = repo_root / "tests/resources/assessments/invalid_pipeline_config.yml"
    profiler = Profiler("synapse")
    with pytest.raises(FileNotFoundError):
        pipeline_config = profiler.path_modifier(config_file=missing_cfg, path_prefix=repo_root)
        profiler.profile(pipeline_config=pipeline_config)
48+
49+
50+
def test_profile_execution_config_override() -> None:
    """A rewritten config with absolute script paths overrides the bundled one."""
    with tempfile.TemporaryDirectory() as temp_dir:
        workdir = Path(temp_dir)
        resources = Path(__file__).parent / ".." / ".." / "resources" / "assessments"

        # Copy the extraction script into the temp directory so the rewritten
        # config can reference it by absolute path.
        script_src = resources / "db_extract.py"
        script_dest = workdir / script_src.name
        shutil.copy(script_src, script_dest)

        # Rewrite each step's extract_source to the copied script and save the
        # modified config next to it.
        config_src = resources / "pipeline_config_absolute.yml"
        with open(config_src, 'r', encoding="utf-8") as fh:
            config_data = yaml.safe_load(fh)
        for step in config_data['steps']:
            step['extract_source'] = str(script_dest)
        config_dest = workdir / config_src.name
        with open(config_dest, 'w', encoding="utf-8") as fh:
            yaml.safe_dump(config_data, fh)

        profiler = Profiler("synapse")
        pipeline_config = PipelineClass.load_config_from_yaml(config_dest)
        profiler.profile(pipeline_config=pipeline_config)
        assert Path(
            "/tmp/profiler_absolute/profiler_extract.db"
        ).exists(), "Profiler extract database should be created"

0 commit comments

Comments
 (0)