
Commit 18b0796

Merge branch 'main' into refactor/improve-recon-persistence

2 parents: ade062d + a678bfe


54 files changed: +528, −455 lines

labs.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 name: lakebridge
 description: Code Transpiler and Data Reconciliation tool for Accelerating Data onboarding to Databricks from EDW, CDW and other ETL sources.
 install:
-  script: src/databricks/labs/lakebridge/base_install.py
+  script: src/databricks/labs/lakebridge/install.py
 uninstall:
   script: src/databricks/labs/lakebridge/uninstall.py
 entrypoint: src/databricks/labs/lakebridge/cli.py

src/databricks/labs/lakebridge/__init__.py

Lines changed: 17 additions & 0 deletions
@@ -1,9 +1,26 @@
+import logging
+
 from databricks.sdk.core import with_user_agent_extra, with_product
+from databricks.labs.blueprint.entrypoint import is_in_debug
 from databricks.labs.blueprint.logger import install_logger
 from databricks.labs.lakebridge.__about__ import __version__
 
+# Ensure that anything that imports this (or lower) submodules triggers setup of the blueprint logging.
 install_logger()
 
+
+def initialize_logging() -> None:
+    """Common logging initialisation for non-CLI entry-points."""
+    # This is intended to be used by all the non-CLI entry-points, such as install/uninstall hooks and pipeline tasks.
+    # It emulates the behaviour of the blueprint App() initialisation, except that we don't have handoff from the
+    # Databricks CLI. As such the policy is:
+    # - The root (and logging system in general) is left alone.
+    # - If running in the IDE debugger, databricks.* will be set to DEBUG.
+    # - Otherwise, databricks.* will be set to INFO.
+    databricks_log_level = logging.DEBUG if is_in_debug() else logging.INFO
+    logging.getLogger("databricks").setLevel(databricks_log_level)
+
+
 # Add lakebridge/<version> for projects depending on lakebridge as a library
 with_user_agent_extra("lakebridge", __version__)
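
For orientation, the new helper is meant to be called once at the top of each non-CLI entry point. A minimal sketch of such a caller (the task module and its body are hypothetical, not part of this commit):

import logging

from databricks.labs.lakebridge import initialize_logging

logger = logging.getLogger(__name__)


def main() -> None:
    # Raises databricks.* to DEBUG under an IDE debugger, INFO otherwise.
    initialize_logging()
    logger.info("Task starting.")


if __name__ == "__main__":
    main()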

src/databricks/labs/lakebridge/assessments/configure_assessment.py

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@
 from databricks.labs.lakebridge.assessments import CONNECTOR_REQUIRED
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
 
 
 def _save_to_disk(credential: dict, cred_file: Path) -> None:

src/databricks/labs/lakebridge/assessments/dashboards/execute.py

Lines changed: 20 additions & 11 deletions
@@ -1,25 +1,33 @@
 import logging
 import os
 import sys
+from collections.abc import Sequence
+from importlib import resources
+from importlib.abc import Traversable
 from pathlib import Path
-import yaml
-from yaml.parser import ParserError
-from yaml.scanner import ScannerError
 
 import duckdb
+import yaml
 from pyspark.sql import SparkSession
+from yaml.parser import ParserError
+from yaml.scanner import ScannerError
 
+import databricks.labs.lakebridge.resources.assessments as assessment_resources
 from databricks.labs.lakebridge.assessments.profiler_validator import (
     EmptyTableValidationCheck,
     build_validation_report,
     ExtractSchemaValidationCheck,
     build_validation_report_dataframe,
 )
+from databricks.labs.lakebridge import initialize_logging
 
 logger = logging.getLogger(__name__)
 
 
-def main(*argv) -> None:
+def main(*argv: str) -> None:
+    """Lakeview Jobs task entry point: profiler_dashboards"""
+    initialize_logging()
+
     logger.debug(f"Arguments received: {argv}")
     assert len(sys.argv) == 4, f"Invalid number of arguments: {len(sys.argv)}"
     catalog_name = sys.argv[0]
@@ -34,22 +42,22 @@ def main(*argv) -> None:
         raise ValueError("Corrupt or invalid profiler extract.")
 
 
-def _get_extract_tables(schema_def_path: str) -> list:
+def _get_extract_tables(schema_def_path: Path | Traversable) -> Sequence[tuple[str, str, str]]:
     """
     Given a schema definition file for a source technology, returns a list of table info tuples:
     (schema_name, table_name, fully_qualified_name)
     """
     # First, load the schema definition file
     try:
-        with open(schema_def_path, 'r', encoding="UTF-8") as f:
+        with schema_def_path.open(mode="r", encoding="utf-8") as f:
             data = yaml.safe_load(f)
     except (ParserError, ScannerError) as e:
         raise ValueError(f"Could not read extract schema definition '{schema_def_path}': {e}") from e
     except FileNotFoundError as e:
         raise FileNotFoundError(f"Schema definition not found: {schema_def_path}") from e
     # Iterate through the defined schemas and build a list of
     # table info tuples: (schema_name, table_name, fully_qualified_name)
-    extracted_tables = []
+    extracted_tables: list[tuple[str, str, str]] = []
     for schema_name, schema_def in data.get("schemas", {}).items():
         tables = schema_def.get("tables", {})
         for table_name in tables.keys():
@@ -64,10 +72,11 @@ def _validate_profiler_extract(
 ) -> bool:
     logger.info("Validating the profiler extract file.")
     validation_checks: list[EmptyTableValidationCheck | ExtractSchemaValidationCheck] = []
-    schema_def_path = f"{Path(__file__).parent}/../../resources/assessments/{source_tech}_schema_def.yml"
-    tables = _get_extract_tables(schema_def_path)
+    # TODO: Verify this, I don't think it works? (These files are part of the test resources.)
+    schema_def = resources.files(assessment_resources).joinpath(f"{source_tech}_schema_def.yml")
+    tables = _get_extract_tables(schema_def)
     try:
-        with duckdb.connect(database=extract_location) as duck_conn:
+        with duckdb.connect(database=extract_location) as duck_conn, resources.as_file(schema_def) as schema_def_path:
            for table_info in tables:
                # Ensure that the table contains data
                empty_check = EmptyTableValidationCheck(table_info[2])
@@ -79,7 +88,7 @@
                    table_info[1],
                    source_tech=source_tech,
                    extract_path=extract_location,
-                   schema_path=schema_def_path,
+                   schema_path=str(schema_def_path),
                )
                validation_checks.append(schema_check)
            report = build_validation_report(validation_checks, duck_conn)
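
The move from a Path(__file__)-relative string to importlib.resources is what lets the schema definitions be found even when the package is installed as a zipped wheel (though the in-diff TODO questions whether these files ship with the package at all). A minimal sketch of the pattern, with a hypothetical data package and file name:

from importlib import resources

import my_package.data as data_pkg  # hypothetical package containing data files

schema_def = resources.files(data_pkg).joinpath("example_schema_def.yml")
# as_file() yields a concrete filesystem path, materialising a temporary
# copy if the package is zipped, for APIs that need a str or Path.
with resources.as_file(schema_def) as schema_path:
    text = schema_path.read_text(encoding="utf-8")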

src/databricks/labs/lakebridge/assessments/pipeline.py

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@
 from databricks.labs.lakebridge.connections.database_manager import DatabaseManager, FetchResult
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
 
 DB_NAME = "profiler_extract.db"
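
This file and configure_assessment.py both stop calling logger.setLevel(logging.INFO) at import time. The point of removing it: a module logger left at the default NOTSET defers to its nearest configured ancestor, so the level set once on the "databricks" logger by initialize_logging() (or by the CLI App) takes effect everywhere. A small sketch of that behaviour:

import logging

logger = logging.getLogger("databricks.labs.example")  # hypothetical module logger

logging.basicConfig()
logging.getLogger("databricks").setLevel(logging.INFO)
# The child logger is NOTSET, so its effective level is inherited:
assert logger.getEffectiveLevel() == logging.INFO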

src/databricks/labs/lakebridge/assessments/profiler.py

Lines changed: 3 additions & 4 deletions
@@ -39,11 +39,10 @@ def supported_platforms(cls) -> list[str]:
 
     @staticmethod
     def path_modifier(*, config_file: str | Path, path_prefix: Path = PRODUCT_PATH_PREFIX) -> PipelineConfig:
-        # TODO: Make this work install during developer mode
+        # TODO: Choose a better name for this.
         config = PipelineClass.load_config_from_yaml(config_file)
-        for step in config.steps:
-            step.extract_source = f"{path_prefix}/{step.extract_source}"
-        return config
+        new_steps = [step.copy(extract_source=str(path_prefix / step.extract_source)) for step in config.steps]
+        return config.copy(steps=new_steps)
 
     def profile(
         self,

Lines changed: 11 additions & 12 deletions
@@ -1,30 +1,29 @@
+import dataclasses
 from dataclasses import dataclass, field
 
 
-@dataclass
+@dataclass(frozen=True)
 class Step:
     name: str
     type: str | None
     extract_source: str
-    mode: str | None
-    frequency: str | None
-    flag: str | None
+    mode: str = "append"
+    frequency: str = "once"
+    flag: str = "active"
     dependencies: list[str] = field(default_factory=list)
     comment: str | None = None
 
-    def __post_init__(self):
-        if self.frequency is None:
-            self.frequency = "once"
-        if self.flag is None:
-            self.flag = "active"
-        if self.mode is None:
-            self.mode = "append"
+    def copy(self, /, **changes) -> "Step":
+        return dataclasses.replace(self, **changes)
 
 
-@dataclass
+@dataclass(frozen=True)
 class PipelineConfig:
     name: str
     version: str
     extract_folder: str
     comment: str | None = None
     steps: list[Step] = field(default_factory=list)
+
+    def copy(self, /, **changes) -> "PipelineConfig":
+        return dataclasses.replace(self, **changes)
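
With frozen=True these configs are immutable, so call sites such as path_modifier above update them by copying instead of mutating. A short usage sketch (field values are made up):

step = Step(name="extract", type=None, extract_source="queries/extract.sql")
# Attribute assignment on a frozen dataclass raises FrozenInstanceError;
# copy() returns a new Step with only the named fields replaced.
prefixed = step.copy(extract_source="/opt/prefix/queries/extract.sql")
assert step.extract_source == "queries/extract.sql"  # original unchanged
assert prefixed.mode == "append"  # defaults now live on the field declarations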

src/databricks/labs/lakebridge/assessments/profiler_validator.py

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass
 from collections.abc import Sequence
+from pathlib import Path
 
 import yaml
 from duckdb import DuckDBPyConnection, CatalogException, ParserException, Error
@@ -201,7 +202,7 @@ def validate(self, connection) -> ValidationOutcome:
         )
 
 
-def get_profiler_extract_path(pipeline_config_path: str) -> str:
+def get_profiler_extract_path(pipeline_config_path: Path) -> Path:
     """
     Returns the filesystem path of the profiler extract database.
     input:
@@ -211,7 +212,7 @@ def get_profiler_extract_path(pipeline_config_path: str) -> str:
     """
     pipeline_config = PipelineClass.load_config_from_yaml(pipeline_config_path)
     normalized_db_path = os.path.normpath(pipeline_config.extract_folder)
-    database_path = f"{normalized_db_path}/{PROFILER_DB_NAME}"
+    database_path = Path(normalized_db_path) / PROFILER_DB_NAME
     return database_path
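
Returning a Path instead of a formatted string keeps the join portable and lets callers chain pathlib operations directly. The change in isolation, with a hypothetical extract_folder value (and assuming PROFILER_DB_NAME matches the DB_NAME constant seen in pipeline.py):

import os
from pathlib import Path

PROFILER_DB_NAME = "profiler_extract.db"

extract_folder = "profiles/./output"  # hypothetical config value
normalized_db_path = os.path.normpath(extract_folder)  # "profiles/output"
database_path = Path(normalized_db_path) / PROFILER_DB_NAME
assert database_path == Path("profiles/output/profiler_extract.db")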

src/databricks/labs/lakebridge/base_install.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

src/databricks/labs/lakebridge/cli.py

Lines changed: 24 additions & 14 deletions
@@ -16,7 +16,7 @@
 from databricks.sdk import WorkspaceClient
 
 from databricks.labs.blueprint.cli import App
-from databricks.labs.blueprint.entrypoint import is_in_debug
+from databricks.labs.blueprint.entrypoint import get_logger
 from databricks.labs.blueprint.installation import RootJsonValue, JsonObject, JsonValue
 from databricks.labs.blueprint.tui import Prompts
@@ -30,7 +30,6 @@
 from databricks.labs.lakebridge.connections.credential_manager import cred_file
 from databricks.labs.lakebridge.helpers.recon_config_utils import ReconConfigPrompts
 from databricks.labs.lakebridge.helpers.telemetry_utils import make_alphanum_or_semver
-from databricks.labs.lakebridge.install import installer
 from databricks.labs.lakebridge.reconcile.runner import ReconcileRunner
 from databricks.labs.lakebridge.lineage import lineage_generator
 from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME, AGG_RECONCILE_OPERATION_NAME
@@ -48,7 +47,6 @@
 
 # Subclass to allow controlled access to protected methods.
 class Lakebridge(App):
-    _logger_instance: logging.Logger | None = None
 
     def create_workspace_client(self) -> WorkspaceClient:
         """Create a workspace client, with the appropriate product and version information.
@@ -58,15 +56,25 @@ def create_workspace_client(self) -> WorkspaceClient:
         self._patch_databricks_host()
         return self._workspace_client()
 
-    def get_logger(self) -> logging.Logger:
-        if self._logger_instance is None:
-            self._logger_instance = self._logger
-            self._logger_instance.setLevel(logging.INFO)
-        return self._logger_instance
+    def _log_level(self, raw: str) -> int:
+        """Convert the log-level provided by the Databricks CLI into a logging level supported by Python."""
+        log_level = super()._log_level(raw)
+        # Due to an issue in the handoff of the intended logging level from the Databricks CLI to our
+        # application, we can't currently distinguish between --log-level=WARN and nothing at all, where we
+        # prefer (and the application logging expects) INFO.
+        #
+        # Rather than default to only have WARNING logs show, it's preferable to default to INFO and have
+        # --log-level=WARN not work for now.
+        #
+        # See: https://github.com/databrickslabs/lakebridge/issues/2167
+        # TODO: Remove this once #2167 has been resolved.
+        if log_level == logging.WARNING:
+            log_level = logging.INFO
+        return log_level
 
 
 lakebridge = Lakebridge(__file__)
-logger = lakebridge.get_logger()
+logger = get_logger(__file__)
 
 
 def raise_validation_exception(msg: str) -> NoReturn:
@@ -745,6 +753,9 @@ def install_transpile(
     transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
 ) -> None:
     """Install or upgrade the Lakebridge transpilers."""
+    # Avoid circular imports.
+    from databricks.labs.lakebridge.install import installer  # pylint: disable=cyclic-import, import-outside-toplevel
+
     is_interactive = interactive_mode(interactive)
     ctx = ApplicationContext(w)
     ctx.add_user_agent_extra("cmd", "install-transpile")
@@ -804,6 +815,9 @@ def configure_reconcile(
     transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
 ) -> None:
     """Configure the Lakebridge reconciliation module"""
+    # Avoid circular imports.
+    from databricks.labs.lakebridge.install import installer  # pylint: disable=cyclic-import, import-outside-toplevel
+
     ctx = ApplicationContext(w)
     ctx.add_user_agent_extra("cmd", "configure-reconcile")
     user = w.current_user
@@ -1017,8 +1031,4 @@ def create_profiler_dashboard(
 
 
 if __name__ == "__main__":
-    app = lakebridge
-    logger = app.get_logger()
-    if is_in_debug():
-        logger.setLevel(logging.DEBUG)
-    app()
+    lakebridge()
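
The two function-level imports of installer are the standard way to break an import cycle: cli no longer needs install at import time, so both modules can finish initialising before the name is resolved. A minimal sketch of the pattern with two hypothetical modules:

# a.py
import b  # safe: b does not import a at module scope

def run() -> None:
    b.helper()

# b.py
def helper() -> None:
    # Deferring the import to call time breaks the a -> b -> a cycle:
    # by the time helper() runs, both modules are fully initialised.
    from a import run  # pylint: disable=import-outside-toplevel
    print(run.__name__)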
