kbase
diff --git a/‎src/data_lakehouse_ingest/core.py‎
Lines changed: 7 additions & 9 deletions b/‎src/data_lakehouse_ingest/core.py‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎src/data_lakehouse_ingest/orchestrator/init_utils.py‎
Lines changed: 12 additions & 19 deletions b/‎src/data_lakehouse_ingest/orchestrator/init_utils.py‎
Lines changed: 12 additions & 19 deletions
diff --git a/‎src/data_lakehouse_ingest/orchestrator/io_utils.py‎
Lines changed: 77 additions & 37 deletions b/‎src/data_lakehouse_ingest/orchestrator/io_utils.py‎
Lines changed: 77 additions & 37 deletions
diff --git a/‎src/data_lakehouse_ingest/orchestrator/models.py‎
Lines changed: 7 additions & 15 deletions b/‎src/data_lakehouse_ingest/orchestrator/models.py‎
Lines changed: 7 additions & 15 deletions
@@ -21,7 +21,7 @@
 from .orchestrator.table_batch_processor import process_tables
 
 from berdl_notebook_utils.setup_spark_session import get_spark_session
-from berdl_notebook_utils.clients import get_minio_client
+from berdl_notebook_utils.clients import get_s3_client
 
 
 def ingest(
@@ -55,7 +55,7 @@ def ingest(
     Notes:
         - SparkSession may be provided by the caller or auto-initialized.
         - A valid MinIO client is REQUIRED for ingestion.
-          If `minio_client` is not provided, `get_minio_client()` is attempted.
+          If `minio_client` is not provided, `get_s3_client()` is attempted.
           If MinIO cannot be initialized, the pipeline fails immediately.
         - Supports multiple file formats (CSV, TSV, JSON, XML, Parquet).
         - Schema enforcement supports SQL-style schemas (`schema_sql`) and structured schemas (`schema` list-of-maps).
@@ -101,16 +101,14 @@ def ingest(
     # MinIO Client Initialization
     # ----------------------------------------------------------------------
     if minio_client is None:
-        logger.info(
-            "No MinIO client provided — attempting auto-initialization via get_minio_client()"
-        )
+        logger.info("No MinIO client provided — attempting auto-initialization via get_s3_client()")
         try:
-            minio_client = get_minio_client()
-            logger.info("MinIO client successfully initialized via get_minio_client()")
+            minio_client = get_s3_client()
+            logger.info("MinIO client successfully initialized via get_s3_client()")
         except Exception as e:
             error_msg = (
                 "MinIO client is required for ingestion but could not be initialized. "
-                "Call get_minio_client() and pass it explicitly into ingest(...)."
+                "Call get_s3_client() and pass it explicitly into ingest(...)."
             )
             return log_error(
                 logger=logger,
@@ -120,7 +118,7 @@ def ingest(
                 exc=e,
             )
 
-    # Defensive check in case get_minio_client() returned None without raising
+    # Defensive check in case get_s3_client() returned None without raising
     if minio_client is None:
         error_msg = "MinIO client is required for ingestion but was not provided or initialized."
         return log_error(
 
@@ -1,7 +1,7 @@
 """
 Initialization utilities for the Data Lakehouse Ingest framework.
 Handles logger setup and Spark session context initialization,
-including tenant creation, catalog switching, and configuration extraction.
+including tenant creation, namespace management, and configuration extraction.
 """
 
 import logging
@@ -62,9 +62,11 @@ def init_run_context(
     """
     Initialize the ingestion run context based on config.
 
-    Uses the JupyterHub helper `create_namespace_if_not_exists` instead of
-    SQL CREATE DATABASE statements. The behavior depends on the 'is_tenant'
-    flag in the config.
+    Uses the Iceberg catalog flow via `create_namespace_if_not_exists(iceberg=True)`
+    to create namespaces with catalog-level isolation (no governance prefixes).
+
+    The catalog is determined by the tenant name: tenant-based configs use the
+    tenant name as the catalog, while personal configs use the ``"my"`` catalog.
 
     Args:
         spark (SparkSession): Active Spark session.
@@ -88,32 +90,24 @@ def init_run_context(
     logger.info(f"Found {len(tables)} table(s) to process")
 
     # ----------------------------------------------------------------------
-    # Create namespace using JupyterHub helper
+    # Create namespace using Iceberg catalog flow
     # ----------------------------------------------------------------------
     try:
         if tenant:
-            # Multi-tenant governed environment
+            # Multi-tenant: tenant name is used as the Iceberg catalog name
             namespace = create_namespace_if_not_exists(
                 spark,
                 namespace=dataset,
                 tenant_name=tenant,
+                iceberg=True,
             )
             logger.info(f"Tenant namespace created/accessed: {namespace}")
         else:
-            # Personal (user-level) environment
-            namespace = create_namespace_if_not_exists(spark, dataset)
+            # Personal: uses the "my" catalog
+            namespace = create_namespace_if_not_exists(spark, dataset, iceberg=True)
             logger.info(f"Personal namespace created/accessed: {namespace}")
 
-        spark.catalog.setCurrentDatabase(namespace)
-
-        # Extract physical namespace path
-        try:
-            ns_info = spark.sql(f"DESCRIBE NAMESPACE EXTENDED {namespace}").collect()
-            base_path = [r.info_value for r in ns_info if r.info_name.lower() == "location"][0]
-            logger.info(f"Namespace storage location: {base_path}")
-        except Exception as e:
-            logger.warning(f"Unable to determine namespace storage location for '{namespace}': {e}")
-            base_path = None
+        spark.sql(f"USE {namespace}")
 
     except Exception as e:
         logger.error(
@@ -127,6 +121,5 @@ def init_run_context(
         "tenant": tenant,
         "dataset": dataset,
         "namespace": namespace,
-        "namespace_base_path": base_path,
         "tables": tables,
     }
@@ -1,10 +1,10 @@
 """
 Input/output utilities for the Data Lakehouse Ingest framework.
 Handles file format detection, data loading from Bronze sources,
-and writing curated data to Silver Delta tables.
+and writing curated data to Silver tables via Iceberg catalogs.
 
 Provides a unified interface for reading CSV, TSV, JSON, and XML formats,
-and ensures consistent creation and registration of Delta tables in Spark.
+and writes tables using catalog-driven APIs (no explicit path management).
 """
 
 import logging
@@ -23,19 +23,18 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
     Determines the file format based on either an explicit configuration
     value (`explicit_fmt`) or by inspecting the file extension.
 
-    Supported extensions: `.csv`, `.tsv`, `.json`, `.xml`.
+    Supported extensions: `.csv`, `.tsv`, `.json`, `.xml`, `.parquet`.
 
     Args:
         bronze_path (str): Full S3/local path to the input data file.
-        explicit_fmt (str | None): Optional explicit format (csv, tsv, json, xml).
+        explicit_fmt (str | None): Optional explicit format (csv, tsv, json, xml, parquet).
 
     Returns:
-        str: The detected format name ("csv", "tsv", "json", or "xml").
+        str: The detected format name.
 
     Notes:
         - Explicit format overrides file extension detection.
         - Defaults to "csv" when no recognizable extension is found.
-        - Ensures consistent downstream loader selection in ingestion pipelines.
     """
 
     # TODO: Explore using python-magic or content-based format detection.
@@ -51,7 +50,6 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
     if explicit_fmt:
         return explicit_fmt.lower()
 
-    # Map file extensions to formats
     extension_map = {
         "csv": "csv",
         "tsv": "tsv",
@@ -61,7 +59,7 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
     }
 
     ext = bronze_path.split(".")[-1].lower()
-    return extension_map.get(ext, "csv")  # default fallback
+    return extension_map.get(ext, "csv")
 
 
 def load_table_data(
@@ -70,9 +68,12 @@ def load_table_data(
     fmt: str,
     opts: dict,
     logger: logging.Logger,
-) -> tuple[object, int]:
+) -> tuple[DataFrame, int]:
     """
-    Loads a DataFrame and returns (df, rows_in).
+    Load source data into a DataFrame and return the DataFrame with its input row count.
+
+    Returns:
+        tuple[DataFrame, int]: The loaded DataFrame and number of rows read from the source.
     """
     fmt_to_loader = {
         "json": load_json_data,
@@ -91,49 +92,88 @@ def load_table_data(
     return df, rows_in
 
 
-def write_to_delta(
+def table_exists(spark: SparkSession, full_table: str) -> bool:
+    """
+    Check whether a catalog table exists.
+
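+    Probes the table with a ``limit(1).count()`` read through ``spark.table`` so it
+    works with fully qualified catalog table names, including Iceberg tables.
+
+    Note: any exception raised by the probe (not only a missing table) is
+    treated as "table does not exist" and results in ``False``.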
+    """
+    try:
+        spark.table(full_table).limit(1).count()
+        return True
+    except Exception:
+        return False
+
+
+def write_table(
     df: DataFrame,
     spark: SparkSession,
     namespace: str,
-    namespace_base_path: str,
     name: str,
-    silver_path: str,
     partition_by: str | list[str] | None,
     mode: str,
+    rows_in: int,
     logger: logging.Logger,
 ) -> int:
-    # TODO: Explore replacing explicit `table_path` writes with a catalog-driven approach.
-    #
-    # Goal:
-    #   Eliminate the need to manually construct and manage table paths (namespace_base_path/name)
-    #   by allowing Spark to handle initial table creation and location assignment.
+    """
+    Write a DataFrame to a table using catalog-driven Iceberg APIs.
 
-    # Construct deterministic table path inside namespace storage location
-    table_path = f"{namespace_base_path}/{name}"
+    The Iceberg catalog manages table storage locations, so this function does
+    not construct explicit paths or use LOCATION clauses. For overwrite mode,
+    the table is created or replaced. For append mode, the table must already
+    exist.
 
-    logger.info(f"Resolved Delta target path: {table_path}")
+    Args:
+        df: DataFrame to write.
+        spark: Active SparkSession.
+        namespace: Fully qualified namespace (e.g., ``my.dataset`` or ``kbase.dataset``).
+        name: Table name.
+        partition_by: Optional partition column(s).
+        mode: Write mode (case-insensitive). Falls back to ``"overwrite"`` when
+              ``None`` or empty. Supported values are ``"overwrite"`` and ``"append"``.
+        rows_in: Number of rows read from the source DataFrame. This value is
+                 returned and logged as rows written, rather than counting the
+                 full target table after write.
+        logger: Logger for structured output.
 
-    rows_written = df.count()
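+    Returns:
+        Number of rows written (the ``rows_in`` value passed by the caller).
+
+    Raises:
+        ValueError: If ``mode`` is not ``"overwrite"`` or ``"append"``, or if
+            ``mode`` is ``"append"`` and the target table does not exist.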
+    """
 
-    # Write (with overwriteSchema only for overwrite mode)
-    writer = df.write.format("delta").mode(mode)
+    full_table = f"{namespace}.{name}"
+    # Default mode
+    mode = (mode or "overwrite").lower()
 
-    if mode == "overwrite":
-        writer = writer.option("overwriteSchema", "true")
+    if mode not in {"overwrite", "append"}:
+        raise ValueError(
+            f"Unsupported write mode '{mode}' for {full_table}. "
+            "Supported modes are 'overwrite' and 'append'."
+        )
 
-    if partition_by:
-        writer = writer.partitionBy(partition_by)
+    exists = table_exists(spark, full_table)
+
+    logger.info(f"Writing table: {full_table} (mode={mode}, exists={exists})")
 
-    writer.save(table_path)
+    if mode == "append" and not exists:
+        raise ValueError(
+            f"Cannot append to {full_table} because the table does not exist. "
+            "Use mode='overwrite' or omit mode to create the table."
+        )
+
+    rows_written = rows_in
+
+    writer = df.writeTo(full_table)
+
+    if partition_by:
+        cols = [partition_by] if isinstance(partition_by, str) else list(partition_by)
+        writer = writer.partitionedBy(*cols)
 
-    # Register table if missing (no schema overwrite here!)
-    spark.sql(f"""
-        CREATE TABLE IF NOT EXISTS `{namespace}`.`{name}`
-        USING DELTA
-        LOCATION '{table_path}'
-    """)
+    if mode == "append":
+        writer.append()
+    else:
+        writer.createOrReplace()
 
-    # log rows
-    logger.info(f"Wrote {rows_written} rows → {namespace}.{name} @ {table_path}")
+    logger.info(f"Wrote {rows_written} rows → {full_table}")
 
     return rows_written
@@ -9,7 +9,7 @@
 Two result types are defined:
 
 - TableProcessSuccess: Returned when a table is successfully processed and
-  written to the Silver Delta layer. Includes metrics such as rows read,
+  written to the target Silver table. Includes metrics such as rows read,
   rows written, elapsed time, and optional comment application results.
 
 - TableProcessFailure: Returned when a table fails during processing. Contains
@@ -51,7 +51,7 @@ class InputSource(Enum):
 
 class WriteMode(Enum):
     """
-    Supported write modes when writing Delta tables.
+    Supported write modes when writing target tables.
     """
 
     OVERWRITE = "overwrite"
@@ -64,30 +64,26 @@ class TableProcessSuccess:
     Represents a successful table ingestion result.
 
     This object captures metadata and metrics produced when a table is
-    successfully processed and written to the Silver Delta layer.
+    successfully processed and written to the target Silver table.
 
     Attributes:
         name: Table name.
         tenant: Tenant identifier associated with the ingestion run.
         target_table: Fully qualified target table name in the Silver layer.
-        mode: Write mode used when writing to Delta. Represented by the WriteMode enum.
+        mode: Write mode used when writing the target table. Represented by the WriteMode enum.
         format: Detected input file format when reading from Bronze storage.
         schema_source: Origin of the resolved schema. Represented by the SchemaSource enum.
         input_source: Indicates whether input was read from Bronze storage or provided
             as a Spark DataFrame override. Represented by the InputSource enum.
         bronze_path: Source path in Bronze storage if applicable.
-        silver_path: Target storage path where the Delta table is written.
         rows_in: Number of input rows read.
-        rows_written: Number of rows written to the Silver Delta table.
-        rows_rejected: Number of rows rejected during processing.
+        rows_written: Number of rows written to the target table.
         extra_columns_dropped: Columns dropped because they were not present in the schema.
-        partitions_written: List of partitions written (if partitioning is used).
-        quarantine_path: Location where rejected records would be stored.
         elapsed_sec: Processing time in seconds.
         status: Processing status represented by the ProcessStatus enum.
-        table_comment_report: Result of applying Delta table-level comments when
+        table_comment_report: Result of applying table-level comments when
             a table-level `comment` is provided in the config.
-        column_comments_report: Result of applying Delta column comments when structured
+        column_comments_report: Result of applying column comments when structured
             schema metadata includes column comments.
     """
 
@@ -99,13 +95,9 @@ class TableProcessSuccess:
     schema_source: SchemaSource
     input_source: InputSource
     bronze_path: str | None
-    silver_path: str | None
     rows_in: int | None
     rows_written: int | None
-    rows_rejected: int | None
     extra_columns_dropped: list[str]
-    partitions_written: list[str] | None
-    quarantine_path: str | None
     elapsed_sec: float | None
     status: ProcessStatus
     table_comment_report: dict[str, Any] | None