Skip to content

Commit 6f9d837

Browse files
authored
Merge pull request #73 from kbase/develop
Merge develop into main for v0.1.0 release
2 parents f088ba5 + b0a7e0a commit 6f9d837

13 files changed

Lines changed: 983 additions & 1526 deletions

File tree

src/data_lakehouse_ingest/core.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from .orchestrator.table_batch_processor import process_tables
2222

2323
from berdl_notebook_utils.setup_spark_session import get_spark_session
24-
from berdl_notebook_utils.clients import get_minio_client
24+
from berdl_notebook_utils.clients import get_s3_client
2525

2626

2727
def ingest(
@@ -55,7 +55,7 @@ def ingest(
5555
Notes:
5656
- SparkSession may be provided by the caller or auto-initialized.
5757
- A valid MinIO client is REQUIRED for ingestion.
58-
If `minio_client` is not provided, `get_minio_client()` is attempted.
58+
If `minio_client` is not provided, `get_s3_client()` is attempted.
5959
If MinIO cannot be initialized, the pipeline fails immediately.
6060
- Supports multiple file formats (CSV, TSV, JSON, XML, Parquet).
6161
- Schema enforcement supports SQL-style schemas (`schema_sql`) and structured schemas (`schema` list-of-maps).
@@ -101,16 +101,14 @@ def ingest(
101101
# MinIO Client Initialization
102102
# ----------------------------------------------------------------------
103103
if minio_client is None:
104-
logger.info(
105-
"No MinIO client provided — attempting auto-initialization via get_minio_client()"
106-
)
104+
logger.info("No MinIO client provided — attempting auto-initialization via get_s3_client()")
107105
try:
108-
minio_client = get_minio_client()
109-
logger.info("MinIO client successfully initialized via get_minio_client()")
106+
minio_client = get_s3_client()
107+
logger.info("MinIO client successfully initialized via get_s3_client()")
110108
except Exception as e:
111109
error_msg = (
112110
"MinIO client is required for ingestion but could not be initialized. "
113-
"Call get_minio_client() and pass it explicitly into ingest(...)."
111+
"Call get_s3_client() and pass it explicitly into ingest(...)."
114112
)
115113
return log_error(
116114
logger=logger,
@@ -120,7 +118,7 @@ def ingest(
120118
exc=e,
121119
)
122120

123-
# Defensive check in case get_minio_client() returned None without raising
121+
# Defensive check in case get_s3_client() returned None without raising
124122
if minio_client is None:
125123
error_msg = "MinIO client is required for ingestion but was not provided or initialized."
126124
return log_error(

src/data_lakehouse_ingest/orchestrator/init_utils.py

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Initialization utilities for the Data Lakehouse Ingest framework.
33
Handles logger setup and Spark session context initialization,
4-
including tenant creation, catalog switching, and configuration extraction.
4+
including tenant creation, namespace management, and configuration extraction.
55
"""
66

77
import logging
@@ -62,9 +62,11 @@ def init_run_context(
6262
"""
6363
Initialize the ingestion run context based on config.
6464
65-
Uses the JupyterHub helper `create_namespace_if_not_exists` instead of
66-
SQL CREATE DATABASE statements. The behavior depends on the 'is_tenant'
67-
flag in the config.
65+
Uses the Iceberg catalog flow via `create_namespace_if_not_exists(iceberg=True)`
66+
to create namespaces with catalog-level isolation (no governance prefixes).
67+
68+
The catalog is determined by the tenant name: tenant-based configs use the
69+
tenant name as the catalog, while personal configs use the ``"my"`` catalog.
6870
6971
Args:
7072
spark (SparkSession): Active Spark session.
@@ -88,32 +90,24 @@ def init_run_context(
8890
logger.info(f"Found {len(tables)} table(s) to process")
8991

9092
# ----------------------------------------------------------------------
91-
# Create namespace using JupyterHub helper
93+
# Create namespace using Iceberg catalog flow
9294
# ----------------------------------------------------------------------
9395
try:
9496
if tenant:
95-
# Multi-tenant governed environment
97+
# Multi-tenant: tenant name is used as the Iceberg catalog name
9698
namespace = create_namespace_if_not_exists(
9799
spark,
98100
namespace=dataset,
99101
tenant_name=tenant,
102+
iceberg=True,
100103
)
101104
logger.info(f"Tenant namespace created/accessed: {namespace}")
102105
else:
103-
# Personal (user-level) environment
104-
namespace = create_namespace_if_not_exists(spark, dataset)
106+
# Personal: uses the "my" catalog
107+
namespace = create_namespace_if_not_exists(spark, dataset, iceberg=True)
105108
logger.info(f"Personal namespace created/accessed: {namespace}")
106109

107-
spark.catalog.setCurrentDatabase(namespace)
108-
109-
# Extract physical namespace path
110-
try:
111-
ns_info = spark.sql(f"DESCRIBE NAMESPACE EXTENDED {namespace}").collect()
112-
base_path = [r.info_value for r in ns_info if r.info_name.lower() == "location"][0]
113-
logger.info(f"Namespace storage location: {base_path}")
114-
except Exception as e:
115-
logger.warning(f"Unable to determine namespace storage location for '{namespace}': {e}")
116-
base_path = None
110+
spark.sql(f"USE {namespace}")
117111

118112
except Exception as e:
119113
logger.error(
@@ -127,6 +121,5 @@ def init_run_context(
127121
"tenant": tenant,
128122
"dataset": dataset,
129123
"namespace": namespace,
130-
"namespace_base_path": base_path,
131124
"tables": tables,
132125
}
Lines changed: 77 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
22
Input/output utilities for the Data Lakehouse Ingest framework.
33
Handles file format detection, data loading from Bronze sources,
4-
and writing curated data to Silver Delta tables.
4+
and writing curated data to Silver tables via Iceberg catalogs.
55
66
Provides a unified interface for reading CSV, TSV, JSON, and XML formats,
7-
and ensures consistent creation and registration of Delta tables in Spark.
7+
and writes tables using catalog-driven APIs (no explicit path management).
88
"""
99

1010
import logging
@@ -23,19 +23,18 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
2323
Determines the file format based on either an explicit configuration
2424
value (`explicit_fmt`) or by inspecting the file extension.
2525
26-
Supported extensions: `.csv`, `.tsv`, `.json`, `.xml`.
26+
Supported extensions: `.csv`, `.tsv`, `.json`, `.xml`, `.parquet`.
2727
2828
Args:
2929
bronze_path (str): Full S3/local path to the input data file.
30-
explicit_fmt (str | None): Optional explicit format (csv, tsv, json, xml).
30+
explicit_fmt (str | None): Optional explicit format (csv, tsv, json, xml, parquet).
3131
3232
Returns:
33-
str: The detected format name ("csv", "tsv", "json", or "xml").
33+
str: The detected format name.
3434
3535
Notes:
3636
- Explicit format overrides file extension detection.
3737
- Defaults to "csv" when no recognizable extension is found.
38-
- Ensures consistent downstream loader selection in ingestion pipelines.
3938
"""
4039

4140
# TODO: Explore using python-magic or content-based format detection.
@@ -51,7 +50,6 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
5150
if explicit_fmt:
5251
return explicit_fmt.lower()
5352

54-
# Map file extensions to formats
5553
extension_map = {
5654
"csv": "csv",
5755
"tsv": "tsv",
@@ -61,7 +59,7 @@ def detect_format(bronze_path: str, explicit_fmt: str | None) -> str:
6159
}
6260

6361
ext = bronze_path.split(".")[-1].lower()
64-
return extension_map.get(ext, "csv") # default fallback
62+
return extension_map.get(ext, "csv")
6563

6664

6765
def load_table_data(
@@ -70,9 +68,12 @@ def load_table_data(
7068
fmt: str,
7169
opts: dict,
7270
logger: logging.Logger,
73-
) -> tuple[object, int]:
71+
) -> tuple[DataFrame, int]:
7472
"""
75-
Loads a DataFrame and returns (df, rows_in).
73+
Load source data into a DataFrame and return the DataFrame with its input row count.
74+
75+
Returns:
76+
tuple[DataFrame, int]: The loaded DataFrame and number of rows read from the source.
7677
"""
7778
fmt_to_loader = {
7879
"json": load_json_data,
@@ -91,49 +92,88 @@ def load_table_data(
9192
return df, rows_in
9293

9394

94-
def write_to_delta(
95+
def table_exists(spark: SparkSession, full_table: str) -> bool:
96+
"""
97+
Check whether a catalog table exists.
98+
99+
Uses Spark table access so it works with fully qualified catalog table names,
100+
including Iceberg tables.
101+
"""
102+
try:
103+
spark.table(full_table).limit(1).count()
104+
return True
105+
except Exception:
106+
return False
107+
108+
109+
def write_table(
95110
df: DataFrame,
96111
spark: SparkSession,
97112
namespace: str,
98-
namespace_base_path: str,
99113
name: str,
100-
silver_path: str,
101114
partition_by: str | list[str] | None,
102115
mode: str,
116+
rows_in: int,
103117
logger: logging.Logger,
104118
) -> int:
105-
# TODO: Explore replacing explicit `table_path` writes with a catalog-driven approach.
106-
#
107-
# Goal:
108-
# Eliminate the need to manually construct and manage table paths (namespace_base_path/name)
109-
# by allowing Spark to handle initial table creation and location assignment.
119+
"""
120+
Write a DataFrame to a table using catalog-driven Iceberg APIs.
110121
111-
# Construct deterministic table path inside namespace storage location
112-
table_path = f"{namespace_base_path}/{name}"
122+
The Iceberg catalog manages table storage locations, so this function does
123+
not construct explicit paths or use LOCATION clauses. For overwrite mode,
124+
the table is created or replaced. For append mode, the table must already
125+
exist.
113126
114-
logger.info(f"Resolved Delta target path: {table_path}")
127+
Args:
128+
df: DataFrame to write.
129+
spark: Active SparkSession.
130+
namespace: Fully qualified namespace (e.g., ``my.dataset`` or ``kbase.dataset``).
131+
name: Table name.
132+
partition_by: Optional partition column(s).
133+
mode: Write mode. Defaults to ``"overwrite"`` when omitted.
134+
Supported values are ``"overwrite"`` and ``"append"``.
135+
rows_in: Number of rows read from the source DataFrame. This value is
136+
returned and logged as rows written, rather than counting the
137+
full target table after write.
138+
logger: Logger for structured output.
115139
116-
rows_written = df.count()
140+
Returns:
141+
Number of rows written.
142+
"""
117143

118-
# Write (with overwriteSchema only for overwrite mode)
119-
writer = df.write.format("delta").mode(mode)
144+
full_table = f"{namespace}.{name}"
145+
# Default mode
146+
mode = (mode or "overwrite").lower()
120147

121-
if mode == "overwrite":
122-
writer = writer.option("overwriteSchema", "true")
148+
if mode not in {"overwrite", "append"}:
149+
raise ValueError(
150+
f"Unsupported write mode '{mode}' for {full_table}. "
151+
"Supported modes are 'overwrite' and 'append'."
152+
)
123153

124-
if partition_by:
125-
writer = writer.partitionBy(partition_by)
154+
exists = table_exists(spark, full_table)
155+
156+
logger.info(f"Writing table: {full_table} (mode={mode}, exists={exists})")
126157

127-
writer.save(table_path)
158+
if mode == "append" and not exists:
159+
raise ValueError(
160+
f"Cannot append to {full_table} because the table does not exist. "
161+
"Use mode='overwrite' or omit mode to create the table."
162+
)
163+
164+
rows_written = rows_in
165+
166+
writer = df.writeTo(full_table)
167+
168+
if partition_by:
169+
cols = [partition_by] if isinstance(partition_by, str) else list(partition_by)
170+
writer = writer.partitionedBy(*cols)
128171

129-
# Register table if missing (no schema overwrite here!)
130-
spark.sql(f"""
131-
CREATE TABLE IF NOT EXISTS `{namespace}`.`{name}`
132-
USING DELTA
133-
LOCATION '{table_path}'
134-
""")
172+
if mode == "append":
173+
writer.append()
174+
else:
175+
writer.createOrReplace()
135176

136-
# log rows
137-
logger.info(f"Wrote {rows_written} rows → {namespace}.{name} @ {table_path}")
177+
logger.info(f"Wrote {rows_written} rows → {full_table}")
138178

139179
return rows_written

src/data_lakehouse_ingest/orchestrator/models.py

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
Two result types are defined:
1010
1111
- TableProcessSuccess: Returned when a table is successfully processed and
12-
written to the Silver Delta layer. Includes metrics such as rows read,
12+
written to the target Silver table. Includes metrics such as rows read,
1313
rows written, elapsed time, and optional comment application results.
1414
1515
- TableProcessFailure: Returned when a table fails during processing. Contains
@@ -51,7 +51,7 @@ class InputSource(Enum):
5151

5252
class WriteMode(Enum):
5353
"""
54-
Supported write modes when writing Delta tables.
54+
Supported write modes when writing target tables.
5555
"""
5656

5757
OVERWRITE = "overwrite"
@@ -64,30 +64,26 @@ class TableProcessSuccess:
6464
Represents a successful table ingestion result.
6565
6666
This object captures metadata and metrics produced when a table is
67-
successfully processed and written to the Silver Delta layer.
67+
successfully processed and written to the target Silver table.
6868
6969
Attributes:
7070
name: Table name.
7171
tenant: Tenant identifier associated with the ingestion run.
7272
target_table: Fully qualified target table name in the Silver layer.
73-
mode: Write mode used when writing to Delta. Represented by the WriteMode enum.
73+
mode: Write mode used when writing the target table. Represented by the WriteMode enum.
7474
format: Detected input file format when reading from Bronze storage.
7575
schema_source: Origin of the resolved schema. Represented by the SchemaSource enum.
7676
input_source: Indicates whether input was read from Bronze storage or provided
7777
as a Spark DataFrame override. Represented by the InputSource enum.
7878
bronze_path: Source path in Bronze storage if applicable.
79-
silver_path: Target storage path where the Delta table is written.
8079
rows_in: Number of input rows read.
81-
rows_written: Number of rows written to the Silver Delta table.
82-
rows_rejected: Number of rows rejected during processing.
80+
rows_written: Number of rows written to the target table.
8381
extra_columns_dropped: Columns dropped because they were not present in the schema.
84-
partitions_written: List of partitions written (if partitioning is used).
85-
quarantine_path: Location where rejected records would be stored.
8682
elapsed_sec: Processing time in seconds.
8783
status: Processing status represented by the ProcessStatus enum.
88-
table_comment_report: Result of applying Delta table-level comments when
84+
table_comment_report: Result of applying table-level comments when
8985
a table-level `comment` is provided in the config.
90-
column_comments_report: Result of applying Delta column comments when structured
86+
column_comments_report: Result of applying column comments when structured
9187
schema metadata includes column comments.
9288
"""
9389

@@ -99,13 +95,9 @@ class TableProcessSuccess:
9995
schema_source: SchemaSource
10096
input_source: InputSource
10197
bronze_path: str | None
102-
silver_path: str | None
10398
rows_in: int | None
10499
rows_written: int | None
105-
rows_rejected: int | None
106100
extra_columns_dropped: list[str]
107-
partitions_written: list[str] | None
108-
quarantine_path: str | None
109101
elapsed_sec: float | None
110102
status: ProcessStatus
111103
table_comment_report: dict[str, Any] | None

0 commit comments

Comments
 (0)