Merge branch 'main' into fstore-2036-uc-oauth-m2m-sdk

jimdowling · jimdowling · commit ecb45df84339 · 2026-05-25T23:22:48.000+02:00
diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java
@@ -99,7 +99,7 @@ public StreamFeatureGroup getOrCreateFeatureGroup(FeatureStore featureStore, @No
     try {
       return getStreamFeatureGroup(featureStore, name, version);
     } catch (IOException | FeatureStoreException e) {
-      if (e.getMessage().contains("\"errorCode\":270009")) {
+      if (e.getMessage().contains("Error: 404") && e.getMessage().contains("\"errorCode\":270009")) {
         return StreamFeatureGroup.builder()
             .featureStore(featureStore)
             .name(name)
diff --git a/java/hsfs/src/test/java/com/logicalclocks/hsfs/engine/TestFeatureGroupEngine.java b/java/hsfs/src/test/java/com/logicalclocks/hsfs/engine/TestFeatureGroupEngine.java
@@ -49,10 +49,10 @@ void setUp() throws Exception {
   }
 
   @Test
-  void testGetOrCreateReturnsFgWhenBackendReturns400NotFound() throws Exception {
-    // Backend returns 400 with errorCode 270009 (feature group not found)
+  void testGetOrCreateReturnsFgWhenBackendReturns404NotFound() throws Exception {
+    // Backend returns 404 with errorCode 270009 (feature group not found)
     Mockito.when(mockApi.getInternal(Mockito.any(), Mockito.any(), Mockito.any(), Mockito.any()))
-        .thenThrow(new IOException("Error: 400{\"errorCode\":270009,\"errorMsg\":\"Featuregroup wasn't found.\"}"));
+        .thenThrow(new IOException("Error: 404{\"errorCode\":270009,\"errorMsg\":\"Featuregroup wasn't found.\"}"));
 
     StreamFeatureGroup result = engine.getOrCreateFeatureGroup(
         mockFeatureStore, "test_fg", 1, "desc", true,
diff --git a/python/hsfs/core/storage_connector_api.py b/python/hsfs/core/storage_connector_api.py
@@ -15,6 +15,7 @@
 #
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING, Any
 
 from hopsworks_common import client
@@ -81,6 +82,79 @@ def refetch(
             )
         )
 
+    def resolve_unity_catalog_spark_options(
+        self,
+        feature_store_id: int,
+        name: str,
+        catalog: str,
+        schema: str,
+        table: str,
+    ) -> dict[str, Any]:
+        """Call the EE Unity Catalog spark-options resolver for a connector + table.
+
+        Response carries short-lived AWS credentials, so the EE side sets
+        Cache-Control: no-store.
+
+        Parameters:
+            feature_store_id: Numeric id of the feature store containing the connector.
+            name: Name of the Unity Catalog storage connector.
+            catalog: UC catalog name.
+            schema: UC schema name within the catalog.
+            table: UC table name within the schema.
+
+        Returns:
+            The raw JSON dict from the resolver.
+            The calling SDK code wraps it into a UnityCatalogSparkOptions dataclass.
+        """
+        _client = client.get_instance()
+        path_params = [
+            "project",
+            _client._project_id,
+            "featurestores",
+            feature_store_id,
+            "storageconnectors",
+            name,
+            "data_source",
+            "spark_options",
+        ]
+        body = json.dumps({"catalog": catalog, "schema": schema, "table": table})
+        return _client._send_request(
+            "POST",
+            path_params,
+            headers={"content-type": "application/json"},
+            data=body,
+        )
+
+    def resolve_feature_group_spark_options(
+        self,
+        feature_store_id: int,
+        feature_group_id: int,
+    ) -> dict[str, Any]:
+        """Sibling of resolve_unity_catalog_spark_options for FG-driven reads.
+
+        The FG already knows its (catalog, schema, table) via its data source
+        metadata, so the SDK doesn't pass them. EE looks them up server-side.
+
+        Parameters:
+            feature_store_id: Numeric id of the feature store containing the feature group.
+            feature_group_id: Numeric id of the Unity Catalog-backed external feature group.
+
+        Returns:
+            The raw JSON dict from the resolver.
+            Same shape as resolve_unity_catalog_spark_options.
+        """
+        _client = client.get_instance()
+        path_params = [
+            "project",
+            _client._project_id,
+            "featurestores",
+            feature_store_id,
+            "featuregroups",
+            feature_group_id,
+            "spark_options",
+        ]
+        return _client._send_request("POST", path_params)
+
     def get_online_connector(
         self, feature_store_id: int
     ) -> storage_connector.OnlineStorageConnector:
diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py
@@ -5199,6 +5199,47 @@ def insert(
             ge_report.to_ge_type() if ge_report is not None else None,
         )
 
+    def _maybe_read_unity_catalog_via_spark(
+        self, *, force_vended: bool = False
+    ) -> Any | None:
+        """Return a Spark DataFrame for UC-backed external FGs, or None.
+
+        Returns None for any FG that isn't a UC external FG so the caller
+        can fall through to the standard Query path.
+        On Databricks-hosted Spark (auto-detected), the read is routed to the
+        native `spark.read.table()`; otherwise the FG-level spark-options
+        resolver is called and the vended Delta path is read.
+        `force_vended=True` skips the Databricks detection.
+        """
+        from hsfs import storage_connector as storage_connector_mod
+        from hsfs.core import storage_connector_api
+
+        ds = getattr(self, "_data_source", None) or getattr(self, "data_source", None)
+        connector = getattr(ds, "_storage_connector", None) if ds is not None else None
+        if (
+            connector is None
+            or getattr(connector, "type", None)
+            != storage_connector_mod.StorageConnector.UNITY_CATALOG
+        ):
+            return None
+
+        spark = engine.get_instance()._spark_session
+
+        if not force_vended and storage_connector_mod._running_in_databricks(spark):
+            qualified = storage_connector_mod._quote_uc_identifier(
+                ds.database, ds.group, ds.table
+            )
+            return spark.read.table(qualified)
+
+        api = storage_connector_api.StorageConnectorApi()
+        payload = api.resolve_feature_group_spark_options(
+            self._feature_store_id, self._id
+        )
+        opts = storage_connector_mod.UnityCatalogSparkOptions.from_response_json(
+            payload
+        )
+        return opts.read(spark)
+
     @public
     def read(
         self,
@@ -5209,6 +5250,8 @@ def read(
         read_options: dict[str, Any] | None = None,
         start_time: str | int | datetime | date | None = None,
         end_time: str | int | datetime | date | None = None,
+        *,
+        force_vended: bool = False,
     ) -> (
         TypeVar("pyspark.sql.DataFrame")
         | TypeVar("pyspark.RDD")
@@ -5283,6 +5326,13 @@ def read(
                 `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`, `%Y-%m-%d %H:%M:%S.%f`,
                 or ISO-8601 UTC `%Y-%m-%dT%H:%M:%S.%fZ` (e.g. `2026-01-01T00:00:00.000000Z`).
                 Scheduler-injected `HOPS_START_TIME` / `HOPS_END_TIME` use the ISO-8601 form.
+            force_vended:
+                For Unity Catalog-backed external feature groups read with Spark: skip the
+                Databricks-runtime auto-detection and always resolve vended S3 credentials
+                via Hopsworks instead of falling through to `spark.read.table()`.
+                Use when the Databricks cluster's identity lacks UC grants that the connector's
+                service principal has, or to force the Hopsworks read path in tests.
+                Ignored for non-UC feature groups, for non-Spark engines or dataframe types, and when `online`, `start_time` or `end_time` is set.
 
         Returns:
             A dataframe in the requested format containing the feature group data.
@@ -5303,6 +5353,22 @@ def read(
                 start_time, end_time
             )
 
+        # Unity Catalog external FG + spark engine: short-circuit through
+        # the FG-level spark-options resolver. The standard Query path
+        # can't read UC tables (no Spark UC adapter in the Hopsworks
+        # cluster's Spark); the resolver vends short-lived S3 credentials
+        # and we read the underlying Delta files directly.
+        if (
+            dataframe_type in ("default", "spark")
+            and engine.get_type().startswith("spark")
+            and not online
+            and start_time is None
+            and end_time is None
+        ):
+            uc_df = self._maybe_read_unity_catalog_via_spark(force_vended=force_vended)
+            if uc_df is not None:
+                return uc_df
+
         if (
             engine.get_type() == "python"
             and not online
diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py