
Commit cefd79e

Added serverless support to spark fixture (#91)
Extend spark fixture to support Serverless compute.

### Linked issues

Resolves #90

### Tests

- [x] manually tested
- [ ] added unit tests
- [x] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent e8099d8 commit cefd79e

5 files changed: +95 −6 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -375,6 +375,12 @@ See also [`log_account_link`](#log_account_link-fixture), [`make_acc_group`](#ma
 ### `spark` fixture
 Get Databricks Connect Spark session. Requires `databricks-connect` package to be installed.

+To enable serverless set the local environment variable `DATABRICKS_SERVERLESS_COMPUTE_ID` to `"auto"`.
+If this environment variable is set, Databricks Connect ignores the cluster_id.
+If `DATABRICKS_SERVERLESS_COMPUTE_ID` is set to a specific serverless cluster ID, that cluster will be used instead.
+However, this is not recommended, as serverless clusters are ephemeral by design.
+See more details [here](https://docs.databricks.com/en/dev-tools/databricks-connect/cluster-config.html#configure-a-connection-to-serverless-compute).
+
 Usage:
 ```python
 def test_databricks_connect(spark):
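
For illustration (not part of this commit), a minimal sketch of how a consuming test module might opt a single test into serverless via the environment variable documented above; the `serverless` fixture and test name are hypothetical, while `spark` is the fixture this README section describes:

```python
import pytest


@pytest.fixture
def serverless(monkeypatch):
    # Setting the variable before the `spark` fixture resolves routes
    # Databricks Connect to serverless compute for this test only.
    monkeypatch.setenv("DATABRICKS_SERVERLESS_COMPUTE_ID", "auto")


def test_select_one_on_serverless(serverless, spark):
    rows = spark.sql("SELECT 1").collect()
    assert rows[0][0] == 1
```

Listing `serverless` before `spark` in the test signature matters: the variable has to be set before the Spark session is created.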

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,7 @@ classifiers = [
 ]

 dependencies = [
-    "databricks-sdk>=0.30",
+    "databricks-sdk>=0.40,<0.42",
     "databricks-labs-lsql>=0.10",
     "pytest>=8.3",
 ]
@@ -77,6 +77,7 @@ dependencies = [
     "pytest-timeout~=2.3.1",
     "pytest-xdist~=3.5.0",
     "ruff~=0.3.4",
+    "databricks-connect~=15.4.3",
 ]

 # store virtual env as the child of this folder. Helps VSCode (and PyCharm) to run better
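
As a quick, commit-external sanity check, the new pins can be compared against what is actually installed; `importlib.metadata` is standard library, and the expected ranges are copied from the diff above:

```python
from importlib.metadata import version

# Expected per pyproject.toml: databricks-sdk >=0.40,<0.42 and
# databricks-connect ~=15.4.3 (the latter is a dev dependency).
print("databricks-sdk:", version("databricks-sdk"))
print("databricks-connect:", version("databricks-connect"))
```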

src/databricks/labs/pytester/fixtures/connect.py

Lines changed: 26 additions & 3 deletions
@@ -11,23 +11,46 @@ def spark(ws: WorkspaceClient):
     """
     Get Databricks Connect Spark session. Requires `databricks-connect` package to be installed.

+    To enable serverless set the local environment variable `DATABRICKS_SERVERLESS_COMPUTE_ID` to `"auto"`.
+    If this environment variable is set, Databricks Connect ignores the cluster_id.
+    If `DATABRICKS_SERVERLESS_COMPUTE_ID` is set to a specific serverless cluster ID, that cluster will be used instead.
+    However, this is not recommended, as serverless clusters are ephemeral by design.
+    See more details [here](https://docs.databricks.com/en/dev-tools/databricks-connect/cluster-config.html#configure-a-connection-to-serverless-compute).
+
     Usage:
     ```python
     def test_databricks_connect(spark):
         rows = spark.sql("SELECT 1").collect()
         assert rows[0][0] == 1
     ```
     """
-    if not ws.config.cluster_id:
-        skip("No cluster_id found in the environment")
-    ws.clusters.ensure_cluster_is_running(ws.config.cluster_id)
+    cluster_id = ws.config.cluster_id
+    serverless_cluster_id = ws.config.serverless_compute_id
+
+    if not serverless_cluster_id:
+        ensure_cluster_is_running(cluster_id, ws)
+
+    if serverless_cluster_id and serverless_cluster_id != "auto":
+        ensure_cluster_is_running(serverless_cluster_id, ws)
+
     try:
         # pylint: disable-next=import-outside-toplevel
         from databricks.connect import (  # type: ignore[import-untyped]
             DatabricksSession,
         )

+        if serverless_cluster_id:
+            logging.debug(f"Using serverless cluster id '{serverless_cluster_id}'")
+            return DatabricksSession.builder.serverless(True).getOrCreate()
+
+        logging.debug(f"Using cluster id '{cluster_id}'")
         return DatabricksSession.builder.sdkConfig(ws.config).getOrCreate()
     except ImportError:
         skip("Please run `pip install databricks-connect`")
         return None
+
+
+def ensure_cluster_is_running(cluster_id: str, ws: WorkspaceClient) -> None:
+    if not cluster_id:
+        skip("No cluster_id found in the environment")
+    ws.clusters.ensure_cluster_is_running(cluster_id)
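
Outside the fixture, the same selection logic can be sketched as a standalone helper; this assumes `databricks-connect` and `databricks-sdk` are installed, reuses only the calls that appear in the diff (`serverless(True)` and `sdkConfig(...)`), and the helper name is illustrative:

```python
from databricks.connect import DatabricksSession
from databricks.sdk.core import Config


def connect_session(cfg: Config):
    # Serverless takes precedence: when DATABRICKS_SERVERLESS_COMPUTE_ID is set
    # (i.e. serverless_compute_id is populated), Databricks Connect ignores cfg.cluster_id.
    if cfg.serverless_compute_id:
        return DatabricksSession.builder.serverless(True).getOrCreate()
    return DatabricksSession.builder.sdkConfig(cfg).getOrCreate()
```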

src/databricks/labs/pytester/fixtures/ml.py

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ def create() -> Wait[ServingEndpointDetailed]:
         model = make_model()
         endpoint = ws.serving_endpoints.create(
             endpoint_name,
-            EndpointCoreConfigInput(
+            config=EndpointCoreConfigInput(
                 served_models=[
                     ServedModelInput(
                         model_name=model.name,
Lines changed: 60 additions & 1 deletion
@@ -1,3 +1,62 @@
-def test_databricks_connect(spark):
+import os
+from pytest import fixture
+from pyspark.sql.session import SparkSession
+from databricks.connect import DatabricksSession
+from databricks.sdk import WorkspaceClient
+
+
+@fixture
+def serverless_env():
+    os.environ['DATABRICKS_SERVERLESS_COMPUTE_ID'] = "auto"
+    yield
+    os.environ.pop('DATABRICKS_SERVERLESS_COMPUTE_ID')
+
+
+@fixture
+def debug_env_bugfix(monkeypatch, debug_env):
+    # This is a workaround to set shared cluster
+    # TODO: Update secret vault for acceptance testing and remove the bugfix
+    monkeypatch.setitem(debug_env, "DATABRICKS_CLUSTER_ID", "1114-152544-29g1w07e")
+
+
+@fixture
+def spark_serverless_cluster_id(ws):
+    # get new spark session with serverless cluster outside the actual spark fixture under test
+    spark_serverless = DatabricksSession.builder.serverless(True).getOrCreate()
+    # get cluster id from the existing serverless spark session
+    cluster_id = spark_serverless.conf.get("spark.databricks.clusterUsageTags.clusterId")
+    ws.config.serverless_compute_id = cluster_id
+    yield cluster_id
+    spark_serverless.stop()
+
+
+def test_databricks_connect(debug_env_bugfix, ws, spark):
     rows = spark.sql("SELECT 1").collect()
     assert rows[0][0] == 1
+    assert not is_serverless_cluster(spark, ws)
+
+
+def test_databricks_connect_serverless(serverless_env, ws, spark):
+    rows = spark.sql("SELECT 1").collect()
+    assert rows[0][0] == 1
+    assert is_serverless_cluster(spark, ws)
+
+
+def test_databricks_connect_serverless_set_cluster_id(ws, spark_serverless_cluster_id, spark):
+    rows = spark.sql("SELECT 1").collect()
+    assert rows[0][0] == 1
+
+    cluster_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
+    assert spark_serverless_cluster_id == cluster_id
+    assert is_serverless_cluster(spark, ws)
+
+
+def is_serverless_cluster(spark: SparkSession, ws: WorkspaceClient) -> bool:
+    """
+    Check if the current cluster used is serverless.
+    """
+    cluster_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
+    if not cluster_id:
+        raise ValueError("clusterId usage tag does not exist")
+    creator = ws.clusters.get(cluster_id).creator_user_name
+    return not creator  # serverless clusters don't have assigned creator
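
For reference, the creator-based heuristic from `is_serverless_cluster` also works outside pytest; a minimal standalone sketch under the assumption that Databricks Connect is already configured, using only calls that appear in the diff above:

```python
from databricks.connect import DatabricksSession
from databricks.sdk import WorkspaceClient


def current_cluster_is_serverless() -> bool:
    spark = DatabricksSession.builder.getOrCreate()
    ws = WorkspaceClient()
    cluster_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
    if not cluster_id:
        raise ValueError("clusterId usage tag does not exist")
    # Serverless clusters are not created by a user, so creator_user_name is empty.
    return not ws.clusters.get(cluster_id).creator_user_name
```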
