Changes from all commits

30 commits
fe944f2
feat: fix serverless compute compatibility
BesikiML Jan 7, 2026
06d7f61
Fixed detection of running on serverless compute
BesikiML Jan 7, 2026
60c46d2
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 7, 2026
2ab2352
fix: remove pylint disable and use specific exceptions
BesikiML Jan 7, 2026
a049243
refactor: use getAll() instead of conf.get() for serverless detection
BesikiML Jan 7, 2026
78f2253
fixed dict issue
BesikiML Jan 7, 2026
0d416c1
fixed getAll call
BesikiML Jan 7, 2026
670940b
Added AnalysisException in the except block
BesikiML Jan 7, 2026
8fc57b5
Optimised _is_serverless function
BesikiML Jan 7, 2026
d55fc20
Replace clusterType check with clusterNodeType which reliably
BesikiML Jan 9, 2026
bb37933
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 9, 2026
7c5b49c
Cleaned the code
BesikiML Jan 12, 2026
b82cb7d
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 12, 2026
909a6d8
Merge branch '1438-feature-remorph-reconcile-fails-to-run-on-serverle…
BesikiML Jan 12, 2026
8525599
Changed the logger info to debug
BesikiML Jan 12, 2026
50ff3f3
is_serverless changed to a @cached_property
BesikiML Jan 13, 2026
e8f93bb
Added classify_spark_runtime method which classifies spark, removed is_…
BesikiML Jan 16, 2026
8b0fac0
Formatted the code
BesikiML Jan 16, 2026
ba2f944
Fixed df cache for serverless
BesikiML Jan 16, 2026
c831b3e
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 16, 2026
977b9a8
Added SparkRuntimeType
BesikiML Jan 20, 2026
b5531a4
Merge branch '1438-feature-remorph-reconcile-fails-to-run-on-serverle…
BesikiML Jan 20, 2026
e6d4d6a
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 20, 2026
7e3488a
Added SparkSession for spark param
BesikiML Jan 21, 2026
5960bdc
Merge branch '1438-feature-remorph-reconcile-fails-to-run-on-serverle…
BesikiML Jan 21, 2026
48af7d8
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 21, 2026
de49b61
Changed classify_spark_runtime to private _classify_spark_runtime and…
BesikiML Jan 22, 2026
c2f025e
Added # pylint: disable=import-private-name
BesikiML Jan 22, 2026
d322579
removed the test case
BesikiML Jan 22, 2026
df8ce7b
Merge branch 'main' into 1438-feature-remorph-reconcile-fails-to-run-…
BesikiML Jan 22, 2026
4 changes: 3 additions & 1 deletion src/databricks/labs/lakebridge/reconcile/compare.py
@@ -7,6 +7,7 @@
 from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
 from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
+    cache_df_or_not,
 )
 from databricks.labs.lakebridge.reconcile.recon_output_config import (
     DataReconcileOutput,
@@ -452,7 +453,8 @@ def join_aggregate_data(
     joined_df = df.select(*normalized_joined_cols)

     # Write the joined df to volume path
-    joined_volume_df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(joined_df).cache()
+    joined_volume_df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(joined_df)
+    joined_volume_df = cache_df_or_not(spark, joined_volume_df)
     logger.warning(f"Unmatched data is written to {path} successfully")

     return joined_volume_df
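The swap above from an unconditional `.cache()` to `cache_df_or_not` is the heart of the fix: the DataFrame caching API is not available on Databricks serverless compute, which is why reconcile failed there (issue 1438 in the branch names). Detection hinges on `SparkSession.sparkContext` existing only on classic, JVM-attached sessions; Spark Connect and serverless sessions raise `PySparkAttributeError` when it is accessed. A minimal standalone probe of that behaviour, mirroring the helper this PR adds to `recon_capture.py` below (illustrative sketch only; `has_jvm_context` is a hypothetical name, not part of the PR):

from pyspark.errors import PySparkAttributeError
from pyspark.sql import SparkSession


def has_jvm_context(spark: SparkSession) -> bool:
    # Classic clusters expose a JVM-backed SparkContext; Spark Connect and
    # Databricks serverless sessions raise PySparkAttributeError instead.
    try:
        _ = spark.sparkContext
        return True
    except PySparkAttributeError:
        return False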
32 changes: 30 additions & 2 deletions src/databricks/labs/lakebridge/reconcile/recon_capture.py
@@ -1,11 +1,12 @@
 import logging
 from datetime import datetime
-from functools import reduce
+from functools import reduce, lru_cache
+from typing import Literal

 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, collect_list, create_map, lit
 from pyspark.sql.types import StringType, StructField, StructType
-from pyspark.errors import PySparkException
+from pyspark.errors import PySparkException, PySparkAttributeError
 from sqlglot import Dialect

 from databricks.labs.lakebridge.config import DatabaseConfig, Table, ReconcileMetadataConfig
@@ -79,6 +80,33 @@ def write_and_read_unmatched_df_with_volumes(
             raise ReadAndWriteWithVolumeException(message) from e


+SparkRuntimeType = Literal["DATABRICKS_SERVERLESS", "CLASSIC", "SPARK_CONNECT", "NO_JVM_UNKNOWN"]
+
+
+def _classify_spark_runtime(spark: SparkSession) -> SparkRuntimeType:
+    try:
+        _ = spark.sparkContext
+        return "CLASSIC"
+    except PySparkAttributeError as e:
+        msg = str(e).lower()
+
+        if "serverless" in msg:
+            return "DATABRICKS_SERVERLESS"
+
+        if "spark connect" in msg:
+            return "SPARK_CONNECT"
+
+        return "NO_JVM_UNKNOWN"
+
+
+@lru_cache(maxsize=1)
+def cache_df_or_not(spark: SparkSession, df: DataFrame) -> DataFrame:
+    cluster_type = _classify_spark_runtime(spark)
+    if cluster_type != "DATABRICKS_SERVERLESS":
+        df = df.cache()
+    return df


 def _write_df_to_delta(df: DataFrame, table_name: str, mode="append"):
     try:
         df.write.mode(mode).saveAsTable(table_name)
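Taken together, `_classify_spark_runtime` probes the session once per call and `cache_df_or_not` caches only where the API exists, returning the DataFrame untouched on serverless. Note that `lru_cache(maxsize=1)` keys on the `(spark, df)` argument pair, so each new DataFrame is a cache miss; the memoization only helps when the same frame is passed twice. A hedged usage sketch (assumes a live session and that the helper above is importable; the `spark.range` data is illustrative):

from pyspark.sql import SparkSession

from databricks.labs.lakebridge.reconcile.recon_capture import cache_df_or_not

spark = SparkSession.builder.getOrCreate()
df = spark.range(100)

# Classic compute: returns df.cache(); serverless: returns df unchanged,
# since the caching API would fail there.
df = cache_df_or_not(spark, df)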
@@ -28,6 +28,7 @@
 from databricks.labs.lakebridge.reconcile.query_builder.threshold_query import (
     ThresholdQueryBuilder,
 )
+from databricks.labs.lakebridge.reconcile.recon_capture import cache_df_or_not
 from databricks.labs.lakebridge.reconcile.recon_config import (
     Schema,
     Table,
@@ -303,6 +304,7 @@ def _get_sample_data(
             or reconcile_output.missing_in_src_count > 0
             or reconcile_output.missing_in_tgt_count > 0
         ):
+
             src_sampler = SamplingQueryBuilder(table_conf, src_schema, "source", self._source_engine, self._source)
             tgt_sampler = SamplingQueryBuilder(table_conf, tgt_schema, "target", self._target_engine, self._target)
             if reconcile_output.mismatch_count > 0:
@@ -370,7 +372,8 @@ def _get_mismatch_data(

         # Uses pre-calculated `mismatch_count` from `reconcile_output.mismatch_count` to avoid recomputing `mismatch` for RandomSampler.
         mismatch_sampler = SamplerFactory.get_sampler(sampling_options)
-        df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target).cache()
+        df = mismatch_sampler.sample(mismatch, mismatch_count, key_columns, sampling_model_target)
+        df = cache_df_or_not(self._spark, df)

         src_mismatch_sample_query = src_sampler.build_query(df)
         tgt_mismatch_sample_query = tgt_sampler.build_query(df)