Add change feed micro-batch streaming scenarios to Databricks live test notebooks

xinlian12 · Copilot · xinlian12 · commit 90ca3f6a9964 · 2026-04-16T15:15:28.000-07:00
Add structured streaming scenarios using cosmos.oltp.changeFeed to both
basicScenario.scala and basicScenarioAadManagedIdentity.scala notebooks.
These scenarios exercise the ChangeFeedInitialOffsetWriter and
HDFSMetadataLog code paths that can break on certain Spark distributions
(e.g. Databricks Runtime 17.3+).

Each scenario:
- Creates a sink container
- Reads change feed from source via readStream with micro-batch
- Writes to sink container via writeStream
- Validates records were copied
- Cleans up both containers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenario.scala b/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenario.scala
@@ -111,5 +111,69 @@ df.filter(col("isAlive") === true)
 
 // COMMAND ----------
 
+// Change Feed - micro-batch structured streaming
+// This exercises the ChangeFeedInitialOffsetWriter and HDFSMetadataLog code paths
+// that can break on certain Spark distributions (e.g. Databricks Runtime 17.3+)
+
+import org.apache.spark.sql.streaming.Trigger
+
+val sinkContainerName = cosmosContainerName + "Sink" // dropped again by the cleanup cell
+spark.sql(s"CREATE TABLE IF NOT EXISTS cosmosCatalog.${cosmosDatabaseName}.${sinkContainerName} using cosmos.oltp " +
+  s"TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '400')")
+
+val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+  "spark.cosmos.accountKey" -> cosmosMasterKey,
+  "spark.cosmos.database" -> cosmosDatabaseName,
+  "spark.cosmos.container" -> cosmosContainerName,
+  "spark.cosmos.read.inferSchema.enabled" -> "false",
+  "spark.cosmos.changeFeed.startFrom" -> "Beginning",
+  "spark.cosmos.changeFeed.mode" -> "Incremental",
+  "spark.cosmos.enforceNativeTransport" -> "true"
+)
+
+val writeCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+  "spark.cosmos.accountKey" -> cosmosMasterKey,
+  "spark.cosmos.database" -> cosmosDatabaseName,
+  "spark.cosmos.container" -> sinkContainerName,
+  "spark.cosmos.write.strategy" -> "ItemOverwrite",
+  "spark.cosmos.write.bulk.enabled" -> "true",
+  "spark.cosmos.enforceNativeTransport" -> "true"
+)
+
+val testId = java.util.UUID.randomUUID().toString.replace("-", "") // unique per run: query name + checkpoint dir
+
+val changeFeedDF = spark
+  .readStream
+  .format("cosmos.oltp.changeFeed")
+  .options(changeFeedCfg)
+  .load()
+
+val microBatchQuery = changeFeedDF
+  .writeStream
+  .format("cosmos.oltp")
+  .queryName(testId)
+  .options(writeCfg)
+  .option("checkpointLocation", s"/tmp/$testId/")
+  .outputMode("append")
+  .start()
+
+// Stop the query in finally so a failed batch or sink read cannot leave the stream running.
+val sinkCount = try {
+  microBatchQuery.processAllAvailable() // blocks until the currently available change feed data is processed
+  spark.read.format("cosmos.oltp").options(Map(
+    "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+    "spark.cosmos.accountKey" -> cosmosMasterKey,
+    "spark.cosmos.database" -> cosmosDatabaseName,
+    "spark.cosmos.container" -> sinkContainerName,
+    "spark.cosmos.enforceNativeTransport" -> "true"
+  )).load().count()
+} finally microBatchQuery.stop()
+
+println(s"Change Feed micro-batch streaming: $sinkCount records copied to sink container")
+assert(sinkCount >= 2, s"Expected at least 2 records in sink container but found $sinkCount")
+
+// COMMAND ----------
+
 // cleanup
+spark.sql(s"DROP TABLE cosmosCatalog.${cosmosDatabaseName}.${sinkContainerName};") // sink container created by the change feed scenario
 spark.sql(s"DROP TABLE cosmosCatalog.${cosmosDatabaseName}.${cosmosContainerName};")
diff --git a/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenarioAadManagedIdentity.scala b/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenarioAadManagedIdentity.scala
@@ -96,5 +96,81 @@ df.filter(col("isAlive") === true)
 
 // COMMAND ----------
 
+// Change Feed - micro-batch structured streaming
+// This exercises the ChangeFeedInitialOffsetWriter and HDFSMetadataLog code paths
+// that can break on certain Spark distributions (e.g. Databricks Runtime 17.3+)
+
+import org.apache.spark.sql.streaming.Trigger
+
+val sinkContainerName = cosmosContainerName + "Sink" // dropped again by the cleanup cell
+spark.sql(s"CREATE TABLE IF NOT EXISTS cosmosCatalogMI.${cosmosDatabaseName}.${sinkContainerName} using cosmos.oltp " +
+  s"TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '400')")
+
+val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+  "spark.cosmos.auth.type" -> authType,
+  "spark.cosmos.account.subscriptionId" -> subscriptionId,
+  "spark.cosmos.account.tenantId" -> tenantId,
+  "spark.cosmos.account.resourceGroupName" -> resourceGroupName,
+  "spark.cosmos.database" -> cosmosDatabaseName,
+  "spark.cosmos.container" -> cosmosContainerName,
+  "spark.cosmos.read.inferSchema.enabled" -> "false",
+  "spark.cosmos.changeFeed.startFrom" -> "Beginning",
+  "spark.cosmos.changeFeed.mode" -> "Incremental",
+  "spark.cosmos.enforceNativeTransport" -> "true",
+  "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
+)
+
+val writeCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+  "spark.cosmos.auth.type" -> authType,
+  "spark.cosmos.account.subscriptionId" -> subscriptionId,
+  "spark.cosmos.account.tenantId" -> tenantId,
+  "spark.cosmos.account.resourceGroupName" -> resourceGroupName,
+  "spark.cosmos.database" -> cosmosDatabaseName,
+  "spark.cosmos.container" -> sinkContainerName,
+  "spark.cosmos.write.strategy" -> "ItemOverwrite",
+  "spark.cosmos.write.bulk.enabled" -> "true",
+  "spark.cosmos.enforceNativeTransport" -> "true",
+  "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
+)
+
+val testId = java.util.UUID.randomUUID().toString.replace("-", "") // unique per run: query name + checkpoint dir
+
+val changeFeedDF = spark
+  .readStream
+  .format("cosmos.oltp.changeFeed")
+  .options(changeFeedCfg)
+  .load()
+
+val microBatchQuery = changeFeedDF
+  .writeStream
+  .format("cosmos.oltp")
+  .queryName(testId)
+  .options(writeCfg)
+  .option("checkpointLocation", s"/tmp/$testId/")
+  .outputMode("append")
+  .start()
+
+// Stop the query in finally so a failed batch or sink read cannot leave the stream running.
+val sinkCount = try {
+  microBatchQuery.processAllAvailable() // blocks until the currently available change feed data is processed
+  spark.read.format("cosmos.oltp").options(Map(
+    "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
+    "spark.cosmos.auth.type" -> authType,
+    "spark.cosmos.account.subscriptionId" -> subscriptionId,
+    "spark.cosmos.account.tenantId" -> tenantId,
+    "spark.cosmos.account.resourceGroupName" -> resourceGroupName,
+    "spark.cosmos.database" -> cosmosDatabaseName,
+    "spark.cosmos.container" -> sinkContainerName,
+    "spark.cosmos.enforceNativeTransport" -> "true",
+    "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
+  )).load().count()
+} finally microBatchQuery.stop()
+
+println(s"Change Feed micro-batch streaming: $sinkCount records copied to sink container")
+assert(sinkCount >= 2, s"Expected at least 2 records in sink container but found $sinkCount")
+
+// COMMAND ----------
+
 // cleanup
+spark.sql(s"DROP TABLE cosmosCatalogMI.${cosmosDatabaseName}.${sinkContainerName};") // sink container created by the change feed scenario
 spark.sql(s"DROP TABLE cosmosCatalogMI.${cosmosDatabaseName}.${cosmosContainerName};")