Simplify change feed streaming test to use memory sink

xinlian12 · Copilot · xinlian12 · commit 880994db8237 · 2026-04-16T20:51:25.000-07:00
Replace the cosmos.oltp sink with an in-memory sink to eliminate the need
for a separate sink container. This avoids 404 errors from sink container
creation/resolution and drops the explicit checkpointLocation option, which
removes the checkpoint path concerns.

The test still exercises the full ChangeFeedInitialOffsetWriter and
HDFSMetadataLog code paths (readStream with cosmos.oltp.changeFeed),
which is the goal for validating the MetadataVersionUtil fix.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenario.scala b/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenario.scala
@@ -115,10 +115,6 @@ df.filter(col("isAlive") === true)
 // This exercises the ChangeFeedInitialOffsetWriter and HDFSMetadataLog code paths
 // that can break on certain Spark distributions (e.g. Databricks Runtime 17.3+)
 
-val sinkContainerName = cosmosContainerName + "Sink"
-spark.sql(s"CREATE TABLE IF NOT EXISTS cosmosCatalog.${cosmosDatabaseName}.${sinkContainerName} using cosmos.oltp " +
-  s"TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '400')")
-
 val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
   "spark.cosmos.accountKey" -> cosmosMasterKey,
   "spark.cosmos.database" -> cosmosDatabaseName,
@@ -129,15 +125,6 @@ val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
   "spark.cosmos.enforceNativeTransport" -> "true"
 )
 
-val writeCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
-  "spark.cosmos.accountKey" -> cosmosMasterKey,
-  "spark.cosmos.database" -> cosmosDatabaseName,
-  "spark.cosmos.container" -> sinkContainerName,
-  "spark.cosmos.write.strategy" -> "ItemOverwrite",
-  "spark.cosmos.write.bulk.enabled" -> "true",
-  "spark.cosmos.enforceNativeTransport" -> "true"
-)
-
 val testId = java.util.UUID.randomUUID().toString.replace("-", "")
 
 val changeFeedDF = spark
@@ -148,29 +135,19 @@ val changeFeedDF = spark
 
 val microBatchQuery = changeFeedDF
   .writeStream
-  .format("cosmos.oltp")
+  .format("memory")
   .queryName(testId)
-  .options(writeCfg)
-  .option("checkpointLocation", s"file:/tmp/$testId/")
   .outputMode("append")
   .start()
 
 microBatchQuery.processAllAvailable()
 microBatchQuery.stop()
 
-val sinkCount = spark.read.format("cosmos.oltp").options(Map(
-  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
-  "spark.cosmos.accountKey" -> cosmosMasterKey,
-  "spark.cosmos.database" -> cosmosDatabaseName,
-  "spark.cosmos.container" -> sinkContainerName,
-  "spark.cosmos.enforceNativeTransport" -> "true"
-)).load().count()
-
-println(s"Change Feed micro-batch streaming: $sinkCount records copied to sink container")
-assert(sinkCount >= 2, s"Expected at least 2 records in sink container but found $sinkCount")
+val sinkCount = spark.sql(s"SELECT * FROM $testId").count()
+println(s"Change Feed micro-batch streaming: $sinkCount records read via change feed")
+assert(sinkCount >= 2, s"Expected at least 2 records from change feed but found $sinkCount")
 
 // COMMAND ----------
 
 // cleanup
-spark.sql(s"DROP TABLE cosmosCatalog.${cosmosDatabaseName}.${sinkContainerName};")
 spark.sql(s"DROP TABLE cosmosCatalog.${cosmosDatabaseName}.${cosmosContainerName};")
diff --git a/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenarioAadManagedIdentity.scala b/sdk/cosmos/azure-cosmos-spark_3/test-databricks/notebooks/basicScenarioAadManagedIdentity.scala
@@ -100,10 +100,6 @@ df.filter(col("isAlive") === true)
 // This exercises the ChangeFeedInitialOffsetWriter and HDFSMetadataLog code paths
 // that can break on certain Spark distributions (e.g. Databricks Runtime 17.3+)
 
-val sinkContainerName = cosmosContainerName + "Sink"
-spark.sql(s"CREATE TABLE IF NOT EXISTS cosmosCatalogMI.${cosmosDatabaseName}.${sinkContainerName} using cosmos.oltp " +
-  s"TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '400')")
-
 val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
   "spark.cosmos.auth.type" -> authType,
   "spark.cosmos.account.subscriptionId" -> subscriptionId,
@@ -118,19 +114,6 @@ val changeFeedCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
   "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
 )
 
-val writeCfg = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
-  "spark.cosmos.auth.type" -> authType,
-  "spark.cosmos.account.subscriptionId" -> subscriptionId,
-  "spark.cosmos.account.tenantId" -> tenantId,
-  "spark.cosmos.account.resourceGroupName" -> resourceGroupName,
-  "spark.cosmos.database" -> cosmosDatabaseName,
-  "spark.cosmos.container" -> sinkContainerName,
-  "spark.cosmos.write.strategy" -> "ItemOverwrite",
-  "spark.cosmos.write.bulk.enabled" -> "true",
-  "spark.cosmos.enforceNativeTransport" -> "true",
-  "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
-)
-
 val testId = java.util.UUID.randomUUID().toString.replace("-", "")
 
 val changeFeedDF = spark
@@ -141,33 +124,19 @@ val changeFeedDF = spark
 
 val microBatchQuery = changeFeedDF
   .writeStream
-  .format("cosmos.oltp")
+  .format("memory")
   .queryName(testId)
-  .options(writeCfg)
-  .option("checkpointLocation", s"file:/tmp/$testId/")
   .outputMode("append")
   .start()
 
 microBatchQuery.processAllAvailable()
 microBatchQuery.stop()
 
-val sinkCount = spark.read.format("cosmos.oltp").options(Map(
-  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
-  "spark.cosmos.auth.type" -> authType,
-  "spark.cosmos.account.subscriptionId" -> subscriptionId,
-  "spark.cosmos.account.tenantId" -> tenantId,
-  "spark.cosmos.account.resourceGroupName" -> resourceGroupName,
-  "spark.cosmos.database" -> cosmosDatabaseName,
-  "spark.cosmos.container" -> sinkContainerName,
-  "spark.cosmos.enforceNativeTransport" -> "true",
-  "spark.cosmos.read.consistencyStrategy" -> "LatestCommitted",
-)).load().count()
-
-println(s"Change Feed micro-batch streaming: $sinkCount records copied to sink container")
-assert(sinkCount >= 2, s"Expected at least 2 records in sink container but found $sinkCount")
+val sinkCount = spark.sql(s"SELECT * FROM $testId").count()
+println(s"Change Feed micro-batch streaming: $sinkCount records read via change feed")
+assert(sinkCount >= 2, s"Expected at least 2 records from change feed but found $sinkCount")
 
 // COMMAND ----------
 
 // cleanup
-spark.sql(s"DROP TABLE cosmosCatalogMI.${cosmosDatabaseName}.${sinkContainerName};")
 spark.sql(s"DROP TABLE cosmosCatalogMI.${cosmosDatabaseName}.${cosmosContainerName};")