Commit 2e0ef74 (parent: cf5a5c8)

explicitly label the last s3 object for an acs snapshot (#3438)

Signed-off-by: Itai Segall <itai.segall@digitalasset.com>
3 files changed: +27 -23 lines changed
apps/scan/src/main/scala/org/lfdecentralizedtrust/splice/scan/store/bulk/AcsSnapshotBulkStorage.scala

Lines changed: 11 additions & 17 deletions
@@ -87,10 +87,6 @@ class AcsSnapshotBulkStorage(
 
   def dumpAcsSnapshot(migrationId: Long, timestamp: CantonTimestamp): Future[Unit] = {
 
-    // TODO(#3429): currently, if this crashes half-way through, there is no indication in the S3 objects that
-    // the snapshot is incomplete. We probably want to label the last object with `last` or something like that
-    // so that we can detect incomplete snapshots and recreate them.
-
     def mksrc = {
       val idx = new AtomicInteger(0)
       val base = Source
@@ -105,19 +101,17 @@ class AcsSnapshotBulkStorage(
           1,
           OverflowStrategy.backpressure,
         )
-        .mapAsync(1) { zstdObj =>
-          {
-            val objectKey = s"snapshot_$idx.zstd"
-            // TODO(#3429): For now, we accumulate the full object in memory, then write it as a whole.
-            // Consider streaming it to S3 instead. Need to make sure that it then handles crashes correctly,
-            // i.e. that until we tell S3 that we're done writing, if we stop, then S3 throws away the
-            // partially written object.
-            // TODO(#3429): Error handling
-            for {
-              _ <- s3Connection.writeFullObject(objectKey, ByteBuffer.wrap(zstdObj.toArrayUnsafe()))
-            } yield {
-              idx.addAndGet(1)
-            }
+        .mapAsync(1) { case ByteStringWithTermination(zstdObj, isLast) =>
+          val objectKey = if (isLast) s"snapshot_${idx}_last.zstd" else s"snapshot_$idx.zstd"
+          // TODO(#3429): For now, we accumulate the full object in memory, then write it as a whole.
+          // Consider streaming it to S3 instead. Need to make sure that it then handles crashes correctly,
+          // i.e. that until we tell S3 that we're done writing, if we stop, then S3 throws away the
+          // partially written object.
+          // TODO(#3429): Error handling
+          for {
+            _ <- s3Connection.writeFullObject(objectKey, ByteBuffer.wrap(zstdObj.toArrayUnsafe()))
+          } yield {
+            idx.addAndGet(1)
           }
         }
       val withKs = base.viaMat(KillSwitches.single)(Keep.right)
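The `_last` suffix gives downstream readers a way to tell a complete dump from one that crashed part-way through, which is the concern raised in the removed TODO. A minimal sketch, not part of this commit (the helper name is hypothetical), of how a consumer could check completeness given the listed object keys:

  // Sketch only: a complete snapshot dump contains exactly one object whose key
  // carries the `_last` marker; a dump that crashed mid-way never wrote it.
  def snapshotLooksComplete(objectKeys: Seq[String]): Boolean =
    objectKeys.nonEmpty && objectKeys.count(_.endsWith("_last.zstd")) == 1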

apps/scan/src/main/scala/org/lfdecentralizedtrust/splice/scan/store/bulk/ZstdGroupedWeight.scala

Lines changed: 11 additions & 5 deletions
@@ -11,17 +11,23 @@ import org.apache.pekko.util.ByteString
 
 import java.util.concurrent.atomic.AtomicReference
 
+case class ByteStringWithTermination(
+    bytes: ByteString,
+    isLast: Boolean,
+)
+
 /** A Pekko GraphStage that zstd-compresses a stream of bytestrings, and splits the output into zstd objects of size (minWeight + delta).
   * Somewhat similar to Pekko's built-in GroupedWeight, but outputs valid zstd compressed objects.
   */
-case class ZstdGroupedWeight(minSize: Long) extends GraphStage[FlowShape[ByteString, ByteString]] {
+case class ZstdGroupedWeight(minSize: Long)
+    extends GraphStage[FlowShape[ByteString, ByteStringWithTermination]] {
   require(minSize > 0, "minSize must be greater than 0")
 
   val zstdTmpBufferSize = 10 * 1024 * 1024; // TODO(#3429): make configurable?
 
   val in = Inlet[ByteString]("ZstdGroupedWeight.in")
-  val out = Outlet[ByteString]("ZstdGroupedWeight.out")
-  override val shape: FlowShape[ByteString, ByteString] = FlowShape(in, out)
+  val out = Outlet[ByteStringWithTermination]("ZstdGroupedWeight.out")
+  override val shape: FlowShape[ByteString, ByteStringWithTermination] = FlowShape(in, out)
 
   override def initialAttributes: Attributes = Attributes.name("ZstdGroupedWeight")
 
@@ -100,7 +106,7 @@ case class ZstdGroupedWeight(minSize: Long) extends GraphStage[FlowShape[ByteStr
         state.set(state.get().append(compressed))
         if (state.get().left <= 0) {
           state.set(state.get().append(zstd.get().zstdFinish()))
-          push(out, state.get().bytes)
+          push(out, ByteStringWithTermination(state.get().bytes, false))
           reset()
         } else {
           pull(in)
@@ -112,7 +118,7 @@ case class ZstdGroupedWeight(minSize: Long) extends GraphStage[FlowShape[ByteStr
       override def onUpstreamFinish(): Unit = {
         if (state.get().bytes.nonEmpty) {
           state.set(state.get().append(zstd.get().zstdFinish()))
-          push(out, state.get().bytes)
+          push(out, ByteStringWithTermination(state.get().bytes, true))
         }
         completeStage()
       }
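With the changed shape, everything downstream of the stage sees ByteStringWithTermination elements instead of raw ByteStrings. A minimal sketch of driving the stage in isolation (not taken from the repository; assumes a Pekko ActorSystem is in scope to provide the materializer, and the chunk sizes are arbitrary):

  import org.apache.pekko.actor.ActorSystem
  import org.apache.pekko.stream.scaladsl.{Sink, Source}
  import org.apache.pekko.util.ByteString
  import scala.concurrent.Future

  implicit val system: ActorSystem = ActorSystem("zstd-grouped-weight-sketch")

  // Chunks flushed because the size threshold was reached carry isLast = false;
  // the final chunk flushed from onUpstreamFinish carries isLast = true.
  val chunks: Future[Seq[ByteStringWithTermination]] =
    Source(List(ByteString("a" * 1024), ByteString("b" * 1024)))
      .via(ZstdGroupedWeight(minSize = 4096))
      .runWith(Sink.seq)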

apps/scan/src/test/scala/org/lfdecentralizedtrust/splice/scan/store/AcsSnapshotBulkStorageTest.scala

Lines changed: 5 additions & 1 deletion
@@ -63,6 +63,7 @@ class AcsSnapshotBulkStorageTest extends StoreTest with HasExecutionContext with
         s3BucketConnection,
         loggerFactory,
       ).dumpAcsSnapshot(0, timestamp)
+
       s3Objects <- s3BucketConnection.s3Client
         .listObjects(
           ListObjectsRequest.builder().bucket("bucket").build()
@@ -80,13 +81,16 @@ class AcsSnapshotBulkStorageTest extends StoreTest with HasExecutionContext with
         )
         .map(_.createdEventsInPage)
     } yield {
-      val allContractsFromS3 = s3Objects.contents.asScala
+      val objectKeys = s3Objects.contents.asScala.sortBy(_.key())
+      val allContractsFromS3 = objectKeys
         .map(readUncompressAndDecode(s3BucketConnection))
         .flatten
 
       allContractsFromS3.map(
         reconstructFromS3
       ) should contain theSameElementsInOrderAs allContracts.map(_.event)
+      objectKeys.take(objectKeys.size - 1).forall { !_.key().endsWith("_last.zstd") }
+      objectKeys.last.key() should endWith("_last.zstd")
     }
   })
 }
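As shown in the diff, the new forall check produces a Boolean that is not asserted on; only the endWith expectation on the last key can fail the test. A sketch, not part of the commit, of expressing the non-last check as an explicit ScalaTest expectation instead:

  // Sketch only: fail the test if any key before the last one carries the marker.
  objectKeys.init.foreach(obj => obj.key() should not endWith ("_last.zstd"))
  objectKeys.last.key() should endWith("_last.zstd")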
