DynamoDB.scala
@@ -9,14 +9,70 @@ import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import software.amazon.awssdk.services.dynamodb.model.{ AttributeValue, TableDescription }
import software.amazon.awssdk.services.dynamodb.model.{
AttributeValue,
DeleteItemRequest,
TableDescription
}

import java.util
import java.util.stream.Collectors

object DynamoDB {

val log = LogManager.getLogger("com.scylladb.migrator.writers.DynamoDB")

def deleteRDD(target: TargetSettings.DynamoDB,
Collaborator Author

Should there be a function for delete operations?

Contributor

This is exactly the big question.

Contributor (@tarzanek), Jul 31, 2025

I was told that the current version removes the rows that should be deleted, but keeps the row key together with the "_dynamo_op_type" column while all other cells are gone. That is what confuses me, and I haven't verified it myself (the person who reported it might not have the full picture).

Collaborator Author (@pizzaeueu), Jul 31, 2025

Is there any stage where this can be tested end-to-end?
Within the integration test in this PR I can observe that the row is not removed without this additional delete operation.

Contributor

It is possible, since the record event type is not used later and the DynamoDBItemWritable doesn't seem to carry an operation type.

I am curious whether the RDD we already have could be reused for both operations, to avoid creating a new client connection.
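
A minimal sketch of that idea (an assumption, not code from this PR): a single client per partition could serve both operations by dispatching on the operation-type marker that DynamoStreamReplication attaches to each item, which is roughly what the new DynamoStreamReplication.run below ends up doing.

rdd.foreachPartition { partition =>
  val client = DynamoUtils.buildDynamoClient(
    target.endpoint,
    target.finalCredentials.map(_.toProvider),
    target.region,
    Seq.empty)
  try {
    partition.foreach { item =>
      // `operationTypeColumn` / `putOperation` are the markers defined in DynamoStreamReplication.
      if (item.get(operationTypeColumn) == putOperation) {
        // client.putItem(...) with the renamed attributes
      } else {
        // client.deleteItem(...) with only the key attributes
      }
    }
  } finally client.close()
}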

targetTableDesc: TableDescription,
rdd: RDD[util.Map[String, AttributeValue]])(implicit spark: SparkSession): Unit = {

val keySchema = targetTableDesc.keySchema()

rdd.foreachPartition { partition =>
if (partition.nonEmpty) {
val dynamoDB = DynamoUtils.buildDynamoClient(
target.endpoint,
target.finalCredentials.map(_.toProvider),
target.region,
Seq.empty
)

try {
partition.foreach { item =>
val keyToDelete =
new util.HashMap[String, AttributeValue]()

keySchema.forEach { keyElement =>
val keyName = keyElement.attributeName()
if (item.containsKey(keyName)) {
keyToDelete.put(keyName, item.get(keyName))
}
}

if (!keyToDelete.isEmpty) {
try {
dynamoDB.deleteItem(
DeleteItemRequest
.builder()
.tableName(target.table)
.key(keyToDelete)
.build()
)
} catch {
case e: Exception =>
log.error(
s"Failed to delete item with key ${keyToDelete} from table ${target.table}",
e)
}
Comment on lines +52 to +66

Copilot AI, Sep 16, 2025

[nitpick] The isEmpty check is redundant since the forEach loop will naturally skip if keySchema is empty, and the delete operation will fail gracefully if keyToDelete is empty.

Suggested change (drop the surrounding !keyToDelete.isEmpty guard and keep only the try/catch):

try {
  dynamoDB.deleteItem(
    DeleteItemRequest
      .builder()
      .tableName(target.table)
      .key(keyToDelete)
      .build()
  )
} catch {
  case e: Exception =>
    log.error(
      s"Failed to delete item with key ${keyToDelete} from table ${target.table}",
      e)
}
}
}
} finally {
dynamoDB.close()
}
}
}
}
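
// Hypothetical usage sketch (an assumption — the call sites are not part of this diff):
// once the replicated changes are split into upserts (RDD[(Text, DynamoDBItemWritable)])
// and deletions (RDD[util.Map[String, AttributeValue]]), the two writers could be invoked
// side by side against the same target table:
//
//   DynamoDB.writeRDD(target, renamesMap, upsertRdd, targetTableDesc)(spark)
//   DynamoDB.deleteRDD(target, targetTableDesc, deleteRdd)(spark)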

def writeRDD(target: TargetSettings.DynamoDB,
renamesMap: Map[String, String],
rdd: RDD[(Text, DynamoDBItemWritable)],
DynamoStreamReplication.scala
@@ -4,24 +4,32 @@ import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter
import com.amazonaws.services.dynamodbv2.model.{ AttributeValue => AttributeValueV1 }
import com.scylladb.migrator.AttributeValueUtils
import com.scylladb.migrator.config.{ AWSCredentials, SourceSettings, TargetSettings }
import org.apache.hadoop.dynamodb.DynamoDBItemWritable
import org.apache.hadoop.io.Text
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kinesis.{
KinesisDynamoDBInputDStream,
KinesisInitialPositions,
SparkAWSCredentials
}
import software.amazon.awssdk.services.dynamodb.model.TableDescription
import com.scylladb.migrator.DynamoUtils
import software.amazon.awssdk.services.dynamodb.model.{
AttributeValue => AttributeValueV2,
DeleteItemRequest,
PutItemRequest,
TableDescription
}

import java.util
import java.util.stream.Collectors
import scala.jdk.CollectionConverters._

object DynamoStreamReplication {
val log = LogManager.getLogger("com.scylladb.migrator.writers.DynamoStreamReplication")

type DynamoItem = util.Map[String, AttributeValueV1]

// We enrich the table items with a column `operationTypeColumn` describing the type of change
// applied to the item.
// We have to deal with multiple representations of the data because `spark-kinesis-dynamodb`
@@ -30,6 +38,78 @@ object DynamoStreamReplication {
private val putOperation = new AttributeValueV1().withBOOL(true)
private val deleteOperation = new AttributeValueV1().withBOOL(false)

private[writers] def run(
msgs: RDD[Option[DynamoItem]],
target: TargetSettings.DynamoDB,
renamesMap: Map[String, String],
targetTableDesc: TableDescription)(implicit spark: SparkSession): Unit = {
val rdd = msgs.flatMap(_.toSeq)

val putCount = spark.sparkContext.longAccumulator("putCount")
val deleteCount = spark.sparkContext.longAccumulator("deleteCount")
val keyAttributeNames = targetTableDesc.keySchema.asScala.map(_.attributeName).toSet

rdd.foreachPartition { partition =>
if (partition.nonEmpty) {
val client =
DynamoUtils.buildDynamoClient(
target.endpoint,
target.finalCredentials.map(_.toProvider),
target.region,
Seq.empty
)
try {
partition.foreach { item =>
val isPut = item.get(operationTypeColumn) == putOperation

val itemWithoutOp = item.asScala.collect {
case (k, v) if k != operationTypeColumn => k -> AttributeValueUtils.fromV1(v)
}.asJava
Comment on lines +65 to +67

Copilot AI, Sep 24, 2025

This creates a new collection for each item processed. Consider pre-filtering the operation type column or using a more efficient approach to avoid repeated collection transformations.

Suggested change (build the filtered map imperatively instead of going through asScala.collect):

val itemWithoutOp = {
  val m = new java.util.HashMap[String, AttributeValueV2]()
  val it = item.entrySet().iterator()
  while (it.hasNext) {
    val entry = it.next()
    if (entry.getKey != operationTypeColumn) {
      m.put(entry.getKey, AttributeValueUtils.fromV1(entry.getValue))
    }
  }
  m
}

if (isPut) {
putCount.add(1)
val finalItem = itemWithoutOp.asScala.map {
case (key, value) => renamesMap.getOrElse(key, key) -> value
}.asJava
try {
client.putItem(
PutItemRequest.builder().tableName(target.table).item(finalItem).build())
} catch {
case e: Exception =>
log.error(s"Failed to put item into ${target.table}", e)
}
} else {
deleteCount.add(1)
val keyToDelete = itemWithoutOp.asScala
.filter { case (key, _) => keyAttributeNames.contains(key) }
.map { case (key, value) => renamesMap.getOrElse(key, key) -> value }
.asJava
try {
client.deleteItem(
DeleteItemRequest.builder().tableName(target.table).key(keyToDelete).build())
} catch {
case e: Exception =>
log.error(s"Failed to delete item from ${target.table}", e)
}
}
}
} finally {
client.close()
}
}
}

if (putCount.value > 0 || deleteCount.value > 0) {
log.info(s"""
|Changes to be applied:
| - ${putCount.value} items to UPSERT
| - ${deleteCount.value} items to DELETE
|""".stripMargin)
} else {
log.info("No changes to apply")
}
}

def createDStream(spark: SparkSession,
streamingContext: StreamingContext,
src: SourceSettings.DynamoDB,
@@ -45,7 +125,7 @@ object DynamoStreamReplication {
messageHandler = {
case recAdapter: RecordAdapter =>
val rec = recAdapter.getInternalObject
val newMap = new util.HashMap[String, AttributeValueV1]()
val newMap: DynamoItem = new util.HashMap[String, AttributeValueV1]()

if (rec.getDynamodb.getNewImage ne null) {
newMap.putAll(rec.getDynamodb.getNewImage)
@@ -76,50 +156,7 @@ }
}
.getOrElse(SparkAWSCredentials.builder.build())
).foreachRDD { msgs =>
val rdd = msgs
.collect { case Some(item) => item: util.Map[String, AttributeValueV1] }
.repartition(Runtime.getRuntime.availableProcessors() * 2)

val changes =
rdd
.groupBy { item =>
item.get(operationTypeColumn) match {
case `putOperation` => "UPSERT"
case `deleteOperation` => "DELETE"
case _ => "UNKNOWN"
}
}
.mapValues(_.size)
.collect()
if (changes.nonEmpty) {
log.info("Changes to be applied:")
for ((operation, count) <- changes) {
log.info(s"${operation}: ${count}")
}
} else {
log.info("No changes to apply")
}

val writableRdd =
rdd.map { item =>
(
new Text,
new DynamoDBItemWritable(
item
.entrySet()
.stream()
.collect(
Collectors.toMap(
(e: util.Map.Entry[String, AttributeValueV1]) => e.getKey,
(e: util.Map.Entry[String, AttributeValueV1]) =>
AttributeValueUtils.fromV1(e.getValue)
)
)
)
)
}

DynamoDB.writeRDD(target, renamesMap, writableRdd, targetTableDesc)(spark)
run(msgs, target, renamesMap, targetTableDesc)(spark)
}

}