Changes from 18 of 23 commits
cbc05e2
config: Add skipParquetFiles to MigratorConfig
Nov 9, 2025
8d56051
alternator: Introduce StringSetAccumulator utility
Nov 9, 2025
8fc62f8
readers: Extract ParquetProcessingStrategy interface
Nov 9, 2025
4eb7038
readers: Implement ParallelParquetStrategy
Nov 9, 2025
ea4005b
scylla: Refactor ScyllaMigrator to support external savepoints
Nov 9, 2025
11f25f0
scylla: Add ScyllaParquetMigrator for file-by-file migration
Nov 9, 2025
f7d5332
readers: Add ParquetSavepointsManager
Nov 9, 2025
c1a25d7
readers: Implement SequentialParquetStrategy
Nov 9, 2025
03ca5de
readers: Add file discovery and credential configuration to Parquet
Nov 9, 2025
1156c79
readers: Wire up strategy selection in Parquet.migrateToScylla
Nov 9, 2025
7421afc
tests: Add unit tests for ParquetSavepoints functionality
Nov 9, 2025
f36b901
tests: Add integration tests for Parquet migration modes
Nov 9, 2025
6438b2f
tests: Add test configurations for Parquet modes
Nov 9, 2025
f4640b5
tests: Refactor ParquetToScyllaBasicMigrationTest
Nov 9, 2025
f09f96c
tests: Add gitignore for spark-master test artifacts
Nov 21, 2025
9712e8c
config: Add parquetProcessingMode to Savepoints configuration
Nov 9, 2025
076ccad
Replace Parquet strategy pattern with unified parallel approach
Nov 21, 2025
1e500e5
Optimize debug logging during parquet partition reading
Nov 25, 2025
2571dae
Apply suggestions from code review (copilot)
pizzaeueu Nov 26, 2025
cbb0cae
Extracts partition-to-file mappings from Spark execution plan
Jan 1, 2026
5da74e1
Merge branch 'parquet-savepoint-rebased' of github.com:pizzaeueu/scyl…
Jan 1, 2026
40fe4a0
Disable tablets for tests
Jan 6, 2026
324c0e7
Add flag to support parquet migration with and without savepoints
Jan 20, 2026
3 changes: 1 addition & 2 deletions migrator/src/main/scala/com/scylladb/migrator/Migrator.scala
@@ -40,8 +40,7 @@ object Migrator {
migratorConfig.getSkipTokenRangesOrEmptySet)
ScyllaMigrator.migrate(migratorConfig, scyllaTarget, sourceDF)
case (parquetSource: SourceSettings.Parquet, scyllaTarget: TargetSettings.Scylla) =>
val sourceDF = readers.Parquet.readDataFrame(spark, parquetSource)
ScyllaMigrator.migrate(migratorConfig, scyllaTarget, sourceDF)
readers.Parquet.migrateToScylla(migratorConfig, parquetSource, scyllaTarget)(spark)
case (dynamoSource: SourceSettings.DynamoDB, alternatorTarget: TargetSettings.DynamoDB) =>
AlternatorMigrator.migrateFromDynamoDB(dynamoSource, alternatorTarget, migratorConfig)
case (
migrator/src/main/scala/com/scylladb/migrator/alternator/StringSetAccumulator.scala
@@ -0,0 +1,28 @@
package com.scylladb.migrator.alternator

import org.apache.spark.util.AccumulatorV2
import java.util.concurrent.atomic.AtomicReference

class StringSetAccumulator(initialValue: Set[String] = Set.empty)
extends AccumulatorV2[String, Set[String]] {

private val ref = new AtomicReference(initialValue)

// Note: isZero reads the current set atomically but is not synchronized with
// concurrent add/merge calls, so it may not reflect the most recent updates.
// The accumulator itself remains thread-safe.
def isZero: Boolean = ref.get.isEmpty
def copy(): StringSetAccumulator = new StringSetAccumulator(ref.get)
def reset(): Unit = ref.set(Set.empty)
def add(v: String): Unit = ref.getAndUpdate(_ + v)

def merge(other: AccumulatorV2[String, Set[String]]): Unit =
ref.getAndUpdate(_ ++ other.value)

def value: Set[String] = ref.get
}

object StringSetAccumulator {
def apply(initialValue: Set[String] = Set.empty): StringSetAccumulator =
new StringSetAccumulator(initialValue)
}
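
For context, typical driver-side usage of this accumulator looks roughly like the following sketch (assuming a SparkSession named `spark` is in scope; the file path is hypothetical). The accumulator name matches the one registered by ParquetSavepointsManager further below.

  // Register on the driver, add entries from tasks or listeners, read the merged set back on the driver.
  val processedFiles = StringSetAccumulator()
  spark.sparkContext.register(processedFiles, "processed-parquet-files")
  processedFiles.add("s3a://bucket/data/part-00000.parquet") // hypothetical path
  val alreadyProcessed: Set[String] = processedFiles.value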
@@ -14,6 +14,7 @@ case class MigratorConfig(source: SourceSettings,
savepoints: Savepoints,
skipTokenRanges: Option[Set[(Token[_], Token[_])]],
skipSegments: Option[Set[Int]],
skipParquetFiles: Option[Set[String]],
Copilot AI (Jan 22, 2026):
Missing documentation for the new skipParquetFiles configuration field. This field should have Scaladoc comments explaining its purpose, format (Set of file URIs), and how it's used in the Parquet migration process. Without documentation, users won't understand what format the file paths should be in or when this field is populated.

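A possible shape for that Scaladoc, sketched from how the field is used elsewhere in this diff (the wording is illustrative, not part of the PR):

  /** Fully-qualified URIs of Parquet files that were already migrated in a previous run.
    * Populated from savepoints via ParquetSavepointsManager and used to skip those files
    * when the migration is resumed; `None` or an empty set means nothing is skipped.
    */
  skipParquetFiles: Option[Set[String]],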
validation: Option[Validation]) {
def render: String = this.asJson.asYaml.spaces2

@@ -25,6 +26,8 @@ case class MigratorConfig(source: SourceSettings,

def getSkipTokenRangesOrEmptySet: Set[(Token[_], Token[_])] = skipTokenRanges.getOrElse(Set.empty)

def getSkipParquetFilesOrEmptySet: Set[String] = skipParquetFiles.getOrElse(Set.empty)

}
object MigratorConfig {
implicit val tokenEncoder: Encoder[Token[_]] = Encoder.instance {
@@ -4,6 +4,7 @@ import io.circe.{ Decoder, Encoder }
import io.circe.generic.semiauto.{ deriveDecoder, deriveEncoder }

Copilot AI (Jan 22, 2026):
Missing documentation for the new enableParquetFileTracking parameter. The Savepoints case class should include Scaladoc comments explaining what this parameter does, when it should be set to true vs false, and its default value. This is especially important since it controls a significant behavior change in how Parquet migrations work.

Suggested change
/**
* Configuration for periodic savepoints written during a migration run.
*
* @param intervalSeconds
* How often, in seconds, a savepoint should be written.
* @param path
* Filesystem path (directory or prefix) where savepoint data will be stored.
* @param enableParquetFileTracking
* When `true` (the default), enables tracking of already-processed Parquet files
* as part of the savepoint state. This prevents the same Parquet file from
* being migrated more than once if the job is restarted or savepoints are
* resumed.
*
* Set this to `false` to keep the legacy behavior where Parquet files are not
* tracked in savepoints. Disabling tracking may be useful for backwards
* compatibility with older savepoints or when file tracking is handled by an
* external mechanism, but it means repeated runs may reprocess the same
* Parquet files.
*/

case class Savepoints(intervalSeconds: Int, path: String)

object Savepoints {
implicit val encoder: Encoder[Savepoints] = deriveEncoder[Savepoints]
implicit val decoder: Decoder[Savepoints] = deriveDecoder[Savepoints]
migrator/src/main/scala/com/scylladb/migrator/readers/FileCompletionListener.scala
@@ -0,0 +1,93 @@
package com.scylladb.migrator.readers

import org.apache.log4j.LogManager
import org.apache.spark.scheduler.{ SparkListener, SparkListenerTaskEnd }
import org.apache.spark.Success

import scala.collection.concurrent.TrieMap

/**
* SparkListener that tracks partition completion and aggregates it to file-level completion.
*
* This listener monitors Spark task completion events and maintains mappings between
* partitions and files. When all partitions belonging to a file have been successfully
* completed, it marks the file as processed via the ParquetSavepointsManager.
*
* @param partitionToFiles Mapping from Spark partition ID to source file paths
* @param fileToPartitions Mapping from file path to the set of partition IDs reading from it
* @param savepointsManager Manager to notify when files are completed
*/
class FileCompletionListener(
partitionToFiles: Map[Int, Set[String]],
fileToPartitions: Map[String, Set[Int]],
savepointsManager: ParquetSavepointsManager
) extends SparkListener {

private val log = LogManager.getLogger("com.scylladb.migrator.readers.FileCompletionListener")

private val completedPartitions = TrieMap.empty[Int, Boolean]

private val completedFiles = TrieMap.empty[String, Boolean]

log.info(
s"FileCompletionListener initialized: tracking ${fileToPartitions.size} files " +
s"across ${partitionToFiles.size} partitions")

override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit =
if (taskEnd.reason == Success) {
val partitionId = taskEnd.taskInfo.partitionId

partitionToFiles.get(partitionId) match {
case Some(filenames) =>
if (completedPartitions.putIfAbsent(partitionId, true).isEmpty) {
filenames.foreach { filename =>
log.debug(s"Partition $partitionId completed (file: $filename)")
checkFileCompletion(filename)
}
}

case None =>
log.trace(s"Task completed for untracked partition $partitionId")
}
} else {
log.debug(
s"Task for partition ${taskEnd.taskInfo.partitionId} did not complete successfully: ${taskEnd.reason}")
}

private def checkFileCompletion(filename: String): Unit = {
if (completedFiles.contains(filename)) {
return
}

fileToPartitions.get(filename) match {
case Some(allPartitions) =>
val allComplete = allPartitions.forall(completedPartitions.contains)

if (allComplete) {
if (completedFiles.putIfAbsent(filename, true).isEmpty) {
savepointsManager.markFileAsProcessed(filename)

val progress = s"${completedFiles.size}/${fileToPartitions.size}"
log.info(s"File completed: $filename (progress: $progress)")
}
Comment on lines +57 to +72
Copilot AI (Jan 22, 2026):
Race condition in concurrent file completion checking. Between the check at line 58 (if (completedFiles.contains(filename))) and the putIfAbsent at line 67, multiple threads could pass the first check and all attempt to mark the file as complete. While putIfAbsent prevents duplicate entries in the map, this could result in savepointsManager.markFileAsProcessed(filename) being called multiple times for the same file. Consider moving the completion check inside a synchronized block or using completedFiles.putIfAbsent earlier to gate the entire completion logic.

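One way to gate the whole completion path on putIfAbsent, sketched against the names used in this listener (not part of the PR):

  private def checkFileCompletion(filename: String): Unit =
    fileToPartitions.get(filename).foreach { allPartitions =>
      // putIfAbsent is the single gate: only the thread that wins the insert proceeds,
      // so markFileAsProcessed runs at most once per file under concurrent task-end events.
      if (allPartitions.forall(completedPartitions.contains) &&
          completedFiles.putIfAbsent(filename, true).isEmpty) {
        savepointsManager.markFileAsProcessed(filename)
        log.info(s"File completed: $filename (progress: ${completedFiles.size}/${fileToPartitions.size})")
      }
    }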
} else {
val completedCount = allPartitions.count(completedPartitions.contains)
log.trace(s"File $filename: $completedCount/${allPartitions.size} partitions complete")
}

case None =>
log.warn(s"File $filename not found in fileToPartitions map (this shouldn't happen)")
}
}

def getCompletedFilesCount: Int = completedFiles.size

def getTotalFilesCount: Int = fileToPartitions.size

def getProgressReport: String = {
val filesCompleted = getCompletedFilesCount
val totalFiles = getTotalFilesCount

s"Progress: $filesCompleted/$totalFiles files"
}
}
113 changes: 104 additions & 9 deletions migrator/src/main/scala/com/scylladb/migrator/readers/Parquet.scala
@@ -1,22 +1,121 @@
package com.scylladb.migrator.readers

import com.scylladb.migrator.config.SourceSettings
import com.scylladb.migrator.scylla.SourceDataFrame
import com.scylladb.migrator.config.{ MigratorConfig, SourceSettings, TargetSettings }
import com.scylladb.migrator.scylla.{ ScyllaParquetMigrator, SourceDataFrame }
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{ AnalysisException, SparkSession }
import scala.util.Using

object Parquet {
val log = LogManager.getLogger("com.scylladb.migrator.readers.Parquet")

def readDataFrame(spark: SparkSession, source: SourceSettings.Parquet): SourceDataFrame = {
def migrateToScylla(config: MigratorConfig,
source: SourceSettings.Parquet,
target: TargetSettings.Scylla)(implicit spark: SparkSession): Unit = {
log.info("Starting Parquet migration with parallel processing and file-level savepoints")

configureHadoopCredentials(spark, source)

val allFiles = listParquetFiles(spark, source.path)
val skipFiles = config.getSkipParquetFilesOrEmptySet
val filesToProcess = allFiles.filterNot(skipFiles.contains)

if (filesToProcess.isEmpty) {
log.info("No Parquet files to process. Migration is complete.")
return
}

log.info(s"Processing ${filesToProcess.size} Parquet files")

val df = if (skipFiles.isEmpty) {
spark.read.parquet(source.path)
} else {
spark.read.parquet(filesToProcess: _*)
Contributor:
shuffle files before passing them over?

Contributor:
or is spark shuffling this by itself?

Collaborator (author):
Spark handles partition sizing on its own using the spark.sql.files.maxPartitionBytes, so

  • small files will be combined into a single partition
  • big files will be split across several partitions

However, I can add scala.util.Random.shuffle to be safer; the overhead is negligible.
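
For reference, that shuffle would be a one-line change along these lines (illustrative, not in the current diff):

  // Randomize the file order before reading, as discussed above.
  val shuffledFiles = scala.util.Random.shuffle(filesToProcess)
  val df = spark.read.parquet(shuffledFiles: _*)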

}
Comment on lines +52 to +56
Copilot AI (Jan 22, 2026):
Potential inconsistency in DataFrame creation. When skipFiles.isEmpty is true (line 52), the code reads from source.path which may include newly added files that weren't in the original allFiles list. However, when skipFiles is not empty, it reads from the specific filesToProcess list. This could lead to different behavior: in the first case, files added after the listParquetFiles call would be processed, while in the second case they would not. Consider always using the explicit file list for consistency.

Suggested change
val df = if (skipFiles.isEmpty) {
spark.read.parquet(source.path)
} else {
spark.read.parquet(filesToProcess: _*)
}
val df = spark.read.parquet(filesToProcess: _*)


log.info("Reading partition metadata for file tracking...")
val metadata = PartitionMetadataReader.readMetadataFromDataFrame(df)

val partitionToFiles = PartitionMetadataReader.buildPartitionToFileMap(metadata)
val fileToPartitions = PartitionMetadataReader.buildFileToPartitionsMap(metadata)

log.info(
s"Discovered ${fileToPartitions.size} files with ${metadata.size} total partitions to process")

Using.resource(ParquetSavepointsManager(config, spark.sparkContext)) { savepointsManager =>
val listener = new FileCompletionListener(
partitionToFiles,
fileToPartitions,
savepointsManager
)
spark.sparkContext.addSparkListener(listener)

try {
val sourceDF = SourceDataFrame(df, None, savepointsSupported = false)
Contributor:
why false when you support savepoints now?

Contributor:
I think this needs a better look:
the savepoint manager basically needs to dump its state to the savepoint file periodically (and on failure, of course),
and I am wondering whether this periodic write to the savepoint file will happen if this is false?

Contributor (@tarzanek, Nov 21, 2025):
(or the manager is self contained and will dump savepoints automatically once it's instantiated?)

Collaborator (author):
Re: "why false when you support savepoints now?"

Basically, this false flag means no savepoint manager is created for the basic ScyllaMigrator, and it is not used in ParquetMigrator at all;
it's necessary so we don't create an additional SavepointsManager, since we have an external one created in the Parquet object.

Actually, I see current architecture / naming is confusing.

Re: "(or the manager is self contained and will dump savepoints automatically once it's instantiated?)"

Yeah, it is 😌


log.info("Created DataFrame from Parquet source")

ScyllaParquetMigrator.migrate(config, target, sourceDF, savepointsManager)

savepointsManager.dumpMigrationState("completed")

log.info(
s"Parquet migration completed successfully: " +
s"${listener.getCompletedFilesCount}/${listener.getTotalFilesCount} files processed")

} finally {
spark.sparkContext.removeSparkListener(listener)
log.info(s"Final progress: ${listener.getProgressReport}")
}
}
Comment on lines +67 to +92
Copilot AI (Jan 22, 2026):
Resource management concern: The ParquetSavepointsManager is closed by the Using.resource block, but it's passed to ScyllaParquetMigrator.migrate which internally marks it with shouldCloseManager = false to prevent double-closing. However, this creates a brittle design where the lifecycle management is split between two places. If an exception occurs during migration (line 80), the savepointsManager.dumpMigrationState("completed") at line 82 will never be called, and only the "final" dump in ScyllaMigrator will execute. Consider consolidating the resource management logic or adding explicit error handling to ensure proper state dumping in all scenarios.

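A sketch of the consolidation the comment suggests, using the names from this diff (the "failed" reason string is an assumption, not an existing convention in the migrator):

  Using.resource(ParquetSavepointsManager(config, spark.sparkContext)) { savepointsManager =>
    val listener = new FileCompletionListener(partitionToFiles, fileToPartitions, savepointsManager)
    spark.sparkContext.addSparkListener(listener)
    try {
      val sourceDF = SourceDataFrame(df, None, savepointsSupported = false)
      ScyllaParquetMigrator.migrate(config, target, sourceDF, savepointsManager)
      savepointsManager.dumpMigrationState("completed")
    } catch {
      case e: Throwable =>
        savepointsManager.dumpMigrationState("failed") // assumed reason label
        throw e
    } finally {
      spark.sparkContext.removeSparkListener(listener)
    }
  }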
}

def listParquetFiles(spark: SparkSession, path: String): Seq[String] = {
log.info(s"Discovering Parquet files in $path")

try {
val dataFrame = spark.read
.option("recursiveFileLookup", "true")
.parquet(path)

val files = dataFrame.inputFiles.toSeq.distinct.sorted

if (files.isEmpty) {
throw new IllegalArgumentException(s"No Parquet files found in $path")
}

log.info(s"Found ${files.size} Parquet file(s)")
files
} catch {
case e: AnalysisException =>
val message = s"Failed to list Parquet files from $path: ${e.getMessage}"
log.error(message)
throw new IllegalArgumentException(message, e)
}
}

/**
Contributor:
The change below has nothing to do with Parquet, but OK, I will close one eye. PR-cleanliness-wise it just confuses people, so unrelated changes should go in separate PRs/commits.

Collaborator (author):
Yes, thank you.
Looks like a leftover from the earlier approach with both parallel and sequential migrations.

* Configures Hadoop S3A credentials for reading from AWS S3.
*
* This method sets the necessary Hadoop configuration properties for AWS access key, secret key,
* and optionally a session token. When a session token is present, it sets the credentials provider
* to TemporaryAWSCredentialsProvider as required by Hadoop.
*
* If a region is specified in the source configuration, this method also sets the S3A endpoint region
* via the `fs.s3a.endpoint.region` property.
*
* For more details, see the official Hadoop AWS documentation:
* https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Authentication
*/
private[readers] def configureHadoopCredentials(spark: SparkSession,
source: SourceSettings.Parquet): Unit =
source.finalCredentials.foreach { credentials =>
log.info("Loaded AWS credentials from config file")
source.region.foreach { region =>
spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint.region", region)
}
spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", credentials.accessKey)
spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", credentials.secretKey)
// See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Using_Session_Credentials_with_TemporaryAWSCredentialsProvider
credentials.maybeSessionToken.foreach { sessionToken =>
spark.sparkContext.hadoopConfiguration.set(
"fs.s3a.aws.credentials.provider",
@@ -27,8 +126,4 @@ object Parquet {
)
}
}

SourceDataFrame(spark.read.parquet(source.path), None, false)
}

}
migrator/src/main/scala/com/scylladb/migrator/readers/ParquetSavepointsManager.scala
@@ -0,0 +1,36 @@
package com.scylladb.migrator.readers

import com.scylladb.migrator.SavepointsManager
import com.scylladb.migrator.config.MigratorConfig
import com.scylladb.migrator.alternator.StringSetAccumulator
import org.apache.spark.SparkContext

class ParquetSavepointsManager(migratorConfig: MigratorConfig,
filesAccumulator: StringSetAccumulator)
extends SavepointsManager(migratorConfig) {

def describeMigrationState(): String = {
val processedCount = filesAccumulator.value.size
s"Processed files: $processedCount"
}

def updateConfigWithMigrationState(): MigratorConfig =
migratorConfig.copy(skipParquetFiles = Some(filesAccumulator.value))

def markFileAsProcessed(filePath: String): Unit = {
filesAccumulator.add(filePath)
log.debug(s"Marked file as processed: $filePath")
}
}

object ParquetSavepointsManager {

def apply(migratorConfig: MigratorConfig, spark: SparkContext): ParquetSavepointsManager = {
val filesAccumulator =
StringSetAccumulator(migratorConfig.skipParquetFiles.getOrElse(Set.empty))

spark.register(filesAccumulator, "processed-parquet-files")

new ParquetSavepointsManager(migratorConfig, filesAccumulator)
}
}