package io.aiven.guardian.kafka.backup.s3

+import akka.Done
+import akka.NotUsed
+import akka.actor.ActorSystem
+import akka.stream.alpakka.s3.FailedUploadPart
import akka.stream.alpakka.s3.MultipartUploadResult
+import akka.stream.alpakka.s3.Part
import akka.stream.alpakka.s3.S3Attributes
import akka.stream.alpakka.s3.S3Headers
import akka.stream.alpakka.s3.S3Settings
+import akka.stream.alpakka.s3.SuccessfulUploadPart
+import akka.stream.alpakka.s3.UploadPartResponse
import akka.stream.alpakka.s3.scaladsl.S3
import akka.stream.scaladsl._
import akka.util.ByteString
+import com.typesafe.scalalogging.StrictLogging
import io.aiven.guardian.kafka.KafkaClientInterface
import io.aiven.guardian.kafka.backup.BackupClientInterface
import io.aiven.guardian.kafka.backup.configs.Backup
import io.aiven.guardian.kafka.s3.configs.{S3 => S3Config}

+import scala.collection.immutable
import scala.concurrent.ExecutionContext
import scala.concurrent.Future

+import java.time.Instant
+
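+// Tracks a resumable S3 multipart upload: the uploadId plus the parts uploaded so far.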
+final case class CurrentS3State(uploadId: String, parts: Seq[Part])
+
class BackupClient[T <: KafkaClientInterface](maybeS3Settings: Option[S3Settings])(implicit
    override val kafkaClientInterface: T,
    override val backupConfig: Backup,
+    override val system: ActorSystem,
    s3Config: S3Config,
    s3Headers: S3Headers
-) extends BackupClientInterface[T] {
+) extends BackupClientInterface[T]
+    with StrictLogging {

  override def empty: () => Future[Option[MultipartUploadResult]] = () => Future.successful(None)

  override type BackupResult = Option[MultipartUploadResult]

-  override def backupToStorageSink(key: String): Sink[ByteString, Future[BackupResult]] = {
-    val base = S3
-      .multipartUploadWithHeaders(
-        s3Config.dataBucket,
-        key,
-        s3Headers = s3Headers,
-        chunkingParallelism = 1 // Parallelism is pointless for this usecase since we are directly streaming from Kafka
+  override type CurrentState = CurrentS3State
+
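+  // Looks up any incomplete multipart upload in S3 for the given key so that an
+  // interrupted backup can be resumed rather than restarted from scratch.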
+  override def getCurrentUploadState(key: String): Future[Option[CurrentS3State]] = {
+    implicit val ec: ExecutionContext = system.classicSystem.getDispatcher
+
+    val baseListMultipart = S3.listMultipartUpload(s3Config.dataBucket, None)
+
+    for {
+      incompleteUploads <-
+        maybeS3Settings
+          .fold(baseListMultipart)(s3Settings => baseListMultipart.withAttributes(S3Attributes.settings(s3Settings)))
+          .runWith(Sink.seq)
+      keys = incompleteUploads.filter(_.key == key)
+      result <- if (keys.isEmpty)
+                  Future.successful(None)
+                else {
+                  val listMultipartUploads = keys match {
+                    case Seq(single) =>
+                      logger.info(
+                        s"Found previous uploadId: ${single.uploadId} and bucket: ${s3Config.dataBucket} with key: ${single.key}"
+                      )
+                      single
+                    case rest =>
+                      val last = rest.minBy(_.initiated)(Ordering[Instant].reverse)
+                      logger.warn(
+                        s"Found multiple previously cancelled uploads for key: $key and bucket: ${s3Config.dataBucket}, picking uploadId: ${last.uploadId}"
+                      )
+                      last
+                  }
+                  val uploadId = listMultipartUploads.uploadId
+                  val baseList = S3.listParts(s3Config.dataBucket, key, listMultipartUploads.uploadId)
+
+                  for {
+                    parts <- maybeS3Settings
+                               .fold(baseList)(s3Settings => baseList.withAttributes(S3Attributes.settings(s3Settings)))
+                               .runWith(Sink.seq)
+
+                    finalParts = parts.lastOption match {
+                      case Some(part) if part.size >= akka.stream.alpakka.s3.scaladsl.S3.MinChunkSize =>
+                        parts
+                      case _ =>
+                        // We drop the last part here since it's broken
+                        parts.dropRight(1)
+                    }
+                  } yield Some(CurrentS3State(uploadId, finalParts.map(_.toPart)))
+                }
+    } yield result
+
+  }
+
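+  // Logs S3 part uploads that failed; their Kafka cursors are dropped rather than committed.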
+  private[s3] def failureSink
+      : Sink[(FailedUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext]), Future[Done]] = Sink
+    .foreach[(FailedUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext])] { case (failedPart, _) =>
+      logger.warn(
+        s"Failed to upload a chunk into S3 with bucket: ${failedPart.multipartUpload.bucket}, key: ${failedPart.multipartUpload.key}, uploadId: ${failedPart.multipartUpload.uploadId} and partNumber: ${failedPart.partNumber}",
+        failedPart.exception
      )
-      .mapMaterializedValue(future => future.map(result => Some(result))(ExecutionContext.parasitic))
+    }
+
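+  // Commits the batched Kafka cursors for every successfully uploaded S3 part.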
+  private[s3] def successSink
+      : Sink[(SuccessfulUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext]), Future[Done]] =
+    kafkaClientInterface.commitCursor
+      .contramap[(SuccessfulUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext])] { case (_, cursors) =>
+        kafkaClientInterface.batchCursorContext(cursors)
+      }
+
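+  // Broadcasts every upload-part response to both sinks: successful parts commit their
+  // Kafka cursors, failed parts are only logged.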
+  private[s3] def kafkaBatchSink
+      : Sink[(UploadPartResponse, immutable.Iterable[kafkaClientInterface.CursorContext]), NotUsed] = {
+
+    val success = Flow[(UploadPartResponse, immutable.Iterable[kafkaClientInterface.CursorContext])]
+      .collectType[(SuccessfulUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext])]
+      .wireTap { data =>
+        val (part, _) = data
+        logger.info(
+          s"Committing kafka cursor for uploadId: ${part.multipartUpload.uploadId} key: ${part.multipartUpload.key} and S3 part: ${part.partNumber}"
+        )
+      }
+      .toMat(successSink)(Keep.none)
+
+    val failure = Flow[(UploadPartResponse, immutable.Iterable[kafkaClientInterface.CursorContext])]
+      .collectType[(FailedUploadPart, immutable.Iterable[kafkaClientInterface.CursorContext])]
+      .toMat(failureSink)(Keep.none)
+
+    Sink.combine(success, failure)(Broadcast(_))
+  }
+
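+  // Builds the S3 sink for one backup key, resuming the previous multipart upload when
+  // resumable state was found, otherwise starting a fresh upload.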
+  override def backupToStorageSink(key: String,
+                                   currentState: Option[CurrentS3State]
+  ): Sink[(ByteString, kafkaClientInterface.CursorContext), Future[BackupResult]] = {
+
+    // Note that chunkingParallelism is pointless for this use case since we are directly streaming from Kafka.
+    // Furthermore, the real `KafkaClient` implementation uses `CommittableOffsetBatch`, which is a global singleton,
+    // so we can't have concurrent updates to this data structure.
+
+    val sink = currentState match {
+      case Some(state) =>
+        logger.info(
+          s"Resuming previous upload with uploadId: ${state.uploadId} and bucket: ${s3Config.dataBucket} with key: $key"
+        )
+        S3.resumeMultipartUploadWithHeadersAndContext[kafkaClientInterface.CursorContext](
+          s3Config.dataBucket,
+          key,
+          state.uploadId,
+          state.parts,
+          kafkaBatchSink,
+          s3Headers = s3Headers,
+          chunkingParallelism = 1
+        )
+      case None =>
+        logger.info(
+          s"Creating new upload with bucket: ${s3Config.dataBucket} and key: $key"
+        )
+        S3.multipartUploadWithHeadersAndContext[kafkaClientInterface.CursorContext](
+          s3Config.dataBucket,
+          key,
+          kafkaBatchSink,
+          s3Headers = s3Headers,
+          chunkingParallelism = 1
+        )
+    }
+
+    val base = sink.mapMaterializedValue(future => future.map(result => Some(result))(ExecutionContext.parasitic))
    maybeS3Settings.fold(base)(s3Settings => base.withAttributes(S3Attributes.settings(s3Settings)))
  }
}