@@ -774,6 +774,92 @@ void testFailedCopyShouldDeleteTheDanglingSegment() throws Exception {
774774 assertEquals (1 , brokerTopicStats .allTopicsStats ().remoteCopyLagSegmentsAggrMetric ().value ());
775775 }
776776
777+ @ Test
778+ void testFailedCopyWithRetriableExceptionShouldNotDeleteTheDanglingSegment () throws Exception {
779+ long oldSegmentStartOffset = 0L ;
780+ long nextSegmentStartOffset = 150L ;
781+ long lastStableOffset = 150L ;
782+ long logEndOffset = 150L ;
783+
784+ when (mockLog .onlyLocalLogSegmentsSize ()).thenReturn (12L );
785+ when (mockLog .onlyLocalLogSegmentsCount ()).thenReturn (2L );
786+ when (mockLog .topicPartition ()).thenReturn (leaderTopicIdPartition .topicPartition ());
787+
788+ // leader epoch preparation
789+ checkpoint .write (totalEpochEntries );
790+ LeaderEpochFileCache cache = new LeaderEpochFileCache (leaderTopicIdPartition .topicPartition (), checkpoint , scheduler );
791+ when (mockLog .leaderEpochCache ()).thenReturn (cache );
792+ when (remoteLogMetadataManager .highestOffsetForEpoch (any (TopicIdPartition .class ), anyInt ())).thenReturn (Optional .of (-1L ));
793+
794+ File tempFile = TestUtils .tempFile ();
795+ File mockProducerSnapshotIndex = TestUtils .tempFile ();
796+ File tempDir = TestUtils .tempDirectory ();
797+ // create 2 log segments, with 0 and 150 as log start offset
798+ LogSegment oldSegment = mock (LogSegment .class );
799+ LogSegment activeSegment = mock (LogSegment .class );
800+
801+ when (oldSegment .baseOffset ()).thenReturn (oldSegmentStartOffset );
802+ when (activeSegment .baseOffset ()).thenReturn (nextSegmentStartOffset );
803+ when (activeSegment .size ()).thenReturn (2 );
804+ verify (oldSegment , times (0 )).readNextOffset ();
805+ verify (activeSegment , times (0 )).readNextOffset ();
806+
807+ FileRecords fileRecords = mock (FileRecords .class );
808+ when (oldSegment .log ()).thenReturn (fileRecords );
809+ when (fileRecords .file ()).thenReturn (tempFile );
810+ when (fileRecords .sizeInBytes ()).thenReturn (10 );
811+ when (oldSegment .readNextOffset ()).thenReturn (nextSegmentStartOffset );
812+
813+ when (mockLog .activeSegment ()).thenReturn (activeSegment );
814+ when (mockLog .logStartOffset ()).thenReturn (oldSegmentStartOffset );
815+ when (mockLog .logSegments (anyLong (), anyLong ())).thenReturn (List .of (oldSegment , activeSegment ));
816+
817+ ProducerStateManager mockStateManager = mock (ProducerStateManager .class );
818+ when (mockLog .producerStateManager ()).thenReturn (mockStateManager );
819+ when (mockStateManager .fetchSnapshot (anyLong ())).thenReturn (Optional .of (mockProducerSnapshotIndex ));
820+ when (mockLog .lastStableOffset ()).thenReturn (lastStableOffset );
821+ when (mockLog .logEndOffset ()).thenReturn (logEndOffset );
822+
823+ OffsetIndex idx = LazyIndex .forOffset (LogFileUtils .offsetIndexFile (tempDir , oldSegmentStartOffset , "" ), oldSegmentStartOffset , 1000 ).get ();
824+ TimeIndex timeIdx = LazyIndex .forTime (LogFileUtils .timeIndexFile (tempDir , oldSegmentStartOffset , "" ), oldSegmentStartOffset , 1500 ).get ();
825+ File txnFile = UnifiedLog .transactionIndexFile (tempDir , oldSegmentStartOffset , "" );
826+ txnFile .createNewFile ();
827+ TransactionIndex txnIndex = new TransactionIndex (oldSegmentStartOffset , txnFile );
828+ when (oldSegment .timeIndex ()).thenReturn (timeIdx );
829+ when (oldSegment .offsetIndex ()).thenReturn (idx );
830+ when (oldSegment .txnIndex ()).thenReturn (txnIndex );
831+
832+ CompletableFuture <Void > dummyFuture = new CompletableFuture <>();
833+ dummyFuture .complete (null );
834+ when (remoteLogMetadataManager .addRemoteLogSegmentMetadata (any (RemoteLogSegmentMetadata .class ))).thenReturn (dummyFuture );
835+ when (rlmCopyQuotaManager .getThrottleTimeMs ()).thenReturn (quotaAvailableThrottleTime );
836+ when (remoteLogMetadataManager .updateRemoteLogSegmentMetadata (any (RemoteLogSegmentMetadataUpdate .class ))).thenReturn (dummyFuture );
837+
838+ // throw retriable exception when copyLogSegmentData
839+ when (remoteStorageManager .copyLogSegmentData (any (RemoteLogSegmentMetadata .class ), any (LogSegmentData .class )))
840+ .thenThrow (new RetriableRemoteStorageException ("test-retriable" ));
841+ RemoteLogManager .RLMCopyTask task = remoteLogManager .new RLMCopyTask (leaderTopicIdPartition , 128 );
842+ assertThrows (RetriableRemoteStorageException .class , () -> task .copyLogSegmentsToRemote (mockLog ));
843+
844+ ArgumentCaptor <RemoteLogSegmentMetadata > remoteLogSegmentMetadataArg = ArgumentCaptor .forClass (RemoteLogSegmentMetadata .class );
845+ verify (remoteLogMetadataManager ).addRemoteLogSegmentMetadata (remoteLogSegmentMetadataArg .capture ());
846+ // verify the segment is not deleted for retriable exception
847+ verify (remoteStorageManager , never ()).deleteLogSegmentData (eq (remoteLogSegmentMetadataArg .getValue ()));
848+ verify (remoteLogMetadataManager , never ()).updateRemoteLogSegmentMetadata (any (RemoteLogSegmentMetadataUpdate .class ));
849+
850+ // Verify the metrics
851+ // Retriable exceptions should not count as failures for copy
852+ assertEquals (1 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).remoteCopyRequestRate ().count ());
853+ assertEquals (0 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).remoteCopyBytesRate ().count ());
854+ assertEquals (0 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).failedRemoteCopyRequestRate ().count ());
855+ // Verify aggregate metrics
856+ assertEquals (1 , brokerTopicStats .allTopicsStats ().remoteCopyRequestRate ().count ());
857+ assertEquals (0 , brokerTopicStats .allTopicsStats ().remoteCopyBytesRate ().count ());
858+ assertEquals (0 , brokerTopicStats .allTopicsStats ().failedRemoteCopyRequestRate ().count ());
859+ assertEquals (10 , brokerTopicStats .allTopicsStats ().remoteCopyLagBytesAggrMetric ().value ());
860+ assertEquals (1 , brokerTopicStats .allTopicsStats ().remoteCopyLagSegmentsAggrMetric ().value ());
861+ }
862+
777863 @ Test
778864 void testRemoteLogManagerTasksAvgIdlePercentAndMetadataCountMetrics () throws Exception {
779865 long oldSegmentStartOffset = 0L ;
@@ -2401,7 +2487,7 @@ long findLogStartOffset(TopicIdPartition topicIdPartition, UnifiedLog log) {
24012487 Thread copyThread = new Thread (() -> {
24022488 try {
24032489 copyTask .copyLogSegmentsToRemote (mockLog );
2404- } catch (InterruptedException e ) {
2490+ } catch (InterruptedException | RetriableRemoteStorageException e ) {
24052491 throw new RuntimeException (e );
24062492 }
24072493 });
@@ -2840,6 +2926,61 @@ public void testFailedDeleteExpiredSegments(long retentionSize,
28402926 verify (remoteStorageManager ).deleteLogSegmentData (metadataList .get (0 ));
28412927 }
28422928
2929+ @ ParameterizedTest (name = "testDeleteSegmentFailureWithRetriableExceptionShouldNotUpdateMetrics retentionSize={0} retentionMs={1}" )
2930+ @ CsvSource (value = {"0, -1" , "-1, 0" })
2931+ public void testDeleteSegmentFailureWithRetriableExceptionShouldNotUpdateMetrics (long retentionSize ,
2932+ long retentionMs ) throws RemoteStorageException , ExecutionException , InterruptedException {
2933+ Map <String , Long > logProps = new HashMap <>();
2934+ logProps .put ("retention.bytes" , retentionSize );
2935+ logProps .put ("retention.ms" , retentionMs );
2936+ LogConfig mockLogConfig = new LogConfig (logProps );
2937+ when (mockLog .config ()).thenReturn (mockLogConfig );
2938+
2939+ List <EpochEntry > epochEntries = List .of (epochEntry0 );
2940+ checkpoint .write (epochEntries );
2941+ LeaderEpochFileCache cache = new LeaderEpochFileCache (tp , checkpoint , scheduler );
2942+ when (mockLog .leaderEpochCache ()).thenReturn (cache );
2943+
2944+ when (mockLog .topicPartition ()).thenReturn (leaderTopicIdPartition .topicPartition ());
2945+ when (mockLog .logEndOffset ()).thenReturn (200L );
2946+
2947+ List <RemoteLogSegmentMetadata > metadataList =
2948+ listRemoteLogSegmentMetadata (leaderTopicIdPartition , 1 , 100 , 1024 , RemoteLogSegmentState .COPY_SEGMENT_FINISHED );
2949+ when (remoteLogMetadataManager .listRemoteLogSegments (leaderTopicIdPartition ))
2950+ .thenReturn (metadataList .iterator ());
2951+ when (remoteLogMetadataManager .listRemoteLogSegments (leaderTopicIdPartition , 0 ))
2952+ .thenAnswer (ans -> metadataList .iterator ());
2953+ when (remoteLogMetadataManager .updateRemoteLogSegmentMetadata (any (RemoteLogSegmentMetadataUpdate .class )))
2954+ .thenReturn (CompletableFuture .runAsync (() -> { }));
2955+
2956+ // Verify the metrics for remote deletes and for failures is zero before attempt to delete segments
2957+ assertEquals (0 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).remoteDeleteRequestRate ().count ());
2958+ assertEquals (0 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).failedRemoteDeleteRequestRate ().count ());
2959+ // Verify aggregate metrics
2960+ assertEquals (0 , brokerTopicStats .allTopicsStats ().remoteDeleteRequestRate ().count ());
2961+ assertEquals (0 , brokerTopicStats .allTopicsStats ().failedRemoteDeleteRequestRate ().count ());
2962+
2963+ RemoteLogManager .RLMExpirationTask task = remoteLogManager .new RLMExpirationTask (leaderTopicIdPartition );
2964+ doThrow (new RetriableRemoteStorageException ("Failed to delete segment with retriable exception" )).when (remoteStorageManager ).deleteLogSegmentData (any ());
2965+ assertThrows (RetriableRemoteStorageException .class , task ::cleanupExpiredRemoteLogSegments );
2966+
2967+ assertEquals (100L , currentLogStartOffset .get ());
2968+ verify (remoteStorageManager ).deleteLogSegmentData (metadataList .get (0 ));
2969+
2970+ // Verify the metric for remote delete is updated correctly
2971+ assertEquals (1 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).remoteDeleteRequestRate ().count ());
2972+ // Verify we did not report failure for remote deletes with retriable exception
2973+ assertEquals (0 , brokerTopicStats .topicStats (leaderTopicIdPartition .topic ()).failedRemoteDeleteRequestRate ().count ());
2974+ // Verify aggregate metrics
2975+ assertEquals (1 , brokerTopicStats .allTopicsStats ().remoteDeleteRequestRate ().count ());
2976+ assertEquals (0 , brokerTopicStats .allTopicsStats ().failedRemoteDeleteRequestRate ().count ());
2977+
2978+ // make sure we'll retry the deletion in next run
2979+ doNothing ().when (remoteStorageManager ).deleteLogSegmentData (any ());
2980+ task .cleanupExpiredRemoteLogSegments ();
2981+ verify (remoteStorageManager ).deleteLogSegmentData (metadataList .get (0 ));
2982+ }
2983+
28432984 @ ParameterizedTest (name = "testDeleteLogSegmentDueToRetentionSizeBreach segmentCount={0} deletableSegmentCount={1}" )
28442985 @ CsvSource (value = {"50, 0" , "50, 1" , "50, 23" , "50, 50" })
28452986 public void testDeleteLogSegmentDueToRetentionSizeBreach (int segmentCount ,
0 commit comments