apache · kirktrue · Aug 16, 2024 · Aug 28, 2024 · Aug 28, 2024 · Aug 28, 2024
diff --git a/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java b/clients/src/main/java/org/apache/kafka/clients/producer/internals/TransactionManager.java
@@ -667,14 +667,23 @@ public synchronized void maybeTransitionToErrorState(RuntimeException exception)
     }
 
     synchronized void handleFailedBatch(ProducerBatch batch, RuntimeException exception, boolean adjustSequenceNumbers) {
-        maybeTransitionToErrorState(exception);
+        boolean isStaleBatch = batch.producerId() == producerIdAndEpoch.producerId && batch.producerEpoch() < producerIdAndEpoch.epoch;
+
+        if (!isStaleBatch && !hasFatalError())
+            maybeTransitionToErrorState(exception);
+
         removeInFlightBatch(batch);
 
         if (hasFatalError()) {
             log.debug("Ignoring batch {} with producer id {}, epoch {}, and sequence number {} " +
                             "since the producer is already in fatal error state", batch, batch.producerId(),
                     batch.producerEpoch(), batch.baseSequence(), exception);
             return;
+        } else if (isStaleBatch) {
+            log.debug("Ignoring stale batch {} with producer id {}, epoch {}, and sequence number {} " +
+                    "since the producer has been re-initialized with producer id {} and epoch {}", batch, batch.producerId(),
+                batch.producerEpoch(), batch.baseSequence(), producerIdAndEpoch.producerId, producerIdAndEpoch.epoch, exception);
+            return;
         }
 
         if (exception instanceof OutOfOrderSequenceException && !isTransactional()) {

diff --git a/...nts/src/test/java/org/apache/kafka/clients/producer/internals/TransactionManagerTest.java b/...nts/src/test/java/org/apache/kafka/clients/producer/internals/TransactionManagerTest.java
@@ -28,6 +28,7 @@
 import org.apache.kafka.common.Node;
 import org.apache.kafka.common.TopicPartition;
 import org.apache.kafka.common.compress.Compression;
+import org.apache.kafka.common.errors.DisconnectException;
 import org.apache.kafka.common.errors.FencedInstanceIdException;
 import org.apache.kafka.common.errors.GroupAuthorizationException;
 import org.apache.kafka.common.errors.InvalidTxnStateException;
@@ -3610,6 +3611,44 @@ public void testTransactionAbortableExceptionInAddOffsetsToTxn() {
         assertAbortableError(TransactionAbortableException.class);
     }
 
+    @Test
+    public void testBatchesReceivedAfterAbortableError() {
+        doInitTransactions();
+        transactionManager.beginTransaction();
+
+        ProducerBatch batch = writeIdempotentBatchWithValue(transactionManager, tp1, "first");
+
+        // Mimic the catch block for ApiException in KafkaProducer.doSend(): a tenuous broker connection surfaces
+        // as a DisconnectException that is handed to maybeTransitionToErrorState().
+        transactionManager.maybeTransitionToErrorState(new DisconnectException("test"));
+
+        // The error is bubbled up to the user, who responds by aborting the transaction...
+        TransactionalRequestResult result = transactionManager.beginAbort();
+
+        // The abort is handled internally; the prepared response keeps the producer id but bumps the epoch.
+        short bumpedEpoch = epoch + 1;
+        prepareInitPidResponse(Errors.NONE, false, producerId, bumpedEpoch);
+        runUntil(result::isCompleted);
+
+        // Mimic a slow produce response whose timeout reaches the client only after the rollback above completed.
+        // The batch predates the epoch bump, so it is stale and must not change state; passing means no throw.
+        transactionManager.handleFailedBatch(batch, new TimeoutException(), false);
+    }
+
+    @Test
+    public void testBatchesReceivedAfterFatalError() {
+        doInitTransactions();
+        transactionManager.beginTransaction();
+
+        ProducerBatch batch = writeIdempotentBatchWithValue(transactionManager, tp1, "first");
+
+        // Force the transaction manager into its FATAL_ERROR state, here using a PRODUCER_FENCED error.
+        transactionManager.transitionToFatalError(Errors.PRODUCER_FENCED.exception());
+
+        // With a fatal error already recorded, this late failure must not attempt a transition to ABORTABLE_ERROR.
+        transactionManager.handleFailedBatch(batch, new TimeoutException(), false);
+    }
+
     @Test
     public void testTransactionAbortableExceptionInTxnOffsetCommit() {
         final TopicPartition tp = new TopicPartition("foo", 0);