4545import org .apache .kafka .common .utils .Time ;
4646import org .apache .kafka .common .utils .internals .ExponentialBackoffManager ;
4747import org .apache .kafka .server .config .ServerConfigs ;
48+ import org .apache .kafka .server .share .metrics .ShareGroupMetrics ;
4849import org .apache .kafka .server .util .InterBrokerSendThread ;
4950import org .apache .kafka .server .util .RequestAndCompletionHandler ;
5051import org .apache .kafka .server .util .timer .Timer ;
@@ -80,6 +81,7 @@ public class ShareGroupDLQStateManager {
8081 private final Time time ;
8182 private final Timer timer ;
8283 private final ShareGroupDLQMetadataCacheHelper cacheHelper ;
84+ private final ShareGroupMetrics shareGroupMetrics ;
8385 public static final long REQUEST_BACKOFF_MS = 1_000L ;
8486 public static final long REQUEST_BACKOFF_MAX_MS = 30_000L ;
8587 private static final int MAX_REQUEST_ATTEMPTS = 5 ;
@@ -91,7 +93,18 @@ public class ShareGroupDLQStateManager {
9193 private final Map <Node , List <ProduceRequestHandler >> nodeRPCMap = new HashMap <>();
9294 private final Object nodeMapLock = new Object ();
9395
94- public ShareGroupDLQStateManager (KafkaClient client , ShareGroupDLQMetadataCacheHelper cacheHelper , Time time , Timer timer ) {
96+ // Called when the generateRequests method is executed by InterBrokerSendThread, returning requests.
97+ // Mainly for testing and introspection purpose to inspect the state of the nodeRPC map
98+ // when generateRequests is called.
99+ private Runnable generateCallback ;
100+
101+ public ShareGroupDLQStateManager (
102+ KafkaClient client ,
103+ ShareGroupDLQMetadataCacheHelper cacheHelper ,
104+ Time time ,
105+ Timer timer ,
106+ ShareGroupMetrics shareGroupMetrics
107+ ) {
95108 if (client == null ) {
96109 throw new IllegalArgumentException ("Kafkaclient must not be null." );
97110 }
@@ -108,9 +121,14 @@ public ShareGroupDLQStateManager(KafkaClient client, ShareGroupDLQMetadataCacheH
108121 throw new IllegalArgumentException ("Timer must not be null." );
109122 }
110123
124+ if (shareGroupMetrics == null ) {
125+ throw new IllegalArgumentException ("ShareGroupMetrics must not be null." );
126+ }
127+
111128 this .time = time ;
112129 this .timer = timer ;
113130 this .cacheHelper = cacheHelper ;
131+ this .shareGroupMetrics = shareGroupMetrics ;
114132 this .sender = new SendThread (
115133 "ShareGroupDLQSendThread" ,
116134 client ,
@@ -154,6 +172,16 @@ CompletableFuture<Void> dlq(ShareGroupDLQRecordParameter param, long requestBack
154172 return future ;
155173 }
156174
175+ // Visibility for tests
176+ void setGenerateCallback (Runnable generateCallback ) {
177+ this .generateCallback = generateCallback ;
178+ }
179+
180+ // Visibility for tests
181+ Map <Node , List <ShareGroupDLQStateManager .ProduceRequestHandler >> nodeRPCMap () {
182+ return nodeRPCMap ;
183+ }
184+
157185 private void enqueue (ProduceRequestHandler requestHandler ) {
158186 sender .enqueue (requestHandler );
159187 }
@@ -333,6 +361,10 @@ public ProduceRequestData.TopicProduceData topicProduceData() {
333361 simpleRecords .toArray (new SimpleRecord []{})
334362 );
335363
364+ // Update the metric to say a new request is created to se sent. This might not be the
365+ // actual RPC count as we coalesce the requests before sending.
366+ shareGroupMetrics .recordDLQProduce (param .groupId ());
367+
336368 return new ProduceRequestData .TopicProduceData ()
337369 .setName (dlqTopicPartitionData .topicName ())
338370 .setTopicId (dlqTopicPartitionData .topicId ().get ())
@@ -580,6 +612,7 @@ private void handleProduceResponse(ClientResponse response) {
580612 switch (error ) {
581613 case NONE :
582614 LOG .debug ("Successfully produced records {} to dlq topic node {}." , this , dlqPartitionLeaderNode ());
615+ shareGroupMetrics .recordDLQRecordWrite (param .groupId (), (int ) (param .lastOffset () - param .firstOffset () + 1 ));
583616 produceRequestBackoff .resetAttempts ();
584617 this .result .complete (null );
585618 break ;
@@ -588,6 +621,7 @@ private void handleProduceResponse(ClientResponse response) {
588621 LOG .debug ("Received retriable error produce response for {} to dlq topic node {} - {}." , this , dlqPartitionLeaderNode (), errorMessage );
589622 if (!produceRequestBackoff .canAttempt ()) {
590623 LOG .error ("Exhausted max retries to produce {} to DLQ topic node {}." , this , dlqPartitionLeaderNode ());
624+ shareGroupMetrics .recordDLQProduceFailed (param .groupId ());
591625 requestErrorResponse (new Exception ("Exhausted max retries to produce to DLQ topic without success." ));
592626 break ;
593627 }
@@ -598,6 +632,7 @@ private void handleProduceResponse(ClientResponse response) {
598632 LOG .error ("Unable to produce {} to DLQ topic node {} - {}." , this , dlqPartitionLeaderNode (), errorMessage );
599633 partitionResponse .recordErrors ().forEach (recordError ->
600634 LOG .error ("Records with errors {} - {}." , recordError .batchIndex (), recordError .batchIndexErrorMessage ()));
635+ shareGroupMetrics .recordDLQProduceFailed (param .groupId ());
601636 requestErrorResponse (error .exception ());
602637 }
603638 break ;
@@ -609,6 +644,7 @@ private void handleProduceResponse(ClientResponse response) {
609644 if (!produceRequestBackoff .canAttempt ()) {
610645 LOG .error ("Exhausted max retries to produce {} to DLQ topic node {} due to client response error {}." ,
611646 param , dlqPartitionLeaderNode (), clientResponseErrorMessage );
647+ shareGroupMetrics .recordDLQProduceFailed (param .groupId ());
612648 requestErrorResponse (clientResponseError .exception ());
613649 break ;
614650 }
@@ -618,6 +654,7 @@ private void handleProduceResponse(ClientResponse response) {
618654 default :
619655 LOG .error ("Unable to produce {} to DLQ topic node {} due to client response error {}." ,
620656 param , dlqPartitionLeaderNode (), clientResponseErrorMessage );
657+ shareGroupMetrics .recordDLQProduceFailed (param .groupId ());
621658 requestErrorResponse (clientResponseError .exception ());
622659 }
623660 }
@@ -634,6 +671,11 @@ private class SendThread extends InterBrokerSendThread {
634671
635672 @ Override
636673 public Collection <RequestAndCompletionHandler > generateRequests () {
674+ // Introspection for testing - will be null in prod
675+ if (generateCallback != null ) {
676+ generateCallback .run ();
677+ }
678+
637679 List <RequestAndCompletionHandler > requests = new ArrayList <>();
638680
639681 if (!queue .isEmpty ()) {
@@ -772,13 +814,19 @@ public void run() {
772814 }
773815 }
774816
775- private record CoalesceResults (
817+ // Visibility for tests
818+ record CoalesceResults (
776819 AbstractRequest .Builder <? extends AbstractRequest > request ,
777820 List <ProduceRequestHandler > liveHandlers
778821 ) {
779822 }
780823
781- private static CoalesceResults coalesceProduceRequests (List <ProduceRequestHandler > handlers ) {
824+ // Visibility for tests
825+ static CoalesceResults coalesceProduceRequests (List <ProduceRequestHandler > handlers ) {
826+ // Above handlers are destined for the same broker node - it could be for different DLQ topics and partitions
827+ // but the same broker node. Now the produce request requires each topic data request to be
828+ // scoped to a specific topic/topicId and the partition data could have all the record information
829+ // and the destination DLQ partition. To accomplish this, we will map handlers by DLQ topic id.
782830 Map <Uuid , ProduceRequestData .TopicProduceData > produceHandlerMap = new HashMap <>();
783831 List <ProduceRequestHandler > liveHandlers = new ArrayList <>(handlers .size ());
784832 handlers .forEach (handler -> {
0 commit comments