Fix: SendAsync callback was not invoked when the producer is reconnecting #1333

Draft: wants to merge 5 commits into base: master
7 changes: 6 additions & 1 deletion pulsar/consumer_multitopic_test.go
@@ -24,12 +24,13 @@ import (
	"testing"
	"time"

+	"github.com/stretchr/testify/assert"
+
	"github.com/apache/pulsar-client-go/pulsar/internal"
	pb "github.com/apache/pulsar-client-go/pulsar/internal/pulsar_proto"
	"github.com/apache/pulsar-client-go/pulsaradmin"
	"github.com/apache/pulsar-client-go/pulsaradmin/pkg/admin/config"
	"github.com/apache/pulsar-client-go/pulsaradmin/pkg/utils"
-	"github.com/stretchr/testify/assert"
)

func TestMultiTopicConsumerReceive(t *testing.T) {
@@ -366,6 +367,10 @@ func (dummyConnection) IsProxied() bool {
	return false
}

+func (dummyConnection) Closed() bool {
+	return false
+}
+
func TestMultiTopicAckIDListTimeout(t *testing.T) {
	topic := fmt.Sprintf("multiTopicAckIDListTimeout%v", time.Now().UnixNano())
	assert.NoError(t, createPartitionedTopic(topic, 5))
5 changes: 3 additions & 2 deletions pulsar/internal/connection.go
@@ -88,6 +88,7 @@ type Connection interface {
	Close()
	WaitForClose() <-chan struct{}
	IsProxied() bool
+	Closed() bool
}

type ConsumerHandler interface {
@@ -658,7 +659,7 @@ func (c *connection) SendRequestNoWait(req *pb.BaseCommand) error {
}

func (c *connection) internalSendRequest(req *request) {
-	if c.closed() {
+	if c.Closed() {
		c.log.Warnf("internalSendRequest failed for connectionClosed")
		if req.callback != nil {
			req.callback(req.cmd, ErrConnectionClosed)
@@ -1064,7 +1065,7 @@ func (c *connection) setStateClosed() {
	c.state.Store(int32(connectionClosed))
}

-func (c *connection) closed() bool {
+func (c *connection) Closed() bool {
	return connectionClosed == c.getState()
}

2 changes: 1 addition & 1 deletion pulsar/internal/connection_pool.go
@@ -103,7 +103,7 @@ func (p *connectionPool) GetConnection(logicalAddr *url.URL, physicalAddr *url.U
	// When the current connection is in a closed state or the broker actively notifies that the
	// current connection is closed, we need to remove the connection object from the current
	// connection pool and create a new connection.
-	if conn.closed() {
+	if conn.Closed() {
		p.log.Debugf("Removed connection from pool key=%s logical_addr=%+v physical_addr=%+v",
			key, conn.logicalAddr, conn.physicalAddr)
		delete(p.connections, key)
7 changes: 4 additions & 3 deletions pulsar/internal/connection_reader.go
@@ -21,8 +21,9 @@ import (
	"fmt"
	"io"

-	pb "github.com/apache/pulsar-client-go/pulsar/internal/pulsar_proto"
	"google.golang.org/protobuf/proto"
+
+	pb "github.com/apache/pulsar-client-go/pulsar/internal/pulsar_proto"
)

type connectionReader struct {
@@ -41,7 +42,7 @@ func (r *connectionReader) readFromConnection() {
	for {
		cmd, headersAndPayload, err := r.readSingleCommand()
		if err != nil {
-			if !r.cnx.closed() {
+			if !r.cnx.Closed() {
				r.cnx.log.WithError(err).Infof("Error reading from connection")
				r.cnx.Close()
			}
@@ -122,7 +123,7 @@ func (r *connectionReader) readAtLeast(size uint32) error {
	n, err := io.ReadAtLeast(r.cnx.cnx, r.buffer.WritableSlice(), int(size))
	if err != nil {
		// has the connection been closed?
-		if r.cnx.closed() {
+		if r.cnx.Closed() {
			return errConnectionClosed
		}
		r.cnx.Close()
15 changes: 12 additions & 3 deletions pulsar/producer_partition.go
@@ -581,7 +581,8 @@ func (p *partitionProducer) runEventsLoop() {
			}
		case connectionClosed := <-p.connectClosedCh:
			p.log.Info("runEventsLoop will reconnect in producer")
-			p.reconnectToBroker(connectionClosed)
+			// reconnect to broker in a new goroutine so that it won't block the event loop, see issue #1332
+			go p.reconnectToBroker(connectionClosed)
Member:

I'm not sure if my concern is correct.

If we use a new goroutine here, we need to carefully handle the following cases:

  1. When a connection-closed event comes in during reconnection, we may need to add a reconnecting status to prevent concurrency issues (see the sketch after this list).
  2. During reconnection, if a message is sent over the old (possibly already closed) connection, I'm not sure about the behavior here. Could it get stuck or return an error?
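
To make case 1 concrete, here is a minimal sketch of such a reconnecting status using an atomic flag; the reconnectGuard type and its methods are hypothetical, not part of the client:

package main

import "sync/atomic"

// reconnectGuard is a hypothetical guard a partitionProducer could embed
// so that overlapping connection-closed events trigger only one reconnect.
type reconnectGuard struct {
	reconnecting atomic.Bool
}

// tryStartReconnect reports whether the caller won the right to reconnect;
// a second connection-closed event arriving mid-reconnect becomes a no-op.
func (g *reconnectGuard) tryStartReconnect() bool {
	return g.reconnecting.CompareAndSwap(false, true)
}

// finishReconnect must be called once the reconnect attempt ends, whether
// it succeeded or failed, so future close events can reconnect again.
func (g *reconnectGuard) finishReconnect() {
	g.reconnecting.Store(false)
}

With such a flag, the event loop could spawn the reconnect goroutine only when tryStartReconnect() returns true, so stacked connection-closed events would not spawn stacked reconnect loops.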

Member:

Even with the check below, we can't be 100% sure we won't still use the old connection, because it's inherently concurrent.

conn := p._getConn()
if conn.Closed() {
	return
}
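
To make that window explicit, the same check is annotated below; the annotations are editorial, not part of the diff:

conn := p._getConn()
if conn.Closed() { // check: the connection is still open here...
	return
}
// ...but another goroutine may close conn at this exact point,
conn.WriteData(buffer) // ...so the write can still hit a closed connection.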

Member:

Actually, I'm thinking that before a message enters the pendingQueue, even if the user is using the sendAsync method, we should block the user's call.

This way, on the user side, the timeout would be calculated from when the sendAsync call succeeds; until then, the call blocks at this point.
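
A minimal sketch of that idea, blocking the SendAsync caller until the message is admitted to a bounded pending queue; all names are hypothetical, not the client's actual API:

package main

import "context"

// pendingItem is a stand-in; in the real client this would carry the
// message payload and its callbacks.
type pendingItem struct{ payload []byte }

// boundedQueue blocks producers when full, so an async send call cannot
// return before its message has actually been admitted to the queue.
type boundedQueue struct{ ch chan *pendingItem }

func newBoundedQueue(max int) *boundedQueue {
	return &boundedQueue{ch: make(chan *pendingItem, max)}
}

// enqueue blocks until there is room or ctx is done; a send timeout would
// start counting only after enqueue returns nil.
func (q *boundedQueue) enqueue(ctx context.Context, it *pendingItem) error {
	select {
	case q.ch <- it:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

The trade-off is visible in the sketch: once the queue is full, an asynchronous send degrades into a blocking call, which is exactly the breaking-change concern raised in the reply below.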

Contributor Author (@gunli, Feb 19, 2025):

> Even with the check below, we can't be 100% sure we won't still use the old connection, because it's inherently concurrent.
>
> conn := p._getConn()
> if conn.Closed() {
> 	return
> }

I agree with that. Actually, I think it is quite strange to pass a buffer to a connection's channel when sending a message, as this causes the buffer to be read and written by two different goroutines, leading to a data race. What's worse, if a buffer is sent into a connection's channel and the connection is closed at the same time, the buffer ends up in an uncertain, pending state, and we have to pay extra attention to handling that situation. Currently it is not handled when the connection is closed by the network or by a server notification (pb.BaseCommand_CLOSE_PRODUCER / pb.BaseCommand_CLOSE_CONSUMER); maybe we need a new PR to handle this.

In my opinion, it would be better to keep the message in the pending queue and use timeout events and server ack events to determine whether a message has timed out or succeeded.

Contributor Author (@gunli, Feb 19, 2025):

> Actually, I'm thinking that before a message enters the pendingQueue, even if the user is using the sendAsync method, we should block the user's call.
>
> This way, on the user side, the timeout would be calculated from when the sendAsync call succeeds; until then, the call blocks at this point.

I have thought about that, but that would be a breaking change. If we change it that way, SendAsync would sometimes behave like a synchronous Send, and the MaxPendingMessages config would become meaningless.

Contributor Author:

I didn't say we should introduce new goroutines.

What I mean is that we are already sending data in the goroutine of partitionProducer.internalSend(); we don't have to create a new goroutine again in connection.run().

Contributor Author:

If you look at the Java implementation, you'll find that Java handles many things inside sendAsync, and the user's call is blocked until that logic has run.

We have done the same in the Go client. The problem is that partitionProducer.runEventsLoop() is blocked in the reconnecting case, which prevents the data from entering the pendingQueue, and failTimeoutMessages() can't fail it, because failTimeoutMessages() only fails the messages already in the pendingQueue.
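
A runnable sketch of that failure mode, with all names hypothetical: a single event loop where one blocking case starves every other event, including the sends that would otherwise reach the pending queue:

package main

import (
	"fmt"
	"time"
)

// Sketch of a single event loop like runEventsLoop(): while the reconnect
// case blocks, sendCh is never drained, so new sends cannot reach the
// pending queue and the timeout sweeper has nothing to fail.
func main() {
	sendCh := make(chan string)
	reconnectCh := make(chan struct{}, 1)
	reconnectCh <- struct{}{} // a connection-closed event is already queued

	go func() { sendCh <- "msg" }() // an async send, stuck until the loop drains sendCh

	for {
		select {
		case <-reconnectCh:
			time.Sleep(2 * time.Second) // stands in for a blocking reconnect loop
			fmt.Println("reconnect finished")
		case m := <-sendCh:
			fmt.Println("queued:", m) // only reachable once the reconnect case returns
			return
		}
	}
}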

Contributor Author (@gunli, Feb 19, 2025):

> I hope we can use a single goroutine to handle the reconnect, just like #691, and then add a variable to record the reconnection state.
>
> If this state is reconnecting, we don't send the request.

I think connection.State is sufficient and timely enough to control this, because the reconnection state of a producer relies on connection.State and the notifications from the connection. If the connection is closed, we should stop sending. The problem is that the connection's implementation still needs to be refactored, as I mentioned above: make the sending logic synchronous, and close the connection when receiving BaseCommand_CLOSE_PRODUCER or BaseCommand_CLOSE_CONSUMER.
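
A minimal sketch of what 'make the sending logic synchronous' could mean: all writers and the close path share one mutex, so a send observes the closed flag atomically and two writers can never interleave on the wire. Types are hypothetical:

package main

import (
	"errors"
	"io"
	"sync"
)

// syncConn is a hypothetical synchronous connection: write and close take
// the same lock, so a send can never race with a concurrent close or with
// another writer.
type syncConn struct {
	mu     sync.Mutex
	closed bool
	w      io.Writer
}

func (c *syncConn) write(p []byte) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.closed {
		return errors.New("connection closed")
	}
	_, err := c.w.Write(p)
	return err
}

// close would be invoked on network errors or on BaseCommand_CLOSE_PRODUCER /
// BaseCommand_CLOSE_CONSUMER notifications from the broker.
func (c *syncConn) close() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.closed = true
}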

Member:

> I think connection.State is sufficient and timely enough to control this, because the reconnection state of a producer relies on connection.State and the notifications from the connection. If the connection is closed, we should stop sending.

A connection is shared by multiple producers and consumers. Please consider the case where the connection is active but the producer is inactive; you should not send the message to the broker.

This is the Java implementation: https://github.com/apache/pulsar/blob/v4.0.2/pulsar-client/src/main/java/org/apache/pulsar/client/impl/ProducerImpl.java#L2366-L2382
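
That distinction can be made concrete with a producer-level state gate, loosely modeled on the linked Java code; all names here are hypothetical, not the Go client's actual API:

// producerState is a hypothetical per-producer state, separate from the
// connection's state; a healthy connection can still host an inactive producer.
type producerState int32

const (
	producerReady producerState = iota
	producerConnecting
	producerClosing
	producerClosed
)

// canSendToBroker gates the actual write: only a ready producer on an open
// connection may put bytes on the wire, no matter how healthy the connection is.
func canSendToBroker(s producerState, connClosed bool) bool {
	return s == producerReady && !connClosed
}

Under this model, writeData would consult the producer's own state in addition to conn.Closed().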

Contributor Author:

> I think connection.State is sufficient and timely enough to control this, because the reconnection state of a producer relies on connection.State and the notifications from the connection. If the connection is closed, we should stop sending.
>
> A connection is shared by multiple producers and consumers. Please consider the case where the connection is active but the producer is inactive; you should not send the message to the broker.
>
> This is the Java implementation: https://github.com/apache/pulsar/blob/v4.0.2/pulsar-client/src/main/java/org/apache/pulsar/client/impl/ProducerImpl.java#L2366-L2382

I looked into the Java code, but I still can't fully understand it, as the Java implementation encapsulates too many details and I know little about Java.

You said to consider the case where the connection is active but the producer is inactive. As far as I know, only the connection's close event can move a producer from active to inactive.

Regardless, the current implementation of the connection in pulsar-client-go is problematic: c.internalSendRequest(req) and c.internalWriteData(data)/c.sendPing() can run at the same time, since they live in two goroutines, and both eventually call c.cnx.Write(), which causes conflicting writes on the wire. IMO this can be considered a bug, but we can fix it in another PR.

Also, the connection uses incomingRequestsCh with a capacity of 10 (incomingRequestsCh: make(chan *request, 10)) to receive and hold the user's data, which is effectively another pendingQueue; now we have two pendingQueues, one in the producer and one in the connection. Think about this case: a user's data/buffer is sent into producer.pendingQueue and then into connection.incomingRequestsCh, but c.cnx.Write() has not been called yet. If the timeout config is short, the request is treated as timed out by producer.failTimeoutMessages() and the buffer is put back into the pool, while connection.incomingRequestsCh still keeps a reference to it; if the pool then reallocates the buffer and the connection reaches c.cnx.Write(), there is a data race (see the sketch after the quoted code below). This can be considered another bug too; a new PR is needed again.

go func() {
	// goroutine 1: drains incomingRequestsCh; internalSendRequest()
	// eventually calls c.cnx.Write().
	for {
		select {
		case <-c.closeCh:
			c.failLeftRequestsWhenClose()
			return

		case req := <-c.incomingRequestsCh:
			if req == nil {
				return // TODO: this never gonna be happen
			}
			c.internalSendRequest(req)
		}
	}
}()

// goroutine 2: the run loop itself; internalWriteData() and sendPing()
// also call c.cnx.Write(), concurrently with goroutine 1.
for {
	select {
	case <-c.closeCh:
		return

	case cmd := <-c.incomingCmdCh:
		c.internalReceivedCommand(cmd.cmd, cmd.headersAndPayload)
	case data := <-c.writeRequestsCh:
		if data == nil {
			return
		}
		c.internalWriteData(data)

	case <-pingSendTicker.C:
		c.sendPing()
	}
}
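
The second bug can be reduced to the following shape: a reference parked in a channel outlives the buffer's return to the pool. This is a deliberately simplified, runnable sketch using sync.Pool, not the client's code:

package main

import (
	"fmt"
	"sync"
)

var pool = sync.Pool{New: func() any { return make([]byte, 4) }}

func main() {
	ch := make(chan []byte, 1) // stands in for connection.incomingRequestsCh

	buf := pool.Get().([]byte)
	copy(buf, "msg1")
	ch <- buf     // the connection still holds a reference to buf...
	pool.Put(buf) // ...but the timeout path has already recycled it

	buf2 := pool.Get().([]byte) // the pool may hand back the same backing array
	copy(buf2, "msg2")          // a new message overwrites the old bytes

	fmt.Printf("%s\n", <-ch) // if reuse happened, the stale reference sees "msg2"
}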

		case <-p.batchFlushTicker.C:
			p.internalFlushCurrentBatch()
		}
@@ -902,7 +903,15 @@ func (p *partitionProducer) writeData(buffer internal.Buffer, sequenceID uint64,
			sequenceID:   sequenceID,
			sendRequests: callbacks,
		})
-		p._getConn().WriteData(buffer)
+
+		// If the connection is closed, stop sending data. Continuing to send data
+		// to a closed connection will cause the buffer to be passed to it, which
+		// prevents further processing.
+		conn := p._getConn()
+		if conn.Closed() {
+			return
+		}
+		conn.WriteData(buffer)
	}
}

@@ -1737,10 +1746,10 @@ func (i *pendingItem) done(err error) {
		return
	}
	i.isDone = true
-	buffersPool.Put(i.buffer)
	if i.flushCallback != nil {
		i.flushCallback(err)
	}
+	buffersPool.Put(i.buffer)
}

// _setConn sets the internal connection field of this partition producer atomically.
100 changes: 100 additions & 0 deletions pulsar/producer_test.go
@@ -30,6 +30,8 @@ import (
	"testing"
	"time"

+	"github.com/docker/docker/api/types/container"
+	"github.com/docker/go-connections/nat"
	"github.com/stretchr/testify/require"
	"github.com/testcontainers/testcontainers-go"
	"github.com/testcontainers/testcontainers-go/wait"
@@ -2575,6 +2577,104 @@ func TestProducerKeepReconnectingAndThenCallClose(t *testing.T) {
	}, 30*time.Second, 1*time.Second)
}

func TestProducerKeepReconnectingAndThenCallSendAsync(t *testing.T) {
	req := testcontainers.ContainerRequest{
		Image:        getPulsarTestImage(),
		ExposedPorts: []string{"6650/tcp", "8080/tcp"},
		WaitingFor:   wait.ForExposedPort(),
		Cmd:          []string{"bin/pulsar", "standalone", "-nfw"},
		// use fixed port binding so that it can be reconnected after restart
		HostConfigModifier: func(hostConfig *container.HostConfig) {
			hostConfig.PortBindings = map[nat.Port][]nat.PortBinding{
				"6650/tcp": {{HostIP: "0.0.0.0", HostPort: "6650"}},
				"8080/tcp": {{HostIP: "0.0.0.0", HostPort: "8080"}},
			}
		},
	}
	c, err := testcontainers.GenericContainer(context.Background(), testcontainers.GenericContainerRequest{
		ContainerRequest: req,
		Started:          true,
	})
	require.NoError(t, err, "Failed to start the pulsar container")
	endpoint, err := c.PortEndpoint(context.Background(), "6650", "pulsar")
	require.NoError(t, err, "Failed to get the pulsar endpoint")

	client, err := NewClient(ClientOptions{
		URL:               endpoint,
		ConnectionTimeout: 5 * time.Second,
		OperationTimeout:  5 * time.Second,
	})
	require.NoError(t, err)
	defer client.Close()

	var testProducer Producer
	require.Eventually(t, func() bool {
		testProducer, err = client.CreateProducer(ProducerOptions{
			Topic:       newTopicName(),
			Schema:      NewBytesSchema(nil),
			SendTimeout: 3 * time.Second,
		})
		return err == nil
	}, 30*time.Second, 1*time.Second)

	// send a message
	errChan := make(chan error)
	defer close(errChan)

	testProducer.SendAsync(context.Background(), &ProducerMessage{
		Payload: []byte("test"),
	}, func(_ MessageID, _ *ProducerMessage, err error) {
		errChan <- err
	})
	// should succeed
	err = <-errChan
	require.NoError(t, err)

	// stop pulsar server
	timeout := 10 * time.Second
	_ = c.Stop(context.Background(), &timeout)

	// send again
	testProducer.SendAsync(context.Background(), &ProducerMessage{
		Payload: []byte("test"),
	}, func(_ MessageID, _ *ProducerMessage, err error) {
		errChan <- err
	})
	// should get a timeout error
	err = <-errChan
	require.True(t, errors.Is(err, ErrSendTimeout))

	oldConn := testProducer.(*producer).producers[0].(*partitionProducer)._getConn()
	// restart pulsar server
	err = c.Start(context.Background())
	require.NoError(t, err)
	defer c.Terminate(context.Background())

	// wait for reconnection success
	waitTime := 0
	for {
		newConn := testProducer.(*producer).producers[0].(*partitionProducer)._getConn()
		if oldConn != newConn {
			break
		}
		time.Sleep(5 * time.Second)
		waitTime += 5
		if waitTime > 60 {
			break
		}
	}

	// send again
	testProducer.SendAsync(context.Background(), &ProducerMessage{
		Payload: []byte("test"),
	}, func(_ MessageID, _ *ProducerMessage, err error) {
		errChan <- err
	})
	// should succeed
	err = <-errChan
	require.NoError(t, err)
}

func TestSelectConnectionForSameProducer(t *testing.T) {
	topicName := newTopicName()
