Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit fc5a64f

Browse files
committed Feb 23, 2025
[client] Add retry when get one available tablet serverNode fails (#425)
1 parent fcc15c0 commit fc5a64f

File tree

6 files changed

+45
-16
lines changed

6 files changed

+45
-16
lines changed
 

‎fluss-client/src/main/java/com/alibaba/fluss/client/FlussConnection.java

+1-2
Original file line number | Diff line number | Diff line change
@@ -143,10 +143,9 @@ public RemoteFileDownloader getOrCreateRemoteFileDownloader() {
143143
GatewayClientProxy.createGatewayProxy(
144144
() ->
145145
getOneAvailableTabletServerNode(
146-
metadataUpdater.getCluster()),
146+
metadataUpdater.getCluster(), rpcClient, conf),
147147
rpcClient,
148148
AdminReadOnlyGateway.class);
149-
150149
SecurityTokenProvider securityTokenProvider =
151150
new DefaultSecurityTokenProvider(gateway);
152151
securityTokenManager =

‎fluss-client/src/main/java/com/alibaba/fluss/client/metadata/MetadataUpdater.java

+5-2
Original file line number | Diff line number | Diff line change
@@ -61,15 +61,17 @@ public class MetadataUpdater {
6161

6262
private static final int MAX_RETRY_TIMES = 5;
6363

64+
private final Configuration configuration;
6465
private final RpcClient rpcClient;
6566
protected volatile Cluster cluster;
6667

6768
public MetadataUpdater(Configuration configuration, RpcClient rpcClient) {
68-
this(rpcClient, initializeCluster(configuration, rpcClient));
69+
this(configuration, rpcClient, initializeCluster(configuration, rpcClient));
6970
}
7071

7172
@VisibleForTesting
72-
public MetadataUpdater(RpcClient rpcClient, Cluster cluster) {
73+
public MetadataUpdater(Configuration configuration, RpcClient rpcClient, Cluster cluster) {
74+
this.configuration = configuration;
7375
this.rpcClient = rpcClient;
7476
this.cluster = cluster;
7577
}
@@ -255,6 +257,7 @@ private void updateMetadata(
255257
sendMetadataRequestAndRebuildCluster(
256258
cluster,
257259
rpcClient,
260+
configuration,
258261
tablePaths,
259262
tablePartitionNames,
260263
tablePartitionIds);

‎fluss-client/src/main/java/com/alibaba/fluss/client/utils/MetadataUtils.java

+26-11
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
import com.alibaba.fluss.cluster.Cluster;
2121
import com.alibaba.fluss.cluster.ServerNode;
2222
import com.alibaba.fluss.cluster.ServerType;
23+
import com.alibaba.fluss.config.ConfigOptions;
24+
import com.alibaba.fluss.config.Configuration;
2325
import com.alibaba.fluss.exception.FlussRuntimeException;
2426
import com.alibaba.fluss.metadata.PhysicalTablePath;
2527
import com.alibaba.fluss.metadata.TableBucket;
@@ -29,6 +31,7 @@
2931
import com.alibaba.fluss.rpc.GatewayClientProxy;
3032
import com.alibaba.fluss.rpc.RpcClient;
3133
import com.alibaba.fluss.rpc.gateway.AdminReadOnlyGateway;
34+
import com.alibaba.fluss.rpc.gateway.CoordinatorGateway;
3235
import com.alibaba.fluss.rpc.messages.MetadataRequest;
3336
import com.alibaba.fluss.rpc.messages.MetadataResponse;
3437
import com.alibaba.fluss.rpc.messages.PbBucketMetadata;
@@ -57,8 +60,6 @@
5760
public class MetadataUtils {
5861
private static final Logger LOG = LoggerFactory.getLogger(MetadataUtils.class);
5962

60-
private static final int MAX_RETRY_TIMES = 5;
61-
6263
private static final Random randOffset = new Random();
6364

6465
/**
@@ -81,13 +82,14 @@ public static Cluster sendMetadataRequestAndRebuildCluster(
8182
public static Cluster sendMetadataRequestAndRebuildCluster(
8283
Cluster cluster,
8384
RpcClient client,
85+
Configuration configuration,
8486
@Nullable Set<TablePath> tablePaths,
8587
@Nullable Collection<PhysicalTablePath> tablePartitionNames,
8688
@Nullable Collection<Long> tablePartitionIds)
8789
throws ExecutionException, InterruptedException, TimeoutException {
8890
AdminReadOnlyGateway gateway =
8991
GatewayClientProxy.createGatewayProxy(
90-
() -> getOneAvailableTabletServerNode(cluster),
92+
() -> getOneAvailableTabletServerNode(cluster, client, configuration),
9193
client,
9294
AdminReadOnlyGateway.class);
9395
return sendMetadataRequestAndRebuildCluster(
@@ -260,24 +262,37 @@ public NewTableMetadata(
260262
}
261263
}
262264

263-
public static ServerNode getOneAvailableTabletServerNode(Cluster cluster) {
265+
public static ServerNode getOneAvailableTabletServerNode(
266+
Cluster cluster, RpcClient rpcClient, Configuration configuration) {
264267
List<ServerNode> aliveTabletServers = null;
265-
for (int retryTimes = 0; retryTimes <= MAX_RETRY_TIMES; retryTimes++) {
268+
int maxRetryTimes =
269+
configuration.getInt(ConfigOptions.CLIENT_GET_TABLET_SERVER_NODE_MAX_RETRY_TIMES);
270+
for (int retryTimes = 0; retryTimes <= maxRetryTimes; retryTimes++) {
266271
aliveTabletServers = cluster.getAliveTabletServerList();
267272
if (aliveTabletServers.isEmpty()) {
268-
LOG.error("Fluss create gateway proxy error, retry times = {}.", retryTimes);
269-
if (retryTimes >= MAX_RETRY_TIMES) {
273+
LOG.error(
274+
"Fluss get one available tablet server node failed retry times = {}.",
275+
retryTimes);
276+
if (retryTimes >= maxRetryTimes) {
270277
String exceptionMsg =
271278
String.format(
272-
"Execution of Fluss get one available tablet failed, no alive tablet server in cluster, retry times = %d.",
273-
retryTimes);
279+
"Execution of Fluss get one available tablet server node failed, no alive tablet server in cluster, retry times = %d.",
280+
maxRetryTimes);
274281
throw new FlussRuntimeException(exceptionMsg);
275282
} else {
276283
try {
284+
GatewayClientProxy.createGatewayProxy(
285+
cluster::getCoordinatorServer,
286+
rpcClient,
287+
CoordinatorGateway.class)
288+
.metadata(new MetadataRequest())
289+
.get(1, TimeUnit.MINUTES);
277290
Thread.sleep(1000L * retryTimes);
278-
} catch (InterruptedException interruptedException) {
291+
} catch (ExecutionException | InterruptedException | TimeoutException e) {
279292
Thread.currentThread().interrupt();
280-
throw new RuntimeException(interruptedException);
293+
throw new RuntimeException(
294+
"Execution of Fluss get one available tablet server node failed",
295+
e);
281296
}
282297
}
283298
} else {

‎fluss-client/src/test/java/com/alibaba/fluss/client/metadata/TestingMetadataUpdater.java

+1
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,7 @@ private TestingMetadataUpdater(
6262
List<ServerNode> tabletServers,
6363
Map<TablePath, TableInfo> tableInfos) {
6464
super(
65+
new Configuration(),
6566
RpcClient.create(new Configuration(), TestingClientMetricGroup.newInstance()),
6667
Cluster.empty());
6768
initializeCluster(coordinatorServer, tabletServers, tableInfos);

‎fluss-client/src/test/java/com/alibaba/fluss/client/table/FlussFailServerTableITCase.java

+5-1
Original file line number | Diff line number | Diff line change
@@ -171,17 +171,21 @@ void testRetryGetTabletServerNodes() throws Exception {
171171
FLUSS_CLUSTER_EXTENSION.stopTabletServer(serverNode.id());
172172
}
173173

174+
FLUSS_CLUSTER_EXTENSION.waitUtilAllGatewayHasSameMetadata();
175+
174176
try (Connection connNew = ConnectionFactory.createConnection(clientConf)) {
175177
assertThatThrownBy(() -> connNew.getTable(DATA1_TABLE_PATH))
176178
.cause()
177179
.isInstanceOf(FlussRuntimeException.class)
178180
.hasMessage(
179-
"Execution of Fluss get one available tablet failed, no alive tablet server in cluster, retry times = %d.",
181+
"Execution of Fluss get one available tablet server node failed, no alive tablet server in cluster, retry times = %d.",
180182
5);
181183
} finally {
184+
// start all tablet server
182185
for (ServerNode serverNode : serverNodes) {
183186
FLUSS_CLUSTER_EXTENSION.startTabletServer(serverNode.id());
184187
}
188+
FLUSS_CLUSTER_EXTENSION.waitUtilAllGatewayHasSameMetadata();
185189
}
186190
}
187191
}

‎fluss-common/src/main/java/com/alibaba/fluss/config/ConfigOptions.java

+7
Original file line number | Diff line number | Diff line change
@@ -876,6 +876,13 @@ public class ConfigOptions {
876876
"Enable metrics for client. When metrics is enabled, the client "
877877
+ "will collect metrics and report by the JMX metrics reporter.");
878878

879+
public static final ConfigOption<Integer> CLIENT_GET_TABLET_SERVER_NODE_MAX_RETRY_TIMES =
880+
key("client.get-tablet-server-node.max-retry-times")
881+
.intType()
882+
.defaultValue(5)
883+
.withDescription(
884+
"Max retry times when get tablet server node failed for client.");
885+
879886
// ------------------------------------------------------------------------
880887
// ConfigOptions for Fluss Table
881888
// ------------------------------------------------------------------------

0 commit comments

Comments
 (0)
Please sign in to comment.