Skip to content

Commit d80db4e

Browse files
hsavaserehemanthsavasere
authored and committed
feat(tablet-server): make controlled shutdown parameters configurable
Add configuration options for tablet server controlled shutdown retry behavior:
- tablet-server.controlled-shutdown.max-retries
- tablet-server.controlled-shutdown.retry-interval

Update documentation and add tests to verify the new configuration options work as expected.
1 parent 025ffd1 commit d80db4e

File tree

5 files changed

+76
-10
lines changed

5 files changed

+76
-10
lines changed

fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,26 @@ public class ConfigOptions {
418418
+ WRITER_ID_EXPIRATION_TIME.key()
419419
+ " passing. The default value is 10 minutes.");
420420

421+
public static final ConfigOption<Integer> TABLET_SERVER_CONTROLLED_SHUTDOWN_MAX_RETRIES =
422+
key("tablet-server.controlled-shutdown.max-retries")
423+
.intType()
424+
.defaultValue(3)
425+
.withDescription(
426+
"The maximum number of retries for controlled shutdown of the tablet server. "
427+
+ "During controlled shutdown, the tablet server attempts to transfer leadership "
428+
+ "of its buckets to other servers. If the transfer fails, it will retry up to "
429+
+ "this number of times before proceeding with shutdown. The default value is 3.");
430+
431+
public static final ConfigOption<Duration> TABLET_SERVER_CONTROLLED_SHUTDOWN_RETRY_INTERVAL =
432+
key("tablet-server.controlled-shutdown.retry-interval")
433+
.durationType()
434+
.defaultValue(Duration.ofMillis(1000))
435+
.withDescription(
436+
"The interval between retries during controlled shutdown of the tablet server. "
437+
+ "When controlled shutdown fails to transfer bucket leadership, the tablet server "
438+
+ "will wait for this duration before attempting the next retry. "
439+
+ "The default value is 1000 milliseconds (1 second).");
440+
421441
public static final ConfigOption<Integer> BACKGROUND_THREADS =
422442
key("server.background.threads")
423443
.intType()

fluss-dist/src/main/resources/server.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ bind.listeners: FLUSS://localhost:9123
6565
# when running multiple tablet servers.
6666
tablet-server.id: 0
6767

68+
# Controlled shutdown configuration for tablet servers
69+
# tablet-server.controlled-shutdown.max-retries: 3
70+
# tablet-server.controlled-shutdown.retry-interval: 1000ms
71+
6872
#==============================================================================
6973
# OSS FileSystem
7074
#==============================================================================

fluss-server/src/main/java/org/apache/fluss/server/tablet/TabletServer.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ public class TabletServer extends ServerBase {
8585

8686
private static final Logger LOG = LoggerFactory.getLogger(TabletServer.class);
8787

88-
// TODO, maybe need to make it configurable
89-
private static final int CONTROLLED_SHUTDOWN_MAX_RETRIES = 3;
90-
private static final long CONTROLLED_SHUTDOWN_RETRY_INTERVAL_MS = 1000L;
91-
9288
private final int serverId;
9389

9490
/**
@@ -452,7 +448,11 @@ private void controlledShutDown() {
452448
// a period of time and try again for a number of retries. If all the attempt fails, we
453449
// simply force the shutdown.
454450
boolean shutdownSucceeded = false;
455-
int remainingRetries = CONTROLLED_SHUTDOWN_MAX_RETRIES;
451+
int remainingRetries =
452+
conf.getInt(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_MAX_RETRIES);
453+
long retryIntervalMs =
454+
conf.get(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_RETRY_INTERVAL).toMillis();
455+
456456
while (!shutdownSucceeded && remainingRetries > 0) {
457457
remainingRetries--;
458458

@@ -484,7 +484,7 @@ private void controlledShutDown() {
484484

485485
if (!shutdownSucceeded && remainingRetries > 0) {
486486
try {
487-
Thread.sleep(CONTROLLED_SHUTDOWN_RETRY_INTERVAL_MS);
487+
Thread.sleep(retryIntervalMs);
488488
} catch (InterruptedException e) {
489489
Thread.currentThread().interrupt();
490490
break;

fluss-server/src/test/java/org/apache/fluss/server/tablet/TabletServerShutdownITCase.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.fluss.server.tablet;
1919

2020
import org.apache.fluss.config.ConfigOptions;
21+
import org.apache.fluss.config.Configuration;
2122
import org.apache.fluss.exception.RetriableException;
2223
import org.apache.fluss.metadata.Schema;
2324
import org.apache.fluss.metadata.TableBucket;
@@ -112,6 +113,35 @@ void testIOExceptionShouldStopTabletServer(boolean isLogTable) throws Exception
112113
FLUSS_CLUSTER_EXTENSION.startTabletServer(leader, true);
113114
}
114115

116+
@Test
117+
void testControlledShutdownConfiguration() throws Exception {
118+
// Test that the controlled shutdown configuration options are properly loaded
119+
Configuration conf = new Configuration();
120+
121+
// Verify default values are loaded correctly
122+
assertThat(conf.getInt(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_MAX_RETRIES))
123+
.isEqualTo(3);
124+
assertThat(
125+
conf.get(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_RETRY_INTERVAL)
126+
.toMillis())
127+
.isEqualTo(1000L);
128+
129+
// Test custom configuration values
130+
Configuration customConf = new Configuration();
131+
customConf.set(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_MAX_RETRIES, 5);
132+
customConf.set(
133+
ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_RETRY_INTERVAL,
134+
Duration.ofMillis(2000));
135+
136+
assertThat(customConf.getInt(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_MAX_RETRIES))
137+
.isEqualTo(5);
138+
assertThat(
139+
customConf
140+
.get(ConfigOptions.TABLET_SERVER_CONTROLLED_SHUTDOWN_RETRY_INTERVAL)
141+
.toMillis())
142+
.isEqualTo(2000L);
143+
}
144+
115145
@Test
116146
void testControlledShutdown() throws Exception {
117147
FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);

website/docs/maintenance/operations/graceful-shutdown.md

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,20 @@ kill -TERM <tablet-server-pid>
6969

7070
#### Configuration Options
7171

72-
- **Controlled Shutdown Retries**: Number of attempts to transfer leadership (`default:` 3 retries)
73-
- **Retry Interval**: Time between retry attempts (`default`: 1000L)
72+
The controlled shutdown process can be configured using the following options:
73+
74+
- **`tablet-server.controlled-shutdown.max-retries`**: Maximum number of attempts to transfer leadership before proceeding with unclean shutdown (default: 3)
75+
- **`tablet-server.controlled-shutdown.retry-interval`**: Time interval between retry attempts (default: 1000ms)
76+
77+
**Example Configuration:**
78+
79+
```yaml
80+
# server.yaml
81+
tablet-server.controlled-shutdown.max-retries: 5
82+
tablet-server.controlled-shutdown.retry-interval: 2000ms
85+
```
7486
7587
## Monitoring Shutdown
7688
@@ -110,8 +122,8 @@ Monitor shutdown-related metrics:
110122

111123
| Configuration | Description | Default |
112124
|---------------|-------------|---------|
113-
| `controlled.shutdown.max.retries` | Maximum retries for controlled shutdown | 3 |
114-
| `controlled.shutdown.retry.interval.ms` | Interval between retry attempts | 5000 |
125+
| `tablet-server.controlled-shutdown.max-retries` | Maximum retries for controlled shutdown | 3 |
126+
| `tablet-server.controlled-shutdown.retry-interval` | Interval between retry attempts | 1000ms |
115127
| `shutdown.timeout.ms` | General shutdown timeout | 30000 |
116128

117129
## See Also

0 commit comments

Comments
 (0)