Skip to content

Commit a673324

Browse files
makes balancing a noop when balancer setup fails. (#5530)
* WIP makes balancing a noop when balancer setup fails. In the case where setting up a balancer fails the manager currently falls back to using the SimpleLoadBalancer. This fallback behavior could be very unhelpful and one problem (bad balancer config) into multiple problems as the simple load balancer may start assigning tablets in a way that is detrimental. The reason this is a draft is to get feedback on the approach. New test are needed, but those will be a lot more work than this simple change. Did not want to write test if we settle on a different approach. Need to test the following for system balancer and per table balancers. * Configuring a balancer class name that does not exists * Configuring a balancer class that is not of the correct type * Configuring a balancer class that throws an exception in it constructor * Configuring a balancer class that throws an exception when init is called * Rename to DoNothingBalancer. Updates log messages Renames the Derelict Balancer to the DoNothing Balancer. Updates the log messages to show that the ignore message is due to a failed state. * fix formatting * added test and fixed bugs * format code * code review updates * code review update --------- Co-authored-by: Daniel Roberts ddanielr <[email protected]>
1 parent 2b5e3eb commit a673324

File tree

5 files changed

+240
-31
lines changed

5 files changed

+240
-31
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.accumulo.core.spi.balancer;
20+
21+
import org.apache.accumulo.core.data.TableId;
22+
import org.slf4j.Logger;
23+
import org.slf4j.LoggerFactory;
24+
25+
/**
26+
* A balancer that will do nothing and warn about doing nothing. This purpose of this balancer is as
27+
* a fallback when attempts to create a balancer fail.
28+
*
29+
* @since 2.1.4
30+
*/
31+
public class DoNothingBalancer implements TabletBalancer {
32+
33+
private static final Logger log = LoggerFactory.getLogger(DoNothingBalancer.class);
34+
35+
private final TableId tableId;
36+
37+
public DoNothingBalancer() {
38+
this.tableId = null;
39+
}
40+
41+
public DoNothingBalancer(TableId tableId) {
42+
this.tableId = tableId;
43+
}
44+
45+
@Override
46+
public void init(BalancerEnvironment balancerEnvironment) {}
47+
48+
@Override
49+
public void getAssignments(AssignmentParameters params) {
50+
if (tableId != null) {
51+
log.warn("Balancer creation failed. Ignoring {} assignment request for tableId {}",
52+
params.unassignedTablets().size(), tableId);
53+
} else {
54+
log.warn("Balancer creation failed. Ignoring {} assignment request ",
55+
params.unassignedTablets().size());
56+
}
57+
}
58+
59+
@Override
60+
public long balance(BalanceParameters params) {
61+
if (tableId != null) {
62+
log.warn("Balancer creation failed. Ignoring request to balance tablets for tableId:{}",
63+
tableId);
64+
} else {
65+
log.warn("Balancer creation failed. Ignoring request to balance tablets");
66+
}
67+
return 30_000;
68+
}
69+
}

core/src/main/java/org/apache/accumulo/core/spi/balancer/TableLoadBalancer.java

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ protected String getLoadBalancerClassNameForTable(TableId table) {
6666
return null;
6767
}
6868

69+
private TabletBalancer constructAndInitializeBalancer(String clazzName, TableId tableId) {
70+
try {
71+
var balancer = constructNewBalancerForTable(clazzName, tableId);
72+
balancer.init(environment);
73+
return balancer;
74+
} catch (Exception e) {
75+
log.warn("Failed to load table balancer class {} for table {}", clazzName, tableId, e);
76+
return null;
77+
}
78+
}
79+
6980
protected TabletBalancer getBalancerForTable(TableId tableId) {
7081
TabletBalancer balancer = perTableBalancers.get(tableId);
7182

@@ -74,35 +85,16 @@ protected TabletBalancer getBalancerForTable(TableId tableId) {
7485
if (clazzName == null) {
7586
clazzName = SimpleLoadBalancer.class.getName();
7687
}
77-
if (balancer != null) {
78-
if (!clazzName.equals(balancer.getClass().getName())) {
79-
// the balancer class for this table does not match the class specified in the configuration
80-
try {
81-
balancer = constructNewBalancerForTable(clazzName, tableId);
82-
perTableBalancers.put(tableId, balancer);
83-
balancer.init(environment);
84-
85-
log.info("Loaded new class {} for table {}", clazzName, tableId);
86-
} catch (Exception e) {
87-
log.warn("Failed to load table balancer class {} for table {}", clazzName, tableId, e);
88-
}
89-
}
90-
}
91-
if (balancer == null) {
92-
try {
93-
balancer = constructNewBalancerForTable(clazzName, tableId);
94-
log.info("Loaded class {} for table {}", clazzName, tableId);
95-
} catch (Exception e) {
96-
log.warn("Failed to load table balancer class {} for table {}", clazzName, tableId, e);
97-
}
9888

89+
if (balancer == null || !clazzName.equals(balancer.getClass().getName())) {
90+
balancer = constructAndInitializeBalancer(clazzName, tableId);
9991
if (balancer == null) {
100-
log.info("Creating balancer {} limited to balancing table {}",
101-
SimpleLoadBalancer.class.getName(), tableId);
102-
balancer = new SimpleLoadBalancer(tableId);
92+
balancer = constructAndInitializeBalancer(DoNothingBalancer.class.getName(), tableId);
93+
log.error("Fell back to balancer {} for table {}", DoNothingBalancer.class.getName(),
94+
tableId);
10395
}
96+
log.info("Loaded class {} for table {}", balancer.getClass().getName(), tableId);
10497
perTableBalancers.put(tableId, balancer);
105-
balancer.init(environment);
10698
}
10799
return balancer;
108100
}

server/manager/src/main/java/org/apache/accumulo/manager/Manager.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
import org.apache.accumulo.core.process.thrift.ServerProcessService;
9999
import org.apache.accumulo.core.replication.thrift.ReplicationCoordinator;
100100
import org.apache.accumulo.core.spi.balancer.BalancerEnvironment;
101-
import org.apache.accumulo.core.spi.balancer.TableLoadBalancer;
101+
import org.apache.accumulo.core.spi.balancer.DoNothingBalancer;
102102
import org.apache.accumulo.core.spi.balancer.TabletBalancer;
103103
import org.apache.accumulo.core.spi.balancer.data.TServerStatus;
104104
import org.apache.accumulo.core.spi.balancer.data.TabletMigration;
@@ -1918,10 +1918,22 @@ public boolean isUpgrading() {
19181918
}
19191919

19201920
void initializeBalancer() {
1921-
var localTabletBalancer = Property.createInstanceFromPropertyName(getConfiguration(),
1922-
Property.MANAGER_TABLET_BALANCER, TabletBalancer.class, new TableLoadBalancer());
1923-
localTabletBalancer.init(balancerEnvironment);
1924-
tabletBalancer = localTabletBalancer;
1921+
try {
1922+
getContext().getPropStore().getCache().removeAll();
1923+
getConfiguration().invalidateCache();
1924+
log.debug("Attempting to reinitialize balancer using class {}",
1925+
getConfiguration().get(Property.MANAGER_TABLET_BALANCER));
1926+
var localTabletBalancer = Property.createInstanceFromPropertyName(getConfiguration(),
1927+
Property.MANAGER_TABLET_BALANCER, TabletBalancer.class, new DoNothingBalancer());
1928+
localTabletBalancer.init(balancerEnvironment);
1929+
tabletBalancer = localTabletBalancer;
1930+
} catch (Exception e) {
1931+
log.warn("Failed to create balancer {} using {} instead",
1932+
getConfiguration().get(Property.MANAGER_TABLET_BALANCER), DoNothingBalancer.class, e);
1933+
var localTabletBalancer = new DoNothingBalancer();
1934+
localTabletBalancer.init(balancerEnvironment);
1935+
tabletBalancer = localTabletBalancer;
1936+
}
19251937
}
19261938

19271939
Class<?> getBalancerClass() {

test/src/main/java/org/apache/accumulo/test/BalanceIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ public void testBalanceMetadata() throws Exception {
144144
}
145145
}
146146

147-
private Map<String,Integer> countLocations(AccumuloClient client, String tableName)
147+
static Map<String,Integer> countLocations(AccumuloClient client, String tableName)
148148
throws Exception {
149149
var ctx = ((ClientContext) client);
150150
var ample = ctx.getAmple();
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* https://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.accumulo.test;
20+
21+
import static org.junit.jupiter.api.Assertions.assertEquals;
22+
23+
import java.util.Map;
24+
import java.util.SortedSet;
25+
import java.util.TreeSet;
26+
27+
import org.apache.accumulo.core.client.Accumulo;
28+
import org.apache.accumulo.core.client.AccumuloClient;
29+
import org.apache.accumulo.core.client.admin.NewTableConfiguration;
30+
import org.apache.accumulo.core.conf.Property;
31+
import org.apache.accumulo.core.data.TableId;
32+
import org.apache.accumulo.core.spi.balancer.BalancerEnvironment;
33+
import org.apache.accumulo.core.spi.balancer.SimpleLoadBalancer;
34+
import org.apache.accumulo.core.spi.balancer.TableLoadBalancer;
35+
import org.apache.accumulo.core.util.UtilWaitThread;
36+
import org.apache.accumulo.minicluster.ServerType;
37+
import org.apache.accumulo.miniclusterImpl.MiniAccumuloConfigImpl;
38+
import org.apache.accumulo.test.functional.ConfigurableMacBase;
39+
import org.apache.accumulo.test.util.Wait;
40+
import org.apache.hadoop.conf.Configuration;
41+
import org.apache.hadoop.io.Text;
42+
import org.junit.jupiter.api.Test;
43+
import org.slf4j.Logger;
44+
import org.slf4j.LoggerFactory;
45+
46+
public class BrokenBalancerIT extends ConfigurableMacBase {
47+
48+
private static final Logger log = LoggerFactory.getLogger(BrokenBalancerIT.class);
49+
50+
public static class BrokenBalancer extends SimpleLoadBalancer {
51+
public BrokenBalancer() {
52+
super();
53+
}
54+
55+
public BrokenBalancer(TableId tableId) {
56+
super(tableId);
57+
}
58+
59+
@Override
60+
public void init(BalancerEnvironment balancerEnvironment) {
61+
throw new IllegalStateException();
62+
}
63+
}
64+
65+
@Override
66+
public void configure(MiniAccumuloConfigImpl cfg, Configuration hadoopCoreSite) {
67+
Map<String,String> siteConfig = cfg.getSiteConfig();
68+
siteConfig.put(Property.TSERV_MAXMEM.getKey(), "10K");
69+
siteConfig.put(Property.TSERV_MAJC_DELAY.getKey(), "50ms");
70+
siteConfig.put(Property.MANAGER_TABLET_GROUP_WATCHER_INTERVAL.getKey(), "3s");
71+
cfg.setSiteConfig(siteConfig);
72+
// ensure we have two tservers
73+
if (cfg.getNumTservers() != 2) {
74+
cfg.setNumTservers(2);
75+
}
76+
}
77+
78+
@Test
79+
public void testBalancerException() throws Exception {
80+
String tableName = getUniqueNames(1)[0];
81+
testBadBalancer(BrokenBalancer.class.getName(), tableName);
82+
}
83+
84+
@Test
85+
public void testBalancerNotFound() throws Exception {
86+
String tableName = getUniqueNames(1)[0];
87+
testBadBalancer("org.apache.accumulo.abc.NonExistentBalancer", tableName);
88+
}
89+
90+
private void testBadBalancer(String balancerClass, String tableName) throws Exception {
91+
try (AccumuloClient c = Accumulo.newClient().from(getClientProperties()).build()) {
92+
SortedSet<Text> splits = new TreeSet<>();
93+
for (int i = 0; i < 10; i++) {
94+
splits.add(new Text("" + i));
95+
}
96+
var props = Map.of(Property.TABLE_LOAD_BALANCER.getKey(), balancerClass);
97+
NewTableConfiguration ntc =
98+
new NewTableConfiguration().withSplits(splits).setProperties(props);
99+
c.tableOperations().create(tableName, ntc);
100+
101+
assertEquals(Map.of(" none", 11), BalanceIT.countLocations(c, tableName));
102+
UtilWaitThread.sleep(5000);
103+
// scan should not be able to complete because the tablet should not be assigned
104+
assertEquals(Map.of(" none", 11), BalanceIT.countLocations(c, tableName));
105+
106+
// fix the balancer config
107+
log.info("fixing per tablet balancer");
108+
c.tableOperations().setProperty(tableName, Property.TABLE_LOAD_BALANCER.getKey(),
109+
SimpleLoadBalancer.class.getName());
110+
111+
Wait.waitFor(() -> 2 == BalanceIT.countLocations(c, tableName).size());
112+
113+
// break the balancer at the system level
114+
log.info("breaking manager balancer");
115+
c.instanceOperations().setProperty(Property.MANAGER_TABLET_BALANCER.getKey(), balancerClass);
116+
117+
// add some tablet servers
118+
assertEquals(2, getCluster().getConfig().getNumTservers());
119+
getCluster().getConfig().setNumTservers(5);
120+
getCluster().getClusterControl().start(ServerType.TABLET_SERVER);
121+
122+
UtilWaitThread.sleep(5000);
123+
124+
// should not have balanced across the two new tservers
125+
assertEquals(2, BalanceIT.countLocations(c, tableName).size());
126+
127+
// fix the system level balancer
128+
log.info("fixing manager balancer");
129+
c.instanceOperations().setProperty(Property.MANAGER_TABLET_BALANCER.getKey(),
130+
TableLoadBalancer.class.getName());
131+
132+
// should eventually balance across all 5 tabletsevers
133+
Wait.waitFor(() -> 5 == BalanceIT.countLocations(c, tableName).size());
134+
}
135+
}
136+
}

0 commit comments

Comments
 (0)