Skip to content

Commit f05974b

Browse files
TaoYang526slfan1989
authored andcommitted
YARN-11785. Race condition in QueueMetrics due to non-thread-safe HashMap causes MetricsException. (#7459) Contributed by Tao Yang.
* YARN-11785. Race condition in QueueMetrics due to non-thread-safe HashMap causes MetricsException. Signed-off-by: Shilun Fan <[email protected]>
1 parent b3ae9d1 commit f05974b

File tree

2 files changed

+63
-1
lines changed
  • hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src

2 files changed

+63
-1
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.HashMap;
2525
import java.util.Map;
2626
import java.util.Set;
27+
import java.util.concurrent.ConcurrentHashMap;
2728

2829
import org.apache.hadoop.classification.InterfaceAudience;
2930
import org.apache.hadoop.classification.InterfaceAudience.Private;
@@ -253,7 +254,7 @@ public synchronized static void clearQueueMetrics() {
253254
* Simple metrics cache to help prevent re-registrations.
254255
*/
255256
private static final Map<String, QueueMetrics> QUEUE_METRICS =
256-
new HashMap<String, QueueMetrics>();
257+
new ConcurrentHashMap<>();
257258

258259
/**
259260
* Returns the metrics cache to help prevent re-registrations.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java

+61
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
import org.junit.Before;
3838
import org.junit.Test;
3939

40+
import java.util.concurrent.CountDownLatch;
41+
import java.util.concurrent.atomic.AtomicInteger;
42+
4043
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
4144
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
4245
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_COMPLETED;
@@ -61,6 +64,7 @@
6164
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceMetricsChecker.ResourceMetricsKey.RESERVED_CONTAINERS;
6265
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceMetricsChecker.ResourceMetricsKey.RESERVED_MB;
6366
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceMetricsChecker.ResourceMetricsKey.RESERVED_V_CORES;
67+
import static org.junit.Assert.assertEquals;
6468
import static org.junit.Assert.assertNull;
6569
import static org.mockito.Mockito.mock;
6670
import static org.mockito.Mockito.when;
@@ -794,6 +798,63 @@ public void testCollectAllMetrics() {
794798
.checkAgainst(queueSource, true);
795799
}
796800

801+
@Test
802+
public void testQueueMetricsRaceCondition() throws InterruptedException {
803+
final CountDownLatch latch = new CountDownLatch(2);
804+
final int numIterations = 100000;
805+
final AtomicInteger exceptionCount = new AtomicInteger(0);
806+
final AtomicInteger getCount = new AtomicInteger(0);
807+
808+
// init a queue metrics for testing
809+
String queueName = "test";
810+
QueueMetrics metrics =
811+
QueueMetrics.forQueue(ms, queueName, null, false, conf);
812+
QueueMetrics.getQueueMetrics().put(queueName, metrics);
813+
814+
/*
815+
* simulate the concurrent calls for QueueMetrics#getQueueMetrics
816+
*/
817+
// thread A will keep querying the same queue metrics for a specified number of iterations
818+
Thread threadA = new Thread(() -> {
819+
try {
820+
for (int i = 0; i < numIterations; i++) {
821+
QueueMetrics qm = QueueMetrics.getQueueMetrics().get(queueName);
822+
if (qm != null) {
823+
getCount.incrementAndGet();
824+
}
825+
}
826+
} catch (Exception e) {
827+
System.out.println("Exception: " + e.getMessage());
828+
exceptionCount.incrementAndGet();
829+
} finally {
830+
latch.countDown();
831+
}
832+
});
833+
// thread B will keep adding new queue metrics for a specified number of iterations
834+
Thread threadB = new Thread(() -> {
835+
try {
836+
for (int i = 0; i < numIterations; i++) {
837+
QueueMetrics.getQueueMetrics().put("q" + i, metrics);
838+
}
839+
} catch (Exception e) {
840+
exceptionCount.incrementAndGet();
841+
} finally {
842+
latch.countDown();
843+
}
844+
});
845+
846+
// start threads and wait for them to finish
847+
threadA.start();
848+
threadB.start();
849+
latch.await();
850+
851+
// check if all get operations are successful to
852+
// make sure there is no race condition
853+
assertEquals(numIterations, getCount.get());
854+
// check if there is any exception
855+
assertEquals(0, exceptionCount.get());
856+
}
857+
797858
private static void checkAggregatedNodeTypes(MetricsSource source,
798859
long nodeLocal, long rackLocal, long offSwitch) {
799860
MetricsRecordBuilder rb = getMetrics(source);

0 commit comments

Comments
 (0)