Skip to content

Commit 4dd2a57

Browse files
committed
Fix deadlock related to safepoint sync.
`gc_lock` now never does safepoint check. We always explicitly enter safepoint via `ThreadBlockInVM`. The VM companion thread now lets `VM_MMTkSTWOperation` (executed by the VM thread) transition the state to `_threads_resumed` instead of doing it after `VMThread::execute` returns. This works around a problem where `VMThread::execute` continues to block the companion thread when the VM thread starts executing other VM operations. Some minor style changes: We consistently use the constant `Mutex::_no_safepoint_check_flag` instead of `true` when acquiring or waiting for the `Monitor`. Slightly updated comments and logs. Logging statements no longer include thread IDs because thread IDs can be turned on by a command line option.
1 parent b9e3aad commit 4dd2a57

5 files changed

+91
-53
lines changed

openjdk/mmtkHeap.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ MMTkHeap::MMTkHeap(MMTkCollectorPolicy* policy) :
6969
_collector_policy(policy),
7070
_num_root_scan_tasks(0),
7171
_n_workers(0),
72-
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_sometimes)),
72+
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_never)),
7373
_soft_ref_policy()
7474
{
7575
_heap = this;

openjdk/mmtkUpcalls.cpp

+26-19
Original file line numberDiff line numberDiff line change
@@ -74,19 +74,19 @@ static void mmtk_resume_mutators(void *tls) {
7474
#endif
7575

7676
// Note: we don't have to hold gc_lock to increment the counter.
77-
// The increment has to be done before mutators can be resumed
78-
// otherwise, mutators might see a stale value
77+
// The increment has to be done before mutators can be resumed (from `block_for_gc` or yieldpoints).
78+
// Otherwise, mutators might see an outdated start-the-world count.
7979
Atomic::inc(&mmtk_start_the_world_count);
80+
log_debug(gc)("Incremented start_the_world counter to %zu.", Atomic::load(&mmtk_start_the_world_count));
8081

81-
log_debug(gc)("Requesting the VM to resume all mutators...");
82+
log_debug(gc)("Requesting the companion thread to resume all mutators blocking on yieldpoints...");
8283
MMTkHeap::heap()->companion_thread()->request(MMTkVMCompanionThread::_threads_resumed, true);
83-
log_debug(gc)("Mutators resumed. Now notify any mutators waiting for GC to finish...");
8484

85+
log_debug(gc)("Notifying mutators blocking on the start-the-world counter...");
8586
{
86-
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), true);
87+
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);
8788
MMTkHeap::heap()->gc_lock()->notify_all();
8889
}
89-
log_debug(gc)("Mutators notified.");
9090
}
9191

9292
static const int GC_THREAD_KIND_WORKER = 1;
@@ -111,30 +111,37 @@ static void mmtk_spawn_gc_thread(void* tls, int kind, void* ctx) {
111111

112112
static void mmtk_block_for_gc() {
113113
MMTkHeap::heap()->_last_gc_time = os::javaTimeNanos() / NANOSECS_PER_MILLISEC;
114-
log_debug(gc)("Thread (id=%d) will block waiting for GC to finish.", Thread::current()->osthread()->thread_id());
115114

116115
// We must read the counter before entering safepoint.
117-
// This thread has just triggered GC.
118-
// Before this thread enters safe point, the GC cannot start, and therefore cannot finish,
119-
// and cannot increment the counter mmtk_start_the_world_count.
120-
// Otherwise, if we attempt to acquire the gc_lock first, GC may have triggered stop-the-world
121-
// first, and this thread will be blocked for the entire stop-the-world duration before it can
122-
// get the lock. Once that happens, the current thread will read the counter after the GC, and
123-
// wait for the next non-existing GC forever.
116+
// This thread (or another mutator) has just triggered GC.
117+
// The GC cannot start until all mutators enter safepoint.
118+
// If the GC cannot start, it cannot finish, and cannot increment mmtk_start_the_world_count.
119+
// Otherwise, if we enter safepoint before reading mmtk_start_the_world_count,
120+
// we will allow the GC to start before we read the counter,
121+
// and the GC workers may run so fast that they have finished one whole GC and incremented the
122+
// counter before this mutator reads the counter for the first time.
123+
// Once that happens, the current mutator will wait for the next GC forever,
124+
// which may not happen at all before the program exits.
124125
size_t my_count = Atomic::load(&mmtk_start_the_world_count);
125126
size_t next_count = my_count + 1;
126127

128+
log_debug(gc)("Will block until the start_the_world counter reaches %zu.", next_count);
129+
127130
{
128-
// Once this thread acquires the lock, the VM will consider this thread to be "in safe point".
129-
MutexLocker locker(MMTkHeap::heap()->gc_lock());
131+
// Enter safepoint.
132+
JavaThread* thread = JavaThread::current();
133+
ThreadBlockInVM tbivm(thread);
134+
135+
// No safepoint check. We are already in safepoint.
136+
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);
130137

131138
while (Atomic::load(&mmtk_start_the_world_count) < next_count) {
132139
// wait() may wake up spuriously, but the authoritative condition for unblocking is
133140
// mmtk_start_the_world_count being incremented.
134-
MMTkHeap::heap()->gc_lock()->wait();
141+
MMTkHeap::heap()->gc_lock()->wait(Mutex::_no_safepoint_check_flag);
135142
}
136143
}
137-
log_debug(gc)("Thread (id=%d) resumed after GC finished.", Thread::current()->osthread()->thread_id());
144+
log_debug(gc)("Resumed after GC finished.");
138145
}
139146

140147
static void mmtk_out_of_memory(void* tls, MMTkAllocationError err_kind) {
@@ -167,7 +174,7 @@ static bool mmtk_is_mutator(void* tls) {
167174

168175
static void mmtk_get_mutators(MutatorClosure closure) {
169176
JavaThread *thr;
170-
for (JavaThreadIteratorWithHandle jtiwh; thr = jtiwh.next();) {
177+
for (JavaThreadIteratorWithHandle jtiwh; (thr = jtiwh.next());) {
171178
closure.invoke(&thr->third_party_heap_mutator);
172179
}
173180
}

openjdk/mmtkVMCompanionThread.cpp

+62-31
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ void MMTkVMCompanionThread::run() {
5353
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
5454
assert(_reached_state == _threads_resumed, "Threads should be running at this moment.");
5555
while (_desired_state != _threads_suspended) {
56-
_lock->wait(true);
56+
_lock->wait(Mutex::_no_safepoint_check_flag);
5757
}
5858
assert(_reached_state == _threads_resumed, "Threads should still be running at this moment.");
5959
}
@@ -63,25 +63,9 @@ void MMTkVMCompanionThread::run() {
6363
VM_MMTkSTWOperation op(this);
6464
// VMThread::execute() is blocking. The companion thread will be blocked
6565
// here waiting for the VM thread to execute op, and the VM thread will
66-
// be blocked in reach_suspended_and_wait_for_resume() until a GC thread
66+
// be blocked in do_mmtk_stw_operation() until a GC thread
6767
// calls request(_threads_resumed).
6868
VMThread::execute(&op);
69-
70-
// Tell the waiter thread that the world has resumed.
71-
log_trace(gc)("MMTkVMCompanionThread: Notifying threads resumption...");
72-
{
73-
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
74-
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
75-
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
76-
_reached_state = _threads_resumed;
77-
_lock->notify_all();
78-
}
79-
{
80-
MutexLocker x(Heap_lock);
81-
if (Universe::has_reference_pending_list()) {
82-
Heap_lock->notify_all();
83-
}
84-
}
8569
}
8670
}
8771

@@ -107,7 +91,7 @@ void MMTkVMCompanionThread::request(stw_state desired_state, bool wait_until_rea
10791

10892
if (wait_until_reached) {
10993
while (_reached_state != desired_state) {
110-
_lock->wait(true);
94+
_lock->wait(Mutex::_no_safepoint_check_flag);
11195
}
11296
}
11397
}
@@ -123,23 +107,70 @@ void MMTkVMCompanionThread::wait_for_reached(stw_state desired_state) {
123107
assert(_desired_state == desired_state, "State %d not requested.", desired_state);
124108

125109
while (_reached_state != desired_state) {
126-
_lock->wait(true);
110+
_lock->wait(Mutex::_no_safepoint_check_flag);
127111
}
128112
}
129113

130-
// Called by the VM thread to indicate that all Java threads have stopped.
131-
// This method will block until the GC requests start-the-world.
132-
void MMTkVMCompanionThread::reach_suspended_and_wait_for_resume() {
133-
assert(Thread::current()->is_VM_thread(), "reach_suspended_and_wait_for_resume can only be executed by the VM thread");
114+
// Called by the VM thread in `VM_MMTkSTWOperation`.
115+
// This method notify that all Java threads have yielded, and will block the VM thread (thereby
116+
// blocking Java threads) until the GC requests start-the-world.
117+
void MMTkVMCompanionThread::do_mmtk_stw_operation() {
118+
assert(Thread::current()->is_VM_thread(), "do_mmtk_stw_operation can only be executed by the VM thread");
134119

135-
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
120+
{
121+
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
136122

137-
// Tell the waiter thread that the world has stopped.
138-
_reached_state = _threads_suspended;
139-
_lock->notify_all();
123+
// Tell the waiter thread that Java threads have stopped at yieldpoints.
124+
_reached_state = _threads_suspended;
125+
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_suspended state. Notifying...");
126+
_lock->notify_all();
127+
128+
// Wait until resume-the-world is requested
129+
while (_desired_state != _threads_resumed) {
130+
_lock->wait(Mutex::_no_safepoint_check_flag);
131+
}
140132

141-
// Wait until resume-the-world is requested
142-
while (_desired_state != _threads_resumed) {
143-
_lock->wait(true);
133+
// Tell the waiter thread that Java threads will eventually resume from yieldpoints. This
134+
// function will return, and, as soon as the VM thread stops executing safepoint VM operations,
135+
// Java threads will resume from yieldpoints.
136+
//
137+
// Note: We have to notify *now* instead of after `VMThread::execute()`. For reasons unknown
138+
// (likely a bug in OpenJDK 11), the VMThread fails to notify the companion thread after
139+
// evaluating `VM_MMTkSTWOperation`, and continues to execute other VM operations (such as
140+
// `RevokeBias`). This leaves the companion thread blocking on `VMThread::execute()` until the
141+
// VM thread finishes executing the next batch of queued VM operations. If we notify after
142+
// `VMThread::execute` in `run()`, it will cause a deadlock like the following:
143+
//
144+
// - The companion thread is blocked at `VMThread::execute()`, waiting for the next batch of VM
145+
// operations to finish.
146+
// - The VM thread is blocked in `SafepointSynchronize::begin()`, waiting for all mutators to
147+
// reach safepoints.
148+
// - One mutator is allocating too fast and triggers a GC, which requires the `WorkerMonitor`
149+
// lock in mmtk-core.
150+
// - A GC worker is still executing `mmtk_resume_mutator`, holding the `WorkerMutator` (as the
151+
// last parked GC worker). It is asking the companion thread to resume mutators, and is still
152+
// waiting for the companion thread to reach the `_thread_resumed` state. As we see before,
153+
// the companion thread is waiting, too.
154+
//
155+
// By notifying now, we let the companion thread stop waiting, and therefore allowing the last
156+
// parked GC worker to finish `resume_mutators`, breaking the deadlock. When the next GC
157+
// starts, the GC worker running `mmtk_stop_all_mutators` will need to wait a little longer (as
158+
// it always should) until the VM thread finishes executing other VM operations and the
159+
// companion thread is ready to respond to another request from GC workers.
160+
//
161+
// Also note that OpenJDK 17 changed the way the VM thread executes VM operations. The same
162+
// problem may not manifest in OpenJDK 17 or 21.
163+
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
164+
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
165+
_reached_state = _threads_resumed;
166+
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_resumed state. Notifying...");
167+
_lock->notify_all();
168+
}
169+
170+
{
171+
MutexLockerEx x(Heap_lock, Mutex::_no_safepoint_check_flag);
172+
if (Universe::has_reference_pending_list()) {
173+
Heap_lock->notify_all();
174+
}
144175
}
145176
}

openjdk/mmtkVMCompanionThread.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class MMTkVMCompanionThread: public NamedThread {
6363
void wait_for_reached(stw_state reached_state);
6464

6565
// Interface for the VM_MMTkSTWOperation
66-
void reach_suspended_and_wait_for_resume();
66+
void do_mmtk_stw_operation();
6767
};
6868

6969
#endif // MMTK_OPENJDK_MMTK_VM_COMPANION_THREAD_HPP

openjdk/mmtkVMOperation.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,6 @@ VM_MMTkSTWOperation::VM_MMTkSTWOperation(MMTkVMCompanionThread *companion_thread
3434

3535
void VM_MMTkSTWOperation::doit() {
3636
log_trace(vmthread)("Entered VM_MMTkSTWOperation::doit().");
37-
_companion_thread->reach_suspended_and_wait_for_resume();
37+
_companion_thread->do_mmtk_stw_operation();
3838
log_trace(vmthread)("Leaving VM_MMTkSTWOperation::doit()");
3939
}

0 commit comments

Comments
 (0)