Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix deadlock related to safepoint sync. #279

Merged
merged 3 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/scripts/ci-test-only-normal-no-compressed-oops.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ run_all_no_compressed_oop() {
runbms_dacapo2006_with_heap_multiplier antlr $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier fop $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier luindex $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier lusearch $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier pmd $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier hsqldb $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
runbms_dacapo2006_with_heap_multiplier eclipse $heap_multiplier -XX:-UseCompressedOops -XX:-UseCompressedClassPointers
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/ci-test-only-normal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ run_all() {
runbms_dacapo2006_with_heap_multiplier antlr $heap_multiplier
runbms_dacapo2006_with_heap_multiplier fop $heap_multiplier
runbms_dacapo2006_with_heap_multiplier luindex $heap_multiplier
runbms_dacapo2006_with_heap_multiplier lusearch $heap_multiplier
runbms_dacapo2006_with_heap_multiplier pmd $heap_multiplier
runbms_dacapo2006_with_heap_multiplier hsqldb $heap_multiplier
runbms_dacapo2006_with_heap_multiplier eclipse $heap_multiplier
Expand Down
2 changes: 1 addition & 1 deletion openjdk/mmtkHeap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ MMTkHeap::MMTkHeap(MMTkCollectorPolicy* policy) :
_collector_policy(policy),
_num_root_scan_tasks(0),
_n_workers(0),
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_sometimes)),
_gc_lock(new Monitor(Mutex::safepoint, "MMTkHeap::_gc_lock", true, Monitor::_safepoint_check_never)),
_soft_ref_policy()
{
_heap = this;
Expand Down
45 changes: 26 additions & 19 deletions openjdk/mmtkUpcalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,19 @@ static void mmtk_resume_mutators(void *tls) {
#endif

// Note: we don't have to hold gc_lock to increment the counter.
// The increment has to be done before mutators can be resumed
// otherwise, mutators might see a stale value
// The increment has to be done before mutators can be resumed (from `block_for_gc` or yieldpoints).
// Otherwise, mutators might see an outdated start-the-world count.
Atomic::inc(&mmtk_start_the_world_count);
log_debug(gc)("Incremented start_the_world counter to %zu.", Atomic::load(&mmtk_start_the_world_count));

log_debug(gc)("Requesting the VM to resume all mutators...");
log_debug(gc)("Requesting the companion thread to resume all mutators blocking on yieldpoints...");
MMTkHeap::heap()->companion_thread()->request(MMTkVMCompanionThread::_threads_resumed, true);
log_debug(gc)("Mutators resumed. Now notify any mutators waiting for GC to finish...");

log_debug(gc)("Notifying mutators blocking on the start-the-world counter...");
{
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), true);
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);
MMTkHeap::heap()->gc_lock()->notify_all();
}
log_debug(gc)("Mutators notified.");
}

static const int GC_THREAD_KIND_WORKER = 1;
Expand All @@ -111,30 +111,37 @@ static void mmtk_spawn_gc_thread(void* tls, int kind, void* ctx) {

static void mmtk_block_for_gc() {
MMTkHeap::heap()->_last_gc_time = os::javaTimeNanos() / NANOSECS_PER_MILLISEC;
log_debug(gc)("Thread (id=%d) will block waiting for GC to finish.", Thread::current()->osthread()->thread_id());

// We must read the counter before entering safepoint.
// This thread has just triggered GC.
// Before this thread enters safe point, the GC cannot start, and therefore cannot finish,
// and cannot increment the counter mmtk_start_the_world_count.
// Otherwise, if we attempt to acquire the gc_lock first, GC may have triggered stop-the-world
// first, and this thread will be blocked for the entire stop-the-world duration before it can
// get the lock. Once that happens, the current thread will read the counter after the GC, and
// wait for the next non-existing GC forever.
// This thread (or another mutator) has just triggered GC.
// The GC cannot start until all mutators enter safepoint.
// If the GC cannot start, it cannot finish, and cannot increment mmtk_start_the_world_count.
// Otherwise, if we enter safepoint before reading mmtk_start_the_world_count,
// we will allow the GC to start before we read the counter,
// and the GC workers may run so fast that they have finished one whole GC and incremented the
// counter before this mutator reads the counter for the first time.
// Once that happens, the current mutator will wait for the next GC forever,
// which may not happen at all before the program exits.
size_t my_count = Atomic::load(&mmtk_start_the_world_count);
size_t next_count = my_count + 1;

log_debug(gc)("Will block until the start_the_world counter reaches %zu.", next_count);

{
// Once this thread acquires the lock, the VM will consider this thread to be "in safe point".
MutexLocker locker(MMTkHeap::heap()->gc_lock());
// Enter safepoint.
JavaThread* thread = JavaThread::current();
ThreadBlockInVM tbivm(thread);

// No safepoint check. We are already in safepoint.
MutexLockerEx locker(MMTkHeap::heap()->gc_lock(), Mutex::_no_safepoint_check_flag);

while (Atomic::load(&mmtk_start_the_world_count) < next_count) {
// wait() may wake up spuriously, but the authoritative condition for unblocking is
// mmtk_start_the_world_count being incremented.
MMTkHeap::heap()->gc_lock()->wait();
MMTkHeap::heap()->gc_lock()->wait(Mutex::_no_safepoint_check_flag);
}
}
log_debug(gc)("Thread (id=%d) resumed after GC finished.", Thread::current()->osthread()->thread_id());
log_debug(gc)("Resumed after GC finished.");
}

static void mmtk_out_of_memory(void* tls, MMTkAllocationError err_kind) {
Expand Down Expand Up @@ -167,7 +174,7 @@ static bool mmtk_is_mutator(void* tls) {

static void mmtk_get_mutators(MutatorClosure closure) {
JavaThread *thr;
for (JavaThreadIteratorWithHandle jtiwh; thr = jtiwh.next();) {
for (JavaThreadIteratorWithHandle jtiwh; (thr = jtiwh.next());) {
closure.invoke(&thr->third_party_heap_mutator);
}
}
Expand Down
93 changes: 62 additions & 31 deletions openjdk/mmtkVMCompanionThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void MMTkVMCompanionThread::run() {
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
assert(_reached_state == _threads_resumed, "Threads should be running at this moment.");
while (_desired_state != _threads_suspended) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
assert(_reached_state == _threads_resumed, "Threads should still be running at this moment.");
}
Expand All @@ -63,25 +63,9 @@ void MMTkVMCompanionThread::run() {
VM_MMTkSTWOperation op(this);
// VMThread::execute() is blocking. The companion thread will be blocked
// here waiting for the VM thread to execute op, and the VM thread will
// be blocked in reach_suspended_and_wait_for_resume() until a GC thread
// be blocked in do_mmtk_stw_operation() until a GC thread
// calls request(_threads_resumed).
VMThread::execute(&op);

// Tell the waiter thread that the world has resumed.
log_trace(gc)("MMTkVMCompanionThread: Notifying threads resumption...");
{
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
_reached_state = _threads_resumed;
_lock->notify_all();
}
{
MutexLocker x(Heap_lock);
if (Universe::has_reference_pending_list()) {
Heap_lock->notify_all();
}
}
}
}

Expand All @@ -107,7 +91,7 @@ void MMTkVMCompanionThread::request(stw_state desired_state, bool wait_until_rea

if (wait_until_reached) {
while (_reached_state != desired_state) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
}
}
Expand All @@ -123,23 +107,70 @@ void MMTkVMCompanionThread::wait_for_reached(stw_state desired_state) {
assert(_desired_state == desired_state, "State %d not requested.", desired_state);

while (_reached_state != desired_state) {
_lock->wait(true);
_lock->wait(Mutex::_no_safepoint_check_flag);
}
}

// Called by the VM thread to indicate that all Java threads have stopped.
// This method will block until the GC requests start-the-world.
void MMTkVMCompanionThread::reach_suspended_and_wait_for_resume() {
assert(Thread::current()->is_VM_thread(), "reach_suspended_and_wait_for_resume can only be executed by the VM thread");
// Called by the VM thread in `VM_MMTkSTWOperation`.
// This method notify that all Java threads have yielded, and will block the VM thread (thereby
// blocking Java threads) until the GC requests start-the-world.
void MMTkVMCompanionThread::do_mmtk_stw_operation() {
assert(Thread::current()->is_VM_thread(), "do_mmtk_stw_operation can only be executed by the VM thread");

MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);
{
MutexLockerEx locker(_lock, Mutex::_no_safepoint_check_flag);

// Tell the waiter thread that the world has stopped.
_reached_state = _threads_suspended;
_lock->notify_all();
// Tell the waiter thread that Java threads have stopped at yieldpoints.
_reached_state = _threads_suspended;
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_suspended state. Notifying...");
_lock->notify_all();

// Wait until resume-the-world is requested
while (_desired_state != _threads_resumed) {
_lock->wait(Mutex::_no_safepoint_check_flag);
}

// Wait until resume-the-world is requested
while (_desired_state != _threads_resumed) {
_lock->wait(true);
// Tell the waiter thread that Java threads will eventually resume from yieldpoints. This
// function will return, and, as soon as the VM thread stops executing safepoint VM operations,
// Java threads will resume from yieldpoints.
//
// Note: We have to notify *now* instead of after `VMThread::execute()`. For reasons unknown
// (likely a bug in OpenJDK 11), the VMThread fails to notify the companion thread after
// evaluating `VM_MMTkSTWOperation`, and continues to execute other VM operations (such as
// `RevokeBias`). This leaves the companion thread blocking on `VMThread::execute()` until the
// VM thread finishes executing the next batch of queued VM operations. If we notify after
// `VMThread::execute` in `run()`, it will cause a deadlock like the following:
//
// - The companion thread is blocked at `VMThread::execute()`, waiting for the next batch of VM
// operations to finish.
// - The VM thread is blocked in `SafepointSynchronize::begin()`, waiting for all mutators to
// reach safepoints.
// - One mutator is allocating too fast and triggers a GC, which requires the `WorkerMonitor`
// lock in mmtk-core.
// - A GC worker is still executing `mmtk_resume_mutator`, holding the `WorkerMutator` (as the
// last parked GC worker). It is asking the companion thread to resume mutators, and is still
// waiting for the companion thread to reach the `_thread_resumed` state. As we see before,
// the companion thread is waiting, too.
//
// By notifying now, we let the companion thread stop waiting, and therefore allowing the last
// parked GC worker to finish `resume_mutators`, breaking the deadlock. When the next GC
// starts, the GC worker running `mmtk_stop_all_mutators` will need to wait a little longer (as
// it always should) until the VM thread finishes executing other VM operations and the
// companion thread is ready to respond to another request from GC workers.
//
// Also note that OpenJDK 17 changed the way the VM thread executes VM operations. The same
// problem may not manifest in OpenJDK 17 or 21.
assert(_desired_state == _threads_resumed, "start-the-world should be requested.");
assert(_reached_state == _threads_suspended, "Threads should still be suspended at this moment.");
_reached_state = _threads_resumed;
log_trace(gc)("do_mmtk_stw_operation: Reached _thread_resumed state. Notifying...");
_lock->notify_all();
}

{
MutexLockerEx x(Heap_lock, Mutex::_no_safepoint_check_flag);
if (Universe::has_reference_pending_list()) {
Heap_lock->notify_all();
}
}
}
2 changes: 1 addition & 1 deletion openjdk/mmtkVMCompanionThread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class MMTkVMCompanionThread: public NamedThread {
void wait_for_reached(stw_state reached_state);

// Interface for the VM_MMTkSTWOperation
void reach_suspended_and_wait_for_resume();
void do_mmtk_stw_operation();
};

#endif // MMTK_OPENJDK_MMTK_VM_COMPANION_THREAD_HPP
2 changes: 1 addition & 1 deletion openjdk/mmtkVMOperation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ VM_MMTkSTWOperation::VM_MMTkSTWOperation(MMTkVMCompanionThread *companion_thread

void VM_MMTkSTWOperation::doit() {
log_trace(vmthread)("Entered VM_MMTkSTWOperation::doit().");
_companion_thread->reach_suspended_and_wait_for_resume();
_companion_thread->do_mmtk_stw_operation();
log_trace(vmthread)("Leaving VM_MMTkSTWOperation::doit()");
}
Loading