Skip to content

Commit 940c2c3

Browse files
committed
fix deadlock misconfiguration
A safepoint, or equivalently a safepoint region transition, is not permitted while holding a lock that the GC might need, as that forms a cycle. We must ensure that this thread is permitted to keep running, so that it can reach the condition wait call and release the lock, keeping it synchronized with the notify_all in the sweep. Alternatively, we could use a timed_wait and continuously poll the GC (to ensure we actually run GC periodically), but I would rather that be an internal GC policy than an external forcing factor here. This prevents the GC from recruiting this thread (via a signal) to help with GC even though it is observably sleeping, but we might later consider integrating a way for the GC to notify a set of condition variables upon starting, to wake them and recruit them temporarily via a subsequent safepoint, such as (in Julia-syntax pseudocode):
    gc_safe_enter()
    lock(condvar) do
        while !condmet
            wait(condvar)
            while gc_running
                unlock(lock)
                unsafe_enter()
                gc_safepoint()
                unsafe_leave()
                lock(lock)
            end
        end
    end
    gc_safe_leave()
1 parent 847319f commit 940c2c3

File tree

2 files changed

+11
-10
lines changed

2 files changed

+11
-10
lines changed

src/engine.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
6363
ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph
6464
jl_code_instance_t *ci = jl_new_codeinst_uninit(m, owner); // allocate a placeholder
6565
JL_GC_PUSH1(&ci);
66+
int8_t gc_state = jl_gc_safe_enter(ct->ptls);
6667
InferKey key = {m, owner};
6768
std::unique_lock<std::mutex> lock(engine_lock);
6869
auto tid = jl_atomic_load_relaxed(&ct->tid);
@@ -72,6 +73,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
7273
auto record = Reservations.find(key);
7374
if (record == Reservations.end()) {
7475
Reservations[key] = ReservationInfo{tid, ci};
76+
lock.unlock();
77+
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
7578
JL_GC_POP();
7679
return ci;
7780
}
@@ -81,6 +84,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
8184
auto wait_tid = record->second.tid;
8285
while (1) {
8386
if (wait_tid == tid) {
87+
lock.unlock();
88+
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
8489
JL_GC_POP();
8590
ct->ptls->engine_nqueued--;
8691
return ci; // break the cycle
@@ -96,11 +101,9 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
96101
assert(wait_tid != record2->second.tid);
97102
wait_tid = record2->second.tid;
98103
}
99-
int8_t gc_state = jl_gc_safe_enter(ct->ptls);
100104
Awaiting[tid] = key;
101105
engine_wait.wait(lock);
102106
Awaiting[tid] = InferKey{};
103-
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
104107
}
105108
}
106109

@@ -136,10 +139,6 @@ void jl_engine_sweep(jl_ptls_t *gc_all_tls_states)
136139
engine_wait.notify_all();
137140
}
138141

139-
void jl_engine_inhibit_finalizers(void)
140-
{
141-
}
142-
143142
void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src)
144143
{
145144
jl_task_t *ct = jl_current_task;

src/julia_threads.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -352,10 +352,12 @@ STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
352352
return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
353353
}
354354
#ifdef __clang_gcanalyzer__
355-
int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; // this could be a safepoint, but we will assume it is not
356-
void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
357-
int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
358-
void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE; // this might not be a safepoint, but we have to assume it could be (statically)
355+
// these might not be safepoints (if they are no-op safe=>safe transitions), but we have to assume they could be (statically)
356+
// however, they do mark a delineated region in which safepoints would not be permissible
357+
int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_LEAVE;
358+
void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_ENTER;
359+
int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_ENTER;
360+
void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE;
359361
#else
360362
#define jl_gc_unsafe_enter(ptls) jl_gc_state_save_and_set(ptls, JL_GC_STATE_UNSAFE)
361363
#define jl_gc_unsafe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), JL_GC_STATE_UNSAFE))

0 commit comments

Comments
 (0)