Skip to content

Commit ee261dc

Browse files
tkfvtjnash
authored andcommitted
Make profiling more robust with many tasks (#42978)
This patch includes two sets of changes. (1) `jl_thread_suspend_and_get_state` uses `pthread_cond_timedwait` to recover from the case where the request is not received by the signal handler. This is required because `usr2_handler` contains some paths for the case where it is not possible to obtain `ptls`. (2) `ctx_switch` now makes sure to null out `ptls` of the last task (`lastt->ptls = NULL`) after changing the current task by updating pgcstack (`jl_set_pgcstack(&t->gcstack)`). This closes the gap in which `usr2_handler` can observe the null `ptls`. Co-authored-by: Jameson Nash <[email protected]> (cherry picked from commit 8131580)
1 parent 07d6734 commit ee261dc

File tree

4 files changed

+71
-3
lines changed

4 files changed

+71
-3
lines changed

src/signals-unix.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <sys/stat.h>
88
#include <sys/mman.h>
99
#include <pthread.h>
10+
#include <time.h>
1011
#include <errno.h>
1112
#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
1213
#define MAP_ANONYMOUS MAP_ANON
@@ -364,11 +365,25 @@ static pthread_cond_t signal_caught_cond;
364365

365366
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
366367
{
368+
struct timespec ts;
369+
clock_gettime(CLOCK_REALTIME, &ts);
370+
ts.tv_sec += 1;
367371
pthread_mutex_lock(&in_signal_lock);
368372
jl_ptls_t ptls2 = jl_all_tls_states[tid];
369373
jl_atomic_store_release(&ptls2->signal_request, 1);
370374
pthread_kill(ptls2->system_id, SIGUSR2);
371-
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge
375+
// wait for thread to acknowledge
376+
int err = pthread_cond_timedwait(&signal_caught_cond, &in_signal_lock, &ts);
377+
if (err == ETIMEDOUT) {
378+
sig_atomic_t request = 1;
379+
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
380+
*ctx = NULL;
381+
pthread_mutex_unlock(&in_signal_lock);
382+
return;
383+
}
384+
err = pthread_cond_wait(&signal_caught_cond, &in_signal_lock);
385+
}
386+
assert(!err);
372387
assert(jl_atomic_load_acquire(&ptls2->signal_request) == 0);
373388
*ctx = signal_context;
374389
}
@@ -750,6 +765,8 @@ static void *signal_listener(void *arg)
750765
for (int i = jl_n_threads; i-- > 0; ) {
751766
// notify thread to stop
752767
jl_thread_suspend_and_get_state(i, &signal_context);
768+
if (signal_context == NULL)
769+
continue;
753770

754771
// do backtrace on thread contexts for critical signals
755772
// this part must be signal-handler safe

src/task.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,18 +388,19 @@ static void ctx_switch(jl_task_t *lastt)
388388
else
389389
#endif
390390
*pt = NULL; // can't fail after here: clear the gc-root for the target task now
391-
lastt->ptls = NULL;
392391
}
393392

394393
// set up global state for new task and clear global state for old task
395394
t->ptls = ptls;
396395
ptls->current_task = t;
397396
JL_GC_PROMISE_ROOTED(t);
397+
jl_signal_fence();
398+
jl_set_pgcstack(&t->gcstack);
399+
jl_signal_fence();
398400
lastt->ptls = NULL;
399401
#ifdef MIGRATE_TASKS
400402
ptls->previous_task = lastt;
401403
#endif
402-
jl_set_pgcstack(&t->gcstack);
403404

404405
#if defined(JL_TSAN_ENABLED)
405406
tsan_switch_to_ctx(&t->tsan_state);

test/profile_spawnmany_exec.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# This file is a part of Julia. License is MIT: https://julialang.org/license
2+
3+
using Profile
4+
5+
function spawnmany(n)
6+
if n > 2
7+
m = n ÷ 2
8+
t = Threads.@spawn spawnmany(m)
9+
spawnmany(m)
10+
wait(t)
11+
end
12+
end
13+
14+
@profile spawnmany(parse(Int, get(ENV, "NTASKS", "2000000")))

test/threads.jl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,39 @@ end
147147

148148
# We don't need the watchdog anymore
149149
close(proc.in)
150+
151+
# https://github.com/JuliaLang/julia/pull/42973
152+
Sys.islinux() && @testset "spawn and wait *a lot* of tasks in @profile" begin
153+
# Not using threads_exec.jl for better isolation, reproducibility, and a
154+
# tighter timeout.
155+
script = "profile_spawnmany_exec.jl"
156+
cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $script`
157+
@testset for n in [20000, 200000, 2000000]
158+
proc = run(ignorestatus(setenv(cmd, "NTASKS" => n; dir = @__DIR__)); wait = false)
159+
done = Threads.Atomic{Bool}(false)
160+
timeout = false
161+
timer = Timer(100) do _
162+
timeout = true
163+
for sig in [Base.SIGTERM, Base.SIGHUP, Base.SIGKILL]
164+
for _ in 1:1000
165+
kill(proc, sig)
166+
if done[]
167+
if sig != Base.SIGTERM
168+
@warn "Terminating `$script` required signal $sig"
169+
end
170+
return
171+
end
172+
sleep(0.001)
173+
end
174+
end
175+
end
176+
try
177+
wait(proc)
178+
finally
179+
done[] = true
180+
close(timer)
181+
end
182+
@test success(proc)
183+
@test !timeout
184+
end
185+
end

0 commit comments

Comments
 (0)