From f6ea007ed8273c2d2758592c031951de59c0ba3e Mon Sep 17 00:00:00 2001
From: d-netto
Date: Wed, 25 Sep 2024 15:24:18 -0300
Subject: [PATCH] all tasks profiler

---
 src/julia_internal.h          |  17 +++
 src/signal-handling.c         |  38 +++---
 src/signals-mach.c            | 177 +++++++++++++------------
 src/signals-unix.c            | 234 +++++++++++++++++++++++-----------
 src/signals-win.c             |  30 +++--
 src/stackwalk.c               |  33 +++--
 src/task.c                    |  34 +++++
 stdlib/Profile/src/Profile.jl |  71 ++++++++---
 8 files changed, 419 insertions(+), 215 deletions(-)

diff --git a/src/julia_internal.h b/src/julia_internal.h
index f00667d016796d..726453254f66ad 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -211,6 +211,23 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
 int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
 void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
 
+arraylist_t jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
+typedef struct {
+    size_t bt_size;
+    int tid;
+} jl_record_backtrace_result_t;
+JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
+                                                              size_t max_bt_size) JL_NOTSAFEPOINT;
+extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
+extern volatile size_t profile_bt_size_max;
+extern volatile size_t profile_bt_size_cur;
+extern volatile int profile_running;
+extern volatile int profile_all_tasks;
+STATIC_INLINE int all_tasks_profile_running(void) JL_NOTSAFEPOINT
+{
+    return profile_running && profile_all_tasks;
+}
+
 // number of cycles since power-on
 static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
 {
diff --git a/src/signal-handling.c b/src/signal-handling.c
index d7f4697a3c4f06..f6edabdb66de12 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -18,46 +18,46 @@ extern "C" {
 #include
 
 // Profiler control variables
-// Note: these "static" variables are also used in "signals-*.c"
-static volatile jl_bt_element_t *bt_data_prof = NULL;
-static volatile size_t bt_size_max = 0;
-static volatile size_t bt_size_cur = 0;
+volatile jl_bt_element_t *profile_bt_data_prof = NULL;
+volatile size_t profile_bt_size_max = 0;
+volatile size_t profile_bt_size_cur = 0;
 static volatile uint64_t nsecprof = 0;
-static volatile int running = 0;
-static const uint64_t GIGA = 1000000000ULL;
+volatile int profile_running = 0;
+volatile int profile_all_tasks = 0;
+static const uint64_t GIGA = 1000000000ULL;
 // Timers to take samples at intervals
 JL_DLLEXPORT void jl_profile_stop_timer(void);
-JL_DLLEXPORT int jl_profile_start_timer(void);
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t);
 
 ///////////////////////
 // Utility functions //
 ///////////////////////
 JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec)
 {
-    bt_size_max = maxsize;
+    profile_bt_size_max = maxsize;
     nsecprof = delay_nsec;
-    if (bt_data_prof != NULL)
-        free((void*)bt_data_prof);
-    bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
-    if (bt_data_prof == NULL && maxsize > 0)
+    if (profile_bt_data_prof != NULL)
+        free((void*)profile_bt_data_prof);
+    profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
+    if (profile_bt_data_prof == NULL && maxsize > 0)
         return -1;
-    bt_size_cur = 0;
+    profile_bt_size_cur = 0;
     return 0;
 }
 
 JL_DLLEXPORT uint8_t *jl_profile_get_data(void)
 {
-    return (uint8_t*) bt_data_prof;
+    return (uint8_t*) profile_bt_data_prof;
 }
 
 JL_DLLEXPORT size_t jl_profile_len_data(void)
 {
-    return bt_size_cur;
+    return profile_bt_size_cur;
 }
 
 JL_DLLEXPORT size_t jl_profile_maxlen_data(void)
 {
-    return bt_size_max;
+    return profile_bt_size_max;
 }
 
 JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
@@ -67,12 +67,12 @@ JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
 
 JL_DLLEXPORT void jl_profile_clear_data(void)
 {
-    bt_size_cur = 0;
+    profile_bt_size_cur = 0;
 }
 
 JL_DLLEXPORT int jl_profile_is_running(void)
 {
-    return running;
+    return profile_running;
 }
 
 // Any function that acquires this lock must be either a unmanaged thread
@@ -184,7 +184,7 @@ JL_DLLEXPORT int jl_profile_is_buffer_full(void)
     // Declare buffer full if there isn't enough room to sample even just the
     // thread metadata and one max-sized frame. The `+ 6` is for the two block
     // terminator `0`'s plus the 4 metadata entries.
-    return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max;
+    return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max;
 }
 
 static uint64_t jl_last_sigint_trigger = 0;
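As context for reviewers (not part of the patch): the buffer that `jl_profile_init` allocates is sized from the Julia side, and the `+ 6` slack checked by `jl_profile_is_buffer_full` corresponds to the two zero terminators plus the four metadata words appended to every sample. A sketch of the existing user-facing knobs:

    using Profile

    # Reserve 10^7 buffer slots and sample every 1 ms. Each sample consumes its
    # backtrace length plus 6 words (two zero terminators + 4 metadata entries).
    Profile.init(n = 10^7, delay = 0.001)
    Profile.init()   # with no arguments, queries the current (n, delay)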
diff --git a/src/signals-mach.c b/src/signals-mach.c
index a939e4df71ae02..07fb2d4b239d27 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -724,6 +724,85 @@ void jl_unlock_stackwalk(int lockret)
     jl_unlock_profile_mach(1, lockret);
 }
 
+// assumes holding `jl_lock_profile_mach`
+void jl_profile_thread_mach(int tid)
+{
+    // if there is no space left, return early
+    if (jl_profile_is_buffer_full()) {
+        jl_profile_stop_timer();
+        return;
+    }
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_prepare();
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_prepare(); // briefly acquire the dlsym lock
+    host_thread_state_t state;
+    int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
+    unw_context_t *uc = (unw_context_t*)&state;
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_parent(); // quickly release the dlsym lock
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_parent();
+    if (!valid_thread)
+        return;
+    if (profile_running) {
+#ifdef LLVMLIBUNWIND
+        /*
+         * Unfortunately compact unwind info is incorrectly generated for quite a number of
+         * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
+         * in some cases, but in quite a number of cases (especially libraries not compiled in
+         * debug mode), only the compact unwind info may be available. Even more unfortunately, there is no
+         * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
+         * What we do here is ugly, but necessary until the compact unwind info situation improves.
+         * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
+         * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
+         * entry will always be correct, and the number of cases in which this is an issue is rather small.
+         * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
+         * and during stack unwinding we only ever read memory, but never write it.
+         */
+
+        forceDwarf = 0;
+        unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
+
+        if (forceDwarf == 0) {
+            // Save the backtrace
+            profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == 1) {
+            profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == -1) {
+            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+        }
+
+        forceDwarf = -2;
+#else
+        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+#endif
+        jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+        // store threadid but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
+
+        // store task id (never null)
+        profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+
+        // store cpu cycle clock
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+        // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
+
+        // Mark the end of this block with two 0's
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    }
+    // We're done! Resume the thread.
+    jl_thread_resume(tid);
+}
+
+void jl_profile_task_unix(void);
+
 void *mach_profile_listener(void *arg)
 {
     (void)arg;
@@ -741,88 +820,21 @@ void *mach_profile_listener(void *arg)
         // sample each thread, round-robin style in reverse order
         // (so that thread zero gets notified last)
         int keymgr_locked = jl_lock_profile_mach(0);
-        int nthreads = jl_atomic_load_acquire(&jl_n_threads);
-        int *randperm = profile_get_randperm(nthreads);
-        for (int idx = nthreads; idx-- > 0; ) {
-            // Stop the threads in the random or reverse round-robin order.
-            int i = randperm[idx];
-            // if there is no space left, break early
-            if (jl_profile_is_buffer_full()) {
-                jl_profile_stop_timer();
-                break;
-            }
-
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_prepare();
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_prepare(); // briefly acquire the dlsym lock
-            host_thread_state_t state;
-            int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
-            unw_context_t *uc = (unw_context_t*)&state;
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_parent(); // quickly release the dlsym lock
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_parent();
-            if (!valid_thread)
-                continue;
-            if (running) {
-#ifdef LLVMLIBUNWIND
-                /*
-                 * Unfortunately compact unwind info is incorrectly generated for quite a number of
-                 * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
-                 * in some cases, but in quite a number of cases (especially libraries not compiled in debug
-                 * mode, only the compact unwind info may be available). Even more unfortunately, there is no
-                 * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
-                 * What we do here is ugly, but necessary until the compact unwind info situation improves.
-                 * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
-                 * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
-                 * entry will always be correct, and the number of cases in which this is an issue is rather small.
-                 * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
-                 * and during stack unwinding we only ever read memory, but never write it.
-                 */
-
-                forceDwarf = 0;
-                unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
-
-                if (forceDwarf == 0) {
-                    // Save the backtrace
-                    bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == 1) {
-                    bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == -1) {
-                    jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                }
-
-                forceDwarf = -2;
-#else
-                bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-#endif
-                jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
-
-                // META_OFFSET_TASKID store task id (never null)
-                bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
-
-                // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
-
-                // Mark the end of this block with two 0's
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-            }
-            // We're done! Resume the thread.
-            jl_thread_resume(i);
-        }
+        if (profile_all_tasks) {
+            // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace`
+            jl_profile_task_unix();
+        }
+        else {
+            int nthreads = jl_atomic_load_acquire(&jl_n_threads);
+            int *randperm = profile_get_randperm(nthreads);
+            for (int idx = nthreads; idx-- > 0; ) {
+                // Stop the threads in random order.
+                int i = randperm[idx];
+                jl_profile_thread_mach(i);
+            }
+        }
         jl_unlock_profile_mach(0, keymgr_locked);
-        if (running) {
+        if (profile_running) {
             jl_check_profile_autostop();
             // Reset the alarm
             kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
@@ -831,7 +843,8 @@ void *mach_profile_listener(void *arg)
     }
 }
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     kern_return_t ret;
     if (!profile_started) {
@@ -860,7 +873,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     timerprof.tv_sec = nsecprof/GIGA;
     timerprof.tv_nsec = nsecprof%GIGA;
 
-    running = 1;
+    profile_running = 1;
+    profile_all_tasks = all_tasks;
     // ensure the alarm is running
     ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
     HANDLE_MACH_ERROR("clock_alarm", ret);
@@ -870,5 +884,6 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    running = 0;
+    profile_running = 0;
+    profile_all_tasks = 0;
 }
diff --git a/src/signals-unix.c b/src/signals-unix.c
index f99eca31730b62..9bf2586bc16e2a 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -9,6 +9,10 @@
 #include
 #include
 #include
+
+#include "julia.h"
+#include "julia_internal.h"
+
 #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
 #define MAP_ANONYMOUS MAP_ANON
 #endif
@@ -638,7 +642,7 @@ int timer_graceperiod_elapsed(void)
 static timer_t timerprof;
 static struct itimerspec itsprof;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     struct sigevent sigprof;
 
     sigprof.sigev_notify = SIGEV_SIGNAL;
     sigprof.sigev_signo = SIGUSR1;
     sigprof.sigev_value.sival_ptr = &timerprof;
-    // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer
-    running = 1;
+    // Because SIGUSR1 is multipurpose, set `profile_running` beforehand so that we know the first SIGUSR1 came from the timer
+    profile_running = 1;
+    profile_all_tasks = all_tasks;
     if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) {
-        running = 0;
+        profile_running = 0;
+        profile_all_tasks = 0;
         return -2;
     }
 
     // Start the timer
     itsprof.it_value.tv_sec = nsecprof / GIGA;
     itsprof.it_value.tv_nsec = nsecprof % GIGA;
     if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) {
-        running = 0;
+        profile_running = 0;
+        profile_all_tasks = 0;
         return -3;
     }
     return 0;
 }
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    if (running) {
+    if (profile_running) {
         timer_delete(timerprof);
         last_timer_delete_time = jl_hrtime();
-        running = 0;
+        profile_running = 0;
+        profile_all_tasks = 0;
     }
 }
 
@@ -789,7 +796,7 @@ void trigger_profile_peek(void)
     jl_safe_printf("\n======================================================================================\n");
     jl_safe_printf("Information request received. A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration);
     jl_safe_printf("======================================================================================\n");
-    if (bt_size_max == 0){
+    if (profile_bt_size_max == 0){
         // If the buffer hasn't been initialized, initialize with default size
         // Keep these values synchronized with Profile.default_init()
         if (jl_profile_init(10000000, 1000000) == -1) {
            return;
        }
     }
-    bt_size_cur = 0; // clear profile buffer
-    if (jl_profile_start_timer() < 0)
+    profile_bt_size_cur = 0; // clear profile buffer
+    if (jl_profile_start_timer(0) < 0)
         jl_safe_printf("ERROR: Could not start profile timer\n");
     else
         profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9);
 }
 
+void jl_profile_task_unix(void)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+
+    // Pick a random task that is not done, so we know its sample won't be dropped.
+    // TODO(Diogo, Nick): should we put an upper bound on the number of iterations?
+    arraylist_t tasks = jl_get_all_tasks_arraylist();
+    jl_task_t *t = NULL;
+    uint64_t seed = jl_rand();
+    while (1) {
+        t = (jl_task_t*)tasks.items[cong(tasks.len, &seed)];
+        assert(t == NULL || jl_is_task(t));
+        if (t == NULL) {
+            continue;
+        }
+        int t_state = jl_atomic_load_relaxed(&t->_state);
+        if (t_state == JL_TASK_STATE_DONE) {
+            continue;
+        }
+        break;
+    }
+    arraylist_free(&tasks);
+    assert(t != NULL);
+
+    jl_bt_element_t *bt_data_prof = (jl_bt_element_t*)(profile_bt_data_prof + profile_bt_size_cur);
+    size_t bt_size_max = profile_bt_size_max - profile_bt_size_cur - 1;
+
+    jl_record_backtrace_result_t r = jl_record_backtrace(t, bt_data_prof, bt_size_max);
+
+    int tid = r.tid;
+    // A size of zero means we failed to get a backtrace.
+    // TODO(Diogo, Nick): should we add a special value to the buffer to indicate this?
+    // Should we retry? Note that dropping backtraces affects the effective sample rate...
+    // TODO(Diogo, Nick): we were running the profiler on a Julia session with --threads=8 and saw some samples
+    // from thread 16. Investigate whether this makes sense...
+    if (r.bt_size == 0) {
+        return;
+    }
+
+    // update the profile buffer size
+    profile_bt_size_cur += r.bt_size;
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = tid + 1;
+
+    // store task id (never null)
+    profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t;
+
+    // store cpu cycle clock
+    // TODO(Diogo, Nick): why are we storing the cycle clock here?
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+    // The thread profiler uses this slot to record whether the thread was not sleeping (1) or sleeping (2).
+    // Use a dummy value that is neither 1 nor 2 to indicate that this block comes from the task profiler,
+    // and therefore the slot does not describe a thread state.
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 3;
+
+    // Mark the end of this block with two 0's
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+}
+
+// assumes holding `jl_lock_stackwalk`
+void jl_profile_thread_unix(int tid, bt_context_t *signal_context)
+{
+    if (jl_profile_is_buffer_full()) {
+        // Buffer full: Delete the timer
+        jl_profile_stop_timer();
+        return;
+    }
+    // notify thread to stop
+    if (!jl_thread_suspend_and_get_state(tid, 1, signal_context))
+        return;
+    // unwinding can fail, so keep track of the current state
+    // and restore from the SEGV handler if anything happens.
+    jl_jmp_buf *old_buf = jl_get_safe_restore();
+    jl_jmp_buf buf;
+
+    jl_set_safe_restore(&buf);
+    if (jl_setjmp(buf, 0)) {
+        jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+    } else {
+        // Get backtrace data
+        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
+                profile_bt_size_max - profile_bt_size_cur - 1, signal_context, NULL);
+    }
+    jl_set_safe_restore(old_buf);
+
+    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+    // store threadid but add 1 as 0 is preserved to indicate end of block
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1;
+
+    // store task id (never null)
+    profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
+
+    // store cpu cycle clock
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+    // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1;
+
+    // Mark the end of this block with two 0's
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+
+    // notify thread to resume
+    jl_thread_resume(tid);
+}
+
 static void *signal_listener(void *arg)
 {
     static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1];
@@ -903,13 +1025,13 @@ static void *signal_listener(void *arg)
     int doexit = critical;
 #ifdef SIGINFO
     if (sig == SIGINFO) {
-        if (running != 1)
+        if (profile_running != 1)
             trigger_profile_peek();
         doexit = 0;
     }
 #else
     if (sig == SIGUSR1) {
-        if (running != 1 && timer_graceperiod_elapsed())
+        if (profile_running != 1 && timer_graceperiod_elapsed())
             trigger_profile_peek();
         doexit = 0;
     }
@@ -943,78 +1065,46 @@ static void *signal_listener(void *arg)
         bt_size = 0;
 #if !defined(JL_DISABLE_LIBUNWIND)
         bt_context_t signal_context;
-        // sample each thread, round-robin style in reverse order
-        // (so that thread zero gets notified last)
-        if (critical || profile) {
+        if (critical) {
             int lockret = jl_lock_stackwalk();
-            int *randperm;
-            if (profile)
-                randperm = profile_get_randperm(nthreads);
-            for (int idx = nthreads; idx-- > 0; ) {
-                // Stop the threads in the random or reverse round-robin order.
-                int i = profile ? randperm[idx] : idx;
+            // sample each thread, round-robin style in reverse order
+            // (so that thread zero gets notified last)
+            for (int i = nthreads; i-- > 0; ) {
                 // notify thread to stop
                 if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
                     continue;
                 // do backtrace on thread contexts for critical signals
                 // this part must be signal-handler safe
-                if (critical) {
-                    bt_size += rec_backtrace_ctx(bt_data + bt_size,
-                            JL_MAX_BT_SIZE / nthreads - 1,
-                            &signal_context, NULL);
-                    bt_data[bt_size++].uintptr = 0;
-                }
-
-                // do backtrace for profiler
-                if (profile && running) {
-                    if (jl_profile_is_buffer_full()) {
-                        // Buffer full: Delete the timer
-                        jl_profile_stop_timer();
-                    }
-                    else {
-                        // unwinding can fail, so keep track of the current state
-                        // and restore from the SEGV handler if anything happens.
-                        jl_jmp_buf *old_buf = jl_get_safe_restore();
-                        jl_jmp_buf buf;
-
-                        jl_set_safe_restore(&buf);
-                        if (jl_setjmp(buf, 0)) {
-                            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                        } else {
-                            // Get backtrace data
-                            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
-                                    bt_size_max - bt_size_cur - 1, &signal_context, NULL);
-                        }
-                        jl_set_safe_restore(old_buf);
-
-                        jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                        // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1;
-
-                        // META_OFFSET_TASKID store task id (never null)
-                        bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
-
-                        // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                        bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                        // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                        bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1;
-
-                        // Mark the end of this block with two 0's
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                        bt_data_prof[bt_size_cur++].uintptr = 0;
-                    }
-                }
-
-                // notify thread to resume
+                bt_size += rec_backtrace_ctx(bt_data + bt_size,
+                        JL_MAX_BT_SIZE / nthreads - 1,
+                        &signal_context, NULL);
+                bt_data[bt_size++].uintptr = 0;
                 jl_thread_resume(i);
             }
             jl_unlock_stackwalk(lockret);
         }
+        else if (profile) {
+            if (profile_all_tasks) {
+                // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace`
+                jl_profile_task_unix();
+            }
+            else {
+                int lockret = jl_lock_stackwalk();
+                int *randperm = profile_get_randperm(nthreads);
+                for (int idx = nthreads; idx-- > 0; ) {
+                    // Stop the threads in random order.
+                    int i = randperm[idx];
+                    // do backtrace for profiler
+                    if (profile_running) {
+                        jl_profile_thread_unix(i, &signal_context);
+                    }
+                }
+                jl_unlock_stackwalk(lockret);
+            }
+        }
 #ifndef HAVE_MACH
-        if (profile && running) {
+        if (profile && profile_running) {
             jl_check_profile_autostop();
 #if defined(HAVE_TIMER)
             timer_settime(timerprof, 0, &itsprof, NULL);
diff --git a/src/signals-win.c b/src/signals-win.c
index b5f8dd8bd79d92..21377f6ffe2769 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -404,7 +404,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
     while (1) {
         DWORD timeout_ms = nsecprof / (GIGA / 1000);
         Sleep(timeout_ms > 0 ? timeout_ms : 1);
-        if (running) {
+        if (profile_running) {
             if (jl_profile_is_buffer_full()) {
                 jl_profile_stop_timer(); // does not change the thread state
                 SuspendThread(GetCurrentThread());
@@ -421,26 +421,26 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                 break;
             }
             // Get backtrace data
-            bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
-                    bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
+            profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
+                    profile_bt_size_max - profile_bt_size_cur - 1, &ctxThread, NULL);
             jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
 
             // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-            bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
+            profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
 
             // META_OFFSET_TASKID store task id (never null)
-            bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+            profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
 
             // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-            bt_data_prof[bt_size_cur++].uintptr = cycleclock();
+            profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
 
             // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-            bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
+            profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
 
             // Mark the end of this block with two 0's
-            bt_data_prof[bt_size_cur++].uintptr = 0;
-            bt_data_prof[bt_size_cur++].uintptr = 0;
+            profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+            profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
             jl_unlock_stackwalk(lockret);
             jl_thread_resume(0);
             jl_check_profile_autostop();
@@ -455,7 +455,7 @@
 
 static volatile TIMECAPS timecaps;
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     if (hBtThread == NULL) {
@@ -483,20 +483,22 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
             return -2;
         }
     }
-    if (running == 0) {
+    if (profile_running == 0) {
         // Failure to change the timer resolution is not fatal. However, it is important to
         // ensure that the timeBeginPeriod/timeEndPeriod is paired.
         if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin))
             timecaps.wPeriodMin = 0;
     }
-    running = 1; // set `running` finally
+    profile_all_tasks = all_tasks;
+    profile_running = 1; // set `profile_running` finally
     return 0;
 }
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    if (running && timecaps.wPeriodMin)
+    if (profile_running && timecaps.wPeriodMin)
         timeEndPeriod(timecaps.wPeriodMin);
-    running = 0;
+    profile_running = 0;
+    profile_all_tasks = 0;
 }
 
 void jl_install_default_signal_handlers(void)

diff --git a/src/stackwalk.c b/src/stackwalk.c
index 6aa36fa8b499cc..1972a3d850e525 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -1187,24 +1187,34 @@
     return 0;
 #endif
 }
 
-JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
+JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
 {
-    jl_task_t *ct = jl_current_task;
-    jl_ptls_t ptls = ct->ptls;
+    jl_record_backtrace_result_t result = {0, 0};
+    jl_task_t *ct = NULL;
+    jl_ptls_t ptls = NULL;
+    int16_t tid = INT16_MAX;
+    if (!all_tasks_profile_running()) {
+        ct = jl_current_task;
+        ptls = ct->ptls;
+        ptls->bt_size = 0;
+        tid = ptls->tid;
+    }
     if (t == ct) {
-        return rec_backtrace(bt_data, max_bt_size, 0);
+        result.bt_size = rec_backtrace(bt_data, max_bt_size, 0);
+        result.tid = tid;
+        return result;
     }
     bt_context_t *context = NULL;
     bt_context_t c;
     int16_t old = -1;
-    while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) {
+    while (!jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid) {
         int lockret = jl_lock_stackwalk();
         // if this task is already running somewhere, we need to stop the thread it is running on and query its state
         if (!jl_thread_suspend_and_get_state(old, 1, &c)) {
             jl_unlock_stackwalk(lockret);
             if (jl_atomic_load_relaxed(&t->tid) != old)
                 continue;
-            return 0;
+            return result;
         }
         jl_unlock_stackwalk(lockret);
         if (jl_atomic_load_relaxed(&t->tid) == old) {
@@ -1243,9 +1253,11 @@ JL_DLLEXPORT size_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data,
         bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, t->gcstack);
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
-    else if (old != ptls->tid)
+    else if (old != tid)
         jl_thread_resume(old);
-    return bt_size;
+    result.bt_size = bt_size;
+    result.tid = old;
+    return result;
 }
 
 //--------------------------------------------------
@@ -1279,7 +1291,8 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
     jl_ptls_t ptls = ct->ptls;
     ptls->bt_size = 0;
     jl_bt_element_t *bt_data = ptls->bt_data;
-    size_t bt_size = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE);
+    jl_record_backtrace_result_t r = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE);
+    size_t bt_size = r.bt_size;
     size_t i;
     for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
         jl_print_bt_entry_codeloc(bt_data + i);
@@ -1293,8 +1306,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
     jlbacktrace();
 }
 
-extern int gc_first_tid;
-
 // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
 {
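Since `jl_record_backtrace` now returns a small struct rather than a bare size, out-of-tree callers need a matching return type. The following is a hypothetical illustration only — `jl_record_backtrace` is an internal entry point, the struct layout is assumed from `jl_record_backtrace_result_t` above, and the helper name is invented:

    # Assumed mirror of jl_record_backtrace_result_t ({ size_t bt_size; int tid; }).
    struct RecordBacktraceResult
        bt_size::Csize_t
        tid::Cint
    end

    # Invented helper: records a backtrace of task `t` into a scratch buffer.
    function record_backtrace(t::Task, max_bt_size::Integer = 80)
        buf = Vector{UInt}(undef, max_bt_size)  # stand-in for jl_bt_element_t slots
        r = ccall(:jl_record_backtrace, RecordBacktraceResult,
                  (Any, Ptr{Cvoid}, Csize_t), t, buf, max_bt_size)
        return (bt_size = Int(r.bt_size), tid = Int(r.tid), buf = buf)
    end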
diff --git a/src/task.c b/src/task.c
index f86e0ab3a880d9..afd979f5bbdba9 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1165,6 +1165,40 @@ JL_DLLEXPORT jl_task_t *jl_get_current_task(void)
     return pgcstack == NULL ? NULL : container_of(pgcstack, jl_task_t, gcstack);
 }
 
+extern int gc_first_tid;
+
+// Gather a snapshot of all live tasks, from which the profiler picks one at random.
+// Racy: `live_tasks` can change at any time.
+arraylist_t jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT
+{
+    arraylist_t tasks;
+    arraylist_new(&tasks, 0);
+    size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads...
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
+        jl_ptls_t ptls2 = allstates[i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_task_t *t = ptls2->root_task;
+        if (t->ctx.stkbuf != NULL) {
+            arraylist_push(&tasks, t);
+        }
+        small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        for (size_t j = 0; j < n; j++) {
+            jl_task_t *t2 = (jl_task_t*)mtarraylist_get(live_tasks, j);
+            if (t2->ctx.stkbuf != NULL) {
+                arraylist_push(&tasks, t2);
+            }
+        }
+    }
+    return tasks;
+}
+
 // Do one-time initializations for task system
 void jl_init_tasks(void) JL_GC_DISABLED
 {
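Note that `jl_get_all_tasks_arraylist` only collects the snapshot; the selection policy lives in `jl_profile_task_unix` above and is plain rejection sampling. Restated as a Julia sketch (illustrative only — `sample_live_task` is not part of the patch, and like the C loop it spins until it finds a task that is not done, which is what the TODO about bounding iterations refers to):

    function sample_live_task(tasks::Vector{Task})
        while true
            t = rand(tasks)            # the C code uses cong(tasks.len, &seed)
            istaskdone(t) || return t  # reject finished tasks and draw again
        end
    end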
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index b753c9ca88f249..526d2f82b4ae16 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -63,6 +63,25 @@ macro profile(ex)
     end
 end
 
+export @profile_tasks
+
+"""
+    @profile_tasks
+
+`@profile_tasks <expression>` runs your expression while taking periodic backtraces of a sample of all live tasks
+(both running and not running). These are appended to an internal buffer of backtraces.
+"""
+macro profile_tasks(ex)
+    return quote
+        try
+            start_timer(true)
+            $(esc(ex))
+        finally
+            stop_timer()
+        end
+    end
+end
+
 # An internal function called to show the report after an information request (SIGINFO or SIGUSR1).
 function _peek_report()
     iob = Base.AnnotatedIOBuffer()
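With the export above, usage mirrors `@profile`. A minimal session might look like the following (`groupby = :task` is existing `Profile.print` functionality that pairs naturally with task samples):

    using Profile

    @profile_tasks begin
        t = Threads.@spawn sum(rand(10^7))
        fetch(t)
    end
    Profile.print(groupby = :task)  # break the report down by sampled task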
@@ -403,9 +422,9 @@ end
 
 function has_meta(data)
     for i in 6:length(data)
-        data[i] == 0 || continue # first block end null
-        data[i - 1] == 0 || continue # second block end null
-        data[i - META_OFFSET_SLEEPSTATE] in 1:2 || continue
+        data[i] == 0 || continue            # first block end null
+        data[i - 1] == 0 || continue        # second block end null
+        data[i - META_OFFSET_SLEEPSTATE] in 1:3 || continue # 1 for not sleeping, 2 for sleeping, 3 for task profiler fake state
         data[i - META_OFFSET_CPUCYCLECLOCK] != 0 || continue
         data[i - META_OFFSET_TASKID] != 0 || continue
         data[i - META_OFFSET_THREADID] != 0 || continue
@@ -608,9 +627,9 @@ Julia, and examine the resulting `*.mem` files.
 clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ())
 
 # C wrappers
-function start_timer()
+function start_timer(all_tasks::Bool=false)
     check_init() # if the profile buffer hasn't been initialized, initialize with default size
-    status = ccall(:jl_profile_start_timer, Cint, ())
+    status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks)
     if status < 0
         error(error_codes[status])
     end
@@ -722,12 +741,16 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
     startframe = length(data)
     skip = false
     nsleeping = 0
+    task_profiler = false
     for i in startframe:-1:1
         (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (its read ahead below) and extra block end NULL IP
         ip = data[i]
         if is_block_end(data, i)
             # read metadata
-            thread_sleeping = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            thread_sleeping_state = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            if thread_sleeping_state == 2
+                task_profiler = true
+            end
             # cpu_cycle_clock = data[i - META_OFFSET_CPUCYCLECLOCK]
             taskid = data[i - META_OFFSET_TASKID]
             threadid = data[i - META_OFFSET_THREADID]
             if !in(threadid, threads) || !in(taskid, tasks)
                 skip = true
                 continue
             end
-            if thread_sleeping == 1
+            if thread_sleeping_state == 1
                 nsleeping += 1
             end
             skip = false
@@ -769,14 +792,14 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict,
         end
     end
     @assert length(lilist) == length(n) == length(m) == length(lilist_idx)
-    return (lilist, n, m, totalshots, nsleeping)
+    return (lilist, n, m, totalshots, nsleeping, task_profiler)
 end
 
 const FileNameMap = Dict{Symbol,Tuple{String,String,String}}
 
 function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
-    lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
+    lilist, n, m, totalshots, nsleeping, task_profiler = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks)
     if false # optional: drop the "non-interpretable" ones
         keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist)
         lilist = lilist[keep]
@@ -796,11 +819,15 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo
         return true
     end
     is_subsection || print_flat(io, lilist, n, m, cols, filenamemap, fmt)
-    Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%")
+    if task_profiler
+        Base.print(io, "Total snapshots: ", totalshots, "\n")
+    else
+        Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%")
+    end
     if is_subsection
         println(io)
         print_flat(io, lilist, n, m, cols, filenamemap, fmt)
-    else
+    elseif !task_profiler
         Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n")
     end
     return false
@@ -1034,12 +1061,16 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     startframe = length(all)
     skip = false
     nsleeping = 0
+    task_profiler = false
     for i in startframe:-1:1
         (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP
         ip = all[i]
         if is_block_end(all, i)
             # read metadata
-            thread_sleeping = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            thread_sleeping_state = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0
+            if thread_sleeping_state == 2
+                task_profiler = true
+            end
             # cpu_cycle_clock = all[i - META_OFFSET_CPUCYCLECLOCK]
             taskid = all[i - META_OFFSET_TASKID]
             threadid = all[i - META_OFFSET_THREADID]
             if !in(threadid, threads) || !in(taskid, tasks)
                 skip = true
                 continue
             end
-            if thread_sleeping == 1
+            if thread_sleeping_state == 1
                 nsleeping += 1
             end
             skip = false
@@ -1154,7 +1185,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
         nothing
     end
     cleanup!(root)
-    return root, nsleeping
+    return root, nsleeping, task_profiler
 end
 
 function maxstats(root::StackFrameTree)
@@ -1223,9 +1254,9 @@ end
 function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat,
                 threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool)
     if fmt.combine
-        root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping, task_profiler = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     else
-        root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
+        root, nsleeping, task_profiler = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks)
     end
     util_perc = (1 - (nsleeping / root.count)) * 100
     is_subsection || print_tree(io, root, cols, fmt, is_subsection)
@@ -1239,11 +1270,15 @@ function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, Line
         end
         return true
     end
-    Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%")
+    if task_profiler
+        Base.print(io, "Total snapshots: ", root.count, "\n")
+    else
+        Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%")
+    end
     if is_subsection
         Base.println(io)
         print_tree(io, root, cols, fmt, is_subsection)
-    else
+    elseif !task_profiler
        Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n")
     end
     return false
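For reference, the block layout the parsers above rely on is: backtrace entries, then threadid+1, taskid, cycleclock, sleep-state+1, then two zero terminators. A sketch (not part of the patch; it reuses `has_meta`'s heuristic and assumes the `META_OFFSET_*` values 5, 4, 3, 2) that pulls the per-sample metadata out of the raw buffer:

    using Profile

    function block_metadata(data::Vector{UInt64})
        meta = NamedTuple[]
        for i in 6:length(data)
            data[i] == 0 && data[i - 1] == 0 || continue      # two-zero block terminator
            data[i - 2] in 1:3 || continue                    # sleep state is stored +1; 3 marks the task profiler
            data[i - 4] != 0 && data[i - 5] != 0 || continue  # taskid and threadid are never 0
            push!(meta, (threadid = Int(data[i - 5]) - 1,     # stored +1, like the sleep state
                         taskid = data[i - 4],
                         from_task_profiler = data[i - 2] == 3))
        end
        return meta
    end

    block_metadata(Profile.fetch(include_meta = true))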