From 2e5a73ebe7af0e8307d671c28556b9416421300c Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 11 Sep 2020 18:47:39 -0400 Subject: [PATCH 1/4] i#4462: Add new global memtrace size limit Adds a new drcachesim memtrace option -max_global_trace_refs to supply a global trace size limit that does not kill the process. When the maximum is reached, the tracer omits traces for new threads entirely. Add an online and an offline test for the feature. Adds documentation. Fixes #4462 --- api/docs/release.dox | 2 + clients/drcachesim/common/options.cpp | 11 +++- clients/drcachesim/common/options.h | 1 + clients/drcachesim/drcachesim.dox.in | 17 ++++++- .../drcachesim/tests/delay-global.templatex | 11 ++++ .../tests/offline-max-global.templatex | 10 ++++ clients/drcachesim/tracer/tracer.cpp | 51 ++++++++++++++++--- suite/tests/CMakeLists.txt | 12 +++++ 8 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 clients/drcachesim/tests/delay-global.templatex create mode 100644 clients/drcachesim/tests/offline-max-global.templatex diff --git a/api/docs/release.dox b/api/docs/release.dox index 55fab1e507e..69c29f109bf 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -168,6 +168,8 @@ Further non-compatibility-affecting changes include: CLIENT{32,64}_{ABS,REL} in tool files. Added dr_get_client_info_ex() and dr_client_iterator_next_ex() to support querying other-bitwidth client registration. + - Added a new drcachesim option \p -max_global_trace_refs for specifying a global + trace size limit that does not terminate the process. **************************************************
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 2aaef5c179b..b217c7b647c 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -226,6 +226,14 @@ droption_t op_max_trace_size( "of one internal buffer. Once reached, instrumentation continues for that thread, " "but no further data is recorded."); +droption_t op_max_global_trace_refs( + DROPTION_SCOPE_CLIENT, "max_global_trace_refs", 0, + "Cap on the total references traced", + "If non-zero, this sets a maximum size on the amount of trace references recorded. " + "Once reached, instrumented execution continues, but no further data is recorded. " + "This is similar to -exit_after_tracing but without terminating the process." + "The reference count is approximate."); + droption_t op_trace_after_instrs( DROPTION_SCOPE_CLIENT, "trace_after_instrs", 0, "Do not start tracing until N instructions", @@ -238,7 +246,8 @@ droption_t op_exit_after_tracing( DROPTION_SCOPE_CLIENT, "exit_after_tracing", 0, "Exit the process after tracing N references", "If non-zero, after tracing the specified number of references, the process is " - "exited with an exit code of 0. The reference count is approximate."); + "exited with an exit code of 0. The reference count is approximate. " + "Use -max_global_trace_refs instead to avoid terminating the process."); droption_t op_online_instr_types( DROPTION_SCOPE_CLIENT, "online_instr_types", false, diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 44dd75c7b85..c686e4dbfa7 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -86,6 +86,7 @@ extern droption_t op_use_physical; extern droption_t op_virt2phys_freq; extern droption_t op_cpu_scheduling; extern droption_t op_max_trace_size; +extern droption_t op_max_global_trace_refs; extern droption_t op_trace_after_instrs; extern droption_t op_exit_after_tracing; extern droption_t op_online_instr_types; diff --git a/clients/drcachesim/drcachesim.dox.in b/clients/drcachesim/drcachesim.dox.in index 53d9dd2017f..06ad2bcff65 100644 --- a/clients/drcachesim/drcachesim.dox.in +++ b/clients/drcachesim/drcachesim.dox.in @@ -719,8 +719,21 @@ during a desired window of execution. The \p -trace_after_instrs option delays tracing by the specified number of dynamic instruction executions. This can be used to skip initialization -and arrive at the desired starting point. The trace's length can also be -limited by the \p -exit_after_tracing option. +and arrive at the desired starting point. The trace's length can be +limited in several ways: + +- The \p -max_global_trace_refs option causes the recording of trace + data to cease once the specified threshold is exceeded by the sum of + all trace references across all threads. One trace reference entry + equals one recorded address, but due to post-processing expansion a + final offline line trace will be larger. Once recording ceases, the + application will continue to run. Threads that are newly created after + the threshold is reached will not appear in the trace. +- The \p -exit_after_tracing option similarly specifies a global trace + reference count, but once it is exceeded, the process is terminated. +- The \p -max_trace_size option sets a cap on the number of bytes written + by each thread. This is a per-thread limit, and if one thread hits the + limit it does not affect the trace recoding of other threads. If the application can be modified, it can be linked with the \p drcachesim tracer and use DynamoRIO's start/stop API routines dr_app_setup_and_start() diff --git a/clients/drcachesim/tests/delay-global.templatex b/clients/drcachesim/tests/delay-global.templatex new file mode 100644 index 00000000000..76fc46ac2aa --- /dev/null +++ b/clients/drcachesim/tests/delay-global.templatex @@ -0,0 +1,11 @@ +Hit delay threshold: enabling tracing. +Hit -max_global_trace_refs: disabling tracing. +.* + Total Number Of iterations : 3 + ................................................................... +---- ---- +Basic counts tool results: +Total counts: +.* + 1 total threads +.* diff --git a/clients/drcachesim/tests/offline-max-global.templatex b/clients/drcachesim/tests/offline-max-global.templatex new file mode 100644 index 00000000000..21e6a487d38 --- /dev/null +++ b/clients/drcachesim/tests/offline-max-global.templatex @@ -0,0 +1,10 @@ +Hit delay threshold: enabling tracing. +Hit -max_global_trace_refs: disabling tracing. +.* + Total Number Of iterations : 3 + ................................................................... +Basic counts tool results: +Total counts: +.* + 1 total threads +.* diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 9995b5f7219..dbc77375f33 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -113,7 +113,7 @@ static size_t max_buf_size; static drvector_t scratch_reserve_vec; -/* thread private buffer and counter */ +/* Thread private data. This is all set to 0 at thread init. */ typedef struct { byte *seg_base; byte *buf_base; @@ -128,6 +128,8 @@ typedef struct { /* For level 0 filters */ byte *l0_dcache; byte *l0_icache; + /* For max output thresholds. */ + bool output_disabled; } per_thread_t; #define MAX_NUM_DELAY_INSTRS 32 @@ -390,6 +392,13 @@ is_ok_to_split_before(trace_type_t type) type == TRACE_TYPE_MARKER || type == TRACE_TYPE_THREAD_EXIT; } +static inline bool +is_beyond_global_max(void) +{ + return op_max_global_trace_refs.get_value() > 0 && + num_refs_racy > op_max_global_trace_refs.get_value(); +} + static void memtrace(void *drcontext, bool skip_size_cap) { @@ -413,13 +422,29 @@ memtrace(void *drcontext, bool skip_size_cap) dr_get_thread_id(drcontext)); pipe_start = data->buf_base; pipe_end = pipe_start; - if (!skip_size_cap && op_max_trace_size.get_value() > 0 && - data->bytes_written > op_max_trace_size.get_value()) { + if (!skip_size_cap && + (data->output_disabled || + (((op_max_trace_size.get_value() > 0 && + data->bytes_written > op_max_trace_size.get_value()) || + is_beyond_global_max())))) { /* We don't guarantee to match the limit exactly so we allow one buffer * beyond. We also don't put much effort into reducing overhead once * beyond the limit: we still instrument and come here. */ do_write = false; + if (!data->output_disabled) { + data->output_disabled = true; + /* std::atomic *should* be safe (we can assert std::atomic_is_lock_free()) + * but to avoid any risk we use DR's atomics and add 1. This will only + * happen once per thread so the int should never overflow (even if it does + * an extra print is not disastrous). + */ + static int notify_once; + int count = dr_atomic_add32_return_sum(¬ify_once, 1); + if (count == 1) { + NOTIFY(0, "Hit -max_global_trace_refs: disabling tracing.\n"); + } + } } else data->bytes_written += buf_ptr - pipe_start; @@ -1555,9 +1580,10 @@ event_thread_init(void *drcontext) data->seg_base = (byte *)dr_get_dr_segment_base(tls_seg); DR_ASSERT(data->seg_base != NULL); - if (should_trace_thread_cb != NULL && - !(*should_trace_thread_cb)(dr_get_thread_id(drcontext), - trace_thread_cb_user_data)) + if ((should_trace_thread_cb != NULL && + !(*should_trace_thread_cb)(dr_get_thread_id(drcontext), + trace_thread_cb_user_data)) || + is_beyond_global_max()) BUF_PTR(data->seg_base) = NULL; else { create_buffer(data); @@ -1582,7 +1608,11 @@ event_thread_exit(void *drcontext) BUF_PTR(data->seg_base) += instru->append_thread_exit( BUF_PTR(data->seg_base), dr_get_thread_id(drcontext)); - memtrace(drcontext, true); + memtrace(drcontext, + /* If this thread already wrote some data, include its exit even + * if we're over a size limit. + */ + data->bytes_written > 0); if (op_offline.get_value()) file_ops_func.close_file(data->file); @@ -1663,6 +1693,8 @@ event_exit(void) should_trace_thread_cb = nullptr; trace_thread_cb_user_data = nullptr; thread_filtering_enabled = false; + num_refs = 0; + num_refs_racy = 0; dr_mutex_destroy(mutex); drutil_exit(); @@ -1921,6 +1953,11 @@ drmemtrace_client_main(client_id_t id, int argc, const char *argv[]) NOTIFY(0, "-use_physical is unsafe with statically linked clients\n"); #endif } + + if (op_max_global_trace_refs.get_value() > 0) { + /* We need the same is-buffer-zero checks in the instrumentation. */ + thread_filtering_enabled = true; + } } /* To support statically linked multiple clients, we add drmemtrace_client_main diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 6bc49d28a14..45efd697336 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -3177,6 +3177,13 @@ if (CLIENT_INTERFACE) torunonly_drcachesim(delay-simple ${ci_shared_app} "-trace_after_instrs 20000 -exit_after_tracing 10000" "") + # We use a many-threaded test with a small max and test that we only see + # 1 thread, testing the thread ignore logic. The max should be small enough + # to not be flaky on any platform. + torunonly_drcachesim(delay-global client.annotation-concurrency + "-simulator_type basic_counts -trace_after_instrs 20K -max_global_trace_refs 10K" + "${annotation_test_args_shorter}") + # Test that "Warmup hits" and "Warmup misses" are printed out torunonly_drcachesim(warmup-valid ${ci_shared_app} "-warmup_refs 1" "") @@ -3399,6 +3406,11 @@ if (CLIENT_INTERFACE) torunonly_drcacheoff(instr-only-trace ${ci_shared_app} "-instr_only_trace" "" "") torunonly_drcacheoff(filter-and-instr-only-trace ${ci_shared_app} "-instr_only_trace -L0_filter" "" "") + # As for the online test, we check that only 1 thread is in the final trace. + torunonly_drcacheoff(max-global client.annotation-concurrency + "-trace_after_instrs 20K -max_global_trace_refs 10K" + "@-simulator_type@basic_counts" "${annotation_test_args_shorter}") + # __builtin_prefetch used in the test is not defined on MSVC. if (NOT MSVC) torunonly_drcacheoff(builtin-prefetch-basic-counts builtin_prefetch "" "@-simulator_type@basic_counts" "") From d2eb39b601dea33fc93ef96d01c9e88d0a60136b Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 7 Oct 2020 19:35:43 -0400 Subject: [PATCH 2/4] Limit notification to global max --- clients/drcachesim/tracer/tracer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index dbc77375f33..2195cd6270a 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -432,7 +432,7 @@ memtrace(void *drcontext, bool skip_size_cap) * beyond the limit: we still instrument and come here. */ do_write = false; - if (!data->output_disabled) { + if (!data->output_disabled && is_beyond_global_max()) { data->output_disabled = true; /* std::atomic *should* be safe (we can assert std::atomic_is_lock_free()) * but to avoid any risk we use DR's atomics and add 1. This will only From bf8f3b279a88771397e7eae1fdeefa074709ceb0 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 8 Oct 2020 14:50:18 -0400 Subject: [PATCH 3/4] Address reviewer requests: switch to atomic load check for 0; add helper for existing trace_max checks; rename helper; add regex on instr count to test; augment reattach comment --- clients/drcachesim/common/options.cpp | 5 ++- .../drcachesim/tests/delay-global.templatex | 1 + clients/drcachesim/tracer/tracer.cpp | 39 ++++++++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index b217c7b647c..bade5eb5135 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -228,8 +228,9 @@ droption_t op_max_trace_size( droption_t op_max_global_trace_refs( DROPTION_SCOPE_CLIENT, "max_global_trace_refs", 0, - "Cap on the total references traced", - "If non-zero, this sets a maximum size on the amount of trace references recorded. " + "Cap on the total references of any type traced", + "If non-zero, this sets a maximum size on the amount of trace entry references " + "(of any type: instructions, loads, stores, markers, etc.) recorded. " "Once reached, instrumented execution continues, but no further data is recorded. " "This is similar to -exit_after_tracing but without terminating the process." "The reference count is approximate."); diff --git a/clients/drcachesim/tests/delay-global.templatex b/clients/drcachesim/tests/delay-global.templatex index 76fc46ac2aa..0c788f061a8 100644 --- a/clients/drcachesim/tests/delay-global.templatex +++ b/clients/drcachesim/tests/delay-global.templatex @@ -6,6 +6,7 @@ Hit -max_global_trace_refs: disabling tracing. ---- ---- Basic counts tool results: Total counts: + 1.... total \(fetched\) instructions .* 1 total threads .* diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 2195cd6270a..2b2badd59aa 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -128,8 +128,6 @@ typedef struct { /* For level 0 filters */ byte *l0_dcache; byte *l0_icache; - /* For max output thresholds. */ - bool output_disabled; } per_thread_t; #define MAX_NUM_DELAY_INSTRS 32 @@ -393,12 +391,19 @@ is_ok_to_split_before(trace_type_t type) } static inline bool -is_beyond_global_max(void) +is_num_refs_beyond_global_max(void) { return op_max_global_trace_refs.get_value() > 0 && num_refs_racy > op_max_global_trace_refs.get_value(); } +static inline bool +is_bytes_written_beyond_trace_max(per_thread_t *data) +{ + return op_max_trace_size.get_value() > 0 && + data->bytes_written > op_max_trace_size.get_value(); +} + static void memtrace(void *drcontext, bool skip_size_cap) { @@ -423,26 +428,23 @@ memtrace(void *drcontext, bool skip_size_cap) pipe_start = data->buf_base; pipe_end = pipe_start; if (!skip_size_cap && - (data->output_disabled || - (((op_max_trace_size.get_value() > 0 && - data->bytes_written > op_max_trace_size.get_value()) || - is_beyond_global_max())))) { + (is_bytes_written_beyond_trace_max(data) || is_num_refs_beyond_global_max())) { /* We don't guarantee to match the limit exactly so we allow one buffer * beyond. We also don't put much effort into reducing overhead once * beyond the limit: we still instrument and come here. */ do_write = false; - if (!data->output_disabled && is_beyond_global_max()) { - data->output_disabled = true; + if (is_num_refs_beyond_global_max()) { /* std::atomic *should* be safe (we can assert std::atomic_is_lock_free()) - * but to avoid any risk we use DR's atomics and add 1. This will only - * happen once per thread so the int should never overflow (even if it does - * an extra print is not disastrous). + * but to avoid any risk we use DR's atomics. The store cannot be moved + * above the load due to the acquire-release semantics. */ static int notify_once; - int count = dr_atomic_add32_return_sum(¬ify_once, 1); - if (count == 1) { - NOTIFY(0, "Hit -max_global_trace_refs: disabling tracing.\n"); + if (dr_atomic_load32(¬ify_once) == 0) { + int count = dr_atomic_add32_return_sum(¬ify_once, 1); + if (count == 1) { + NOTIFY(0, "Hit -max_global_trace_refs: disabling tracing.\n"); + } } } } else @@ -1583,7 +1585,7 @@ event_thread_init(void *drcontext) if ((should_trace_thread_cb != NULL && !(*should_trace_thread_cb)(dr_get_thread_id(drcontext), trace_thread_cb_user_data)) || - is_beyond_global_max()) + is_num_refs_beyond_global_max()) BUF_PTR(data->seg_base) = NULL; else { create_buffer(data); @@ -1600,8 +1602,7 @@ event_thread_exit(void *drcontext) /* This thread was *not* filtered out. */ /* let the simulator know this thread has exited */ - if (op_max_trace_size.get_value() > 0 && - data->bytes_written > op_max_trace_size.get_value()) { + if (is_bytes_written_beyond_trace_max(data)) { // If over the limit, we still want to write the footer, but nothing else. BUF_PTR(data->seg_base) = data->buf_base + buf_hdr_slots_size; } @@ -1686,7 +1687,7 @@ event_exit(void) DR_ASSERT(false); dr_unregister_exit_event(event_exit); - /* Clear callbacks to support reset. */ + /* Clear callbacks and globals to support re-attach when linked statically. */ file_ops_func = file_ops_func_t(); if (!offline_instru_t::custom_module_data(nullptr, nullptr, nullptr)) DR_ASSERT(false && "failed to clear custom module callbacks"); From dbe36a70f9c299df11d5d8d1581f3bfdddc78ae7 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 8 Oct 2020 15:46:15 -0400 Subject: [PATCH 4/4] Revert flaky test regex; add reset of do-once var on detach --- clients/drcachesim/tests/delay-global.templatex | 1 - clients/drcachesim/tracer/tracer.cpp | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/clients/drcachesim/tests/delay-global.templatex b/clients/drcachesim/tests/delay-global.templatex index 0c788f061a8..76fc46ac2aa 100644 --- a/clients/drcachesim/tests/delay-global.templatex +++ b/clients/drcachesim/tests/delay-global.templatex @@ -6,7 +6,6 @@ Hit -max_global_trace_refs: disabling tracing. ---- ---- Basic counts tool results: Total counts: - 1.... total \(fetched\) instructions .* 1 total threads .* diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 2b2badd59aa..30251cf04d1 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -96,6 +96,7 @@ static char logsubdir[MAXIMUM_PATH]; static char subdir_prefix[MAXIMUM_PATH]; /* Holds op_subdir_prefix. */ static file_t module_file; static file_t funclist_file = INVALID_FILE; +static int notify_beyond_global_max_once; /* Max number of entries a buffer can have. It should be big enough * to hold all entries between clean calls. @@ -436,12 +437,10 @@ memtrace(void *drcontext, bool skip_size_cap) do_write = false; if (is_num_refs_beyond_global_max()) { /* std::atomic *should* be safe (we can assert std::atomic_is_lock_free()) - * but to avoid any risk we use DR's atomics. The store cannot be moved - * above the load due to the acquire-release semantics. + * but to avoid any risk we use DR's atomics. */ - static int notify_once; - if (dr_atomic_load32(¬ify_once) == 0) { - int count = dr_atomic_add32_return_sum(¬ify_once, 1); + if (dr_atomic_load32(¬ify_beyond_global_max_once) == 0) { + int count = dr_atomic_add32_return_sum(¬ify_beyond_global_max_once, 1); if (count == 1) { NOTIFY(0, "Hit -max_global_trace_refs: disabling tracing.\n"); } @@ -1696,6 +1695,7 @@ event_exit(void) thread_filtering_enabled = false; num_refs = 0; num_refs_racy = 0; + notify_beyond_global_max_once = 0; dr_mutex_destroy(mutex); drutil_exit();