From 92753240c3386f5aec5a2844d18074084613a0d7 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Tue, 30 Aug 2022 00:40:34 -0500 Subject: [PATCH] Significant overhaul to improve sampling + more - created thread_info struct for mapping different thread IDs - reorganized many files - moved api.hpp and api.cpp - updated CMake in libomnitrace - added categories.hpp - added concepts.hpp - moved around name definitions - moved all omnitrace components into omnitrace::component namespace - there was a lot of inconsistency b/t using tim::component in some places and omnitrace::component - added macros like OMNITRACE_DECLARE_COMPONENT in lieu of TIMEMORY_DECLARE_COMPONENT - OMNITRACE_CRITICAL_TRACE_NUM_THREADS -> OMNITRACE_THREAD_POOL_SIZE - roctracer and critical_trace use same thread pool - critical_trace functions do not lock anymore bc of thread-local TaskGroup - added component::local_category_region to support using component::category_region without explicitly passing in name - removed component::omnitrace - removed component::user_region - removed component::functors - migrated Kokkos to use component::local_category_region - migrated OMPT to use component::local_category_region - migrated omnitrace_{push,pop}_{trace,region}_hidden to use component::category_region - migrated some ppdefs - api::omnitrace -> project::omnitrace - improved recording the execution time of threads - migrated this functionality out of pthread_create_gotcha and into thread_info - moved mpi_gotcha, fork_gotcha, exit_gotcha, rcclp into omnitrace::component namespace - split backtrace up into backtrace, backtrace_metrics, backtrace_timestamp components - sampling.cpp handles setup and post-processing that was formerly in backtrace - updated logging to use colors - OMNITRACE_COLORIZED_LOG config variable - updated docs on JSON output from timemory - instrumentation info in instrumentation subfolder - added testing for KokkosP entries - added testing for ompt entries - add_critical_trace 
function defined in critical_trace.hpp - disable push_thread_state and pop_thread_state when thread state is Disabled or Completed - add comp::page_rss to main bundle - thread_data supports std::optional instead of std::unique_ptr - thread_data supports tim::identity to avoid unique_ptr or optional - tracing::record_thread_start_time() - tracing::push_timemory and tracing::pop_timemory are templated on CategoryT - removed anonymous namespace from omnitrace::utility - sampling backtrace stores instruction pointers instead of strings - component::category_region updates - handle disabled thread state - handle finalized state - fewer debug messages - invoke thread_init() - invoke thread_init_sampling() - handle push/pop count based on category - push/pop count only modified when used - component::cpu_freq - components/ensure_storage.hpp - reworked the pthread_create replacement function - updated parallel-overhead example to report # of times locked - OMNITRACE_MAX_UNWIND_DEPTH build option - update timemory submodule --- .cmake-format.yaml | 12 - CMakeLists.txt | 10 + examples/openmp/CMakeLists.txt | 9 + .../parallel-overhead/parallel-overhead.cpp | 8 +- external/timemory | 2 +- source/bin/omnitrace-avail/info_type.cpp | 4 +- .../critical-trace.cpp | 7 +- source/bin/omnitrace/omnitrace.cpp | 4 +- source/bin/tests/CMakeLists.txt | 7 +- source/docs/output.md | 306 +++++-- source/docs/runtime.md | 10 +- source/lib/common/defines.h.in | 22 + source/lib/omnitrace/CMakeLists.txt | 36 +- source/lib/omnitrace/{library => }/api.cpp | 2 +- source/lib/omnitrace/{library => }/api.hpp | 0 source/lib/omnitrace/library.cpp | 321 ++------ source/lib/omnitrace/library.hpp | 128 --- source/lib/omnitrace/library/CMakeLists.txt | 12 +- source/lib/omnitrace/library/categories.hpp | 168 ++++ source/lib/omnitrace/library/common.hpp | 77 +- .../library/components/CMakeLists.txt | 32 +- .../library/components/backtrace.cpp | 768 ++---------------- .../library/components/backtrace.hpp | 69 +- 
.../library/components/backtrace_metrics.cpp | 316 +++++++ .../library/components/backtrace_metrics.hpp | 119 +++ ...ser_region.cpp => backtrace_timestamp.cpp} | 28 +- ...ser_region.hpp => backtrace_timestamp.hpp} | 54 +- .../library/components/category_region.hpp | 166 +++- .../library/components/comm_data.cpp | 10 +- .../library/components/comm_data.hpp | 16 +- .../omnitrace/library/components/cpu_freq.cpp | 221 +++++ .../omnitrace/library/components/cpu_freq.hpp | 113 +++ .../library/components/ensure_storage.hpp | 69 ++ .../library/components/exit_gotcha.cpp | 36 +- .../library/components/exit_gotcha.hpp | 8 +- .../library/components/fork_gotcha.cpp | 7 +- .../library/components/fork_gotcha.hpp | 6 +- .../omnitrace/library/components/functors.hpp | 177 ---- .../lib/omnitrace/library/components/fwd.hpp | 211 ++--- .../library/components/mpi_gotcha.cpp | 27 +- .../library/components/mpi_gotcha.hpp | 6 +- .../library/components/omnitrace.cpp | 50 -- .../components/pthread_create_gotcha.cpp | 141 ++-- .../components/pthread_create_gotcha.hpp | 17 +- .../library/components/pthread_gotcha.cpp | 31 +- .../components/pthread_mutex_gotcha.cpp | 89 +- .../components/pthread_mutex_gotcha.hpp | 50 +- .../omnitrace/library/components/rcclp.cpp | 10 +- .../omnitrace/library/components/rcclp.hpp | 18 +- .../library/components/rocprofiler.cpp | 163 +--- .../library/components/rocprofiler.hpp | 45 +- .../library/components/roctracer.cpp | 11 +- .../library/components/roctracer.hpp | 24 +- .../omnitrace.hpp => concepts.hpp} | 53 +- source/lib/omnitrace/library/config.cpp | 30 +- source/lib/omnitrace/library/config.hpp | 4 +- source/lib/omnitrace/library/coverage.cpp | 2 +- source/lib/omnitrace/library/cpu_freq.cpp | 165 +--- .../lib/omnitrace/library/critical_trace.cpp | 26 +- .../lib/omnitrace/library/critical_trace.hpp | 81 ++ source/lib/omnitrace/library/defines.hpp.in | 12 +- .../lib/omnitrace/library/dynamic_library.cpp | 1 - source/lib/omnitrace/library/gpu.cpp | 2 +- 
source/lib/omnitrace/library/kokkosp.cpp | 92 +-- source/lib/omnitrace/library/ompt.cpp | 4 +- source/lib/omnitrace/library/perfetto.hpp | 75 +- .../lib/omnitrace/library/process_sampler.cpp | 10 +- source/lib/omnitrace/library/ptl.cpp | 139 ++-- source/lib/omnitrace/library/ptl.hpp | 25 +- source/lib/omnitrace/library/rcclp.cpp | 14 +- source/lib/omnitrace/library/rocm.cpp | 5 +- .../library/{components => }/rocm_smi.cpp | 28 +- .../library/{components => }/rocm_smi.hpp | 8 +- source/lib/omnitrace/library/rocprofiler.cpp | 36 +- source/lib/omnitrace/library/rocprofiler.hpp | 6 +- source/lib/omnitrace/library/roctracer.cpp | 61 +- source/lib/omnitrace/library/roctracer.hpp | 5 +- source/lib/omnitrace/library/runtime.cpp | 32 +- source/lib/omnitrace/library/runtime.hpp | 6 +- source/lib/omnitrace/library/sampling.cpp | 561 ++++++++++++- source/lib/omnitrace/library/sampling.hpp | 25 +- source/lib/omnitrace/library/state.cpp | 2 +- source/lib/omnitrace/library/state.hpp | 4 +- source/lib/omnitrace/library/thread_data.cpp | 4 +- source/lib/omnitrace/library/thread_data.hpp | 236 +++++- source/lib/omnitrace/library/thread_info.cpp | 211 +++++ source/lib/omnitrace/library/thread_info.hpp | 104 +++ source/lib/omnitrace/library/timemory.hpp | 11 +- source/lib/omnitrace/library/tracing.cpp | 8 + source/lib/omnitrace/library/tracing.hpp | 87 +- source/lib/omnitrace/library/utility.hpp | 3 - source/python/libpyomnitrace.cpp | 5 +- tests/CMakeLists.txt | 38 +- 93 files changed, 3758 insertions(+), 2716 deletions(-) rename source/lib/omnitrace/{library => }/api.cpp (99%) rename source/lib/omnitrace/{library => }/api.hpp (100%) delete mode 100644 source/lib/omnitrace/library.hpp create mode 100644 source/lib/omnitrace/library/categories.hpp create mode 100644 source/lib/omnitrace/library/components/backtrace_metrics.cpp create mode 100644 source/lib/omnitrace/library/components/backtrace_metrics.hpp rename source/lib/omnitrace/library/components/{user_region.cpp => 
backtrace_timestamp.cpp} (66%) rename source/lib/omnitrace/library/components/{user_region.hpp => backtrace_timestamp.hpp} (53%) create mode 100644 source/lib/omnitrace/library/components/cpu_freq.cpp create mode 100644 source/lib/omnitrace/library/components/cpu_freq.hpp create mode 100644 source/lib/omnitrace/library/components/ensure_storage.hpp delete mode 100644 source/lib/omnitrace/library/components/functors.hpp delete mode 100644 source/lib/omnitrace/library/components/omnitrace.cpp rename source/lib/omnitrace/library/{components/omnitrace.hpp => concepts.hpp} (61%) rename source/lib/omnitrace/library/{components => }/rocm_smi.cpp (95%) rename source/lib/omnitrace/library/{components => }/rocm_smi.hpp (97%) create mode 100644 source/lib/omnitrace/library/thread_info.cpp create mode 100644 source/lib/omnitrace/library/thread_info.hpp diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 2b4cef573..ed1370b64 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -180,18 +180,6 @@ parse: PATHS: '*' PATH_SUFFIXES: '*' DOC: '*' - omnitrace_add_child_library: - flags: - - SHARED - - STATIC - - OBJECT - - MODULE - - EXCLUDE_FROM_ALL - omnitrace_target_sources: - kwargs: - PUBLIC: '*' - PRIVATE: '*' - INTERFACE: '*' override_spec: {} vartags: [] proptags: [] diff --git a/CMakeLists.txt b/CMakeLists.txt index cb5efc4ac..3f061bf20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,16 @@ omnitrace_add_feature( OMNITRACE_MAX_THREADS "Maximum number of total threads supported in the host application (default: max of 128 or 16 * nproc)" ) +set(OMNITRACE_MAX_UNWIND_DEPTH + "64" + CACHE + STRING + "Maximum call-stack depth to search during call-stack unwinding. Decreasing this value will result in sampling consuming less memory" + ) +omnitrace_add_feature( + OMNITRACE_MAX_UNWIND_DEPTH + "Maximum call-stack depth to search during call-stack unwinding. 
Decreasing this value will result in sampling consuming less memory" + ) # default visibility settings set(CMAKE_C_VISIBILITY_PRESET diff --git a/examples/openmp/CMakeLists.txt b/examples/openmp/CMakeLists.txt index ea6a7a6d6..3f61748b7 100644 --- a/examples/openmp/CMakeLists.txt +++ b/examples/openmp/CMakeLists.txt @@ -14,6 +14,9 @@ add_executable(openmp-lu ${CMAKE_CURRENT_SOURCE_DIR}/LU/lu.cpp if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") find_package(OpenMP REQUIRED) target_link_libraries(openmp-common PUBLIC OpenMP::OpenMP_CXX) + set(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY + ON + CACHE INTERNAL "Used by omnitrace testing" FORCE) else() find_program(CLANGXX_EXECUTABLE NAMES clang++) find_library( @@ -27,9 +30,15 @@ else() omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-common) omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-cg) omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-lu) + set(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY + ON + CACHE INTERNAL "Used by omnitrace testing" FORCE) else() find_package(OpenMP REQUIRED) target_link_libraries(openmp-common PUBLIC OpenMP::OpenMP_CXX) + set(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY + OFF + CACHE INTERNAL "Used by omnitrace testing" FORCE) endif() endif() diff --git a/examples/parallel-overhead/parallel-overhead.cpp b/examples/parallel-overhead/parallel-overhead.cpp index c5bf98a9f..1eca62b4a 100644 --- a/examples/parallel-overhead/parallel-overhead.cpp +++ b/examples/parallel-overhead/parallel-overhead.cpp @@ -14,11 +14,13 @@ #if USE_LOCKS > 0 # include -using auto_lock_t = std::unique_lock; -long total = 0; +using auto_lock_t = std::unique_lock; +long total = 0; +long lock_count = 0; std::mutex mtx{}; #else std::atomic total{ 0 }; +long lock_count = 0; #endif long @@ -52,6 +54,7 @@ run(size_t nitr, long n) auto _v = fib(_get_n()); auto_lock_t _lk{ mtx }; total += _v; + ++lock_count; } #else long local = 0; @@ -110,6 +113,7 @@ main(int argc, char** argv) 
printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread, static_cast(total)); + printf("[%s] number of mutex locks = %li\n", _name.c_str(), lock_count); return 0; } diff --git a/external/timemory b/external/timemory index 48f4735fb..45172e95c 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 48f4735fb7c8c452088c1103936423c4727e4884 +Subproject commit 45172e95c9f401a579aeb77123cd570357d6acd1 diff --git a/source/bin/omnitrace-avail/info_type.cpp b/source/bin/omnitrace-avail/info_type.cpp index 07d6830a2..49a85d065 100644 --- a/source/bin/omnitrace-avail/info_type.cpp +++ b/source/bin/omnitrace-avail/info_type.cpp @@ -24,15 +24,13 @@ #include "enumerated_list.hpp" #include "get_availability.hpp" -#include "library/api.hpp" +#include "api.hpp" #include "library/components/backtrace.hpp" #include "library/components/fork_gotcha.hpp" #include "library/components/mpi_gotcha.hpp" -#include "library/components/omnitrace.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/components/rocprofiler.hpp" #include "library/components/roctracer.hpp" -#include "library/components/user_region.hpp" #include #include diff --git a/source/bin/omnitrace-critical-trace/critical-trace.cpp b/source/bin/omnitrace-critical-trace/critical-trace.cpp index 7c73e51c3..46e69d0b7 100644 --- a/source/bin/omnitrace-critical-trace/critical-trace.cpp +++ b/source/bin/omnitrace-critical-trace/critical-trace.cpp @@ -22,7 +22,7 @@ #include "critical-trace.hpp" -#include "library/api.hpp" +#include "api.hpp" #include "library/config.hpp" #include "library/perfetto.hpp" @@ -54,7 +54,7 @@ main(int argc, char** argv) // config::set_setting_value("OMNITRACE_CRITICAL_TRACE_DEBUG", true); config::set_setting_value("OMNITRACE_CRITICAL_TRACE_COUNT", 500); config::set_setting_value("OMNITRACE_CRITICAL_TRACE_PER_ROW", 100); - config::set_setting_value("OMNITRACE_CRITICAL_TRACE_NUM_THREADS", + 
config::set_setting_value("OMNITRACE_THREAD_POOL_SIZE", std::thread::hardware_concurrency()); config::set_setting_value("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", true); @@ -856,8 +856,7 @@ compute_critical_trace() try { - PTL::ThreadPool _tp{ get_critical_trace_num_threads(), []() { copy_hash_ids(); }, - []() {} }; + PTL::ThreadPool _tp{ get_thread_pool_size(), []() { copy_hash_ids(); }, []() {} }; _tp.set_verbose(-1); PTL::TaskGroup _tg{ &_tp }; diff --git a/source/bin/omnitrace/omnitrace.cpp b/source/bin/omnitrace/omnitrace.cpp index f325ea266..dde0c106f 100644 --- a/source/bin/omnitrace/omnitrace.cpp +++ b/source/bin/omnitrace/omnitrace.cpp @@ -195,7 +195,7 @@ main(int argc, char** argv) }; std::set dyninst_defs = { "TypeChecking", "SaveFPR", "DelayedParsing", - "MergeTramp" }; + "DebugParsing", "MergeTramp" }; int _argc = argc; int _cmdc = 0; @@ -1040,7 +1040,7 @@ main(int argc, char** argv) bpatch->setTypeChecking(true); bpatch->setSaveFPR(true); bpatch->setDelayedParsing(true); - bpatch->setDebugParsing(false); + bpatch->setDebugParsing(true); bpatch->setInstrStackFrames(false); bpatch->setLivenessAnalysis(false); bpatch->setBaseTrampDeletion(false); diff --git a/source/bin/tests/CMakeLists.txt b/source/bin/tests/CMakeLists.txt index 5ec702c93..27d55781d 100644 --- a/source/bin/tests/CMakeLists.txt +++ b/source/bin/tests/CMakeLists.txt @@ -174,10 +174,10 @@ omnitrace_add_bin_test( omnitrace_add_bin_test( NAME omnitrace-exe-simulate-ls-check DEPENDS omnitrace-exe-simulate-ls - COMMAND ls omnitrace-tests-output/omnitrace-exe-simulate-ls + COMMAND ls omnitrace-tests-output/omnitrace-exe-simulate-ls/instrumentation TIMEOUT 60 PASS_REGEX - ".*available-instr.json.*available-instr.txt.*available-instr.xml.*excluded-instr.json.*excluded-instr.txt.*excluded-instr.xml.*instrumented-instr.json.*instrumented-instr.txt.*instrumented-instr.xml.*overlapping-instr.json.*overlapping-instr.txt.*overlapping-instr.xml.*" + 
".*available.json.*available.txt.*available.xml.*excluded.json.*excluded.txt.*excluded.xml.*instrumented.json.*instrumented.txt.*instrumented.xml.*overlapping.json.*overlapping.txt.*overlapping.xml.*" ) omnitrace_add_bin_test( @@ -265,7 +265,8 @@ omnitrace_add_bin_test( --advanced LABELS "omnitrace-avail" TIMEOUT 45 - PASS_REGEX "ENVIRONMENT VARIABLE,[ \n]+OMNITRACE_USE_PID,[ \n]+" + PASS_REGEX + "ENVIRONMENT VARIABLE,[ \n]+OMNITRACE_THREAD_POOL_SIZE,[ \n]+OMNITRACE_USE_PID,[ \n]+" FAIL_REGEX "OMNITRACE_USE_PERFETTO") string(REPLACE "+" "\\\+" _AVAIL_CFG_PATH diff --git a/source/docs/output.md b/source/docs/output.md index f78f012a7..d8e219d0e 100644 --- a/source/docs/output.md +++ b/source/docs/output.md @@ -524,125 +524,299 @@ component explicitly sets type-traits which specify that the data is only releva |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ``` -### Timemory Flat JSON Output +### Timemory JSON Output -> ***Hint: the generation of flat JSON output is configurable via `OMNITRACE_JSON_OUTPUT`*** +> ***Hint: the generation of flat JSON output is configurable via `OMNITRACE_JSON_OUTPUT`.*** +> ***The generation of hierarchical JSON data is configurable via `OMNITRACE_TREE_OUTPUT`.*** -Timemory provides two JSON output formats. The flat JSON output files are similar to the text files: the hierarchical information -is represented by the indentation of the `"prefix"` field and the `"depth"` field. All the data entries are in a single JSON array, -e.g. the `["timemory"]["wall_clock"]["ranks"][0]["graph"][]["prefix"]` entry in the below: +Timemory represents the data within the JSON output in two forms: a flat structure and a hierarchical structure. +The flat JSON data represents the data similar to the text files: the hierarchical information +is represented by the indentation of the `"prefix"` field and the `"depth"` field. 
+The hierarchical JSON contains additional information with respect to inclusive and exclusive values; however, +its structure requires processing through recursion. This section of the JSON supports analysis +by [hatchet](https://github.com/hatchet/hatchet). +All the data entries for the flat structure are in a single JSON array. +This format is easier than the hierarchical format to write a simple Python script for post-processing. + +#### Timemory JSON Output Sample + +In the JSON below, the flat data starts at `["timemory"]["wall_clock"]["ranks"]` +and the hierarchical data starts at `["timemory"]["wall_clock"]["graph"]`. +E.g., accessing the name (prefix) of the nth entry in the flat data layout is: +`["timemory"]["wall_clock"]["ranks"][0]["graph"][<N>]["prefix"]`. When full MPI +support is enabled, the per-rank data in flat layout will be represented +as an entry in the "ranks" array; in the hierarchical data structure, +the per-rank data is represented as an entry in the "mpi" array (but "graph" +is used in lieu of "mpi" when full MPI support is enabled). +In the hierarchical layout, all data for the process is a child of a (dummy) +root node (which has the name `unknown-hash=0`). ```json { "timemory": { "wall_clock": { - "description": "Real-clock timer (i.e. wall-clock timer)", - "thread_count": 12, - "process_count": 1, "properties": { "cereal_class_version": 0, + "value": 78, "enum": "WALL_CLOCK", "id": "wall_clock", - "value": 78, "ids": [ "real_clock", "virtual_clock", "wall_clock" ] }, - "mpi_size": 0, - "num_ranks": 1, - "concurrency": 12, - "upcxx_size": 1, - "unit_value": 1000000000, - "thread_scope_only": false, "type": "wall_clock", + "description": "Real-clock timer (i.e. 
wall-clock timer)", + "unit_value": 1000000000, "unit_repr": "sec", + "thread_scope_only": false, + "thread_count": 2, + "mpi_size": 1, + "upcxx_size": 1, + "process_count": 1, + "num_ranks": 1, + "concurrency": 2, "ranks": [ { - "graph_size": 173, "rank": 0, + "graph_size": 112, "graph": [ { + "hash": 17481650134347108265, + "prefix": "|0>>> main", "depth": 0, - "stats": { - "count": 1, - "min": 13.360264917, - "sqr": 178.49667865242102, - "sum": 13.360264917, - "stddev": 0.0, - "max": 13.360264917, + "entry": { "cereal_class_version": 0, - "mean": 13.360264917 + "laps": 1, + "value": 894743517, + "accum": 894743517, + "repr_data": 0.894743517, + "repr_display": 0.894743517 }, - "prefix": "|00>>> main", - "rolling_hash": 17481650134347108265, - "entry": { - "repr_display": 13.360264917, - "value": 13360264917, - "repr_data": 13.360264917, + "stats": { "cereal_class_version": 0, - "accum": 13360264917, - "laps": 1 + "sum": 0.894743517, + "count": 1, + "min": 0.894743517, + "max": 0.894743517, + "sqr": 0.8005659612135293, + "mean": 0.894743517, + "stddev": 0.0 }, - "hash": 17481650134347108265 + "rolling_hash": 17481650134347108265 }, { + "hash": 3455444288293231339, + "prefix": "|0>>> |_read_input", "depth": 1, + "entry": { + "laps": 1, + "value": 9808, + "accum": 9808, + "repr_data": 9.808e-06, + "repr_display": 9.808e-06 + }, "stats": { + "sum": 9.808e-06, "count": 1, - "min": 10.924160502, - "max": 10.924160502, - "sum": 10.924160502, - "stddev": 0.0, - "sqr": 119.33728267345688, - "mean": 10.924160502 + "min": 9.808e-06, + "max": 9.808e-06, + "sqr": 9.6196864e-11, + "mean": 9.808e-06, + "stddev": 0.0 }, - "prefix": "|00>>> |_ompt_thread_initial", - "rolling_hash": 5142782188440775656, + "rolling_hash": 2490350348930787988 + }, + { + "hash": 8456966793631718807, + "prefix": "|0>>> |_setcoeff", + "depth": 1, "entry": { - "repr_display": 10.924160502, "laps": 1, - "accum": 10924160502, - "repr_data": 10.924160502, - "value": 10924160502 + "value": 922, + "accum": 
922, + "repr_data": 9.22e-07, + "repr_display": 9.22e-07 + }, + "stats": { + "sum": 9.22e-07, + "count": 1, + "min": 9.22e-07, + "max": 9.22e-07, + "sqr": 8.50084e-13, + "mean": 9.22e-07, + "stddev": 0.0 }, - "hash": 6107876127803219007 + "rolling_hash": 7491872854269275456 }, { - "depth": 2, + "hash": 6107876127803219007, + "prefix": "|0>>> |_ompt_thread_initial", + "depth": 1, + "entry": { + "laps": 1, + "value": 896506392, + "accum": 896506392, + "repr_data": 0.896506392, + "repr_display": 0.896506392 + }, "stats": { + "sum": 0.896506392, "count": 1, - "min": 10.923050237, - "max": 10.923050237, - "sum": 10.923050237, - "stddev": 0.0, - "sqr": 119.31302648002575, - "mean": 10.923050237 + "min": 0.896506392, + "max": 0.896506392, + "sqr": 0.8037237108968578, + "mean": 0.896506392, + "stddev": 0.0 }, - "prefix": "|00>>> |_ompt_implicit_task", - "rolling_hash": 2098840206724841601, + "rolling_hash": 5142782188440775656 + }, + { + "hash": 15402802091993617561, + "prefix": "|0>>> |_ompt_implicit_task", + "depth": 2, "entry": { - "repr_display": 10.923050237, "laps": 1, - "accum": 10923050237, - "repr_data": 10.923050237, - "value": 10923050237 + "value": 896479111, + "accum": 896479111, + "repr_data": 0.896479111, + "repr_display": 0.896479111 }, - "hash": 15402802091993617561 - }, + "stats": { + "sum": 0.896479111, + "count": 1, + "min": 0.896479111, + "max": 0.896479111, + "sqr": 0.8036747964593504, + "mean": 0.896479111, + "stddev": 0.0 + }, + "rolling_hash": 2098840206724841601 }, { "..." : "... etc. ..." 
} ] } + ], + "graph": [ + [ + { + "cereal_class_version": 0, + "node": { + "hash": 0, + "prefix": "unknown-hash=0", + "tid": [ + 0 + ], + "pid": [ + 2539175 + ], + "depth": 0, + "is_dummy": false, + "inclusive": { + "entry": { + "laps": 0, + "value": 0, + "accum": 0, + "repr_data": 0.0, + "repr_display": 0.0 + }, + "stats": { + "sum": 0.0, + "count": 0, + "min": 0.0, + "max": 0.0, + "sqr": 0.0, + "mean": 0.0, + "stddev": 0.0 + } + }, + "exclusive": { + "entry": { + "laps": 0, + "value": -894743517, + "accum": -894743517, + "repr_data": -0.894743517, + "repr_display": -0.894743517 + }, + "stats": { + "sum": 0.0, + "count": 0, + "min": 0.0, + "max": 0.0, + "sqr": 0.0, + "mean": 0.0, + "stddev": 0.0 + } + } + }, + "children": [ + { + "node": { + "hash": 17481650134347108265, + "prefix": "main", + "tid": [ + 0 + ], + "pid": [ + 2539175 + ], + "depth": 1, + "is_dummy": false, + "inclusive": { + "entry": { + "laps": 1, + "value": 894743517, + "accum": 894743517, + "repr_data": 0.894743517, + "repr_display": 0.894743517 + }, + "stats": { + "sum": 0.894743517, + "count": 1, + "min": 0.894743517, + "max": 0.894743517, + "sqr": 0.8005659612135293, + "mean": 0.894743517, + "stddev": 0.0 + } + }, + "exclusive": { + "entry": { + "laps": 1, + "value": -1773605, + "accum": -1773605, + "repr_data": -0.001773605, + "repr_display": -0.001773605 + }, + "stats": { + "sum": -0.001773605, + "count": 1, + "min": 9.22e-07, + "max": 0.896506392, + "sqr": -0.0031577497803754, + "mean": -0.001773605, + "stddev": 0.0 + } + } + }, + "children": [ + { + "..." : "... etc. ..." 
+ } + ] + } + ] + } + ] ] } } } ``` -This format is easier than the hierarchical format to write a simple Python script for post-processing, e.g.: +#### Timemory JSON Output Python Post-Processing Example ```python #!/usr/bin/env python3 @@ -708,11 +882,3 @@ This script applied to the corresponding JSON output from [Text Output Example]( [openmp-cg.inst-wall_clock.json] Found metric: wall_clock [openmp-cg.inst-wall_clock.json] Maximum value: 'conj_grad' at depth 6 was called 76x :: 10.641 sec (mean = 1.400e-01 sec) ``` - -### Timemory Hierarchical JSON Output - -> ***Hint: the generation of hierarchical JSON output is configurable via `OMNITRACE_TREE_OUTPUT`*** - -The hierarchical JSON output (extension: `.tree.json`) contains the very similar data to the flat JSON output, however, -it's structure requires processing through recursion. The main use of these files are their analysis support -by [hatchet](https://github.com/hatchet/hatchet). diff --git a/source/docs/runtime.md b/source/docs/runtime.md index 380bcd845..2c6baa9bf 100644 --- a/source/docs/runtime.md +++ b/source/docs/runtime.md @@ -191,7 +191,7 @@ OMNITRACE_CRITICAL_TRACE = false OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT = 2000 OMNITRACE_CRITICAL_TRACE_COUNT = 0 OMNITRACE_CRITICAL_TRACE_DEBUG = false -OMNITRACE_CRITICAL_TRACE_NUM_THREADS = 8 +OMNITRACE_THREAD_POOL_SIZE = 8 OMNITRACE_CRITICAL_TRACE_PER_ROW = 0 OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES = false OMNITRACE_DEBUG = false @@ -287,7 +287,7 @@ $ omnitrace-avail -S -bd | OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT | Number of critical trace records to ... | | OMNITRACE_CRITICAL_TRACE_COUNT | Number of critical trace to export (... | | OMNITRACE_CRITICAL_TRACE_DEBUG | Enable debugging for critical trace | -| OMNITRACE_CRITICAL_TRACE_NUM_THREADS | Number of threads to use when genera... | +| OMNITRACE_THREAD_POOL_SIZE | Number of threads to use when genera... | | OMNITRACE_CRITICAL_TRACE_PER_ROW | How many critical traces per row in ... 
| | OMNITRACE_CRITICAL_TRACE_SERIALIZE_N... | Include names in serialization of cr... | | OMNITRACE_DEBUG | Enable debug output | @@ -1200,21 +1200,20 @@ OMNITRACE_USE_PERFETTO = $ENABLE OMNITRACE_USE_TIMEMORY = $ENABLE OMNITRACE_USE_SAMPLING = $SAMPLE OMNITRACE_USE_PROCESS_SAMPLING = $SAMPLE -OMNITRACE_CRITICAL_TRACE = OFF # debug OMNITRACE_DEBUG = OFF OMNITRACE_VERBOSE = 1 # output fields -OMNITRACE_OUTPUT_PATH = omnitrace-example-output +OMNITRACE_OUTPUT_PATH = omnitrace-output OMNITRACE_OUTPUT_PREFIX = %tag%/ OMNITRACE_TIME_OUTPUT = OFF OMNITRACE_USE_PID = OFF # timemory fields OMNITRACE_PAPI_EVENTS = PAPI_TOT_INS PAPI_FP_INS -OMNITRACE_TIMEMORY_COMPONENTS = wall_clock trip_count +OMNITRACE_TIMEMORY_COMPONENTS = wall_clock peak_rss trip_count OMNITRACE_MEMORY_UNITS = MB OMNITRACE_TIMING_UNITS = sec @@ -1226,7 +1225,6 @@ OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES # misc env variables (see metadata JSON file after run) $env:OMNITRACE_SAMPLING_KEEP_DYNINST_SUFFIX = OFF -$env:OMNITRACE_SAMPLING_KEEP_INTERNAL = OFF ``` ### Sample JSON Configuration File diff --git a/source/lib/common/defines.h.in b/source/lib/common/defines.h.in index f9235a41d..8f52ec097 100644 --- a/source/lib/common/defines.h.in +++ b/source/lib/common/defines.h.in @@ -45,6 +45,28 @@ ((10000 * OMNITRACE_HIP_VERSION_MAJOR) + (100 * OMNITRACE_HIP_VERSION_MINOR) + \ OMNITRACE_HIP_VERSION_PATCH) +// clang-format off +#if !defined(OMNITRACE_MAX_THREADS) +# define OMNITRACE_MAX_THREADS @OMNITRACE_MAX_THREADS@ +#endif + +#if !defined(OMNITRACE_MAX_UNWIND_DEPTH) +# define OMNITRACE_MAX_UNWIND_DEPTH @OMNITRACE_MAX_UNWIND_DEPTH@ +#endif +// clang-format on + +#if !defined(OMNITRACE_MAX_COUNTERS) +# define OMNITRACE_MAX_COUNTERS 25 +#endif + +#if !defined(OMNITRACE_ROCM_LOOK_AHEAD) +# define OMNITRACE_ROCM_LOOK_AHEAD 128 +#endif + +#if !defined(OMNITRACE_MAX_ROCM_QUEUES) +# define OMNITRACE_MAX_ROCM_QUEUES OMNITRACE_MAX_THREADS +#endif + #define OMNITRACE_ATTRIBUTE(...) 
__attribute__((__VA_ARGS__)) #define OMNITRACE_VISIBILITY(MODE) OMNITRACE_ATTRIBUTE(visibility(MODE)) #define OMNITRACE_PUBLIC_API OMNITRACE_VISIBILITY("default") diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index 17260b9cc..492bf2705 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -53,38 +53,10 @@ target_link_libraries( add_library(omnitrace-object-library OBJECT) add_library(omnitrace::omnitrace-object-library ALIAS omnitrace-object-library) -target_sources(omnitrace-object-library PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library.cpp - ${CMAKE_CURRENT_LIST_DIR}/library.hpp) - -function(OMNITRACE_ADD_CHILD_LIBRARY _LIBRARY _TYPE) - add_library(${_LIBRARY} ${_TYPE}) - add_library(omnitrace::${_LIBRARY} ALIAS ${_LIBRARY}) - - if(NOT "${ARGN}" STREQUAL "") - target_sources(${_LIBRARY} ${ARGN}) - endif() - - target_link_libraries(${_LIBRARY} PRIVATE omnitrace::omnitrace-interface-library) - - if("${_TYPE}" STREQUAL "OBJECT") - target_sources(omnitrace-object-library - PUBLIC $>) - endif() - - target_link_libraries(omnitrace-object-library - PUBLIC $) -endfunction() - -function(OMNITRACE_TARGET_SOURCES _LIBRARY _MODE) - set(_MODE PUBLIC) - foreach(_FILE ${ARGN}) - if("${_FILE}" MATCHES "^(PUBLIC|PRIVATE|INTERFACE)$" AND NOT EXISTS "${_FILE}") - set(_MODE ${_FILE}) - else() - target_sources(${_LIBRARY} ${_MODE} $) - endif() - endforeach() -endfunction() +target_sources( + omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library.cpp ${CMAKE_CURRENT_LIST_DIR}/api.cpp + ${CMAKE_CURRENT_LIST_DIR}/api.hpp) add_subdirectory(library) diff --git a/source/lib/omnitrace/library/api.cpp b/source/lib/omnitrace/api.cpp similarity index 99% rename from source/lib/omnitrace/library/api.cpp rename to source/lib/omnitrace/api.cpp index b1205daae..1a436d8ea 100644 --- a/source/lib/omnitrace/library/api.cpp +++ b/source/lib/omnitrace/api.cpp @@ -20,7 +20,7 @@ // OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "library/api.hpp" +#include "api.hpp" #include "library/debug.hpp" #include diff --git a/source/lib/omnitrace/library/api.hpp b/source/lib/omnitrace/api.hpp similarity index 100% rename from source/lib/omnitrace/library/api.hpp rename to source/lib/omnitrace/api.hpp diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 7aec59d75..5b4a51136 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -20,17 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "library.hpp" +#include "api.hpp" #include "common/setup.hpp" -#include "library/api.hpp" +#include "library/components/category_region.hpp" #include "library/components/exit_gotcha.hpp" #include "library/components/fork_gotcha.hpp" -#include "library/components/functors.hpp" #include "library/components/fwd.hpp" #include "library/components/mpi_gotcha.hpp" -#include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/pthread_mutex_gotcha.hpp" #include "library/components/rocprofiler.hpp" #include "library/config.hpp" #include "library/coverage.hpp" @@ -45,9 +42,13 @@ #include "library/rocprofiler.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" +#include "library/thread_info.hpp" #include "library/timemory.hpp" #include "library/tracing.hpp" +#include +#include +#include #include #include @@ -59,29 +60,21 @@ using namespace omnitrace; //======================================================================================// -namespace -{ -struct omni_regions -{}; -struct user_regions -{}; -} // namespace - -using omni_functors = omnitrace::component::functors; -using user_functors = omnitrace::component::functors; - -TIMEMORY_INVOKE_PREINIT(omni_functors) -TIMEMORY_INVOKE_PREINIT(user_functors) - 
-//======================================================================================// - namespace { auto ensure_finalization(bool _static_init = false) { - (void) threading::get_id(); - (void) utility::get_thread_index(); + const auto& _info = thread_info::init(); + auto _tid = _info->index_data; + OMNITRACE_CI_THROW(_tid->internal_value != threading::get_id(), + "Error! internal tid != %li :: %li", threading::get_id(), + _tid->internal_value); + OMNITRACE_CI_THROW(_tid->system_value != threading::get_sys_tid(), + "Error! system tid != %li :: %li", threading::get_sys_tid(), + _tid->system_value); + + if(!get_env("OMNITRACE_COLORIZED_LOG", true)) tim::log::colorized() = false; if(!_static_init) { @@ -119,105 +112,13 @@ using Phase = critical_trace::Phase; extern "C" void omnitrace_push_trace_hidden(const char* name) { - ++tracing::push_count(); - - // unconditionally return if finalized - if(get_state() == State::Finalized) - { - OMNITRACE_CONDITIONAL_BASIC_PRINT( - tracing::debug_push, "omnitrace_push_trace(%s) called during finalization\n", - name); - return; - } - - OMNITRACE_CONDITIONAL_BASIC_PRINT(tracing::debug_push, "omnitrace_push_trace(%s)\n", - name); - - // the expectation here is that if the state is not active then the call - // to omnitrace_init_tooling_hidden will activate all the appropriate - // tooling one time and as it exits set it to active and return true. - if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) - { - static auto _debug = get_debug_env() || get_debug_init(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug, "omnitrace_push_trace(%s) ignored :: not active. 
state = %s\n", name, - std::to_string(get_state()).c_str()); - return; - } - - OMNITRACE_DEBUG("omnitrace_push_trace(%s)\n", name); - - static auto _sample_rate = std::max(get_instrumentation_interval(), 1); - static thread_local size_t _sample_idx = 0; - auto& _interval = tracing::get_interval_data(); - auto _enabled = (_sample_idx++ % _sample_rate == 0); - - _interval->emplace_back(_enabled); - if(_enabled) omni_functors::start(name); - if(get_use_critical_trace()) - { - uint64_t _cid = 0; - uint64_t _parent_cid = 0; - uint32_t _depth = 0; - std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); - auto _ts = comp::wall_clock::record(); - add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0, - critical_trace::add_hash_id(name), _depth); - } + component::category_region::start(name); } -//======================================================================================// -/// -/// -/// -//======================================================================================// - extern "C" void omnitrace_pop_trace_hidden(const char* name) { - ++tracing::pop_count(); - - OMNITRACE_CONDITIONAL_BASIC_PRINT(tracing::debug_pop, "omnitrace_pop_trace(%s)\n", - name); - - // only execute when active - if(get_state() == State::Active) - { - OMNITRACE_DEBUG("omnitrace_pop_trace(%s)\n", name); - - auto& _interval_data = tracing::get_interval_data(); - if(!_interval_data->empty()) - { - if(_interval_data->back()) omni_functors::stop(name); - _interval_data->pop_back(); - } - - if(get_use_critical_trace()) - { - if(get_cpu_cid_stack() && !get_cpu_cid_stack()->empty()) - { - auto _cid = get_cpu_cid_stack()->back(); - if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end()) - { - uint64_t _parent_cid = 0; - uint32_t _depth = 0; - auto _ts = comp::wall_clock::record(); - std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid); - add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0, - 
critical_trace::add_hash_id(name), _depth); - } - } - } - } - else - { - static auto _debug = get_debug_env(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug, "omnitrace_pop_trace(%s) ignored :: state = %s\n", name, - std::to_string(get_state()).c_str()); - } + component::category_region::stop(name); } //======================================================================================// @@ -229,52 +130,13 @@ omnitrace_pop_trace_hidden(const char* name) extern "C" void omnitrace_push_region_hidden(const char* name) { - // unconditionally return if finalized - if(get_state() == State::Finalized) - { - OMNITRACE_CONDITIONAL_BASIC_PRINT( - tracing::debug_user, "omnitrace_push_region(%s) called during finalization\n", - name); - return; - } - - OMNITRACE_CONDITIONAL_BASIC_PRINT(tracing::debug_push, "omnitrace_push_region(%s)\n", - name); - - // the expectation here is that if the state is not active then the call - // to omnitrace_init_tooling_hidden will activate all the appropriate - // tooling one time and as it exits set it to active and return true. - if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) - { - static auto _debug = get_debug_env() || get_debug_init(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug, "omnitrace_push_region(%s) ignored :: not active. 
state = %s\n", name, - std::to_string(get_state()).c_str()); - return; - } - - OMNITRACE_DEBUG("omnitrace_push_region(%s)\n", name); - user_functors::start(name); + component::category_region::start(name); } -//======================================================================================// - extern "C" void omnitrace_pop_region_hidden(const char* name) { - // only execute when active - if(get_state() == State::Active) - { - OMNITRACE_DEBUG("omnitrace_pop_region(%s)\n", name); - user_functors::stop(name); - } - else - { - static auto _debug = get_debug_env(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug, "omnitrace_pop_region(%s) ignored :: state = %s\n", name, - std::to_string(get_state()).c_str()); - } + component::category_region::stop(name); } //======================================================================================// @@ -393,7 +255,7 @@ omnitrace_init_library_hidden() "glibc's backtrace() occurs...\n"); { std::stringstream _ss{}; - tim::print_backtrace<16>(_ss); + timemory_print_backtrace<16>(_ss); (void) _ss; } @@ -471,7 +333,6 @@ omnitrace_init_tooling_hidden() auto _dtor = scope::destructor{ []() { // if set to finalized, don't continue if(get_state() > State::Active) return; - if(config::get_trace_thread_locks()) pthread_mutex_gotcha::validate(); if(get_use_process_sampling()) { pthread_gotcha::push_enable_sampling_on_child_threads(false); @@ -527,7 +388,7 @@ omnitrace_init_tooling_hidden() } else { - tim::trait::runtime_enabled::set(false); + tim::trait::runtime_enabled::set(false); } } @@ -581,67 +442,6 @@ omnitrace_init_tooling_hidden() perfetto::TrackEvent::Register(); } - auto _exe = get_exe_name(); - - if(get_use_perfetto() && get_use_timemory()) - { - omni_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_perfetto(category::host{}, name); - tracing::push_timemory(name); - tracing::thread_init_sampling(); - }, - [](const char* name) { - tracing::pop_timemory(name); - 
tracing::pop_perfetto(category::host{}, name); - }); - user_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_perfetto(category::user{}, name); - tracing::push_timemory(name); - }, - [](const char* name) { - tracing::pop_timemory(name); - tracing::pop_perfetto(category::user{}, name); - }); - } - else if(get_use_perfetto()) - { - omni_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_perfetto(category::host{}, name); - tracing::thread_init_sampling(); - }, - [](const char* name) { tracing::pop_perfetto(category::host{}, name); }); - user_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_perfetto(category::user{}, name); - tracing::thread_init_sampling(); - }, - [](const char* name) { tracing::pop_perfetto(category::user{}, name); }); - } - else if(get_use_timemory()) - { - omni_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_timemory(name); - tracing::thread_init_sampling(); - }, - [](const char* name) { tracing::pop_timemory(name); }); - user_functors::configure( - [](const char* name) { - tracing::thread_init(); - tracing::push_timemory(name); - tracing::thread_init_sampling(); - }, - [](const char* name) { tracing::pop_timemory(name); }); - } - if(get_use_ompt()) { OMNITRACE_VERBOSE_F(1, "Setting up OMPT...\n"); @@ -683,8 +483,6 @@ omnitrace_init_tooling_hidden() if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n"); - pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record(); - return true; } @@ -761,8 +559,6 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a { get_gotcha_bundle()->start(); } - - pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record(); } //======================================================================================// @@ -784,7 +580,7 @@ omnitrace_finalize_hidden(void) } OMNITRACE_VERBOSE_F(0, 
"finalizing...\n"); - pthread_create_gotcha::get_execution_time()->second = comp::wall_clock::record(); + thread_info::set_stop(comp::wall_clock::record()); // some functions called during finalization may alter the push/pop count so we need // to save them here @@ -803,9 +599,6 @@ omnitrace_finalize_hidden(void) set_state(State::Finalized); - omni_functors::configure([](const char*) {}, [](const char*) {}); - user_functors::configure([](const char*) {}, [](const char*) {}); - pthread_gotcha::push_enable_sampling_on_child_threads(false); pthread_gotcha::set_sampling_on_all_future_threads(false); @@ -847,6 +640,13 @@ omnitrace_finalize_hidden(void) } } + // stop the main bundle which shuts down the pthread gotchas + if(get_main_bundle()) + { + OMNITRACE_DEBUG_F("Stopping main bundle...\n"); + get_main_bundle()->stop(); + } + if(get_use_rcclp()) { OMNITRACE_VERBOSE_F(1, "Shutting down RCCLP...\n"); @@ -862,14 +662,22 @@ omnitrace_finalize_hidden(void) OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); for(size_t i = 0; i < max_supported_threads; ++i) { - auto& itr = instrumentation_bundles::instances().at(i); + auto& itr = instrumentation_bundles::instances().at(i); + const auto& _info = thread_info::get(i, InternalTID); while(!itr.bundles.empty()) { - OMNITRACE_VERBOSE_F(1, + int _lvl = 1; + if(_info->is_offset) + { + ++_pop_count; + _lvl = 4; + } + OMNITRACE_VERBOSE_F(_lvl, "Warning! 
instrumentation bundle on thread %zu (TID=%li) " "with label '%s' was not stopped.\n", i, itr.bundles.back()->tid(), itr.bundles.back()->key().c_str()); + itr.bundles.back()->stop(); itr.bundles.back()->pop(); itr.allocator.destroy(itr.bundles.back()); @@ -891,12 +699,9 @@ omnitrace_finalize_hidden(void) OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n"); get_gotcha_bundle()->stop(); get_gotcha_bundle().reset(); - mpi_gotcha::shutdown(); + component::mpi_gotcha::shutdown(); } - OMNITRACE_VERBOSE_F(1, "Shutting down pthread gotchas...\n"); - pthread_gotcha::shutdown(); - if(get_use_process_sampling()) { OMNITRACE_VERBOSE_F(1, "Shutting down background sampler...\n"); @@ -923,11 +728,10 @@ omnitrace_finalize_hidden(void) if(dmp::rank() == 0) fprintf(stderr, "\n"); - OMNITRACE_DEBUG_F("Stopping main bundle...\n"); - // stop the main bundle and report the high-level metrics + OMNITRACE_VERBOSE_F(3, "Reporting the process- and thread-level metrics...\n"); + // report the high-level metrics for the process if(get_main_bundle()) { - get_main_bundle()->stop(); std::string _msg = JOIN("", *get_main_bundle()); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); @@ -940,7 +744,6 @@ omnitrace_finalize_hidden(void) // if they are still running (e.g. thread-pool still alive), the // thread-specific data will be wrong if try to stop them from // the main thread. 
- OMNITRACE_VERBOSE_F(3, "Destroying thread bundle data...\n"); for(auto& itr : thread_data::instances()) { if(itr && itr->get() && @@ -957,19 +760,16 @@ omnitrace_finalize_hidden(void) if(get_use_sampling()) { OMNITRACE_VERBOSE_F(1, "Post-processing the sampling backtraces...\n"); - for(size_t i = 0; i < max_supported_threads; ++i) - { - sampling::backtrace::post_process(i); - sampling::get_sampler(i).reset(); - } + sampling::post_process(); } if(get_use_critical_trace() || (get_use_rocm_smi() && get_use_roctracer())) { OMNITRACE_VERBOSE_F(1, "Generating the critical trace...\n"); - // increase the thread-pool size - tasking::critical_trace::get_thread_pool().initialize_threadpool( - get_critical_trace_num_threads()); + // (potentially) increase the thread-pool size since application + // shouldn't be using threads during finalization + tasking::initialize_threadpool(std::min( + { std::thread::hardware_concurrency(), get_thread_pool_size(), 8 })); for(size_t i = 0; i < max_supported_threads; ++i) { @@ -1021,8 +821,9 @@ omnitrace_finalize_hidden(void) if(get_verbose() >= 0) fprintf(stderr, "\n"); if(get_verbose() >= 0 || get_debug()) - fprintf(stderr, "[%s][%s]|%i> Flushing perfetto...\n", TIMEMORY_PROJECT_NAME, - OMNITRACE_FUNCTION, dmp::rank()); + fprintf(stderr, "%s[%s][%s]|%i> Flushing perfetto...%s\n", + tim::log::color::info(), TIMEMORY_PROJECT_NAME, OMNITRACE_FUNCTION, + dmp::rank(), tim::log::color::end()); // Make sure the last event is closed for this example. perfetto::TrackEvent::Flush(); @@ -1066,35 +867,33 @@ omnitrace_finalize_hidden(void) if(!trace_data.empty()) { + operation::file_output_message _fom{}; // Write the trace into a file. if(get_verbose() >= 0) - fprintf(stderr, - "[%s][%s]|%i> Outputting '%s' (%.2f KB / %.2f MB / %.2f GB)... 
", - TIMEMORY_PROJECT_NAME, OMNITRACE_FUNCTION, dmp::rank(), - get_perfetto_output_filename().c_str(), - static_cast(trace_data.size()) / units::KB, - static_cast(trace_data.size()) / units::MB, - static_cast(trace_data.size()) / units::GB); + _fom(get_perfetto_output_filename(), std::string{ "perfetto" }, + " (%.2f KB / %.2f MB / %.2f GB)... ", + static_cast(trace_data.size()) / units::KB, + static_cast(trace_data.size()) / units::MB, + static_cast(trace_data.size()) / units::GB); std::ofstream ofs{}; if(!tim::filepath::open(ofs, get_perfetto_output_filename(), std::ios::out | std::ios::binary)) { - OMNITRACE_VERBOSE_F(0, "Error opening '%s'...\n", - get_perfetto_output_filename().c_str()); + _fom.append("Error opening '%s'...", + get_perfetto_output_filename().c_str()); _perfetto_output_error = true; } else { // Write the trace into a file. ofs.write(&trace_data[0], trace_data.size()); - if(get_verbose() >= 0) fprintf(stderr, "Done\n"); + if(get_verbose() >= 0) _fom.append("%s", "Done"); // NOLINT auto _manager = tim::manager::instance(); if(_manager) _manager->add_file_output("protobuf", "perfetto", get_perfetto_output_filename()); } ofs.close(); - if(get_verbose() >= 0) fprintf(stderr, "\n"); } else if(dmp::rank() == 0) { diff --git a/source/lib/omnitrace/library.hpp b/source/lib/omnitrace/library.hpp deleted file mode 100644 index eba85694c..000000000 --- a/source/lib/omnitrace/library.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// MIT License -// -// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#pragma once - -// this always needs to included first -// clang-format off -#include "library/perfetto.hpp" -// clang-format on - -#include "library/timemory.hpp" -#include "library/components/roctracer.hpp" -#include "library/api.hpp" -#include "library/components/fork_gotcha.hpp" -#include "library/components/mpi_gotcha.hpp" -#include "library/api.hpp" -#include "library/common.hpp" -#include "library/state.hpp" -#include "library/config.hpp" -#include "library/thread_data.hpp" -#include "library/ptl.hpp" -#include "library/debug.hpp" -#include "library/critical_trace.hpp" -#include "library/runtime.hpp" - -#include -#include -#include - -#include - -namespace omnitrace -{ -template -inline void -add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, - size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, int32_t _devid, - uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0) -{ - // clang-format off - // these are used to create unique type mutexes - struct critical_insert {}; - struct cpu_cid_stack {}; - // clang-format on - - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - - using tim::type_mutex; - using auto_lock_t = tim::auto_lock_t; - static constexpr auto num_mutexes = max_supported_threads; - static auto _update_freq = critical_trace::get_update_frequency(); - static auto _pid = process::get_id(); - auto _self_tid = threading::get_id(); - - if constexpr(PhaseID != critical_trace::Phase::NONE) - { - auto& _self_mtx = - type_mutex(_self_tid); - - auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; - - // unique lock per thread - if(!_self_lk.owns_lock()) _self_lk.lock(); - - auto& _critical_trace = critical_trace::get(_self_tid); - _critical_trace->emplace_back(critical_trace::entry{ - DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid, - _parent_cid, _ts_beg, _ts_val, _queue, _hash }); - } - - if constexpr(UpdateStack) - { - auto& _self_mtx = get_cpu_cid_stack_lock(_self_tid); - auto& 
_targ_mtx = get_cpu_cid_stack_lock(_targ_tid); - - auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; - auto_lock_t _targ_lk{ _targ_mtx, std::defer_lock }; - - // unique lock per thread - auto _lock = [&_self_lk, &_targ_lk, _self_tid, _targ_tid]() { - if(!_self_lk.owns_lock() && _self_tid != _targ_tid) _self_lk.lock(); - if(!_targ_lk.owns_lock()) _targ_lk.lock(); - }; - - if constexpr(PhaseID == critical_trace::Phase::NONE) - { - _lock(); - get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); - } - else if constexpr(PhaseID == critical_trace::Phase::BEGIN) - { - _lock(); - get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); - } - else if constexpr(PhaseID == critical_trace::Phase::END) - { - _lock(); - get_cpu_cid_stack(_targ_tid)->pop_back(); - if(_gpu_cid == 0 && _cpu_cid % _update_freq == (_update_freq - 1)) - critical_trace::update(_targ_tid); - } - tim::consume_parameters(_lock); - } - - tim::consume_parameters(_pid, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, - _ts_val, _devid, _queue, _hash, _depth, _prio, num_mutexes); -} -} // namespace omnitrace diff --git a/source/lib/omnitrace/library/CMakeLists.txt b/source/lib/omnitrace/library/CMakeLists.txt index 0adf225fe..098afa7f0 100644 --- a/source/lib/omnitrace/library/CMakeLists.txt +++ b/source/lib/omnitrace/library/CMakeLists.txt @@ -3,7 +3,6 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/defines.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/defines.hpp @ONLY) set(library_sources - ${CMAKE_CURRENT_LIST_DIR}/api.cpp ${CMAKE_CURRENT_LIST_DIR}/config.cpp ${CMAKE_CURRENT_LIST_DIR}/coverage.cpp ${CMAKE_CURRENT_LIST_DIR}/cpu_freq.cpp @@ -21,13 +20,15 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/sampling.cpp ${CMAKE_CURRENT_LIST_DIR}/state.cpp ${CMAKE_CURRENT_LIST_DIR}/thread_data.cpp + ${CMAKE_CURRENT_LIST_DIR}/thread_info.cpp ${CMAKE_CURRENT_LIST_DIR}/timemory.cpp ${CMAKE_CURRENT_LIST_DIR}/tracing.cpp) set(library_headers - ${CMAKE_CURRENT_LIST_DIR}/api.hpp + ${CMAKE_CURRENT_LIST_DIR}/categories.hpp 
${CMAKE_CURRENT_LIST_DIR}/config.hpp ${CMAKE_CURRENT_LIST_DIR}/common.hpp + ${CMAKE_CURRENT_LIST_DIR}/concepts.hpp ${CMAKE_CURRENT_LIST_DIR}/coverage.hpp ${CMAKE_CURRENT_LIST_DIR}/cpu_freq.hpp ${CMAKE_CURRENT_LIST_DIR}/critical_trace.hpp @@ -41,12 +42,14 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/ptl.hpp ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp + ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp ${CMAKE_CURRENT_LIST_DIR}/runtime.hpp ${CMAKE_CURRENT_LIST_DIR}/sampling.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp ${CMAKE_CURRENT_LIST_DIR}/thread_data.hpp + ${CMAKE_CURRENT_LIST_DIR}/thread_info.hpp ${CMAKE_CURRENT_LIST_DIR}/timemory.hpp ${CMAKE_CURRENT_LIST_DIR}/tracing.hpp ${CMAKE_CURRENT_LIST_DIR}/utility.hpp) @@ -75,5 +78,10 @@ if(OMNITRACE_USE_ROCPROFILER) ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp) endif() +if(OMNITRACE_USE_ROCM_SMI) + target_sources(omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp) +endif() + add_subdirectory(components) add_subdirectory(rocprofiler) diff --git a/source/lib/omnitrace/library/categories.hpp b/source/lib/omnitrace/library/categories.hpp new file mode 100644 index 000000000..2d43d11c5 --- /dev/null +++ b/source/lib/omnitrace/library/categories.hpp @@ -0,0 +1,168 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "common/join.hpp" +#include "library/defines.hpp" + +#if defined(TIMEMORY_PERFETTO_CATEGORIES) +# error "TIMEMORY_PERFETTO_CATEGORIES is already defined. Please include \"" __FILE__ "\" before including any timemory files" +#endif + +#include +#include +#include +#include + +#define OMNITRACE_DEFINE_NAME_TRAIT(NAME, ...) 
\ + namespace tim \ + { \ + namespace trait \ + { \ + template <> \ + struct perfetto_category<__VA_ARGS__> \ + { \ + static constexpr auto value = NAME; \ + }; \ + } \ + } + +#define OMNITRACE_DECLARE_CATEGORY(NS, VALUE, NAME) \ + TIMEMORY_DECLARE_NS_API(NS, VALUE) \ + OMNITRACE_DEFINE_NAME_TRAIT(NAME, NS::VALUE) +#define OMNITRACE_DEFINE_CATEGORY(NS, VALUE, NAME) \ + TIMEMORY_DEFINE_NS_API(NS, VALUE) \ + OMNITRACE_DEFINE_NAME_TRAIT(NAME, NS::VALUE) + +// these are defined by omnitrace +OMNITRACE_DEFINE_CATEGORY(project, omnitrace, "omnitrace") +OMNITRACE_DEFINE_CATEGORY(category, host, "host") +OMNITRACE_DEFINE_CATEGORY(category, user, "user") +OMNITRACE_DEFINE_CATEGORY(category, device, "device") +OMNITRACE_DEFINE_CATEGORY(category, device_hip, "device_hip") +OMNITRACE_DEFINE_CATEGORY(category, device_hsa, "device_hsa") +OMNITRACE_DEFINE_CATEGORY(category, rocm_hip, "rocm_hip") +OMNITRACE_DEFINE_CATEGORY(category, rocm_hsa, "rocm_hsa") +OMNITRACE_DEFINE_CATEGORY(category, rocm_roctx, "rocm_roctx") +OMNITRACE_DEFINE_CATEGORY(category, rocm_smi, "rocm_smi") +OMNITRACE_DEFINE_CATEGORY(category, rocm_rccl, "rccl") +OMNITRACE_DEFINE_CATEGORY(category, roctracer, "roctracer") +OMNITRACE_DEFINE_CATEGORY(category, rocprofiler, "rocprofiler") +OMNITRACE_DEFINE_CATEGORY(category, pthread, "pthread") +OMNITRACE_DEFINE_CATEGORY(category, kokkos, "kokkos") +OMNITRACE_DEFINE_CATEGORY(category, mpi, "mpi") +OMNITRACE_DEFINE_CATEGORY(category, ompt, "ompt") +OMNITRACE_DEFINE_CATEGORY(category, process_sampling, "process_sampling") +OMNITRACE_DEFINE_CATEGORY(category, critical_trace, "critical-trace") +OMNITRACE_DEFINE_CATEGORY(category, host_critical_trace, "host-critical-trace") +OMNITRACE_DEFINE_CATEGORY(category, device_critical_trace, "device-critical-trace") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_page, "process_page_fault") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_virt, "process_virtual_memory") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_peak, "process_memory_hwm") 
+OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_context_switch, "process_context_switch") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_page_fault, "process_page_fault") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_user_mode_time, "process_user_cpu_time") +OMNITRACE_DEFINE_CATEGORY(cpu_freq, cpu_kernel_mode_time, "process_kernel_cpu_time") + +OMNITRACE_DECLARE_CATEGORY(category, sampling, "sampling") + +namespace tim +{ +namespace trait +{ +template +using name = perfetto_category; +} +} // namespace tim + +#define OMNITRACE_PERFETTO_CATEGORIES \ + perfetto::Category(tim::trait::name::value) \ + .SetDescription("Host-side function tracing"), \ + perfetto::Category("user").SetDescription("User-defined regions"), \ + perfetto::Category("sampling").SetDescription("Host-side function sampling"), \ + perfetto::Category("device_hip") \ + .SetDescription("Device-side functions submitted via HIP API"), \ + perfetto::Category("device_hsa") \ + .SetDescription("Device-side functions submitted via HSA API"), \ + perfetto::Category("rocm_hip").SetDescription("Host-side HIP functions"), \ + perfetto::Category("rocm_hsa").SetDescription("Host-side HSA functions"), \ + perfetto::Category("rocm_roctx").SetDescription("Host-side ROCTX labels"), \ + perfetto::Category("device_busy") \ + .SetDescription("Busy percentage of a GPU device"), \ + perfetto::Category("device_temp") \ + .SetDescription("Temperature of GPU device in degC"), \ + perfetto::Category("device_power") \ + .SetDescription("Power consumption of GPU device in watts"), \ + perfetto::Category("device_memory_usage") \ + .SetDescription("Memory usage of GPU device in MB"), \ + perfetto::Category("thread_peak_memory") \ + .SetDescription( \ + "Peak memory usage on thread in MB (derived from sampling)"), \ + perfetto::Category("thread_context_switch") \ + .SetDescription("Context switches on thread (derived from sampling)"), \ + perfetto::Category("thread_page_fault") \ + .SetDescription("Memory page faults on thread (derived from 
sampling)"), \ + perfetto::Category("hardware_counter") \ + .SetDescription("Hardware counter value on thread (derived from sampling)"), \ + perfetto::Category("cpu_freq") \ + .SetDescription("CPU frequency in MHz (collected in background thread)"), \ + perfetto::Category("process_page_fault") \ + .SetDescription( \ + "Memory page faults in process (collected in background thread)"), \ + perfetto::Category("process_memory_hwm") \ + .SetDescription("Memory High-Water Mark i.e. peak memory usage (collected " \ + "in background thread)"), \ + perfetto::Category("process_virtual_memory") \ + .SetDescription("Virtual memory usage in process in MB (collected in " \ + "background thread)"), \ + perfetto::Category("process_context_switch") \ + .SetDescription( \ + "Context switches in process (collected in background thread)"), \ + perfetto::Category("process_page_fault") \ + .SetDescription( \ + "Memory page faults in process (collected in background thread)"), \ + perfetto::Category("process_user_cpu_time") \ + .SetDescription("CPU time of functions executing in user-space in process " \ + "in seconds (collected in background thread)"), \ + perfetto::Category("process_kernel_cpu_time") \ + .SetDescription("CPU time of functions executing in kernel-space in " \ + "process in seconds (collected in background thread)"), \ + perfetto::Category("pthread").SetDescription("Pthread functions"), \ + perfetto::Category("kokkos").SetDescription("Kokkos regions"), \ + perfetto::Category("mpi").SetDescription("MPI regions"), \ + perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \ + perfetto::Category("rccl").SetDescription( \ + "ROCm Communication Collectives Library (RCCL) regions"), \ + perfetto::Category("comm_data") \ + .SetDescription( \ + "MPI/RCCL counters for tracking amount of data sent or received"), \ + perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \ + perfetto::Category("host-critical-trace") \ + 
.SetDescription("Host-side critical traces"), \ + perfetto::Category("device-critical-trace") \ + .SetDescription("Device-side critical traces"), \ + perfetto::Category("timemory").SetDescription("Events from the timemory API") + +#if defined(TIMEMORY_USE_PERFETTO) +# define TIMEMORY_PERFETTO_CATEGORIES OMNITRACE_PERFETTO_CATEGORIES +#endif diff --git a/source/lib/omnitrace/library/common.hpp b/source/lib/omnitrace/library/common.hpp index 1cc7832e8..16405eacf 100644 --- a/source/lib/omnitrace/library/common.hpp +++ b/source/lib/omnitrace/library/common.hpp @@ -23,13 +23,19 @@ #pragma once #include "common/join.hpp" +#include "library/categories.hpp" +#include "library/concepts.hpp" #include "library/defines.hpp" #include -#include +#include #include +#include +#include +#include #include #include +#include #include #include @@ -44,20 +50,75 @@ #include #include -TIMEMORY_DEFINE_NS_API(api, omnitrace) -TIMEMORY_DEFINE_NS_API(api, sampling) -TIMEMORY_DEFINE_NS_API(api, rocm_smi) -TIMEMORY_DEFINE_NS_API(api, rccl) +#define OMNITRACE_DECLARE_COMPONENT(NAME) \ + namespace omnitrace \ + { \ + namespace component \ + { \ + struct NAME; \ + } \ + } \ + namespace tim \ + { \ + namespace trait \ + { \ + template <> \ + struct is_component : true_type \ + {}; \ + } \ + } \ + namespace tim \ + { \ + namespace component \ + { \ + using ::omnitrace::component::NAME; \ + } \ + } + +#define OMNITRACE_COMPONENT_ALIAS(NAME, ...) 
\ + namespace omnitrace \ + { \ + namespace component \ + { \ + using NAME = __VA_ARGS__; \ + } \ + } \ + namespace tim \ + { \ + namespace component \ + { \ + using ::omnitrace::component::NAME; \ + } \ + } + +#define OMNITRACE_DEFINE_CONCRETE_TRAIT(TRAIT, TYPE, VALUE) \ + namespace tim \ + { \ + namespace trait \ + { \ + template <> \ + struct TRAIT<::omnitrace::TYPE> : VALUE \ + {}; \ + } \ + } namespace omnitrace { -namespace api = ::tim::api; // NOLINT -namespace category = ::tim::category; // NOLINT -namespace filepath = ::tim::filepath; // NOLINT +namespace api = ::tim::api; // NOLINT +namespace category = ::tim::category; // NOLINT +namespace filepath = ::tim::filepath; // NOLINT +namespace project = ::tim::project; // NOLINT +namespace process = ::tim::process; // NOLINT +namespace threading = ::tim::threading; // NOLINT +namespace scope = ::tim::scope; // NOLINT +namespace policy = ::tim::policy; // NOLINT +namespace trait = ::tim::trait; // NOLINT +using ::tim::auto_lock_t; // NOLINT using ::tim::demangle; // NOLINT using ::tim::get_env; // NOLINT using ::tim::try_demangle; // NOLINT +using ::tim::type_mutex; // NOLINT } // namespace omnitrace // same sort of functionality as python's " ".join([...]) diff --git a/source/lib/omnitrace/library/components/CMakeLists.txt b/source/lib/omnitrace/library/components/CMakeLists.txt index c4c8c0ccf..3f87c8ad3 100644 --- a/source/lib/omnitrace/library/components/CMakeLists.txt +++ b/source/lib/omnitrace/library/components/CMakeLists.txt @@ -8,11 +8,9 @@ set(component_sources ${CMAKE_CURRENT_LIST_DIR}/exit_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/fork_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/omnitrace.cpp ${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.cpp - ${CMAKE_CURRENT_LIST_DIR}/user_region.cpp) + ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.cpp) set(component_headers 
${CMAKE_CURRENT_LIST_DIR}/fwd.hpp @@ -22,41 +20,29 @@ set(component_headers ${CMAKE_CURRENT_LIST_DIR}/category_region.hpp ${CMAKE_CURRENT_LIST_DIR}/comm_data.hpp ${CMAKE_CURRENT_LIST_DIR}/cpu_freq.hpp + ${CMAKE_CURRENT_LIST_DIR}/ensure_storage.hpp ${CMAKE_CURRENT_LIST_DIR}/exit_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/fork_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/functors.hpp ${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/omnitrace.hpp ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp - ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/roctracer.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/user_region.hpp) + ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp) -omnitrace_add_child_library(omnitrace-components-object-library OBJECT) - -omnitrace_target_sources(omnitrace-components-object-library PUBLIC ${component_sources} - ${component_headers}) +target_sources(omnitrace-object-library PRIVATE ${component_sources} ${component_headers}) if(OMNITRACE_USE_ROCTRACER OR OMNITRACE_USE_ROCPROFILER) - omnitrace_target_sources(omnitrace-components-object-library - PUBLIC ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp) + target_sources(omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocprofiler.cpp) endif() if(OMNITRACE_USE_ROCTRACER) - omnitrace_target_sources(omnitrace-components-object-library - PUBLIC ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp) + target_sources(omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/roctracer.cpp) endif() if(OMNITRACE_USE_RCCL) - omnitrace_target_sources(omnitrace-components-object-library - PUBLIC ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) -endif() - -if(OMNITRACE_USE_ROCM_SMI) - omnitrace_target_sources(omnitrace-components-object-library - PUBLIC ${CMAKE_CURRENT_LIST_DIR}/rocm_smi.cpp) + target_sources(omnitrace-object-library PRIVATE 
${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) endif() diff --git a/source/lib/omnitrace/library/components/backtrace.cpp b/source/lib/omnitrace/library/components/backtrace.cpp index e1b3c2a31..72fe93422 100644 --- a/source/lib/omnitrace/library/components/backtrace.cpp +++ b/source/lib/omnitrace/library/components/backtrace.cpp @@ -20,17 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "library/components/ensure_storage.hpp" #include "library/components/fwd.hpp" -#include "library/components/pthread_create_gotcha.hpp" -#include "library/components/pthread_gotcha.hpp" -#include "library/components/rocm_smi.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/perfetto.hpp" #include "library/ptl.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" -#include "library/tracing.hpp" #include #include @@ -49,8 +46,6 @@ #include #include #include -#include -#include #include #include #include @@ -73,150 +68,47 @@ #include #include -namespace tracing -{ -using namespace ::omnitrace::tracing; -} - -namespace -{ -template -struct ensure_storage -{ - TIMEMORY_DEFAULT_OBJECT(ensure_storage) - - void operator()() const { TIMEMORY_FOLD_EXPRESSION((*this)(tim::type_list{})); } - -private: - template ::value, int> = 0> - void operator()(tim::type_list) const - { - using namespace tim; - static thread_local auto _storage = operation::get_storage{}(); - static thread_local auto _tid = threading::get_id(); - static thread_local auto _dtor = - scope::destructor{ []() { operation::set_storage{}(nullptr, _tid); } }; - - tim::operation::set_storage{}(_storage, _tid); - if(_tid == 0 && !_storage) tim::trait::runtime_enabled::set(false); - } - - template ::value, long> = 0> - void operator()(tim::type_list) const - { - tim::trait::runtime_enabled::set(false); - } -}; -} // namespace - namespace omnitrace { namespace component { -using hw_counters = typename backtrace::hw_counters; -using 
signal_type_instances = thread_data, api::sampling>; -using backtrace_init_instances = thread_data; -using sampler_running_instances = thread_data; -using papi_vector_instances = thread_data; -using papi_label_instances = thread_data, api::sampling>; - -namespace -{ -struct perfetto_rusage -{}; - -unique_ptr_t>& -get_papi_labels(int64_t _tid) -{ - static auto& _v = - papi_label_instances::instances(papi_label_instances::construct_on_init{}); - return _v.at(_tid); -} - -unique_ptr_t& -get_papi_vector(int64_t _tid) -{ - static auto& _v = papi_vector_instances::instances(); - if(_tid == threading::get_id()) papi_vector_instances::construct(); - return _v.at(_tid); -} - -unique_ptr_t& -get_backtrace_init(int64_t _tid) -{ - static auto& _v = backtrace_init_instances::instances(); - return _v.at(_tid); -} - -unique_ptr_t& -get_sampler_running(int64_t _tid) -{ - static auto& _v = sampler_running_instances::instances( - sampler_running_instances::construct_on_init{}, false); - return _v.at(_tid); -} -} // namespace - -bool -backtrace::operator<(const backtrace& rhs) const -{ - return (m_ts == rhs.m_ts) ? (m_tid < rhs.m_tid) : (m_ts < rhs.m_ts); -} - -std::vector +std::vector backtrace::get() const { - std::vector _v = {}; - if(m_size == 0) return _v; - size_t _size = 0; - for(const auto* itr : m_data) - _size += (strlen(itr) > 0) ? 1 : 0; - _v.reserve(_size); - for(const auto* itr : m_data) + std::vector _v = {}; + if(size() == 0) return _v; + _v.reserve(m_data.size()); + for(const auto& itr : m_data.call_stack) { - if(strlen(itr) > 0) _v.emplace_back(itr); + if(!itr) continue; + +#if defined(OMNITRACE_CI) && OMNITRACE_CI > 0 + std::string _name = {}; + _name.reserve(1024); + const char* _addr = _name.data(); + _name = itr->get_name(m_data.context, _name); + + OMNITRACE_CONDITIONAL_PRINT( + _name.data() != _addr, + "[backtrace::get()] processing unw_get_proc_name_from_ip for '%s' " + "caused a reallocation. 
Before=%p, After=%p\n", + _name.c_str(), _addr, _name.data()); +#else + auto _name = itr->get_name(m_data.context); +#endif + + if(!_name.empty()) _v.emplace_back(_name); } + // put the bottom of the call-stack on top std::reverse(_v.begin(), _v.end()); - while(!_v.empty() && _v.back() == "funlockfile") + // + auto _known_excludes = + std::set{ "funlockfile", "killpg", "__restore_rt" }; + // remove some known functions which are by-products of interrupts + while(!_v.empty() && _known_excludes.find(_v.back()) != _known_excludes.end()) _v.pop_back(); - return _v; -} - -void -backtrace::preinit() -{ - sampling_wall_clock::label() = "sampling_wall_clock"; - sampling_wall_clock::description() = "Wall clock time (via sampling)"; - - sampling_cpu_clock::label() = "sampling_cpu_clock"; - sampling_cpu_clock::description() = "CPU clock time (via sampling)"; - sampling_percent::label() = "sampling_percent"; - sampling_percent::description() = "Percentage of samples"; - - sampling_gpu_busy::label() = "sampling_gpu_busy_percent"; - sampling_gpu_busy::description() = "Utilization of GPU(s)"; - sampling_gpu_busy::set_precision(0); - sampling_gpu_busy::set_format_flags(sampling_gpu_busy::get_format_flags() & - std::ios_base::showpoint); - - sampling_gpu_memory::label() = "sampling_gpu_memory_usage"; - sampling_gpu_memory::description() = "Memory usage of GPU(s)"; - - sampling_gpu_power::label() = "sampling_gpu_power"; - sampling_gpu_power::description() = "Power usage of GPU(s)"; - sampling_gpu_power::unit() = units::watt; - sampling_gpu_power::display_unit() = "watts"; - sampling_gpu_power::set_precision(2); - sampling_gpu_power::set_format_flags(sampling_gpu_power::get_format_flags()); - - sampling_gpu_temp::label() = "sampling_gpu_temperature"; - sampling_gpu_temp::description() = "Temperature of GPU(s)"; - sampling_gpu_temp::unit() = 1; - sampling_gpu_temp::display_unit() = "degC"; - sampling_gpu_temp::set_precision(1); - 
sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags()); + return _v; } std::string @@ -231,252 +123,9 @@ backtrace::description() return "Records backtrace data"; } -void -backtrace::start() -{} - -void -backtrace::stop() -{} - -bool -backtrace::empty() const -{ - return (m_size == 0); -} - -size_t -backtrace::size() const -{ - return m_size; -} - -uint64_t -backtrace::get_timestamp() const -{ - return m_ts; -} - -int64_t -backtrace::get_thread_cpu_timestamp() const -{ - return m_thr_cpu_ts; -} - -void -backtrace::sample(int signum) -{ - if(signum != -1 && get_state() != State::Active) - { - OMNITRACE_CONDITIONAL_PRINT( - get_debug_sampling(), - "request to sample (signal %i) ignored because omnitrace is not active\n", - signum); - return; - } - - if(get_debug_sampling()) - { - static auto _timestamp_str = [](const auto& _tp) { - char _repr[64]; - std::memset(_repr, '\0', sizeof(_repr)); - std::time_t _value = system_clock::to_time_t(_tp); - // alternative: "%c %Z" - if(std::strftime(_repr, sizeof(_repr), "%a %b %d %T %Y %Z", - std::localtime(&_value)) > 0) - return std::string{ _repr }; - return std::string{}; - }; - - static thread_local size_t _tot = 0; - static thread_local auto _last = system_clock::now(); - auto _now = system_clock::now(); - auto _diff = (_now - _last).count(); - _last = _now; - _tot += _diff; - - OMNITRACE_PRINT( - "Sample on signal %i taken at %s after interval %zu :: total %zu\n", signum, - _timestamp_str(_now).c_str(), _diff, _tot); - } - - if(!*get_sampler_running(0)) return; - - m_size = 0; - m_tid = threading::get_id(); - m_ts = comp::wall_clock::record(); - m_thr_cpu_ts = tim::get_clock_thread_now(); - auto _cache = tim::rusage_cache{ RUSAGE_THREAD }; - m_mem_peak = _cache.get_peak_rss(); - m_ctx_swch = _cache.get_num_priority_context_switch() + - _cache.get_num_voluntary_context_switch(); - m_page_flt = _cache.get_num_major_page_faults() + _cache.get_num_minor_page_faults(); - m_data = tim::get_unw_backtrace(); - 
m_size = m_data.size(); - - if constexpr(tim::trait::is_available::value) - { - if(tim::trait::runtime_enabled::get()) - { - assert(get_papi_vector(m_tid).get() != nullptr); - m_hw_counter = get_papi_vector(m_tid)->record(); - } - } -} - -std::set -backtrace::configure(bool _setup, int64_t _tid) -{ - auto& _sampler = sampling::get_sampler(_tid); - auto& _running = get_sampler_running(_tid); - bool _is_running = (!_running) ? false : *_running; - auto& _signal_types = sampling::get_signal_types(_tid); - - ensure_storage{}(); - - if(_setup && !_sampler && !_is_running) - { - (void) get_debug_sampling(); // make sure query in sampler does not allocate - assert(_tid == threading::get_id()); - - sampling::block_signals(*_signal_types); - if constexpr(tim::trait::is_available::value) - { - perfetto_counter_track::init(); - OMNITRACE_DEBUG("HW COUNTER: starting...\n"); - if(get_papi_vector(_tid)) - { - using common_type_t = typename hw_counters::common_type; - get_papi_vector(_tid)->start(); - *get_papi_labels(_tid) = comp::papi_common::get_events(); - } - } - - auto _alrm_freq = std::min(get_sampling_freq(), 20.0); - auto _prof_freq = get_sampling_freq(); - auto _delay = std::max(1.0e-3, get_sampling_delay()); - auto _verbose = std::min(get_verbose() - 2, 2); - if(get_debug_sampling()) _verbose = 2; - - OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid); - sampling::sampler_instances::construct("omnitrace", _tid, *_signal_types); - - _sampler->set_signals(*_signal_types); - _sampler->set_flags(SA_RESTART); - _sampler->set_delay(_delay, *_signal_types, (_verbose > 1)); - _sampler->set_verbose(_verbose); - if(_signal_types->count(get_realtime_signal()) > 0) - _sampler->set_frequency(_alrm_freq, { get_realtime_signal() }, - (_verbose > 1)); - if(_signal_types->count(get_cputime_signal()) > 0) - _sampler->set_frequency(_prof_freq, { get_cputime_signal() }, (_verbose > 1)); - - static_assert(tim::trait::buffer_size::value > 0, - "Error! 
Zero buffer size"); - - OMNITRACE_CONDITIONAL_THROW( - _sampler->get_buffer_size() <= 0, - "dynamic sampler requires a positive buffer size: %zu", - _sampler->get_buffer_size()); - - for(auto itr : *_signal_types) - { - const char* _type = (itr == get_realtime_signal()) ? "wall" : "CPU"; - OMNITRACE_VERBOSE(1, - "[%i] Sampler for thread %lu will be triggered %.1fx per " - "second of %s-time (every %.3f milliseconds)...\n", - itr, _tid, _sampler->get_frequency(units::sec, itr), _type, - _sampler->get_period(units::msec, itr)); - } - - *_running = true; - backtrace_init_instances::construct(); - get_backtrace_init(_tid)->sample(); - _sampler->configure(false); - _sampler->start(); - } - else if(!_setup && _sampler && _is_running) - { - OMNITRACE_DEBUG("Destroying sampler for thread %lu...\n", _tid); - *_running = false; - - if(_tid == threading::get_id()) - { - sampling::block_signals(*_signal_types); - } - - // remove the timer delivering the signal - _sampler->reset(false, *_signal_types); - - if(_tid == 0) - { - // this propagates to all threads - _sampler->ignore(*_signal_types); - for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) - { - if(sampling::get_sampler(i)) - { - sampling::get_sampler(i)->reset(false, - *sampling::get_signal_types(i)); - *get_sampler_running(i) = false; - } - } - } - - _sampler->stop(); - if constexpr(tim::trait::is_available::value) - { - if(_tid == threading::get_id()) - { - if(get_papi_vector(_tid)) get_papi_vector(_tid)->stop(); - OMNITRACE_DEBUG("HW COUNTER: stopped...\n"); - } - } - OMNITRACE_DEBUG("Sampler destroyed for thread %lu\n", _tid); - } - - return (_signal_types) ? 
*_signal_types : std::set{}; -} - -backtrace::hw_counter_data_t& -backtrace::get_last_hwcounters() -{ - static thread_local auto _v = hw_counter_data_t{ 0 }; - return _v; -} - -void -backtrace::post_process(int64_t _tid) +std::vector +backtrace::filter_and_patch(const std::vector& _data) { - namespace quirk = tim::quirk; - - configure(false, _tid); - - auto& _sampler = sampling::sampler_instances::instances().at(_tid); - if(!_sampler) - { - // this should be relatively common - OMNITRACE_CONDITIONAL_PRINT( - get_debug() && get_verbose() >= 2, - "Post-processing sampling entries for thread %lu skipped (no sampler)\n", - _tid); - return; - } - - auto& _init = backtrace_init_instances::instances().at(_tid); - if(!_init) - { - // this is not common - OMNITRACE_PRINT( - "Post-processing sampling entries for thread %lu skipped (not initialized)\n", - _tid); - return; - } - - _init->m_ts = std::max( - _init->m_ts, pthread_create_gotcha::get_execution_time(_tid)->first); - // check whether the call-stack entry should be used. 
-1 means break, 0 means continue auto _use_label = [](std::string_view _lbl) -> short { // debugging feature @@ -491,7 +140,6 @@ backtrace::post_process(int64_t _tid) if(_lbl.find("rocprofiler_") != _npos) return -1; if(_lbl.find("roctracer_") != _npos) return -1; if(_lbl.find("perfetto::") != _npos) return -1; - if(_lbl == "funlockfile") return 0; return 1; }; @@ -509,327 +157,55 @@ backtrace::post_process(int64_t _tid) return std::string{ _lbl }.replace(_pos, _dyninst.length(), ""); }; - auto _hw_cnt_labels = *get_papi_labels(_tid); - - auto _process_perfetto_counters = [&](const std::vector& _data) { - auto _tid_name = JOIN("", '[', _tid, ']'); - if(!perfetto_counter_track::exists(_tid)) - { - perfetto_counter_track::emplace( - _tid, JOIN(' ', "Thread Peak Memory Usage", _tid_name, "(S)"), "MB"); - perfetto_counter_track::emplace( - _tid, JOIN(' ', "Thread Context Switches", _tid_name, "(S)")); - perfetto_counter_track::emplace( - _tid, JOIN(' ', "Thread Page Faults", _tid_name, "(S)")); - } - - if(!perfetto_counter_track::exists(_tid) && - tim::trait::runtime_enabled::get()) - { - for(auto& itr : _hw_cnt_labels) - { - std::string _desc = tim::papi::get_event_info(itr).short_descr; - if(_desc.empty()) _desc = itr; - OMNITRACE_CI_THROW(_desc.empty(), "Empty description for %s\n", - itr.c_str()); - perfetto_counter_track::emplace( - _tid, JOIN(' ', "Thread", _desc, _tid_name, "(S)")); - } - } - - uint64_t _mean_ts = 0; - const backtrace* _last_bt = nullptr; - for(const auto& ditr : _data) - { - const auto* _bt = ditr->get(); - if(_bt->m_tid != _tid) continue; - - auto _ts = _bt->m_ts; - if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue; - - _last_bt = _bt; - _mean_ts += _ts; - - TRACE_COUNTER("thread_peak_memory", - perfetto_counter_track::at(_tid, 0), _ts, - _bt->m_mem_peak / units::megabyte); - - TRACE_COUNTER("thread_context_switch", - perfetto_counter_track::at(_tid, 1), _ts, - _bt->m_ctx_swch); - - TRACE_COUNTER("thread_page_fault", - 
perfetto_counter_track::at(_tid, 2), _ts, - _bt->m_page_flt); - - if(tim::trait::runtime_enabled::get()) - { - for(size_t i = 0; i < perfetto_counter_track::size(_tid); - ++i) - { - if(i < _bt->m_hw_counter.size()) - { - TRACE_COUNTER("hardware_counter", - perfetto_counter_track::at(_tid, i), - _ts, _bt->m_hw_counter.at(i)); - } - } - } - } - - if(_tid > 0 && _last_bt) - { - auto _ts = pthread_create_gotcha::get_execution_time(_tid)->second; - uint64_t _zero = 0; - TRACE_COUNTER("thread_peak_memory", - perfetto_counter_track::at(_tid, 0), _ts, - _zero); - - TRACE_COUNTER("thread_context_switch", - perfetto_counter_track::at(_tid, 1), _ts, - _zero); - - TRACE_COUNTER("thread_page_fault", - perfetto_counter_track::at(_tid, 2), _ts, - _zero); - - if(tim::trait::runtime_enabled::get()) - { - for(size_t i = 0; i < perfetto_counter_track::size(_tid); - ++i) - { - if(i < _last_bt->m_hw_counter.size()) - { - TRACE_COUNTER("hardware_counter", - perfetto_counter_track::at(_tid, i), - _ts, _zero); - } - } - } - } - }; - - auto _process_perfetto = [&](const std::vector& _data, - bool _rename) { - if(_rename) - threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid, "(S)").c_str()); - - uint64_t _beg_ns = pthread_create_gotcha::get_execution_time(_tid)->first; - uint64_t _end_ns = pthread_create_gotcha::get_execution_time(_tid)->second; - uint64_t _last_wall_ts = _init->get_timestamp(); - - tracing::push_perfetto_ts(category::sampling{}, "samples [omnitrace]", _beg_ns, - "begin_ns", _beg_ns); - - for(const auto& ditr : _data) - { - const auto* _bt = ditr->get(); - if(_bt->m_tid != _tid) continue; - - static std::set _static_strings{}; - for(const auto& itr : _bt->get()) - { - auto _name = tim::demangle(_patch_label(itr)); - auto _use = _use_label(_name); - if(_use == -1) break; - if(_use == 0) continue; - auto sitr = _static_strings.emplace(_name); - uint64_t _beg = _last_wall_ts; - uint64_t _end = _bt->m_ts; - if(_end <= _beg) continue; - 
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _beg)) continue; - if(!pthread_create_gotcha::is_valid_execution_time(_tid, _end)) continue; - - tracing::push_perfetto_ts(category::sampling{}, sitr.first->c_str(), _beg, - "begin_ns", _beg); - tracing::pop_perfetto_ts(category::sampling{}, sitr.first->c_str(), _end, - "end_ns", _end); - } - _last_wall_ts = _bt->m_ts; - } - - tracing::pop_perfetto_ts(category::sampling{}, "samples [omnitrace]", _end_ns, - "end_ns", _end_ns); - }; - - _sampler->stop(); - auto _raw_data = _sampler->get_data(); - OMNITRACE_CI_THROW( - _sampler->get_sample_count() != _raw_data.size(), - "Error! sampler recorded %zu samples but %zu samples were returned\n", - _sampler->get_sample_count(), _raw_data.size()); - // single sample that is useless (backtrace to unblocking signals) - if(_raw_data.size() == 1 && _raw_data.front().size() <= 1) _raw_data.clear(); - - std::vector _data{}; - for(auto& itr : _raw_data) - { - _data.reserve(_data.size() + itr.size()); - auto* _bt = itr.get(); - if(!_bt) - { - OMNITRACE_PRINT("Warning! 
Nullptr to backtrace instance for thread %lu...\n", - _tid); - continue; - } - if(_bt->empty()) continue; - if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue; - _data.emplace_back(&itr); - } - - if(_data.empty()) return; - - OMNITRACE_VERBOSE(0 || get_debug_sampling(), - "Post-processing %zu sampling entries for thread %lu...\n", - _data.size(), _tid); - - std::sort(_data.begin(), _data.end(), - [](const sampling::bundle_t* _lhs, const sampling::bundle_t* _rhs) { - return _lhs->get()->m_ts < _rhs->get()->m_ts; - }); - - if(get_use_perfetto()) + auto _ret = std::vector{}; + for(const auto& itr : _data) { - _process_perfetto_counters(_data); - - pthread_gotcha::push_enable_sampling_on_child_threads(false); - std::thread{ _process_perfetto, _data, true }.join(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); + auto _name = tim::demangle(_patch_label(itr)); + auto _use = _use_label(_name); + if(_use == -1) break; + if(_use == 0) continue; + _ret.emplace_back(_name); } - if(!get_use_timemory()) return; - - std::map> _depth_sum = {}; - auto _scope = tim::scope::config{}; - if(get_timeline_sampling()) _scope += scope::timeline{}; - if(get_flat_sampling()) _scope += scope::flat{}; - - backtrace* _last_bt = _init.get(); - for(auto& ditr : _data) - { - using bundle_t = tim::lightweight_tuple; - - auto* _bt = ditr->get(); - - if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue; - if(_bt->m_ts < _last_bt->m_ts) continue; - - double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts); - double _elapsed_cc = (_bt->m_thr_cpu_ts - _last_bt->m_thr_cpu_ts); - - std::vector _tc{}; - _tc.reserve(_bt->size()); - - // generate the instances of the tuple of components and start them - for(const auto& itr : _bt->get()) - { - auto _lbl = tim::demangle(_patch_label(itr)); - auto _use = _use_label(_lbl); - if(_use == -1) break; - if(_use == 0) continue; - _tc.emplace_back(tim::string_view_t{ _lbl }, _scope); - 
_tc.back().push(_bt->m_tid); - _tc.back().start(); - } - - // stop the instances and update the values as needed - for(size_t i = 0; i < _tc.size(); ++i) - { - auto& itr = _tc.at(_tc.size() - i - 1); - size_t _depth = 0; - _depth_sum[_bt->m_tid][_depth] += 1; - itr.stop(); - if constexpr(tim::trait::is_available::value) - { - auto* _sc = itr.get(); - if(_sc) - { - auto _value = _elapsed_wc / sampling_wall_clock::get_unit(); - _sc->set_value(_value); - _sc->set_accum(_value); - } - } - if constexpr(tim::trait::is_available::value) - { - auto* _cc = itr.get(); - if(_cc) - { - _cc->set_value(_elapsed_cc / sampling_cpu_clock::get_unit()); - _cc->set_accum(_elapsed_cc / sampling_cpu_clock::get_unit()); - } - } - if constexpr(tim::trait::is_available::value) - { - auto _hw_cnt_vals = _bt->m_hw_counter; - if(_last_bt && _bt->m_hw_counter.size() == _last_bt->m_hw_counter.size()) - { - for(size_t k = 0; k < _bt->m_hw_counter.size(); ++k) - { - if(_last_bt->m_hw_counter[k] > _hw_cnt_vals[k]) - _hw_cnt_vals[k] -= _last_bt->m_hw_counter[k]; - } - } - auto* _hw_counter = itr.get(); - if(_hw_counter) - { - _hw_counter->set_value(_hw_cnt_vals); - _hw_counter->set_accum(_hw_cnt_vals); - } - } - itr.pop(); - } - _last_bt = _bt; - } - - for(auto&& ditr : _data) - { - using bundle_t = - tim::lightweight_tuple>; + return _ret; +} - auto* _bt = ditr->get(); +void +backtrace::start() +{} - std::vector _tc{}; - _tc.reserve(_bt->size()); +void +backtrace::stop() +{} - // generate the instances of the tuple of components and start them - for(const auto& itr : _bt->get()) - { - auto _lbl = tim::demangle(_patch_label(itr)); - auto _use = _use_label(_lbl); - if(_use == -1) break; - if(_use == 0) continue; - _tc.emplace_back(tim::string_view_t{ _lbl }); - _tc.back().push(_bt->m_tid); - _tc.back().start(); - } +bool +backtrace::empty() const +{ + return (size() == 0); +} - // stop the instances and update the values as needed - for(size_t i = 0; i < _tc.size(); ++i) - { - auto& itr = 
_tc.at(_tc.size() - i - 1); - size_t _depth = 0; - double _value = (1.0 / _depth_sum[_bt->m_tid][_depth]) * 100.0; - itr.store(std::plus{}, _value); - itr.stop(); - itr.pop(); - } - } +size_t +backtrace::size() const +{ + return m_data.size(); } +void +backtrace::sample(int) +{ + using namespace tim::backtrace; + constexpr bool with_signal_frame = false; + constexpr size_t ignore_depth = 3; + // ignore depth based on: + // 1. this frame + // 2. tim::sampling::sampler<...>::sample(...) [always inline] + // 3. tim::sampling::sampler<...>::execute(...) + // 4a. funlockfile [common but not explicitly in call-stack] + // 4b. __resume_rt [common but not explicitly in call-stack] + // 4c. killpg [common but not explicitly in call-stack] + m_data = get_unw_backtrace_raw(); +} } // namespace component } // namespace omnitrace -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::backtrace) diff --git a/source/lib/omnitrace/library/components/backtrace.hpp b/source/lib/omnitrace/library/components/backtrace.hpp index 146591907..45f8ff2a8 100644 --- a/source/lib/omnitrace/library/components/backtrace.hpp +++ b/source/lib/omnitrace/library/components/backtrace.hpp @@ -29,8 +29,6 @@ #include "library/timemory.hpp" #include -#include -#include #include #include #include @@ -50,74 +48,37 @@ struct backtrace : tim::component::empty_base , tim::concepts::component { - static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE; - static constexpr size_t buffer_width = 512; - static constexpr size_t stack_depth = 128; + static constexpr size_t stack_depth = OMNITRACE_MAX_UNWIND_DEPTH; - using data_t = std::array; + using data_t = tim::unwind::stack; using clock_type = std::chrono::steady_clock; using value_type = void; - 
using hw_counters = tim::component::papi_array; - using hw_counter_data_t = typename hw_counters::value_type; using system_clock = std::chrono::system_clock; using system_time_point = typename system_clock::time_point; - static void preinit(); static std::string label(); static std::string description(); - backtrace() = default; - ~backtrace() = default; - backtrace(backtrace&&) = default; - backtrace(const backtrace&) = default; + backtrace() = default; + ~backtrace() = default; + backtrace(const backtrace&) = default; + backtrace(backtrace&&) noexcept = default; backtrace& operator=(const backtrace&) = default; - backtrace& operator=(backtrace&&) = default; + backtrace& operator=(backtrace&&) noexcept = default; - bool operator<(const backtrace& rhs) const; + static std::vector filter_and_patch(const std::vector&); - static std::set configure(bool, int64_t _tid = threading::get_id()); - static void post_process(int64_t _tid = threading::get_id()); - static hw_counter_data_t& get_last_hwcounters(); + static void start(); + static void stop(); - static void start(); - static void stop(); - void sample(int = -1); - bool empty() const; - size_t size() const; - std::vector get() const; - uint64_t get_timestamp() const; - int64_t get_thread_cpu_timestamp() const; + void sample(int = -1); + bool empty() const; + size_t size() const; + std::vector get() const; private: - int64_t m_tid = 0; - int64_t m_thr_cpu_ts = 0; - int64_t m_mem_peak = 0; - int64_t m_ctx_swch = 0; - int64_t m_page_flt = 0; - uint64_t m_ts = {}; - size_t m_size = 0; - data_t m_data = {}; - hw_counter_data_t m_hw_counter = {}; + data_t m_data = {}; }; } // namespace component } // namespace omnitrace - -#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ - (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) - -# include - -TIMEMORY_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -TIMEMORY_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) 
- -TIMEMORY_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -#endif diff --git a/source/lib/omnitrace/library/components/backtrace_metrics.cpp b/source/lib/omnitrace/library/components/backtrace_metrics.cpp new file mode 100644 index 000000000..192edaf75 --- /dev/null +++ b/source/lib/omnitrace/library/components/backtrace_metrics.cpp @@ -0,0 +1,316 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/components/backtrace_metrics.hpp" +#include "library/components/ensure_storage.hpp" +#include "library/components/fwd.hpp" +#include "library/config.hpp" +#include "library/debug.hpp" +#include "library/perfetto.hpp" +#include "library/ptl.hpp" +#include "library/runtime.hpp" +#include "library/sampling.hpp" +#include "library/thread_info.hpp" +#include "library/tracing.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace tracing +{ +using namespace ::omnitrace::tracing; +} + +namespace omnitrace +{ +namespace component +{ +using hw_counters = typename backtrace_metrics::hw_counters; +using signal_type_instances = thread_data, category::sampling>; +using backtrace_metrics_init_instances = + thread_data; +using sampler_running_instances = thread_data; +using papi_vector_instances = thread_data; +using papi_label_instances = thread_data, category::sampling>; + +namespace +{ +struct perfetto_rusage +{}; + +unique_ptr_t>& +get_papi_labels(int64_t _tid) +{ + static auto& _v = + papi_label_instances::instances(papi_label_instances::construct_on_init{}); + return _v.at(_tid); +} + +unique_ptr_t& +get_papi_vector(int64_t _tid) +{ + static auto& _v = papi_vector_instances::instances(); + if(_tid == threading::get_id()) papi_vector_instances::construct(); + return _v.at(_tid); +} + +unique_ptr_t& +get_backtrace_metrics_init(int64_t _tid) +{ + static auto& _v = backtrace_metrics_init_instances::instances(); + return _v.at(_tid); +} + +unique_ptr_t& +get_sampler_running(int64_t _tid) +{ + static auto& _v = sampler_running_instances::instances( + sampler_running_instances::construct_on_init{}, false); + return _v.at(_tid); +} +} // 
namespace
+
+// Component label.
+std::string
+backtrace_metrics::label()
+{
+    return "backtrace_metrics";
+}
+
+// Human-readable component description.
+std::string
+backtrace_metrics::description()
+{
+    return "Records sampling data";
+}
+
+// Intentionally empty: all data is gathered in sample().
+void
+backtrace_metrics::start()
+{}
+
+// Intentionally empty: all data is gathered in sample().
+void
+backtrace_metrics::stop()
+{}
+
+// Takes one snapshot for the calling thread: thread CPU clock, peak RSS,
+// context switches (priority + voluntary) and page faults (major + minor),
+// all read from a RUSAGE_THREAD cache. Hardware counters are recorded only
+// when the counter component is available and enabled at runtime.
+// The int argument is unused.
+void
+backtrace_metrics::sample(int)
+{
+    auto _tid = threading::get_id();
+    auto _cache = tim::rusage_cache{ RUSAGE_THREAD };
+    m_cpu = tim::get_clock_thread_now();
+    m_mem_peak = _cache.get_peak_rss();
+    m_ctx_swch = _cache.get_num_priority_context_switch() +
+                 _cache.get_num_voluntary_context_switch();
+    m_page_flt = _cache.get_num_major_page_faults() + _cache.get_num_minor_page_faults();
+
+    if constexpr(tim::trait::is_available::value)
+    {
+        if(tim::trait::runtime_enabled::get())
+        {
+            // the null check is an assert only, i.e. it is compiled out when
+            // NDEBUG is defined
+            assert(get_papi_vector(_tid).get() != nullptr);
+            m_hw_counter = get_papi_vector(_tid)->record();
+        }
+    }
+}
+
+// Sets up (`_setup == true`) or tears down (`_setup == false`) the hardware
+// counter state of thread `_tid`.
+//   - setup, not running: initialises the perfetto counter track, starts the
+//     PAPI vector (if present) and stores the event labels.
+//   - teardown, running: clears the running flag and, only when called on the
+//     thread itself, stops the PAPI vector.
+// NOTE(review): the setup branch never sets `*_running`; presumably the
+// sampler does that elsewhere -- confirm.
+void
+backtrace_metrics::configure(bool _setup, int64_t _tid)
+{
+    auto& _running = get_sampler_running(_tid);
+    bool _is_running = (!_running) ? false : *_running;
+
+    ensure_storage{}();
+
+    if(_setup && !_is_running)
+    {
+        (void) get_debug_sampling();  // make sure query in sampler does not allocate
+        assert(_tid == threading::get_id());
+
+        if constexpr(tim::trait::is_available::value)
+        {
+            perfetto_counter_track::init();
+            OMNITRACE_DEBUG("HW COUNTER: starting...\n");
+            if(get_papi_vector(_tid))
+            {
+                using common_type_t = typename hw_counters::common_type;
+                get_papi_vector(_tid)->start();
+                *get_papi_labels(_tid) = comp::papi_common::get_events();
+            }
+        }
+    }
+    else if(!_setup && _is_running)
+    {
+        OMNITRACE_DEBUG("Destroying sampler for thread %lu...\n", _tid);
+        *_running = false;
+
+        if constexpr(tim::trait::is_available::value)
+        {
+            if(_tid == threading::get_id())
+            {
+                if(get_papi_vector(_tid)) get_papi_vector(_tid)->stop();
+                OMNITRACE_DEBUG("HW COUNTER: stopped...\n");
+            }
+        }
+        OMNITRACE_DEBUG("Sampler destroyed for thread %lu\n", _tid);
+    }
+}
+
+// Creates the perfetto counter tracks for thread `_tid` if they do not exist
+// yet: three fixed tracks (peak memory in "MB", context switches, page
+// faults) and, when hardware counters are enabled at runtime, one track per
+// stored PAPI label. A label's short description is used as the track name,
+// falling back to the label itself when the description is empty.
+void
+backtrace_metrics::init_perfetto(int64_t _tid)
+{
+    auto _hw_cnt_labels = *get_papi_labels(_tid);
+    auto _tid_name = JOIN("", '[', _tid, ']');
+
+    if(!perfetto_counter_track::exists(_tid))
+    {
+        perfetto_counter_track::emplace(
+            _tid, JOIN(' ', "Thread Peak Memory Usage", _tid_name, "(S)"), "MB");
+        perfetto_counter_track::emplace(
+            _tid, JOIN(' ', "Thread Context Switches", _tid_name, "(S)"));
+        perfetto_counter_track::emplace(
+            _tid, JOIN(' ', "Thread Page Faults", _tid_name, "(S)"));
+    }
+
+    if(!perfetto_counter_track::exists(_tid) &&
+       tim::trait::runtime_enabled::get())
+    {
+        for(auto& itr : _hw_cnt_labels)
+        {
+            std::string _desc = tim::papi::get_event_info(itr).short_descr;
+            if(_desc.empty()) _desc = itr;
+            OMNITRACE_CI_THROW(_desc.empty(), "Empty description for %s\n", itr.c_str());
+            perfetto_counter_track::emplace(
+                _tid, JOIN(' ', "Thread", _desc, _tid_name, "(S)"));
+        }
+    }
+}
+
+// Emits a final zero-valued sample on every counter track of thread `_tid`,
+// timestamped with the thread's recorded stop time. Returns without emitting
+// anything when no thread info exists for `_tid`.
+void
+backtrace_metrics::fini_perfetto(int64_t _tid)
+{
+    auto _hw_cnt_labels = *get_papi_labels(_tid);
+    const auto& _thread_info = thread_info::get(_tid, InternalTID);
+
+    OMNITRACE_CI_THROW(!_thread_info, "Error! missing thread info for tid=%li\n", _tid);
+    if(!_thread_info) return;
+
+    uint64_t _ts = _thread_info->get_stop();
+
+    TRACE_COUNTER("thread_peak_memory",
+                  perfetto_counter_track::at(_tid, 0), _ts, 0);
+
+    TRACE_COUNTER("thread_context_switch",
+                  perfetto_counter_track::at(_tid, 1), _ts, 0);
+
+    TRACE_COUNTER("thread_page_fault",
+                  perfetto_counter_track::at(_tid, 2), _ts, 0);
+
+    if(tim::trait::runtime_enabled::get())
+    {
+        for(size_t i = 0; i < perfetto_counter_track::size(_tid); ++i)
+        {
+            // only tracks that have a matching label are written
+            if(i < _hw_cnt_labels.size())
+            {
+                TRACE_COUNTER("hardware_counter",
+                              perfetto_counter_track::at(_tid, i), _ts, 0.0);
+            }
+        }
+    }
+}
+
+// Writes this sample to the counter tracks of thread `_tid` at timestamp
+// `_ts`. Peak memory is divided by units::megabyte before it is emitted; the
+// hardware counter values are emitted index-by-index, bounded by both the
+// number of tracks and the number of recorded values.
+void
+backtrace_metrics::post_process_perfetto(int64_t _tid, uint64_t _ts) const
+{
+    TRACE_COUNTER("thread_peak_memory",
+                  perfetto_counter_track::at(_tid, 0), _ts,
+                  m_mem_peak / units::megabyte);
+
+    TRACE_COUNTER("thread_context_switch",
+                  perfetto_counter_track::at(_tid, 1), _ts, m_ctx_swch);
+
+    TRACE_COUNTER("thread_page_fault",
+                  perfetto_counter_track::at(_tid, 2), _ts, m_page_flt);
+
+    if(tim::trait::runtime_enabled::get())
+    {
+        for(size_t i = 0; i < perfetto_counter_track::size(_tid); ++i)
+        {
+            if(i < m_hw_counter.size())
+            {
+                TRACE_COUNTER("hardware_counter",
+                              perfetto_counter_track::at(_tid, i), _ts,
+                              m_hw_counter.at(i));
+            }
+        }
+    }
+}
+} // namespace component
+} // namespace omnitrace
+
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::backtrace_metrics)
diff --git a/source/lib/omnitrace/library/components/backtrace_metrics.hpp b/source/lib/omnitrace/library/components/backtrace_metrics.hpp
new file mode 100644
index 000000000..f727992d7
--- /dev/null
+++ 
b/source/lib/omnitrace/library/components/backtrace_metrics.hpp @@ -0,0 +1,119 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+
+#pragma once
+
+#include "library/common.hpp"
+#include "library/components/backtrace.hpp"
+#include "library/components/fwd.hpp"
+#include "library/defines.hpp"
+#include "library/thread_data.hpp"
+#include "library/timemory.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace omnitrace
+{
+namespace component
+{
+// Sampling component holding one snapshot of per-thread metrics: a CPU
+// timestamp, peak memory, context switches, page faults and hardware counter
+// values. All members start at zero / empty.
+struct backtrace_metrics
+: tim::component::empty_base
+, tim::concepts::component
+{
+    static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE;
+
+    using clock_type = std::chrono::steady_clock;
+    using value_type = void;
+    using hw_counters = tim::component::papi_array;
+    using hw_counter_data_t = typename hw_counters::value_type;
+    using system_clock = std::chrono::system_clock;
+    using system_time_point = typename system_clock::time_point;
+
+    static std::string label();
+    static std::string description();
+
+    backtrace_metrics() = default;
+    ~backtrace_metrics() = default;
+    backtrace_metrics(const backtrace_metrics&) = default;
+    backtrace_metrics(backtrace_metrics&&) noexcept = default;
+
+    backtrace_metrics& operator=(const backtrace_metrics&) = default;
+    backtrace_metrics& operator=(backtrace_metrics&&) noexcept = default;
+
+    // per-thread setup/teardown; `_tid` defaults to the calling thread
+    static void configure(bool, int64_t _tid = threading::get_id());
+    static void init_perfetto(int64_t _tid);
+    static void fini_perfetto(int64_t _tid);
+
+    static void start();
+    static void stop();
+    void sample(int = -1);
+    // NOTE(review): no definition of post_process() is visible in the
+    // accompanying backtrace_metrics.cpp hunk -- confirm it is defined elsewhere.
+    void post_process(int64_t _tid, const backtrace* _bt,
+                      const backtrace_metrics* _last) const;
+
+    // read-only access to the values captured by the last sample()
+    auto get_cpu_timestamp() const { return m_cpu; }
+    auto get_peak_memory() const { return m_mem_peak; }
+    auto get_context_switches() const { return m_ctx_swch; }
+    auto get_page_faults() const { return m_page_flt; }
+    const auto& get_hw_counters() const { return m_hw_counter; }
+
+    void post_process_perfetto(int64_t _tid, uint64_t _ts) const;
+
+private:
+    int64_t m_cpu = 0;
+    int64_t m_mem_peak = 0;
+    int64_t m_ctx_swch = 0;
+    int64_t m_page_flt = 0;
+    hw_counter_data_t m_hw_counter = {};
+};
+} // namespace component
+} // namespace omnitrace
+
+#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \
+    (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0)
+
+#    include
+
+OMNITRACE_DECLARE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+OMNITRACE_DECLARE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+OMNITRACE_DECLARE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true,
+    double)
+
+#endif
diff --git a/source/lib/omnitrace/library/components/user_region.cpp b/source/lib/omnitrace/library/components/backtrace_timestamp.cpp
similarity index 66%
rename from source/lib/omnitrace/library/components/user_region.cpp
rename to source/lib/omnitrace/library/components/backtrace_timestamp.cpp
index 8b39b4496..b9a62dff2 100644
--- a/source/lib/omnitrace/library/components/user_region.cpp
+++ b/source/lib/omnitrace/library/components/backtrace_timestamp.cpp
@@ -20,33 +20,35 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
-#include "library/components/user_region.hpp"
-#include "library/api.hpp"
-#include "library/components/fwd.hpp"
+#include "library/components/backtrace_timestamp.hpp"
+#include "library/thread_info.hpp"
+
+#include
 
 namespace omnitrace
 {
 namespace component
 {
-void
-user_region::start()
+// Orders timestamps by thread id first, then by the recorded real-clock value.
+bool
+backtrace_timestamp::operator<(const backtrace_timestamp& rhs) const
 {
-    if(m_prefix) omnitrace_push_region_hidden(m_prefix);
+    return std::tie(m_tid, m_real) < std::tie(rhs.m_tid, rhs.m_real);
 }
 
-void
-user_region::stop()
+// A timestamp is valid when thread info exists for its thread id and that
+// info accepts the recorded time; it is invalid when no thread info is found.
+bool
+backtrace_timestamp::is_valid() const
 {
-    if(m_prefix) omnitrace_pop_region_hidden(m_prefix);
+    const auto& _info = thread_info::get(m_tid, InternalTID);
+    return (_info) ? 
_info->is_valid_time(m_real) : false;
 }
 
 void
-user_region::set_prefix(const char* _prefix)
+backtrace_timestamp::sample(int)
 {
-    m_prefix = _prefix;
+    // record which thread took the sample and the current real-clock reading
+    m_tid = tim::threading::get_id();
+    m_real = tim::get_clock_real_now();
 }
 } // namespace component
 } // namespace omnitrace
 
-TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::user_region)
-TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(omnitrace_user_region, false, void)
+TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::backtrace_timestamp)
diff --git a/source/lib/omnitrace/library/components/user_region.hpp b/source/lib/omnitrace/library/components/backtrace_timestamp.hpp
similarity index 53%
rename from source/lib/omnitrace/library/components/user_region.hpp
rename to source/lib/omnitrace/library/components/backtrace_timestamp.hpp
index d18894d77..4c7f90531 100644
--- a/source/lib/omnitrace/library/components/user_region.hpp
+++ b/source/lib/omnitrace/library/components/backtrace_timestamp.hpp
@@ -22,35 +22,53 @@
 
 #pragma once
 
+#include "library/common.hpp"
+#include "library/components/fwd.hpp"
 #include "library/defines.hpp"
 #include "library/timemory.hpp"
 
+#include
+#include
+#include
+
+#include
+#include
+
 namespace omnitrace
 {
 namespace component
 {
-// timemory component which calls omnitrace functions
-// (used in gotcha wrappers)
-struct user_region : comp::base
+// Sampling component that stores the id of the sampling thread together with
+// a real-clock timestamp; both members are zero until sample() is called.
+struct backtrace_timestamp
+: tim::component::empty_base
+, tim::concepts::component
 {
-    static std::string label() { return "user_region"; }
-    void start();
-    void stop();
-    void set_prefix(const char*);
+    using value_type = void;
 
-private:
-    const char* m_prefix = nullptr;
-};
-} // namespace component
-} // namespace omnitrace
+    static std::string label() { return "backtrace_timestamp"; }
+    static std::string description() { return "Timestamp for backtrace"; }
 
-TIMEMORY_COMPONENT_ALIAS(omnitrace_user_region, omnitrace::component::user_region)
+    backtrace_timestamp() = default;
+    ~backtrace_timestamp() = default;
+    backtrace_timestamp(const backtrace_timestamp&) = default;
+    backtrace_timestamp(backtrace_timestamp&&) noexcept = default;
 
-#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \
-    (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0)
+    backtrace_timestamp& operator=(const backtrace_timestamp&) = default;
+    backtrace_timestamp& operator=(backtrace_timestamp&&) noexcept = default;
 
-#    include
+    bool operator<(const backtrace_timestamp& rhs) const;
 
-TIMEMORY_DECLARE_EXTERN_COMPONENT(omnitrace_user_region, false, void)
+    // start/stop are no-ops; the data is captured by sample()
+    static void start() {}
+    static void stop() {}
 
-#endif
+    void sample(int = -1);
+
+    auto get_tid() const { return m_tid; }
+    auto get_timestamp() const { return m_real; }
+    bool is_valid() const;
+
+private:
+    int64_t m_tid = 0;
+    uint64_t m_real = 0;
+};
+} // namespace component
+} // namespace omnitrace
diff --git a/source/lib/omnitrace/library/components/category_region.hpp b/source/lib/omnitrace/library/components/category_region.hpp
index cfdfffd50..ad94af9e0 100644
--- a/source/lib/omnitrace/library/components/category_region.hpp
+++ b/source/lib/omnitrace/library/components/category_region.hpp
@@ -23,7 +23,9 @@
 #pragma once
 
 #include "library/config.hpp"
+#include "library/critical_trace.hpp"
 #include "library/defines.hpp"
+#include "library/runtime.hpp"
 #include "library/timemory.hpp"
 #include "library/tracing.hpp"
 
@@ -48,8 +50,6 @@ struct timemory : concepts::quirk_type
 
 namespace omnitrace
 {
-namespace audit = ::tim::audit;
-
 namespace component
 {
 // timemory component which calls omnitrace functions
@@ -90,36 +90,23 @@ template
 void
 category_region::start(std::string_view name, Args&&... 
args) { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + // unconditionally return if thread is disabled or finalized + if(get_thread_state() == ThreadState::Disabled) return; + if(get_state() == State::Finalized) return; - // unconditionally return if finalized - if(get_state() == State::Finalized) - { - OMNITRACE_CONDITIONAL_BASIC_PRINT( - tracing::debug_user, "omnitrace_push_region(%s) called during finalization\n", - name.data()); - return; - } + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); // the expectation here is that if the state is not active then the call // to omnitrace_init_tooling_hidden will activate all the appropriate // tooling one time and as it exits set it to active and return true. - if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) - { - static auto _debug = get_debug_env() || get_debug_init(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - _debug, "[%s] omnitrace_push_region(%s) ignored :: not active. state = %s\n", - category_name, name.data(), std::to_string(get_state()).c_str()); - return; - } + if(get_state() != State::Active && !omnitrace_init_tooling_hidden()) return; - OMNITRACE_CONDITIONAL_PRINT(tracing::debug_push, - "[%s][PID=%i][state=%s] omnitrace_push_region(%s)\n", - category_name, process::get_id(), - std::to_string(get_state()).c_str(), name.data()); + tracing::thread_init(); + + // thread initialization may have disabled the thread + if(get_thread_state() == ThreadState::Disabled) return; - auto _use_timemory = get_use_timemory(); - auto _use_perfetto = get_use_perfetto(); + tracing::thread_init_sampling(); constexpr bool _ct_use_timemory = (sizeof...(OptsT) == 0 || @@ -129,23 +116,49 @@ category_region::start(std::string_view name, Args&&... 
args) (sizeof...(OptsT) == 0 || tim::is_one_of>::value); - if(_use_timemory || _use_perfetto) tracing::thread_init(); + OMNITRACE_CONDITIONAL_PRINT(tracing::debug_push, + "[%s][PID=%i][state=%s] omnitrace_push_region(%s)\n", + category_name, process::get_id(), + std::to_string(get_state()).c_str(), name.data()); - if(_use_perfetto) + if constexpr(tim::is_one_of>::value) { - if constexpr(_ct_use_perfetto) + ++tracing::push_count(); + } + + if constexpr(_ct_use_perfetto) + { + if(get_use_perfetto()) { tracing::push_perfetto(CategoryT{}, name.data(), std::forward(args)...); } } - if(_use_timemory) + + if constexpr(_ct_use_timemory) { - if constexpr(_ct_use_timemory) + if(get_use_timemory()) + { + tracing::push_timemory(CategoryT{}, name.data(), std::forward(args)...); + } + } + + if constexpr(tim::is_one_of>::value) + { + using Device = critical_trace::Device; + using Phase = critical_trace::Phase; + + if(get_use_critical_trace()) { - tracing::push_timemory(name.data(), std::forward(args)...); + uint64_t _cid = 0; + uint64_t _parent_cid = 0; + uint32_t _depth = 0; + std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); + auto _ts = comp::wall_clock::record(); + add_critical_trace( + threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0, + critical_trace::add_hash_id(name.data()), _depth); } } - if(_use_timemory || _use_perfetto) tracing::thread_init_sampling(); } template @@ -153,6 +166,8 @@ template void category_region::stop(std::string_view name, Args&&... args) { + if(get_thread_state() == ThreadState::Disabled) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); constexpr bool _ct_use_timemory = @@ -171,21 +186,52 @@ category_region::stop(std::string_view name, Args&&... 
args)
     // only execute when active
     if(get_state() == State::Active)
     {
-        if(get_use_timemory())
+        if constexpr(tim::is_one_of>::value)
         {
-            if constexpr(_ct_use_timemory)
+            ++tracing::pop_count();
+        }
+
+        if constexpr(_ct_use_timemory)
+        {
+            if(get_use_timemory())
             {
-                tracing::pop_timemory(name.data(), std::forward(args)...);
+                tracing::pop_timemory(CategoryT{}, name.data(),
+                                      std::forward(args)...);
             }
         }
-        if(get_use_perfetto())
+
+        if constexpr(_ct_use_perfetto)
         {
-            if constexpr(_ct_use_perfetto)
+            if(get_use_perfetto())
             {
                 tracing::pop_perfetto(CategoryT{}, name.data(),
                                       std::forward(args)...);
             }
         }
+
+        if constexpr(tim::is_one_of>::value)
+        {
+            using Device = critical_trace::Device;
+            using Phase = critical_trace::Phase;
+
+            if(get_use_critical_trace())
+            {
+                if(get_cpu_cid_stack() && !get_cpu_cid_stack()->empty())
+                {
+                    auto _cid = get_cpu_cid_stack()->back();
+                    if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end())
+                    {
+                        uint64_t _parent_cid = 0;
+                        uint32_t _depth = 0;
+                        auto _ts = comp::wall_clock::record();
+                        std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
+                        add_critical_trace(
+                            threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0,
+                            critical_trace::add_hash_id(name.data()), _depth);
+                    }
+                }
+            }
+        }
     }
     else
     {
@@ -249,5 +295,51 @@ category_region::audit(quirk::config, Args&&... _args)
 {
     audit(std::forward(_args)...);
 }
+
+// Wrapper around category_region that remembers the region name, so that
+// start/stop/audit can be called without passing the name each time.
+// Every call is a no-op while no prefix has been set (empty view).
+//
+// The prefix is held as a std::string_view, i.e. it is not copied: the
+// character data handed to set_prefix() must stay alive for as long as this
+// object may forward it.
+template
+struct local_category_region : comp::base, void>
+{
+    using impl_type = category_region;
+
+    static constexpr auto category_name = impl_type::category_name;
+    static std::string label() { return impl_type::label(); }
+
+    // forwards to impl_type::start with the stored prefix as the region name
+    template
+    auto start(Args&&... args)
+    {
+        if(m_prefix.empty()) return;
+        return impl_type::template start(m_prefix, std::forward(args)...);
+    }
+
+    // forwards to impl_type::stop with the stored prefix as the region name
+    template
+    auto stop(Args&&... args)
+    {
+        if(m_prefix.empty()) return;
+        return impl_type::template stop(m_prefix, std::forward(args)...);
+    }
+
+    // only participates in overload resolution when the underlying audit
+    // call is well-formed (trailing decltype)
+    template
+    auto audit(Args&&... args)
+        -> decltype(impl_type::template audit(std::declval(),
+                                              std::forward(args)...))
+    {
+        if(m_prefix.empty()) return;
+        return impl_type::template audit(m_prefix, std::forward(args)...);
+    }
+
+    // quirk::config overload: passes a fresh quirk::config tag plus the prefix
+    template
+    auto audit(quirk::config, Args&&... args)
+    {
+        if(m_prefix.empty()) return;
+        return impl_type::template audit(quirk::config{}, m_prefix,
+                                         std::forward(args)...);
+    }
+
+    void set_prefix(std::string_view _v) { m_prefix = _v; }
+
+private:
+    std::string_view m_prefix = {};
+};
+
 } // namespace component
 } // namespace omnitrace
diff --git a/source/lib/omnitrace/library/components/comm_data.cpp b/source/lib/omnitrace/library/components/comm_data.cpp
index 27f22818c..5d39bca61 100644
--- a/source/lib/omnitrace/library/components/comm_data.cpp
+++ b/source/lib/omnitrace/library/components/comm_data.cpp
@@ -31,7 +31,7 @@
 #include
 #include
 
-namespace tim
+namespace omnitrace
 {
 namespace component
 {
@@ -415,9 +415,9 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const v
 }
 #endif
 } // namespace component
-} // namespace tim
+} // namespace omnitrace
 
-TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(
-    TIMEMORY_ESC(data_tracker), true, float)
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(
+    TIMEMORY_ESC(data_tracker), true, float)
 
-TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(comm_data, false, void)
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(comm_data, false, void)
diff --git a/source/lib/omnitrace/library/components/comm_data.hpp b/source/lib/omnitrace/library/components/comm_data.hpp
index 4d34d0b58..7763d784d 100644
--- a/source/lib/omnitrace/library/components/comm_data.hpp
+++ b/source/lib/omnitrace/library/components/comm_data.hpp
@@ -30,6 +30,7 @@
 #include "library/timemory.hpp"
 
 #include
+#include
 #include
 #include
 #include
@@ -55,11 +56,14 @@
 #include
 #include
 
-namespace tim
+OMNITRACE_COMPONENT_ALIAS(comm_data_tracker_t,
+                          ::tim::component::data_tracker)
+
+namespace omnitrace
 {
 namespace component
 {
-using comm_data_tracker_t = data_tracker;
+using 
gotcha_data = ::tim::component::gotcha_data; struct comm_data : base { @@ -231,7 +235,7 @@ struct comm_data : base } }; } // namespace component -} // namespace tim +} // namespace omnitrace #if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) @@ -240,8 +244,8 @@ struct comm_data : base # include # include -TIMEMORY_DECLARE_EXTERN_COMPONENT(TIMEMORY_ESC(data_tracker), - true, float) +OMNITRACE_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, float) -TIMEMORY_DECLARE_EXTERN_COMPONENT(comm_data, false, void) +OMNITRACE_DECLARE_EXTERN_COMPONENT(comm_data, false, void) #endif diff --git a/source/lib/omnitrace/library/components/cpu_freq.cpp b/source/lib/omnitrace/library/components/cpu_freq.cpp new file mode 100644 index 000000000..26ea1f73d --- /dev/null +++ b/source/lib/omnitrace/library/components/cpu_freq.cpp @@ -0,0 +1,221 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "library/components/cpu_freq.hpp"
+#include "library/common.hpp"
+#include "library/components/fwd.hpp"
+#include "library/config.hpp"
+#include "library/debug.hpp"
+#include "library/defines.hpp"
+#include "library/perfetto.hpp"
+#include "library/timemory.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cpuinfo = tim::procfs::cpuinfo;
+
+namespace omnitrace
+{
+namespace component
+{
+// Process-wide set of CPU ids whose frequency is recorded; filled by
+// configure() and read by record().
+cpu_freq::cpu_id_set_t&
+cpu_freq::get_enabled_cpus()
+{
+    static auto _v = cpu_id_set_t{};
+    return _v;
+}
+
+// Component label.
+std::string
+cpu_freq::label()
+{
+    return "cpu_freq";
+}
+
+// Human-readable component description.
+std::string
+cpu_freq::description()
+{
+    return "Records the current CPU frequencies";
+}
+
+// Unit in which values are reported by default.
+int64_t
+cpu_freq::unit()
+{
+    return tim::units::MHz;
+}
+
+// String representation of unit().
+std::string
+cpu_freq::display_unit()
+{
+    return tim::units::freq_repr(unit());
+}
+
+// Parses the sampling-CPU setting and stores the resulting CPU ids in
+// get_enabled_cpus().
+//
+// Accepted values (case-insensitive):
+//   "all" / "on"   -> every CPU reported by cpuinfo::freq::size()
+//   "none" / "off" -> no CPU
+//   empty          -> every CPU
+//   otherwise      -> list of ids and/or N-M ranges separated by ",; \t"
+//
+// Entries containing anything other than digits and '-' are reported and
+// skipped. Ids that are not below the number of available CPUs are reported
+// and removed. If the frequency source cannot be opened the set is cleared.
+void
+cpu_freq::configure()
+{
+    auto _ncpu = cpuinfo::freq::size();
+    auto _enabled_freqs = std::set{};
+
+    auto _enabled_val = get_sampling_cpus();
+    for(auto& itr : _enabled_val)
+        itr = tolower(itr);
+    if(_enabled_val == "off")
+        _enabled_val = "none";
+    else if(_enabled_val == "on")
+        _enabled_val = "all";
+    if(_enabled_val != "none" && _enabled_val != "all")
+    {
+        auto _enabled = tim::delimit(_enabled_val, ",; \t");
+        if(_enabled.empty())
+        {
+            for(size_t i = 0; i < _ncpu; ++i)
+                _enabled_freqs.emplace(i);
+        }
+        for(auto&& _v : _enabled)
+        {
+            if(_v.find_first_not_of("0123456789-") != std::string::npos)
+            {
+                OMNITRACE_VERBOSE_F(
+                    0,
+                    "Invalid CPU specification. Only numerical values (e.g., 0) or "
+                    "ranges (e.g., 0-7) are permitted. Ignoring %s...",
+                    _v.c_str());
+                continue;
+            }
+            if(_v.find('-') != std::string::npos)
+            {
+                auto _vv = tim::delimit(_v, "-");
+                OMNITRACE_CONDITIONAL_THROW(
+                    _vv.size() != 2,
+                    "Invalid CPU range specification: %s. Required format N-M, e.g. 0-4",
+                    _v.c_str());
+                // parse both bounds once instead of re-parsing the upper
+                // bound on every loop iteration
+                const size_t _first = std::stoull(_vv.at(0));
+                const size_t _last = std::stoull(_vv.at(1));
+                for(size_t i = _first; i <= _last; ++i)
+                    _enabled_freqs.emplace(i);
+            }
+            else
+            {
+                _enabled_freqs.emplace(std::stoull(_v));
+            }
+        }
+    }
+    else if(_enabled_val == "all")
+    {
+        for(size_t i = 0; i < _ncpu; ++i)
+            _enabled_freqs.emplace(i);
+    }
+    else if(_enabled_val == "none")
+    {
+        _enabled_freqs.clear();
+    }
+
+    // Remove every requested id that has no frequency entry. The previous
+    // loop re-inserted valid ids into the set it was iterating and never
+    // erased the invalid ones, so out-of-range ids reached record().
+    // erase() returns the iterator following the removed element, which
+    // makes pruning during traversal safe.
+    for(auto itr = _enabled_freqs.begin(); itr != _enabled_freqs.end();)
+    {
+        if(*itr < _ncpu)
+        {
+            ++itr;
+        }
+        else
+        {
+            OMNITRACE_VERBOSE(
+                0, "[cpu_freq::config] Warning! Removing invalid cpu %zu...\n", *itr);
+            itr = _enabled_freqs.erase(itr);
+        }
+    }
+
+    if(!cpuinfo::freq{})
+    {
+        OMNITRACE_VERBOSE(0, "[cpu_freq::config] Warning! CPU frequencies are disabled "
+                             ":: unable to open /proc/cpuinfo");
+        _enabled_freqs.clear();
+    }
+
+    OMNITRACE_CI_FAIL(!cpuinfo::freq{}, "[cpu_freq::config] CPU frequencies are disabled "
+                                        ":: unable to open /proc/cpuinfo");
+
+    get_enabled_cpus() = _enabled_freqs;
+}
+
+// Formats this component through timemory's base_printer.
+std::string
+cpu_freq::as_string() const
+{
+    return tim::operation::base_printer{}(std::stringstream{}, *this).str();
+}
+
+// Reads the current frequency of every enabled CPU, in the iteration order
+// of get_enabled_cpus(). Each reading is multiplied by tim::units::MHz.
+// Returns an empty container when no CPU is enabled.
+cpu_freq::value_type
+cpu_freq::record()
+{
+    auto& enabled_cpu_freqs = get_enabled_cpus();
+
+    std::vector _freqs{};
+    if(!enabled_cpu_freqs.empty())
+    {
+        _freqs.reserve(enabled_cpu_freqs.size());
+        auto&& _freq = cpuinfo::freq{};
+        for(const auto& itr : enabled_cpu_freqs)
+        {
+            _freqs.emplace_back(_freq(itr) * tim::units::MHz);
+        }
+    }
+
+    return _freqs;
+}
+
+// Stores a fresh reading.
+void
+cpu_freq::start()
+{
+    value = record();
+}
+
+// Replaces the stored reading with the difference (now - stored).
+void
+cpu_freq::stop()
+{
+    using namespace tim::stl;
+    value = (record() - value);
+}
+
+// Overwrites the stored reading with a fresh one and returns *this.
+cpu_freq&
+cpu_freq::sample()
+{
+    value = record();
+    return *this;
+}
+
+// Returns entry `_idx` of the stored reading divided by `_unit`.
+float
+cpu_freq::at(size_t _idx, int64_t _unit) const
+{
+    return (value.at(_idx) / static_cast(_unit));
+}
+
+// Returns every entry of the stored reading divided by `_unit`.
+std::vector
+cpu_freq::get(int64_t _unit) const
+{
+    std::vector _v{};
+    _v.reserve(value.size());
+    for(const auto& itr : value)
+        _v.emplace_back(itr / static_cast(_unit));
+    return _v;
+}
+} // namespace component
+} // namespace omnitrace
+
+TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::cpu_freq)
diff --git a/source/lib/omnitrace/library/components/cpu_freq.hpp b/source/lib/omnitrace/library/components/cpu_freq.hpp
new file mode 100644
index 000000000..16c21d7a2
--- /dev/null
+++ b/source/lib/omnitrace/library/components/cpu_freq.hpp
@@ -0,0 +1,113 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+#include "library/common.hpp"
+#include "library/defines.hpp"
+#include "library/timemory.hpp"
+
+#include
+#include
+
+namespace omnitrace
+{
+namespace component
+{
+// Component that records CPU frequencies for a configurable set of CPU ids.
+// It reports no storage (get_storage() returns nullptr) and no laps
+// (get_laps() returns 0).
+struct cpu_freq
+: tim::concepts::component
+, tim::component::empty_base
+, tim::component::base_format
+, tim::component::base_data, 1>
+{
+    using base_type = tim::component::empty_base;
+    using this_type = cpu_freq;
+    using value_type = std::vector;
+    using storage_type = tim::storage;
+    using cpu_id_set_t = std::set;
+
+    TIMEMORY_DEFAULT_OBJECT(cpu_freq)
+
+    // string id for component
+    static std::string label();
+    static std::string description();
+    static int64_t unit();
+    static std::string display_unit();
+
+    // configure() selects the CPU ids to read, get_enabled_cpus() exposes that
+    // selection and record() takes one reading
+    static void configure();
+    static cpu_id_set_t& get_enabled_cpus();
+    static value_type record();
+
+    // start() stores a reading, stop() stores the difference since the stored
+    // reading, sample() overwrites the stored reading with a fresh one
+    void start();
+    void stop();
+    cpu_freq& sample();
+    std::string as_string() const;
+
+    // stored reading converted to `_unit` (defaults to unit())
+    float at(size_t _idx, int64_t _unit = unit()) const;
+    std::vector get(int64_t _unit = unit()) const;
+
+public:
+    static auto get_label() { return label(); }
+    static auto get_description() { return description(); }
+    static auto get_unit() { return unit(); }
+    static auto get_display_unit() { return display_unit(); }
+    static int64_t get_laps() { return 0; }
+    static storage_type* get_storage() { return nullptr; }
+
+    auto get_display() const { return as_string(); }
+
+    friend std::ostream& operator<<(std::ostream& _os, const cpu_freq& _v)
+    {
+        return (_os << _v.as_string());
+    }
+
+    // output archives go through operation::serialization; input archives
+    // read the "value" field directly
+    template
+    void serialize(ArchiveT& _ar, const unsigned _version)
+    {
+        if constexpr(tim::concepts::is_output_archive::value)
+            operation::serialization{}(*this, _ar, _version);
+        else
+            _ar(tim::cereal::make_nvp("value", value));
+    }
+
+    // arithmetic on the stored reading uses the tim::stl container operators
+    this_type& operator+=(const this_type& _rhs)
+    {
+        using namespace tim::stl;
+        value += _rhs.value;
+        return *this;
+    }
+
+    this_type& operator-=(const this_type& _rhs)
+    {
+        using namespace tim::stl;
+        value -= _rhs.value;
+        return *this;
+    }
+
+private:
+    using tim::component::base_data::value;
+};
+} // namespace component
+} // namespace omnitrace
+
+OMNITRACE_DEFINE_NAME_TRAIT("cpu_freq", omnitrace::component::cpu_freq);
diff --git a/source/lib/omnitrace/library/components/ensure_storage.hpp b/source/lib/omnitrace/library/components/ensure_storage.hpp
new file mode 100644
index 000000000..65dbc2231
--- /dev/null
+++ b/source/lib/omnitrace/library/components/ensure_storage.hpp
@@ -0,0 +1,69 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+ +#pragma once + +#include "library/defines.hpp" + +#include +#include +#include +#include +#include + +namespace omnitrace +{ +namespace component +{ +namespace +{ +template +struct ensure_storage +{ + TIMEMORY_DEFAULT_OBJECT(ensure_storage) + + void operator()() const { OMNITRACE_FOLD_EXPRESSION((*this)(tim::type_list{})); } + +private: + template ::value, int> = 0> + void operator()(tim::type_list) const + { + using namespace tim; + static thread_local auto _storage = operation::get_storage{}(); + static thread_local auto _tid = threading::get_id(); + static thread_local auto _dtor = + scope::destructor{ []() { operation::set_storage{}(nullptr, _tid); } }; + + tim::operation::set_storage{}(_storage, _tid); + if(_tid == 0 && !_storage) tim::trait::runtime_enabled::set(false); + } + + template ::value, long> = 0> + void operator()(tim::type_list) const + { + tim::trait::runtime_enabled::set(false); + } +}; +} // namespace +} // namespace component +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/exit_gotcha.cpp b/source/lib/omnitrace/library/components/exit_gotcha.cpp index e0a1d36ba..fb3175e55 100644 --- a/source/lib/omnitrace/library/components/exit_gotcha.cpp +++ b/source/lib/omnitrace/library/components/exit_gotcha.cpp @@ -29,8 +29,6 @@ #include "library/timemory.hpp" #include -#include -#include #include #include @@ -38,13 +36,15 @@ namespace omnitrace { +namespace component +{ void exit_gotcha::configure() { exit_gotcha_t::get_initializer() = []() { - exit_gotcha_t::template configure<0, void>("abort"); - exit_gotcha_t::template configure<1, void, int>("exit"); - exit_gotcha_t::template configure<2, void, int>("quick_exit"); + exit_gotcha_t::configure<0, void>("abort"); + exit_gotcha_t::configure<1, void, int>("exit"); + exit_gotcha_t::configure<2, void, int>("quick_exit"); }; } @@ -54,10 +54,29 @@ template void invoke_exit_gotcha(const exit_gotcha::gotcha_data& _data, FuncT _func, Args... 
_args) { - OMNITRACE_VERBOSE(0, "%s called %s(%s)...\n", get_exe_name().c_str(), - _data.tool_id.c_str(), JOIN(", ", _args...).c_str()); + if(config::settings_are_configured()) + { + OMNITRACE_VERBOSE(0, "%s called %s(%s)...\n", get_exe_name().c_str(), + _data.tool_id.c_str(), JOIN(", ", _args...).c_str()); + } + else + { + OMNITRACE_BASIC_VERBOSE(0, "%s called %s(%s)...\n", get_exe_name().c_str(), + _data.tool_id.c_str(), JOIN(", ", _args...).c_str()); + } + + if(get_state() != State::Finalized) omnitrace_finalize_hidden(); - if(get_state() != omnitrace::State::Finalized) omnitrace_finalize_hidden(); + if(config::settings_are_configured()) + { + OMNITRACE_VERBOSE(0, "%s called %s(%s)...\n", get_exe_name().c_str(), + _data.tool_id.c_str(), JOIN(", ", _args...).c_str()); + } + else + { + OMNITRACE_BASIC_VERBOSE(0, "%s called %s(%s)...\n", get_exe_name().c_str(), + _data.tool_id.c_str(), JOIN(", ", _args...).c_str()); + } (*_func)(_args...); } @@ -77,4 +96,5 @@ exit_gotcha::operator()(const gotcha_data& _data, abort_func_t _func) const { invoke_exit_gotcha(_data, _func); } +} // namespace component } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/exit_gotcha.hpp b/source/lib/omnitrace/library/components/exit_gotcha.hpp index faca9d4a5..a531adf88 100644 --- a/source/lib/omnitrace/library/components/exit_gotcha.hpp +++ b/source/lib/omnitrace/library/components/exit_gotcha.hpp @@ -34,6 +34,8 @@ namespace omnitrace { +namespace component +{ struct exit_gotcha : tim::component::base { using gotcha_data = tim::component::gotcha_data; @@ -49,11 +51,15 @@ struct exit_gotcha : tim::component::base static void configure(); static void shutdown(); + static inline void start() {} + static inline void stop() {} + // exit void operator()(const gotcha_data&, exit_func_t, int) const; // abort void operator()(const gotcha_data&, abort_func_t) const; }; +} // namespace component -using exit_gotcha_t = tim::component::gotcha<3, std::tuple<>, exit_gotcha>; 
+using exit_gotcha_t = tim::component::gotcha<3, std::tuple<>, component::exit_gotcha>; } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/fork_gotcha.cpp b/source/lib/omnitrace/library/components/fork_gotcha.cpp index 1cfc6a0cb..cd4bd31f7 100644 --- a/source/lib/omnitrace/library/components/fork_gotcha.cpp +++ b/source/lib/omnitrace/library/components/fork_gotcha.cpp @@ -32,6 +32,8 @@ namespace omnitrace { +namespace component +{ void fork_gotcha::configure() { @@ -57,8 +59,9 @@ fork_gotcha::audit(const gotcha_data_t&, audit::outgoing, pid_t _pid) if(_pid != 0) { OMNITRACE_VERBOSE(1, "fork() called on PID %i created PID %i\n", getppid(), _pid); - tim::settings::use_output_suffix() = true; - tim::settings::default_process_suffix() = process::get_id(); + settings::use_output_suffix() = true; + settings::default_process_suffix() = process::get_id(); } } +} // namespace component } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/fork_gotcha.hpp b/source/lib/omnitrace/library/components/fork_gotcha.hpp index 55964e287..7cdcc8f70 100644 --- a/source/lib/omnitrace/library/components/fork_gotcha.hpp +++ b/source/lib/omnitrace/library/components/fork_gotcha.hpp @@ -28,6 +28,8 @@ namespace omnitrace { +namespace component +{ // this is used to wrap fork() struct fork_gotcha : comp::base { @@ -51,6 +53,8 @@ struct fork_gotcha : comp::base static inline void start() {} static inline void stop() {} }; +} // namespace component -using fork_gotcha_t = comp::gotcha<4, tim::component_tuple, api::omnitrace>; +using fork_gotcha_t = + comp::gotcha<4, tim::component_tuple, project::omnitrace>; } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/functors.hpp b/source/lib/omnitrace/library/components/functors.hpp deleted file mode 100644 index 880c47b54..000000000 --- a/source/lib/omnitrace/library/components/functors.hpp +++ /dev/null @@ -1,177 +0,0 @@ -// MIT License -// -// Copyright (c) 2022 Advanced 
Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include "library/components/fwd.hpp" -#include "library/defines.hpp" -#include "library/runtime.hpp" -#include "library/state.hpp" -#include "library/timemory.hpp" - -#include -#include -#include - -#include -#include - -namespace omnitrace -{ -namespace component -{ -template -using enable_if_t = typename std::enable_if::type; - -template -static auto get_default_functor(tim::type_list) -{ - return [](Tp...) 
{}; -}; - -// timemory component which calls omnitrace functions -// (used in gotcha wrappers) -template -struct functors : comp::base, void> -{ - using this_type = functors; - using base_type = comp::base, void>; - using pair_type = std::pair; - - static constexpr bool begin_supports_cstr = - std::is_invocable::value; - static constexpr bool end_supports_cstr = - std::is_invocable::value; - - static constexpr bool begin_supports_void = std::is_invocable::value; - static constexpr bool end_supports_void = std::is_invocable::value; - - static void preinit(); - static void configure(StartFuncT&& _beg, StopFuncT&& _end); - static std::string label(); - - template 0) && - std::is_invocable_v), - int> = 0> - static auto start(Args&&... _args) - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().first(std::forward(_args)...); - } - - template 0) && - std::is_invocable_v), - int> = 0> - static auto stop(Args&&... _args) - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().second(std::forward(_args)...); - } - - TIMEMORY_DEFAULT_OBJECT(functors) - - template = 0> - void start() - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().first(m_prefix); - } - - template = 0> - void stop() - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().second(m_prefix); - } - - template = 0> - void start() - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().first(); - } - - template = 0> - void stop() - { - OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - get_functors().second(); - } - - template = 0> - void set_prefix(const char* _v) - { - m_prefix = _v; - } - -private: - static bool& is_configured(); - static pair_type& get_functors(); - -private: - const char* m_prefix = nullptr; -}; - -template -void -functors::preinit() -{ - using start_args_t = typename tim::mpl::function_traits::args_type; - using stop_args_t = typename tim::mpl::function_traits::args_type; - 
get_functors().first = get_default_functor(start_args_t{}); - get_functors().second = get_default_functor(stop_args_t{}); -} - -template -void -functors::configure(StartFuncT&& _beg, StopFuncT&& _end) -{ - is_configured() = true; - get_functors().first = std::forward(_beg); - get_functors().second = std::forward(_end); -} - -template -std::string -functors::label() -{ - return trait::name::value; -} - -template -bool& -functors::is_configured() -{ - static bool _v = false; - return _v; -} - -template -typename functors::pair_type& -functors::get_functors() -{ - static auto _v = pair_type{}; - return _v; -} -} // namespace component -} // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index 300c664f4..4d3ad2ec9 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -22,11 +22,13 @@ #pragma once +#include "library/categories.hpp" #include "library/common.hpp" #include "library/defines.hpp" #include #include +#include #include #include #include @@ -37,78 +39,24 @@ #include -TIMEMORY_DEFINE_NS_API(project, omnitrace) -TIMEMORY_DEFINE_NS_API(category, process_sampling) +OMNITRACE_DECLARE_COMPONENT(roctracer) +OMNITRACE_DECLARE_COMPONENT(rocprofiler) +OMNITRACE_DECLARE_COMPONENT(rcclp_handle) +OMNITRACE_DECLARE_COMPONENT(comm_data) -TIMEMORY_DECLARE_COMPONENT(roctracer) -TIMEMORY_DECLARE_COMPONENT(rocprofiler) -TIMEMORY_DECLARE_COMPONENT(rcclp_handle) -TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl) -TIMEMORY_COMPONENT_ALIAS(comm_data_tracker_t, data_tracker) -TIMEMORY_DECLARE_COMPONENT(comm_data) - -/// \struct tim::trait::name -/// \brief provides a constexpr string in ::value -TIMEMORY_DECLARE_TYPE_TRAIT(name, typename Tp) - -#define TIMEMORY_DEFINE_NAME_TRAIT(NAME, ...) 
\ - namespace tim \ - { \ - namespace trait \ - { \ - template <> \ - struct name<__VA_ARGS__> \ - { \ - static constexpr auto value = NAME; \ - }; \ - template <> \ - struct name> : name<__VA_ARGS__> \ - {}; \ - } \ - } - -TIMEMORY_DEFINE_NS_API(category, host) -TIMEMORY_DEFINE_NS_API(category, user) -TIMEMORY_DEFINE_NS_API(category, device) -TIMEMORY_DEFINE_NS_API(category, device_hip) -TIMEMORY_DEFINE_NS_API(category, device_hsa) -TIMEMORY_DEFINE_NS_API(category, rocm_hip) -TIMEMORY_DEFINE_NS_API(category, rocm_hsa) -TIMEMORY_DEFINE_NS_API(category, rocm_smi) -TIMEMORY_DEFINE_NS_API(category, rocm_roctx) -TIMEMORY_DEFINE_NS_API(category, pthread) -TIMEMORY_DEFINE_NS_API(category, kokkos) -TIMEMORY_DEFINE_NS_API(category, mpi) -TIMEMORY_DEFINE_NS_API(category, ompt) -TIMEMORY_DEFINE_NS_API(category, rccl) -TIMEMORY_DEFINE_NS_API(category, critical_trace) -TIMEMORY_DEFINE_NS_API(category, host_critical_trace) -TIMEMORY_DEFINE_NS_API(category, device_critical_trace) - -TIMEMORY_DEFINE_NAME_TRAIT("host", category::host); -TIMEMORY_DEFINE_NAME_TRAIT("device", category::device); -TIMEMORY_DEFINE_NAME_TRAIT("device_hip", category::device_hip); -TIMEMORY_DEFINE_NAME_TRAIT("device_hsa", category::device_hsa); -TIMEMORY_DEFINE_NAME_TRAIT("user", category::user); -TIMEMORY_DEFINE_NAME_TRAIT("rocm_hip", category::rocm_hip); -TIMEMORY_DEFINE_NAME_TRAIT("rocm_hsa", category::rocm_hsa); -TIMEMORY_DEFINE_NAME_TRAIT("rocm_smi", category::rocm_smi); -TIMEMORY_DEFINE_NAME_TRAIT("rocm_roctx", category::rocm_roctx); -TIMEMORY_DEFINE_NAME_TRAIT("sampling", category::sampling); -TIMEMORY_DEFINE_NAME_TRAIT("thread_sampling", category::thread_sampling); -TIMEMORY_DEFINE_NAME_TRAIT("pthread", category::pthread); -TIMEMORY_DEFINE_NAME_TRAIT("kokkos", category::kokkos); -TIMEMORY_DEFINE_NAME_TRAIT("mpi", category::mpi); -TIMEMORY_DEFINE_NAME_TRAIT("ompt", category::ompt); -TIMEMORY_DEFINE_NAME_TRAIT("rccl", category::rccl); -TIMEMORY_DEFINE_NAME_TRAIT("critical-trace", 
category::critical_trace); -TIMEMORY_DEFINE_NAME_TRAIT("host-critical-trace", category::host_critical_trace); -TIMEMORY_DEFINE_NAME_TRAIT("device-critical-trace", category::device_critical_trace); +OMNITRACE_COMPONENT_ALIAS(comm_data_tracker_t, + ::tim::component::data_tracker) namespace omnitrace { +namespace policy = ::tim::policy; // NOLINT +namespace comp = ::tim::component; // NOLINT + namespace component { +template +using base = ::tim::component::base; + template using data_tracker = tim::component::data_tracker; @@ -117,9 +65,9 @@ using functor_t = std::function; using default_functor_t = functor_t; -struct omnitrace; -struct user_region; struct backtrace; +struct backtrace_metrics; +struct backtrace_timestamp; struct backtrace_wall_clock {}; struct backtrace_cpu_clock @@ -141,8 +89,6 @@ using sampling_gpu_busy = data_tracker; using sampling_gpu_temp = data_tracker; using sampling_gpu_power = data_tracker; using sampling_gpu_memory = data_tracker; -using roctracer = tim::component::roctracer; -using rocprofiler = tim::component::rocprofiler; template @@ -151,49 +97,40 @@ struct functors; } // namespace omnitrace #if !defined(OMNITRACE_USE_ROCTRACER) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) #endif #if !defined(OMNITRACE_USE_ROCPROFILER) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) #endif #if !defined(OMNITRACE_USE_RCCL) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, category::rocm_rccl, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) #endif #if !defined(OMNITRACE_USE_RCCL) && !defined(OMNITRACE_USE_MPI) 
-TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data_tracker_t, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data_tracker_t, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data, false_type) #endif #if !defined(TIMEMORY_USE_LIBUNWIND) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_wall_clock, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_cpu_clock, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_percent, - false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, category::sampling, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::backtrace, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::backtrace_metrics, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::backtrace_timestamp, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_wall_clock, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_cpu_clock, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_percent, false_type) #endif #if !defined(TIMEMORY_USE_LIBUNWIND) || !defined(OMNITRACE_USE_ROCM_SMI) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_gpu_busy, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_gpu_temp, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_gpu_power, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::sampling_gpu_memory, - false_type) 
+OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_busy, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_temp, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_power, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::sampling_gpu_memory, false_type) #endif -TIMEMORY_SET_COMPONENT_API(omnitrace::component::omnitrace, project::omnitrace, - category::dynamic_instrumentation, os::supports_linux) -TIMEMORY_SET_COMPONENT_API(omnitrace::component::user_region, project::omnitrace, - os::supports_linux) TIMEMORY_SET_COMPONENT_API(omnitrace::component::roctracer, project::omnitrace, tpls::rocm, device::gpu, os::supports_linux, category::external) @@ -223,10 +160,6 @@ TIMEMORY_SET_COMPONENT_API(omnitrace::component::sampling_gpu_temp, project::omn category::temperature, category::sampling, category::process_sampling) -TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::omnitrace, OMNITRACE_COMPONENT, - "omnitrace", "omnitrace_component") -TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::user_region, OMNITRACE_USER_REGION, - "user_region", "omnitrace_user_region") TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::roctracer, OMNITRACE_ROCTRACER, "roctracer", "omnitrace_roctracer") TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::rocprofiler, OMNITRACE_ROCPROFILER, @@ -248,15 +181,6 @@ TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_gpu_power, TIMEMORY_PROPERTY_SPECIALIZATION(omnitrace::component::sampling_gpu_temp, OMNITRACE_SAMPLING_GPU_TEMP, "sampling_gpu_temp", "") -TIMEMORY_METADATA_SPECIALIZATION( - omnitrace::component::omnitrace, "omnitrace", - "Invokes instrumentation functions 'omnitrace_push_trace' and 'omnitrace_pop_trace'", - "Used by gotcha wrappers") -TIMEMORY_METADATA_SPECIALIZATION( - omnitrace::component::user_region, "user_region", - "Invokes instrumentation functions 'omnitrace_user_push_region' and " - 
"'omnitrace_user_pop_region'", - "Used by OMPT") TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::roctracer, "roctracer", "High-precision ROCm API and kernel tracing", "") TIMEMORY_METADATA_SPECIALIZATION(omnitrace::component::rocprofiler, "rocprofiler", @@ -292,44 +216,55 @@ TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_busy, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_temp, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_power, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_memory, double) -TIMEMORY_STATISTICS_TYPE(component::comm_data_tracker_t, float) +TIMEMORY_STATISTICS_TYPE(omnitrace::component::comm_data_tracker_t, float) // enable timing units -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, - omnitrace::component::sampling_wall_clock, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, - omnitrace::component::sampling_cpu_clock, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, omnitrace::component::sampling_percent, - true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, - omnitrace::component::sampling_wall_clock, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, - omnitrace::component::sampling_cpu_clock, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_timing_category, component::sampling_wall_clock, + true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_timing_category, component::sampling_cpu_clock, + true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_timing_category, component::sampling_percent, + true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::sampling_wall_clock, + true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::sampling_cpu_clock, + true_type) // enable percent units -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_percent_units, - omnitrace::component::sampling_gpu_busy, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_percent_units, component::sampling_gpu_busy, + true_type) // enable 
memory units -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, - omnitrace::component::sampling_gpu_memory, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, - omnitrace::component::sampling_gpu_memory, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_memory_category, component::sampling_gpu_memory, + true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::sampling_gpu_memory, + true_type) // reporting categories (sum) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_sum, omnitrace::component::sampling_gpu_busy, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_sum, omnitrace::component::sampling_gpu_temp, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_sum, omnitrace::component::sampling_gpu_power, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_sum, omnitrace::component::sampling_gpu_memory, - false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_busy, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_temp, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_power, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_sum, component::sampling_gpu_memory, false_type) // reporting categories (mean) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_mean, omnitrace::component::sampling_percent, - false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_type) // reporting categories (stats) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_statistics, omnitrace::component::sampling_percent, - false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_statistics, component::sampling_percent, + false_type) + +#define OMNITRACE_DECLARE_EXTERN_COMPONENT(NAME, HAS_DATA, ...) 
\ + TIMEMORY_DECLARE_EXTERN_TEMPLATE( \ + struct tim::component::base) \ + TIMEMORY_DECLARE_EXTERN_OPERATIONS(TIMEMORY_ESC(omnitrace::component::NAME), \ + HAS_DATA) \ + TIMEMORY_DECLARE_EXTERN_STORAGE(TIMEMORY_ESC(omnitrace::component::NAME)) + +#define OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(NAME, HAS_DATA, ...) \ + TIMEMORY_INSTANTIATE_EXTERN_TEMPLATE( \ + struct tim::component::base) \ + TIMEMORY_INSTANTIATE_EXTERN_OPERATIONS(TIMEMORY_ESC(omnitrace::component::NAME), \ + HAS_DATA) \ + TIMEMORY_INSTANTIATE_EXTERN_STORAGE(TIMEMORY_ESC(omnitrace::component::NAME)) diff --git a/source/lib/omnitrace/library/components/mpi_gotcha.cpp b/source/lib/omnitrace/library/components/mpi_gotcha.cpp index 02eca94b0..21cf09ca4 100644 --- a/source/lib/omnitrace/library/components/mpi_gotcha.cpp +++ b/source/lib/omnitrace/library/components/mpi_gotcha.cpp @@ -21,7 +21,7 @@ // SOFTWARE. #include "library/components/mpi_gotcha.hpp" -#include "library/api.hpp" +#include "api.hpp" #include "library/components/category_region.hpp" #include "library/components/comm_data.hpp" #include "library/components/fwd.hpp" @@ -41,11 +41,12 @@ namespace omnitrace { +namespace component +{ namespace { using mpip_bundle_t = - tim::component_tuple, - comp::comm_data>; + tim::component_tuple, comp::comm_data>; struct comm_rank_data { @@ -111,7 +112,7 @@ omnitrace_mpi_set_attr() static auto _mpi_fini = [](MPI_Comm, int, void*, void*) { OMNITRACE_DEBUG("MPI Comm attribute finalize\n"); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip(mpip_index); + comp::deactivate_mpip(mpip_index); omnitrace_finalize_hidden(); return MPI_SUCCESS; }; @@ -199,8 +200,6 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) { OMNITRACE_BASIC_DEBUG_F("%s(int*, char***)\n", _data.tool_id.c_str()); - if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); - omnitrace_push_trace_hidden(_data.tool_id.c_str()); #if !defined(TIMEMORY_USE_MPI) && 
defined(TIMEMORY_USE_MPI_HEADERS) tim::mpi::is_initialized_callback() = []() { return true; }; @@ -213,8 +212,6 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in { OMNITRACE_BASIC_DEBUG_F("%s(int*, char***, int, int*)\n", _data.tool_id.c_str()); - if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); - omnitrace_push_trace_hidden(_data.tool_id.c_str()); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) tim::mpi::is_initialized_callback() = []() { return true; }; @@ -228,7 +225,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str()); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip(mpip_index); + comp::deactivate_mpip(mpip_index); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) tim::mpi::is_initialized_callback() = []() { return false; }; @@ -278,15 +275,10 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n"); - if(!get_use_timemory()) - { - trait::runtime_enabled::set(false); - trait::runtime_enabled::set(false); - } // use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST // to control the gotcha bindings at runtime - comp::configure_mpip(); - mpip_index = comp::activate_mpip(); + comp::configure_mpip(); + mpip_index = comp::activate_mpip(); } auto_lock_t _lk{ type_mutex() }; @@ -344,6 +336,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) } omnitrace_pop_trace_hidden(_data.tool_id.c_str()); } +} // namespace component } // namespace omnitrace -TIMEMORY_INITIALIZE_STORAGE(omnitrace::mpi_gotcha) +TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::mpi_gotcha) diff --git a/source/lib/omnitrace/library/components/mpi_gotcha.hpp b/source/lib/omnitrace/library/components/mpi_gotcha.hpp index 4b7cae7d2..7c58d4a7b 100644 --- 
a/source/lib/omnitrace/library/components/mpi_gotcha.hpp +++ b/source/lib/omnitrace/library/components/mpi_gotcha.hpp @@ -30,6 +30,8 @@ namespace omnitrace { +namespace component +{ // this is used to wrap MPI_Init and MPI_Init_thread struct mpi_gotcha : comp::base { @@ -76,6 +78,8 @@ struct mpi_gotcha : comp::base int* m_size_ptr = nullptr; uintptr_t m_comm_val = null_comm(); }; +} // namespace component -using mpi_gotcha_t = comp::gotcha<5, tim::component_tuple, api::omnitrace>; +using mpi_gotcha_t = + comp::gotcha<5, tim::component_tuple, project::omnitrace>; } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/omnitrace.cpp b/source/lib/omnitrace/library/components/omnitrace.cpp deleted file mode 100644 index 3b7b24dc5..000000000 --- a/source/lib/omnitrace/library/components/omnitrace.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// MIT License -// -// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "library/components/omnitrace.hpp" -#include "library/api.hpp" - -namespace omnitrace -{ -namespace component -{ -void -omnitrace::start() -{ - if(m_prefix) omnitrace_push_trace_hidden(m_prefix); -} - -void -omnitrace::stop() -{ - if(m_prefix) omnitrace_pop_trace_hidden(m_prefix); -} - -void -omnitrace::set_prefix(const char* _prefix) -{ - m_prefix = _prefix; -} -} // namespace component -} // namespace omnitrace - -TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::omnitrace) diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp index 0a1324deb..bfc4e3991 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -22,16 +22,19 @@ #include "library/components/pthread_create_gotcha.hpp" #include "library/components/category_region.hpp" -#include "library/components/omnitrace.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/components/roctracer.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" +#include "library/state.hpp" #include "library/thread_data.hpp" +#include "library/thread_info.hpp" +#include "library/utility.hpp" #include +#include #include #include #include @@ -49,13 +52,10 @@ std::set shutdown(); } // namespace sampling -namespace mpl = tim::mpl; - -using bundle_t = tim::lightweight_tuple; -using wall_pw_t = mpl::piecewise_select; // only wall-clock -using main_pw_t = mpl::piecewise_ignore; // exclude wall-clock -using category_region_t = - tim::lightweight_tuple>; +namespace component +{ 
+using bundle_t = tim::lightweight_tuple; +using category_region_t = tim::lightweight_tuple>; namespace { @@ -63,7 +63,7 @@ auto* is_shutdown = new bool{ false }; // intentional data leak auto* bundles = new std::map>{}; auto* bundles_mutex = new std::mutex{}; auto bundles_dtor = scope::destructor{ []() { - omnitrace::pthread_create_gotcha::shutdown(); + pthread_create_gotcha::shutdown(); delete bundles; delete bundles_mutex; bundles = nullptr; @@ -105,12 +105,12 @@ stop_bundle(bundle_t& _bundle, int64_t _tid, Args&&... _args) _bundle.key().c_str(), _tid); if(get_use_timemory()) { - _bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value + auto _wc = *_bundle.get(); + _wc.stop(); // update roctracer_data - _bundle.store(std::plus{}, - _bundle.get()->get() * units::sec); - // stop all other components including roctracer_data after update - _bundle.stop(main_pw_t{}); + _bundle.store(std::plus{}, _wc.get() * _wc.unit()); + // stop all + _bundle.stop(); // exclude popping wall-clock _bundle.pop(_tid); } @@ -154,15 +154,16 @@ pthread_create_gotcha::wrapper::operator()() const return m_routine(m_arg); } - push_thread_state(omnitrace::ThreadState::Internal); + push_thread_state(ThreadState::Internal); int64_t _tid = -1; void* _ret = nullptr; auto _is_sampling = false; auto _bundle = std::shared_ptr{}; auto _signals = std::set{}; - auto _coverage = (get_mode() == omnitrace::Mode::Coverage); - auto _dtor = [&]() { + auto _coverage = (get_mode() == Mode::Coverage); + // const auto& _parent_info = thread_info::get(m_parent_tid, LookupTID); + auto _dtor = [&]() { set_thread_state(ThreadState::Internal); if(_is_sampling) { @@ -172,11 +173,11 @@ pthread_create_gotcha::wrapper::operator()() const if(_tid >= 0) { - auto _active = (get_state() == omnitrace::State::Active && + auto _active = (get_state() == ::omnitrace::State::Active && bundles != nullptr && bundles_mutex != nullptr); if(!_active) return; - get_execution_time(_tid)->second = 
comp::wall_clock::record(); - auto& _thr_bundle = thread_bundle_data_t::instance(); + thread_info::set_stop(comp::wall_clock::record()); + auto& _thr_bundle = thread_bundle_data_t::instance(); if(_thr_bundle && _thr_bundle->get() && _thr_bundle->get()->get_is_running()) _thr_bundle->stop(); @@ -185,18 +186,19 @@ pthread_create_gotcha::wrapper::operator()() const } }; - auto _active = (get_state() == omnitrace::State::Active && bundles != nullptr && + auto _active = (get_state() == ::omnitrace::State::Active && bundles != nullptr && bundles_mutex != nullptr); if(_active && !_coverage) { - _tid = threading::get_id(); + const auto& _tid_index = thread_info::init(); + _tid = _tid_index->index_data->internal_value; threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); if(!thread_bundle_data_t::instances().at(_tid)) { thread_data::construct( TIMEMORY_JOIN('/', "omnitrace/process", process::get_id(), "thread", - threading::get_id()), + _tid), quirk::config{}); thread_bundle_data_t::instances().at(_tid)->start(); } @@ -207,12 +209,9 @@ pthread_create_gotcha::wrapper::operator()() const .first->second; } if(_bundle) start_bundle(*_bundle); - get_execution_time(_tid)->first = comp::wall_clock::record(); get_cpu_cid_stack(_tid, m_parent_tid); if(m_enable_sampling) { - // initialize thread-local statics - (void) tim::get_unw_backtrace<12, 1, false>(); _is_sampling = true; pthread_gotcha::push_enable_sampling_on_child_threads(false); _signals = sampling::setup(); @@ -220,6 +219,10 @@ pthread_create_gotcha::wrapper::operator()() const sampling::unblock_signals(); } } + else + { + thread_info::init(true); + } // notify the wrapper that all internal work is completed if(m_promise) m_promise->set_value(); @@ -227,7 +230,7 @@ pthread_create_gotcha::wrapper::operator()() const // Internal -> Enabled pop_thread_state(); - push_thread_state(omnitrace::ThreadState::Enabled); + push_thread_state(ThreadState::Enabled); // execute the original function _ret = 
m_routine(m_arg); @@ -237,7 +240,7 @@ pthread_create_gotcha::wrapper::operator()() const // execute the destructor actions _dtor(); - set_thread_state(omnitrace::ThreadState::Completed); + set_thread_state(ThreadState::Completed); return _ret; } @@ -324,48 +327,50 @@ int pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, void* (*start_routine)(void*), void* arg) const { - auto _initial_thread_state = get_thread_state(); + auto _disabled = (get_thread_state() == ThreadState::Disabled); + auto _enabled = (get_thread_state() == ThreadState::Enabled); + auto _bundle = std::optional{}; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - bundle_t _bundle{ "pthread_create" }; - auto _enable_sampling = pthread_gotcha::sampling_enabled_on_child_threads(); - auto _coverage = (get_mode() == omnitrace::Mode::Coverage); - auto _active = (get_state() == omnitrace::State::Active); - int64_t _tid = (_active) ? threading::get_id() : 0; - if(_active) + auto _active = (get_state() == ::omnitrace::State::Active && !_disabled); + auto _coverage = (get_mode() == Mode::Coverage); + auto _use_sampling = get_use_sampling(); + auto _sample_child = pthread_gotcha::sampling_enabled_on_child_threads(); + auto _tid = utility::get_thread_index(); + auto _use_bundle = (_active && !_coverage); + const auto& _info = thread_info::init(!_active || !_sample_child || _disabled); + auto _enable_sampling = + (!_disabled && _enabled && _sample_child && _use_sampling && !_info->is_offset); + + if(_active && !_disabled && !_info->is_offset) { OMNITRACE_VERBOSE(1, "Creating new thread on PID %i (rank: %i), TID %li\n", process::get_id(), dmp::rank(), _tid); } // ensure that cpu cid stack exists on the parent thread if active - if(!_coverage && _active) get_cpu_cid_stack(); + if(_active && !_coverage) get_cpu_cid_stack(); - if(!get_use_sampling() || !_enable_sampling) - { - auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr); - if(_active && 
!_coverage && _enable_sampling && - _initial_thread_state == ThreadState::Enabled) - start_bundle(_bundle, audit::incoming{}, thread, attr, start_routine, arg); - // create the thread - auto _ret = (*m_wrappee)(thread, attr, &wrapper::wrap, static_cast(_obj)); - if(_active && !_coverage && _enable_sampling && - _initial_thread_state == ThreadState::Enabled) - stop_bundle(_bundle, _tid, audit::outgoing{}, _ret); - return _ret; - } - - // block the signals in entire process - OMNITRACE_DEBUG("blocking signals...\n"); - auto _blocked_signals = get_sampling_signals(); - tim::sampling::block_signals(_blocked_signals, tim::sampling::sigmask_scope::process); - - start_bundle(_bundle, audit::incoming{}, thread, attr, start_routine, arg); - - // promise set by thread when signal handler is configured + set_thread_state(ThreadState::Disabled); + auto _blocked = get_sampling_signals(); auto _promise = std::promise{}; auto _fut = _promise.get_future(); auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise); + set_thread_state(ThreadState::Internal); + + // block the signals in entire process + if(_enable_sampling && !_blocked.empty()) + { + OMNITRACE_DEBUG("blocking signals...\n"); + tim::sampling::block_signals(_blocked, tim::sampling::sigmask_scope::process); + } + + if(_use_bundle) + { + _bundle = bundle_t{ "pthread_create" }; + start_bundle(*_bundle, audit::incoming{}, thread, attr, start_routine, arg); + } // create the thread auto _ret = (*m_wrappee)(thread, attr, &wrapper::wrap, static_cast(_wrap)); @@ -374,21 +379,19 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, OMNITRACE_DEBUG("waiting for child to signal it is setup...\n"); _fut.wait(); - stop_bundle(_bundle, threading::get_id(), audit::outgoing{}, _ret); + if(_use_bundle) stop_bundle(*_bundle, threading::get_id(), audit::outgoing{}, _ret); // unblock the signals in the entire process - OMNITRACE_DEBUG("unblocking signals...\n"); - 
tim::sampling::unblock_signals(_blocked_signals, - tim::sampling::sigmask_scope::process); + if(_enable_sampling && !_blocked.empty()) + { + OMNITRACE_DEBUG("unblocking signals...\n"); + tim::sampling::unblock_signals(_blocked, tim::sampling::sigmask_scope::process); + } OMNITRACE_DEBUG("returning success...\n"); return _ret; } - -bool -pthread_create_gotcha::is_valid_execution_time(int64_t _tid, uint64_t _ts) -{ - return (_ts >= get_execution_time(_tid)->first && - _ts <= get_execution_time(_tid)->second); -} +} // namespace component } // namespace omnitrace + +TIMEMORY_INITIALIZE_STORAGE(component::roctracer_data) diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp index 29d675db0..459f83476 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp @@ -32,6 +32,8 @@ namespace omnitrace { +namespace component +{ struct pthread_create_gotcha : tim::component::base { using routine_t = void* (*) (void*); @@ -68,26 +70,13 @@ struct pthread_create_gotcha : tim::component::base int operator()(pthread_t* thread, const pthread_attr_t* attr, void* (*start_routine)(void*), void* arg) const; - static auto& get_execution_time(int64_t _tid = threading::get_id()); - static bool is_valid_execution_time(int64_t _tid, uint64_t _ts); - void set_data(wrappee_t); private: wrappee_t m_wrappee = &pthread_create; }; -inline auto& -pthread_create_gotcha::get_execution_time(int64_t _tid) -{ - struct omnitrace_thread_exec_time - {}; - using data_t = std::pair; - using thread_data_t = thread_data; - static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); - return _v.at(_tid); -} - using pthread_create_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>; +} // namespace component } // namespace omnitrace diff --git 
a/source/lib/omnitrace/library/components/pthread_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_gotcha.cpp index da79a5d0b..1fea0d3c0 100644 --- a/source/lib/omnitrace/library/components/pthread_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_gotcha.cpp @@ -21,7 +21,6 @@ // SOFTWARE. #include "library/components/pthread_gotcha.hpp" -#include "library/components/omnitrace.hpp" #include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_mutex_gotcha.hpp" #include "library/components/roctracer.hpp" @@ -33,7 +32,6 @@ #include "library/utility.hpp" #include -#include #include #include @@ -45,7 +43,8 @@ namespace omnitrace { namespace { -using bundle_t = tim::lightweight_tuple; +using bundle_t = tim::lightweight_tuple; auto& get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) @@ -62,6 +61,8 @@ get_bundle() if(!_v) _v = std::make_unique("pthread_gotcha"); return _v; } + +bool is_configured = false; } // namespace //--------------------------------------------------------------------------------------// @@ -69,15 +70,23 @@ get_bundle() void pthread_gotcha::configure() { - pthread_create_gotcha::configure(); - pthread_mutex_gotcha::configure(); + if(!is_configured) + { + ::omnitrace::component::pthread_create_gotcha::configure(); + ::omnitrace::component::pthread_mutex_gotcha::configure(); + is_configured = true; + } } void pthread_gotcha::shutdown() { - pthread_create_gotcha::shutdown(); - pthread_mutex_gotcha::shutdown(); + if(is_configured) + { + ::omnitrace::component::pthread_mutex_gotcha::shutdown(); + ::omnitrace::component::pthread_create_gotcha::shutdown(); + is_configured = false; + } } bool @@ -89,10 +98,10 @@ pthread_gotcha::sampling_enabled_on_child_threads() bool pthread_gotcha::push_enable_sampling_on_child_threads(bool _v) { - auto& _hist = get_sampling_on_child_threads_history(); - bool _last = sampling_on_child_threads(); - _hist.emplace_back(_last); + bool _last 
= sampling_on_child_threads(); sampling_on_child_threads() = _v; + auto& _hist = get_sampling_on_child_threads_history(); + _hist.emplace_back(_last); return _last; } @@ -128,6 +137,7 @@ pthread_gotcha::sampling_on_child_threads() void pthread_gotcha::start() { + configure(); get_bundle()->start(); } @@ -136,5 +146,6 @@ pthread_gotcha::stop() { get_bundle()->stop(); get_bundle().reset(); + shutdown(); } } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index 87c2a1d4a..c1587b6ed 100644 --- a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -21,7 +21,6 @@ // SOFTWARE. #include "library/components/pthread_mutex_gotcha.hpp" -#include "library.hpp" #include "library/components/category_region.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" @@ -41,6 +40,8 @@ namespace omnitrace { +namespace component +{ using Device = critical_trace::Device; using Phase = critical_trace::Phase; @@ -98,7 +99,7 @@ pthread_mutex_gotcha::configure() pthread_mutex_gotcha_t::get_initializer() = []() { if(config::get_trace_thread_locks()) { - pthread_mutex_gotcha::validate(); + validate(); pthread_mutex_gotcha_t::configure( comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" }); @@ -186,23 +187,39 @@ pthread_mutex_gotcha::validate() } } +pthread_mutex_gotcha::pthread_mutex_gotcha(const gotcha_data_t& _data) +: m_data{ &_data } +{} + template auto -pthread_mutex_gotcha::operator()(uintptr_t&& _id, const comp::gotcha_data& _data, - int (*_callee)(Args...), Args... _args) const +pthread_mutex_gotcha::operator()(uintptr_t&& _id, int (*_callee)(Args...), + Args... _args) const { - using bundle_t = omnitrace::component::category_region; + using bundle_t = category_region; if(is_disabled()) { if(!_callee) { - OMNITRACE_PRINT("Warning! 
nullptr to %s\n", _data.tool_id.c_str()); + if(m_data) + { + OMNITRACE_PRINT("Warning! nullptr to %s\n", m_data->tool_id.c_str()); + } return EINVAL; } return (*_callee)(_args...); } + struct local_dtor + { + explicit local_dtor(bool& _v) + : _protect{ _v } + {} + ~local_dtor() { _protect = false; } + bool& _protect; + } _dtor{ m_protect = true }; + uint64_t _cid = 0; uint64_t _parent_cid = 0; uint32_t _depth = 0; @@ -216,15 +233,15 @@ pthread_mutex_gotcha::operator()(uintptr_t&& _id, const comp::gotcha_data& _data _ts = comp::wall_clock::record(); } - bundle_t::audit(_data, audit::incoming{}, _args...); + bundle_t::audit(std::string_view{ m_data->tool_id }, audit::incoming{}, _args...); auto _ret = (*_callee)(_args...); - bundle_t::audit(_data, audit::outgoing{}, _ret); + bundle_t::audit(std::string_view{ m_data->tool_id }, audit::outgoing{}, _ret); if(_id < std::numeric_limits::max() && get_use_critical_trace()) { add_critical_trace( threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0, - _id, get_hashes().at(_data.index), _depth); + _id, get_hashes().at(m_data->index), _depth); } tim::consume_parameters(_id, _cid, _parent_cid, _depth, _ts); @@ -232,51 +249,69 @@ pthread_mutex_gotcha::operator()(uintptr_t&& _id, const comp::gotcha_data& _data } int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_mutex_t*), +pthread_mutex_gotcha::operator()(int (*_callee)(pthread_mutex_t*), pthread_mutex_t* _mutex) const { - return (*this)(reinterpret_cast(_mutex), _data, _callee, _mutex); + if(m_protect) return (*_callee)(_mutex); + return (*this)(reinterpret_cast(_mutex), _callee, _mutex); } int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_spinlock_t*), +pthread_mutex_gotcha::operator()(int (*_callee)(pthread_spinlock_t*), pthread_spinlock_t* _lock) const { - return (*this)(reinterpret_cast(_lock), _data, _callee, _lock); + if(m_protect) return (*_callee)(_lock); + return 
(*this)(reinterpret_cast(_lock), _callee, _lock); } int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_rwlock_t*), +pthread_mutex_gotcha::operator()(int (*_callee)(pthread_rwlock_t*), pthread_rwlock_t* _lock) const { - return (*this)(reinterpret_cast(_lock), _data, _callee, _lock); + if(m_protect) return (*_callee)(_lock); + return (*this)(reinterpret_cast(_lock), _callee, _lock); } int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_barrier_t*), +pthread_mutex_gotcha::operator()(int (*_callee)(pthread_barrier_t*), pthread_barrier_t* _barrier) const { - return (*this)(reinterpret_cast(_barrier), _data, _callee, _barrier); + if(m_protect) return (*_callee)(_barrier); + return (*this)(reinterpret_cast(_barrier), _callee, _barrier); } int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_t, void**), pthread_t _thr, +pthread_mutex_gotcha::operator()(int (*_callee)(pthread_t, void**), pthread_t _thr, void** _tinfo) const { - return (*this)(static_cast(threading::get_id()), _data, _callee, _thr, - _tinfo); + if(m_protect) return (*_callee)(_thr, _tinfo); + return (*this)(static_cast(threading::get_id()), _callee, _thr, _tinfo); } bool pthread_mutex_gotcha::is_disabled() { - return (omnitrace::get_state() != omnitrace::State::Active || - omnitrace::get_thread_state() != omnitrace::ThreadState::Enabled || + return (get_state() != ::omnitrace::State::Active || + get_thread_state() != ThreadState::Enabled || (get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads())); } +} // namespace component } // namespace omnitrace + +namespace tim +{ +namespace policy +{ +template +pthread_mutex_gotcha& +static_data::operator()( + std::integral_constant, const component::gotcha_data& _data) const +{ + using thread_data_t = + omnitrace::thread_data>; + static thread_local auto& _v = + thread_data_t::instance(omnitrace::construct_on_init{}, _data); + 
return *_v; +} +} // namespace policy +} // namespace tim diff --git a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp index facbd3715..bf1c31767 100644 --- a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp +++ b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp @@ -26,12 +26,17 @@ #include "library/defines.hpp" #include "library/timemory.hpp" +#include +#include + #include #include #include namespace omnitrace { +namespace component +{ // this is used to wrap pthread_mutex() struct pthread_mutex_gotcha : comp::base { @@ -41,6 +46,8 @@ struct pthread_mutex_gotcha : comp::base TIMEMORY_DEFAULT_OBJECT(pthread_mutex_gotcha) + explicit pthread_mutex_gotcha(const gotcha_data_t&); + // string id for component static std::string label() { return "pthread_mutex_gotcha"; } @@ -49,25 +56,44 @@ struct pthread_mutex_gotcha : comp::base static void shutdown(); static void validate(); - int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), - pthread_mutex_t*) const; - int operator()(const gotcha_data_t&, int (*)(pthread_spinlock_t*), - pthread_spinlock_t*) const; - int operator()(const gotcha_data_t&, int (*)(pthread_rwlock_t*), - pthread_rwlock_t*) const; - int operator()(const gotcha_data_t&, int (*)(pthread_barrier_t*), - pthread_barrier_t*) const; - int operator()(const gotcha_data_t&, int (*)(pthread_t, void**), pthread_t, - void**) const; + int operator()(int (*)(pthread_mutex_t*), pthread_mutex_t*) const; + int operator()(int (*)(pthread_spinlock_t*), pthread_spinlock_t*) const; + int operator()(int (*)(pthread_rwlock_t*), pthread_rwlock_t*) const; + int operator()(int (*)(pthread_barrier_t*), pthread_barrier_t*) const; + int operator()(int (*)(pthread_t, void**), pthread_t, void**) const; private: static bool is_disabled(); static hash_array_t& get_hashes(); template - auto operator()(uintptr_t&&, const gotcha_data_t&, int (*)(Args...), Args...) 
const; + auto operator()(uintptr_t&&, int (*)(Args...), Args...) const; + + mutable bool m_protect = false; + const gotcha_data_t* m_data = nullptr; }; using pthread_mutex_gotcha_t = comp::gotcha; + std::tuple<>, pthread_mutex_gotcha>; +} // namespace component } // namespace omnitrace + +OMNITRACE_DEFINE_CONCRETE_TRAIT(fast_gotcha, component::pthread_mutex_gotcha_t, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(static_data, component::pthread_mutex_gotcha_t, true_type) + +namespace tim +{ +namespace policy +{ +using pthread_mutex_gotcha = ::omnitrace::component::pthread_mutex_gotcha; +using pthread_mutex_gotcha_t = ::omnitrace::component::pthread_mutex_gotcha_t; + +template <> +struct static_data : std::true_type +{ + template + pthread_mutex_gotcha& operator()(std::integral_constant, + const component::gotcha_data& _data) const; +}; +} // namespace policy +} // namespace tim diff --git a/source/lib/omnitrace/library/components/rcclp.cpp b/source/lib/omnitrace/library/components/rcclp.cpp index 90611323b..13936f2db 100644 --- a/source/lib/omnitrace/library/components/rcclp.cpp +++ b/source/lib/omnitrace/library/components/rcclp.cpp @@ -33,7 +33,7 @@ operator<<(std::ostream& _os, const ncclUniqueId& _v) return _os; } -namespace tim +namespace omnitrace { namespace component { @@ -59,7 +59,7 @@ activate_rcclp() std::stringstream ss; ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); + << demangle(); tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); return 1; } @@ -75,7 +75,7 @@ deactivate_rcclp(uint64_t id) { std::stringstream ss; ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); + << demangle(); tim::manager::instance()->cleanup(ss.str()); return 0; } @@ -90,7 +90,7 @@ configure_rcclp(const std::set& permit, const std::set static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS; using rcclp_gotcha_t = - tim::component::gotcha; + tim::component::gotcha; static bool is_initialized = false; if(!is_initialized) @@ 
-197,4 +197,4 @@ rcclp_handle::get_tool_count() return get_persistent_data().m_count; } } // namespace component -} // namespace tim +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index 9f3c855b6..977c2f68e 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -49,18 +49,20 @@ # define OMNITRACE_NUM_RCCLP_WRAPPERS 25 #endif -TIMEMORY_COMPONENT_ALIAS( +OMNITRACE_COMPONENT_ALIAS( rccl_toolset_t, - component_bundle, - comm_data>) -TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t, - gotcha) + ::tim::component_bundle, + comm_data>) +OMNITRACE_COMPONENT_ALIAS(rcclp_gotcha_t, + ::tim::component::gotcha) #if !defined(OMNITRACE_USE_RCCL) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) #endif -namespace tim +namespace omnitrace { namespace component { @@ -106,4 +108,4 @@ struct rcclp_handle : base static std::atomic& get_tool_count(); }; } // namespace component -} // namespace tim +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/rocprofiler.cpp b/source/lib/omnitrace/library/components/rocprofiler.cpp index dffff72c4..cf3d54235 100644 --- a/source/lib/omnitrace/library/components/rocprofiler.cpp +++ b/source/lib/omnitrace/library/components/rocprofiler.cpp @@ -22,8 +22,6 @@ #include "library/components/rocprofiler.hpp" #include "library/common.hpp" -#include "library/components/pthread_create_gotcha.hpp" -#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" @@ -45,7 +43,7 @@ #include #include -namespace tim +namespace omnitrace { namespace component { @@ -59,10 +57,10 @@ rocprofiler_activity_count() } } // namespace -omnitrace::unique_ptr_t& +unique_ptr_t& rocm_data(int64_t _tid) { - using 
thread_data_t = omnitrace::thread_data; + using thread_data_t = thread_data; static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); return _v.at(_tid); } @@ -179,153 +177,6 @@ rocprofiler::shutdown() { omnitrace::rocprofiler::post_process(); omnitrace::rocprofiler::rocm_cleanup(); - /* - using storage_type = typename rocprofiler_data::storage_type; - using bundle_t = rocprofiler_data; - using tag_t = api::omnitrace; - - auto _data = omnitrace::rocprofiler::get_data(); - auto _labels = omnitrace::rocprofiler::get_data_labels(); - auto _info = omnitrace::rocprofiler::rocm_metrics(); - int64_t _idx = 0; - auto _scope = tim::scope::get_default(); - - auto _get_metric_desc = [_info](std::string_view _v) { - for(auto itr : _info) - { - if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0) - return std::make_pair(itr.short_description(), itr.long_description()); - } - return std::make_pair(std::string{}, std::string{}); - }; - - auto _debug = settings::debug(); - settings::debug() = true; - - struct hw_counters - {}; - - using rocm_counter = omnitrace::rocprofiler::rocm_counter; - - struct perfetto_rocm_event - { - rocm_counter entry = {}; - rocm_counter exit = {}; - rocprofiler_value value = {}; - - bool operator<(const perfetto_rocm_event& _v) const - { - return (entry.at(0) == _v.entry.at(0)) ? 
exit.at(0) < _v.exit.at(0) - : entry.at(0) < _v.entry.at(0); - } - }; - - // contains the necessary info for export to perfetto - auto _perfetto_raw_data = - std::map>>{}; - // contains the time-stamp regions for the counter tracks - auto _perfetto_time_regions = - std::map>>{}; - - // create a layout compatible for exporting to perfetto - for(const auto& itr : _labels) - { - auto _dev_id = itr.first; - auto _dev_name = JOIN("", '[', _dev_id, ']'); - - for(size_t i = 0; i < itr.second.size(); ++i) - { - auto _metric_name = itr.second.at(i); - auto _idx = perfetto_counter_track::emplace( - _dev_id, JOIN(' ', "Device", _metric_name, _dev_name)); - auto& _raw = _perfetto_raw_data[_dev_id][_idx]; - auto& _reg = _perfetto_time_regions[_dev_id][_idx]; - for(const auto& ditr : _data) - { - _raw.emplace_back( - perfetto_rocm_event{ ditr.entry, ditr.exit, ditr.data.at(i) }); - } - std::sort(_raw.begin(), _raw.end()); - for(auto ritr : _raw) - { - if(pthread_create_gotcha::is_valid_execution_time(0, ritr.entry.at(0))) - _reg.emplace(ritr.entry.at(0)); - if(pthread_create_gotcha::is_valid_execution_time(0, ritr.exit.at(0))) - _reg.emplace(ritr.exit.at(0)); - } - } - } - - for(auto& ditr : _perfetto_time_regions) - for(auto& citr : ditr.second) - { - for(auto _ts = citr.second.begin(); _ts != citr.second.end(); ++_ts) - { - rocprofiler_value _v = {}; - auto _curr = _ts; - auto _next = std::next(_ts); - if(_next == citr.second.end()) continue; - auto _min_ts = *_curr; - auto _max_ts = (_next == citr.second.end()) ? 
*_curr : *_next; - for(auto itr : _perfetto_raw_data[ditr.first][citr.first]) - { - if(itr.entry[0] >= _min_ts && itr.exit[0] <= _max_ts) - { - using namespace tim::stl; - _v += itr.value; - } - } - - auto _write_counter = [&](auto _v) { - if(_min_ts == _max_ts) - { - using value_type = std::remove_reference_t< - std::remove_cv_t>>; - _v = static_cast(0); - } - - TRACE_COUNTER( - "hardware_counter", - perfetto_counter_track::at(ditr.first, citr.first), - _min_ts, _v); - }; - std::visit(_write_counter, _v); - } - } - - for(const auto& itr : _labels) - { - for(size_t i = 0; i < itr.second.size(); ++i) - { - auto _metric_name = itr.second.at(i); - auto _metric_desc = _get_metric_desc(_metric_name).second; - rocprofiler_data::label() = _metric_name; - if(!_metric_desc.empty()) - rocprofiler_data::description() = JOIN(" - ", "rocprof", _metric_desc); - auto _dev_id = itr.first; - auto _label = JOIN('-', "rocprofiler", _metric_name, "device", _dev_id); - storage_type _storage{ standalone_storage{}, ++_idx, _label }; - std::vector _bundles = {}; - _bundles.reserve(_data.size()); - for(const auto& ditr : _data) - { - auto _hash = add_hash_id(ditr.name); - auto _v = ditr.data.at(i); - auto _obj = std::tie(_bundles.emplace_back(bundle_t{})); - invoke::reset(_obj); - invoke::push(_obj, _scope, _hash, &_storage, _dev_id); - invoke::start(_obj); - invoke::store(_obj, _v); - invoke::stop(_obj); - invoke::pop(_obj, &_storage, _dev_id); - } - - _storage.write(_label); - } - } - settings::debug() = _debug; - */ - OMNITRACE_VERBOSE_F(1, "rocprofiler is shutdown\n"); } @@ -336,8 +187,8 @@ rocprofiler::protect_flush_activity() []() { ++rocprofiler_activity_count(); }); } } // namespace component -} // namespace tim +} // namespace omnitrace -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(rocprofiler, false, void) -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(rocprofiler_data, true, - tim::component::rocprofiler_value) +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(rocprofiler, false, void) 
+OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(rocprofiler_data, true, + tim::component::rocprofiler_value) diff --git a/source/lib/omnitrace/library/components/rocprofiler.hpp b/source/lib/omnitrace/library/components/rocprofiler.hpp index 8d3e6041d..a46a46241 100644 --- a/source/lib/omnitrace/library/components/rocprofiler.hpp +++ b/source/lib/omnitrace/library/components/rocprofiler.hpp @@ -47,19 +47,7 @@ #include #include -#if !defined(OMNITRACE_MAX_COUNTERS) -# define OMNITRACE_MAX_COUNTERS 25 -#endif - -#if !defined(OMNITRACE_ROCM_LOOK_AHEAD) -# define OMNITRACE_ROCM_LOOK_AHEAD 128 -#endif - -#if !defined(OMNITRACE_MAX_ROCM_QUEUES) -# define OMNITRACE_MAX_ROCM_QUEUES OMNITRACE_MAX_THREADS -#endif - -namespace tim +namespace omnitrace { namespace component { @@ -159,7 +147,21 @@ rocprofiler::is_setup() } #endif } // namespace component +} // namespace omnitrace + +namespace tim +{ +namespace component +{ +using ::omnitrace::component::rocm_data_tracker; +using ::omnitrace::component::rocm_feature_value; +using ::omnitrace::component::rocprofiler_data; +using ::omnitrace::component::rocprofiler_value; +} // namespace component +} // namespace tim +namespace tim +{ namespace operation { template <> @@ -214,25 +216,26 @@ struct get_storage } // namespace tim #if !defined(OMNITRACE_USE_ROCTRACER) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler_data, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler_data, false_type) #endif TIMEMORY_SET_COMPONENT_API(component::rocprofiler_data, project::timemory, category::timing, os::supports_unix) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, component::rocprofiler_data, - false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::rocprofiler_data, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_units, component::rocprofiler_data, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_timing_category, component::rocprofiler_data, + false_type) 
+OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::rocprofiler_data, + false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_units, component::rocprofiler_data, false_type) TIMEMORY_STATISTICS_TYPE(component::rocprofiler_data, component::rocprofiler_value) TIMEMORY_STATISTICS_TYPE(component::rocm_data_tracker, component::rocm_feature_value) -TIMEMORY_DEFINE_CONCRETE_TRAIT(report_units, component::rocm_data_tracker, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(report_units, component::rocm_data_tracker, false_type) #if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) # include -TIMEMORY_DECLARE_EXTERN_COMPONENT(rocprofiler, false, void) -TIMEMORY_DECLARE_EXTERN_COMPONENT(rocprofiler_data, true, double) +OMNITRACE_DECLARE_EXTERN_COMPONENT(rocprofiler, false, void) +OMNITRACE_DECLARE_EXTERN_COMPONENT(rocprofiler_data, true, double) #endif diff --git a/source/lib/omnitrace/library/components/roctracer.cpp b/source/lib/omnitrace/library/components/roctracer.cpp index 9b49cadc7..39bdfc097 100644 --- a/source/lib/omnitrace/library/components/roctracer.cpp +++ b/source/lib/omnitrace/library/components/roctracer.cpp @@ -21,6 +21,7 @@ // SOFTWARE. 
#include "library/components/roctracer.hpp" +#include "library/common.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" @@ -31,9 +32,7 @@ #include "library/sampling.hpp" #include "library/thread_data.hpp" -using namespace omnitrace; - -namespace tim +namespace omnitrace { namespace component { @@ -243,7 +242,7 @@ roctracer::protect_flush_activity() []() { ++roctracer_activity_count(); }); } } // namespace component -} // namespace tim +} // namespace omnitrace -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(roctracer, false, void) -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(roctracer_data, true, double) +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(roctracer, false, void) +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT(roctracer_data, true, double) diff --git a/source/lib/omnitrace/library/components/roctracer.hpp b/source/lib/omnitrace/library/components/roctracer.hpp index 3d50dec3d..54e06b6c5 100644 --- a/source/lib/omnitrace/library/components/roctracer.hpp +++ b/source/lib/omnitrace/library/components/roctracer.hpp @@ -22,6 +22,7 @@ #pragma once +#include "library/common.hpp" #include "library/components/fwd.hpp" #include "library/defines.hpp" @@ -35,12 +36,13 @@ #include #include -namespace tim +OMNITRACE_COMPONENT_ALIAS(roctracer_data, + ::tim::component::data_tracker) + +namespace omnitrace { namespace component { -using roctracer_data = data_tracker; - struct roctracer : base , private policy::instance_tracker @@ -87,23 +89,23 @@ roctracer::is_setup() } #endif } // namespace component -} // namespace tim +} // namespace omnitrace #if !defined(OMNITRACE_USE_ROCTRACER) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer_data, false_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer_data, false_type) #endif -TIMEMORY_SET_COMPONENT_API(component::roctracer_data, project::timemory, category::timing, - os::supports_unix) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, 
component::roctracer_data, true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type) +TIMEMORY_SET_COMPONENT_API(omnitrace::component::roctracer_data, project::timemory, + category::timing, os::supports_unix) +OMNITRACE_DEFINE_CONCRETE_TRAIT(is_timing_category, component::roctracer_data, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::roctracer_data, true_type) #if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) # include -TIMEMORY_DECLARE_EXTERN_COMPONENT(roctracer, false, void) -TIMEMORY_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double) +OMNITRACE_DECLARE_EXTERN_COMPONENT(roctracer, false, void) +OMNITRACE_DECLARE_EXTERN_COMPONENT(roctracer_data, true, double) #endif diff --git a/source/lib/omnitrace/library/components/omnitrace.hpp b/source/lib/omnitrace/library/concepts.hpp similarity index 61% rename from source/lib/omnitrace/library/components/omnitrace.hpp rename to source/lib/omnitrace/library/concepts.hpp index abc3885e2..1440ec90f 100644 --- a/source/lib/omnitrace/library/components/omnitrace.hpp +++ b/source/lib/omnitrace/library/concepts.hpp @@ -23,23 +23,46 @@ #pragma once #include "library/defines.hpp" -#include "library/timemory.hpp" + +#include + +#include +#include namespace omnitrace { -namespace component +namespace concepts = ::tim::concepts; // NOLINT + +template +struct thread_deleter; + +// unique ptr type for omnitrace +template +using unique_ptr_t = std::unique_ptr>; +} // namespace omnitrace + +namespace tim { -// timemory component which calls omnitrace functions -// (used in gotcha wrappers) -struct omnitrace : comp::base +namespace concepts { - static std::string label() { return "omnitrace"; } - void start(); - void stop(); - void set_prefix(const char*); - -private: - const char* m_prefix = nullptr; -}; -} // namespace component -} // namespace omnitrace +template +struct is_unique_pointer : 
std::false_type +{}; + +template +struct is_unique_pointer<::omnitrace::unique_ptr_t> : std::true_type +{}; + +template +struct is_unique_pointer> : std::true_type +{}; + +template +struct is_optional : std::false_type +{}; + +template +struct is_optional> : std::true_type +{}; +} // namespace concepts +} // namespace tim diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 54a42a6c5..70eed1856 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -27,7 +27,6 @@ #include "library/mproc.hpp" #include "library/perfetto.hpp" #include "library/runtime.hpp" -#include "timemory/log/logger.hpp" #include #include @@ -35,6 +34,8 @@ #include #include #include +#include +#include #include #include #include @@ -221,6 +222,9 @@ configure_settings(bool _init) "for continuous integration)", false, "debugging", "advanced"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_COLORIZED_LOG", "Enable colorized logging", + true, "debugging", "advanced"); + OMNITRACE_CONFIG_EXT_SETTING(int, "OMNITRACE_DL_VERBOSE", "Verbosity within the omnitrace-dl library", 0, "debugging", "libomnitrace-dl", "advanced"); @@ -473,6 +477,13 @@ configure_settings(bool _init) "data", "advanced") ->set_choices(get_available_perfetto_categories>()); + OMNITRACE_CONFIG_SETTING( + uint64_t, "OMNITRACE_THREAD_POOL_SIZE", + "Max number of threads for processing background tasks", + std::max(std::min(4, std::thread::hardware_concurrency() / 2), + 1), + "parallelism", "advanced"); + OMNITRACE_CONFIG_EXT_SETTING(int64_t, "OMNITRACE_CRITICAL_TRACE_COUNT", "Number of critical trace to export (0 == all)", int64_t{ 0 }, "data", "critical_trace", @@ -483,12 +494,6 @@ configure_settings(bool _init) "memory before submitting to shared buffer", uint64_t{ 2000 }, "data", "critical_trace", "advanced"); - OMNITRACE_CONFIG_SETTING( - uint64_t, "OMNITRACE_CRITICAL_TRACE_NUM_THREADS", - "Number of threads to use when generating the critical 
trace", - std::min(8, std::thread::hardware_concurrency()), "parallelism", - "critical_trace", "advanced"); - OMNITRACE_CONFIG_EXT_SETTING( int64_t, "OMNITRACE_CRITICAL_TRACE_PER_ROW", "How many critical traces per row in perfetto (0 == all in one row)", @@ -689,6 +694,9 @@ configure_settings(bool _init) settings::suppress_config() = true; + if(!get_env("OMNITRACE_COLORIZED_LOG", _config->get("OMNITRACE_COLORIZED_LOG"))) + tim::log::colorized() = false; + if(_init) { using argparser_t = tim::argparse::argument_parser; @@ -759,6 +767,7 @@ configure_mode_settings() set_default_setting_value("OMNITRACE_USE_CODE_COVERAGE", true); _set("OMNITRACE_USE_PERFETTO", false); _set("OMNITRACE_USE_TIMEMORY", false); + //_set("OMNITRACE_USE_CAUSAL", false); _set("OMNITRACE_USE_ROCM_SMI", false); _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); @@ -815,6 +824,7 @@ configure_mode_settings() { _set("OMNITRACE_USE_PERFETTO", false); _set("OMNITRACE_USE_TIMEMORY", false); + //_set("OMNITRACE_USE_CAUSAL", false); _set("OMNITRACE_USE_ROCM_SMI", false); _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); @@ -1141,7 +1151,6 @@ print_settings( _spacer << "#" << std::setw(tot_width + _spacer_extra) << "" << "#"; _os << _spacer.str() << "\n"; - // _os << "# api::omnitrace settings:" << std::setw(tot_width - 8) << "#" << "\n"; for(const auto& itr : _data) { _os << ((_md) ? 
"| " : "# "); @@ -1613,10 +1622,9 @@ get_critical_trace_update_freq() } uint64_t -get_critical_trace_num_threads() +get_thread_pool_size() { - static uint64_t _v = - get_config()->get("OMNITRACE_CRITICAL_TRACE_NUM_THREADS"); + static uint64_t _v = get_config()->get("OMNITRACE_THREAD_POOL_SIZE"); return _v; } diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index a4c7a04f0..e13fdc91a 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -22,7 +22,7 @@ #pragma once -#include "library/api.hpp" +#include "api.hpp" #include "library/common.hpp" #include "library/defines.hpp" #include "library/state.hpp" @@ -262,7 +262,7 @@ uint64_t get_critical_trace_update_freq(); uint64_t -get_critical_trace_num_threads(); +get_thread_pool_size(); std::string get_trace_hsa_api_types(); diff --git a/source/lib/omnitrace/library/coverage.cpp b/source/lib/omnitrace/library/coverage.cpp index 8ce53bba8..c8dd9e998 100644 --- a/source/lib/omnitrace/library/coverage.cpp +++ b/source/lib/omnitrace/library/coverage.cpp @@ -21,7 +21,7 @@ // SOFTWARE. 
#include "library/coverage.hpp" -#include "library/api.hpp" +#include "api.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/impl/coverage.hpp" diff --git a/source/lib/omnitrace/library/cpu_freq.cpp b/source/lib/omnitrace/library/cpu_freq.cpp index 319ece31c..1f26a37ef 100644 --- a/source/lib/omnitrace/library/cpu_freq.cpp +++ b/source/lib/omnitrace/library/cpu_freq.cpp @@ -22,12 +22,14 @@ #include "library/cpu_freq.hpp" #include "library/common.hpp" +#include "library/components/cpu_freq.hpp" #include "library/components/fwd.hpp" -#include "library/components/pthread_create_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" #include "library/perfetto.hpp" +#include "library/thread_data.hpp" +#include "library/thread_info.hpp" #include "library/timemory.hpp" #include @@ -44,39 +46,21 @@ #include #include -namespace cpuinfo = tim::procfs::cpuinfo; - namespace omnitrace { namespace cpu_freq { +using namespace ::tim::cpu_freq; +using cpu_freq_component = component::cpu_freq; + template using type_list = tim::type_list; namespace { -struct cpu_freq // cpu frequency -{}; -struct cpu_page // amount of memory allocated in pages -{}; -struct cpu_virt // virtual memory usage -{}; -struct cpu_peak // memory high-water mark -{}; -struct cpu_context_switch -{}; -struct cpu_page_fault -{}; -struct cpu_user_mode_time // cpu time spent in userspace -{}; -struct cpu_kernel_mode_time // cpu time spent in kernelspace -{}; using cpu_data_tuple_t = std::tuple>; - -std::set enabled_cpu_freqs = {}; -std::deque cpu_data = {}; -int64_t ncpu = threading::affinity::hw_concurrency(); + int64_t, int64_t, cpu_freq_component>; +std::deque cpu_data = {}; template void init_perfetto_counter_tracks(type_list) @@ -87,18 +71,6 @@ void init_perfetto_counter_tracks(type_list) } // namespace cpu_freq } // namespace omnitrace -TIMEMORY_DEFINE_NAME_TRAIT("cpu_freq", omnitrace::cpu_freq::cpu_freq); 
-TIMEMORY_DEFINE_NAME_TRAIT("process_page_fault", omnitrace::cpu_freq::cpu_page); -TIMEMORY_DEFINE_NAME_TRAIT("process_virtual_memory", omnitrace::cpu_freq::cpu_virt); -TIMEMORY_DEFINE_NAME_TRAIT("process_memory_hwm", omnitrace::cpu_freq::cpu_peak); -TIMEMORY_DEFINE_NAME_TRAIT("process_context_switch", - omnitrace::cpu_freq::cpu_context_switch); -TIMEMORY_DEFINE_NAME_TRAIT("process_page_fault", omnitrace::cpu_freq::cpu_page_fault); -TIMEMORY_DEFINE_NAME_TRAIT("process_user_cpu_time", - omnitrace::cpu_freq::cpu_user_mode_time); -TIMEMORY_DEFINE_NAME_TRAIT("process_kernel_cpu_time", - omnitrace::cpu_freq::cpu_kernel_mode_time); - namespace omnitrace { namespace cpu_freq @@ -107,109 +79,24 @@ void setup() { init_perfetto_counter_tracks( - type_list{}); } void config() { - auto _ncpu = cpuinfo::freq::size(); - auto _enabled_freqs = std::set{}; - - auto _enabled_val = get_sampling_cpus(); - for(auto& itr : _enabled_val) - itr = tolower(itr); - if(_enabled_val == "off") - _enabled_val = "none"; - else if(_enabled_val == "on") - _enabled_val = "all"; - if(_enabled_val != "none" && _enabled_val != "all") - { - auto _enabled = tim::delimit(_enabled_val, ",; \t"); - if(_enabled.empty()) - { - for(size_t i = 0; i < _ncpu; ++i) - _enabled_freqs.emplace(i); - } - for(auto&& _v : _enabled) - { - if(_v.find_first_not_of("0123456789-") != std::string::npos) - { - OMNITRACE_VERBOSE_F( - 0, - "Invalid CPU specification. Only numerical values (e.g., 0) or " - "ranges (e.g., 0-7) are permitted. Ignoring %s...", - _v.c_str()); - continue; - } - if(_v.find('-') != std::string::npos) - { - auto _vv = tim::delimit(_v, "-"); - OMNITRACE_CONDITIONAL_THROW( - _vv.size() != 2, - "Invalid CPU range specification: %s. Required format N-M, e.g. 
0-4", - _v.c_str()); - for(size_t i = std::stoull(_vv.at(0)); i <= std::stoull(_vv.at(1)); ++i) - _enabled_freqs.emplace(i); - } - else - { - _enabled_freqs.emplace(std::stoull(_v)); - } - } - } - else if(_enabled_val == "all") - { - for(size_t i = 0; i < _ncpu; ++i) - _enabled_freqs.emplace(i); - } - else if(_enabled_val == "none") - { - _enabled_freqs.clear(); - } - - for(auto itr : _enabled_freqs) - { - if(itr < cpuinfo::freq::size()) - _enabled_freqs.emplace(itr); - else - { - OMNITRACE_VERBOSE( - 0, "[cpu_freq::config] Warning! Removing invalid cpu %zu...\n", itr); - } - } - - if(!cpuinfo::freq{}) - { - OMNITRACE_VERBOSE(0, "[cpu_freq::config] Warning! CPU frequencies are disabled " - ":: unable to open /proc/cpuinfo"); - _enabled_freqs.clear(); - } - - OMNITRACE_CI_FAIL(!cpuinfo::freq{}, "[cpu_freq::config] CPU frequencies are disabled " - ":: unable to open /proc/cpuinfo"); - - enabled_cpu_freqs = _enabled_freqs; + cpu_freq_component::configure(); } void sample() { - std::vector _freqs{}; - if(!enabled_cpu_freqs.empty()) - { - _freqs.reserve(enabled_cpu_freqs.size()); - auto&& _freq = cpuinfo::freq{}; - for(const auto& itr : enabled_cpu_freqs) - { - _freqs.emplace_back(_freq(itr)); - } - } - auto _ts = tim::get_clock_real_now(); - tim::rusage_cache _rcache{ RUSAGE_SELF }; + auto _rcache = tim::rusage_cache{ RUSAGE_SELF }; + auto _freqs = cpu_freq_component{}.sample(); + // user and kernel mode times are in microseconds cpu_data.emplace_back( _ts, tim::get_page_rss(), tim::get_virt_mem(), _rcache.get_peak_rss(), @@ -276,7 +163,12 @@ post_process() OMNITRACE_PRINT("Post-processing %zu cpu frequency and memory usage entries...\n", cpu_data.size()); auto _process_frequencies = [](size_t _idx, size_t _offset) { - using freq_track = perfetto_counter_track; + using freq_track = perfetto_counter_track; + + const auto& _thread_info = thread_info::get(0, LookupTID); + OMNITRACE_CI_THROW(!_thread_info, "Missing thread info for thread 0"); + if(!_thread_info) return; + 
if(!freq_track::exists(_idx)) { auto addendum = [&](const char* _v) { @@ -289,12 +181,12 @@ post_process() { uint64_t _ts = std::get<0>(itr); double _freq = std::get<8>(itr).at(_offset); - if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; - write_perfetto_counter_track(index{ _idx }, _ts, _freq); + if(!_thread_info->is_valid_time(_ts)) continue; + write_perfetto_counter_track(index{ _idx }, _ts, _freq); } - auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second; - write_perfetto_counter_track(index{ _idx }, _end_ts, 0); + auto _end_ts = _thread_info->get_stop(); + write_perfetto_counter_track(index{ _idx }, _end_ts, 0); }; auto _process_cpu_rusage = []() { @@ -305,10 +197,14 @@ post_process() "Page Faults", "User Time", "Kernel Time" }, { "MB", "MB", "MB", "", "", "sec", "sec" }); + const auto& _thread_info = thread_info::get(0, LookupTID); + OMNITRACE_CI_THROW(!_thread_info, "Missing thread info for thread 0"); + if(!_thread_info) return; + for(auto& itr : cpu_data) { uint64_t _ts = std::get<0>(itr); - if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + if(!_thread_info->is_valid_time(_ts)) continue; double _page = std::get<1>(itr); double _virt = std::get<2>(itr); @@ -326,7 +222,7 @@ post_process() write_perfetto_counter_track(_ts, _kern / units::sec); } - auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second; + auto _end_ts = _thread_info->get_stop(); write_perfetto_counter_track(_end_ts, 0.0); write_perfetto_counter_track(_end_ts, 0.0); write_perfetto_counter_track(_end_ts, 0.0); @@ -337,13 +233,14 @@ post_process() }; _process_cpu_rusage(); + + auto& enabled_cpu_freqs = cpu_freq_component::get_enabled_cpus(); for(auto itr = enabled_cpu_freqs.begin(); itr != enabled_cpu_freqs.end(); ++itr) { auto _idx = *itr; auto _offset = std::distance(enabled_cpu_freqs.begin(), itr); _process_frequencies(_idx, _offset); } - enabled_cpu_freqs.clear(); } } // namespace cpu_freq diff --git 
a/source/lib/omnitrace/library/critical_trace.cpp b/source/lib/omnitrace/library/critical_trace.cpp index 74eb46e73..dd9c15e15 100644 --- a/source/lib/omnitrace/library/critical_trace.cpp +++ b/source/lib/omnitrace/library/critical_trace.cpp @@ -547,7 +547,6 @@ void add_hash_id(const hash_ids& _labels) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; tasking::critical_trace::get_task_group().exec([_labels]() { static std::mutex _mtx{}; @@ -578,7 +577,6 @@ update(int64_t _tid) { if(!get_use_critical_trace() && !get_use_rocm_smi()) return; OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; call_chain _data{}; std::swap(_data, *critical_trace::get(_tid)); @@ -590,7 +588,6 @@ compute(int64_t _tid) { update(_tid); OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; tasking::critical_trace::get_task_group().exec(compute_critical_trace); } @@ -808,13 +805,13 @@ compute_critical_trace() using perfstats_t = tim::lightweight_tuple; - perfstats_t _ct_perf{ JOIN("", "[", __FUNCTION__, "]") }; + perfstats_t _ct_perf{}; _ct_perf.start(); try { - OMNITRACE_CT_DEBUG("[%s] initial call chain: %zu entries\n", __FUNCTION__, - complete_call_chain.size()); + OMNITRACE_VERBOSE_F(1, "[%s] initial call chain: %zu entries\n", __FUNCTION__, + complete_call_chain.size()); perfstats_t _perf{ get_perf_name(__FUNCTION__) }; _perf.start(); @@ -822,7 +819,7 @@ compute_critical_trace() std::sort(complete_call_chain.begin(), complete_call_chain.end()); _perf.stop().rekey("Sorting critical trace"); - OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str()); + OMNITRACE_VERBOSE_F(1, "%s\n", JOIN("", _perf).c_str()); 
_perf.reset().start(); save_call_chain_json( @@ -830,20 +827,16 @@ compute_critical_trace() complete_call_chain, true, __FUNCTION__); _perf.stop().rekey("Save call-chain"); - OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str()); + OMNITRACE_VERBOSE_F(1, "%s\n", JOIN("", _perf).c_str()); } catch(std::exception& e) { - OMNITRACE_PRINT("Thread exited '%s' with exception: %s\n", __FUNCTION__, - e.what()); + OMNITRACE_PRINT_F("Thread exited '%s' with exception: %s\n", __FUNCTION__, + e.what()); TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(true, 32); } - _ct_perf.stop(); - auto _ct_msg = JOIN("", _ct_perf); - auto _ct_pos = _ct_msg.find(">>> "); - if(_ct_pos != std::string::npos) _ct_msg = _ct_msg.substr(_ct_pos + 5); - OMNITRACE_PRINT("%s\n", _ct_msg.c_str()); + OMNITRACE_PRINT_F("%s\n", _ct_perf.stop().as_string().c_str()); } } // namespace @@ -869,8 +862,7 @@ get_entries(int64_t _ts, const std::function& _eval) *_targ = _v; }; OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; - size_t _n = 0; + size_t _n = 0; std::vector> _v{}; if(!tasking::critical_trace::get_task_group().pool()) return _v; tasking::critical_trace::get_task_group().exec(_func, &_v, &_n); diff --git a/source/lib/omnitrace/library/critical_trace.hpp b/source/lib/omnitrace/library/critical_trace.hpp index 9dfa32964..f482720e2 100644 --- a/source/lib/omnitrace/library/critical_trace.hpp +++ b/source/lib/omnitrace/library/critical_trace.hpp @@ -22,16 +22,23 @@ #pragma once +#include "library/common.hpp" #include "library/config.hpp" #include "library/defines.hpp" +#include "library/runtime.hpp" #include "library/thread_data.hpp" +#include +#include #include +#include #include #include +#include #include #include +#include #include #include #include @@ -284,4 +291,78 @@ struct id {}; } // namespace critical_trace + +template +inline void +add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, + size_t _parent_cid, int64_t 
_ts_beg, int64_t _ts_val, int32_t _devid, + uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0) +{ + // clang-format off + // these are used to create unique type mutexes + struct critical_insert {}; + struct cpu_cid_stack {}; + // clang-format on + + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + + static constexpr auto num_mutexes = max_supported_threads; + static auto _update_freq = critical_trace::get_update_frequency(); + static auto _pid = process::get_id(); + auto _self_tid = threading::get_id(); + + if constexpr(PhaseID != critical_trace::Phase::NONE) + { + auto& _self_mtx = + type_mutex(_self_tid); + + auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; + + // unique lock per thread + if(!_self_lk.owns_lock()) _self_lk.lock(); + + auto& _critical_trace = critical_trace::get(_self_tid); + _critical_trace->emplace_back(critical_trace::entry{ + DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid, + _parent_cid, _ts_beg, _ts_val, _queue, _hash }); + } + + if constexpr(UpdateStack) + { + auto& _self_mtx = get_cpu_cid_stack_lock(_self_tid); + auto& _targ_mtx = get_cpu_cid_stack_lock(_targ_tid); + + auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; + auto_lock_t _targ_lk{ _targ_mtx, std::defer_lock }; + + // unique lock per thread + auto _lock = [&_self_lk, &_targ_lk, _self_tid, _targ_tid]() { + if(!_self_lk.owns_lock() && _self_tid != _targ_tid) _self_lk.lock(); + if(!_targ_lk.owns_lock()) _targ_lk.lock(); + }; + + if constexpr(PhaseID == critical_trace::Phase::NONE) + { + _lock(); + get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); + } + else if constexpr(PhaseID == critical_trace::Phase::BEGIN) + { + _lock(); + get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); + } + else if constexpr(PhaseID == critical_trace::Phase::END) + { + _lock(); + get_cpu_cid_stack(_targ_tid)->pop_back(); + if(_gpu_cid == 0 && _cpu_cid % _update_freq == (_update_freq - 1)) + critical_trace::update(_targ_tid); + } + 
tim::consume_parameters(_lock); + } + + tim::consume_parameters(_pid, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, + _ts_val, _devid, _queue, _hash, _depth, _prio, num_mutexes); +} } // namespace omnitrace diff --git a/source/lib/omnitrace/library/defines.hpp.in b/source/lib/omnitrace/library/defines.hpp.in index b0a8ae5f3..1d938e3fc 100644 --- a/source/lib/omnitrace/library/defines.hpp.in +++ b/source/lib/omnitrace/library/defines.hpp.in @@ -25,14 +25,12 @@ #include "common/defines.h" #define TIMEMORY_USER_COMPONENT_ENUM \ - OMNITRACE_COMPONENT_idx, OMNITRACE_USER_REGION_idx, OMNITRACE_ROCTRACER_idx, \ - OMNITRACE_ROCPROFILER_idx, OMNITRACE_SAMPLING_WALL_CLOCK_idx, \ - OMNITRACE_SAMPLING_CPU_CLOCK_idx, OMNITRACE_SAMPLING_PERCENT_idx, \ - OMNITRACE_SAMPLING_GPU_POWER_idx, OMNITRACE_SAMPLING_GPU_TEMP_idx, \ - OMNITRACE_SAMPLING_GPU_BUSY_idx, OMNITRACE_SAMPLING_GPU_MEMORY_USAGE_idx, + OMNITRACE_ROCTRACER_idx, OMNITRACE_ROCPROFILER_idx, \ + OMNITRACE_SAMPLING_WALL_CLOCK_idx, OMNITRACE_SAMPLING_CPU_CLOCK_idx, \ + OMNITRACE_SAMPLING_PERCENT_idx, OMNITRACE_SAMPLING_GPU_POWER_idx, \ + OMNITRACE_SAMPLING_GPU_TEMP_idx, OMNITRACE_SAMPLING_GPU_BUSY_idx, \ + OMNITRACE_SAMPLING_GPU_MEMORY_USAGE_idx, -#define OMNITRACE_COMPONENT OMNITRACE_COMPONENT_idx -#define OMNITRACE_USER_REGION OMNITRACE_USER_REGION_idx #define OMNITRACE_ROCTRACER OMNITRACE_ROCTRACER_idx #define OMNITRACE_ROCPROFILER OMNITRACE_ROCPROFILER_idx #define OMNITRACE_SAMPLING_WALL_CLOCK OMNITRACE_SAMPLING_WALL_CLOCK_idx diff --git a/source/lib/omnitrace/library/dynamic_library.cpp b/source/lib/omnitrace/library/dynamic_library.cpp index bd568e4af..ce84c3558 100644 --- a/source/lib/omnitrace/library/dynamic_library.cpp +++ b/source/lib/omnitrace/library/dynamic_library.cpp @@ -21,7 +21,6 @@ // SOFTWARE. 
#include "library/dynamic_library.hpp" -#include "common/defines.h" #include "library/common.hpp" #include "library/debug.hpp" #include "library/defines.hpp" diff --git a/source/lib/omnitrace/library/gpu.cpp b/source/lib/omnitrace/library/gpu.cpp index a778f67e6..96de0ca4f 100644 --- a/source/lib/omnitrace/library/gpu.cpp +++ b/source/lib/omnitrace/library/gpu.cpp @@ -23,7 +23,7 @@ #include "library/gpu.hpp" #if defined(OMNITRACE_USE_ROCM_SMI) && OMNITRACE_USE_ROCM_SMI > 0 -# include "library/components/rocm_smi.hpp" +# include "library/rocm_smi.hpp" #elif !defined(OMNITRACE_USE_ROCM_SMI) # define OMNITRACE_USE_ROCM_SMI 0 #endif diff --git a/source/lib/omnitrace/library/kokkosp.cpp b/source/lib/omnitrace/library/kokkosp.cpp index 046d53b32..fd389f93d 100644 --- a/source/lib/omnitrace/library/kokkosp.cpp +++ b/source/lib/omnitrace/library/kokkosp.cpp @@ -22,8 +22,9 @@ #define TIMEMORY_KOKKOSP_POSTFIX OMNITRACE_PUBLIC_API -#include "library/api.hpp" -#include "library/components/user_region.hpp" +#include "api.hpp" +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" @@ -39,7 +40,11 @@ #include #include -namespace kokkosp = tim::kokkosp; +namespace kokkosp = ::tim::kokkosp; +namespace category = ::tim::category; +namespace comp = ::omnitrace::component; + +using kokkosp_region = comp::local_category_region; //--------------------------------------------------------------------------------------// @@ -140,6 +145,10 @@ extern "C" OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); tim::consume_parameters(devInfoCount, deviceInfo); + OMNITRACE_BASIC_VERBOSE_F( + 0, "Initializing omnitrace kokkos connector (sequence %d, version: %llu)... 
", + loadSeq, (unsigned long long) interfaceVer); + if(_standalone_initialized || (!omnitrace::config::settings_are_configured() && omnitrace::get_state() < omnitrace::State::Active)) { @@ -173,10 +182,6 @@ extern "C" } } - OMNITRACE_BASIC_VERBOSE_F(0, - "Initializing kokkos omnitrace connector " - "(standalone, sequence %d, version: %llu)...\n", - loadSeq, (unsigned long long) interfaceVer); OMNITRACE_BASIC_VERBOSE_F(0, "Initializing omnitrace (standalone)... "); auto _mode = tim::get_env("OMNITRACE_MODE", "trace"); auto _arg0 = (_initialize_arguments.empty()) ? std::string{ "unknown" } @@ -187,21 +192,17 @@ extern "C" omnitrace_init_hidden(_mode.c_str(), false, _arg0.c_str()); omnitrace_push_trace_hidden("kokkos_main"); } - else - { - OMNITRACE_VERBOSE_F(0, - "Initializing kokkos omnitrace connector " - "(sequence %d, version: %llu)... ", - loadSeq, (unsigned long long) interfaceVer); - } setup_kernel_logger(); tim::trait::runtime_enabled::set( omnitrace::config::get_use_timemory()); - if(_standalone_initialized && omnitrace::get_verbose() >= 0) - fprintf(stderr, "Done\n"); + if(omnitrace::get_verbose() >= 0) + { + fprintf(stderr, "%sDone\n%s", tim::log::color::info(), + tim::log::color::end()); + } } void kokkosp_finalize_library() @@ -233,16 +234,16 @@ extern "C" : TIMEMORY_JOIN(" ", TIMEMORY_JOIN("", "[kokkos][dev", devid, ']'), name); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); - kokkosp::create_profiler(pname, *kernid); - kokkosp::start_profiler(*kernid); + kokkosp::create_profiler(pname, *kernid); + kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_for(uint64_t kernid) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); - kokkosp::stop_profiler(kernid); - kokkosp::destroy_profiler(kernid); + kokkosp::stop_profiler(kernid); + kokkosp::destroy_profiler(kernid); } 
//----------------------------------------------------------------------------------// @@ -256,16 +257,16 @@ extern "C" : TIMEMORY_JOIN(" ", TIMEMORY_JOIN("", "[kokkos][dev", devid, ']'), name); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); - kokkosp::create_profiler(pname, *kernid); - kokkosp::start_profiler(*kernid); + kokkosp::create_profiler(pname, *kernid); + kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_reduce(uint64_t kernid) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); - kokkosp::stop_profiler(kernid); - kokkosp::destroy_profiler(kernid); + kokkosp::stop_profiler(kernid); + kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// @@ -279,16 +280,16 @@ extern "C" : TIMEMORY_JOIN(" ", TIMEMORY_JOIN("", "[kokkos][dev", devid, ']'), name); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); - kokkosp::create_profiler(pname, *kernid); - kokkosp::start_profiler(*kernid); + kokkosp::create_profiler(pname, *kernid); + kokkosp::start_profiler(*kernid); } void kokkosp_end_parallel_scan(uint64_t kernid) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); - kokkosp::stop_profiler(kernid); - kokkosp::destroy_profiler(kernid); + kokkosp::stop_profiler(kernid); + kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// @@ -302,16 +303,16 @@ extern "C" : TIMEMORY_JOIN(" ", TIMEMORY_JOIN("", "[kokkos][dev", devid, ']'), name); *kernid = kokkosp::get_unique_id(); kokkosp::logger_t{}.mark(1, __FUNCTION__, name, *kernid); - kokkosp::create_profiler(pname, *kernid); - kokkosp::start_profiler(*kernid); + kokkosp::create_profiler(pname, *kernid); + kokkosp::start_profiler(*kernid); } void kokkosp_end_fence(uint64_t kernid) { 
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); - kokkosp::stop_profiler(kernid); - kokkosp::destroy_profiler(kernid); + kokkosp::stop_profiler(kernid); + kokkosp::destroy_profiler(kernid); } //----------------------------------------------------------------------------------// @@ -321,9 +322,9 @@ extern "C" if(omnitrace::get_use_perfetto()) return; // perfetto doesn't support regions OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, name); - kokkosp::get_profiler_stack().push_back( - kokkosp::profiler_t(name)); - kokkosp::get_profiler_stack().back().start(); + kokkosp::get_profiler_stack().push_back( + kokkosp::profiler_t(name)); + kokkosp::get_profiler_stack().back().start(); } void kokkosp_pop_profile_region() @@ -331,10 +332,9 @@ extern "C" if(omnitrace::get_use_perfetto()) return; // perfetto doesn't support regions OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); - if(kokkosp::get_profiler_stack().empty()) - return; - kokkosp::get_profiler_stack().back().stop(); - kokkosp::get_profiler_stack().pop_back(); + if(kokkosp::get_profiler_stack().empty()) return; + kokkosp::get_profiler_stack().back().stop(); + kokkosp::get_profiler_stack().pop_back(); } //----------------------------------------------------------------------------------// @@ -345,14 +345,14 @@ extern "C" OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); *secid = kokkosp::get_unique_id(); auto pname = TIMEMORY_JOIN(" ", "[kokkos]", name); - kokkosp::create_profiler(pname, *secid); + kokkosp::create_profiler(pname, *secid); } void kokkosp_destroy_profile_section(uint32_t secid) { if(omnitrace::get_use_perfetto()) return; // perfetto doesn't support regions OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - kokkosp::destroy_profiler(secid); + kokkosp::destroy_profiler(secid); } 
//----------------------------------------------------------------------------------// @@ -362,7 +362,7 @@ extern "C" if(omnitrace::get_use_perfetto()) return; // perfetto doesn't support regions OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, secid); - kokkosp::start_profiler(secid); + kokkosp::start_profiler(secid); } void kokkosp_stop_profile_section(uint32_t secid) @@ -370,7 +370,7 @@ extern "C" if(omnitrace::get_use_perfetto()) return; // perfetto doesn't support regions OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, secid); - kokkosp::start_profiler(secid); + kokkosp::start_profiler(secid); } //----------------------------------------------------------------------------------// @@ -412,7 +412,7 @@ extern "C" TIMEMORY_JOIN('=', dst_handle.name, dst_name), TIMEMORY_JOIN('=', src_handle.name, src_name)); - auto& _data = kokkosp::get_profiler_stack(); + auto& _data = kokkosp::get_profiler_stack(); _data.emplace_back(name); _data.back().audit(dst_handle, dst_name, dst_ptr, src_handle, src_name, src_ptr, size); @@ -424,7 +424,7 @@ extern "C" { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); - auto& _data = kokkosp::get_profiler_stack(); + auto& _data = kokkosp::get_profiler_stack(); if(_data.empty()) return; _data.back().store(std::minus{}, 0); _data.back().stop(); @@ -436,7 +436,7 @@ extern "C" void kokkosp_profile_event(const char* name) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - kokkosp::profiler_t{}.mark(name); + kokkosp::profiler_t{}.mark(name); } //----------------------------------------------------------------------------------// diff --git a/source/lib/omnitrace/library/ompt.cpp b/source/lib/omnitrace/library/ompt.cpp index f41952c33..c2f9acf9f 100644 --- a/source/lib/omnitrace/library/ompt.cpp +++ b/source/lib/omnitrace/library/ompt.cpp @@ -26,8 +26,8 @@ #if defined(OMNITRACE_USE_OMPT) && 
OMNITRACE_USE_OMPT > 0 +# include "library/components/category_region.hpp" # include "library/components/fwd.hpp" -# include "library/components/user_region.hpp" # include # include @@ -67,7 +67,7 @@ setup() comp::user_ompt_bundle::global_init(); comp::user_ompt_bundle::reset(); tim::auto_lock_t lk{ tim::type_mutex() }; - comp::user_ompt_bundle::configure(); + comp::user_ompt_bundle::configure>(); f_bundle = std::make_unique("omnitrace/ompt", quirk::config{}); } diff --git a/source/lib/omnitrace/library/perfetto.hpp b/source/lib/omnitrace/library/perfetto.hpp index c8823d542..a0ce84d65 100644 --- a/source/lib/omnitrace/library/perfetto.hpp +++ b/source/lib/omnitrace/library/perfetto.hpp @@ -22,81 +22,10 @@ #pragma once -#include "library/defines.hpp" - -#if defined(OMNITRACE_PERFETTO_CATEGORIES) -# error "OMNITRACE_PERFETTO_CATEGORIES is already defined. Please include \"" __FILE__ "\" before including any timemory files" -#endif - -#define OMNITRACE_PERFETTO_CATEGORIES \ - perfetto::Category("host").SetDescription("Host-side function tracing"), \ - perfetto::Category("user").SetDescription("User-defined regions"), \ - perfetto::Category("sampling").SetDescription("Host-side function sampling"), \ - perfetto::Category("device_hip") \ - .SetDescription("Device-side functions submitted via HSA API"), \ - perfetto::Category("device_hsa") \ - .SetDescription("Device-side functions submitted via HIP API"), \ - perfetto::Category("rocm_hip").SetDescription("Host-side HIP functions"), \ - perfetto::Category("rocm_hsa").SetDescription("Host-side HSA functions"), \ - perfetto::Category("rocm_roctx").SetDescription("Host-side ROCTX labels"), \ - perfetto::Category("device_busy") \ - .SetDescription("Busy percentage of a GPU device"), \ - perfetto::Category("device_temp") \ - .SetDescription("Temperature of GPU device in degC"), \ - perfetto::Category("device_power") \ - .SetDescription("Power consumption of GPU device in watts"), \ - perfetto::Category("device_memory_usage") 
\ - .SetDescription("Memory usage of GPU device in MB"), \ - perfetto::Category("thread_peak_memory") \ - .SetDescription( \ - "Peak memory usage on thread in MB (derived from sampling)"), \ - perfetto::Category("thread_context_switch") \ - .SetDescription("Context switches on thread (derived from sampling)"), \ - perfetto::Category("thread_page_fault") \ - .SetDescription("Memory page faults on thread (derived from sampling)"), \ - perfetto::Category("hardware_counter") \ - .SetDescription("Hardware counter value on thread (derived from sampling)"), \ - perfetto::Category("cpu_freq") \ - .SetDescription("CPU frequency in MHz (collected in background thread)"), \ - perfetto::Category("process_page_fault") \ - .SetDescription( \ - "Memory page faults in process (collected in background thread)"), \ - perfetto::Category("process_memory_hwm") \ - .SetDescription("Memory High-Water Mark i.e. peak memory usage (collected " \ - "in background thread)"), \ - perfetto::Category("process_virtual_memory") \ - .SetDescription("Virtual memory usage in process in MB (collected in " \ - "background thread)"), \ - perfetto::Category("process_context_switch") \ - .SetDescription( \ - "Context switches in process (collected in background thread)"), \ - perfetto::Category("process_page_fault") \ - .SetDescription( \ - "Memory page faults in process (collected in background thread)"), \ - perfetto::Category("process_user_cpu_time") \ - .SetDescription("CPU time of functions executing in user-space in process " \ - "in seconds (collected in background thread)"), \ - perfetto::Category("process_kernel_cpu_time") \ - .SetDescription("CPU time of functions executing in kernel-space in " \ - "process in seconds (collected in background thread)"), \ - perfetto::Category("pthread").SetDescription("Pthread functions"), \ - perfetto::Category("kokkos").SetDescription("Kokkos regions"), \ - perfetto::Category("mpi").SetDescription("MPI regions"), \ - 
perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \ - perfetto::Category("rccl").SetDescription( \ - "ROCm Communication Collectives Library (RCCL) regions"), \ - perfetto::Category("comm_data") \ - .SetDescription( \ - "MPI/RCCL counters for tracking amount of data sent or received"), \ - perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \ - perfetto::Category("host-critical-trace") \ - .SetDescription("Host-side critical traces"), \ - perfetto::Category("device-critical-trace") \ - .SetDescription("Device-side critical traces"), \ - perfetto::Category("timemory").SetDescription("Events from the timemory API") +#include "library/categories.hpp" +#include "library/common.hpp" #if defined(TIMEMORY_USE_PERFETTO) -# define TIMEMORY_PERFETTO_CATEGORIES OMNITRACE_PERFETTO_CATEGORIES # include #else # include diff --git a/source/lib/omnitrace/library/process_sampler.cpp b/source/lib/omnitrace/library/process_sampler.cpp index b042e757e..6126b6cdd 100644 --- a/source/lib/omnitrace/library/process_sampler.cpp +++ b/source/lib/omnitrace/library/process_sampler.cpp @@ -22,11 +22,12 @@ #include "library/process_sampler.hpp" #include "library/components/pthread_gotcha.hpp" -#include "library/components/rocm_smi.hpp" #include "library/config.hpp" #include "library/cpu_freq.hpp" #include "library/debug.hpp" +#include "library/rocm_smi.hpp" #include "library/runtime.hpp" +#include "library/sampling.hpp" #include #include @@ -37,7 +38,6 @@ namespace process_sampler { namespace { -using auto_lock_t = tim::auto_lock_t; using promise_t = std::promise; std::unique_ptr polling_finished = {}; std::vector> instances = {}; @@ -126,8 +126,6 @@ sampler::setup() // shutdown if already running shutdown(); - pthread_gotcha::push_enable_sampling_on_child_threads(false); - if(get_use_rocm_smi()) { auto& _rocm_smi = instances.emplace_back(std::make_unique()); @@ -158,12 +156,12 @@ sampler::setup() polling_finished = std::make_unique(); 
set_state(State::PreInit); + pthread_gotcha::push_enable_sampling_on_child_threads(false); get_thread() = std::make_unique(&poll, &get_sampler_state(), msec_t{ _msec_freq }, &_prom); - _fut.wait(); - pthread_gotcha::pop_enable_sampling_on_child_threads(); + set_state(State::Active); } diff --git a/source/lib/omnitrace/library/ptl.cpp b/source/lib/omnitrace/library/ptl.cpp index ac2e0525f..b33eb1dc3 100644 --- a/source/lib/omnitrace/library/ptl.cpp +++ b/source/lib/omnitrace/library/ptl.cpp @@ -26,6 +26,8 @@ #include "library/defines.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" +#include "library/thread_data.hpp" +#include "library/thread_info.hpp" #include @@ -39,24 +41,59 @@ namespace tasking namespace { auto _thread_pool_cfg = []() { + int64_t _nthreads = 0; + if(config::settings_are_configured()) + { + _nthreads = config::get_thread_pool_size(); + } + else + { + const int64_t _max_threads = std::thread::hardware_concurrency() / 2; + const int64_t _min_threads = 1; + + _nthreads = get_env("OMNITRACE_THREAD_POOL_SIZE", -1, false); + if(_nthreads == -1) + { + _nthreads = 4; + if(_nthreads > _max_threads) _nthreads = _max_threads; + if(_nthreads < _min_threads) _nthreads = _min_threads; + + tim::set_env("OMNITRACE_THREAD_POOL_SIZE", _nthreads, 0); + } + } + PTL::ThreadPool::Config _v{}; _v.init = true; _v.use_affinity = false; _v.use_tbb = false; _v.verbose = -1; _v.initializer = []() { - threading::offset_this_id(true); - set_thread_state(ThreadState::Internal); - sampling::block_signals(); + thread_info::init(true); threading::set_thread_name( JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str()); + sampling::block_signals(); }; _v.finalizer = []() {}; _v.priority = 5; - _v.pool_size = 1; + _v.pool_size = _nthreads; + return _v; +}; + +auto& +get_thread_pool_state() +{ + static auto _v = State::PreInit; + return _v; +} + +PTL::ThreadPool& +get_thread_pool() +{ + static auto _v = + (get_thread_pool_state() = State::Active, PTL::ThreadPool{ 
_thread_pool_cfg() }); return _v; -}(); } +} // namespace namespace roctracer { @@ -94,13 +131,15 @@ join() if(roctracer::get_thread_pool_state() == State::Active) { OMNITRACE_DEBUG_F("waiting for all roctracer tasks to complete...\n"); - tasking::roctracer::get_task_group().join(); + for(size_t i = 0; i < max_supported_threads; ++i) + roctracer::get_task_group(i).join(); } if(critical_trace::get_thread_pool_state() == State::Active) { OMNITRACE_DEBUG_F("waiting for all critical tasks to complete...\n"); - tasking::critical_trace::get_task_group().join(); + for(size_t i = 0; i < max_supported_threads; ++i) + critical_trace::get_task_group(i).join(); } } @@ -109,70 +148,62 @@ shutdown() { if(roctracer::get_thread_pool_state() == State::Active) { - OMNITRACE_DEBUG_F("Destroying the roctracer thread pool...\n"); - std::unique_lock _lk{ roctracer::get_mutex() }; - roctracer::get_task_group().join(); - roctracer::get_task_group().clear(); - roctracer::get_task_group().set_pool(nullptr); - roctracer::get_thread_pool().destroy_threadpool(); + OMNITRACE_DEBUG_F("Waiting on completion of roctracer tasks...\n"); + for(size_t i = 0; i < max_supported_threads; ++i) + { + roctracer::get_task_group(i).join(); + roctracer::get_task_group(i).clear(); + roctracer::get_task_group(i).set_pool(nullptr); + } roctracer::get_thread_pool_state() = State::Finalized; } if(critical_trace::get_thread_pool_state() == State::Active) { - OMNITRACE_DEBUG_F("Destroying the critical trace thread pool...\n"); - std::unique_lock _lk{ critical_trace::get_mutex() }; - critical_trace::get_task_group().join(); - critical_trace::get_task_group().clear(); - critical_trace::get_task_group().set_pool(nullptr); - critical_trace::get_thread_pool().destroy_threadpool(); + OMNITRACE_DEBUG_F("Waiting on completion of critical trace tasks...\n"); + for(size_t i = 0; i < max_supported_threads; ++i) + { + critical_trace::get_task_group(i).join(); + critical_trace::get_task_group(i).clear(); + 
critical_trace::get_task_group(i).set_pool(nullptr); + } critical_trace::get_thread_pool_state() = State::Finalized; } -} -std::mutex& -roctracer::get_mutex() -{ - static std::mutex _v{}; - return _v; + if(get_thread_pool_state() == State::Active) + { + OMNITRACE_DEBUG_F("Destroying the omnitrace thread pool...\n"); + get_thread_pool().destroy_threadpool(); + get_thread_pool_state() = State::Finalized; + } } -PTL::ThreadPool& -roctracer::get_thread_pool() +size_t +initialize_threadpool(size_t _v) { - static auto _v = (roctracer::get_thread_pool_state() = State::Active, - PTL::ThreadPool{ _thread_pool_cfg }); - return _v; + return get_thread_pool().initialize_threadpool(_v); } PTL::TaskGroup& -roctracer::get_task_group() -{ - static PTL::TaskGroup _v{ &roctracer::get_thread_pool() }; - return _v; -} - -std::mutex& -critical_trace::get_mutex() -{ - static std::mutex _v{}; - return _v; -} - -PTL::ThreadPool& -critical_trace::get_thread_pool() -{ - static auto _v = (critical_trace::get_thread_pool_state() = State::Active, - PTL::ThreadPool{ _thread_pool_cfg }); - return _v; +roctracer::get_task_group(int64_t _tid) +{ + struct local + {}; + using thread_data_t = thread_data, local>; + static auto& _v = + thread_data_t::instances(construct_on_init{}, &tasking::get_thread_pool()); + return *_v.at(_tid); } PTL::TaskGroup& -critical_trace::get_task_group() -{ - static PTL::TaskGroup _v{ &critical_trace::get_thread_pool() }; - return _v; +critical_trace::get_task_group(int64_t _tid) +{ + struct local + {}; + using thread_data_t = thread_data, local>; + static auto& _v = + thread_data_t::instances(construct_on_init{}, &tasking::get_thread_pool()); + return *_v.at(_tid); } - } // namespace tasking } // namespace omnitrace diff --git a/source/lib/omnitrace/library/ptl.hpp b/source/lib/omnitrace/library/ptl.hpp index 759506466..47f35b663 100644 --- a/source/lib/omnitrace/library/ptl.hpp +++ b/source/lib/omnitrace/library/ptl.hpp @@ -23,6 +23,7 @@ #pragma once #include 
"library/defines.hpp" +#include "library/utility.hpp" #include @@ -41,6 +42,8 @@ join(); void shutdown(); +size_t initialize_threadpool(size_t); + //--------------------------------------------------------------------------------------// // // roctracer @@ -49,17 +52,8 @@ shutdown(); namespace roctracer { -std::mutex& -get_mutex(); - -PTL::ThreadPool& -get_thread_pool(); - PTL::TaskGroup& -get_task_group(); - -bool -get_thread_pool_is_active(); +get_task_group(int64_t _tid = utility::get_thread_index()); } // namespace roctracer //--------------------------------------------------------------------------------------// @@ -70,17 +64,8 @@ get_thread_pool_is_active(); namespace critical_trace { -std::mutex& -get_mutex(); - -PTL::ThreadPool& -get_thread_pool(); - PTL::TaskGroup& -get_task_group(); - -bool -get_thread_pool_is_active(); +get_task_group(int64_t _tid = utility::get_thread_index()); } // namespace critical_trace } // namespace tasking } // namespace omnitrace diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index 4d92b7e8f..648b8570b 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -67,17 +67,17 @@ setup() auto _use_data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", get_use_timemory()); if(!get_use_timemory()) { - trait::runtime_enabled::set(false); - trait::runtime_enabled::set(false); + trait::runtime_enabled::set(false); + trait::runtime_enabled::set(false); } else { - trait::runtime_enabled::set(_use_data); - trait::runtime_enabled::set(_use_data); + trait::runtime_enabled::set(_use_data); + trait::runtime_enabled::set(_use_data); } - comp::configure_rcclp(); - global_id = comp::activate_rcclp(); + component::configure_rcclp(); + global_id = component::activate_rcclp(); if(librccl_handle) dlclose(librccl_handle); } @@ -85,7 +85,7 @@ void shutdown() { if(global_id < std::numeric_limits::max()) - comp::deactivate_rcclp(global_id); + 
component::deactivate_rcclp(global_id); } } // namespace rcclp } // namespace omnitrace diff --git a/source/lib/omnitrace/library/rocm.cpp b/source/lib/omnitrace/library/rocm.cpp index df3ab44d1..fd8a4131f 100644 --- a/source/lib/omnitrace/library/rocm.cpp +++ b/source/lib/omnitrace/library/rocm.cpp @@ -21,14 +21,13 @@ // SOFTWARE. #include "library/rocm.hpp" -#include "library.hpp" -#include "library/components/rocm_smi.hpp" #include "library/components/rocprofiler.hpp" #include "library/components/roctracer.hpp" #include "library/config.hpp" -#include "library/critical_trace.hpp" #include "library/debug.hpp" +#include "library/dynamic_library.hpp" #include "library/gpu.hpp" +#include "library/rocm_smi.hpp" #include "library/rocprofiler.hpp" #include "library/rocprofiler/hsa_rsrc_factory.hpp" #include "library/roctracer.hpp" diff --git a/source/lib/omnitrace/library/components/rocm_smi.cpp b/source/lib/omnitrace/library/rocm_smi.cpp similarity index 95% rename from source/lib/omnitrace/library/components/rocm_smi.cpp rename to source/lib/omnitrace/library/rocm_smi.cpp index 4b3abdc0b..b0513f57e 100644 --- a/source/lib/omnitrace/library/components/rocm_smi.cpp +++ b/source/lib/omnitrace/library/rocm_smi.cpp @@ -30,10 +30,9 @@ # undef NDEBUG #endif -#include "library/components/rocm_smi.hpp" +#include "library/rocm_smi.hpp" #include "library/common.hpp" #include "library/components/fwd.hpp" -#include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" @@ -41,6 +40,7 @@ #include "library/gpu.hpp" #include "library/perfetto.hpp" #include "library/state.hpp" +#include "library/thread_info.hpp" #include #include @@ -65,10 +65,8 @@ namespace omnitrace { namespace rocm_smi { -using tim::type_mutex; -using auto_lock_t = tim::auto_lock_t; using bundle_t = std::deque; -using sampler_instances = thread_data; +using sampler_instances = thread_data; namespace { @@ 
-247,6 +245,10 @@ data::post_process(uint32_t _dev_id) auto _rocm_smi = (_rocm_smi_v) ? *_rocm_smi_v : std::deque{}; auto _process_perfetto = [&]() { + const auto& _thread_info = thread_info::get(0, LookupTID); + OMNITRACE_CI_THROW(!_thread_info, "Missing thread info for thread 0"); + if(!_thread_info) return; + for(auto& itr : _rocm_smi) { using counter_track = perfetto_counter_track; @@ -262,7 +264,7 @@ data::post_process(uint32_t _dev_id) counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); } uint64_t _ts = itr.m_ts; - if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + if(!_thread_info->is_valid_time(_ts)) continue; double _busy = itr.m_busy_perc; double _temp = itr.m_temp / 1.0e3; @@ -289,7 +291,7 @@ data::post_process(uint32_t _dev_id) { using entry_t = critical_trace::entry; auto _ts = itr.m_ts; - if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + if(!_thread_info->is_valid_time(_ts)) continue; auto _entries = critical_trace::get_entries(_ts, [](const entry_t& _e) { return _e.device == critical_trace::Device::GPU; @@ -322,7 +324,7 @@ data::post_process(uint32_t _dev_id) void setup() { - auto_lock_t _lk{ type_mutex() }; + auto_lock_t _lk{ type_mutex() }; if(is_initialized() || !get_use_rocm_smi()) return; @@ -407,7 +409,7 @@ setup() void shutdown() { - auto_lock_t _lk{ type_mutex() }; + auto_lock_t _lk{ type_mutex() }; if(!is_initialized()) return; @@ -454,18 +456,18 @@ device_count() } // namespace rocm_smi } // namespace omnitrace -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( +OMNITRACE_INSTANTIATE_EXTERN_COMPONENT( 
TIMEMORY_ESC(data_tracker), true, double) diff --git a/source/lib/omnitrace/library/components/rocm_smi.hpp b/source/lib/omnitrace/library/rocm_smi.hpp similarity index 97% rename from source/lib/omnitrace/library/components/rocm_smi.hpp rename to source/lib/omnitrace/library/rocm_smi.hpp index 8a6a6e3f3..b22658e39 100644 --- a/source/lib/omnitrace/library/components/rocm_smi.hpp +++ b/source/lib/omnitrace/library/rocm_smi.hpp @@ -154,19 +154,19 @@ inline void set_state(State) {} # include # include -TIMEMORY_DECLARE_EXTERN_COMPONENT( +OMNITRACE_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_DECLARE_EXTERN_COMPONENT( +OMNITRACE_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_DECLARE_EXTERN_COMPONENT( +OMNITRACE_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) -TIMEMORY_DECLARE_EXTERN_COMPONENT( +OMNITRACE_DECLARE_EXTERN_COMPONENT( TIMEMORY_ESC(data_tracker), true, double) diff --git a/source/lib/omnitrace/library/rocprofiler.cpp b/source/lib/omnitrace/library/rocprofiler.cpp index 82cca15b5..fbacd4c2c 100644 --- a/source/lib/omnitrace/library/rocprofiler.cpp +++ b/source/lib/omnitrace/library/rocprofiler.cpp @@ -174,10 +174,11 @@ rocm_dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, rocm_check_status(rocprofiler_get_metrics(group.context)); } - auto _evt = comp::rocm_event{ _dev_id, _thread_id, _queue_id, _kernel_name, - record->begin, record->end, feature_count, features }; + auto _evt = + component::rocm_event{ _dev_id, _thread_id, _queue_id, _kernel_name, + record->begin, record->end, feature_count, features }; - comp::rocm_data()->emplace_back(_evt); + component::rocm_data()->emplace_back(_evt); } // Profiling completion handler @@ -232,7 +233,6 @@ rocm_dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* a unsigned metrics_input(unsigned _device, rocprofiler_feature_t** ret) { - // OMNITRACE_THROW("%s\n", __FUNCTION__); // Profiling 
feature objects auto _events = tim::delimit(config::get_rocm_events(), ", ;\t\n"); std::vector _features = {}; @@ -275,8 +275,8 @@ metrics_input(unsigned _device, rocprofiler_feature_t** ret) struct info_data { - const AgentInfo* agent = nullptr; - std::vector* data = nullptr; + const AgentInfo* agent = nullptr; + std::vector* data = nullptr; }; hsa_status_t @@ -304,7 +304,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) { auto _sym = JOIN("", info.metric.name, _device_qualifier_sym); auto _short_desc = JOIN("", "Derived counter: ", info.metric.expr); - _data->emplace_back(comp::rocm_info_entry( + _data->emplace_back(component::rocm_info_entry( true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym, _short_desc, _long_desc, _units, qualifier_vec_t{ _device_qualifier })); @@ -316,7 +316,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) auto _sym = JOIN("", info.metric.name, _device_qualifier_sym); auto _short_desc = JOIN("", info.metric.name, " on device ", _agent->dev_index); - _data->emplace_back(comp::rocm_info_entry( + _data->emplace_back(component::rocm_info_entry( true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym, _short_desc, _long_desc, _units, qualifier_vec_t{ _device_qualifier })); @@ -334,7 +334,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) _device_qualifier_sym); auto _short_desc = JOIN("", info.metric.name, " instance ", i, " on device ", _agent->dev_index); - _data->emplace_back(comp::rocm_info_entry( + _data->emplace_back(component::rocm_info_entry( true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym, _pysym, _short_desc, _long_desc, _units, qualifier_vec_t{ _device_qualifier, _instance_qualifier })); @@ -348,10 +348,10 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg) return HSA_STATUS_SUCCESS; } -std::vector +std::vector rocm_metrics() { - std::vector _data = {}; + std::vector _data = {}; try { (void) 
HsaRsrcFactory::Instance(); @@ -475,11 +475,11 @@ rocm_cleanup() namespace { -using rocm_event = comp::rocm_event; -using rocm_data_t = comp::rocm_data_t; -using rocm_metric_type = comp::rocm_metric_type; -using rocm_feature_value = comp::rocm_feature_value; -using rocm_data_tracker = comp::rocm_data_tracker; +using rocm_event = component::rocm_event; +using rocm_data_t = component::rocm_data_t; +using rocm_metric_type = component::rocm_metric_type; +using rocm_feature_value = component::rocm_feature_value; +using rocm_data_tracker = component::rocm_data_tracker; void post_process_perfetto() @@ -496,7 +496,7 @@ post_process_perfetto() for(size_t i = 0; i < OMNITRACE_MAX_THREADS; ++i) { - auto& _v = comp::rocm_data(i); + auto& _v = component::rocm_data(i); if(_v) { _data.reserve(_data.size() + _v->size()); @@ -605,7 +605,7 @@ post_process_timemory() for(size_t i = 0; i < OMNITRACE_MAX_THREADS; ++i) { - auto& _v = comp::rocm_data(i); + auto& _v = component::rocm_data(i); if(_v) { _data.reserve(_data.size() + _v->size()); diff --git a/source/lib/omnitrace/library/rocprofiler.hpp b/source/lib/omnitrace/library/rocprofiler.hpp index 3c0e75f1f..e08329c9a 100644 --- a/source/lib/omnitrace/library/rocprofiler.hpp +++ b/source/lib/omnitrace/library/rocprofiler.hpp @@ -65,7 +65,7 @@ is_setup(); void post_process(); -std::vector +std::vector rocm_metrics(); #if !defined(OMNITRACE_USE_ROCPROFILER) || OMNITRACE_USE_ROCPROFILER == 0 @@ -77,10 +77,10 @@ inline void rocm_cleanup() {} -inline std::vector +inline std::vector rocm_metrics() { - return std::vector{}; + return std::vector{}; } #endif diff --git a/source/lib/omnitrace/library/roctracer.cpp b/source/lib/omnitrace/library/roctracer.cpp index 3ac752697..b587a0733 100644 --- a/source/lib/omnitrace/library/roctracer.cpp +++ b/source/lib/omnitrace/library/roctracer.cpp @@ -21,7 +21,6 @@ // SOFTWARE. 
#include "library/roctracer.hpp" -#include "library.hpp" #include "library/components/fwd.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" @@ -98,7 +97,7 @@ auto& get_roctracer_hip_data(int64_t _tid = threading::get_id()) { using data_t = std::unordered_map; - using thread_data_t = thread_data; + using thread_data_t = thread_data; static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); return _v.at(_tid); } @@ -137,7 +136,7 @@ auto& get_roctracer_cid_data(int64_t _tid = threading::get_id()) { using thread_data_t = - thread_data, api::roctracer>; + thread_data, category::roctracer>; static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); return *_v.at(_tid); } @@ -145,8 +144,9 @@ get_roctracer_cid_data(int64_t _tid = threading::get_id()) auto& get_hip_activity_callbacks(int64_t _tid = threading::get_id()) { - using thread_data_t = thread_data>, api::roctracer>; - static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); + using thread_data_t = + thread_data>, category::roctracer>; + static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); return _v.at(_tid); } @@ -156,8 +156,8 @@ using key_data_mutex_t = std::decay_t; auto& get_hip_activity_mutex(int64_t _tid = threading::get_id()) { - return tim::type_mutex( - _tid); + return tim::type_mutex(_tid); } } // namespace @@ -230,17 +230,6 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - static thread_local std::once_flag _once{}; - std::call_once(_once, []() { - threading::offset_this_id(true); - if(threading::get_id() != 0) - { - sampling::block_signals(); - threading::set_thread_name("roctracer.hsa"); - sampling::shutdown(); - } - }); - (void) arg; const hsa_api_data_t* data = reinterpret_cast(callback_data); OMNITRACE_CONDITIONAL_PRINT_F( @@ -326,9 +315,8 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const 
void* callback_data, void* if(get_use_timemory()) { - std::unique_lock _lk{ tasking::roctracer::get_mutex() }; - auto _beg_ns = begin_timestamp; - auto _end_ns = end_timestamp; + auto _beg_ns = begin_timestamp; + auto _end_ns = end_timestamp; if(tasking::roctracer::get_task_group().pool()) tasking::roctracer::get_task_group().exec( [_name, _beg_ns, _end_ns]() { @@ -413,7 +401,6 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) } }; - std::unique_lock _lk{ tasking::roctracer::get_mutex() }; if(tasking::roctracer::get_task_group().pool()) tasking::roctracer::get_task_group().exec(_func); @@ -463,13 +450,15 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, if(get_use_perfetto()) tracing::push_perfetto(category::rocm_roctx{}, _data->args.message); - if(get_use_timemory()) tracing::push_timemory(_data->args.message); + if(get_use_timemory()) + tracing::push_timemory(category::rocm_roctx{}, _data->args.message); break; } case ROCTX_API_ID_roctxRangePop: { - if(get_use_timemory()) tracing::pop_timemory(_data->args.message); + if(get_use_timemory()) + tracing::pop_timemory(category::rocm_roctx{}, _data->args.message); if(get_use_perfetto()) tracing::pop_perfetto(category::rocm_roctx{}, _data->args.message); break; @@ -486,7 +475,8 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, if(get_use_perfetto()) tracing::push_perfetto(category::rocm_roctx{}, _data->args.message); - if(get_use_timemory()) tracing::push_timemory(_data->args.message); + if(get_use_timemory()) + tracing::push_timemory(category::rocm_roctx{}, _data->args.message); break; } case ROCTX_API_ID_roctxRangeStop: @@ -513,7 +503,8 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, if(!_message.empty()) { - if(get_use_timemory()) tracing::pop_timemory(_message.data()); + if(get_use_timemory()) + tracing::pop_timemory(category::rocm_roctx{}, _message.data()); if(get_use_perfetto()) 
tracing::pop_perfetto(category::rocm_roctx{}, _message.data()); } @@ -845,9 +836,9 @@ hip_activity_callback(const char* begin, const char* end, void*) const char* op_name = roctracer_op_string(record->domain, record->op, record->kind); - - uint64_t _beg_ns = record->begin_ns + get_clock_skew(); - uint64_t _end_ns = record->end_ns + get_clock_skew(); + auto _ns_skew = get_clock_skew(); + uint64_t _beg_ns = record->begin_ns + _ns_skew; + uint64_t _end_ns = record->end_ns + _ns_skew; auto _corr_id = record->correlation_id; static auto _scope = []() { auto _v = scope::config{}; @@ -902,11 +893,13 @@ hip_activity_callback(const char* begin, const char* end, void*) { static size_t _n = 0; OMNITRACE_CONDITIONAL_PRINT_F( - get_debug() && get_verbose() >= 2, - "%4zu :: %-20s :: %-20s :: correlation_id(%6lu) time_ns(%12lu:%12lu) " - "delta_ns(%12lu) device_id(%d) stream_id(%lu) proc_id(%u) thr_id(%lu)\n", + (get_debug() && get_verbose() >= 2) || _end_ns <= _beg_ns, + "%4zu :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) " + "delta=%li, device_id=%d, stream_id=%lu, pid=%u, tid=%lu\n", _n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns, - (_end_ns - _beg_ns), _devid, _queid, record->process_id, _tid); + (static_cast(_end_ns) - static_cast(_beg_ns)), _devid, + _queid, record->process_id, _tid); + if(_end_ns <= _beg_ns) continue; } // execute this on this thread bc of how perfetto visualization works @@ -918,7 +911,7 @@ hip_activity_callback(const char* begin, const char* end, void*) if(_kernel_names.find(_name) == _kernel_names.end()) _kernel_names.emplace(_name, tim::demangle(_name)); - assert(_end_ns > _beg_ns); + assert(_end_ns >= _beg_ns); tracing::push_perfetto_ts( category::device_hip{}, _kernel_names.at(_name).c_str(), _beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns", _beg_ns, "corr_id", diff --git a/source/lib/omnitrace/library/roctracer.hpp b/source/lib/omnitrace/library/roctracer.hpp index 951bc5a33..871f25515 100644 --- 
a/source/lib/omnitrace/library/roctracer.hpp +++ b/source/lib/omnitrace/library/roctracer.hpp @@ -25,7 +25,6 @@ #include "library/components/roctracer.hpp" #include "library/config.hpp" #include "library/debug.hpp" -#include "library/dynamic_library.hpp" #include "library/perfetto.hpp" #include "library/ptl.hpp" @@ -48,9 +47,9 @@ namespace omnitrace { using roctracer_bundle_t = - tim::component_bundle; + tim::component_bundle; using roctracer_hsa_bundle_t = - tim::component_bundle; + tim::component_bundle; using roctracer_functions_t = std::vector>>; // HSA API callback function diff --git a/source/lib/omnitrace/library/runtime.cpp b/source/lib/omnitrace/library/runtime.cpp index 9bbf178df..2d9157c3d 100644 --- a/source/lib/omnitrace/library/runtime.cpp +++ b/source/lib/omnitrace/library/runtime.cpp @@ -21,7 +21,7 @@ // SOFTWARE. #include "library/runtime.hpp" -#include "library/api.hpp" +#include "api.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,8 @@ get_cputime_signal() std::set get_sampling_signals(int64_t _tid) { + if(!get_use_sampling()) return std::set{}; + auto _sigreal = get_realtime_signal(); auto _sigprof = get_cputime_signal(); @@ -168,7 +171,8 @@ get_cpu_cid_stack_lock(int64_t _tid) { struct cpu_cid_stack_s {}; - return tim::type_mutex(_tid); + return tim::type_mutex( + _tid); } namespace @@ -183,19 +187,24 @@ setup_gotchas() OMNITRACE_BASIC_DEBUG( "Configuring gotcha wrapper around fork, MPI_Init, and MPI_Init_thread\n"); - mpi_gotcha::configure(); - exit_gotcha::configure(); - fork_gotcha::configure(); - pthread_gotcha::configure(); + component::mpi_gotcha::configure(); + component::exit_gotcha::configure(); + component::fork_gotcha::configure(); } } // namespace std::unique_ptr& get_main_bundle() { - static auto _v = - std::make_unique(JOIN('/', "omnitrace/process", process::get_id()), - quirk::config{}); + 
static auto _v = []() { + auto _self = RUSAGE_SELF; + std::swap(_self, tim::get_rusage_type()); + auto _tmp = std::make_unique( + JOIN('/', "omnitrace/process", process::get_id()), + quirk::config{}); + std::swap(_self, tim::get_rusage_type()); + return _tmp; + }(); return _v; } @@ -239,12 +248,16 @@ set_thread_state(ThreadState _n) ThreadState push_thread_state(ThreadState _v) { + if(get_thread_state() >= ThreadState::Completed) return get_thread_state(); + return get_thread_state_history().emplace_back(set_thread_state(_v)); } ThreadState pop_thread_state() { + if(get_thread_state() >= ThreadState::Completed) return get_thread_state(); + auto& _hist = get_thread_state_history(); if(!_hist.empty()) { @@ -253,5 +266,4 @@ pop_thread_state() } return get_thread_state(); } - } // namespace omnitrace diff --git a/source/lib/omnitrace/library/runtime.hpp b/source/lib/omnitrace/library/runtime.hpp index 228449b3c..6608fd737 100644 --- a/source/lib/omnitrace/library/runtime.hpp +++ b/source/lib/omnitrace/library/runtime.hpp @@ -22,7 +22,7 @@ #pragma once -#include "library/api.hpp" +#include "api.hpp" #include "library/common.hpp" #include "library/components/exit_gotcha.hpp" #include "library/components/fork_gotcha.hpp" @@ -47,8 +47,8 @@ namespace omnitrace { // bundle of components around omnitrace_init and omnitrace_finalize using main_bundle_t = - tim::lightweight_tuple; + tim::lightweight_tuple; using gotcha_bundle_t = tim::lightweight_tuple; diff --git a/source/lib/omnitrace/library/sampling.cpp b/source/lib/omnitrace/library/sampling.cpp index 6a31a6cca..81398cf40 100644 --- a/source/lib/omnitrace/library/sampling.cpp +++ b/source/lib/omnitrace/library/sampling.cpp @@ -21,11 +21,20 @@ // SOFTWARE. 
#include "library/sampling.hpp" +#include "library/common.hpp" +#include "library/components/backtrace.hpp" +#include "library/components/backtrace_metrics.hpp" +#include "library/components/backtrace_timestamp.hpp" #include "library/components/fwd.hpp" +#include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/ptl.hpp" #include "library/runtime.hpp" +#include "library/thread_data.hpp" +#include "library/thread_info.hpp" +#include "library/tracing.hpp" +#include "library/utility.hpp" #include #include @@ -67,21 +76,30 @@ namespace omnitrace { namespace sampling { -using bundle_t = tim::lightweight_tuple; -using sampler_t = tim::sampling::sampler; +using hw_counters = typename component::backtrace_metrics::hw_counters; +using signal_type_instances = thread_data, category::sampling>; +using sampler_running_instances = thread_data; +using bundle_t = + tim::lightweight_tuple; +using sampler_t = tim::sampling::sampler; +using sampler_instances = thread_data; +using sampler_init_instances = thread_data; + +using tim::sampling::timer; } // namespace sampling } // namespace omnitrace +OMNITRACE_DEFINE_CONCRETE_TRAIT(prevent_reentry, sampling::sampler_t, std::true_type) + +OMNITRACE_DEFINE_CONCRETE_TRAIT(provide_backtrace, sampling::sampler_t, std::false_type) + +OMNITRACE_DEFINE_CONCRETE_TRAIT(buffer_size, sampling::sampler_t, + TIMEMORY_ESC(std::integral_constant)) + namespace omnitrace { namespace sampling { -using hw_counters = typename component::backtrace::hw_counters; -using signal_type_instances = thread_data, api::sampling>; -using backtrace_init_instances = thread_data; -using sampler_running_instances = thread_data; -using papi_vector_instances = thread_data; - namespace { template @@ -119,6 +137,166 @@ get_signal_names(Tp&& _v) " "; return _sig_names.substr(0, _sig_names.length() - 1); } + +unique_ptr_t& +get_sampler(int64_t _tid = threading::get_id()) +{ + static auto& _v = 
sampler_instances::instances(); + return _v.at(_tid); +} + +unique_ptr_t& +get_sampler_init(int64_t _tid = threading::get_id()) +{ + static auto& _v = sampler_init_instances::instances(); + if(!_v.at(_tid)) _v.at(_tid) = unique_ptr_t{ new bundle_t{} }; + return _v.at(_tid); +} + +unique_ptr_t& +get_sampler_running(int64_t _tid) +{ + static auto& _v = sampler_running_instances::instances( + sampler_running_instances::construct_on_init{}, false); + return _v.at(_tid); +} + +std::set +configure(bool _setup, int64_t _tid = threading::get_id()) +{ + const auto& _info = thread_info::get(_tid, InternalTID); + auto& _sampler = sampling::get_sampler(_tid); + auto& _running = get_sampler_running(_tid); + bool _is_running = (!_running) ? false : *_running; + auto& _signal_types = sampling::get_signal_types(_tid); + + pthread_gotcha::push_enable_sampling_on_child_threads(false); + auto _dtor = scope::destructor{ []() { + pthread_gotcha::pop_enable_sampling_on_child_threads(); + } }; + + if(_setup && !_sampler && !_is_running && !_signal_types->empty()) + { + // if this thread has an offset ID, that means it was created internally + // and is probably here bc it called a function which was instrumented. 
+ // thus we should not start a sampler for it + if(_tid > 0 && _info && _info->is_offset) return std::set{}; + // if the thread state is disabled or completed, return + if(_info && _info->index_data->internal_value == _tid && + get_thread_state() == ThreadState::Disabled) + return std::set{}; + + (void) get_debug_sampling(); // make sure query in sampler does not allocate + assert(_tid == threading::get_id()); + + if(trait::runtime_enabled::get()) + backtrace_metrics::configure(_setup, _tid); + + sampling::block_signals(*_signal_types); + auto _alrm_freq = get_sampling_freq(); + auto _prof_freq = get_sampling_freq(); + auto _delay = std::max(1.0e-3, get_sampling_delay()); + auto _verbose = std::min(get_verbose() - 2, 2); + if(get_debug_sampling()) _verbose = 2; + if(!get_use_sampling_realtime()) _alrm_freq = std::min(_alrm_freq, 50.0); + + OMNITRACE_DEBUG("Configuring sampler for thread %lu...\n", _tid); + sampling::sampler_instances::construct("omnitrace", _tid, _verbose); + + _sampler->set_flags(SA_RESTART); + _sampler->set_verbose(_verbose); + + if(_signal_types->count(get_realtime_signal()) > 0) + { + _sampler->configure(timer{ get_realtime_signal(), CLOCK_REALTIME, + SIGEV_THREAD_ID, _alrm_freq, _delay, _tid, + threading::get_sys_tid() }); + } + + if(_signal_types->count(get_cputime_signal()) > 0) + { + _sampler->configure(timer{ get_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, + SIGEV_THREAD_ID, _prof_freq, _delay, _tid, + threading::get_sys_tid() }); + } + + static_assert(tim::trait::buffer_size::value > 0, + "Error! 
Zero buffer size"); + + OMNITRACE_CONDITIONAL_THROW( + _sampler->get_buffer_size() != + tim::trait::buffer_size::value, + "dynamic sampler has a buffer size different from static trait: %zu instead " + "of %zu", + _sampler->get_buffer_size(), + tim::trait::buffer_size::value); + + OMNITRACE_CONDITIONAL_THROW( + _sampler->get_buffer_size() <= 0, + "dynamic sampler requires a positive buffer size: %zu", + _sampler->get_buffer_size()); + + for(auto itr : *_signal_types) + { + const char* _type = (itr == get_realtime_signal()) ? "wall" : "CPU"; + const auto* _timer = _sampler->get_timer(itr); + if(_timer) + { + OMNITRACE_VERBOSE( + 1, + "[SIG%i] Sampler for thread %lu will be triggered %.1fx per " + "second of %s-time (every %.3e milliseconds)...\n", + itr, _tid, _timer->get_frequency(units::sec), _type, + _timer->get_period(units::msec)); + } + } + + *_running = true; + sampling::get_sampler_init(_tid)->sample(); + _sampler->start(); + } + else if(!_setup && _sampler && _is_running) + { + OMNITRACE_DEBUG("Destroying sampler for thread %lu...\n", _tid); + *_running = false; + + if(_tid == threading::get_id() && !_signal_types->empty()) + { + sampling::block_signals(*_signal_types); + } + + if(_tid == 0) + { + // this propagates to all threads + _sampler->ignore(*_signal_types); + for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) + { + if(sampling::get_sampler(i)) + { + sampling::get_sampler(i)->stop(); + sampling::get_sampler(i)->reset(); + *get_sampler_running(i) = false; + } + } + } + + _sampler->stop(); + + if(trait::runtime_enabled::get()) + backtrace_metrics::configure(_setup, _tid); + + OMNITRACE_DEBUG("Sampler destroyed for thread %lu\n", _tid); + } + + return (_signal_types) ? 
*_signal_types : std::set{}; +} + +void +post_process_perfetto(int64_t _tid, const bundle_t* _init, + const std::vector& _data); +void +post_process_timemory(int64_t _tid, const bundle_t* _init, + const std::vector& _data); } // namespace unique_ptr_t>& @@ -133,13 +311,13 @@ std::set setup() { if(!get_use_sampling()) return std::set{}; - return backtrace::configure(true); + return configure(true); } std::set shutdown() { - return backtrace::configure(false); + return configure(false); } void @@ -176,11 +354,366 @@ unblock_signals(std::set _signals) thread_sigmask(SIG_UNBLOCK, &_v, nullptr); } -unique_ptr_t& -get_sampler(int64_t _tid) +void +post_process() { - static auto& _v = sampler_instances::instances(); - return _v.at(_tid); + omnitrace::component::backtrace::stop(); + OMNITRACE_VERBOSE(2 || get_debug_sampling(), "Stopping backtrace metrics...\n"); + + for(size_t i = 0; i < max_supported_threads; ++i) + backtrace_metrics::configure(false, i); + + OMNITRACE_VERBOSE(1 || get_debug_sampling(), "Post-processing sampling data...\n"); + + for(size_t i = 0; i < max_supported_threads; ++i) + { + auto& _sampler = get_sampler(i); + + if(!_sampler) + { + // this should be relatively common + OMNITRACE_CONDITIONAL_PRINT( + get_debug() && get_verbose() >= 2, + "Post-processing sampling entries for thread %lu skipped (no sampler)\n", + i); + continue; + } + + auto* _init = get_sampler_init(i).get(); + + if(!_init) + { + // this is not common + OMNITRACE_PRINT("Post-processing sampling entries for thread %lu skipped " + "(not initialized)\n", + i); + continue; + } + + const auto& _thread_info = thread_info::get(i, InternalTID); + + OMNITRACE_VERBOSE(3 || get_debug_sampling(), + "Getting sampler data for thread %lu...\n", i); + + _sampler->stop(); + auto& _raw_data = _sampler->get_data(); + + OMNITRACE_VERBOSE(0 || get_debug_sampling(), + "Sampler data for thread %lu has %zu initial entries...\n", i, + _raw_data.size()); + + OMNITRACE_CI_THROW( + _sampler->get_sample_count() 
!= _raw_data.size(), + "Error! sampler recorded %zu samples but %zu samples were returned\n", + _sampler->get_sample_count(), _raw_data.size()); + // single sample that is useless (backtrace to unblocking signals) + if(_raw_data.size() == 1 && _raw_data.front().size() <= 1) _raw_data.clear(); + + // collect pointers to the samples that have a non-empty backtrace and a + // timestamp accepted by the thread's is_valid_time check + std::vector _data{}; + // at most one pointer is stored per raw sample: reserve once up front + // instead of requesting a slightly larger capacity on every iteration + _data.reserve(_raw_data.size()); + for(auto& itr : _raw_data) + { + auto* _bt = itr.get(); + auto* _ts = itr.get(); + if(!_bt || !_ts) continue; + if(_bt->empty()) continue; + // thread_info::get may hand back an empty optional, so only apply the + // time filter when thread info is actually available + if(_thread_info && !_thread_info->is_valid_time(_ts->get_timestamp())) + continue; + _data.emplace_back(&itr); + } + + if(_data.empty()) + { + // report the filtered count (always zero here), not the raw sample count + OMNITRACE_VERBOSE( + 3 || get_debug_sampling(), + "Sampler data for thread %lu has %zu valid entries... (skipped)\n", i, + _data.size()); + continue; + } + + // report how many entries survived the filtering above + OMNITRACE_VERBOSE(0 || get_debug_sampling(), + "Sampler data for thread %lu has %zu valid entries...\n", i, + _data.size()); + + if(get_use_perfetto()) post_process_perfetto(i, _init, _data); + if(get_use_timemory()) post_process_timemory(i, _init, _data); + } + + OMNITRACE_VERBOSE(0 || get_debug_sampling(), + "Post-processing sampling entries completed\n"); + + for(size_t i = 0; i < max_supported_threads; ++i) + { + get_sampler(i).reset(); + } + + OMNITRACE_VERBOSE(0 || get_debug_sampling(), "Post-processing samplers destroyed\n"); +} + +namespace +{ +void +post_process_perfetto(int64_t _tid, const bundle_t* _init, + const std::vector& _data) +{ + if(trait::runtime_enabled::get()) + { + OMNITRACE_VERBOSE(3 || get_debug_sampling(), + "[%li] Post-processing metrics for perfetto...\n", _tid); + backtrace_metrics::init_perfetto(_tid); + for(const auto& itr : _data) + { + const auto* _bt_metrics = itr->get(); + const auto* _bt_time = itr->get(); + if(!_bt_metrics || !_bt_time) continue; + if(_bt_time->get_tid() != _tid) continue; + _bt_metrics->post_process_perfetto(_tid, _bt_time->get_timestamp()); + } + + backtrace_metrics::fini_perfetto(_tid); + } + + auto _process_perfetto = [_tid, + 
_init](const std::vector& _data) { + OMNITRACE_VERBOSE(3 || get_debug_sampling(), + "[%li] Post-processing backtraces for perfetto...\n", _tid); + + const auto& _thread_info = thread_info::get(_tid, InternalTID); + OMNITRACE_CI_THROW(!_thread_info, "No valid thread info for tid=%li\n", _tid); + + if(!_thread_info) return; + + uint64_t _beg_ns = _thread_info->get_start(); + uint64_t _end_ns = _thread_info->get_stop(); + uint64_t _last_ts = std::max( + _init->get()->get_timestamp(), _beg_ns); + + tracing::push_perfetto_ts(category::sampling{}, "samples [omnitrace]", _beg_ns, + "begin_ns", _beg_ns); + + for(const auto& itr : _data) + { + const auto* _bt_ts = itr->get(); + const auto* _bt_cs = itr->get(); + + if(!_bt_ts || !_bt_cs) continue; + if(_bt_ts->get_tid() != _tid) continue; + + static std::set _static_strings{}; + for(const auto& itr : backtrace::filter_and_patch(_bt_cs->get())) + { + const auto* _name = _static_strings.emplace(itr).first->c_str(); + uint64_t _beg = _last_ts; + uint64_t _end = _bt_ts->get_timestamp(); + if(!_thread_info->is_valid_lifetime({ _beg, _end })) continue; + + tracing::push_perfetto_ts(category::sampling{}, _name, _beg, "begin_ns", + _beg); + tracing::pop_perfetto_ts(category::sampling{}, _name, _end, "end_ns", + _end); + } + _last_ts = _bt_ts->get_timestamp(); + } + + tracing::pop_perfetto_ts(category::sampling{}, "samples [omnitrace]", _end_ns, + "end_ns", _end_ns); + }; + + auto _processing_thread = threading::get_tid(); + auto _process_perfetto_wrapper = [&]() { + if(threading::get_tid() != _processing_thread) + threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid, "(S)").c_str()); + + try + { + _process_perfetto(_data); + } catch(std::runtime_error& _e) + { + OMNITRACE_PRINT("[sampling][post_process_perfetto] Exception: %s\n", + _e.what()); + OMNITRACE_CI_ABORT(true, "[sampling][post_process_perfetto] Exception: %s\n", + _e.what()); + } + }; + + if(_tid == 0 && config::get_mode() == Mode::Sampling && + 
config::get_perfetto_fill_policy() == "discard") + { + _process_perfetto(_data); + } + else + { + pthread_gotcha::push_enable_sampling_on_child_threads(false); + std::thread{ _process_perfetto_wrapper }.join(); + pthread_gotcha::pop_enable_sampling_on_child_threads(); + } } + +void +post_process_timemory(int64_t _tid, const bundle_t* _init, + const std::vector& _data) +{ + std::map> _depth_sum = {}; + auto _scope = tim::scope::config{}; + if(get_timeline_sampling()) _scope += scope::timeline{}; + if(get_flat_sampling()) _scope += scope::flat{}; + + OMNITRACE_VERBOSE(3 || get_debug_sampling(), + "[%li] Post-processing data for timemory...\n", _tid); + + const auto* _last = _init; + for(const auto& itr : _data) + { + using bundle_t = tim::lightweight_tuple; + + auto* _bt_data = itr->get(); + auto* _bt_time = itr->get(); + auto* _bt_metrics = itr->get(); + + if(!_bt_data || !_bt_time || !_bt_metrics) continue; + + double _elapsed_wc = (_bt_time->get_timestamp() - + _last->get()->get_timestamp()); + double _elapsed_cc = (_bt_metrics->get_cpu_timestamp() - + _last->get()->get_cpu_timestamp()); + + std::vector _tc{}; + _tc.reserve(_bt_data->size()); + + // generate the instances of the tuple of components and start them + for(const auto& itr : backtrace::filter_and_patch(_bt_data->get())) + { + _tc.emplace_back(tim::string_view_t{ itr }, _scope); + _tc.back().push(_bt_time->get_tid()); + _tc.back().start(); + } + + // stop the instances and update the values as needed + for(size_t i = 0; i < _tc.size(); ++i) + { + auto& itr = _tc.at(_tc.size() - i - 1); + size_t _depth = 0; + _depth_sum[_bt_time->get_tid()][_depth] += 1; + itr.stop(); + if constexpr(tim::trait::is_available::value) + { + auto* _sc = itr.get(); + if(_sc) + { + auto _value = _elapsed_wc / sampling_wall_clock::get_unit(); + _sc->set_value(_value); + _sc->set_accum(_value); + } + } + if constexpr(tim::trait::is_available::value) + { + auto* _cc = itr.get(); + if(_cc) + { + _cc->set_value(_elapsed_cc / 
sampling_cpu_clock::get_unit()); + _cc->set_accum(_elapsed_cc / sampling_cpu_clock::get_unit()); + } + } + if constexpr(tim::trait::is_available::value) + { + auto _hw_cnt_vals = _bt_metrics->get_hw_counters(); + if(_last && _bt_metrics->get_hw_counters().size() == + _last->get()->get_hw_counters().size()) + { + for(size_t k = 0; k < _bt_metrics->get_hw_counters().size(); ++k) + { + if(_last->get()->get_hw_counters()[k] > + _hw_cnt_vals[k]) + _hw_cnt_vals[k] -= + _last->get()->get_hw_counters()[k]; + } + } + auto* _hw_counter = itr.get(); + if(_hw_counter) + { + _hw_counter->set_value(_hw_cnt_vals); + _hw_counter->set_accum(_hw_cnt_vals); + } + } + itr.pop(); + } + _last = itr; + } + + for(auto&& itr : _data) + { + using bundle_t = + tim::lightweight_tuple>; + + auto* _bt_data = itr->get(); + auto* _bt_time = itr->get(); + + if(!_bt_time || !_bt_data) continue; + if(_depth_sum.find(_bt_time->get_tid()) == _depth_sum.end()) continue; + + std::vector _tc{}; + _tc.reserve(_bt_data->size()); + + // generate the instances of the tuple of components and start them + for(const auto& itr : backtrace::filter_and_patch(_bt_data->get())) + { + _tc.emplace_back(tim::string_view_t{ itr }); + _tc.back().push(_bt_time->get_tid()); + _tc.back().start(); + } + + // stop the instances and update the values as needed + for(size_t i = 0; i < _tc.size(); ++i) + { + auto& itr = _tc.at(_tc.size() - i - 1); + size_t _depth = 0; + double _value = (1.0 / _depth_sum[_bt_time->get_tid()][_depth]) * 100.0; + itr.store(std::plus{}, _value); + itr.stop(); + itr.pop(); + } + } +} + +struct sampling_initialization +{ + static void preinit() + { + sampling_wall_clock::label() = "sampling_wall_clock"; + sampling_wall_clock::description() = "Wall clock time (via sampling)"; + + sampling_cpu_clock::label() = "sampling_cpu_clock"; + sampling_cpu_clock::description() = "CPU clock time (via sampling)"; + + sampling_percent::label() = "sampling_percent"; + sampling_percent::description() = "Percentage of 
samples"; + + sampling_gpu_busy::label() = "sampling_gpu_busy_percent"; + sampling_gpu_busy::description() = "Utilization of GPU(s)"; + sampling_gpu_busy::set_precision(0); + sampling_gpu_busy::set_format_flags(sampling_gpu_busy::get_format_flags() & + std::ios_base::showpoint); + + sampling_gpu_memory::label() = "sampling_gpu_memory_usage"; + sampling_gpu_memory::description() = "Memory usage of GPU(s)"; + + sampling_gpu_power::label() = "sampling_gpu_power"; + sampling_gpu_power::description() = "Power usage of GPU(s)"; + sampling_gpu_power::unit() = units::watt; + sampling_gpu_power::display_unit() = "watts"; + sampling_gpu_power::set_precision(2); + sampling_gpu_power::set_format_flags(sampling_gpu_power::get_format_flags()); + + sampling_gpu_temp::label() = "sampling_gpu_temperature"; + sampling_gpu_temp::description() = "Temperature of GPU(s)"; + sampling_gpu_temp::unit() = 1; + sampling_gpu_temp::display_unit() = "degC"; + sampling_gpu_temp::set_precision(1); + sampling_gpu_temp::set_format_flags(sampling_gpu_temp::get_format_flags()); + } +}; +} // namespace } // namespace sampling } // namespace omnitrace + +TIMEMORY_INVOKE_PREINIT(omnitrace::sampling::sampling_initialization) diff --git a/source/lib/omnitrace/library/sampling.hpp b/source/lib/omnitrace/library/sampling.hpp index ca8b74ad0..3ad938330 100644 --- a/source/lib/omnitrace/library/sampling.hpp +++ b/source/lib/omnitrace/library/sampling.hpp @@ -24,26 +24,30 @@ #include "library/common.hpp" #include "library/components/backtrace.hpp" +#include "library/components/backtrace_metrics.hpp" +#include "library/components/backtrace_timestamp.hpp" #include "library/components/fwd.hpp" #include "library/defines.hpp" #include "library/thread_data.hpp" #include "library/timemory.hpp" #include -#include #include #include #include #include +#include namespace omnitrace { namespace sampling { -using component::backtrace; +using component::backtrace; // NOLINT using component::backtrace_cpu_clock; // NOLINT 
using component::backtrace_fraction; // NOLINT +using component::backtrace_metrics; // NOLINT +using component::backtrace_timestamp; // NOLINT using component::backtrace_wall_clock; // NOLINT using component::sampling_cpu_clock; using component::sampling_gpu_busy; @@ -66,21 +70,8 @@ void block_signals(std::set = {}); void unblock_signals(std::set = {}); -using bundle_t = tim::lightweight_tuple; -using sampler_t = tim::sampling::sampler; -using sampler_instances = thread_data; - -unique_ptr_t& -get_sampler(int64_t _tid = threading::get_id()); +void +post_process(); } // namespace sampling } // namespace omnitrace - -TIMEMORY_DEFINE_CONCRETE_TRAIT(prevent_reentry, omnitrace::sampling::sampler_t, - std::true_type) - -TIMEMORY_DEFINE_CONCRETE_TRAIT(check_signals, omnitrace::sampling::sampler_t, - std::true_type) - -TIMEMORY_DEFINE_CONCRETE_TRAIT(buffer_size, omnitrace::sampling::sampler_t, - TIMEMORY_ESC(std::integral_constant)) diff --git a/source/lib/omnitrace/library/state.cpp b/source/lib/omnitrace/library/state.cpp index 700ae24c3..c5c7596bf 100644 --- a/source/lib/omnitrace/library/state.cpp +++ b/source/lib/omnitrace/library/state.cpp @@ -47,8 +47,8 @@ to_string(omnitrace::ThreadState _v) { case omnitrace::ThreadState::Enabled: return "Enabled"; case omnitrace::ThreadState::Internal: return "Internal"; - case omnitrace::ThreadState::Disabled: return "Disabled"; case omnitrace::ThreadState::Completed: return "Completed"; + case omnitrace::ThreadState::Disabled: return "Disabled"; } return {}; } diff --git a/source/lib/omnitrace/library/state.hpp b/source/lib/omnitrace/library/state.hpp index 22f588c34..91f17856e 100644 --- a/source/lib/omnitrace/library/state.hpp +++ b/source/lib/omnitrace/library/state.hpp @@ -32,8 +32,8 @@ enum class State : unsigned short PreInit = 0, Init, Active, + Finalized, Disabled, - Finalized }; // used for specifying the state of omnitrace @@ -41,8 +41,8 @@ enum class ThreadState : unsigned short { Enabled = 0, Internal, - Disabled, 
Completed, + Disabled, }; enum class Mode : unsigned short diff --git a/source/lib/omnitrace/library/thread_data.cpp b/source/lib/omnitrace/library/thread_data.cpp index 36457eedc..f2d322b72 100644 --- a/source/lib/omnitrace/library/thread_data.cpp +++ b/source/lib/omnitrace/library/thread_data.cpp @@ -22,9 +22,11 @@ #include "library/thread_data.hpp" #include "library/components/pthread_create_gotcha.hpp" +#include "library/thread_info.hpp" #include "library/utility.hpp" #include +#include namespace omnitrace { @@ -38,7 +40,7 @@ instrumentation_bundles::instances() void thread_deleter::operator()() const { - pthread_create_gotcha::shutdown(threading::get_id()); + component::pthread_create_gotcha::shutdown(threading::get_id()); set_thread_state(ThreadState::Completed); if(get_state() != State::Finalized && threading::get_id() == 0) omnitrace_finalize_hidden(); diff --git a/source/lib/omnitrace/library/thread_data.hpp b/source/lib/omnitrace/library/thread_data.hpp index aca30a01c..7c36ccd34 100644 --- a/source/lib/omnitrace/library/thread_data.hpp +++ b/source/lib/omnitrace/library/thread_data.hpp @@ -22,30 +22,31 @@ #pragma once -#include "library/api.hpp" +#include "api.hpp" #include "library/common.hpp" +#include "library/concepts.hpp" #include "library/config.hpp" #include "library/defines.hpp" #include "library/state.hpp" #include "library/timemory.hpp" +#include + #include #include #include #include +#include #include -#if !defined(OMNITRACE_MAX_THREADS) -# define OMNITRACE_MAX_THREADS 1024 -#endif - namespace omnitrace { ThreadState set_thread_state(ThreadState); // bundle of components used in instrumentation using instrumentation_bundle_t = - tim::component_bundle; + tim::component_bundle; // allocator for instrumentation_bundle_t using bundle_allocator_t = tim::data::ring_buffer_allocator; @@ -77,18 +78,56 @@ struct thread_deleter } }; +template +struct generate +{ + using type = Tp; + + template + auto operator()(Args&&... 
_args) const + { + if constexpr(concepts::is_unique_pointer::value) + { + using value_type = typename type::element_type; + return type{ new value_type{ invoke(std::forward(_args), 0)... } }; + } + else + { + return type{ invoke(std::forward(_args), 0)... }; + } + } + +private: + template + static auto invoke(Up&& _v, int, + std::enable_if_t::value, int> = 0) + -> decltype(std::forward(_v)()) + { + return std::forward(_v)(); + } + + template + static auto&& invoke(Up&& _v, long) + { + return std::forward(_v); + } +}; + +using construct_on_init = std::true_type; + template struct thread_data { - using instance_array_t = std::array, MaxThreads>; + using value_type = unique_ptr_t; + using instance_array_t = std::array; using construct_on_init = std::true_type; template static void construct(Args&&...); - static unique_ptr_t& instance(); + static value_type& instance(); static instance_array_t& instances(); template - static unique_ptr_t& instance(construct_on_init, Args&&...); + static value_type& instance(construct_on_init, Args&&...); template static instance_array_t& instances(construct_on_init, Args&&...); @@ -108,9 +147,9 @@ thread_data::construct(Args&&... _args) { // construct outside of lambda to prevent data-race static auto& _instances = instances(); - static thread_local bool _v = [&_args...]() { + static thread_local bool _v = [&]() { _instances.at(threading::get_id()) = - unique_ptr_t{ new Tp(std::forward(_args)...) }; + generate{}(std::forward(_args)...); return true; }(); (void) _v; @@ -148,7 +187,180 @@ thread_data::instances(construct_on_init, Args&&... _args) static auto& _v = [&]() -> instance_array_t& { auto& _internal = instances(); for(size_t i = 0; i < MaxThreads; ++i) - _internal.at(i) = unique_ptr_t{ new Tp(std::forward(_args)...) 
}; + _internal.at(i) = generate{}(std::forward(_args)...); + return _internal; + }(); + return _v; +} + +//--------------------------------------------------------------------------------------// +// +// thread_data with std::optional +// +//--------------------------------------------------------------------------------------// + +template +struct thread_data, Tag, MaxThreads> +{ + using value_type = std::optional; + using instance_array_t = std::array; + using construct_on_init = std::true_type; + + template + static void construct(Args&&...); + static value_type& instance(); + static instance_array_t& instances(); + template + static value_type& instance(construct_on_init, Args&&...); + template + static instance_array_t& instances(construct_on_init, Args&&...); + + static constexpr size_t size() { return MaxThreads; } + + decltype(auto) begin() { return instances().begin(); } + decltype(auto) end() { return instances().end(); } + + decltype(auto) begin() const { return instances().begin(); } + decltype(auto) end() const { return instances().end(); } +}; + +template +template +void +thread_data, Tag, MaxThreads>::construct(Args&&... _args) +{ + // construct outside of lambda to prevent data-race + static auto& _instances = instances(); + static thread_local bool _v = [&]() { + _instances.at(threading::get_id()) = + generate{}(std::forward(_args)...); + return true; + }(); + (void) _v; +} + +template +std::optional& +thread_data, Tag, MaxThreads>::instance() +{ + return instances().at(threading::get_id()); +} + +template +typename thread_data, Tag, MaxThreads>::instance_array_t& +thread_data, Tag, MaxThreads>::instances() +{ + static auto _v = instance_array_t{}; + return _v; +} + +template +template +std::optional& +thread_data, Tag, MaxThreads>::instance(construct_on_init, + Args&&... 
_args) +{ + construct(std::forward(_args)...); + return instances().at(threading::get_id()); +} + +template +template +typename thread_data, Tag, MaxThreads>::instance_array_t& +thread_data, Tag, MaxThreads>::instances(construct_on_init, + Args&&... _args) +{ + static auto& _v = [&]() -> instance_array_t& { + auto& _internal = instances(); + for(size_t i = 0; i < MaxThreads; ++i) + _internal.at(i) = generate{}(std::forward(_args)...); + return _internal; + }(); + return _v; +} + +//--------------------------------------------------------------------------------------// +// +// thread_data with raw data (no pointer) +// +//--------------------------------------------------------------------------------------// + +using tim::identity; +using tim::identity_t; + +template +struct thread_data, Tag, MaxThreads> +{ + using value_type = Tp; + using instance_array_t = std::array; + using construct_on_init = std::true_type; + + template + static void construct(Args&&...); + static value_type& instance(); + static instance_array_t& instances(); + template + static value_type& instance(construct_on_init, Args&&...); + template + static instance_array_t& instances(construct_on_init, Args&&...); + + static constexpr size_t size() { return MaxThreads; } + + decltype(auto) begin() { return instances().begin(); } + decltype(auto) end() { return instances().end(); } + + decltype(auto) begin() const { return instances().begin(); } + decltype(auto) end() const { return instances().end(); } +}; + +template +template +void +thread_data, Tag, MaxThreads>::construct(Args&&... 
_args) +{ + // construct outside of lambda to prevent data-race + static auto& _instances = instances(); + static thread_local bool _v = [&]() { + _instances.at(threading::get_id()) = + generate{}(std::forward(_args)...); + return true; + }(); + (void) _v; +} + +template +Tp& +thread_data, Tag, MaxThreads>::instance() +{ + return instances().at(threading::get_id()); +} + +template +typename thread_data, Tag, MaxThreads>::instance_array_t& +thread_data, Tag, MaxThreads>::instances() +{ + static auto _v = instance_array_t{}; + return _v; +} + +template +template +Tp& +thread_data, Tag, MaxThreads>::instance(construct_on_init, Args&&... _args) +{ + construct(std::forward(_args)...); + return instances().at(threading::get_id()); +} + +template +template +typename thread_data, Tag, MaxThreads>::instance_array_t& +thread_data, Tag, MaxThreads>::instances(construct_on_init, Args&&... _args) +{ + static auto& _v = [&]() -> instance_array_t& { + auto& _internal = instances(); + for(size_t i = 0; i < MaxThreads; ++i) + _internal.at(i) = generate{}(std::forward(_args)...); return _internal; }(); return _v; diff --git a/source/lib/omnitrace/library/thread_info.cpp b/source/lib/omnitrace/library/thread_info.cpp new file mode 100644 index 000000000..3ba93d6c6 --- /dev/null +++ b/source/lib/omnitrace/library/thread_info.cpp @@ -0,0 +1,211 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/thread_info.hpp" +#include "library/config.hpp" +#include "library/debug.hpp" +#include "library/runtime.hpp" +#include "library/state.hpp" +#include "library/thread_data.hpp" +#include "library/utility.hpp" + +#include +#include + +namespace omnitrace +{ +namespace +{ +using thread_index_data_t = + thread_data, project::omnitrace>; +using thread_info_data_t = thread_data, project::omnitrace>; + +auto& +get_index_data(int64_t _tid) // slot for _tid in the static index-data instances +{ + static auto& _v = thread_index_data_t::instances(); + return _v.at(_tid); +} + +auto +init_index_data(bool _offset = false) // fills the calling thread's slot if it is still empty and logs the assigned ids +{ + auto& itr = get_index_data(utility::get_thread_index()); + if(!itr) + { + threading::offset_this_id(_offset); + itr = thread_index_data{}; + if(!config::settings_are_configured()) + { + OMNITRACE_BASIC_VERBOSE_F( + 2, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", + itr->system_value, process::get_id(), dmp::rank(), itr->internal_value); + } + else + { + OMNITRACE_VERBOSE_F( + 2, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", + itr->system_value, process::get_id(), dmp::rank(), itr->internal_value); + } + } + return itr; +} + +const auto unknown_thread = std::optional{}; // empty sentinel returned when a lookup finds nothing +} // namespace + +const std::optional& +thread_info::init(bool _offset) // creates the calling thread's entry; the once_flag is thread_local, so the body runs once per thread +{ + auto& _instances = thread_info_data_t::instances(); + auto _tid = utility::get_thread_index(); + auto _init = [&] { + threading::offset_this_id(_offset); + std::optional& _info = _instances.at(_tid); + _info = thread_info{}; + _info->is_offset = threading::offset_this_id(); + _info->index_data = init_index_data(_info->is_offset); + _info->lifetime.first = tim::get_clock_real_now(); + if(_info->is_offset) set_thread_state(ThreadState::Disabled); + }; + + static thread_local std::once_flag _once{}; + std::call_once(_once, _init); + + return _instances.at(_tid); +} + +const std::optional& +thread_info::get() // entry of the calling thread +{ + return get(utility::get_thread_index(), LookupTID); +} + +const std::optional& 
+thread_info::get(int64_t _tid, ThreadIdType _type) // look up by lookup index, system TID, or internal TID +{ + if(_type == ThreadIdType::LookupTID) + return thread_info_data_t::instances().at(_tid); + else if(_type == ThreadIdType::SystemTID) + { + const auto& _v = thread_info_data_t::instances(); + for(const auto& itr : _v) // index_data is itself optional, so it is tested before being dereferenced + { + if(itr && itr->index_data && itr->index_data->system_value == _tid) return itr; + } + } + else if(_type == ThreadIdType::InternalTID) + { + const auto& _v = thread_info_data_t::instances(); + for(const auto& itr : _v) // same guard as the SystemTID scan + { + if(itr && itr->index_data && itr->index_data->internal_value == _tid) return itr; + } + } + + OMNITRACE_CI_THROW(unknown_thread, "Unknown thread has been assigned a value"); // presumably fires only if the sentinel holds a value; confirm against the macro + return unknown_thread; // no match: hand back the empty sentinel +} + +void +thread_info::set_start(uint64_t _ts) // sets the start when it is still 0 or moves it earlier; _ts == 0 is ignored +{ + auto& _v = thread_info_data_t::instances().at(utility::get_thread_index()); + if(!_v) init(); + if(_ts > 0 && (_v->lifetime.first == 0 || _ts < _v->lifetime.first)) + _v->lifetime.first = _ts; +} + +void +thread_info::set_stop(uint64_t _ts) // does nothing when the calling thread has no entry +{ + auto _tid = utility::get_thread_index(); + auto& _v = thread_info_data_t::instances().at(_tid); + if(_v) + { + _v->lifetime.second = _ts; + // if the main thread, make sure all child threads have a end lifetime + // less than or equal to the main thread end lifetime + if(_tid == 0) + { + for(auto& itr : thread_info_data_t::instances()) + { + if(itr && itr->index_data && itr->index_data->lookup_value > _tid) + { + if(itr->lifetime.second > _v->lifetime.second) + itr->lifetime.second = _v->lifetime.second; + } + } + } + } +} + +uint64_t +thread_info::get_start() const +{ + return lifetime.first; +} + +uint64_t +thread_info::get_stop() const +{ + return lifetime.second; +} + +bool +thread_info::is_valid_time(uint64_t _ts) const // inclusive on both ends +{ + return (_ts >= lifetime.first && _ts <= lifetime.second); +} + +bool +thread_info::is_valid_lifetime(uint64_t _beg, uint64_t _end) const +{ + return (is_valid_time(_beg) && is_valid_time(_end)); +} + +bool +thread_info::is_valid_lifetime(lifetime_data_t _v) const +{ + return 
(is_valid_time(_v.first) && is_valid_time(_v.second)); +} + +thread_info::lifetime_data_t +thread_info::get_valid_lifetime(lifetime_data_t _v) const // swaps each out-of-range endpoint for this thread's own bound +{ + if(!is_valid_time(_v.first)) _v.first = lifetime.first; + if(!is_valid_time(_v.second)) _v.second = lifetime.second; + return _v; +} + +std::string +thread_info::as_string() const +{ + std::stringstream _ss{}; + _ss << std::boolalpha << "is_offset=" << is_offset; + if(index_data) + _ss << ", index_data=(" << index_data->lookup_value << ", " + << index_data->system_value << ", " << index_data->internal_value << ")"; + _ss << ", lifetime=(" << lifetime.first << ":" << lifetime.second << ")"; + return _ss.str(); +} +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/thread_info.hpp b/source/lib/omnitrace/library/thread_info.hpp new file mode 100644 index 000000000..524c8b105 --- /dev/null +++ b/source/lib/omnitrace/library/thread_info.hpp @@ -0,0 +1,104 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "library/utility.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace omnitrace +{ +enum ThreadIdType : int // tells thread_info::get(tid, type) which id to match +{ + LookupTID = 0, + SystemTID = 1, + InternalTID = 2, +}; + +struct thread_index_data // the three ids of one thread, captured by the default member initializers +{ + // the lookup value is always incremented for each thread + // the system value is the tid provided by the operating system + // the internal value is the value which the user expects + int64_t lookup_value = utility::get_thread_index(); + int64_t system_value = tim::threading::get_sys_tid(); + int64_t internal_value = tim::threading::get_id(); +}; + +struct thread_info // per-thread record: offset flag, ids, and (start, stop) lifetime; move-only +{ + using index_data_t = std::optional; + using lifetime_data_t = std::pair; + + ~thread_info() = default; + thread_info(const thread_info&) = delete; + thread_info(thread_info&&) = default; + + thread_info& operator=(const thread_info&) = delete; + thread_info& operator=(thread_info&&) = default; + + static void set_start(uint64_t); // updates the calling thread's entry + static void set_stop(uint64_t); // updates the calling thread's entry + + uint64_t get_start() const; + uint64_t get_stop() const; + + bool is_valid_time(uint64_t _ts) const; // true when _ts lies within [start, stop] + bool is_valid_lifetime(uint64_t _beg, uint64_t _end) const; + bool is_valid_lifetime(lifetime_data_t) const; + lifetime_data_t get_valid_lifetime(lifetime_data_t) const; // swaps each endpoint outside [start, stop] for the matching bound + + std::string as_string() const; + + static const std::optional& init(bool _offset = false); // sets up the calling thread's entry + static const std::optional& get(); // calling thread's entry + static const std::optional& get(int64_t _tid, ThreadIdType _type); // SystemTID and InternalTID lookups yield an empty optional on a miss + + bool is_offset = false; + index_data_t index_data = {}; + lifetime_data_t lifetime = { 0, 0 }; // (start, stop); set_start treats a start of 0 as unset + + friend std::ostream& operator<<(std::ostream& _os, const thread_info& _v) + { + return (_os << _v.as_string()); + } + +private: + 
thread_info() = default; +}; +} // namespace omnitrace + +namespace std // NOTE(review): the standard only allows specializations in std; this to_string overload is a new declaration there, which is formally undefined behavior +{ +inline std::string +to_string(const omnitrace::thread_info& _info) +{ + return _info.as_string(); +} +} // namespace std diff --git a/source/lib/omnitrace/library/timemory.hpp b/source/lib/omnitrace/library/timemory.hpp index 886467b64..f274df060 100644 --- a/source/lib/omnitrace/library/timemory.hpp +++ b/source/lib/omnitrace/library/timemory.hpp @@ -47,15 +47,12 @@ namespace omnitrace { namespace audit = ::tim::audit; // NOLINT namespace comp = ::tim::component; // NOLINT -namespace quirk = ::tim::quirk; // NOLINT -namespace threading = ::tim::threading; // NOLINT -namespace scope = ::tim::scope; // NOLINT namespace dmp = ::tim::dmp; // NOLINT -namespace process = ::tim::process; // NOLINT -namespace units = ::tim::units; // NOLINT -namespace trait = ::tim::trait; // NOLINT -namespace api = ::tim::api; // NOLINT namespace operation = ::tim::operation; // NOLINT +namespace quirk = ::tim::quirk; // NOLINT +namespace units = ::tim::units; // NOLINT using settings = ::tim::settings; // NOLINT + +using ::tim::get_env; // NOLINT } // namespace omnitrace diff --git a/source/lib/omnitrace/library/tracing.cpp b/source/lib/omnitrace/library/tracing.cpp index 9cab24c5c..03fd268fa 100644 --- a/source/lib/omnitrace/library/tracing.cpp +++ b/source/lib/omnitrace/library/tracing.cpp @@ -21,6 +21,7 @@ // SOFTWARE. 
#include "library/tracing.hpp" +#include "library/thread_info.hpp" namespace omnitrace { @@ -54,5 +55,12 @@ get_timemory_hash_aliases(int64_t _tid) std::array{}; return _v.at(_tid); } + +void +record_thread_start_time() +{ + static thread_local std::once_flag _once{}; + std::call_once(_once, []() { thread_info::set_start(comp::wall_clock::record()); }); +} } // namespace tracing } // namespace omnitrace diff --git a/source/lib/omnitrace/library/tracing.hpp b/source/lib/omnitrace/library/tracing.hpp index 0903682f3..4363be6ac 100644 --- a/source/lib/omnitrace/library/tracing.hpp +++ b/source/lib/omnitrace/library/tracing.hpp @@ -22,6 +22,7 @@ #pragma once +#include "library/common.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" @@ -58,6 +59,9 @@ now() return ::tim::get_clock_real_now(); } +void +record_thread_start_time(); // defined in tracing.cpp; passes a wall-clock reading to thread_info::set_start once per thread + namespace { bool debug_push = // NOLINT @@ -100,6 +104,16 @@ pop_count() inline void thread_init() { + static thread_local auto _dtor = scope::destructor{ []() { + if(get_state() != State::Finalized) + { + if(get_use_sampling()) sampling::shutdown(); + auto& _thr_bundle = thread_data::instance(); + if(_thr_bundle && _thr_bundle->get() && + _thr_bundle->get()->get_is_running()) + _thr_bundle->stop(); + } + } }; static thread_local auto _thread_setup = []() { if(threading::get_id() > 0) threading::set_thread_name(JOIN(" ", "Thread", threading::get_id()).c_str()); @@ -111,18 +125,9 @@ thread_init() // save the hash maps get_timemory_hash_ids() = tim::get_hash_ids(); get_timemory_hash_aliases() = tim::get_hash_aliases(); + record_thread_start_time(); return true; }(); - static thread_local auto _dtor = scope::destructor{ []() { - if(get_state() != State::Finalized) - { - if(get_use_sampling()) sampling::shutdown(); - auto& _thr_bundle = thread_data::instance(); - if(_thr_bundle && _thr_bundle->get() && - _thr_bundle->get()->get_is_running()) - _thr_bundle->stop(); - } - } }; (void) _thread_setup; (void) 
_dtor; } @@ -144,41 +149,47 @@ thread_init_sampling() (void) _v; } -template +template inline void -push_timemory(const char* name, Args&&... args) -{ - auto& _data = tracing::get_instrumentation_bundles(); - // this generates a hash for the raw string array - auto _hash = tim::add_hash_id(tim::string_view_t{ name }); - auto* _bundle = _data.allocator.allocate(1); - _data.bundles.emplace_back(_bundle); - _data.allocator.construct(_bundle, _hash); - _bundle->start(std::forward(args)...); +push_timemory(CategoryT, const char* name, Args&&... args) // the whole body is now gated on trait::runtime_enabled +{ + if(trait::runtime_enabled::get()) + { + auto& _data = tracing::get_instrumentation_bundles(); + // this generates a hash for the raw string array + auto _hash = tim::add_hash_id(tim::string_view_t{ name }); + auto* _bundle = _data.allocator.allocate(1); + _data.bundles.emplace_back(_bundle); + _data.allocator.construct(_bundle, _hash); + _bundle->start(std::forward(args)...); + } } -template +template inline void -pop_timemory(const char* name, Args&&... args) +pop_timemory(CategoryT, const char* name, Args&&... 
args) { - auto _hash = tim::hash::get_hash_id(tim::string_view_t{ name }); - auto& _data = tracing::get_instrumentation_bundles(); - if(_data.bundles.empty()) + if(trait::runtime_enabled::get()) { - OMNITRACE_DEBUG("[%s] skipped %s :: empty bundle stack\n", "omnitrace_pop_trace", - name); - return; - } - for(size_t i = _data.bundles.size(); i > 0; --i) - { - auto*& _v = _data.bundles.at(i - 1); - if(_v->get_hash() == _hash) + auto _hash = tim::hash::get_hash_id(tim::string_view_t{ name }); + auto& _data = tracing::get_instrumentation_bundles(); + if(_data.bundles.empty()) + { + OMNITRACE_DEBUG("[%s] skipped %s :: empty bundle stack\n", + "omnitrace_pop_trace", name); + return; + } + for(size_t i = _data.bundles.size(); i > 0; --i) { - _v->stop(std::forward(args)...); - _data.allocator.destroy(_v); - _data.allocator.deallocate(_v, 1); - _data.bundles.erase(_data.bundles.begin() + (i - 1)); - break; + auto*& _v = _data.bundles.at(i - 1); + if(_v->get_hash() == _hash) + { + _v->stop(std::forward(args)...); + _data.allocator.destroy(_v); + _data.allocator.deallocate(_v, 1); + _data.bundles.erase(_data.bundles.begin() + (i - 1)); + break; + } } } } diff --git a/source/lib/omnitrace/library/utility.hpp b/source/lib/omnitrace/library/utility.hpp index 8dd651f3b..cddaecade 100644 --- a/source/lib/omnitrace/library/utility.hpp +++ b/source/lib/omnitrace/library/utility.hpp @@ -32,8 +32,6 @@ namespace omnitrace { namespace utility { -namespace -{ /// provides an alternative thread index for when using threading::get_id() is not /// desirable inline auto @@ -65,6 +63,5 @@ get_reserved_vector(size_t _n) _v.reserve(_n); return _v; } -} // namespace } // namespace utility } // namespace omnitrace diff --git a/source/python/libpyomnitrace.cpp b/source/python/libpyomnitrace.cpp index 153cca77e..2a986a3ea 100644 --- a/source/python/libpyomnitrace.cpp +++ b/source/python/libpyomnitrace.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -762,8 
+763,8 @@ generate(py::module& _pymod) std::ofstream ofs{}; if(tim::filepath::open(ofs, _name)) { - fprintf(stderr, "[%s][coverage]> Outputting '%s'...\n", TIMEMORY_PROJECT_NAME, - _name.c_str()); + tim::operation::file_output_message{}( + _name, std::string{ "coverage" }); ofs << oss.str() << "\n"; } else diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1a6f2cf40..e398c7e37 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -231,7 +231,7 @@ endfunction() # -------------------------------------------------------------------------------------- # function(OMNITRACE_ADD_TEST) - foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN) + foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN BASELINE) foreach(_TYPE PASS FAIL SKIP) list(APPEND _REGEX_OPTS "${_PREFIX}_${_TYPE}_REGEX") endforeach() @@ -404,6 +404,8 @@ function(OMNITRACE_ADD_TEST) set(_REGEX_VAR RUNTIME) elseif("${_TEST}" MATCHES "binary-rewrite") set(_REGEX_VAR REWRITE) + elseif("${_TEST}" MATCHES "baseline") + set(_REGEX_VAR BASELINE) else() set(_REGEX_VAR) endif() @@ -822,8 +824,9 @@ omnitrace_add_test( LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" - ) + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + REWRITE_RUN_PASS_REGEX "\\|_\\[kokkos\\]" + RUNTIME_PASS_REGEX "\\|_\\[kokkos\\]") omnitrace_add_test( SKIP_RUNTIME SKIP_REWRITE @@ -835,8 +838,9 @@ omnitrace_add_test( LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace.so" - ) + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace.so" + BASELINE_PASS_REGEX + "\\|_\\[kokkos\\]") 
omnitrace_add_test( SKIP_RUNTIME SKIP_REWRITE @@ -848,8 +852,9 @@ omnitrace_add_test( LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" - ) + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + BASELINE_PASS_REGEX + "\\|_\\[kokkos\\]") omnitrace_add_test( SKIP_BASELINE @@ -923,6 +928,12 @@ omnitrace_add_test( "${_timemory_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") +if(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY) + set(_OMPT_PASS_REGEX "\\|_ompt_") +else() + set(_OMPT_PASS_REGEX "") +endif() + omnitrace_add_test( SKIP_SAMPLING NAME openmp-cg @@ -932,7 +943,9 @@ omnitrace_add_test( RUNTIME_ARGS -e -v 1 --label return args REWRITE_TIMEOUT 180 RUNTIME_TIMEOUT 360 - ENVIRONMENT "${_ompt_environment};OMNITRACE_USE_SAMPLING=OFF" + ENVIRONMENT "${_ompt_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_COUT_OUTPUT=ON" + REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" + RUNTIME_PASS_REGEX "${_OMPT_PASS_REGEX}" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") omnitrace_add_test( @@ -945,7 +958,8 @@ omnitrace_add_test( REWRITE_TIMEOUT 180 RUNTIME_TIMEOUT 360 ENVIRONMENT - "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=100" + "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=100;OMNITRACE_COUT_OUTPUT=ON" + REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") omnitrace_add_test( @@ -1104,7 +1118,7 @@ if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE) NAME parallel-overhead-attach COMMAND ${CMAKE_CURRENT_LIST_DIR}/run-omnitrace-pid.sh $ - -ME "\.c$" -e -v 1 --label return args file -l -- + -ME "\.c$" -E fib -e -v 1 --label return args file -l -- $ 30 8 1000 
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) @@ -1192,6 +1206,10 @@ endforeach() set(_INDEX 0) foreach(_VERSION ${OMNITRACE_PYTHON_VERSIONS}) + if(NOT OMNITRACE_USE_PYTHON) + continue() + endif() + list(GET OMNITRACE_PYTHON_ROOT_DIRS ${_INDEX} _PYTHON_ROOT_DIR) omnitrace_find_python(