Skip to content

Commit

Permalink
HIP API args in perfetto + new perfetto categories (#76)
Browse files Browse the repository at this point in the history
* HIP API perfetto args + updated perfetto categories

- Support for HIP API args field in perfetto
- PERFETTO_CATEGORIES -> OMNITRACE_PERFETTO_CATEGORIES
- Changed perfetto categories for several trace events and trace counters
- migrated several TRACE_EVENT_* to use omnitrace::tracing::{push,pop}_perfetto_ts(...)

* Tweaked category_region to encode the type of args as well as value

- Affects MPI args field in perfetto

* Improved testing in ubuntu-focal.yml

- "Test Install" step sources setup-env.sh
- "Test Install" step tests python support
- "Test Install" step tests reading ~/.omnitrace.cfg
- Avoid installing boost and tbb libs when building from submodule

* validate-perfetto-proto.py accepts -m / --categories

* Remove reference from category_region typeids

* Tweak opensuse action name

* Tweak the "Test Install" Step of ubuntu-focal
  • Loading branch information
jrmadsen authored Jun 29, 2022
1 parent 9b800c0 commit 2e1fd5a
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 103 deletions.
1 change: 1 addition & 0 deletions .cmake-format.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ parse:
NAME: '*'
ARGS: '*'
PERFETTO_FILE: '*'
PERFETTO_METRIC: '*'
TIMEMORY_FILE: '*'
TIMEMORY_METRIC: '*'
rocm_version_message:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/opensuse.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: OpenSUSE 15.2, 15.3 (Python)
name: OpenSUSE 15 (GCC, Python)

on:
push:
Expand Down
31 changes: 23 additions & 8 deletions .github/workflows/ubuntu-focal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,17 @@ jobs:
python: ['ON']
ompt: ['ON']
papi: ['ON']
deps: ['libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev']
include:
- compiler: 'g++'
mpi: ''
boost: 'ON'
tbb: 'ON'
build-type: 'Release'
python: 'OFF'
python: 'ON'
ompt: 'OFF'
papi: 'OFF'
deps: ''

steps:
- uses: actions/checkout@v2
Expand All @@ -45,7 +47,7 @@ jobs:
timeout-minutes: 5
run:
sudo apt-get update &&
sudo apt-get install -y build-essential m4 autoconf libtool python3-pip libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} &&
sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.deps }} clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy &&
python3 -m pip install perfetto &&
Expand All @@ -55,8 +57,7 @@ jobs:
run:
echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV &&
echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV &&
echo "/opt/omnitrace/bin:${HOME}/.local/bin" >> $GITHUB_PATH &&
echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
echo "${HOME}/.local/bin" >> $GITHUB_PATH

- name: Configure CMake
timeout-minutes: 10
Expand Down Expand Up @@ -103,25 +104,39 @@ jobs:
./omnitrace-*.sh --prefix=/opt/omnitrace --exclude-subdir --skip-license
- name: Test Install
timeout-minutes: 10
timeout-minutes: 15
run: |
set -v
export OMNITRACE_DEBUG=ON
source /opt/omnitrace/share/omnitrace/setup-env.sh
cat << EOF > ${HOME}/.omnitrace.cfg
OMNITRACE_VERBOSE = 2
OMNITRACE_USE_TIMEMORY = ON
OMNITRACE_USE_PERFETTO = ON
OMNITRACE_USE_SAMPLING = ON
OMNITRACE_USE_PROCESS_SAMPLING = ON
OMNITRACE_OUTPUT_PATH = omnitrace-tests-output
OMNITRACE_OUTPUT_PREFIX = %tag%/
OMNITRACE_SAMPLING_FREQ = 50
OMNITRACE_SAMPLING_DELAY = 0.05
EOF
which omnitrace-avail
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace-python
omnitrace-python --help
omnitrace-python -b -- ${{ github.workspace }}/examples/python/builtin.py
which omnitrace
ldd $(which omnitrace)
omnitrace --help
omnitrace -e -v 1 -o ls.inst --simulate -- ls
for i in omnitrace-ls.inst-output/*; do echo -e "\n\n --> ${i} \n\n"; cat ${i}; done
for i in omnitrace-tests-output/ls.inst/*; do echo -e "\n\n --> ${i} \n\n"; cat ${i}; done
omnitrace -e -v 1 -o ls.inst -- ls
./ls.inst
omnitrace -e -v 1 --simulate -- ls
for i in omnitrace-ls-output/*; do echo -e "\n\n --> ${i} \n\n"; cat ${i}; done
for i in omnitrace-tests-output/ls/*; do echo -e "\n\n --> ${i} \n\n"; cat ${i}; done
omnitrace -e -v 1 -- ls
- name: Test User API
Expand Down
20 changes: 10 additions & 10 deletions source/lib/omnitrace/library/components/backtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,15 +516,15 @@ backtrace::post_process(int64_t _tid)
_last_bt = _bt;
_mean_ts += _ts;

TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_peak_memory",
perfetto_counter_track<perfetto_rusage>::at(_tid, 0), _ts,
_bt->m_mem_peak / units::megabyte);

TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_context_switch",
perfetto_counter_track<perfetto_rusage>::at(_tid, 1), _ts,
_bt->m_ctx_swch);

TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_page_fault",
perfetto_counter_track<perfetto_rusage>::at(_tid, 2), _ts,
_bt->m_page_flt);

Expand All @@ -535,7 +535,7 @@ backtrace::post_process(int64_t _tid)
{
if(i < _bt->m_hw_counter.size())
{
TRACE_COUNTER("sampling",
TRACE_COUNTER("hardware_counter",
perfetto_counter_track<hw_counters>::at(_tid, i),
_ts, _bt->m_hw_counter.at(i));
}
Expand All @@ -547,15 +547,15 @@ backtrace::post_process(int64_t _tid)
{
auto _ts = pthread_create_gotcha::get_execution_time(_tid)->second;
uint64_t _zero = 0;
TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_peak_memory",
perfetto_counter_track<perfetto_rusage>::at(_tid, 0), _ts,
_zero);

TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_context_switch",
perfetto_counter_track<perfetto_rusage>::at(_tid, 1), _ts,
_zero);

TRACE_COUNTER("sampling",
TRACE_COUNTER("thread_page_fault",
perfetto_counter_track<perfetto_rusage>::at(_tid, 2), _ts,
_zero);

Expand All @@ -566,7 +566,7 @@ backtrace::post_process(int64_t _tid)
{
if(i < _last_bt->m_hw_counter.size())
{
TRACE_COUNTER("sampling",
TRACE_COUNTER("hardware_counter",
perfetto_counter_track<hw_counters>::at(_tid, i),
_ts, _zero);
}
Expand Down Expand Up @@ -601,10 +601,10 @@ backtrace::post_process(int64_t _tid)
auto _ts = _bt->m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue;

TRACE_EVENT_BEGIN("sampling",
TRACE_EVENT_BEGIN("hardware_counter",
perfetto::StaticString{ sitr.first->c_str() },
_last_wall_ts, "begin_ns", _last_wall_ts);
TRACE_EVENT_END("sampling", _ts, "end_ns", _ts);
TRACE_EVENT_END("hardware_counter", _ts, "end_ns", _ts);
}
_last_wall_ts = _bt->m_ts;
}
Expand Down
4 changes: 3 additions & 1 deletion source/lib/omnitrace/library/components/category_region.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ category_region<CategoryT>::audit(const gotcha_data_t& _data, audit::incoming,
Args&&... _args)
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
start(_data.tool_id.c_str(), "args", JOIN(", ", _args...));
start(_data.tool_id.c_str(), "args",
JOIN(", ",
JOIN('=', tim::try_demangle<std::remove_reference_t<Args>>(), _args)...));
}

template <typename CategoryT>
Expand Down
10 changes: 9 additions & 1 deletion source/lib/omnitrace/library/components/fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,12 @@ TIMEMORY_DECLARE_TYPE_TRAIT(name, typename Tp)
}

TIMEMORY_DEFINE_NS_API(category, host)
TIMEMORY_DEFINE_NS_API(category, device)
TIMEMORY_DEFINE_NS_API(category, user)
TIMEMORY_DEFINE_NS_API(category, device)
TIMEMORY_DEFINE_NS_API(category, device_hip)
TIMEMORY_DEFINE_NS_API(category, device_hsa)
TIMEMORY_DEFINE_NS_API(category, rocm_hip)
TIMEMORY_DEFINE_NS_API(category, rocm_hsa)
TIMEMORY_DEFINE_NS_API(category, rocm_smi)
TIMEMORY_DEFINE_NS_API(category, kokkos)
TIMEMORY_DEFINE_NS_API(category, mpi)
Expand All @@ -73,7 +77,11 @@ TIMEMORY_DEFINE_NS_API(category, device_critical_trace)

TIMEMORY_DEFINE_NAME_TRAIT("host", category::host);
TIMEMORY_DEFINE_NAME_TRAIT("device", category::device);
TIMEMORY_DEFINE_NAME_TRAIT("device_hip", category::device_hip);
TIMEMORY_DEFINE_NAME_TRAIT("device_hsa", category::device_hsa);
TIMEMORY_DEFINE_NAME_TRAIT("user", category::user);
TIMEMORY_DEFINE_NAME_TRAIT("rocm_hip", category::rocm_hip);
TIMEMORY_DEFINE_NAME_TRAIT("rocm_hsa", category::rocm_hsa);
TIMEMORY_DEFINE_NAME_TRAIT("rocm_smi", category::rocm_smi);
TIMEMORY_DEFINE_NAME_TRAIT("sampling", category::sampling);
TIMEMORY_DEFINE_NAME_TRAIT("thread_sampling", category::thread_sampling);
Expand Down
9 changes: 5 additions & 4 deletions source/lib/omnitrace/library/components/rocm_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,11 @@ data::post_process(uint32_t _dev_id)
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 0), _ts, _busy);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 1), _ts, _temp);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 2), _ts, _power);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 3), _ts, _usage);
TRACE_COUNTER("device_busy", counter_track::at(_dev_id, 0), _ts, _busy);
TRACE_COUNTER("device_temp", counter_track::at(_dev_id, 1), _ts, _temp);
TRACE_COUNTER("device_power", counter_track::at(_dev_id, 2), _ts, _power);
TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, 3), _ts,
_usage);
}
};

Expand Down
81 changes: 57 additions & 24 deletions source/lib/omnitrace/library/components/roctracer_callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "library/tracing.hpp"

#include <timemory/backends/cpu.hpp>
#include <timemory/backends/threading.hpp>
Expand All @@ -38,19 +39,45 @@
#include <cstdint>
#include <tuple>

#define HIP_PROF_HIP_API_STRING 1

#include <roctracer_ext.h>
#include <roctracer_hcc.h>
#include <roctracer_hip.h>

#define AMD_INTERNAL_BUILD 1
#include <roctracer_hsa.h>

#if __has_include(<hip/amd_detail/hip_prof_str.h>) || (defined(OMNITRACE_USE_HIP) && OMNITRACE_USE_HIP > 0)
# include <hip/amd_detail/hip_prof_str.h>
# define OMNITRACE_HIP_API_ARGS 1
#else
# define OMNITRACE_HIP_API_ARGS 0
#endif

TIMEMORY_DEFINE_API(roctracer)
namespace omnitrace
{
namespace api = tim::api;
namespace
{
std::string
hip_api_string(hip_api_id_t id, const hip_api_data_t* data)
{
#if OMNITRACE_HIP_API_ARGS > 0
std::string _v = hipApiString(id, data);
if(_v.empty()) return _v;
auto _pbeg = _v.find('(');
if(_pbeg == std::string::npos) return _v;
auto _pend = _v.find_last_of(')');
if(_pend == std::string::npos || _pbeg >= _pend) return _v;
auto _n = (_pend - _pbeg - 1);
return _v.substr(_pbeg + 1, _n);
#else
tim::consume_parameters(id, data);
#endif
}
//
int64_t
get_clock_skew()
{
Expand Down Expand Up @@ -275,11 +302,12 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*

if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ _name },
static_cast<uint64_t>(begin_timestamp), "begin_ns",
static_cast<uint64_t>(begin_timestamp));
TRACE_EVENT_END("device", static_cast<uint64_t>(end_timestamp),
"end_ns", static_cast<uint64_t>(end_timestamp));
uint64_t _beg_ts = begin_timestamp;
uint64_t _end_ts = end_timestamp;
tracing::push_perfetto_ts(category::rocm_hsa{}, _name, _beg_ts,
"begin_ns", _beg_ts);
tracing::pop_perfetto_ts(category::rocm_hsa{}, _name, _end_ts,
"end_ns", _end_ts);
}

if(get_use_timemory())
Expand Down Expand Up @@ -334,6 +362,9 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
default: break;
}

OMNITRACE_CI_FAIL(_name == nullptr, "Error! HSA operation type not handled: %u\n",
op);

if(!_name) return;

auto _beg_ns = record->begin_ns + get_clock_skew();
Expand All @@ -347,11 +378,10 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)

if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ *_name },
static_cast<uint64_t>(_beg_ns), "begin_ns",
static_cast<uint64_t>(_beg_ns));
TRACE_EVENT_END("device", static_cast<uint64_t>(_end_ns), "end_ns",
static_cast<uint64_t>(_end_ns));
uint64_t _beg = _beg_ns;
uint64_t _end = _end_ns;
tracing::push_perfetto_ts(category::device_hsa{}, *_name, _beg, "begin_ns", _beg);
tracing::pop_perfetto_ts(category::device_hsa{}, *_name, _end, "end_ns", _end);
}

auto _func = [_beg_ns, _end_ns, _name]() {
Expand Down Expand Up @@ -582,11 +612,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*

if(get_use_perfetto())
{
TRACE_EVENT_BEGIN(
"host", perfetto::StaticString{ op_name }, static_cast<uint64_t>(_ts),
perfetto::Flow::ProcessScoped(_cid), "begin_ns",
static_cast<uint64_t>(_ts), "pcid", _parent_cid, "cid", _cid, "device",
_device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
auto _api_id = static_cast<hip_api_id_t>(cid);
tracing::push_perfetto_ts(
category::rocm_hip{}, op_name, _ts, perfetto::Flow::ProcessScoped(_cid),
"begin_ns", static_cast<uint64_t>(_ts), "pcid", _parent_cid, "cid", _cid,
"device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id,
"args", hip_api_string(_api_id, data));
}
if(get_use_timemory())
{
Expand Down Expand Up @@ -622,8 +653,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*

if(get_use_perfetto())
{
TRACE_EVENT_END("host", static_cast<uint64_t>(_ts), "end_ns",
static_cast<uint64_t>(_ts));
tracing::pop_perfetto_ts(category::rocm_hip{}, op_name, _ts, "end_ns",
static_cast<uint64_t>(_ts));
}
if(get_use_timemory())
{
Expand Down Expand Up @@ -779,14 +810,16 @@ hip_activity_callback(const char* begin, const char* end, void*)
_kernel_names.emplace(_name, tim::demangle(_name));

assert(_end_ns > _beg_ns);
TRACE_EVENT_BEGIN("device",
perfetto::StaticString{ _kernel_names.at(_name).c_str() },
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns",
_beg_ns, "corr_id", record->correlation_id, "device",
_devid, "queue", _queid, "op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
tracing::push_perfetto_ts(
category::device_hip{}, _kernel_names.at(_name).c_str(), _beg_ns,
perfetto::Flow::ProcessScoped(_cid), "begin_ns", _beg_ns, "corr_id",
record->correlation_id, "device", _devid, "queue", _queid, "op",
_op_id_names.at(record->op));
tracing::pop_perfetto_ts(category::device_hip{}, "", _end_ns, "end_ns",
_end_ns);
// for some reason, this is necessary to make sure very last one ends
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
tracing::pop_perfetto_ts(category::device_hip{}, "", _end_ns, "end_ns",
_end_ns);
}

if(_critical_trace)
Expand Down
Loading

0 comments on commit 2e1fd5a

Please sign in to comment.