Skip to content

Commit

Permalink
#8672: Reduce profiler global memory usage
Browse files Browse the repository at this point in the history
  • Loading branch information
mo-tenstorrent committed May 21, 2024
1 parent 727096c commit ce08c57
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 8 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/run-profiler-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ jobs:
fail-fast: false
matrix:
runner-info: [
# No GS as tests now require synced starts. GS profiler tests will run on dedicated BMs
# E150
{arch: grayskull, runs-on: ["grayskull"], name: E150},
# N150
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-1"]},
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-1"], name: N150},
# N300
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-2"]},
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-2"], name: N300},
]
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/run_performance.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ run_device_perf_models() {
if [ "$tt_arch" == "grayskull" ]; then
#TODO(MO): Until #6560 is fixed, GS device profiler tests are grouped with
#Model Device perf regression tests to make sure they run on no-soft-reset BMs
tests/scripts/run_profiler_regressions.sh PROFILER
tests/scripts/run_profiler_regressions.sh PROFILER_NO_RESET

env pytest models/demos/metal_BERT_large_11/tests -m $test_marker

Expand Down
25 changes: 24 additions & 1 deletion tests/scripts/run_profiler_regressions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@ run_profiling_test(){

run_additional_T3000_test

TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py -vvv
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_custom_cycle_count -vvv
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_full_buffer -vvv
#TODO(MO): Needed until #6560 is fixed.
if [ "$ARCH_NAME" != "grayskull" ]; then
TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op -vvv
fi

remove_default_log_locations

Expand All @@ -51,6 +56,22 @@ run_profiling_test(){
remove_default_log_locations
}

# Runs only the multi-op device profiler test (test_multi_op).
#
# Selected by passing PROFILER_NO_RESET as the first script argument.
#
# Environment:
#   ARCH_NAME                   - must be non-empty, otherwise the script exits with status 1.
#   TT_METAL_HOME               - exported as PYTHONPATH for the pytest run.
#   PROFILER_TEST_SCRIPTS_ROOT  - directory containing test_device_profiler.py.
#
# NOTE(review): presumably this exists so grayskull can run test_multi_op on
# machines without soft reset while #6560 is open -- confirm with the caller.
run_profiling_no_reset_test(){
    if [[ -z "$ARCH_NAME" ]]; then
        echo "Must provide ARCH_NAME in environment" 1>&2
        exit 1
    fi

    echo "Make sure this test runs in a build with ENABLE_PROFILER=1 ENABLE_TRACY=1"

    source build/python_env/bin/activate
    export PYTHONPATH="$TT_METAL_HOME"

    # Quote the test path so a root directory containing spaces or glob
    # characters is passed to pytest as a single argument.
    TT_METAL_DEVICE_PROFILER=1 pytest "$PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py::test_multi_op" -vvv

    # Helper defined elsewhere in this script.
    remove_default_log_locations
}

run_post_proc_test(){
source build/python_env/bin/activate
export PYTHONPATH=$TT_METAL_HOME
Expand All @@ -62,6 +83,8 @@ cd $TT_METAL_HOME

if [[ $1 == "PROFILER" ]]; then
run_profiling_test
elif [[ $1 == "PROFILER_NO_RESET" ]]; then
run_profiling_no_reset_test
elif [[ $1 == "POST_PROC" ]]; then
run_post_proc_test
else
Expand Down
10 changes: 7 additions & 3 deletions tt_metal/jit_build/genfiles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,12 @@ std::string generate_bank_to_noc_coord_descriptor_string(
ss << endl;
ss << "extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS];" << endl;
ss << "extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS];" << endl;
ss << "extern int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl;
ss << "extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS];" << endl;
ss << "extern int32_t bank_to_l1_offset[NUM_L1_BANKS];" << endl;
ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl;
ss << "extern uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl;
ss << "extern uint16_t profiler_core_count_per_dram;" << endl;
ss << "#endif" << endl;

ss << endl;
ss << "#else // !KERNEL_BUILD (FW_BUILD)" << endl;
Expand Down Expand Up @@ -451,17 +453,18 @@ std::string generate_bank_to_noc_coord_descriptor_string(
* For DRAM banks in particular, integer division of flat_id/core_count_per_dram gives the dram bank id and the modulo
* is the offset.
* */
ss << "#if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC)" << endl;
ss << "uint16_t profiler_core_count_per_dram __attribute__((used)) = ";
ss << core_count_per_dram << ";" << endl;
ss << endl;

ss << "int32_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl;
ss << "uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl;
for (unsigned int x = 0; x < grid_size.x; x++) {
ss << " {" << endl;
for (unsigned int y = 0; y < grid_size.y; y++) {
CoreCoord core = {x,y};
if (profiler_flat_id_map.find(core) == profiler_flat_id_map.end()){
ss << " " << -1 << "," << endl;
ss << " " << 255 << "," << endl;
}
else{
ss << " " << profiler_flat_id_map.at(core) << "," << endl;
Expand All @@ -471,6 +474,7 @@ std::string generate_bank_to_noc_coord_descriptor_string(
}
ss << "};" << endl;
ss << endl;
ss << "#endif" << endl;

#endif

Expand Down

0 comments on commit ce08c57

Please sign in to comment.