Skip to content

Commit

Permalink
simulation finalizes
Browse files Browse the repository at this point in the history
implement knob for Sapphire Rapids latency (the knob works and its entry in the .tex file is updated)
correct counting of AMX instructions (config + 3 loads + multiply + store + release = 7)
trace generator outputs some strange numbers regarding stride length and memory size -- need to check on this
need to check if there is anything to be done with the matmul execution logic (I don't think there is?)
  • Loading branch information
Michael Allen Goldstein committed Mar 11, 2024
1 parent 4754bdd commit 507059e
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 37 deletions.
3 changes: 2 additions & 1 deletion doc/latex/knob.tex
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,8 @@ \subsubsection{\cpu Latency Parameters}
\Verb+uop_latency_map x86+ & \Verb+// default latency mapping to Intel Sandy Bridge+ \\
\Verb+uop_latency_map skylake+ & \Verb+// latency mapping to Intel Skylake+ \\
\Verb+uop_latency_map skylake_x+ & \Verb+// latency mapping to Intel Skylake-X+ \\
\Verb+uop_latency_map coffee_lake+ & \Verb+// latency mapping to Intel Coffee Lake+
\Verb+uop_latency_map coffee_lake+ & \Verb+// latency mapping to Intel Coffee Lake+ \\
\Verb+uop_latency_map sapphire_rapids+ & \Verb+// latency mapping to Intel Sapphire Rapids+
\end{tabular}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down
33 changes: 20 additions & 13 deletions src/trace_read_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -901,7 +901,6 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
if (pi->m_opcode == XED_CATEGORY_AMX_TILE) {
// handle AMX tile instructions
bool is_amx_mem = (pi->m_has_st) || (pi->m_num_ld > 0);
// bool is_amx_config = (pi->m_num_ld == 1) && (pi->m_tile_info.palette != 0); // questionable...
dyn_uop_counter = 1;

if (is_amx_mem) {
Expand All @@ -911,6 +910,8 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,

if (pi->m_has_st) {
trace_uop[0]->m_mem_type = MEM_ST;
trace_uop[0]->m_mem_size = pi->m_mem_write_size;
//cout << "store address:0x" << trace_uop[0]->m_addr << endl;
DEBUG_CORE(
core_id,
"AMX_TILE_MEM core_id:%d thread_id:%d pc:0x%llx opcode:%d"
Expand All @@ -921,7 +922,7 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
);
STAT_EVENT_N(TILESTORED_COUNT, 16);
} else {
int tileload_type = true;
tileload_type = true;
trace_uop[0]->m_mem_type = MEM_LD;
trace_uop[0]->m_mem_size = pi->m_mem_read_size;
DEBUG_CORE(
Expand All @@ -937,7 +938,7 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
}
// generate 1 load uop for each of the 16 rows for a tile
// TODO: test this when servers are working properly
int num_tile_uops = pi->m_num_ld; // 16
int num_tile_uops = 16;//pi->m_num_ld; // 16

key_addr = (pi->m_instruction_addr << 3);
info = htable->hash_table_access_create(key_addr, &new_entry);
Expand All @@ -952,8 +953,12 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
for (jj = dyn_uop_counter; jj < num_tile_uops; jj++) {
if (tileload_type) {
trace_uop[jj]->m_mem_type = MEM_LD;
info->m_table_info->m_mem_type = MEM_LD;
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_read_size;
} else {
trace_uop[jj]->m_mem_type = MEM_ST;
info->m_table_info->m_mem_type = MEM_ST;
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_write_size;
}

if (jj == 0) {
Expand All @@ -967,11 +972,11 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
key_addr = (pi->m_instruction_addr << 5) + jj;
info = htable->hash_table_access_create(key_addr, &new_entry);

if (tileload_type) {
info->m_table_info->m_mem_type = MEM_LD;
} else {
info->m_table_info->m_mem_type = MEM_ST;
}
// if (tileload_type) {
// info->m_table_info->m_mem_type = MEM_LD;
// } else {
// info->m_table_info->m_mem_type = MEM_ST;
// }

if (!(jj == 0 && ii == 0)) {
info->m_trace_info.m_bom = false;
Expand All @@ -984,11 +989,13 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
rep_offset, pi->m_mem_read_size, jj
);

if (tileload_type) {
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_read_size;
} else {
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_write_size;
}
// if (tileload_type) {
// // load
// trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_read_size;
// } else {
// // store
// trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_write_size;
// }

convert_dyn_uop(info, pi, trace_uop[dyn_uop_counter], rep_offset, core_id);

Expand Down
41 changes: 19 additions & 22 deletions tools/x86_trace_generator/trace_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ CONTROL_MANAGER control;
// AMX Handling
////////////////////////////////////////////////////////////////////////////////////////////////////////

VOID AMXLoad(ADDRINT *addr, UINT32 stride, UINT32 mem_read_size, THREADID tid) {
VOID AMXLoad(ADDRINT addr, UINT32 stride, UINT32 mem_read_size, THREADID tid) {
// check thread is not a dummy and is being instrumented
tid = threadMap[tid];
THREAD_ENABLE_CHECK(tid);
Expand All @@ -211,12 +211,17 @@ VOID AMXLoad(ADDRINT *addr, UINT32 stride, UINT32 mem_read_size, THREADID tid) {
if (tr_info == nullptr || !PIN_IsAmxActive(tid)){
return;
}
tr_info->vaddr1 = *addr;
#ifdef VERBOSE
cout << "load address: 0x" << std::hex << addr << endl;
cout << "load size: " << mem_read_size << endl;
cout << "stride: " << stride << endl;
#endif
tr_info->vaddr1 = addr;
tr_info->mem_read_size = mem_read_size;
tr_info->vaddr2 = static_cast<ADDRINT>(stride);
}

VOID AMXStore(ADDRINT *addr, UINT32 stride, UINT32 mem_st_size, THREADID tid) {
VOID AMXStore(ADDRINT addr, UINT32 stride, UINT32 mem_st_size, THREADID tid) {
// check thread is not a dummy and is being instrumented
tid = threadMap[tid];
THREAD_ENABLE_CHECK(tid);
Expand All @@ -225,7 +230,12 @@ VOID AMXStore(ADDRINT *addr, UINT32 stride, UINT32 mem_st_size, THREADID tid) {
if (tr_info == nullptr || !PIN_IsAmxActive(tid)){
return;
}
tr_info->st_vaddr = *addr;
#ifdef VERBOSE
cout << "store address: 0x" << std::hex << addr << endl;
cout << "store size: " << mem_st_size << endl;
cout << "stride: " << stride << endl;
#endif
tr_info->st_vaddr = addr;
tr_info->vaddr2 = static_cast<ADDRINT>(stride);
tr_info->mem_write_size = mem_st_size;
}
Expand All @@ -235,7 +245,6 @@ VOID AMXZero(UINT32 dst, THREADID tid) {
if (tr_info == nullptr || !PIN_IsAmxActive(tid)) {
return;
}
// is there anything I need to do here?
}

VOID AMXGEMM(UINT32 dst, UINT32 a, UINT32 b, THREADID tid) {
Expand Down Expand Up @@ -972,8 +981,8 @@ void instrument(INS ins)
ins,
IPOINT_BEFORE, AFUNPTR(AMXLoad),
IARG_MEMORYOP_PTR, 0,
IARG_UINT32, 64, // assume max size
IARG_UINT32, 64,
IARG_UINT32, 64, // stride
IARG_UINT32, 64, // read size
IARG_THREAD_ID,
IARG_END
);
Expand Down Expand Up @@ -1039,31 +1048,19 @@ void instrument(INS ins)
ins,
IPOINT_BEFORE, AFUNPTR(AMXStore),
IARG_MEMORYOP_EA, 0,
IARG_UINT32, 64, // assuming max size
IARG_UINT32, 64,
IARG_UINT32, 64, // stride
IARG_UINT32, 64, // write size
IARG_THREAD_ID,
IARG_END
);
} else if (INS_Mnemonic(ins) == "LDTILECFG") {
#ifdef VERBOSE
//REG base_reg = INS_OperandMemoryBaseReg(ins, 1);
//REG index_reg = INS_OperandMemoryIndexReg(ins, 1);
cout << "ldtilecfg" /*[" << REG_StringShort(base_reg) << "+" << REG_StringShort(index_reg) << "]"*/ << endl;
cout << "ldtilecfg" << endl;
#endif
// send memory address to copy config data from
// info->num_ld = 1;
// INS_InsertCall(
// ins,
// IPOINT_BEFORE, AFUNPTR(AMXConfig),
// IARG_MEMORYOP_PTR, 0,
// IARG_THREAD_ID,
// IARG_END
// );
} else if (INS_Mnemonic(ins) == "TILERELEASE") {
#ifdef VERBOSE
cout << "tilerelease" << endl;
#endif
// memset((void *)&t_info, 0, sizeof(tile_info_t));
} else {
cerr << "Unsupported AMX instruction: " << INS_Mnemonic(ins) << endl;
exit(-1);
Expand Down
1 change: 0 additions & 1 deletion tools/x86_trace_generator/trace_generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ struct Inst_info
bool write_flg; // 1bit
uint8_t num_ld; // 2bit
uint8_t size; // 5 bit
// tile_info_t tile_info; // 64 bytes
// **** dynamic ****
uint64_t ld_vaddr1; // 4 bytes
uint64_t ld_vaddr2; // 4 bytes
Expand Down

0 comments on commit 507059e

Please sign in to comment.