From e92cbbd642890d16c16918b53ed51e5b5be61340 Mon Sep 17 00:00:00 2001
From: Floogle <18466542+skyfloogle@users.noreply.github.com>
Date: Tue, 9 Jan 2024 00:49:43 +0100
Subject: [PATCH] further improve instruction timing accuracy

---
 include/arm_emit.h       |  5 ++++
 source/common/drc_core.c | 55 ++++++++++++++++++++++++++++++++++++++++
 source/common/v810_cpu.c |  4 +--
 3 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/include/arm_emit.h b/include/arm_emit.h
index bed3220..7664d6c 100644
--- a/include/arm_emit.h
+++ b/include/arm_emit.h
@@ -397,6 +397,11 @@ static inline void new_floating_point(BYTE cond, BYTE opc1, BYTE opc2, BYTE b12,
 #define ADDS_I(Rd, Rn, imm8, rot) \
     new_data_proc_imm(ARM_COND_AL, ARM_OP_ADD, 1, Rn, Rd, rot, imm8)
 
+// sub Rd, Rn, imm8, ror #rot
+// Subtract immediate
+#define SUB_I(Rd, Rn, imm8, rot) \
+    new_data_proc_imm(ARM_COND_AL, ARM_OP_SUB, 0, Rn, Rd, rot, imm8)
+
 // orr Rd, imm, ror #rot
 // Or immediate
 // imm8 can be rotated an even number of times
diff --git a/source/common/drc_core.c b/source/common/drc_core.c
index ecfeb0a..4225711 100644
--- a/source/common/drc_core.c
+++ b/source/common/drc_core.c
@@ -833,6 +833,8 @@ int drc_translateBlock(exec_block *block) {
                     }
                     B(arm_cond, 0);
                 }
+                // branch not taken, so it only took 1 cycle
+                SUB_I(10, 10, 2, 0);
                 break;
             // Special case: bnh and bh can't be directly translated to ARM
             case V810_OP_BNH:
@@ -848,6 +850,8 @@ int drc_translateBlock(exec_block *block) {
                     B(ARM_COND_CS, 0);
                     B(ARM_COND_EQ, 0);
                 }
+                // branch not taken, so it only took 1 cycle
+                SUB_I(10, 10, 2, 0);
                 break;
             case V810_OP_BH:
                 if (inst_cache[i].busywait) {
@@ -863,6 +867,8 @@ int drc_translateBlock(exec_block *block) {
                     Boff(ARM_COND_EQ, 2);
                     B(ARM_COND_AL, 0);
                 }
+                // branch not taken, so it only took 1 cycle
+                SUB_I(10, 10, 2, 0);
                 break;
             case V810_OP_MOVHI: // movhi imm16, reg1, reg2:
                 MOV_I(0, (inst_cache[i].imm >> 8), 8);
@@ -1126,6 +1132,15 @@ int drc_translateBlock(exec_block *block) {
                 } else {
                     SAVE_REG2(0);
                 }
+
+                if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
+                    // load immediately following another load takes 4 cycles instead of 5
+                    cycles -= 1;
+                } else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
+                    // load following an instruction taking "many" cycles only takes 1 cycle
+                    // guessing "many" means more than 4 for now
+                    cycles -= 4;
+                }
                 break;
             case V810_OP_LD_H: // ld.h disp16 [reg1], reg2
             case V810_OP_IN_H: // in.h disp16 [reg1], reg2
@@ -1148,6 +1163,15 @@ int drc_translateBlock(exec_block *block) {
                 } else {
                     SAVE_REG2(0);
                 }
+
+                if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
+                    // load immediately following another load takes 4 cycles instead of 5
+                    cycles -= 1;
+                } else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
+                    // load following an instruction taking "many" cycles only takes 1 cycle
+                    // guessing "many" means more than 4 for now
+                    cycles -= 4;
+                }
                 break;
             case V810_OP_LD_W: // ld.w disp16 [reg1], reg2
             case V810_OP_IN_W: // in.w disp16 [reg1], reg2
@@ -1162,6 +1186,15 @@ int drc_translateBlock(exec_block *block) {
                 BLX(ARM_COND_AL, 1);
 
                 SAVE_REG2(0);
+
+                if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
+                    // load immediately following another load takes 4 cycles instead of 5
+                    cycles -= 1;
+                } else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
+                    // load following an instruction taking "many" cycles only takes 1 cycle
+                    // guessing "many" means more than 4 for now
+                    cycles -= 4;
+                }
                 break;
             case V810_OP_ST_B: // st.h reg2, disp16 [reg1]
             case V810_OP_OUT_B: // out.h reg2, disp16 [reg1]
@@ -1179,6 +1212,13 @@ int drc_translateBlock(exec_block *block) {
                 LDR_IO(2, 11, 69 * 4);
                 ADD_I(2, 2, DRC_RELOC_WBYTE*4, 0);
                 BLX(ARM_COND_AL, 2);
+
+                if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
+                    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
+                ) {
+                    // with three consecutive stores, the third takes 4 cycles instead of 1
+                    cycles += 3;
+                }
                 break;
             case V810_OP_ST_H: // st.h reg2, disp16 [reg1]
             case V810_OP_OUT_H: // out.h reg2, disp16 [reg1]
@@ -1196,6 +1236,13 @@ int drc_translateBlock(exec_block *block) {
                 LDR_IO(2, 11, 69 * 4);
                 ADD_I(2, 2, DRC_RELOC_WHWORD*4, 0);
                 BLX(ARM_COND_AL, 2);
+
+                if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
+                    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
+                ) {
+                    // with three consecutive stores, the third takes 4 cycles instead of 1
+                    cycles += 3;
+                }
                 break;
             case V810_OP_ST_W: // st.h reg2, disp16 [reg1]
             case V810_OP_OUT_W: // out.h reg2, disp16 [reg1]
@@ -1214,12 +1261,20 @@ int drc_translateBlock(exec_block *block) {
                 ADD_I(2, 2, DRC_RELOC_WWORD*4, 0);
                 BLX(ARM_COND_AL, 2);
 
+                if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
+                    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
+                ) {
+                    // with three consecutive stores, the third takes 4 cycles instead of 1
+                    cycles += 3;
+                }
+
                 // if we load the same thing immediately after saving it, skip the loading
                 if (i + 1 < num_v810_inst && (inst_cache[i + 1].opcode == V810_OP_LD_W || inst_cache[i + 1].opcode == V810_OP_IN_W)
                     && inst_cache[i + 1].imm == inst_cache[i].imm
                     && inst_cache[i + 1].reg1 == inst_cache[i].reg1
                     && inst_cache[i + 1].reg2 == inst_cache[i].reg2
                 ) {
+                    cycles += 5;
                     inst_cache[i].branch_offset = 8;
                     B(ARM_COND_AL, 0);
                 }
diff --git a/source/common/v810_cpu.c b/source/common/v810_cpu.c
index b90d002..c16fcf3 100644
--- a/source/common/v810_cpu.c
+++ b/source/common/v810_cpu.c
@@ -24,8 +24,8 @@ const BYTE opcycle[0x50] = {
     0x01,0x01,0x01,0x01,0x01,0x01,0x03,0x01,0x0D,0x26,0x0D,0x24,0x01,0x01,0x01,0x01,
     0x01,0x01,0x01,0x01,0x01,0x01,0x0C,0x01,0x0F,0x0A,0x05,0x00,0x08,0x08,0x0C,0x00, //EI, HALT, LDSR, STSR, DI, BSTR -- Unknown clocks
     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x03,0x03,0x01,0x01,0x01,0x01,
-    0x05,0x05,0x0D,0x05,0x05,0x05,0x00,0x05,0x05,0x05,0x1A,0x05,0x05,0x05,0x00,0x05, //these are based on 16-bit bus!! (should be 32-bit?)
-    0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02
+    0x05,0x05,0x0D,0x05,0x01,0x01,0x00,0x01,0x05,0x05,0x1A,0x05,0x01,0x01,0x00,0x01, //these are based on 16-bit bus!! (should be 32-bit?)
+    0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x03,0x03
 };
 
 int v810_init(char *rom_name) {