Skip to content

Commit

Permalink
further improve instruction timing accuracy
Browse files Browse the repository at this point in the history
  • Loading branch information
skyfloogle committed Jan 9, 2024
1 parent 3e0f09e commit e92cbbd
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 2 deletions.
5 changes: 5 additions & 0 deletions include/arm_emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,11 @@ static inline void new_floating_point(BYTE cond, BYTE opc1, BYTE opc2, BYTE b12,
#define ADDS_I(Rd, Rn, imm8, rot) \
new_data_proc_imm(ARM_COND_AL, ARM_OP_ADD, 1, Rn, Rd, rot, imm8)

// sub Rd, Rn, imm8, ror #rot
// Subtract immediate
// Emitted with condition ARM_COND_AL. The third argument to
// new_data_proc_imm is 0 here, whereas ADDS_I passes 1 -- presumably that is
// the S (update flags) bit, so this form would leave the flags untouched;
// TODO confirm against new_data_proc_imm.
#define SUB_I(Rd, Rn, imm8, rot) \
new_data_proc_imm(ARM_COND_AL, ARM_OP_SUB, 0, Rn, Rd, rot, imm8)

// orr Rd, imm, ror #rot
// Or immediate
// imm8 can be rotated an even number of times
Expand Down
55 changes: 55 additions & 0 deletions source/common/drc_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,8 @@ int drc_translateBlock(exec_block *block) {
}
B(arm_cond, 0);
}
// branch not taken, so it only took 1 cycle
SUB_I(10, 10, 2, 0);
break;
// Special case: bnh and bh can't be directly translated to ARM
case V810_OP_BNH:
Expand All @@ -848,6 +850,8 @@ int drc_translateBlock(exec_block *block) {
B(ARM_COND_CS, 0);
B(ARM_COND_EQ, 0);
}
// branch not taken, so it only took 1 cycle
SUB_I(10, 10, 2, 0);
break;
case V810_OP_BH:
if (inst_cache[i].busywait) {
Expand All @@ -863,6 +867,8 @@ int drc_translateBlock(exec_block *block) {
Boff(ARM_COND_EQ, 2);
B(ARM_COND_AL, 0);
}
// branch not taken, so it only took 1 cycle
SUB_I(10, 10, 2, 0);
break;
case V810_OP_MOVHI: // movhi imm16, reg1, reg2:
MOV_I(0, (inst_cache[i].imm >> 8), 8);
Expand Down Expand Up @@ -1126,6 +1132,15 @@ int drc_translateBlock(exec_block *block) {
} else {
SAVE_REG2(0);
}

// Adjust this instruction's cycle count based on the previous instruction.
// The mask test matches previous opcodes 0x30, 0x31, 0x33, 0x38, 0x39 and
// 0x3B (presumably the ld/in family); 0x32 and 0x3A are excluded by the
// "& 3" check.
if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
// load immediately following another load takes 4 cycles instead of 5
cycles -= 1;
} else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
// load following an instruction taking "many" cycles only takes 1 cycle
// the threshold for "many" is a guess for now: more than 4 cycles in opcycle
cycles -= 4;
}
break;
case V810_OP_LD_H: // ld.h disp16 [reg1], reg2
case V810_OP_IN_H: // in.h disp16 [reg1], reg2
Expand All @@ -1148,6 +1163,15 @@ int drc_translateBlock(exec_block *block) {
} else {
SAVE_REG2(0);
}

// Adjust this instruction's cycle count based on the previous instruction.
// The mask test matches previous opcodes 0x30, 0x31, 0x33, 0x38, 0x39 and
// 0x3B (presumably the ld/in family); 0x32 and 0x3A are excluded by the
// "& 3" check.
if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
// load immediately following another load takes 4 cycles instead of 5
cycles -= 1;
} else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
// load following an instruction taking "many" cycles only takes 1 cycle
// the threshold for "many" is a guess for now: more than 4 cycles in opcycle
cycles -= 4;
}
break;
case V810_OP_LD_W: // ld.w disp16 [reg1], reg2
case V810_OP_IN_W: // in.w disp16 [reg1], reg2
Expand All @@ -1162,6 +1186,15 @@ int drc_translateBlock(exec_block *block) {
BLX(ARM_COND_AL, 1);

SAVE_REG2(0);

// Adjust this instruction's cycle count based on the previous instruction.
// The mask test matches previous opcodes 0x30, 0x31, 0x33, 0x38, 0x39 and
// 0x3B (presumably the ld/in family); 0x32 and 0x3A are excluded by the
// "& 3" check.
if (i > 0 && (inst_cache[i - 1].opcode & 0x34) == 0x30 && (inst_cache[i - 1].opcode & 3) != 2) {
// load immediately following another load takes 4 cycles instead of 5
cycles -= 1;
} else if (i > 0 && opcycle[inst_cache[i - 1].opcode] > 4) {
// load following an instruction taking "many" cycles only takes 1 cycle
// the threshold for "many" is a guess for now: more than 4 cycles in opcycle
cycles -= 4;
}
break;
case V810_OP_ST_B: // st.b reg2, disp16 [reg1]
case V810_OP_OUT_B: // out.b reg2, disp16 [reg1]
Expand All @@ -1179,6 +1212,13 @@ int drc_translateBlock(exec_block *block) {
LDR_IO(2, 11, 69 * 4);
ADD_I(2, 2, DRC_RELOC_WBYTE*4, 0);
BLX(ARM_COND_AL, 2);

// Store timing penalty: applies when BOTH of the two preceding instructions
// are stores. The mask test matches opcodes 0x34, 0x35, 0x37, 0x3C, 0x3D and
// 0x3F; 0x36 and 0x3E are excluded by the "& 3" check.
// The second pair of tests must look at i - 2 (guarded by i > 1); testing
// i - 1 twice would apply the penalty after only two consecutive stores.
if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
) {
    // with three consecutive stores, the third takes 4 cycles instead of 1
    cycles += 3;
}
break;
case V810_OP_ST_H: // st.h reg2, disp16 [reg1]
case V810_OP_OUT_H: // out.h reg2, disp16 [reg1]
Expand All @@ -1196,6 +1236,13 @@ int drc_translateBlock(exec_block *block) {
LDR_IO(2, 11, 69 * 4);
ADD_I(2, 2, DRC_RELOC_WHWORD*4, 0);
BLX(ARM_COND_AL, 2);

// Store timing penalty: applies when BOTH of the two preceding instructions
// are stores. The mask test matches opcodes 0x34, 0x35, 0x37, 0x3C, 0x3D and
// 0x3F; 0x36 and 0x3E are excluded by the "& 3" check.
// The second pair of tests must look at i - 2 (guarded by i > 1); testing
// i - 1 twice would apply the penalty after only two consecutive stores.
if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
) {
    // with three consecutive stores, the third takes 4 cycles instead of 1
    cycles += 3;
}
break;
case V810_OP_ST_W: // st.w reg2, disp16 [reg1]
case V810_OP_OUT_W: // out.w reg2, disp16 [reg1]
Expand All @@ -1214,12 +1261,20 @@ int drc_translateBlock(exec_block *block) {
ADD_I(2, 2, DRC_RELOC_WWORD*4, 0);
BLX(ARM_COND_AL, 2);

// Store timing penalty: applies when BOTH of the two preceding instructions
// are stores. The mask test matches opcodes 0x34, 0x35, 0x37, 0x3C, 0x3D and
// 0x3F; 0x36 and 0x3E are excluded by the "& 3" check.
// The second pair of tests must look at i - 2 (guarded by i > 1); testing
// i - 1 twice would apply the penalty after only two consecutive stores.
if (i > 1 && (inst_cache[i - 1].opcode & 0x34) == 0x34 && (inst_cache[i - 1].opcode & 3) != 2
    && (inst_cache[i - 2].opcode & 0x34) == 0x34 && (inst_cache[i - 2].opcode & 3) != 2
) {
    // with three consecutive stores, the third takes 4 cycles instead of 1
    cycles += 3;
}

// if we load the same thing immediately after saving it, skip the loading
if (i + 1 < num_v810_inst &&
(inst_cache[i + 1].opcode == V810_OP_LD_W || inst_cache[i + 1].opcode == V810_OP_IN_W) &&
inst_cache[i + 1].imm == inst_cache[i].imm && inst_cache[i + 1].reg1 == inst_cache[i].reg1 &&
inst_cache[i + 1].reg2 == inst_cache[i].reg2
) {
cycles += 5;
inst_cache[i].branch_offset = 8;
B(ARM_COND_AL, 0);
}
Expand Down
4 changes: 2 additions & 2 deletions source/common/v810_cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ const BYTE opcycle[0x50] = {
0x01,0x01,0x01,0x01,0x01,0x01,0x03,0x01,0x0D,0x26,0x0D,0x24,0x01,0x01,0x01,0x01,
0x01,0x01,0x01,0x01,0x01,0x01,0x0C,0x01,0x0F,0x0A,0x05,0x00,0x08,0x08,0x0C,0x00, //EI, HALT, LDSR, STSR, DI, BSTR -- Unknown clocks
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x03,0x03,0x01,0x01,0x01,0x01,
0x05,0x05,0x0D,0x05,0x05,0x05,0x00,0x05,0x05,0x05,0x1A,0x05,0x05,0x05,0x00,0x05, //these are based on 16-bit bus!! (should be 32-bit?)
0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02
0x05,0x05,0x0D,0x05,0x01,0x01,0x00,0x01,0x05,0x05,0x1A,0x05,0x01,0x01,0x00,0x01, //these are based on 16-bit bus!! (should be 32-bit?)
0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x03,0x03
};

int v810_init(char *rom_name) {
Expand Down

0 comments on commit e92cbbd

Please sign in to comment.