From b145080d372661f86f34b1b707b7619c10e283fc Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Sun, 19 May 2024 17:22:58 +0200 Subject: [PATCH] [DYNAREC] Fixed CALLRET and backported many BIBLOCK improvments from box64 --- src/dynarec/arm_emitter.h | 2 + src/dynarec/arm_epilog.S | 4 +- src/dynarec/arm_prolog.S | 2 +- src/dynarec/dynarec_arm.c | 101 ++++++++++++++++++++-------- src/dynarec/dynarec_arm_d8.c | 4 +- src/dynarec/dynarec_arm_d9.c | 30 ++++----- src/dynarec/dynarec_arm_da.c | 6 +- src/dynarec/dynarec_arm_db.c | 14 ++-- src/dynarec/dynarec_arm_dc.c | 4 +- src/dynarec/dynarec_arm_dd.c | 14 ++-- src/dynarec/dynarec_arm_de.c | 18 ++--- src/dynarec/dynarec_arm_df.c | 20 +++--- src/dynarec/dynarec_arm_functions.c | 61 +++++++++++++++-- src/dynarec/dynarec_arm_functions.h | 4 ++ src/dynarec/dynarec_arm_helper.c | 47 +------------ src/dynarec/dynarec_arm_helper.h | 34 +++++++++- src/dynarec/dynarec_arm_pass.c | 12 ++-- src/dynarec/dynarec_arm_pass0.h | 5 -- src/dynarec/dynarec_arm_private.h | 1 + src/dynarec/dynarec_private.h | 1 + src/emu/x86emu_private.h | 2 +- src/include/dynarec_arm.h | 2 + 22 files changed, 241 insertions(+), 147 deletions(-) diff --git a/src/dynarec/arm_emitter.h b/src/dynarec/arm_emitter.h index 9e5244cbf..bdd0575af 100755 --- a/src/dynarec/arm_emitter.h +++ b/src/dynarec/arm_emitter.h @@ -430,6 +430,8 @@ Op is 20-27 // blx reg #define BLX(reg) EMIT(0xe12fff30 | (reg) ) +// blx cond reg +#define BLXcond(C, reg) EMIT(C | 0x012fff30 | (reg) ) // b cond offset #define Bcond(C, O) EMIT(C | (0b101<<25) | (0<<24) | (((O)>>2)&0xffffff)) diff --git a/src/dynarec/arm_epilog.S b/src/dynarec/arm_epilog.S index 2e2b30b84..0e618e682 100755 --- a/src/dynarec/arm_epilog.S +++ b/src/dynarec/arm_epilog.S @@ -12,7 +12,7 @@ arm_epilog: stm r0, {r4-r12,r14} // put back reg value in emu, including EIP (so r14 must be Flags now) // restore stack pointer ldr sp, [r0, #(8*4+2*4)] - ldr r5, [sp, #-4] + pop {r4, r5} str r5, [r0, #(8*4+2*4)] // put back old value //restore all used register vpop {d8-d15} @@ -25,7 +25,7 @@ arm_epilog: arm_epilog_fast: // restore stack pointer ldr sp, [r0, #(8*4+2*4)] - ldr r5, [sp, #-4] + pop {r4, r5} str r5, [r0, #(8*4+2*4)] // put back old value //restore all used register vpop {d8-d15} diff --git a/src/dynarec/arm_prolog.S b/src/dynarec/arm_prolog.S index 57246d612..f0ce5fb0c 100755 --- a/src/dynarec/arm_prolog.S +++ b/src/dynarec/arm_prolog.S @@ -13,9 +13,9 @@ arm_prolog: vpush {d8-d15} // save Sp and setup stack for optionnal callret ldr r5, [r0, #(8*4+2*4)] // grab old value of xSPSave - str sp, [r0, #(8*4+2*4)] mov r4, #0 push {r4-r5} + str sp, [r0, #(8*4+2*4)] //setup emu -> register ldm r0, {r4-r12} // all 8 register in direct access, plus flags, no EIP (so r14 can be used as scratch) //jump to function diff --git a/src/dynarec/dynarec_arm.c b/src/dynarec/dynarec_arm.c index d4d3d2f36..1703c697f 100755 --- a/src/dynarec/dynarec_arm.c +++ b/src/dynarec/dynarec_arm.c @@ -73,8 +73,7 @@ void add_next(dynarec_arm_t *dyn, uintptr_t addr) { } // add slots if(dyn->next_sz == dyn->next_cap) { - dyn->next_cap += 64; - dyn->next = (uintptr_t*)dynaRealloc(dyn->next, dyn->next_cap*sizeof(uintptr_t)); + printf_log(LOG_NONE, "Warning, overallocating next\n"); } dyn->next[dyn->next_sz++] = addr; } @@ -98,8 +97,7 @@ uintptr_t get_closest_next(dynarec_arm_t *dyn, uintptr_t addr) { void add_jump(dynarec_arm_t *dyn, int ninst) { // add slots if(dyn->jmp_sz == dyn->jmp_cap) { - dyn->jmp_cap += 64; - dyn->jmps = (int*)dynaRealloc(dyn->jmps, dyn->jmp_cap*sizeof(int)); + printf_log(LOG_NONE, "Warning, overallocating jmps\n"); } dyn->jmps[dyn->jmp_sz++] = ninst; } @@ -282,33 +280,47 @@ void addInst(instsize_t* insts, size_t* size, int x86_size, int native_size) } } +static void recurse_mark_alive(dynarec_arm_t* dyn, int i) +{ + if(dyn->insts[i].x86.alive) + return; + dyn->insts[i].x86.alive = 1; + if(dyn->insts[i].x86.jmp && dyn->insts[i].x86.jmp_insts!=-1) + recurse_mark_alive(dyn, dyn->insts[i].x86.jmp_insts); + if(isize-1 && dyn->insts[i].x86.has_next) + recurse_mark_alive(dyn, i+1); +} -static void fillPredecessors(dynarec_arm_t* dyn) +static int sizePredecessors(dynarec_arm_t* dyn) { int pred_sz = 1; // to be safe - // compute total size of predecessor to alocate the array + // compute total size of predecessor to allocate the array + // mark alive... + recurse_mark_alive(dyn, 0); // first compute the jumps + int jmpto; for(int i=0; isize; ++i) { - if(dyn->insts[i].x86.jmp && dyn->insts[i].x86.jmp_insts!=-1) { - ++pred_sz; - ++dyn->insts[dyn->insts[i].x86.jmp_insts].pred_sz; + if(dyn->insts[i].x86.alive && dyn->insts[i].x86.jmp && ((jmpto=dyn->insts[i].x86.jmp_insts)!=-1)) { + pred_sz++; + dyn->insts[jmpto].pred_sz++; } } - // remove "has_next" from orphean branch + // remove "has_next" from orphan branch for(int i=0; isize-1; ++i) { - if(!dyn->insts[i].x86.has_next) { - if(dyn->insts[i+1].x86.has_next && !dyn->insts[i+1].pred_sz) - dyn->insts[i+1].x86.has_next = 0; - } + if(dyn->insts[i].x86.has_next && !dyn->insts[i+1].x86.alive) + dyn->insts[i].x86.has_next = 0; } // second the "has_next" for(int i=0; isize-1; ++i) { if(dyn->insts[i].x86.has_next) { - ++pred_sz; - ++dyn->insts[i+1].pred_sz; + pred_sz++; + dyn->insts[i+1].pred_sz++; } } - dyn->predecessor = (int*)dynaMalloc(pred_sz*sizeof(int)); + return pred_sz; +} +static void fillPredecessors(dynarec_arm_t* dyn) +{ // fill pred pointer int* p = dyn->predecessor; for(int i=0; isize; ++i) { @@ -317,7 +329,7 @@ static void fillPredecessors(dynarec_arm_t* dyn) dyn->insts[i].pred_sz=0; // reset size, it's reused to actually fill pred[] } // fill pred - for(int i=0; isize; ++i) { + for(int i=0; isize; ++i) if(dyn->insts[i].x86.alive) { if((i!=dyn->size-1) && dyn->insts[i].x86.has_next) dyn->insts[i+1].pred[dyn->insts[i+1].pred_sz++] = i; if(dyn->insts[i].x86.jmp && (dyn->insts[i].x86.jmp_insts!=-1)) { @@ -371,22 +383,23 @@ static int updateNeed(dynarec_arm_t* dyn, int ninst, uint8_t need) { } void* current_helper = NULL; +static int static_jmps[MAX_INSTS+2]; +static uintptr_t static_next[MAX_INSTS+2]; +static instruction_arm_t static_insts[MAX_INSTS+2] = {0}; +// TODO: ninst could be a uint16_t instead of an int, that could same some temp. memory void CancelBlock(int need_lock) { if(need_lock) mutex_lock(&my_context->mutex_dyndump); dynarec_arm_t* helper = (dynarec_arm_t*)current_helper; - current_helper = NULL; if(helper) { - dynaFree(helper->next); - dynaFree(helper->insts); - dynaFree(helper->predecessor); if(helper->dynablock && helper->dynablock->actual_block) { FreeDynarecMap((uintptr_t)helper->dynablock->actual_block); helper->dynablock->actual_block = NULL; } } + current_helper = NULL; if(need_lock) mutex_unlock(&my_context->mutex_dyndump); } @@ -451,10 +464,19 @@ dynarec_log(LOG_DEBUG, "Asked to Fill block %p with %p\n", block, (void*)addr); helper.dynablock = block; helper.start = addr; uintptr_t start = addr; - helper.cap = 64; // needs epilog handling - helper.insts = (instruction_arm_t*)dynaCalloc(helper.cap, sizeof(instruction_arm_t)); + helper.cap = MAX_INSTS; + helper.insts = static_insts; + helper.jmps = static_jmps; + helper.jmp_cap = MAX_INSTS; + helper.next = static_next; + helper.next_cap = MAX_INSTS; // pass 0, addresses, x86 jump addresses, overall size of the block uintptr_t end = arm_pass0(&helper, addr); + if(helper.abort) { + if(box86_dynarec_dump || box86_dynarec_log)dynarec_log(LOG_NONE, "Abort dynablock on pass0\n"); + CancelBlock(0); + return NULL; + } // basic checks if(!helper.size) { dynarec_log(LOG_DEBUG, "Warning, null-sized dynarec block (%p)\n", (void*)addr); @@ -519,24 +541,42 @@ dynarec_log(LOG_DEBUG, "Asked to Fill block %p with %p\n", block, (void*)addr); } } // no need for next and jmps anymore - dynaFree(helper.next); helper.next_sz = helper.next_cap = 0; helper.next = NULL; - dynaFree(helper.jmps); helper.jmp_sz = helper.jmp_cap = 0; helper.jmps = NULL; // fill predecessors with the jump address + int alloc_size = sizePredecessors(&helper); + helper.predecessor = (int*)alloca(alloc_size*sizeof(int)); fillPredecessors(&helper); int pos = helper.size; while (pos>=0) pos = updateNeed(&helper, pos, 0); + // remove fpu stuff on non-executed code + for(int i=1; iinstsize = instsize; // ok, free the helper now - dynaFree(helper.insts); helper.insts = NULL; helper.instsize = NULL; - dynaFree(helper.predecessor); helper.predecessor = NULL; block->size = sz; block->isize = helper.size; @@ -612,6 +655,8 @@ dynarec_log(LOG_DEBUG, "Asked to Fill block %p with %p\n", block, (void*)addr); CancelBlock(0); return NULL; } + // ok, free the helper now + helper.insts = NULL; if(insts_rsize/sizeof(instsize_t)>3)&7) { case 0: INST_NAME("FLD ST0, float[ED]"); - v1 = x87_do_push(dyn, ninst, x1, box86_dynarec_x87double?NEON_CACHE_ST_D:NEON_CACHE_ST_F); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, box86_dynarec_x87double?NEON_CACHE_ST_D:NEON_CACHE_ST_F); if(ST_IS_F(0)) s0 = v1; else @@ -676,7 +676,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VMOVfrV(x2, s0); STR_IMM9(x2, ed, fixedaddress); } - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 4: INST_NAME("FLDENV Ed"); diff --git a/src/dynarec/dynarec_arm_da.c b/src/dynarec/dynarec_arm_da.c index a2b27957b..3253d1981 100755 --- a/src/dynarec/dynarec_arm_da.c +++ b/src/dynarec/dynarec_arm_da.c @@ -129,8 +129,8 @@ uintptr_t dynarecDA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE4: @@ -197,7 +197,7 @@ uintptr_t dynarecDA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCVT_F64_S32(d0, s0); VCMP_F64(v1, d0); FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 4: INST_NAME("FISUB ST0, Ed"); diff --git a/src/dynarec/dynarec_arm_db.c b/src/dynarec/dynarec_arm_db.c index 8aaebcbfc..15c2033b1 100755 --- a/src/dynarec/dynarec_arm_db.c +++ b/src/dynarec/dynarec_arm_db.c @@ -183,7 +183,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ed"); - v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_D); s0 = fpu_get_scratch_single(dyn); parity = getedparity(dyn, ninst, addr, nextop, 2); if(parity) { @@ -218,7 +218,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, MOV_IMM_COND(cNE, ed, 0b10, 1); // 0x80000000 WBACK; VMSR(x14); // put back values - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 2: INST_NAME("FIST Ed, ST0"); @@ -260,7 +260,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, TSTS_IMM8_ROR(x3, 0b00000001, 0); MOV_IMM_COND(cNE, ed, 0b10, 1); // 0x80000000 WBACK; - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); x87_restoreround(dyn, ninst, u8); break; case 5: @@ -281,7 +281,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, STRH_IMM8(x14, ed, 8); } else { if(box86_x87_no80bits) { - v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_D); parity = getedparity(dyn, ninst, addr, nextop, 3); if (parity) { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 1023, 3, 0, NULL); @@ -298,10 +298,10 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, if(ed!=x1) { MOV_REG(x1, ed); } - x87_do_push_empty(dyn, ninst, x3); +X87_PUSH_OR_FAIL_empty( , dyn, ninst, x3); CALL(arm_fld, -1, 0); #else - v1 = x87_do_push(dyn, ninst, x2, NEON_CACHE_ST_D); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x2, NEON_CACHE_ST_D); // copy 10bytes of *ED to STld(0) LDR_IMM9(x3, xEmu, offsetof(x86emu_t, top)); int a = -dyn->n.x87stack; @@ -407,7 +407,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, MARK2; #endif } - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/dynarec_arm_dc.c b/src/dynarec/dynarec_arm_dc.c index e9d6e67cf..13310c264 100755 --- a/src/dynarec/dynarec_arm_dc.c +++ b/src/dynarec/dynarec_arm_dc.c @@ -115,7 +115,7 @@ uintptr_t dynarecDC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -276,7 +276,7 @@ uintptr_t dynarecDC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } VCMP_F64(v1, d1); FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 4: INST_NAME("FSUB ST0, double[ED]"); diff --git a/src/dynarec/dynarec_arm_dd.c b/src/dynarec/dynarec_arm_dd.c index 2aac56222..958df569a 100755 --- a/src/dynarec/dynarec_arm_dd.c +++ b/src/dynarec/dynarec_arm_dd.c @@ -49,7 +49,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, #if 1 if((nextop&7)==0 && PK(0)==0xD9 && PK(1)==0xF7) { MESSAGE(LOG_DUMP, "Hack for FFREE ST0 / FINCSTP\n"); - x87_do_pop(dyn, ninst, x1); + X87_POP_OR_FAIL(dyn, ninst, x1); addr+=2; SKIPTEST(x1); } else @@ -80,7 +80,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, break; case 0xD8: INST_NAME("FSTP ST0, ST0"); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xD9: case 0xDA: @@ -92,7 +92,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, INST_NAME("FSTP ST0, STx"); // copy the cache value for st0 to stx x87_swapreg(dyn, ninst, x1, x2, 0, nextop&7); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE0: @@ -130,7 +130,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xC8: @@ -164,7 +164,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FLD double"); - v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_D); parity = getedparity(dyn, ninst, addr, nextop, 3); if (parity) { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 1023, 3, 0, NULL); @@ -182,7 +182,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, 0, NULL); if(ed!=x1) {MOV_REG(x1, ed);} CALL(arm_fistt64, -1, 0); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 2: INST_NAME("FST double"); @@ -211,7 +211,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, STR_IMM9(x2, ed, fixedaddress); STR_IMM9(x3, ed, fixedaddress+4); } - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 4: INST_NAME("FRSTOR m108byte"); diff --git a/src/dynarec/dynarec_arm_de.c b/src/dynarec/dynarec_arm_de.c index af95e7107..4ff403b29 100755 --- a/src/dynarec/dynarec_arm_de.c +++ b/src/dynarec/dynarec_arm_de.c @@ -54,7 +54,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround) x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xC8: case 0xC9: @@ -76,7 +76,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround) x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xD0: case 0xD1: @@ -95,7 +95,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xD9: @@ -108,8 +108,8 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOM(x1, x2); - x87_do_pop(dyn, ninst, x3); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -131,7 +131,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround) x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE8: case 0xE9: @@ -153,7 +153,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround) x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xF0: case 0xF1: @@ -175,7 +175,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround) x87_restoreround(dyn, ninst, u8); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xF8: case 0xF9: @@ -220,7 +220,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } if(!box86_dynarec_fastround || !box86_dynarec_fastnan) VMSR(x14); // restore fpscr - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xD8: diff --git a/src/dynarec/dynarec_arm_df.c b/src/dynarec/dynarec_arm_df.c index 53a042575..3b2bec1e8 100755 --- a/src/dynarec/dynarec_arm_df.c +++ b/src/dynarec/dynarec_arm_df.c @@ -53,7 +53,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 0xC7: INST_NAME("FFREEP STx"); // not handling Tag... - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xE0: @@ -91,7 +91,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOMI(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xF0: case 0xF1: @@ -112,7 +112,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCMP_F64(v1, v2); } FCOMI(x1, x2); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 0xC8: @@ -161,7 +161,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ew"); - v1 = x87_do_push(dyn, ninst, x1, box86_dynarec_x87double?NEON_CACHE_ST_D:NEON_CACHE_ST_F); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, box86_dynarec_x87double?NEON_CACHE_ST_D:NEON_CACHE_ST_F); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 255, 0, 0, NULL); LDRSH_IMM8(x1, wback, fixedaddress); if(ST_IS_F(0)) { @@ -197,7 +197,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, CMPS_REG_LSL_IMM5_COND(cEQ, ed, x3, 0); MOVW_COND(cNE, x3, 0x8000); // saturated STRH_IMM8(x3, wback, fixedaddress); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); VMSR(x14); break; case 2: @@ -242,20 +242,20 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, CMPS_REG_LSL_IMM5_COND(cEQ, ed, x3, 0); MOVW_COND(cNE, x3, 0x8000); // saturated STRH_IMM8(x3, wback, fixedaddress); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); x87_restoreround(dyn, ninst, u8); break; case 4: INST_NAME("FBLD ST0, tbytes"); MESSAGE(LOG_DUMP, "Need Optimization\n"); - x87_do_push_empty(dyn, ninst, x1); + X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, x1); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, 0, NULL); if(ed!=x1) {MOV_REG(x1, ed);} CALL(fpu_fbld, -1, 0); break; case 5: INST_NAME("FILD ST0, i64"); - v1 = x87_do_push(dyn, ninst, x1, NEON_CACHE_ST_D); + X87_PUSH_OR_FAIL(v1, dyn, ninst, x1, NEON_CACHE_ST_D); v2 = fpu_get_scratch_double(dyn); s0 = fpu_get_scratch_single(dyn); parity = getedparity(dyn, ninst, addr, nextop, 3); @@ -315,7 +315,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0, 0, NULL); if(ed!=x1) {MOV_REG(x1, ed);} CALL(fpu_fbst, -1, 0); - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; case 7: // could be inlined for most thing, but is it usefull? INST_NAME("FISTP i64, ST0"); @@ -440,7 +440,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, CALL(arm_fistp64, -1, 0); #endif } - x87_do_pop(dyn, ninst, x3); + X87_POP_OR_FAIL(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/dynarec_arm_functions.c b/src/dynarec/dynarec_arm_functions.c index 53e96f7ae..fe1a6300b 100755 --- a/src/dynarec/dynarec_arm_functions.c +++ b/src/dynarec/dynarec_arm_functions.c @@ -487,13 +487,18 @@ void fpu_free_reg_quad(dynarec_arm_t* dyn, int reg) dyn->n.neoncache[i+1].v = 0; } // Reset fpu regs counter -void fpu_reset_reg(dynarec_arm_t* dyn) +static void fpu_reset_reg_neoncache(neoncache_t* n) { - dyn->n.fpu_reg = 0; + n->fpu_reg = 0; for (int i=0; i<24; ++i) { - dyn->n.fpuused[i]=0; - dyn->n.neoncache[i].v = 0; + n->fpuused[i]=0; + n->neoncache[i].v = 0; } + +} +void fpu_reset_reg(dynarec_arm_t* dyn) +{ + fpu_reset_reg_neoncache(&dyn->n); } int neoncache_get_st(dynarec_arm_t* dyn, int ninst, int a) @@ -1042,6 +1047,54 @@ void print_opcode(dynarec_arm_t* dyn, int ninst, uint32_t opcode) dynarec_log(LOG_NONE, "\t%08x\t%s\n", opcode, arm_print(opcode)); } +static void x87_reset(neoncache_t* n) +{ + for (int i=0; i<8; ++i) + n->x87cache[i] = -1; + n->x87stack = 0; + n->stack = 0; + n->stack_next = 0; + n->stack_pop = 0; + n->stack_push = 0; + n->combined1 = n->combined2 = 0; + n->swapped = 0; + n->barrier = 0; + n->pushed = 0; + n->poped = 0; + + for(int i=0; i<24; ++i) + if(n->neoncache[i].t == NEON_CACHE_ST_F || n->neoncache[i].t == NEON_CACHE_ST_D) + n->neoncache[i].v = 0; +} +static void mmx_reset(neoncache_t* n) +{ + n->mmxcount = 0; + for (int i=0; i<8; ++i) + n->mmxcache[i] = -1; +} +static void sse_reset(neoncache_t* n) +{ + for (int i=0; i<8; ++i) + n->ssecache[i].v = -1; +} + + +void fpu_reset(dynarec_arm_t* dyn) +{ + x87_reset(&dyn->n); + mmx_reset(&dyn->n); + sse_reset(&dyn->n); + fpu_reset_reg(dyn); +} + +void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst) +{ + x87_reset(&dyn->insts[ninst].n); + mmx_reset(&dyn->insts[ninst].n); + sse_reset(&dyn->insts[ninst].n); + fpu_reset_reg_neoncache(&dyn->insts[ninst].n); +} + int fpu_is_st_freed(dynarec_arm_t* dyn, int ninst, int st) { return (dyn->n.tags&(0b11<<(st*2)))?1:0; diff --git a/src/dynarec/dynarec_arm_functions.h b/src/dynarec/dynarec_arm_functions.h index 350d33b6c..e53add784 100755 --- a/src/dynarec/dynarec_arm_functions.h +++ b/src/dynarec/dynarec_arm_functions.h @@ -101,6 +101,10 @@ const char* getCacheName(int t, int n); void inst_name_pass3(dynarec_arm_t* dyn, int ninst, const char* name); void print_opcode(dynarec_arm_t* dyn, int ninst, uint32_t opcode); +// reset the cache +void fpu_reset(dynarec_arm_t* dyn); +void fpu_reset_ninst(dynarec_arm_t* dyn, int ninst); + // is st freed int fpu_is_st_freed(dynarec_arm_t* dyn, int ninst, int st); #endif //__DYNAREC_ARM_FUNCTIONS_H__ \ No newline at end of file diff --git a/src/dynarec/dynarec_arm_helper.c b/src/dynarec/dynarec_arm_helper.c index 27e384120..916e98cea 100755 --- a/src/dynarec/dynarec_arm_helper.c +++ b/src/dynarec/dynarec_arm_helper.c @@ -295,11 +295,10 @@ void ret_to_epilog(dynarec_arm_t* dyn, int ninst) // pop the actual return address for ARM stack LDM(xSP, (1<n.x87cache[i] = -1; - dyn->n.x87stack = 0; - dyn->n.stack = 0; - dyn->n.stack_next = 0; - dyn->n.stack_pop = 0; - dyn->n.stack_push = 0; - dyn->n.combined1 = dyn->n.combined2 = 0; - dyn->n.swapped = 0; - dyn->n.barrier = 0; - dyn->n.pushed = 0; - dyn->n.poped = 0; - - for(int i=0; i<24; ++i) - if(dyn->n.neoncache[i].t == NEON_CACHE_ST_F || dyn->n.neoncache[i].t == NEON_CACHE_ST_D) - dyn->n.neoncache[i].v = 0; -} - void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch) { if(!dyn->n.x87stack) @@ -1383,12 +1361,6 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1) } // MMX helpers -static void mmx_reset(dynarec_arm_t* dyn) -{ - dyn->n.mmxcount = 0; - for (int i=0; i<8; ++i) - dyn->n.mmxcache[i] = -1; -} static int isx87Empty(dynarec_arm_t* dyn) { for (int i=0; i<8; ++i) @@ -1470,11 +1442,6 @@ static void mmx_reflectcache(dynarec_arm_t* dyn, int ninst, int s1) // SSE / SSE2 helpers -static void sse_reset(dynarec_arm_t* dyn) -{ - for (int i=0; i<8; ++i) - dyn->n.ssecache[i].v = -1; -} // get neon register for a SSE reg, create the entry if needed int sse_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a, int forwrite) { @@ -2099,14 +2066,6 @@ void fpu_unreflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) x87_unreflectcache(dyn, ninst, s1, s2, s3); } -void fpu_reset(dynarec_arm_t* dyn) -{ - x87_reset(dyn); - mmx_reset(dyn); - sse_reset(dyn); - fpu_reset_reg(dyn); -} - // get the single reg that from the double "reg" (so Dx[idx]) int fpu_get_single_reg(dynarec_arm_t* dyn, int ninst, int reg, int idx) { diff --git a/src/dynarec/dynarec_arm_helper.h b/src/dynarec/dynarec_arm_helper.h index 3daa5e649..e97eb444a 100755 --- a/src/dynarec/dynarec_arm_helper.h +++ b/src/dynarec/dynarec_arm_helper.h @@ -343,6 +343,37 @@ } \ +#if STEP == 0 +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) var = x87_do_push(dyn, ninst, scratch, t) +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) x87_do_push_empty(dyn, ninst, scratch) +#define X87_POP_OR_FAIL(dyn, ninst, scratch) x87_do_pop(dyn, ninst, scratch) +#else +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ + if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) { \ + if(box86_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + var = x87_do_push(dyn, ninst, scratch, t) + +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->n.x87stack==8) || (dyn->n.pushed==8)) { \ + if(box86_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + x87_do_push_empty(dyn, ninst, scratch) + +#define X87_POP_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->n.x87stack==-8) || (dyn->n.poped==8)) { \ + if(box86_dynarec_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->n.x87stack, dyn->n.poped, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + x87_do_pop(dyn, ninst, scratch) +#endif + + #define SET_DFNONE(S) if(!dyn->f.dfnone) {MOVW(S, d_none); STR_IMM9(S, xEmu, offsetof(x86emu_t, df)); dyn->f.dfnone=1;} #define SET_DF(S, N) \ if(N) { \ @@ -590,7 +621,6 @@ void* arm_next(x86emu_t* emu, uintptr_t addr); #define fpu_pushcache STEPNAME(fpu_pushcache) #define fpu_popcache STEPNAME(fpu_popcache) -#define fpu_reset STEPNAME(fpu_reset) #define fpu_reset_cache STEPNAME(fpu_reset_cache) #define fpu_propagate_stack STEPNAME(fpu_propagate_stack) #define fpu_purgecache STEPNAME(fpu_purgecache) @@ -805,8 +835,6 @@ void sse_forget_reg(dynarec_arm_t* dyn, int ninst, int a, int s1); int sse_reflect_reg(dynarec_arm_t* dyn, int ninst, int a, int s1); // common coproc helpers -// reset the cache -void fpu_reset(dynarec_arm_t* dyn); // reset the cache with n void fpu_reset_cache(dynarec_arm_t* dyn, int ninst, int reset_n); // propagate stack state diff --git a/src/dynarec/dynarec_arm_pass.c b/src/dynarec/dynarec_arm_pass.c index 47e561d31..b8e68f144 100755 --- a/src/dynarec/dynarec_arm_pass.c +++ b/src/dynarec/dynarec_arm_pass.c @@ -49,6 +49,9 @@ uintptr_t arm_pass(dynarec_arm_t* dyn, uintptr_t addr) dyn->forward_to = 0; dyn->forward_size = 0; dyn->forward_ninst = 0; + #if STEP == 0 + memset(&dyn->insts[ninst], 0, sizeof(instruction_arm_t)); + #endif fpu_reset(dyn); int reset_n = -1; int stopblock = 2+(FindElfAddress(my_context, addr)?0:1); // if block is in elf_memory, it can be extended with bligblocks==2, else it needs 3 // ok, go now @@ -156,8 +159,8 @@ uintptr_t arm_pass(dynarec_arm_t* dyn, uintptr_t addr) ok = 1; // we use the 1st predecessor here int ii = ninst+1; - if(iisize && !dyn->insts[ii].pred_sz) { - while(iisize && (!dyn->insts[ii].pred_sz || (dyn->insts[ii].pred_sz==1 && dyn->insts[ii].pred[0]==ii-1))) { + if(iisize && !dyn->insts[ii].x86.alive) { + while(iisize && !dyn->insts[ii].x86.alive) { // may need to skip opcodes to advance ++ninst; NEW_INST; @@ -189,7 +192,7 @@ uintptr_t arm_pass(dynarec_arm_t* dyn, uintptr_t addr) if(dyn->forward_to == addr && !need_epilog && ok>=0) { // we made it! reset_n = get_first_jump(dyn, addr); - if(box86_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x86.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to); + if(box86_dynarec_dump) dynarec_log(LOG_NONE, "Forward extend block for %d bytes %s%p -> %p (ninst %d - %d)\n", dyn->forward_to-dyn->forward, dyn->insts[dyn->forward_ninst].x86.has_callret?"(opt. call) ":"", (void*)dyn->forward, (void*)dyn->forward_to, reset_n, ninst); if(dyn->insts[dyn->forward_ninst].x86.has_callret && !dyn->insts[dyn->forward_ninst].x86.has_next) dyn->insts[dyn->forward_ninst].x86.has_next = 1; // this block actually continue dyn->forward = 0; @@ -197,7 +200,7 @@ uintptr_t arm_pass(dynarec_arm_t* dyn, uintptr_t addr) dyn->forward_size = 0; dyn->forward_ninst = 0; ok = 1; // in case it was 0 - } else if ((dyn->forward_to < addr) || !ok) { + } else if ((dyn->forward_to < addr) || ok<=0) { // something when wrong! rollback if(box86_dynarec_dump) dynarec_log(LOG_NONE, "Could not forward extend block for %d bytes %p -> %p\n", dyn->forward_to-dyn->forward, (void*)dyn->forward, (void*)dyn->forward_to); ok = 0; @@ -272,6 +275,7 @@ uintptr_t arm_pass(dynarec_arm_t* dyn, uintptr_t addr) reset_n = -2; ++ninst; #if STEP == 0 + memset(&dyn->insts[ninst], 0, sizeof(instruction_arm_t)); if(ok && (((box86_dynarec_bigblock=box86_nodynarec_start && addrinsts[ninst].x86.has_callret = 1 #define NEW_INST \ ++dyn->size; \ - if(dyn->size+3>=dyn->cap) { \ - dyn->insts = (instruction_arm_t*)dynaRealloc(dyn->insts, sizeof(instruction_arm_t)*dyn->cap*2);\ - memset(&dyn->insts[dyn->cap], 0, sizeof(instruction_arm_t)*dyn->cap); \ - dyn->cap *= 2; \ - } \ dyn->insts[ninst].x86.addr = ip; \ dyn->n.combined1 = dyn->n.combined2 = 0;\ dyn->n.swapped = 0; dyn->n.barrier = 0; \ diff --git a/src/dynarec/dynarec_arm_private.h b/src/dynarec/dynarec_arm_private.h index 0cc606ed5..d889a27b5 100755 --- a/src/dynarec/dynarec_arm_private.h +++ b/src/dynarec/dynarec_arm_private.h @@ -114,6 +114,7 @@ typedef struct dynarec_arm_s { int32_t forward_size; // size at the forward point int forward_ninst; // ninst at the forward point uint8_t always_test; + uint8_t abort; // abort the creation of the block } dynarec_arm_t; void add_next(dynarec_arm_t *dyn, uintptr_t addr); diff --git a/src/dynarec/dynarec_private.h b/src/dynarec/dynarec_private.h index 11e45634c..091d9df05 100755 --- a/src/dynarec/dynarec_private.h +++ b/src/dynarec/dynarec_private.h @@ -37,6 +37,7 @@ typedef struct instruction_x86_s { uint8_t jmp_cond:1; // 1 of conditionnal jump uint8_t has_next:1; // does this opcode can continue to the next? uint8_t has_callret:1; // this instruction have an optimised call setup + uint8_t alive:1; // this opcode gets executed (0 if dead code in that block) uint8_t barrier; // next instruction is a jump point, so no optim allowed uint8_t barrier_next; // next instruction needs a barrier uint8_t state_flags;// One of SF_XXX state diff --git a/src/emu/x86emu_private.h b/src/emu/x86emu_private.h index ebc831aba..47ee69ae6 100755 --- a/src/emu/x86emu_private.h +++ b/src/emu/x86emu_private.h @@ -62,9 +62,9 @@ typedef struct x86emu_s { mmx87_regs_t mmx[8]; uint32_t top; // top is part of sw, but it's faster to have it separatly int fpu_stack; + uint32_t fpu_tags; // tags for the x87 regs, stacked, only on a 16bits anyway fpu_ld_t fpu_ld[8]; // for long double emulation / 80bits fld fst fpu_ll_t fpu_ll[8]; // for 64bits fild / fist sequence - uint32_t fpu_tags; // tags for the x87 regs, stacked, only on a 16bits anyway // sse sse_regs_t xmm[8]; mmxcontrol_t mxcsr; diff --git a/src/include/dynarec_arm.h b/src/include/dynarec_arm.h index 901a5f086..7467e7bc2 100755 --- a/src/include/dynarec_arm.h +++ b/src/include/dynarec_arm.h @@ -5,6 +5,8 @@ typedef struct dynablock_s dynablock_t; typedef struct x86emu_s x86emu_t; typedef struct instsize_s instsize_t; +#define MAX_INSTS 32760 + void addInst(instsize_t* insts, size_t* size, int x86_size, int arm_size); void CancelBlock();