vertexjit: Only save extra regs on x64.
unknownbrackets committed Feb 1, 2021
1 parent 30b6f1f · commit c1fa495
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions GPU/Common/VertexDecoderX86.cpp
@@ -53,7 +53,7 @@ alignas(16) static const float by16384[4] = {
 	1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
 };
 
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 #ifdef _WIN32
 static const X64Reg tempReg1 = RAX;
 static const X64Reg tempReg2 = R9;
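
Note: the hunk above swaps an MSVC-specific predefined macro (`_M_X64`) for PPSSPP's portable PPSSPP_ARCH() check. As a hedged sketch, the macro machinery in ppsspp_config.h works roughly like this, with each architecture token expanding to 1 or 0 so it can be used inside #if across compilers (the exact contents of the real header may differ):

```cpp
// Sketch of a PPSSPP_ARCH()-style macro (assumption: mirrors ppsspp_config.h).
// MSVC defines _M_X64/_M_IX86; GCC and Clang define __x86_64__/__i386__.
#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__)
#define PPSSPP_ARCH_AMD64 1
#else
#define PPSSPP_ARCH_AMD64 0
#endif

#if defined(_M_IX86) || defined(__i386__)
#define PPSSPP_ARCH_X86 1
#else
#define PPSSPP_ARCH_X86 0
#endif

// Token-pasting dispatcher: PPSSPP_ARCH(AMD64) expands to PPSSPP_ARCH_AMD64.
#define PPSSPP_ARCH(PPSSPP_FEATURE) (PPSSPP_ARCH_##PPSSPP_FEATURE)
```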
@@ -197,8 +197,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	MOVUPS(MDisp(ESP, 16), XMM5);
 	MOVUPS(MDisp(ESP, 32), XMM6);
 	MOVUPS(MDisp(ESP, 48), XMM7);
+#if PPSSPP_ARCH(AMD64)
 	MOVUPS(MDisp(ESP, 64), XMM8);
 	MOVUPS(MDisp(ESP, 80), XMM9);
+#endif
 
 	bool prescaleStep = false;
 	// Look for prescaled texcoord steps
@@ -275,11 +277,13 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	MOVUPS(XMM5, MDisp(ESP, 16));
 	MOVUPS(XMM6, MDisp(ESP, 32));
 	MOVUPS(XMM7, MDisp(ESP, 48));
+#if PPSSPP_ARCH(AMD64)
 	MOVUPS(XMM8, MDisp(ESP, 64));
 	MOVUPS(XMM9, MDisp(ESP, 80));
+#endif
 	ADD(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
 
-#ifdef _M_IX86
+#if PPSSPP_ARCH(X86)
 	// Restore register values
 	POP(EBP);
 	POP(EBX);
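
Note: the point of the new guards is symmetry between prologue and epilogue. XMM8 through XMM15 only exist in 64-bit mode (their encodings need a REX prefix), so a 32-bit build must not emit the XMM8/XMM9 spills at all, and the #if around the stores has to match the #if around the loads. A condensed sketch of that invariant, using only emitter calls visible in this diff (the SUB reserving the stack space and the XMM4 slot at offset 0 are assumptions inferred from the ADD and the 16-byte stride shown above):

```cpp
// Prologue: reserve spill space, save the XMM regs the decode loop clobbers.
SUB(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));  // assumed mirror of the ADD below
MOVUPS(MDisp(ESP, 0), XMM4);                    // offset 0 assumed from the stride
MOVUPS(MDisp(ESP, 16), XMM5);
MOVUPS(MDisp(ESP, 32), XMM6);
MOVUPS(MDisp(ESP, 48), XMM7);
#if PPSSPP_ARCH(AMD64)
MOVUPS(MDisp(ESP, 64), XMM8);                   // 64-bit only: no XMM8+ on x86-32
MOVUPS(MDisp(ESP, 80), XMM9);
#endif

// ... emitted decode loop ...

// Epilogue: restore under the same guard, then release the stack space.
MOVUPS(XMM4, MDisp(ESP, 0));
MOVUPS(XMM5, MDisp(ESP, 16));
MOVUPS(XMM6, MDisp(ESP, 32));
MOVUPS(XMM7, MDisp(ESP, 48));
#if PPSSPP_ARCH(AMD64)
MOVUPS(XMM8, MDisp(ESP, 64));
MOVUPS(XMM9, MDisp(ESP, 80));
#endif
ADD(PTRBITS, R(ESP), Imm8(STACK_FIXED_ALLOC));
```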
@@ -466,7 +470,7 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
 
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 	if (dec_->nweights > 4) {
 		// This reads 8 bytes, we split the top 4 so we can expand each set of 4.
 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
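
Note: for more than four U8 weights, the emitted code pulls in all 8 source bytes with one MOVQ and widens them in two halves. A standalone SSE2-intrinsics stand-in for the same transform (hypothetical helper name; the 1/128 scale is an assumption based on how U8 fixed-point data is normalized elsewhere in this decoder):

```cpp
#include <cstdint>
#include <emmintrin.h>

// Hypothetical illustration, not the JIT emitter code: expand 8 u8 weights
// to 8 floats, mirroring the MOVQ load + two-half widening above.
static void ExpandWeightsU8(const uint8_t *src, float *dst) {
	const __m128i zero = _mm_setzero_si128();
	__m128i w8 = _mm_loadl_epi64((const __m128i *)src);  // one 8-byte load
	__m128i w16 = _mm_unpacklo_epi8(w8, zero);           // 8 x u16
	__m128i lo32 = _mm_unpacklo_epi16(w16, zero);        // weights 0-3 as u32
	__m128i hi32 = _mm_unpackhi_epi16(w16, zero);        // weights 4-7 as u32
	const __m128 scale = _mm_set1_ps(1.0f / 128.0f);     // assumed scale
	_mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_cvtepi32_ps(lo32), scale));
	_mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(hi32), scale));
}
```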
@@ -518,7 +522,7 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 
 	for (int j = 0; j < dec_->nweights; j++) {
 		X64Reg weight = XMM1;
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 		X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
 		if (j == 3 || j == dec_->nweights - 1) {
 			// In the previous iteration, we already spread this value to all lanes.
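
Note: inside the per-weight loop, each weight j is consumed as a four-lane broadcast so one multiply can scale an entire bone-matrix row; the comment above refers to that broadcast having happened one iteration early. A fixed-j intrinsics illustration of the lane spread (assumption: the emitter likely uses a SHUFPS-style shuffle with a mask built from j; this is not the verbatim emitted code):

```cpp
#include <xmmintrin.h>

// Broadcast lane 2 to all four lanes, i.e. what spreading weight j == 2
// across a register looks like with a compile-time shuffle mask.
static inline __m128 SpreadLane2(__m128 w) {
	return _mm_shuffle_ps(w, w, _MM_SHUFFLE(2, 2, 2, 2));
}
```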
@@ -576,7 +580,7 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 void VertexDecoderJitCache::Jit_WeightsU16Skin() {
 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
 
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 	if (dec_->nweights > 6) {
 		// Since this is probably not aligned, two MOVQs are better than one MOVDQU.
 		MOVQ_xmm(XMM8, MDisp(srcReg, dec_->weightoff));
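
Note: vertex data is usually not 16-byte aligned, which is why the comment above prefers two MOVQs over one MOVDQU on the CPUs this JIT targets. An intrinsics stand-in for the u16 weight path (hypothetical helper; the 1/32768 scale is an assumption by analogy with the by16384 constant at the top of the file):

```cpp
#include <cstdint>
#include <emmintrin.h>

// Hypothetical illustration: expand 8 u16 weights to floats using two 8-byte
// loads in place of a single unaligned 16-byte load.
static void ExpandWeightsU16(const uint16_t *src, float *dst) {
	const __m128i zero = _mm_setzero_si128();
	__m128i lo16 = _mm_loadl_epi64((const __m128i *)(src + 0));  // weights 0-3
	__m128i hi16 = _mm_loadl_epi64((const __m128i *)(src + 4));  // weights 4-7
	const __m128 scale = _mm_set1_ps(1.0f / 32768.0f);           // assumed scale
	_mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero)), scale));
	_mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero)), scale));
}
```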
@@ -632,7 +636,7 @@ void VertexDecoderJitCache::Jit_WeightsU16Skin() {
 
 	for (int j = 0; j < dec_->nweights; j++) {
 		X64Reg weight = XMM1;
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 		X64Reg weightSrc = j < 4 ? XMM8 : XMM9;
 		if (j == 3 || j == dec_->nweights - 1) {
 			// In the previous iteration, we already spread this value to all lanes.
@@ -730,7 +734,7 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() {
 }
 
 void VertexDecoderJitCache::Jit_TcFloat() {
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
 	MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 #else
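
Note: Jit_TcFloat just moves the two-float UV pair from the source vertex into the decoded vertex. On x64 a single 8-byte MOV through tempReg1 covers both floats; the 32-bit #else branch (truncated here) has to split the copy. The plain-C equivalent of what the emitted code does:

```cpp
#include <cstdint>
#include <cstring>

// Equivalent of the emitted copy: 8 bytes = two packed 32-bit floats (u, v).
// The offsets stand in for dec_->tcoff and dec_->decFmt.uvoff.
static inline void CopyTcFloat(const uint8_t *src, uint8_t *dst,
                               int tcoff, int uvoff) {
	std::memcpy(dst + uvoff, src + tcoff, 8);
}
```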
@@ -911,7 +915,7 @@ void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
 }
 
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
-#ifdef _M_X64
+#if PPSSPP_ARCH(AMD64)
 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
 	MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 #else