diff --git a/Common/ArmEmitter.cpp b/Common/ArmEmitter.cpp
index 97465269dc66..49ed28cc543a 100644
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@@ -1634,7 +1634,7 @@ void ARMXEmitter::VMOV_neon(u32 Size, ARMReg Vd, ARMReg Rt, int lane)
 		_assert_msg_(JIT, false, "VMOV_neon unsupported size");
 	}
 
-	if (Vd < S0 && Rt >= D0)
+	if (Vd < S0 && Rt >= D0 && Rt < Q0)
 	{
 		// Oh, reading to reg, our params are backwards.
 		ARMReg Src = Rt;
@@ -1645,7 +1645,7 @@ void ARMXEmitter::VMOV_neon(u32 Size, ARMReg Vd, ARMReg Rt, int lane)
 
 		Write32(condition | (0xE1 << 20) | U | (opc1 << 21) | EncodeVn(Src) | (Dest << 12) | (0xB << 8) | (opc2 << 5) | (1 << 4));
 	}
-	else if (Rt < S0 && Vd >= D0)
+	else if (Rt < S0 && Vd >= D0 && Vd < Q0)
 	{
 		ARMReg Src = Rt;
 		ARMReg Dest = Vd;
diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp
index 0cf79dd9ba8f..0efba2780abc 100644
--- a/GPU/GLES/VertexDecoderArm.cpp
+++ b/GPU/GLES/VertexDecoderArm.cpp
@@ -430,16 +430,13 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	if (NEONSkinning) {
 		// Weight is first so srcReg is correct.
 		switch (dec_->nweights) {
-		case 1: LDRB(scratchReg2, srcReg, 0); break;
-		case 2: LDRH(scratchReg2, srcReg, 0); break;
-		case 3:
-		case 4:
+		case 1: VLD1_lane(I_8, neonScratchReg, srcReg, 0, false); break;
+		case 2: VLD1_lane(I_16, neonScratchReg, srcReg, 0, false); break;
+		default:
+			// For 3, we over read, for over 4, we read more later.
 			VLD1_lane(I_32, neonScratchReg, srcReg, 0, false);
 			break;
 		}
-		if (dec_->nweights == 1 || dec_->nweights == 2) {
-			VMOV_neon(I_32, neonScratchReg, scratchReg2, 0);
-		}
 		// This can be represented as a constant.
 		VMOV_neon(F_32, Q3, by128);
 		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
@@ -450,16 +447,13 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 		if (dec_->nweights > 4) {
 			ADD(tempReg1, srcReg, 4 * sizeof(u8));
 			switch (dec_->nweights) {
-			case 5: LDRB(scratchReg2, tempReg1, 0); break;
-			case 6: LDRH(scratchReg2, tempReg1, 0); break;
+			case 5: VLD1_lane(I_8, neonScratchReg, tempReg1, 0, false); break;
+			case 6: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, false); break;
 			case 7:
 			case 8:
 				VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
 				break;
 			}
-			if (dec_->nweights == 5 || dec_->nweights == 6) {
-				VMOV_neon(I_32, neonScratchReg, scratchReg2, 0);
-			}
 			VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
 			VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
 			VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
@@ -480,13 +474,10 @@ void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 void VertexDecoderJitCache::Jit_WeightsU16Skin() {
 	if (NEONSkinning) {
 		switch (dec_->nweights) {
-		case 1:
-			LDRH(scratchReg, srcReg, 0);
-			VMOV_neon(I_32, neonScratchReg, scratchReg, 0);
-			break;
+		case 1: VLD1_lane(I_16, neonScratchReg, srcReg, 0, true); break;
 		case 2: VLD1_lane(I_32, neonScratchReg, srcReg, 0, false); break;
-		case 3:
-		case 4:
+		default:
+			// For 3, we over read, for over 4, we read more later.
 			VLD1(I_32, neonScratchReg, srcReg, 1, ALIGN_NONE);
 			break;
 		}
@@ -499,10 +490,7 @@ void VertexDecoderJitCache::Jit_WeightsU16Skin() {
 		if (dec_->nweights > 4) {
 			ADD(tempReg1, srcReg, 4 * sizeof(u16));
 			switch (dec_->nweights) {
-			case 5:
-				LDRH(scratchReg, tempReg1, 0);
-				VMOV_neon(I_32, neonScratchReg, scratchReg, 0);
-				break;
+			case 5: VLD1_lane(I_16, neonScratchReg, tempReg1, 0, true); break;
 			case 6: VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false); break;
 			case 7:
 			case 8:
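
Illustration only, not part of the patch: the rewritten u8 weight path loads 1 or 2 bytes straight into a NEON lane with VLD1_lane instead of bouncing through a GPR with LDRB/LDRH plus VMOV_neon, then widens and converts as before. A rough NEON-intrinsics sketch of that pattern is below; the helper name LoadWeightsU8 is hypothetical, and the 1/128 scale is assumed from the by128 constant the decoder loads into Q3.

#include <arm_neon.h>
#include <stdint.h>

// Sketch: widen up to four u8 weights to float, scaled by 1/128, using direct
// lane loads (no GPR round trip). For nweights == 3 this over-reads one byte,
// mirroring the "for 3, we over read" comment; the extra lane is simply unused.
// The pointer casts assume the source tolerates unaligned 16/32-bit loads,
// as the emitted VLD1_lane does with alignment disabled.
static inline float32x4_t LoadWeightsU8(const uint8_t *src, int nweights) {
	uint8x8_t v = vdup_n_u8(0);
	switch (nweights) {
	case 1:
		v = vld1_lane_u8(src, v, 0);
		break;
	case 2:
		v = vreinterpret_u8_u16(vld1_lane_u16((const uint16_t *)src, vreinterpret_u16_u8(v), 0));
		break;
	default:  // 3 or 4 weights: load 32 bits in one go.
		v = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)src, vreinterpret_u32_u8(v), 0));
		break;
	}
	uint16x8_t w16 = vmovl_u8(v);                   // u8  -> u16
	uint32x4_t w32 = vmovl_u16(vget_low_u16(w16));  // u16 -> u32
	return vmulq_n_f32(vcvtq_f32_u32(w32), 1.0f / 128.0f);
}

The JIT emits the same sequence directly (VLD1_lane, VMOVL, VMOVL, VCVT, presumably followed by a multiply against the by128 constant held in Q3), so the scratch GPR and the extra VMOV_neon transfer drop out entirely.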