diff --git a/CMakeLists.txt b/CMakeLists.txt
index caaef337810f..520ae2b0a945 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -737,6 +737,7 @@ add_library(Common STATIC
Common/Input/InputState.cpp
Common/Input/InputState.h
Common/Math/fast/fast_matrix.c
+ Common/Math/CrossSIMD.h
Common/Math/curves.cpp
Common/Math/curves.h
Common/Math/expression_parser.cpp
diff --git a/Common/Common.vcxproj b/Common/Common.vcxproj
index 93e91681b0cf..2b31119bd9cc 100644
--- a/Common/Common.vcxproj
+++ b/Common/Common.vcxproj
@@ -484,6 +484,7 @@
+    <ClInclude Include="Math\CrossSIMD.h" />
diff --git a/Common/Common.vcxproj.filters b/Common/Common.vcxproj.filters
index 73aefb9323bf..6d82c03c0417 100644
--- a/Common/Common.vcxproj.filters
+++ b/Common/Common.vcxproj.filters
@@ -518,6 +518,9 @@
GPU\Vulkan
+    <ClInclude Include="Math\CrossSIMD.h">
+      <Filter>Math</Filter>
+    </ClInclude>
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
new file mode 100644
index 000000000000..3eb8e0e75e2f
--- /dev/null
+++ b/Common/Math/CrossSIMD.h
@@ -0,0 +1,58 @@
+// CrossSIMD
+//
+// Compatibility wrappers for SIMD dialects.
+//
+// In the long run, might do a more general single-source-SIMD wrapper here consisting
+// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
+// our various color conversion functions and so on in a pretty generic manner.
+
+#include "ppsspp_config.h"
+
+#include <cstdint>
+
+#if PPSSPP_ARCH(SSE2)
+#include <emmintrin.h>
+#endif
+
+#if PPSSPP_ARCH(ARM_NEON)
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+// Basic types
+
+#if PPSSPP_ARCH(ARM64_NEON)
+
+// No special ones here.
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+// Compatibility wrappers making ARM64 NEON code run on ARM32
+// With optimization on, these should compile down to the optimal code.
+
+inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
+ switch (lane & 3) {
+ case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
+ case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
+ case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
+ default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
+ }
+}
+
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
+ switch (lane & 3) {
+ case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+ case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+ case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+ default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+ }
+}
+
+inline uint32x4_t vcgezq_f32(float32x4_t v) {
+ return vcgeq_f32(v, vdupq_n_f32(0.0f));
+}
+
+#endif
diff --git a/Common/UI/View.cpp b/Common/UI/View.cpp
index b60f6ac88de8..37f8e9ab02aa 100644
--- a/Common/UI/View.cpp
+++ b/Common/UI/View.cpp
@@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo
void CollapsibleHeader::Draw(UIContext &dc) {
Style style = dc.theme->itemStyle;
- style.background.color = 0;
if (HasFocus()) style = dc.theme->itemFocusedStyle;
if (down_) style = dc.theme->itemDownStyle;
if (!IsEnabled()) style = dc.theme->itemDisabledStyle;
diff --git a/Core/Config.cpp b/Core/Config.cpp
index fc4a2b913a06..eec83a23e87c 100644
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {
// Parse the string.
PlayTime gameTime{};
- if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
+ if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (unsigned long long *)&gameTime.lastTimePlayed)) {
tracker_[key] = gameTime;
}
}
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 8927a1c2a099..96a81b36b934 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -21,6 +21,7 @@
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Profiler/Profiler.h"
#include "Common/LogReporting.h"
+#include "Common/Math/CrossSIMD.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
@@ -197,15 +198,10 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex
// Gated by DIRTY_CULL_PLANES
void DrawEngineCommon::UpdatePlanes() {
- float world[16];
float view[16];
- float worldview[16];
- float worldviewproj[16];
- ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+ float viewproj[16];
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
- // TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
- Matrix4ByMatrix4(worldview, world, view);
- Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
+ Matrix4ByMatrix4(viewproj, view, gstate.projMatrix);
// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
// Note that the PSP does not clip against the viewport.
@@ -214,6 +210,9 @@ void DrawEngineCommon::UpdatePlanes() {
minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);
+ // Let's not handle these special cases in the fast culler.
+ offsetOutsideEdge_ = maxOffset_.x >= 4096.0f || minOffset_.x < 1.0f || minOffset_.y < 1.0f || maxOffset_.y >= 4096.0f;
+
// Now let's apply the viewport to our scissor/region + offset range.
Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
@@ -232,14 +231,14 @@ void DrawEngineCommon::UpdatePlanes() {
applyViewport.wy = -(maxViewport.y + minViewport.y) * viewportInvSize.y;
float mtx[16];
- Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);
-
- planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
- planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
- planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
- planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
- planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
- planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
+ Matrix4ByMatrix4(mtx, viewproj, applyViewport.m);
+ // I'm sure there's some fairly optimized way to set these.
+ planes_.Set(0, mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
+ planes_.Set(1, mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
+ planes_.Set(2, mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
+ planes_.Set(3, mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
+ planes_.Set(4, mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
+ planes_.Set(5, mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
}
// This code has plenty of potential for optimization.
@@ -262,7 +261,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
float *verts = (float *)(decoded_ + 65536 * 18);
- int vertStride = 3;
// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
// Let's always say objects are within bounds.
@@ -338,17 +336,23 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
}
break;
case GE_VTYPE_POS_FLOAT:
- // No need to copy in this case, we can just read directly from the source format with a stride.
- verts = (float *)((uint8_t *)vdata + offset);
- vertStride = stride / 4;
// Previous code:
- // for (int i = 0; i < vertexCount; i++)
- // memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
+ for (int i = 0; i < vertexCount; i++)
+ memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
break;
}
}
}
+ // Pretransform the verts in-place so we don't have to do it inside the loop.
+ // We do this differently in the fast version below since we skip the max/minOffset checks there
+ // making it easier to get the whole thing ready for SIMD.
+ for (int i = 0; i < vertexCount; i++) {
+ float worldpos[3];
+ Vec3ByMatrix43(worldpos, &verts[i * 3], gstate.worldMatrix);
+ memcpy(&verts[i * 3], worldpos, 12);
+ }
+
// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
for (int plane = 0; plane < totalPlanes; plane++) {
@@ -358,8 +362,8 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
// Test against the frustum planes, and count.
// TODO: We should test 4 vertices at a time using SIMD.
// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
- const float *pos = verts + i * vertStride;
- float value = planes_[plane].Test(pos);
+ const float *worldpos = verts + i * 3;
+ float value = planes_.Test(plane, worldpos);
if (value <= -FLT_EPSILON) // Not sure why we use exactly this value. Probably '< 0' would do.
out++;
else
@@ -388,6 +392,179 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
return true;
}
+// NOTE: This doesn't handle through-mode, indexing, morph, or skinning.
+bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
+ SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
+ float *verts = (float *)(decoded_ + 65536 * 18);
+
+ // Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
+ // Let's always say objects are within bounds.
+ if (gstate_c.Use(GPU_USE_VIRTUAL_REALITY))
+ return true;
+
+ // Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
+ // in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
+ if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
+ UpdatePlanes();
+ gpuStats.numPlaneUpdates++;
+ gstate_c.Clean(DIRTY_CULL_PLANES);
+ }
+
+ // Also let's just bail if offsetOutsideEdge_ is set, instead of handling the cases.
+ // NOTE: This is written to in UpdatePlanes so can't check it before.
+ if (offsetOutsideEdge_)
+ return true;
+
+ // Simple, most common case.
+ VertexDecoder *dec = GetVertexDecoder(vertType);
+ int stride = dec->VertexSize();
+ int offset = dec->posoff;
+ int vertStride = 3;
+
+ // TODO: Possibly do the plane tests directly against the source formats instead of converting.
+ switch (vertType & GE_VTYPE_POS_MASK) {
+ case GE_VTYPE_POS_8BIT:
+ for (int i = 0; i < vertexCount; i++) {
+ const s8 *data = (const s8 *)vdata + i * stride + offset;
+ for (int j = 0; j < 3; j++) {
+ verts[i * 3 + j] = data[j] * (1.0f / 128.0f);
+ }
+ }
+ break;
+ case GE_VTYPE_POS_16BIT:
+ {
+#if PPSSPP_ARCH(SSE2)
+ __m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
+ for (int i = 0; i < vertexCount; i++) {
+ const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+ __m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
+ // Sign extension. Hacky without SSE4.
+ bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
+ __m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
+ _mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
+ }
+#elif PPSSPP_ARCH(ARM_NEON)
+ for (int i = 0; i < vertexCount; i++) {
+ const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+ int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
+ float32x4_t pos = vcvtq_n_f32_s32(data, 15); // >> 15 = division by 32768.0f
+ vst1q_f32(verts + i * 3, pos);
+ }
+#else
+ for (int i = 0; i < vertexCount; i++) {
+ const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+ for (int j = 0; j < 3; j++) {
+ verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
+ }
+ }
+#endif
+ break;
+ }
+ case GE_VTYPE_POS_FLOAT:
+ // No need to copy in this case, we can just read directly from the source format with a stride.
+ verts = (float *)((uint8_t *)vdata + offset);
+ vertStride = stride / 4;
+ break;
+ }
+
+ // We only check the 4 sides. Near/far won't likely make a huge difference.
+ // We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
+ // for testing, don't want to re-do that, so we have to use that "pivot" of the data.
+#if PPSSPP_ARCH(SSE2)
+ const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
+ const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
+ const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
+ const __m128 worldW = _mm_loadu_ps(gstate.worldMatrix + 9);
+ const __m128 planeX = _mm_loadu_ps(planes_.x);
+ const __m128 planeY = _mm_loadu_ps(planes_.y);
+ const __m128 planeZ = _mm_loadu_ps(planes_.z);
+ const __m128 planeW = _mm_loadu_ps(planes_.w);
+ __m128 inside = _mm_set1_ps(0.0f);
+ for (int i = 0; i < vertexCount; i++) {
+ const float *pos = verts + i * vertStride;
+ __m128 worldpos = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps(worldX, _mm_set1_ps(pos[0])),
+ _mm_mul_ps(worldY, _mm_set1_ps(pos[1]))
+ ),
+ _mm_add_ps(
+ _mm_mul_ps(worldZ, _mm_set1_ps(pos[2])),
+ worldW
+ )
+ );
+ // OK, now we check it against the four planes.
+ // This is really curiously similar to a matrix multiplication (well, it is one).
+ __m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128 planeDist = _mm_add_ps(
+ _mm_add_ps(
+ _mm_mul_ps(planeX, posX),
+ _mm_mul_ps(planeY, posY)
+ ),
+ _mm_add_ps(
+ _mm_mul_ps(planeZ, posZ),
+ planeW
+ )
+ );
+ inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
+ }
+ // 0xF means that we found at least one vertex inside every one of the planes.
+ // We don't bother with counts, though it wouldn't be hard if we had a use for them.
+ return _mm_movemask_ps(inside) == 0xF;
+#elif PPSSPP_ARCH(ARM_NEON)
+ const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
+ const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
+ const float32x4_t worldZ = vld1q_f32(gstate.worldMatrix + 6);
+ const float32x4_t worldW = vld1q_f32(gstate.worldMatrix + 9);
+ const float32x4_t planeX = vld1q_f32(planes_.x);
+ const float32x4_t planeY = vld1q_f32(planes_.y);
+ const float32x4_t planeZ = vld1q_f32(planes_.z);
+ const float32x4_t planeW = vld1q_f32(planes_.w);
+ uint32x4_t inside = vdupq_n_u32(0);
+ for (int i = 0; i < vertexCount; i++) {
+ const float *pos = verts + i * vertStride;
+ float32x4_t objpos = vld1q_f32(pos);
+ float32x4_t worldpos = vaddq_f32(
+ vmlaq_laneq_f32(
+ vmulq_laneq_f32(worldX, objpos, 0),
+ worldY, objpos, 1),
+ vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
+ );
+ // OK, now we check it against the four planes.
+ // This is really curiously similar to a matrix multiplication (well, it is one).
+ float32x4_t planeDist = vaddq_f32(
+ vmlaq_laneq_f32(
+ vmulq_laneq_f32(planeX, worldpos, 0),
+ planeY, worldpos, 1),
+ vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
+ );
+ inside = vorrq_u32(inside, vcgezq_f32(planeDist));
+ }
+ uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
+ return ~insideBits == 0; // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
+#else
+ int inside[4]{};
+ for (int i = 0; i < vertexCount; i++) {
+ const float *pos = verts + i * vertStride;
+ float worldpos[3];
+ Vec3ByMatrix43(worldpos, pos, gstate.worldMatrix);
+ for (int plane = 0; plane < 4; plane++) {
+ float value = planes_.Test(plane, worldpos);
+ if (value >= 0.0f)
+ inside[plane]++;
+ }
+ }
+
+ for (int plane = 0; plane < 4; plane++) {
+ if (inside[plane] == 0) {
+ return false;
+ }
+ }
+#endif
+ return true;
+}
+
// TODO: This probably is not the best interface.
bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<SimpleVertex> &vertices, std::vector<u16> &indices) {
// This is always for the current vertices.
@@ -670,6 +847,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
return cmd - start;
}
+void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
+ if (!indexGen.PrimCompatible(prevPrim_, prim)) {
+ DispatchFlush();
+ }
+
+ // This isn't exactly right, if we flushed, since prims can straddle previous calls.
+ // But it generally works for common usage.
+ if (prim == GE_PRIM_KEEP_PREVIOUS) {
+ // Has to be set to something, let's assume POINTS (0) if no previous.
+ if (prevPrim_ == GE_PRIM_INVALID)
+ prevPrim_ = GE_PRIM_POINTS;
+ prim = prevPrim_;
+ } else {
+ prevPrim_ = prim;
+ }
+
+ // If vtype has changed, setup the vertex decoder.
+ if (vertTypeID != lastVType_ || !dec_) {
+ dec_ = GetVertexDecoder(vertTypeID);
+ lastVType_ = vertTypeID;
+ }
+
+ *bytesRead = vertexCount * dec_->VertexSize();
+}
+
// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index e9e8870ef33d..147a1178bd56 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -69,11 +69,11 @@ class TessellationDataTransfer {
virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
};
-// Culling plane.
-struct Plane {
- float x, y, z, w;
- void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
- float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; }
+// Culling plane, group of 8.
+struct alignas(16) Plane8 {
+ float x[8], y[8], z[8], w[8];
+ void Set(int i, float _x, float _y, float _z, float _w) { x[i] = _x; y[i] = _y; z[i] = _z; w[i] = _w; }
+ float Test(int i, const float f[3]) const { return x[i] * f[0] + y[i] * f[1] + z[i] * f[2] + w[i]; }
};
class DrawEngineCommon {
@@ -104,6 +104,10 @@ class DrawEngineCommon {
bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);
+ // This is a less accurate version of TestBoundingBox, but faster. Can have more false positives.
+ // Doesn't support indexing.
+ bool TestBoundingBoxFast(const void *control_points, int vertexCount, u32 vertType);
+
void FlushSkin() {
bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
if (applySkin) {
@@ -113,6 +117,8 @@ class DrawEngineCommon {
int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
+ void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
+
template <class Surface>
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
void ClearSplineBezierWeights();
@@ -287,7 +293,8 @@ class DrawEngineCommon {
TessellationDataTransfer *tessDataTransfer;
// Culling
- Plane planes_[6];
+ Plane8 planes_;
Vec2f minOffset_;
Vec2f maxOffset_;
+ bool offsetOutsideEdge_;
};
diff --git a/GPU/GPU.h b/GPU/GPU.h
index 7d4d4d1c0a07..f2edbdc1f624 100644
--- a/GPU/GPU.h
+++ b/GPU/GPU.h
@@ -76,6 +76,7 @@ struct GPUStatistics {
void ResetFrame() {
numDrawCalls = 0;
numVertexDecodes = 0;
+ numCulledDraws = 0;
numDrawSyncs = 0;
numListSyncs = 0;
numVertsSubmitted = 0;
@@ -111,6 +112,7 @@ struct GPUStatistics {
// Per frame statistics
int numDrawCalls;
int numVertexDecodes;
+ int numCulledDraws;
int numDrawSyncs;
int numListSyncs;
int numFlushes;
diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
index f961880d8a42..66906f4b1b47 100644
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@@ -989,9 +989,45 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
int cullMode = gstate.getCullMode();
uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
- if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+
+#define MAX_CULL_CHECK_COUNT 6
+
+// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V.
+#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2)
+
+#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
+
+#else
+
+#define PASSES_CULLING true
+
+#endif
+
+ // If certain conditions are true, do frustum culling.
+ bool passCulling = PASSES_CULLING;
+ if (!passCulling) {
+ // Do software culling.
+ if (drawEngineCommon_->TestBoundingBoxFast(verts, count, vertexType)) {
+ passCulling = true;
+ } else {
+ gpuStats.numCulledDraws++;
+ }
+ }
+
+ // If the first one in a batch passes, let's assume the whole batch passes.
+ // Cuts down on checking, while not losing that much efficiency.
+ bool onePassed = false;
+ if (passCulling) {
+ if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+ canExtend = false;
+ }
+ onePassed = true;
+ } else {
+ // Still need to advance bytesRead.
+ drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
canExtend = false;
}
+
// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
// Some games rely on this, they don't bother reloading VADDR and IADDR.
// The VADDR/IADDR registers are NOT updated.
@@ -1027,7 +1063,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
if (canExtend) {
// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
- // are consecutive in memory.
+ // are consecutive in memory. We also ignore culling here.
_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
if (!commandsExecuted) {
@@ -1047,7 +1083,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
// We can extend again after submitting a normal draw.
canExtend = isTriangle;
}
- if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+
+ bool passCulling = onePassed || PASSES_CULLING;
+ if (!passCulling) {
+ // Do software culling.
+ if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
+ passCulling = true;
+ } else {
+ gpuStats.numCulledDraws++;
+ }
+ }
+ if (passCulling) {
+ if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+ canExtend = false;
+ }
+ // As soon as one passes, assume we don't need to check the rest of this batch.
+ onePassed = true;
+ } else {
+ // Still need to advance bytesRead.
+ drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
canExtend = false;
}
AdvanceVerts(vertexType, count, bytesRead);
@@ -1412,7 +1466,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) {
if (dst[i] != newVal) {
Flush();
dst[i] = newVal;
- gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+ gstate_c.Dirty(DIRTY_WORLDMATRIX);
}
if (++i >= end) {
break;
@@ -1435,7 +1489,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) {
if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
Flush();
((u32 *)gstate.worldMatrix)[num] = newVal;
- gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+ gstate_c.Dirty(DIRTY_WORLDMATRIX);
}
num++;
gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
@@ -1691,7 +1745,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
return snprintf(buffer, size,
"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
- "Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
+ "Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
"Vertices: %d drawn: %d\n"
"FBOs active: %d (evaluations: %d)\n"
"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@@ -1705,6 +1759,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numListSyncs,
gpuStats.numDrawCalls,
gpuStats.numVertexDecodes,
+ gpuStats.numCulledDraws,
gpuStats.numFlushes,
gpuStats.numClears,
gpuStats.numBBOXJumps,
diff --git a/UWP/CommonUWP/CommonUWP.vcxproj b/UWP/CommonUWP/CommonUWP.vcxproj
index b4d9a8937551..b27cbdbb06be 100644
--- a/UWP/CommonUWP/CommonUWP.vcxproj
+++ b/UWP/CommonUWP/CommonUWP.vcxproj
@@ -105,6 +105,7 @@
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
diff --git a/UWP/CommonUWP/CommonUWP.vcxproj.filters b/UWP/CommonUWP/CommonUWP.vcxproj.filters
index 2eedf9e8c653..262e1e7af3b7 100644
--- a/UWP/CommonUWP/CommonUWP.vcxproj.filters
+++ b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@@ -862,6 +862,9 @@
ext\naett
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h">
+      <Filter>Math</Filter>
+    </ClInclude>
diff --git a/ppsspp_config.h b/ppsspp_config.h
index 2861b621b333..71e7b9c9466d 100644
--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@@ -11,6 +11,7 @@
#if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__)
#define PPSSPP_ARCH_X86 1
#define PPSSPP_ARCH_32BIT 1
+ #define PPSSPP_ARCH_SSE2 1
//TODO: Remove this compat define
#ifndef _M_IX86
#define _M_IX86 600
@@ -19,6 +20,7 @@
#if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__)
#define PPSSPP_ARCH_AMD64 1
+ #define PPSSPP_ARCH_SSE2 1
#if defined(__ILP32__)
#define PPSSPP_ARCH_32BIT 1
#else