Moved to NEON optimized memcpy usage.

Rinnegatamante · Jun 23, 2020 · 0c75f27 · 0c75f27
1 parent 9895189
commit 0c75f27
Show file tree

Hide file tree

Showing 12 changed files with 227 additions and 82 deletions.
diff --git a/Makefile b/Makefile
@@ -7,9 +7,10 @@ SOURCES += source/hacks
 endif
 
 CFILES   := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c))
+ASMFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.S))
 CGFILES  := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg))
 HEADERS  := $(CGFILES:.cg=.h)
-OBJS     := $(CFILES:.c=.o)
+OBJS     := $(CFILES:.c=.o) $(ASMFILES:.S=.o)
 
 PREFIX  = arm-vita-eabi
 CC      = $(PREFIX)-gcc

diff --git a/source/custom_shaders.c b/source/custom_shaders.c
@@ -159,7 +159,7 @@ void glShaderBinary(GLsizei count, const GLuint *handles, GLenum binaryFormat, c
 
 	// Allocating compiled shader on RAM and registering it into sceGxmShaderPatcher
 	s->prog = (SceGxmProgram *)malloc(length);
-	memcpy((void *)s->prog, binary, length);
+	memcpy_neon((void *)s->prog, binary, length);
 	sceGxmShaderPatcherRegisterProgram(gxm_shader_patcher, s->prog, &s->id);
 	s->prog = sceGxmShaderPatcherGetProgramFromId(s->id);
 }
@@ -449,13 +449,13 @@ void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean nor
 
 	// Copying passed data to vitaGL mempool
 	if (stride == 0)
-		memcpy(ptr, pointer, count * bpe * size); // Faster if stride == 0
+		memcpy_neon(ptr, pointer, count * bpe * size); // Faster if stride == 0
 	else {
 		int i;
 		uint8_t *dst = (uint8_t *)ptr;
 		uint8_t *src = (uint8_t *)pointer;
 		for (i = 0; i < count; i++) {
-			memcpy(dst, src, bpe * size);
+			memcpy_neon(dst, src, bpe * size);
 			dst += (bpe * size);
 			src += stride;
 		}

diff --git a/source/hacks/memcpy_neon.S b/source/hacks/memcpy_neon.S
@@ -0,0 +1,140 @@
+/*
+ * NEON code contributed by Siarhei Siamashka <[email protected]>.
+ * Origin: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License.
+ *
+ * Tweaked for Android by Jim Huang <[email protected]>
+ */
+
+.arm
+.fpu neon
+
+@ void *memcpy_neon(void *destination, const void *source, size_t num)
+.global memcpy_neon
+.type memcpy_neon, %function
+/*
+ * ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
+ * of unaligned load/store memory accesses supported since ARMv6. This
+ * will further improve performance, but can purely theoretically cause
+ * problems if somebody decides to set SCTLR.A bit in the OS kernel
+ * (to trap each unaligned memory access) or somehow mess with strongly
+ * ordered/device memory.
+ */
+#define ENABLE_UNALIGNED_MEM_ACCESSES 1
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+
+.align 4
+memcpy_neon:	@ r0 = destination, r1 = source, r2 = byte count (see prototype above)
+	.fnstart
+		mov	ip, r0	@ ip is the working destination pointer, r0 is never written again
+		cmp	r2, #16
+		blt     4f	@ Have less than 16 bytes to copy
+
+		@ First ensure 16 byte alignment for the destination buffer
+		tst	r0, #0xF
+		beq	2f	@ destination already 16 byte aligned, skip the alignment steps
+		tst	r0, #1	@ odd destination address: move a single byte
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+		subne	r2, r2, #1
+		tst	ip, #2	@ bit 1 of destination set: move two bytes
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		ldrneh	r3, [r1], #2	@ source alignment is not checked, so this load may be unaligned
+		strneh	r3, [ip], #2
+#else
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+#endif
+		subne	r2, r2, #2	@ still conditional on the test of bit 1 above
+
+		tst	ip, #4	@ bit 2 of destination set: move four bytes
+		beq	1f
+		vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	@ four bytes, one into lane 0 of each of d0-d3
+		vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!	@ bits 0 and 1 of ip are clear by now
+		sub	r2, r2, #4
+1:
+		tst	ip, #8	@ bit 3 of destination set: move eight bytes
+		beq	2f
+		vld1.8	{d0}, [r1]!
+		vst1.8	{d0}, [ip, :64]!
+		sub	r2, r2, #8
+2:
+		subs	r2, r2, #32	@ from here on r2 holds the remaining byte count minus 32
+		blt	3f	@ fewer than 32 bytes left, go straight to the tail
+		mov	r3, #32	@ r3 is the current prefetch distance in bytes
+
+		@ Main copy loop, 32 bytes are processed per iteration.
+		@ ARM instructions are used for doing fine-grained prefetch,
+		@ increasing prefetch distance progressively up to
+		@ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+		vld1.8	{d0-d3}, [r1]!
+		cmp	r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)	@ flags are consumed by the addle below
+		pld	[r1, r3]
+		addle	r3, r3, #32	@ grow the prefetch distance until it reaches the maximum
+		vst1.8	{d0-d3}, [ip, :128]!
+		sub	r2, r2, #32
+		cmp	r2, r3
+		bge	1b	@ loop while the biased count in r2 is at least the prefetch distance
+		cmp	r2, #0
+		blt	3f	@ less than one full 32 byte block left
+1:		@ Copy the remaining part of the buffer (already prefetched)
+		vld1.8	{d0-d3}, [r1]!
+		subs	r2, r2, #32
+		vst1.8	{d0-d3}, [ip, :128]!
+		bge	1b	@ branches on the flags set by the subs above
+3:		@ Copy up to 31 remaining bytes
+		tst	r2, #16	@ low five bits of r2 are unaffected by the minus 32 bias
+		beq	4f
+		vld1.8	{d0, d1}, [r1]!
+		vst1.8	{d0, d1}, [ip, :128]!
+4:
+		@ Use ARM instructions exclusively for the final trailing part
+		@ not fully fitting into full 16 byte aligned block in order
+		@ to avoid "ARM store after NEON store" hazard. Also NEON
+		@ pipeline will be (mostly) flushed by the time when the
+		@ control returns to the caller, making the use of NEON mostly
+		@ transparent (and avoiding hazards in the caller code)
+
+#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
+		movs	r3, r2, lsl #29	@ C = bit 3 of r2 (8 bytes pending), N = bit 2 of r2 (4 bytes pending)
+		ldrcs	r3, [r1], #4
+		strcs	r3, [ip], #4
+		ldrcs	r3, [r1], #4
+		strcs	r3, [ip], #4
+		ldrmi	r3, [r1], #4
+		strmi	r3, [ip], #4
+		movs	r2, r2, lsl #31	@ C = bit 1 of r2 (2 bytes pending), N = bit 0 of r2 (1 byte pending)
+		ldrcsh	r3, [r1], #2
+		strcsh	r3, [ip], #2
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#else
+		movs	r3, r2, lsl #29	@ C = bit 3 of r2 (8 bytes pending), N = bit 2 of r2 (4 bytes pending)
+		bcc	1f	@ no 8 byte chunk pending
+	.rept	8
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+	.endr
+1:
+		bpl	1f	@ no 4 byte chunk pending
+	.rept	4
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+	.endr
+1:
+		movs	r2, r2, lsl #31	@ C = bit 1 of r2 (2 bytes pending), N = bit 0 of r2 (1 byte pending)
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+#endif
+		bx	lr	@ return, r0 still holds the original destination pointer
+	.fnend
diff --git a/source/legacy.c b/source/legacy.c
@@ -107,7 +107,7 @@ void glVertex3f(GLfloat x, GLfloat y, GLfloat z) {
 	last_vert->v.x = x;
 	last_vert->v.y = y;
 	last_vert->v.z = z;
-	memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
+	memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
 	last_clr->next = last_vert->next = NULL;
 
 	// Increasing vertex counter
@@ -135,8 +135,8 @@ void glVertex3fv(const GLfloat *v) {
 	}
 
 	// Properly populating the new element
-	memcpy(&last_vert->v, v, sizeof(vector3f));
-	memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
+	memcpy_neon(&last_vert->v, v, sizeof(vector3f));
+	memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
 	last_clr->next = last_vert->next = NULL;
 
 	// Increasing vertex counter
@@ -157,7 +157,7 @@ void glColor3f(GLfloat red, GLfloat green, GLfloat blue) {
 
 void glColor3fv(const GLfloat *v) {
 	// Setting current color value
-	memcpy(&current_color.r, v, sizeof(vector3f));
+	memcpy_neon(&current_color.r, v, sizeof(vector3f));
 	current_color.a = 1.0f;
 }
 
@@ -187,7 +187,7 @@ void glColor4f(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) {
 
 void glColor4fv(const GLfloat *v) {
 	// Setting current color value
-	memcpy(&current_color.r, v, sizeof(vector4f));
+	memcpy_neon(&current_color.r, v, sizeof(vector4f));
 }
 
 void glColor4ub(GLubyte red, GLubyte green, GLubyte blue, GLubyte alpha) {
@@ -309,7 +309,7 @@ void glArrayElement(GLint i) {
 		last_clr->next = NULL;
 
 		// Populating new vertex element
-		memcpy(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);
+		memcpy_neon(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);
 
 		// Checking if current texture unit has GL_COLOR_ARRAY enabled
 		if (tex_unit->color_array_state) {
@@ -322,11 +322,11 @@ void glArrayElement(GLint i) {
 
 			// Populating new color element
 			last_clr->v.a = 1.0f;
-			memcpy(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);
+			memcpy_neon(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);
 
 		} else {
 			// Populating new color element with current color
-			memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
+			memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
 		}
 
 		// Checking if current texture unit has GL_TEXTURE_COORD_ARRAY enabled
@@ -347,7 +347,7 @@ void glArrayElement(GLint i) {
 			}
 
 			// Populating new texcoord element
-			memcpy(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
+			memcpy_neon(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
 			last_uv->next = NULL;
 		}
 	}
@@ -498,8 +498,8 @@ void glEnd(void) {
 			memset(vertices, 0, (vertex_count * sizeof(vector3f)));
 			indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
 			for (i = 0; i < vertex_count; i++) {
-				memcpy(&vertices[n], &object->v, sizeof(vector3f));
-				memcpy(&uv_map[n], &object_uv->v, sizeof(vector2f));
+				memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
+				memcpy_neon(&uv_map[n], &object_uv->v, sizeof(vector2f));
 				indices[n] = n;
 				object = object->next;
 				object_uv = object_uv->next;
@@ -522,8 +522,8 @@ void glEnd(void) {
 				indices[i * 6 + 5] = i * 4 + 3;
 			}
 			for (j = 0; j < vertex_count; j++) {
-				memcpy(&vertices[j], &object->v, sizeof(vector3f));
-				memcpy(&uv_map[j], &object_uv->v, sizeof(vector2f));
+				memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
+				memcpy_neon(&uv_map[j], &object_uv->v, sizeof(vector2f));
 				object = object->next;
 				object_uv = object_uv->next;
 			}
@@ -554,8 +554,8 @@ void glEnd(void) {
 			memset(vertices, 0, (vertex_count * sizeof(vector3f)));
 			indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
 			for (i = 0; i < vertex_count; i++) {
-				memcpy(&vertices[n], &object->v, sizeof(vector3f));
-				memcpy(&colors[n], &object_clr->v, sizeof(vector4f));
+				memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
+				memcpy_neon(&colors[n], &object_clr->v, sizeof(vector4f));
 				indices[n] = n;
 				object = object->next;
 				object_clr = object_clr->next;
@@ -579,8 +579,8 @@ void glEnd(void) {
 				indices[i * 6 + 5] = i * 4 + 3;
 			}
 			for (j = 0; j < vertex_count; j++) {
-				memcpy(&vertices[j], &object->v, sizeof(vector3f));
-				memcpy(&colors[j], &object_clr->v, sizeof(vector4f));
+				memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
+				memcpy_neon(&colors[j], &object_clr->v, sizeof(vector4f));
 				object = object->next;
 				object_clr = object_clr->next;
 			}

diff --git a/source/misc.c b/source/misc.c
@@ -504,7 +504,7 @@ void glFogfv(GLenum pname, const GLfloat *params) {
 		fog_far = params[0];
 		break;
 	case GL_FOG_COLOR:
-		memcpy(&fog_color.r, params, sizeof(vector4f));
+		memcpy_neon(&fog_color.r, params, sizeof(vector4f));
 		break;
 	default:
 		vgl_error = GL_INVALID_ENUM;
@@ -545,7 +545,7 @@ void glClipPlane(GLenum plane, const GLdouble *equation) {
 		matrix4x4_transpose(inverted_transposed, inverted);
 		vector4f temp;
 		vector4f_matrix4x4_mult(&temp, inverted_transposed, &clip_plane0_eq);
-		memcpy(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
+		memcpy_neon(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
 		break;
 	default:
 		vgl_error = GL_INVALID_ENUM;