Skip to content

Commit

Permalink
Moved to NEON optimized memcpy usage.
Browse files Browse the repository at this point in the history
  • Loading branch information
Rinnegatamante committed Jun 23, 2020
1 parent 9895189 commit 0c75f27
Show file tree
Hide file tree
Showing 12 changed files with 227 additions and 82 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ SOURCES += source/hacks
endif

CFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c))
ASMFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.S))
CGFILES := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg))
HEADERS := $(CGFILES:.cg=.h)
OBJS := $(CFILES:.c=.o)
OBJS := $(CFILES:.c=.o) $(ASMFILES:.S=.o)

PREFIX = arm-vita-eabi
CC = $(PREFIX)-gcc
Expand Down
6 changes: 3 additions & 3 deletions source/custom_shaders.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void glShaderBinary(GLsizei count, const GLuint *handles, GLenum binaryFormat, c

// Allocating compiled shader on RAM and registering it into sceGxmShaderPatcher
s->prog = (SceGxmProgram *)malloc(length);
memcpy((void *)s->prog, binary, length);
memcpy_neon((void *)s->prog, binary, length);
sceGxmShaderPatcherRegisterProgram(gxm_shader_patcher, s->prog, &s->id);
s->prog = sceGxmShaderPatcherGetProgramFromId(s->id);
}
Expand Down Expand Up @@ -449,13 +449,13 @@ void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean nor

// Copying passed data to vitaGL mempool
if (stride == 0)
memcpy(ptr, pointer, count * bpe * size); // Faster if stride == 0
memcpy_neon(ptr, pointer, count * bpe * size); // Faster if stride == 0
else {
int i;
uint8_t *dst = (uint8_t *)ptr;
uint8_t *src = (uint8_t *)pointer;
for (i = 0; i < count; i++) {
memcpy(dst, src, bpe * size);
memcpy_neon(dst, src, bpe * size);
dst += (bpe * size);
src += stride;
}
Expand Down
140 changes: 140 additions & 0 deletions source/hacks/memcpy_neon.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* NEON code contributed by Siarhei Siamashka <[email protected]>.
* Origin: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html
*
* The GNU C Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License.
*
* Tweaked for Android by Jim Huang <[email protected]>
*/

.arm
.fpu neon

/*
 * void *memcpy_neon(void *destination, const void *source, size_t num)
 *
 * Forward (ascending address) byte copy using NEON for the bulk of the
 * data and plain ARM loads/stores for the final sub-16-byte tail.
 *
 * In:    r0 = destination, r1 = source, r2 = num (byte count)
 * Out:   r0 = destination (r0 is never written; ip is the moving
 *        destination pointer)
 * Clobb: r1, r2, r3, ip, d0-d3, condition flags
 * Stack: not used
 *
 * Structure:
 *   head  - copy 1/2/4/8 bytes as needed until the destination is 16 byte
 *           aligned (only when num >= 16)
 *   loop  - 32 bytes per iteration with a prefetch distance that ramps up
 *   drain - 32 bytes per iteration without issuing further prefetches
 *   tail  - one optional 16 byte NEON block, then 8/4/2/1 bytes using ARM
 *           instructions
 *
 * NOTE(review): every size test uses a signed condition (blt/bge). A count
 * of 2^31 or more is therefore seen as negative by the first compare and
 * only the low four bits of num (num & 15 bytes) are copied.
 * NOTE(review): there is no handling of overlapping buffers; the copy
 * always proceeds from low to high addresses.
 */
.global memcpy_neon
.type memcpy_neon, %function
/*
* ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use
* of unaligned load/store memory accesses supported since ARMv6. This
* will further improve performance, but can purely theoretically cause
* problems if somebody decides to set SCTLR.A bit in the OS kernel
* (to trap each unaligned memory access) or somehow mess with strongly
* ordered/device memory.
*/
/* Defined here, so every #ifdef below assembles its first branch and the */
/* byte-wise #else branches are left out of the build.                    */
#define ENABLE_UNALIGNED_MEM_ACCESSES 1

/* Upper bound, in bytes, of the prefetch offset used by the main loop. */
#define NEON_MAX_PREFETCH_DISTANCE 320

.align 4
memcpy_neon:
.fnstart
@ ip becomes the write cursor so that r0 survives as the return value
mov ip, r0
cmp r2, #16
blt 4f @ Have less than 16 bytes to copy

@ First ensure 16 byte alignment for the destination buffer
@ (at most 1 + 2 + 4 + 8 = 15 bytes are consumed here, and r2 >= 16 on
@ entry to this section, so r2 cannot go negative)
tst r0, #0xF
beq 2f
@ Destination bit 0 set: move one byte
tst r0, #1
ldrneb r3, [r1], #1
strneb r3, [ip], #1
subne r2, r2, #1
@ Destination bit 1 set: move two bytes. The flags from this tst are
@ still live at the subne after the #endif, because the conditional
@ loads and stores in between do not modify them.
tst ip, #2
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
@ The source halfword may be unaligned here
ldrneh r3, [r1], #2
strneh r3, [ip], #2
#else
ldrneb r3, [r1], #1
strneb r3, [ip], #1
ldrneb r3, [r1], #1
strneb r3, [ip], #1
#endif
subne r2, r2, #2

@ Destination bit 2 set: move four bytes. The single-lane vld4.8/vst4.8
@ pair transfers four consecutive bytes through lane 0 of d0-d3; the :32
@ qualifier on the store relies on ip being 4 byte aligned at this point.
tst ip, #4
beq 1f
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
sub r2, r2, #4
1:
@ Destination bit 3 set: move eight bytes through d0
tst ip, #8
beq 2f
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [ip, :64]!
sub r2, r2, #8
2:
@ From here on r2 holds (bytes remaining - 32). The bias leaves the low
@ five bits of r2 equal to (bytes remaining mod 32), which the tail code
@ at labels 3 and 4 depends on.
subs r2, r2, #32
blt 3f
@ r3 = current prefetch offset in bytes, starting at 32
mov r3, #32

@ Main copy loop, 32 bytes are processed per iteration.
@ ARM instructions are used for doing fine-grained prefetch,
@ increasing prefetch distance progressively up to
@ NEON_MAX_PREFETCH_DISTANCE at runtime
1:
vld1.8 {d0-d3}, [r1]!
@ The flags set by this cmp are consumed by the addle two lines below;
@ r3 grows by 32 per iteration until it reaches the maximum distance
cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
pld [r1, r3]
addle r3, r3, #32
vst1.8 {d0-d3}, [ip, :128]!
sub r2, r2, #32
@ Keep prefetching only while the biased remainder is at least r3
cmp r2, r3
bge 1b
@ A negative biased remainder means fewer than 32 bytes are left
cmp r2, #0
blt 3f
1: @ Copy the remaining part of the buffer (already prefetched)
vld1.8 {d0-d3}, [r1]!
subs r2, r2, #32
vst1.8 {d0-d3}, [ip, :128]!
bge 1b
3: @ Copy up to 31 remaining bytes
@ Bit 4 of r2 set: one more 16 byte block
tst r2, #16
beq 4f
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [ip, :128]!
4:
@ Use ARM instructions exclusively for the final trailing part
@ not fully fitting into full 16 byte aligned block in order
@ to avoid "ARM store after NEON store" hazard. Also NEON
@ pipeline will be (mostly) flushed by the time when the
@ control returns to the caller, making the use of NEON mostly
@ transparent (and avoiding hazards in the caller code)

@ Only the low four bits of r2 matter from here on. This label is also
@ the direct target of the entry check for num < 16, in which case r2 is
@ the unbiased count.
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES
@ lsl by 29 moves bit 3 of r2 into C and bit 2 into N:
@ C set -> copy 8 bytes (two words), N set -> copy 4 bytes.
@ r3 is reused as the data register; the loads leave the flags intact.
movs r3, r2, lsl #29
ldrcs r3, [r1], #4
strcs r3, [ip], #4
ldrcs r3, [r1], #4
strcs r3, [ip], #4
ldrmi r3, [r1], #4
strmi r3, [ip], #4
@ lsl by 31 moves bit 1 of r2 into C and bit 0 into N:
@ C set -> copy 2 bytes, N set -> copy 1 byte
movs r2, r2, lsl #31
ldrcsh r3, [r1], #2
strcsh r3, [ip], #2
ldrmib r3, [r1], #1
strmib r3, [ip], #1
#else
@ Same bit tests as the branch above, but every transfer is done one
@ byte at a time so that no unaligned word or halfword access is issued
movs r3, r2, lsl #29
bcc 1f
.rept 8
ldrcsb r3, [r1], #1
strcsb r3, [ip], #1
.endr
1:
bpl 1f
.rept 4
ldrmib r3, [r1], #1
strmib r3, [ip], #1
.endr
1:
movs r2, r2, lsl #31
ldrcsb r3, [r1], #1
strcsb r3, [ip], #1
ldrcsb r3, [r1], #1
strcsb r3, [ip], #1
ldrmib r3, [r1], #1
strmib r3, [ip], #1
#endif
@ r0 still holds the original destination pointer
bx lr
.fnend
34 changes: 17 additions & 17 deletions source/legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ void glVertex3f(GLfloat x, GLfloat y, GLfloat z) {
last_vert->v.x = x;
last_vert->v.y = y;
last_vert->v.z = z;
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
last_clr->next = last_vert->next = NULL;

// Increasing vertex counter
Expand Down Expand Up @@ -135,8 +135,8 @@ void glVertex3fv(const GLfloat *v) {
}

// Properly populating the new element
memcpy(&last_vert->v, v, sizeof(vector3f));
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
memcpy_neon(&last_vert->v, v, sizeof(vector3f));
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
last_clr->next = last_vert->next = NULL;

// Increasing vertex counter
Expand All @@ -157,7 +157,7 @@ void glColor3f(GLfloat red, GLfloat green, GLfloat blue) {

void glColor3fv(const GLfloat *v) {
// Setting current color value
memcpy(&current_color.r, v, sizeof(vector3f));
memcpy_neon(&current_color.r, v, sizeof(vector3f));
current_color.a = 1.0f;
}

Expand Down Expand Up @@ -187,7 +187,7 @@ void glColor4f(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha) {

void glColor4fv(const GLfloat *v) {
// Setting current color value
memcpy(&current_color.r, v, sizeof(vector4f));
memcpy_neon(&current_color.r, v, sizeof(vector4f));
}

void glColor4ub(GLubyte red, GLubyte green, GLubyte blue, GLubyte alpha) {
Expand Down Expand Up @@ -309,7 +309,7 @@ void glArrayElement(GLint i) {
last_clr->next = NULL;

// Populating new vertex element
memcpy(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);
memcpy_neon(&last_vert->v, ptr, tex_unit->vertex_array.size * tex_unit->vertex_array.num);

// Checking if current texture unit has GL_COLOR_ARRAY enabled
if (tex_unit->color_array_state) {
Expand All @@ -322,11 +322,11 @@ void glArrayElement(GLint i) {

// Populating new color element
last_clr->v.a = 1.0f;
memcpy(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);
memcpy_neon(&last_clr->v, ptr_clr, tex_unit->color_array.size * tex_unit->color_array.num);

} else {
// Populating new color element with current color
memcpy(&last_clr->v, &current_color.r, sizeof(vector4f));
memcpy_neon(&last_clr->v, &current_color.r, sizeof(vector4f));
}

// Checking if current texture unit has GL_TEXTURE_COORD_ARRAY enabled
Expand All @@ -347,7 +347,7 @@ void glArrayElement(GLint i) {
}

// Populating new texcoord element
memcpy(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
memcpy_neon(&last_uv->v, ptr_tex, tex_unit->vertex_array.size * 2);
last_uv->next = NULL;
}
}
Expand Down Expand Up @@ -498,8 +498,8 @@ void glEnd(void) {
memset(vertices, 0, (vertex_count * sizeof(vector3f)));
indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
for (i = 0; i < vertex_count; i++) {
memcpy(&vertices[n], &object->v, sizeof(vector3f));
memcpy(&uv_map[n], &object_uv->v, sizeof(vector2f));
memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
memcpy_neon(&uv_map[n], &object_uv->v, sizeof(vector2f));
indices[n] = n;
object = object->next;
object_uv = object_uv->next;
Expand All @@ -522,8 +522,8 @@ void glEnd(void) {
indices[i * 6 + 5] = i * 4 + 3;
}
for (j = 0; j < vertex_count; j++) {
memcpy(&vertices[j], &object->v, sizeof(vector3f));
memcpy(&uv_map[j], &object_uv->v, sizeof(vector2f));
memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
memcpy_neon(&uv_map[j], &object_uv->v, sizeof(vector2f));
object = object->next;
object_uv = object_uv->next;
}
Expand Down Expand Up @@ -554,8 +554,8 @@ void glEnd(void) {
memset(vertices, 0, (vertex_count * sizeof(vector3f)));
indices = (uint16_t *)gpu_pool_memalign(idx_count * sizeof(uint16_t), sizeof(uint16_t));
for (i = 0; i < vertex_count; i++) {
memcpy(&vertices[n], &object->v, sizeof(vector3f));
memcpy(&colors[n], &object_clr->v, sizeof(vector4f));
memcpy_neon(&vertices[n], &object->v, sizeof(vector3f));
memcpy_neon(&colors[n], &object_clr->v, sizeof(vector4f));
indices[n] = n;
object = object->next;
object_clr = object_clr->next;
Expand All @@ -579,8 +579,8 @@ void glEnd(void) {
indices[i * 6 + 5] = i * 4 + 3;
}
for (j = 0; j < vertex_count; j++) {
memcpy(&vertices[j], &object->v, sizeof(vector3f));
memcpy(&colors[j], &object_clr->v, sizeof(vector4f));
memcpy_neon(&vertices[j], &object->v, sizeof(vector3f));
memcpy_neon(&colors[j], &object_clr->v, sizeof(vector4f));
object = object->next;
object_clr = object_clr->next;
}
Expand Down
4 changes: 2 additions & 2 deletions source/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ void glFogfv(GLenum pname, const GLfloat *params) {
fog_far = params[0];
break;
case GL_FOG_COLOR:
memcpy(&fog_color.r, params, sizeof(vector4f));
memcpy_neon(&fog_color.r, params, sizeof(vector4f));
break;
default:
vgl_error = GL_INVALID_ENUM;
Expand Down Expand Up @@ -545,7 +545,7 @@ void glClipPlane(GLenum plane, const GLdouble *equation) {
matrix4x4_transpose(inverted_transposed, inverted);
vector4f temp;
vector4f_matrix4x4_mult(&temp, inverted_transposed, &clip_plane0_eq);
memcpy(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
memcpy_neon(&clip_plane0_eq.x, &temp.x, sizeof(vector4f));
break;
default:
vgl_error = GL_INVALID_ENUM;
Expand Down
Loading

0 comments on commit 0c75f27

Please sign in to comment.