-
-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moved to NEON optimized memcpy usage.
- Loading branch information
1 parent
9895189
commit 0c75f27
Showing
12 changed files
with
227 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
/* | ||
* NEON code contributed by Siarhei Siamashka <[email protected]>. | ||
* Origin: http://sourceware.org/ml/libc-ports/2009-07/msg00003.html | ||
* | ||
* The GNU C Library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public License. | ||
* | ||
* Tweaked for Android by Jim Huang <[email protected]> | ||
*/ | ||
|
||
@ Assemble what follows as ARM (not Thumb) code with NEON instructions enabled | ||
.arm | ||
.fpu neon | ||
|
||
@ void* memcpy_neon(void *destination, const void *source, size_t num) | ||
@ Exported as a function symbol so that other objects can link against it | ||
.global memcpy_neon | ||
.type memcpy_neon, %function | ||
/* | ||
* ENABLE_UNALIGNED_MEM_ACCESSES macro can be defined to permit the use | ||
* of unaligned load/store memory accesses supported since ARMv6. This | ||
* will further improve performance, but can purely theoretically cause | ||
* problems if somebody decides to set SCTLR.A bit in the OS kernel | ||
* (to trap each unaligned memory access) or somehow mess with strongly | ||
* ordered/device memory. | ||
*/ | ||
@ The code only tests this macro with ifdef, so its value is ignored; | ||
@ remove the define to build the byte-by-byte fallback paths instead | ||
#define ENABLE_UNALIGNED_MEM_ACCESSES 1 | ||
|
||
@ Largest prefetch offset in bytes; the copy loop starts at 32 and | ||
@ raises its offset in steps of 32 until it reaches this value | ||
#define NEON_MAX_PREFETCH_DISTANCE 320 | ||
|
||
@ memcpy_neon: copy r2 bytes from the buffer at r1 to the buffer at r0. | ||
@ In:   r0 = destination, r1 = source, r2 = byte count | ||
@ Out:  r0 is never written, so it still holds the destination pointer | ||
@ Uses: r1, r2, r3, ip, d0-d3 and the condition flags; the stack is not touched | ||
@ NOTE(review): every length test below is signed (blt/bge), so a count with | ||
@ bit 31 set takes the short path at entry and only (count & 15) bytes are | ||
@ copied - confirm callers never pass such sizes. | ||
.align 4 | ||
memcpy_neon: | ||
@ .fnstart and .fnend bracket the function for the assembler unwind tables | ||
.fnstart | ||
@ ip is the moving destination pointer; r0 stays untouched | ||
mov ip, r0 | ||
cmp r2, #16 | ||
blt 4f @ Have less than 16 bytes to copy | ||
|
||
@ First ensure 16 byte alignment for the destination buffer | ||
@ At least 16 bytes are available here and at most 15 are consumed | ||
tst r0, #0xF | ||
beq 2f | ||
@ Bit 0 of the destination set: move one byte | ||
tst r0, #1 | ||
ldrneb r3, [r1], #1 | ||
strneb r3, [ip], #1 | ||
subne r2, r2, #1 | ||
@ Bit 1 set: move two bytes. The flags from this tst also drive the | ||
@ subne after the conditional block, since loads and stores leave them alone | ||
tst ip, #2 | ||
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES | ||
@ The source pointer may be odd here, so this halfword load can be unaligned | ||
ldrneh r3, [r1], #2 | ||
strneh r3, [ip], #2 | ||
#else | ||
ldrneb r3, [r1], #1 | ||
strneb r3, [ip], #1 | ||
ldrneb r3, [r1], #1 | ||
strneb r3, [ip], #1 | ||
#endif | ||
subne r2, r2, #2 | ||
|
||
@ Bit 2 set: move four bytes through lane 0 of d0-d3. The load carries no | ||
@ alignment qualifier; the store asserts 32 bit alignment of ip | ||
tst ip, #4 | ||
beq 1f | ||
vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! | ||
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]! | ||
sub r2, r2, #4 | ||
1: | ||
@ Bit 3 set: move eight bytes through d0, store asserts 64 bit alignment | ||
tst ip, #8 | ||
beq 2f | ||
vld1.8 {d0}, [r1]! | ||
vst1.8 {d0}, [ip, :64]! | ||
sub r2, r2, #8 | ||
2: | ||
@ ip is 16 byte aligned from here on. Bias r2 by -32 so that a negative | ||
@ value means fewer than 32 bytes are left; r3 is the prefetch offset | ||
subs r2, r2, #32 | ||
blt 3f | ||
mov r3, #32 | ||
|
||
@ Main copy loop, 32 bytes are processed per iteration. | ||
@ ARM instructions are used for doing fine-grained prefetch, | ||
@ increasing prefetch distance progressively up to | ||
@ NEON_MAX_PREFETCH_DISTANCE at runtime | ||
1: | ||
vld1.8 {d0-d3}, [r1]! | ||
@ The cmp result is consumed by addle below; pld does not change the flags | ||
cmp r3, #(NEON_MAX_PREFETCH_DISTANCE - 32) | ||
pld [r1, r3] | ||
addle r3, r3, #32 | ||
vst1.8 {d0-d3}, [ip, :128]! | ||
sub r2, r2, #32 | ||
@ Stay in this loop while the biased count is at least the prefetch offset | ||
cmp r2, r3 | ||
bge 1b | ||
@ Negative biased count: fewer than 32 bytes remain, go to the tail | ||
cmp r2, #0 | ||
blt 3f | ||
1: @ Copy the remaining part of the buffer (already prefetched) | ||
vld1.8 {d0-d3}, [r1]! | ||
subs r2, r2, #32 | ||
vst1.8 {d0-d3}, [ip, :128]! | ||
bge 1b | ||
3: @ Copy up to 31 remaining bytes | ||
@ r2 is negative here, but its low five bits still equal the bytes left. | ||
@ Bit 4 selects one 16 byte block; r2 is not adjusted afterwards because | ||
@ the tail code only looks at bits 3 to 0 | ||
tst r2, #16 | ||
beq 4f | ||
vld1.8 {d0, d1}, [r1]! | ||
vst1.8 {d0, d1}, [ip, :128]! | ||
4: | ||
@ Use ARM instructions exclusively for the final trailing part | ||
@ not fully fitting into full 16 byte aligned block in order | ||
@ to avoid "ARM store after NEON store" hazard. Also NEON | ||
@ pipeline will be (mostly) flushed by the time when the | ||
@ control returns to the caller, making the use of NEON mostly | ||
@ transparent (and avoiding hazards in the caller code) | ||
@ This label is also reached straight from the entry check, in which | ||
@ case ip has not been aligned at all | ||
|
||
#ifdef ENABLE_UNALIGNED_MEM_ACCESSES | ||
@ lsl 29 moves bit 3 of r2 into C and bit 2 into N: | ||
@ C set -> two words (8 bytes), N set -> one word (4 bytes). | ||
@ r3 is reused as the data register; the flags survive the loads and stores | ||
movs r3, r2, lsl #29 | ||
ldrcs r3, [r1], #4 | ||
strcs r3, [ip], #4 | ||
ldrcs r3, [r1], #4 | ||
strcs r3, [ip], #4 | ||
ldrmi r3, [r1], #4 | ||
strmi r3, [ip], #4 | ||
@ lsl 31 moves bit 1 into C and bit 0 into N: halfword, then byte | ||
movs r2, r2, lsl #31 | ||
ldrcsh r3, [r1], #2 | ||
strcsh r3, [ip], #2 | ||
ldrmib r3, [r1], #1 | ||
strmib r3, [ip], #1 | ||
#else | ||
@ Same bit tests as the word based variant, but every transfer is a | ||
@ single byte: C set -> 8 bytes, N set -> 4 bytes | ||
movs r3, r2, lsl #29 | ||
bcc 1f | ||
.rept 8 | ||
ldrcsb r3, [r1], #1 | ||
strcsb r3, [ip], #1 | ||
.endr | ||
1: | ||
bpl 1f | ||
.rept 4 | ||
ldrmib r3, [r1], #1 | ||
strmib r3, [ip], #1 | ||
.endr | ||
1: | ||
@ lsl 31 moves bit 1 into C and bit 0 into N: two bytes, then one byte | ||
movs r2, r2, lsl #31 | ||
ldrcsb r3, [r1], #1 | ||
strcsb r3, [ip], #1 | ||
ldrcsb r3, [r1], #1 | ||
strcsb r3, [ip], #1 | ||
ldrmib r3, [r1], #1 | ||
strmib r3, [ip], #1 | ||
#endif | ||
@ Return to the caller; r0 still holds the original destination | ||
bx lr | ||
.fnend |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.