Skip to content

Commit

Permalink
Core: Add preliminary support for ARM NEON-A64. The generic memset an…
Browse files Browse the repository at this point in the history
…d buffer_copy functions are now supported.

- Note that NEON support is assuming the A64 instruction set. But if there is enough user demand for running the A32 instruction set, and if it is feasible to backport the NEON code to A32, then this may be explored at a later date. But for now, we are sticking with A64.
  • Loading branch information
rogerman committed Apr 3, 2022
1 parent 03be216 commit 7e85253
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 4 deletions.
125 changes: 124 additions & 1 deletion desmume/src/matrix.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
Copyright (C) 2006-2007 shash
Copyright (C) 2007-2021 DeSmuME team
Copyright (C) 2007-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -576,6 +576,129 @@ static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *_
__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
}

#elif defined(ENABLE_NEON_A64)

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
u16 *dst16 = (u16 *)dst;

const v128u16 val_vec128 = vdupq_n_u16(val);
for (size_t i = 0; i < elementCount; i+=(sizeof(v128u16)/sizeof(u16)))
vst1q_u16(dst16 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
u16 *dst16 = (u16 *)dst;

const v128u16 val_vec128 = vdupq_n_u16(val);
const uint16x8x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
MACRODO_N( ELEMENTCOUNT / (sizeof(uint16x8x4_t) / sizeof(u16)), vst1q_u16_x4(dst16 + ((X) * (sizeof(uint16x8x4_t)/sizeof(u16))), val_vec128x4) );
}

static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
u32 *dst32 = (u32 *)dst;

const v128u32 val_vec128 = vdupq_n_u32(val);
for (size_t i = 0; i < elementCount; i+=(sizeof(v128u32)/sizeof(u32)))
vst1q_u32(dst32 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
u32 *dst32 = (u32 *)dst;

const v128u32 val_vec128 = vdupq_n_u32(val);
const uint32x4x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
}

template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
memcpy(dst, src, VECLENGTH);
}

template <size_t VECLENGTH>
static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
{
MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
}

template <size_t VECLENGTH>
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
{
MACRODO_N( VECLENGTH / sizeof(v128u8), vst1q_u8((u8 *)dst + ((X) * sizeof(v128u8)), vorrq_u8(vld1q_u8((u8 *)src + ((X) * sizeof(v128u8))), c_vec)) );
}

static void __buffer_copy_or_constant(void *__restrict dst, const void *__restrict src, const size_t vecLength, const v128u8 &c_vec)
{
switch (vecLength)
{
case 128: __buffer_copy_or_constant_fast<128>(dst, src, c_vec); break;
case 256: __buffer_copy_or_constant_fast<256>(dst, src, c_vec); break;
case 512: __buffer_copy_or_constant_fast<512>(dst, src, c_vec); break;
case 768: __buffer_copy_or_constant_fast<768>(dst, src, c_vec); break;
case 1024: __buffer_copy_or_constant_fast<1024>(dst, src, c_vec); break;
case 2048: __buffer_copy_or_constant_fast<2048>(dst, src, c_vec); break;
case 2304: __buffer_copy_or_constant_fast<2304>(dst, src, c_vec); break;
case 4096: __buffer_copy_or_constant_fast<4096>(dst, src, c_vec); break;

default:
{
for (size_t i = 0; i < vecLength; i+=sizeof(v128u8))
{
vst1q_u8( (u8 *)dst + i, vorrq_u8(vld1q_u8((u8 *)src + i), c_vec) );
}
break;
}
}
}

static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s8 c)
{
const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
__buffer_copy_or_constant(dst, src, vecLength, c_vec);
}

template <size_t VECLENGTH>
static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
{
const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
__buffer_copy_or_constant_fast<VECLENGTH, false>(dst, src, c_vec);
}

template <bool NEEDENDIANSWAP>
static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s16 c)
{
const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
__buffer_copy_or_constant(dst, src, vecLength, c_vec);
}

template <size_t VECLENGTH, bool NEEDENDIANSWAP>
static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
{
const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
}

template <bool NEEDENDIANSWAP>
static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s32 c)
{
const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
__buffer_copy_or_constant(dst, src, vecLength, c_vec);
}

template <size_t VECLENGTH, bool NEEDENDIANSWAP>
static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
{
const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
}

#elif defined(ENABLE_ALTIVEC)

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
Expand Down
20 changes: 19 additions & 1 deletion desmume/src/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

// Determine CPU architecture for platforms that don't use the autoconf script
#if defined(HOST_WINDOWS) || defined(DESMUME_COCOA)
#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(__ppc64__)
#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__ppc64__)
#define HOST_64
#else
#define HOST_32
Expand All @@ -57,6 +57,14 @@
#define ENABLE_ALTIVEC
#endif

// For now, we'll be starting off with only using NEON-A64 for easier testing
// and development. If the development for A64 goes well and if an A32 backport
// is discovered to be feasible, then we may explore backporting the NEON code
// to A32 at a later date.
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && (defined(__aarch64__) || defined(_M_ARM64))
#define ENABLE_NEON_A64
#endif

#ifdef __SSE__
#define ENABLE_SSE
#endif
Expand Down Expand Up @@ -262,6 +270,16 @@ typedef vector unsigned int v128u32;
typedef vector signed int v128s32;
#endif

#ifdef ENABLE_NEON_A64
#include <arm_neon.h>
typedef uint8x16_t v128u8;
typedef int8x16_t v128s8;
typedef uint16x8_t v128u16;
typedef int16x8_t v128s16;
typedef uint32x4_t v128u32;
typedef int32x4_t v128s32;
#endif

#ifdef ENABLE_SSE2
#include <emmintrin.h>
typedef __m128i v128u8;
Expand Down
6 changes: 4 additions & 2 deletions desmume/src/version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (C) 2009-2021 DeSmuME team
Copyright (C) 2009-2022 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -50,7 +50,7 @@
#define DESMUME_PLATFORM_STRING " ARM"
#elif defined(__thumb__)
#define DESMUME_PLATFORM_STRING " ARM-Thumb"
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
#if defined(__APPLE__)
#define DESMUME_PLATFORM_STRING " ARM64"
#else
Expand Down Expand Up @@ -78,6 +78,8 @@
#define DESMUME_CPUEXT_PRIMARY_STRING " SSE"
#elif defined(ENABLE_ALTIVEC)
#define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec"
#elif defined(ENABLE_NEON_A64)
#define DESMUME_CPUEXT_PRIMARY_STRING " NEON-A64"
#endif

#if defined(ENABLE_AVX512_3)
Expand Down

0 comments on commit 7e85253

Please sign in to comment.