Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libsleef] Add modified Payne Hanek argument reduction #197

Merged
merged 15 commits into from
Jul 6, 2018
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/197
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
Expand Down
6 changes: 3 additions & 3 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ pipeline {
sh '''
echo "AArch64 SVE on" `hostname`
export PATH=$PATH:/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/bin
export LD_LIBRARY_PATH=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
export CC=/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/bin/armclang
export LD_LIBRARY_PATH=/opt/arm/arm-instruction-emulator-1.2.1_Generic-AArch64_Ubuntu-14.04_aarch64-linux/lib:/opt/arm/arm-hpc-compiler-18.1_Generic-AArch64_Ubuntu-16.04_aarch64-linux/lib
export CC=/opt/arm/arm-hpc-compiler-18.3_Generic-AArch64_Ubuntu-16.04_aarch64-linux/bin/armclang
rm -rf build
mkdir build
cd build
Expand All @@ -24,7 +24,7 @@ pipeline {
'''
}
}

stage('Intel Compiler') {
agent { label 'icc' }
steps {
Expand Down
13 changes: 13 additions & 0 deletions src/arch/helperadvsimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

// Basic logical operations for mask
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
Expand Down
13 changes: 13 additions & 0 deletions src/arch/helperavx.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[VECTLENDP];
vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
4 changes: 4 additions & 0 deletions src/arch/helperavx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
Expand Down Expand Up @@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
Expand Down
4 changes: 4 additions & 0 deletions src/arch/helperavx2_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
5 changes: 5 additions & 0 deletions src/arch/helperavx512f.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
Expand Down Expand Up @@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
Expand All @@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }

//

static INLINE vdouble vposneg_vd_vd(vdouble d) {
Expand Down
14 changes: 10 additions & 4 deletions src/arch/helperneon32.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
return vget_lane_u32(x1, 0);
}

static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }

static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }

//

Expand Down Expand Up @@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

Expand Down
12 changes: 12 additions & 0 deletions src/arch/helperpower_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[VECTLENDP];
vstoreu_v_p_vi(a, vi);
return ((vdouble) { ptr[a[0]], ptr[a[1]] });
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi2);
return ((vfloat) { ptr[a[0]], ptr[a[1]], ptr[a[2]], ptr[a[3]] });
}

static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
Expand Down
12 changes: 12 additions & 0 deletions src/arch/helperpurec.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
Expand Down Expand Up @@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
Expand Down
12 changes: 12 additions & 0 deletions src/arch/helpersse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[sizeof(vint)/sizeof(int)];
vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
Expand Down Expand Up @@ -373,6 +379,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[VECTLENSP];
vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is useful when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
Expand Down
37 changes: 37 additions & 0 deletions src/arch/helpersve.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,9 @@ static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
return svand_s32_x(ptrue, x, y);
}
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
return sveor_s32_x(ptrue, x, y);
}
Expand Down Expand Up @@ -657,6 +660,15 @@ static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return svld1_gather_s64index_f64(ptrue, ptr, svreinterpret_s64_s32(vi));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return svld1_gather_s32index_f32(ptrue, ptr, vi2);
}

// Operations for DFT

Expand Down Expand Up @@ -713,3 +725,28 @@ static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v);
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

// These functions are for debugging
static double vcast_d_vd(vdouble v) {
double a[svcntd()];
vstoreu_v_p_vd(a, v);
return a[0];
}

static float vcast_f_vf(vfloat v) {
float a[svcntw()];
vstoreu_v_p_vf(a, v);
return a[0];
}

static int vcast_i_vi(vint v) {
int a[svcntw()];
vstoreu_v_p_vi(a, v);
return a[0];
}

static int vcast_i_vi2(vint2 v) {
int a[svcntw()];
vstoreu_v_p_vi2(a, v);
return a[0];
}
12 changes: 12 additions & 0 deletions src/arch/helpervecext.h
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) {
return vd;
}

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
Expand Down Expand Up @@ -777,6 +783,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
Expand Down
Loading