This patch introduces erf, erfc, lgamma, tgamma and sinpi. #23
@@ -130,11 +130,11 @@ static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vsqrtq_f32(d); }

 // Multiply accumulate: z = z + x * y
 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
-  return vmlaq_f32(z, x, y);
+  return vfmaq_f32(z, x, y);
 }
 // Multiply subtract: z = z - x * y
 static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
-  return vmlsq_f32(z, x, y);
+  return vfmsq_f32(z, x, y);
 }

 // |x|, -x

Review comment on the vfmsq_f32 change: Why?
@@ -290,11 +290,7 @@ static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {

 // Multiply accumulate: z = z + x * y
 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
-  return vmlaq_f64(z, x, y);
-}
-//[z = x * y - z]
-static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
-  return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z);
+  return vfmaq_f64(z, x, y);
 }

 static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
@@ -309,6 +305,11 @@ static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
   return vfmsq_f64(z, x, y);
 }

+//[z = x * y - z]
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
+  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
+}
+
 static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
   return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
 }
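For reference, the re-added vmlapn_vd_vd_vd_vd computes x * y - z by negating vfmanp_vd_vd_vd_vd (which computes z - x * y). The following scalar sketch of the same identity is purely illustrative and not part of the patch; the helper name is made up:

#include <math.h>

/* Illustrative only: mirrors vmlapn_vd_vd_vd_vd on a single double.
   fma(-x, y, z) computes z - x * y in one fused step, so negating it
   yields x * y - z, matching vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z)). */
static double mlapn_scalar(double x, double y, double z) {
  return -fma(-x, y, z);
}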
@@ -350,6 +351,28 @@ static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
   return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
 }

+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
+  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+                            (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
+
+  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
+  return (vdouble) vqtbl1q_u8(tab, idx);
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+                   vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
+                   vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
+                            (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
+
+  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
+  return (vdouble) vqtbl2q_u8(tab, idx);
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
+}
+
 static INLINE vdouble vrint_vd_vd(vdouble d) {
   return vrndnq_f64(d);
 }

Review thread on vsel_vd_vo_d_d:

Why can't you use the same implementation as the one in the AVX target here? It seems very expensive to me to create the idx vector.

The idx vector is common among the same series of coefficients, so it is generated only once. You can see the assembly output from gcc-6.3.0 via the following link: https://www.dropbox.com/s/n71fn42cscgzlbs/sleefsimddp.advsimd.s.txt?dl=0

The assembly file is too long to check. :) If I understood correctly, the idx is generated only once because the function is inlined. Is that the case? Even so, the generic implementation in the AVX header file seems to need fewer instructions. Am I missing something?

It's "common subexpression elimination." All modern compilers remove redundant code that computes the same result. https://en.wikipedia.org/wiki/Common_subexpression_elimination

We need to do benchmarking to get a definitive answer as to which implementation is faster, but the generic implementation for choosing between 4 values requires 4 loads into 4 different registers and three blending instructions. The TBL implementation requires two loads and one tbl instruction, in addition to two adrp instructions.

I benchmarked both implementations. It looks like the generic implementation is around 10% faster. The following values show the execution time of each function in microseconds.

TBL    Generic

Review comment on vsel_vd_vo_vo_vo_d_d_d_d: Same here - I think this is not optimal. Isn't the AVX code better?
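For comparison, here is a sketch of the kind of "generic", blend-based selection the reviewers refer to. It is an assumption modeled on the description in the thread ("4 loads into 4 different registers and three blending instructions"), not code from this patch or from the AVX header, and it presumes the same vdouble (float64x2_t) and vopmask (uint32x4_t) definitions used elsewhere in this header; the _generic names are made up:

// Hypothetical blend-based alternatives (not from the patch): broadcast each
// constant with vdupq_n_f64 and select lanes with vbslq_f64.
static INLINE vdouble vsel_vd_vo_d_d_generic(vopmask o, double d0, double d1) {
  return vbslq_f64(vreinterpretq_u64_u32(o), vdupq_n_f64(d0), vdupq_n_f64(d1));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d_generic(vopmask o0, vopmask o1, vopmask o2,
                                                       double d0, double d1, double d2, double d3) {
  // Four broadcasts and three nested blends, matching the instruction count
  // mentioned in the review thread.
  return vbslq_f64(vreinterpretq_u64_u32(o0), vdupq_n_f64(d0),
         vbslq_f64(vreinterpretq_u64_u32(o1), vdupq_n_f64(d1),
         vbslq_f64(vreinterpretq_u64_u32(o2), vdupq_n_f64(d2), vdupq_n_f64(d3))));
}

Which of the two variants is faster on a given core is exactly what the benchmark above was measuring.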
Review thread on the vmlaq → vfmaq change:

What's the reason behind this change?

We can safely assume that FMA is faster than any other combination of multiplication and addition. I looked at the assembly output from the compiler, and vmlaq is converted into a multiplication and an addition.

Oh! Strange. Could you please provide a minimal example that shows what seems to be inconsistent behavior? You might have found a bug in gcc.

Multiply-accumulate instructions are all fused on aarch64, so this is not a bug.

This LGTM; I was just asking you to provide an example of code that generated separate multiplication and addition from the vmlaq_f32 intrinsic. No worries if you don't have time.
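A minimal test case along the lines the reviewer asks for might look like the following. The file name, function names, and build command are hypothetical; the idea is simply to compile with something like gcc -O2 -S on an aarch64 target and check whether fmla or separate fmul/fadd is emitted for each function:

// mla_vs_fma.c -- illustrative only, not part of this patch.
#include <arm_neon.h>

// vmlaq_f32 is a multiply-accumulate intrinsic; whether it is lowered to a
// fused fmla or to separate fmul + fadd is left to the compiler.
float32x4_t use_mla(float32x4_t x, float32x4_t y, float32x4_t z) {
  return vmlaq_f32(z, x, y);  // z + x * y
}

// vfmaq_f32 explicitly requests the fused multiply-add (fmla).
float32x4_t use_fma(float32x4_t x, float32x4_t y, float32x4_t z) {
  return vfmaq_f32(z, x, y);  // fused x * y + z per lane
}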