Skip to content

Commit

Permalink
sha + chacha: Move AArch64/X86-64 dispatching to C. (#1625)
Browse files Browse the repository at this point in the history
### Upstream Commits: 
*
[10c24cb](google/boringssl@10c24cb)
*
[f5e0c8f](google/boringssl@f5e0c8f)

Take a step towards removing all dispatching logic from assembly for
AArch64 and X86-64.

Change-Id: I1c965012e81837ff228c810d54e730c525cad54f
Reviewed-on:
https://boringssl-review.googlesource.com/c/boringssl/+/64208
Reviewed-by: Bob Beck <[email protected]>
Commit-Queue: David Benjamin <[email protected]>
Reviewed-by: David Benjamin <[email protected]>

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license and the ISC license.
  • Loading branch information
justsmth authored Jul 29, 2024
2 parents 916b3d1 + 9f17d95 commit b5a6227
Show file tree
Hide file tree
Showing 37 changed files with 843 additions and 822 deletions.
98 changes: 35 additions & 63 deletions crypto/chacha/asm/chacha-x86_64.pl
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.section .rodata
.align 64
.Lzero:
Expand Down Expand Up @@ -230,24 +228,12 @@ sub ROUND { # critical path is 24 cycles per round
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,\@function,5
.globl ChaCha20_ctr32_nohw
.type ChaCha20_ctr32_nohw,\@function,5
.align 64
ChaCha20_ctr32:
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
cmp \$0,$len
je .Lno_data
mov OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___ if ($avx>2);
bt \$48,%r10 # check for AVX512F
jc .LChaCha20_avx512
___
$code.=<<___;
test \$`1<<(41-32)`,%r10d
jnz .LChaCha20_ssse3
push %rbx
.cfi_push rbx
push %rbp
Expand Down Expand Up @@ -419,7 +405,7 @@ sub ROUND { # critical path is 24 cycles per round
.Lno_data:
ret
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
___

########################################################################
Expand Down Expand Up @@ -454,19 +440,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type ChaCha20_ssse3,\@function,5
.globl ChaCha20_ctr32_ssse3
.type ChaCha20_ctr32_ssse3,\@function,5
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register r9
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
ja .LChaCha20_4x # but overall it won't be slower
.Ldo_sse3_after_all:
sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
Expand Down Expand Up @@ -576,7 +559,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
.Lssse3_epilogue:
ret
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
___
}

Expand Down Expand Up @@ -714,29 +697,17 @@ sub SSSE3_lane_ROUND {
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_4x,\@function,5
.globl ChaCha20_ctr32_ssse3_4x
.type ChaCha20_ctr32_ssse3_4x,\@function,5
.align 32
ChaCha20_4x:
.LChaCha20_4x:
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register r9
mov %r10,%r11
___
$code.=<<___ if ($avx>1);
shr \$32,%r10 # OPENSSL_ia32cap_P+8
test \$`1<<5`,%r10 # test AVX2
jnz .LChaCha20_8x
___
$code.=<<___;
cmp \$192,$len
ja .Lproceed4x
and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
je .Ldo_sse3_after_all # to detect Atom
.Lproceed4x:
sub \$0x140+$xframe,%rsp
___
################ stack layout
Expand Down Expand Up @@ -1164,7 +1135,7 @@ sub SSSE3_lane_ROUND {
.L4x_epilogue:
ret
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
___
}

Expand Down Expand Up @@ -1293,11 +1264,12 @@ sub AVX2_lane_ROUND {
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_8x,\@function,5
.globl ChaCha20_ctr32_avx2
.type ChaCha20_ctr32_avx2,\@function,5
.align 32
ChaCha20_8x:
.LChaCha20_8x:
ChaCha20_ctr32_avx2:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame register
.cfi_def_cfa_register r9
sub \$0x280+$xframe,%rsp
Expand Down Expand Up @@ -1809,7 +1781,7 @@ sub AVX2_lane_ROUND {
.L8x_epilogue:
ret
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
___
}

Expand Down Expand Up @@ -2719,22 +2691,22 @@ sub AVX512_lane_ROUND {
.section .pdata
.align 4
.rva .LSEH_begin_ChaCha20_ctr32
.rva .LSEH_end_ChaCha20_ctr32
.rva .LSEH_info_ChaCha20_ctr32
.rva .LSEH_begin_ChaCha20_ctr32_nohw
.rva .LSEH_end_ChaCha20_ctr32_nohw
.rva .LSEH_info_ChaCha20_ctr32_nohw
.rva .LSEH_begin_ChaCha20_ssse3
.rva .LSEH_end_ChaCha20_ssse3
.rva .LSEH_info_ChaCha20_ssse3
.rva .LSEH_begin_ChaCha20_ctr32_ssse3
.rva .LSEH_end_ChaCha20_ctr32_ssse3
.rva .LSEH_info_ChaCha20_ctr32_ssse3
.rva .LSEH_begin_ChaCha20_4x
.rva .LSEH_end_ChaCha20_4x
.rva .LSEH_info_ChaCha20_4x
.rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x
.rva .LSEH_end_ChaCha20_ctr32_ssse3_4x
.rva .LSEH_info_ChaCha20_ctr32_ssse3_4x
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_ChaCha20_8x
.rva .LSEH_end_ChaCha20_8x
.rva .LSEH_info_ChaCha20_8x
.rva .LSEH_begin_ChaCha20_ctr32_avx2
.rva .LSEH_end_ChaCha20_ctr32_avx2
.rva .LSEH_info_ChaCha20_ctr32_avx2
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_ChaCha20_avx512
Expand All @@ -2748,22 +2720,22 @@ sub AVX512_lane_ROUND {
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_ChaCha20_ctr32:
.LSEH_info_ChaCha20_ctr32_nohw:
.byte 9,0,0,0
.rva se_handler
.LSEH_info_ChaCha20_ssse3:
.LSEH_info_ChaCha20_ctr32_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lssse3_body,.Lssse3_epilogue
.LSEH_info_ChaCha20_4x:
.LSEH_info_ChaCha20_ctr32_ssse3_4x:
.byte 9,0,0,0
.rva full_handler
.rva .L4x_body,.L4x_epilogue
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
.LSEH_info_ChaCha20_ctr32_avx2:
.byte 9,0,0,0
.rva full_handler
.rva .L8x_body,.L8x_epilogue # HandlerData[]
Expand Down
18 changes: 18 additions & 0 deletions crypto/chacha/chacha.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,24 @@ static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
ChaCha20_ctr32_neon(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
if (ChaCha20_ctr32_avx2_capable(in_len)) {
ChaCha20_ctr32_avx2(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_SSSE3_4X)
if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) {
ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_SSSE3)
if (ChaCha20_ctr32_ssse3_capable(in_len)) {
ChaCha20_ctr32_ssse3(out, in, in_len, key, counter);
return;
}
#endif
if (in_len > 0) {
ChaCha20_ctr32_nohw(out, in, in_len, key, counter);
Expand Down
15 changes: 15 additions & 0 deletions crypto/chacha/chacha_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,21 @@ static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len,
CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
if (ChaCha20_ctr32_avx2_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_SSSE3_4X)
if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_SSSE3)
if (ChaCha20_ctr32_ssse3_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_NOHW)
if (in_len > 0) {
CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter);
Expand Down
28 changes: 26 additions & 2 deletions crypto/chacha/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ extern "C" {
void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32],
const uint8_t nonce[16]);

#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)

#define CHACHA20_ASM

Expand All @@ -46,6 +45,31 @@ OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) {
}
void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define CHACHA20_ASM_NOHW

#define CHACHA20_ASM_AVX2
OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) {
return (len > 128) && CRYPTO_is_AVX2_capable();
}
void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);

#define CHACHA20_ASM_SSSE3_4X
OPENSSL_INLINE int ChaCha20_ctr32_ssse3_4x_capable(size_t len) {
int capable = (len > 128) && CRYPTO_is_SSSE3_capable();
int faster = (len > 192) || !CRYPTO_cpu_perf_is_like_silvermont();
return capable && faster;
}
void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);

#define CHACHA20_ASM_SSSE3
OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) {
return (len > 128) && CRYPTO_is_SSSE3_capable();
}
void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);
#endif

#if defined(CHACHA20_ASM)
Expand Down
6 changes: 4 additions & 2 deletions crypto/fipsmodule/cpucap/cpu_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,8 @@ void OPENSSL_cpuid_setup(void) {

// Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
// some Silvermont-specific codepaths which perform better. See OpenSSL
// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and
// |CRYPTO_cpu_perf_is_like_silvermont|.
if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
(eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
ecx &= ~(1u << 26);
Expand All @@ -267,7 +268,8 @@ void OPENSSL_cpuid_setup(void) {
// Clear AVX2 and AVX512* bits.
//
// TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
// doesn't clear those.
// doesn't clear those. See the comments in
// |CRYPTO_hardware_supports_XSAVE|.
extended_features[0] &=
~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
}
Expand Down
Loading

0 comments on commit b5a6227

Please sign in to comment.