Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repr(simd) does not align to Intel recs on x86_64 #81931

Open
workingjubilee opened this issue Feb 9, 2021 · 1 comment
Open

repr(simd) does not align to Intel recs on x86_64 #81931

workingjubilee opened this issue Feb 9, 2021 · 1 comment
Labels
A-SIMD Area: SIMD (Single Instruction Multiple Data) C-bug Category: This is a bug. O-x86_64 Target: x86-64 processors (like x86_64-*)

Comments

@workingjubilee
Copy link
Member

workingjubilee commented Feb 9, 2021

I tried this code: (Playground). The random inputs are mostly just to keep the compiler fairly "honest" and block optimizing away the instructions it would use.

#![feature(repr_simd)]
#![feature(platform_intrinsics)]
use rand::random;

/// Two-lane `f32` vector used to reproduce the alignment report.
///
/// `#[repr(simd)]` (nightly-only, enabled by `feature(repr_simd)`) marks the
/// struct as a SIMD vector type. The two `f32` fields occupy 8 bytes in total,
/// which is half the width of a 16-byte SSE register.
#[derive(Debug)]
#[repr(simd)]
struct f32x2(f32, f32);

// Compiler-provided intrinsics; requires `feature(platform_intrinsics)`.
extern "platform-intrinsic" {
    /// Two-lane shuffle: builds a `U` from lanes of `a` and `b` selected by `idx`.
    ///
    /// NOTE(review): indices presumably address the lanes of `a` followed by
    /// the lanes of `b` — confirm against the intrinsic's documentation.
    fn simd_shuffle2<T, U>(a: T, b: T, idx: [u32; 2]) -> U;
}

/// Reproduction driver: builds two `f32x2` vectors from random values,
/// shuffles them together, then prints the alignment Rust reports for `f32x2`
/// followed by the shuffled vector.
fn main() {
    // Random lanes keep the compiler from constant-folding the vector code away.
    let x = f32x2(rand::random(), rand::random());
    let y = f32x2(rand::random(), rand::random());
    // With indices [0, 2] this presumably selects `x.0` and `y.0` (lanes of `x`
    // first, then lanes of `y`) — TODO confirm against the intrinsic's docs.
    // SAFETY: NOTE(review) — assumed sound because both indices are below the
    // four combined input lanes and the output type has two lanes; verify.
    let z: f32x2 = unsafe { simd_shuffle2(x, y, [0, 2]) };
    // The value under discussion: the report expected 16 here but observed 8.
    println!("Alignment is: {:?}", std::mem::align_of::<f32x2>());
    // Printing `z` keeps the shuffle result live so it is actually computed.
    println!("Data is: {:?}", z);
}

Intel's recommendation reads: "For best performance, the Streaming SIMD Extensions and Streaming SIMD Extensions 2 require their memory operands to be aligned to 16-byte boundaries."

Thus, I expected to see this happen:

Alignment is: 16
Data is: f32x2(0.12946808, 0.4856578)

Instead, this happened:

Alignment is: 8
Data is: f32x2(0.12946808, 0.4856578)

That does not appear to be the correct alignment to report for this type, unless I am misunderstanding something here.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (0fc6756b4 2021-02-08)
binary: rustc
commit-hash: 0fc6756b42e0556cc2e18079f5fc6b4d58f4e81a
commit-date: 2021-02-08
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1

I believe this is related to, but not exactly the same as, #27060. Apologies if this is a total duplicate, or if I am misunderstanding something here about what Rust means by "alignment", but after careful review with @calebzulawski, we started to arrive at the conclusion that something was off.

Here is the generated assembly. As you can see, it uses multiple SSE instructions, including movaps, an aligned load. However, I haven't exhaustively analyzed it, so I can't immediately tell whether the actual alignment requirements are being adhered to here and I am just spooked by seemingly misleading information.

x86_64 Assembly
std::sys_common::backtrace::__rust_begin_short_backtrace: # @std::sys_common::backtrace::__rust_begin_short_backtrace
# %bb.0:
	sub	rsp, 8
	call	rdi
	mov	rax, rsp
	#APP
	#NO_APP
	pop	rax
	ret
                                        # -- End function

std::rt::lang_start: # @std::rt::lang_start
# %bb.0:
	sub	rsp, 8
	mov	rcx, rdx
	mov	rdx, rsi
	mov	qword ptr [rsp], rdi
	lea	rsi, [rip + .L__unnamed_1]
	mov	rdi, rsp
	call	qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
	pop	rcx
	ret
                                        # -- End function

std::rt::lang_start::{{closure}}: # @"std::rt::lang_start::{{closure}}"
# %bb.0:
	sub	rsp, 8
	mov	rdi, qword ptr [rdi]
	call	std::sys_common::backtrace::__rust_begin_short_backtrace
	xor	eax, eax
	pop	rcx
	ret
                                        # -- End function

<&T as core::fmt::Debug>::fmt: # @"<&T as core::fmt::Debug>::fmt"
# %bb.0:
	mov	rdi, qword ptr [rdi]
	jmp	qword ptr [rip + core::fmt::float::<impl core::fmt::Debug for f32>::fmt@GOTPCREL] # TAILCALL
                                        # -- End function

core::fmt::num::<impl core::fmt::Debug for usize>::fmt: # @"core::fmt::num::<impl core::fmt::Debug for usize>::fmt"
# %bb.0:
	push	r14
	push	rbx
	sub	rsp, 8
	mov	rbx, rsi
	mov	r14, rdi
	mov	rdi, rsi
	call	qword ptr [rip + core::fmt::Formatter::debug_lower_hex@GOTPCREL]
	test	al, al
	je	.LBB4_1
# %bb.3:
	mov	rdi, r14
	mov	rsi, rbx
	add	rsp, 8
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::<impl core::fmt::LowerHex for usize>::fmt@GOTPCREL] # TAILCALL

.LBB4_1:
	mov	rdi, rbx
	call	qword ptr [rip + core::fmt::Formatter::debug_upper_hex@GOTPCREL]
	mov	rdi, r14
	mov	rsi, rbx
	add	rsp, 8
	test	al, al
	je	.LBB4_2
# %bb.4:
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::<impl core::fmt::UpperHex for usize>::fmt@GOTPCREL] # TAILCALL

.LBB4_2:
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for usize>::fmt@GOTPCREL] # TAILCALL
                                        # -- End function

core::ops::function::FnOnce::call_once{{vtable.shim}}: # @"core::ops::function::FnOnce::call_once{{vtable.shim}}"
# %bb.0:
	sub	rsp, 8
	mov	rdi, qword ptr [rdi]
	call	std::sys_common::backtrace::__rust_begin_short_backtrace
	xor	eax, eax
	pop	rcx
	ret
                                        # -- End function

core::ptr::drop_in_place<&f32>: # @"core::ptr::drop_in_place<&f32>"
# %bb.0:
	ret
                                        # -- End function

core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>: # @"core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>"
# %bb.0:
	mov	rax, qword ptr [rdi]
	add	qword ptr [rax], -1
	mov	rax, qword ptr [rdi]
	cmp	qword ptr [rax], 0
	jne	.LBB7_2
# %bb.1:
	add	qword ptr [rax + 8], -1
	mov	rdi, qword ptr [rdi]
	cmp	qword ptr [rdi + 8], 0
	je	.LBB7_3

.LBB7_2:
	ret

.LBB7_3:
	mov	esi, 368
	mov	edx, 16
	jmp	qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL
                                        # -- End function

rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate: # @"rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate"
# %bb.0:
	push	r15
	push	r14
	push	r13
	push	r12
	push	rbx
	sub	rsp, 160
	mov	r15, rdx
	mov	r14, rsi
	mov	rbx, rdi
	xorps	xmm0, xmm0
	movaps	xmmword ptr [rsp + 16], xmm0
	movaps	xmmword ptr [rsp], xmm0
	mov	rsi, rsp
	mov	edx, 32
	call	qword ptr [rip + <rand_core::os::OsRng as rand_core::RngCore>::try_fill_bytes@GOTPCREL]
	test	rax, rax
	je	.LBB8_1
# %bb.2:
	mov	r12, rax
	mov	r13, rdx
	mov	rdi, rax
	call	qword ptr [rdx]
# %bb.3:
	mov	rsi, qword ptr [r13 + 8]
	test	rsi, rsi
	je	.LBB8_5
# %bb.4:
	mov	rdx, qword ptr [r13 + 16]
	mov	rdi, r12
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]
	jmp	.LBB8_5

.LBB8_1:
	movaps	xmm0, xmmword ptr [rsp]
	movaps	xmm1, xmmword ptr [rsp + 16]
	movaps	xmmword ptr [rsp + 144], xmm1
	movaps	xmmword ptr [rsp + 128], xmm0
	lea	rdx, [rip + .L__unnamed_2]
	lea	rdi, [rsp + 80]
	lea	rsi, [rsp + 128]
	mov	ecx, 8
	call	qword ptr [rip + rand_chacha::guts::init_chacha@GOTPCREL]
	mov	rax, qword ptr [rsp + 80]
	mov	rcx, qword ptr [rsp + 120]
	mov	qword ptr [rsp + 64], rcx
	movups	xmm0, xmmword ptr [rsp + 104]
	movaps	xmmword ptr [rsp + 48], xmm0
	movups	xmm0, xmmword ptr [rsp + 88]
	movaps	xmmword ptr [rsp + 32], xmm0
	mov	rcx, qword ptr [rbx + 48]
	mov	qword ptr [rbx + 56], rcx
	mov	qword ptr [rbx], rax
	movaps	xmm0, xmmword ptr [rsp + 32]
	movups	xmmword ptr [rbx + 8], xmm0
	movaps	xmm0, xmmword ptr [rsp + 48]
	movups	xmmword ptr [rbx + 24], xmm0
	mov	rax, qword ptr [rsp + 64]
	mov	qword ptr [rbx + 40], rax

.LBB8_5:
	mov	qword ptr [rbx + 64], r15
	mov	rax, -256
	add	rax, qword ptr [rbx + 48]
	mov	qword ptr [rbx + 56], rax
	mov	rdi, rbx
	mov	esi, 6
	mov	rdx, r14
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	add	rsp, 160
	pop	rbx
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	ret
	mov	rbx, rax
	mov	rdi, r12
	mov	rsi, r13
	call	alloc::alloc::box_free
	mov	rdi, rbx
	call	_Unwind_Resume@PLT
	ud2
                                        # -- End function

alloc::alloc::box_free: # @alloc::alloc::box_free
# %bb.0:
	mov	rax, rsi
	mov	rsi, qword ptr [rsi + 8]
	test	rsi, rsi
	je	.LBB9_1
# %bb.2:
	mov	rdx, qword ptr [rax + 16]
	jmp	qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL

.LBB9_1:
	ret
                                        # -- End function

.LCPI10_0:
	.long	0x33800000                      # float 5.96046448E-8

playground::main: # @playground::main
# %bb.0:
	push	rbp
	push	r15
	push	r14
	push	rbx
	sub	rsp, 72
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	r14, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_7
# %bb.1:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.2:
	lea	rdx, [rbx + 24]
	add	rbx, 288
	mov	rcx, qword ptr [r14 + 344]
	test	rcx, rcx
	jle	.LBB10_4
# %bb.3:
	cmp	qword ptr [r14 + 352], rax
	js	.LBB10_4
# %bb.5:
	add	rcx, -256
	mov	qword ptr [r14 + 344], rcx
	mov	rdi, rbx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_6

.LBB10_4:
	mov	rdi, rbx
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_6:
	mov	qword ptr [r14 + 16], 0
	xor	eax, eax

.LBB10_7:
	mov	r15d, dword ptr [r14 + 4*rax + 24]
	add	rax, 1
	mov	qword ptr [r14 + 16], rax
	add	qword ptr [r14], -1
	jne	.LBB10_10
# %bb.8:
	add	qword ptr [r14 + 8], -1
	jne	.LBB10_10
# %bb.9:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, r14
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_10:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_19
# %bb.11:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.12:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_14
# %bb.13:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_14
# %bb.17:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_18

.LBB10_14:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_18:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_19:
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_22
# %bb.20:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_22
# %bb.21:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_22:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_29
# %bb.23:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.24:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_26
# %bb.25:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_26
# %bb.27:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_28

.LBB10_26:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_28:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_29:
	mov	ebp, dword ptr [rbx + 4*rax + 24]
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_32
# %bb.30:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_32
# %bb.31:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_32:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_39
# %bb.33:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.34:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_36
# %bb.35:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_36
# %bb.37:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_38

.LBB10_36:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_38:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_39:
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_42
# %bb.40:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_42
# %bb.41:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_42:
	shr	ebp, 8
	cvtsi2ss	xmm0, ebp
	shr	r15d, 8
	cvtsi2ss	xmm1, r15d
	movss	xmm2, dword ptr [rip + .LCPI10_0] # xmm2 = mem[0],zero,zero,zero
	mulss	xmm0, xmm2
	mulss	xmm1, xmm2
	unpcklps	xmm1, xmm0                      # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	movlps	qword ptr [rsp + 64], xmm1
	lea	rax, [rip + .L__unnamed_3]
	mov	qword ptr [rsp + 48], rax
	lea	rax, [rip + core::fmt::num::<impl core::fmt::Debug for usize>::fmt]
	mov	qword ptr [rsp + 56], rax
	lea	rax, [rip + .L__unnamed_4]
	mov	qword ptr [rsp], rax
	mov	qword ptr [rsp + 8], 2
	mov	qword ptr [rsp + 16], 0
	lea	rbx, [rsp + 48]
	mov	qword ptr [rsp + 32], rbx
	mov	qword ptr [rsp + 40], 1
	mov	rbp, qword ptr [rip + std::io::stdio::_print@GOTPCREL]
	mov	rdi, rsp
	call	rbp
	lea	rax, [rsp + 64]
	mov	qword ptr [rsp + 48], rax
	lea	rax, [rip + <playground::f32x2 as core::fmt::Debug>::fmt]
	mov	qword ptr [rsp + 56], rax
	lea	rax, [rip + .L__unnamed_5]
	mov	qword ptr [rsp], rax
	mov	qword ptr [rsp + 8], 2
	mov	qword ptr [rsp + 16], 0
	mov	qword ptr [rsp + 32], rbx
	mov	qword ptr [rsp + 40], 1
	mov	rdi, rsp
	call	rbp
	add	rsp, 72
	pop	rbx
	pop	r14
	pop	r15
	pop	rbp
	ret
	jmp	.LBB10_16
	jmp	.LBB10_16
	jmp	.LBB10_16

.LBB10_16:
	mov	rbx, rax
	mov	rdi, rsp
	call	core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>
	mov	rdi, rbx
	call	_Unwind_Resume@PLT
	ud2
                                        # -- End function

<playground::f32x2 as core::fmt::Debug>::fmt: # @"<playground::f32x2 as core::fmt::Debug>::fmt"
# %bb.0:
	push	r15
	push	r14
	push	r12
	push	rbx
	sub	rsp, 40
	mov	rbx, rdi
	lea	r15, [rdi + 4]
	lea	rdx, [rip + .L__unnamed_6]
	lea	r14, [rsp + 16]
	mov	ecx, 5
	mov	rdi, r14
	call	qword ptr [rip + core::fmt::Formatter::debug_tuple@GOTPCREL]
	mov	qword ptr [rsp + 8], rbx
	lea	rbx, [rip + .L__unnamed_7]
	mov	r12, qword ptr [rip + core::fmt::builders::DebugTuple::field@GOTPCREL]
	lea	rsi, [rsp + 8]
	mov	rdi, r14
	mov	rdx, rbx
	call	r12
	mov	qword ptr [rsp + 8], r15
	lea	rsi, [rsp + 8]
	mov	rdi, r14
	mov	rdx, rbx
	call	r12
	mov	rdi, r14
	call	qword ptr [rip + core::fmt::builders::DebugTuple::finish@GOTPCREL]
	add	rsp, 40
	pop	rbx
	pop	r12
	pop	r14
	pop	r15
	ret
                                        # -- End function

main:                                   # @main
# %bb.0:
	sub	rsp, 8
	mov	rcx, rsi
	movsxd	rdx, edi
	lea	rax, [rip + playground::main]
	mov	qword ptr [rsp], rax
	lea	rsi, [rip + .L__unnamed_1]
	mov	rdi, rsp
	call	qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
                                        # kill: def $eax killed $eax killed $rax
	pop	rcx
	ret
                                        # -- End function

.L__unnamed_1:
	.quad	core::ptr::drop_in_place<&f32>
	.quad	8                               # 0x8
	.quad	8                               # 0x8
	.quad	std::rt::lang_start::{{closure}}
	.quad	std::rt::lang_start::{{closure}}
	.quad	core::ops::function::FnOnce::call_once{{vtable.shim}}

.L__unnamed_2:
	.zero	8

.L__unnamed_8:
	.ascii	"Alignment is: "

.L__unnamed_9:
	.byte	10

.L__unnamed_4:
	.quad	.L__unnamed_8
	.asciz	"\016\000\000\000\000\000\000"
	.quad	.L__unnamed_9
	.asciz	"\001\000\000\000\000\000\000"

.L__unnamed_3:
	.asciz	"\b\000\000\000\000\000\000"

.L__unnamed_10:
	.ascii	"Data is: "

.L__unnamed_5:
	.quad	.L__unnamed_10
	.asciz	"\t\000\000\000\000\000\000"
	.quad	.L__unnamed_9
	.asciz	"\001\000\000\000\000\000\000"

.L__unnamed_6:
	.ascii	"f32x2"

.L__unnamed_7:
	.quad	core::ptr::drop_in_place<&f32>
	.quad	8                               # 0x8
	.quad	8                               # 0x8
	.quad	<&T as core::fmt::Debug>::fmt
@workingjubilee workingjubilee added the C-bug Category: This is a bug. label Feb 9, 2021
@jonas-schievink jonas-schievink added the A-SIMD Area: SIMD (Single Instruction Multiple Data) label Feb 9, 2021
@workingjubilee workingjubilee added the O-x86_64 Target: x86-64 processors (like x86_64-*) label Feb 9, 2021
@workingjubilee
Copy link
Member Author

workingjubilee commented Mar 23, 2022

A note for the record: after some thought and research, it is not clear that using a higher alignment is actually desirable in practice. It may be that the desired alignment for vectors that do not exactly match the machine's vector sizes is lower, around the element size (as if the vector were an array), possibly except when the vector has a power-of-two element count (as it does here).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
A-SIMD Area: SIMD (Single Instruction Multiple Data) C-bug Category: This is a bug. O-x86_64 Target: x86-64 processors (like x86_64-*)
Projects
None yet
Development

No branches or pull requests

2 participants