Skip to content

Commit

Permalink
Auto merge of rust-lang#111999 - scottmcm:codegen-less-memcpy, r=comp…
Browse files Browse the repository at this point in the history
…iler-errors

Use `load`+`store` instead of `memcpy` for small integer arrays

I was inspired by rust-lang#98892 to see whether, rather than making `mem::swap` do something smart in the library, we could update MIR assignments like `*_1 = *_2` to do something smarter than `memcpy` for sufficiently-small types that doing it inline is going to be better than a `memcpy` call in assembly anyway.  After all, special code may help `mem::swap`, but if the "obvious" MIR can just result in the correct thing that helps everything -- other code like `mem::replace`, people doing it manually, and just passing around by value in general -- as well as makes MIR inlining happier since it doesn't need to deal with all the complicated library code if it just sees a couple assignments.

LLVM will turn the short, known-length `memcpy`s into direct instructions in the backend, but that's too late for it to be able to remove `alloca`s.  In general, replacing `memcpy`s with typed instructions is hard in the middle-end -- even for `memcpy.inline` where it knows it won't be a function call -- is hard [due to poison propagation issues](https://rust-lang.zulipchat.com/#narrow/stream/187780-t-compiler.2Fwg-llvm/topic/memcpy.20vs.20load-store.20for.20MIR.20assignments/near/360376712).  So because we know more about the type invariants -- these are typed copies -- rustc can emit something more specific, allowing LLVM to `mem2reg` away the `alloca`s in some situations.

rust-lang#52051 previously did something like this in the library for `mem::swap`, but it ended up regressing during enabling mir inlining (rust-lang@cbbf06b), so this has been suboptimal on stable for ≈5 releases now.

The code in this PR is narrowly targeted at just integer arrays in LLVM, but works via a new method on the [`LayoutTypeMethods`](https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/trait.LayoutTypeMethods.html) trait, so specific backends based on cg_ssa can enable this for more situations over time, as we find them.  I don't want to try to bite off too much in this PR, though.  (Transparent newtypes and simple things like the 3×usize `String` would be obvious candidates for a follow-up.)

Codegen demonstrations: <https://llvm.godbolt.org/z/fK8hT9aqv>

Before:
```llvm
define void `@swap_rgb48_old(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #1 {
  %a.i = alloca [3 x i16], align 2
  call void `@llvm.lifetime.start.p0(i64` 6, ptr nonnull %a.i)
  call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %a.i, ptr noundef nonnull align 2 dereferenceable(6) %x, i64 6, i1 false)
  tail call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %x, ptr noundef nonnull align 2 dereferenceable(6) %y, i64 6, i1 false)
  call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %y, ptr noundef nonnull align 2 dereferenceable(6) %a.i, i64 6, i1 false)
  call void `@llvm.lifetime.end.p0(i64` 6, ptr nonnull %a.i)
  ret void
}
```
Note it going to stack:
```nasm
swap_rgb48_old:                         # `@swap_rgb48_old`
        movzx   eax, word ptr [rdi + 4]
        mov     word ptr [rsp - 4], ax
        mov     eax, dword ptr [rdi]
        mov     dword ptr [rsp - 8], eax
        movzx   eax, word ptr [rsi + 4]
        mov     word ptr [rdi + 4], ax
        mov     eax, dword ptr [rsi]
        mov     dword ptr [rdi], eax
        movzx   eax, word ptr [rsp - 4]
        mov     word ptr [rsi + 4], ax
        mov     eax, dword ptr [rsp - 8]
        mov     dword ptr [rsi], eax
        ret
```

Now:
```llvm
define void `@swap_rgb48(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #0 {
start:
  %0 = load <3 x i16>, ptr %x, align 2
  %1 = load <3 x i16>, ptr %y, align 2
  store <3 x i16> %1, ptr %x, align 2
  store <3 x i16> %0, ptr %y, align 2
  ret void
}
```
still lowers to `dword`+`word` operations, but has no stack traffic:
```nasm
swap_rgb48:                             # `@swap_rgb48`
        mov     eax, dword ptr [rdi]
        movzx   ecx, word ptr [rdi + 4]
        movzx   edx, word ptr [rsi + 4]
        mov     r8d, dword ptr [rsi]
        mov     dword ptr [rdi], r8d
        mov     word ptr [rdi + 4], dx
        mov     word ptr [rsi + 4], cx
        mov     dword ptr [rsi], eax
        ret
```

And as a demonstration that this isn't just `mem::swap`, a `mem::replace` on a small array (since replace doesn't use swap since rust-lang#83022), which used to be `memcpy`s in LLVM changes in IR
```llvm
define void `@replace_short_array(ptr` noalias nocapture noundef sret([3 x i32]) dereferenceable(12) %0, ptr noalias noundef align 4 dereferenceable(12) %r, ptr noalias nocapture noundef readonly dereferenceable(12) %v) unnamed_addr #0 {
start:
  %1 = load <3 x i32>, ptr %r, align 4
  store <3 x i32> %1, ptr %0, align 4
  %2 = load <3 x i32>, ptr %v, align 4
  store <3 x i32> %2, ptr %r, align 4
  ret void
}
```
but that lowers to reasonable `dword`+`qword` instructions still
```nasm
replace_short_array:                    # `@replace_short_array`
        mov     rax, rdi
        mov     rcx, qword ptr [rsi]
        mov     edi, dword ptr [rsi + 8]
        mov     dword ptr [rax + 8], edi
        mov     qword ptr [rax], rcx
        mov     rcx, qword ptr [rdx]
        mov     edx, dword ptr [rdx + 8]
        mov     dword ptr [rsi + 8], edx
        mov     qword ptr [rsi], rcx
        ret
```
  • Loading branch information
bors committed Jun 6, 2023
2 parents adc719d + e1b020d commit fd9bf59
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 6 deletions.
3 changes: 3 additions & 0 deletions compiler/rustc_codegen_llvm/src/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
ty.llvm_type(self)
}
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
layout.scalar_copy_llvm_type(self)
}
}

impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
Expand Down
33 changes: 33 additions & 0 deletions compiler/rustc_codegen_llvm/src/type_of.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use rustc_middle::bug;
use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
use rustc_target::abi::HasDataLayout;
use rustc_target::abi::{Abi, Align, FieldsShape};
use rustc_target::abi::{Int, Pointer, F32, F64};
use rustc_target::abi::{PointeeInfo, Scalar, Size, TyAbiInterface, Variants};
Expand Down Expand Up @@ -192,6 +193,7 @@ pub trait LayoutLlvmExt<'tcx> {
) -> &'a Type;
fn llvm_field_index<'a>(&self, cx: &CodegenCx<'a, 'tcx>, index: usize) -> u64;
fn pointee_info_at<'a>(&self, cx: &CodegenCx<'a, 'tcx>, offset: Size) -> Option<PointeeInfo>;
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
}

impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
Expand Down Expand Up @@ -414,4 +416,35 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
cx.pointee_infos.borrow_mut().insert((self.ty, offset), result);
result
}

fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
debug_assert!(self.is_sized());

// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
// (like AVX2 registers) seems at least like a tolerable starting point.
let threshold = cx.data_layout().pointer_size * 4;
if self.layout.size() > threshold {
return None;
}

// Vectors, even for non-power-of-two sizes, have the same layout as
// arrays but don't count as aggregate types
if let FieldsShape::Array { count, .. } = self.layout.fields()
&& let element = self.field(cx, 0)
&& element.ty.is_integral()
{
// `cx.type_ix(bits)` is tempting here, but while that works great
// for things that *stay* as memory-to-memory copies, it also ends
// up suppressing vectorization as it introduces shifts when it
// extracts all the individual values.

let ety = element.llvm_type(cx);
return Some(cx.type_vector(ety, *count));
}

// FIXME: The above only handled integer arrays; surely more things
// would also be possible. Be careful about provenance, though!
None
}
}
14 changes: 13 additions & 1 deletion compiler/rustc_codegen_ssa/src/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,19 @@ pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
return;
}

bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
if flags == MemFlags::empty()
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
{
// I look forward to only supporting opaque pointers
let pty = bx.type_ptr_to(bty);
let src = bx.pointercast(src, pty);
let dst = bx.pointercast(dst, pty);

let temp = bx.load(bty, src, src_align);
bx.store(temp, dst, dst_align);
} else {
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
}
}

pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
Expand Down
22 changes: 22 additions & 0 deletions compiler/rustc_codegen_ssa/src/traits/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,28 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
index: usize,
immediate: bool,
) -> Self::Type;

/// A type that can be used in a [`super::BuilderMethods::load`] +
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
/// such as a MIR `*_0 = *_1`.
///
/// It's always legal to return `None` here, as the provided impl does,
/// in which case callers should use [`super::BuilderMethods::memcpy`]
/// instead of the `load`+`store` pair.
///
/// This can be helpful for things like arrays, where the LLVM backend type
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
/// instead be copied via an `i48` that stays as the single `load`+`store`.
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
/// cases, due to `poison` handling, but in codegen we have more information
/// about the type invariants, so can emit something better instead.)
///
/// This *should* return `None` for particularly-large types, where leaving
/// the `memcpy` may well be important to avoid code size explosion.
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
let _ = layout;
None
}
}

// For backends that support CFI using type membership (i.e., testing whether a given pointer is
Expand Down
35 changes: 35 additions & 0 deletions tests/codegen/array-codegen.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// compile-flags: -O -C no-prepopulate-passes
// min-llvm-version: 15.0 (for opaque pointers)

#![crate_type = "lib"]

// CHECK-LABEL: @array_load
#[no_mangle]
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
// CHECK: %0 = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %0, align 1
// CHECK: %[[TEMP2:.+]] = load i32, ptr %0, align 1
// CHECK: ret i32 %[[TEMP2]]
*a
}

// CHECK-LABEL: @array_store
#[no_mangle]
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
// CHECK: %a = alloca [4 x i8]
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
*p = a;
}

// CHECK-LABEL: @array_copy
#[no_mangle]
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
*p = *a;
}
11 changes: 11 additions & 0 deletions tests/codegen/mem-replace-simple-type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,14 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str {
// CHECK: ret { ptr, i64 } %[[P2]]
std::mem::replace(r, v)
}

#[no_mangle]
// CHECK-LABEL: @replace_short_array(
pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
// CHECK-NOT: alloca
// CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4
// CHECK: store <3 x i32> %[[R]], ptr %0
// CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4
// CHECK: store <3 x i32> %[[V]], ptr %r
std::mem::replace(r, v)
}
9 changes: 9 additions & 0 deletions tests/codegen/swap-simd-types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,12 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
x.swap_with_slice(y);
}
}

// CHECK-LABEL: @swap_bytes32
#[no_mangle]
pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
// CHECK-NOT: alloca
// CHECK: load <32 x i8>{{.+}}align 1
// CHECK: store <32 x i8>{{.+}}align 1
swap(x, y)
}
25 changes: 20 additions & 5 deletions tests/codegen/swap-small-types.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// compile-flags: -O
// compile-flags: -O -Z merge-functions=disabled
// only-x86_64
// ignore-debug: the debug assertions get in the way

Expand All @@ -8,13 +8,28 @@ use std::mem::swap;

type RGB48 = [u16; 3];

// CHECK-LABEL: @swap_rgb48_manually(
#[no_mangle]
pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP0:.+]] = load <3 x i16>, ptr %x, align 2
// CHECK: %[[TEMP1:.+]] = load <3 x i16>, ptr %y, align 2
// CHECK: store <3 x i16> %[[TEMP1]], ptr %x, align 2
// CHECK: store <3 x i16> %[[TEMP0]], ptr %y, align 2

let temp = *x;
*x = *y;
*y = temp;
}

// CHECK-LABEL: @swap_rgb48
#[no_mangle]
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
// FIXME MIR inlining messes up LLVM optimizations.
// WOULD-CHECK-NOT: alloca
// WOULD-CHECK: load i48
// WOULD-CHECK: store i48
// CHECK-NOT: alloca
// CHECK: load <3 x i16>
// CHECK: load <3 x i16>
// CHECK: store <3 x i16>
// CHECK: store <3 x i16>
swap(x, y)
}

Expand Down

0 comments on commit fd9bf59

Please sign in to comment.