Align destination in mem* instructions.
While misaligned reads are generally fast, misaligned writes aren't and
can have severe penalties.
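
To illustrate the approach in plain Rust (a minimal sketch, not the committed code): each operation is split into a short unaligned head that brings `dest` up to an 8-byte boundary, an aligned `rep movsq`/`rep stosq` bulk phase, and a tail of at most 7 bytes. The helper name `split_for_alignment` below is chosen purely for illustration; it mirrors the `rep_param` helper added further down in this diff.

/// Split a `count`-byte operation at `dest` into (head, qwords, tail)
/// so that the qword phase writes to an 8-byte-aligned destination.
/// Mirrors the `rep_param` helper in the diff below.
fn split_for_alignment(dest: usize, mut count: usize) -> (usize, usize, usize) {
    // Bytes needed to reach the next 8-byte boundary (0 if already aligned),
    // capped at `count` so tiny copies stay correct.
    let head = ((8 - (dest & 0b111)) & 0b111).min(count);
    count -= head;
    (head, count >> 3, count & 0b111)
}

fn main() {
    // dest = 0x1003: 5 head bytes bring the destination to 0x1008,
    // then 12 qwords (96 bytes) plus a 3-byte tail cover the remaining 99.
    assert_eq!(split_for_alignment(0x1003, 104), (5, 12, 3));
    // Already aligned destination: no head bytes at all.
    assert_eq!(split_for_alignment(0x2000, 20), (0, 2, 4));
}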
Demindiro committed Jul 3, 2022
1 parent f10dbd9 commit db0ca0c
Showing 1 changed file with 94 additions and 36 deletions: src/mem/x86_64.rs
@@ -16,6 +16,7 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::intrinsics;
use core::mem;

@@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe movsq (%rsi), (%rdi)",
"mov {byte_count:e}, %ecx",
"repe movsb (%rsi), (%rdi)",
byte_count = in(reg) byte_count,
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
// It also allows us to trivially skip the rep movsb, which is faster when memcpying
// aligned data.
if pre_byte_count > 0 {
asm!(
"rep movsb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
inout("rsi") src => src,
options(nostack, preserves_flags)
);
}
asm!(
"rep movsq",
inout("rcx") qword_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
inout("rdi") dest => dest,
inout("rsi") src => src,
options(nostack, preserves_flags)
);
if byte_count > 0 {
asm!(
"rep movsb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(nostack, preserves_flags)
);
}
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count);
// We can't separate this block due to std/cld
asm!(
"std",
"repe movsq (%rsi), (%rdi)",
"movl {byte_count:e}, %ecx",
"addq $7, %rdi",
"addq $7, %rsi",
"repe movsb (%rsi), (%rdi)",
"rep movsb",
"sub rsi, 7",
"sub rdi, 7",
"mov rcx, {qword_count}",
"rep movsq",
"add rsi, 7",
"add rdi, 7",
"mov ecx, {byte_count:e}",
"rep movsb",
"cld",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest.add(count).wrapping_sub(8) => _,
inout("rsi") src.add(count).wrapping_sub(8) => _,
options(att_syntax, nostack)
qword_count = in(reg) qword_count,
inout("ecx") pre_byte_count => _,
inout("rdi") dest.add(count - 1) => _,
inout("rsi") src.add(count - 1) => _,
// We modify flags, but we restore it afterwards
options(nostack, preserves_flags)
);
}

@@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe stosq %rax, (%rdi)",
"mov {byte_count:e}, %ecx",
"repe stosb %al, (%rdi)",
byte_count = in(reg) byte_count,
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
// It also allows us to trivially skip the rep stosb, which is faster when memcpying
// aligned data.
if pre_byte_count > 0 {
asm!(
"rep stosb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
in("al") c,
options(nostack, preserves_flags)
);
}
asm!(
"rep stosq",
inout("rcx") qword_count => _,
inout("rdi") dest => _,
inout("rdi") dest => dest,
in("rax") (c as u64) * 0x0101010101010101,
options(att_syntax, nostack, preserves_flags)
options(nostack, preserves_flags)
);
if byte_count > 0 {
asm!(
"rep stosb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
in("al") c,
options(nostack, preserves_flags)
);
}
}

#[inline(always)]
@@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
c16(a.cast(), b.cast(), n)
}
}

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
count -= pre_byte_count;
let qword_count = count >> 3;
let byte_count = count & 0b111;
(pre_byte_count, qword_count, byte_count)
}

/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set).
fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
count -= pre_byte_count;
let qword_count = count >> 3;
let byte_count = count & 0b111;
(pre_byte_count, qword_count, byte_count)
}
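
As a quick sanity check of the reverse variant (again only a sketch, with the standalone name `split_for_alignment_rev` chosen for illustration): the head is taken from the high end of the range, because the descending copy runs first and must leave the `rep movsq` phase ending on an 8-byte-aligned destination.

/// Standalone copy of the reverse-split arithmetic, for illustration only.
fn split_for_alignment_rev(dest: usize, mut count: usize) -> (usize, usize, usize) {
    // Head bytes are taken from the *end* of the range so that the
    // descending qword phase finishes on an 8-byte boundary.
    let head = ((dest + count) & 0b111).min(count);
    count -= head;
    (head, count >> 3, count & 0b111)
}

fn main() {
    // dest = 0x1000, count = 13: the range ends at 0x100D, so 5 head bytes
    // are copied first (top-down), leaving exactly one qword at 0x1000..0x1008.
    assert_eq!(split_for_alignment_rev(0x1000, 13), (5, 1, 0));
    // Range already ends on a boundary: no head bytes needed.
    assert_eq!(split_for_alignment_rev(0x0ff8, 24), (0, 3, 0));
}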
