Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align destination in x86_64's mem* instructions. #474

Merged
merged 6 commits into from
Jul 28, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 71 additions & 33 deletions src/mem/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::intrinsics;
use core::mem;

Expand All @@ -34,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe movsq (%rsi), (%rdi)",
"mov {byte_count:e}, %ecx",
"repe movsb (%rsi), (%rdi)",
byte_count = in(reg) byte_count,
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
asm!(
"rep movsb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
inout("rsi") src => src,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep movsq",
inout("rcx") qword_count => _,
inout("rdi") dest => dest,
inout("rsi") src => src,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep movsb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
Expand All @@ -52,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// We can't separate this block due to std/cld
asm!(
"std",
"repe movsq (%rsi), (%rdi)",
"movl {byte_count:e}, %ecx",
"addq $7, %rdi",
"addq $7, %rsi",
"repe movsb (%rsi), (%rdi)",
"rep movsb",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to skip the rep movsb if the count is zero (like we do for the other functions?

"sub $7, %rsi",
"sub $7, %rdi",
"mov {qword_count}, %rcx",
"rep movsq",
"test {pre_byte_count:e}, {pre_byte_count:e}",
"add $7, %rsi",
"add $7, %rdi",
"mov {pre_byte_count:e}, %ecx",
"rep movsb",
"cld",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest.add(count).wrapping_sub(8) => _,
inout("rsi") src.add(count).wrapping_sub(8) => _,
options(att_syntax, nostack)
pre_byte_count = in(reg) pre_byte_count,
qword_count = in(reg) qword_count,
inout("ecx") byte_count => _,
inout("rdi") dest.add(count - 1) => _,
inout("rsi") src.add(count - 1) => _,
// We modify flags, but we restore it afterwards
options(att_syntax, nostack, preserves_flags)
);
}

Expand All @@ -86,18 +103,29 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe stosq %rax, (%rdi)",
"mov {byte_count:e}, %ecx",
"repe stosb %al, (%rdi)",
byte_count = in(reg) byte_count,
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
let c = c as u64 * 0x0101_0101_0101_0101;
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
asm!(
"rep stosb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep stosq",
inout("rcx") qword_count => _,
inout("rdi") dest => dest,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep stosb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
in("rax") (c as u64) * 0x0101010101010101,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
}
Expand Down Expand Up @@ -156,3 +184,13 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
c16(a.cast(), b.cast(), n)
}
}

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
count -= pre_byte_count;
let qword_count = count >> 3;
let byte_count = count & 0b111;
(pre_byte_count, qword_count, byte_count)
}