diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc index 5e76d82da9..0994de579d 100644 --- a/arch/x64/arch-setup.cc +++ b/arch/x64/arch-setup.cc @@ -6,10 +6,12 @@ */ #include "arch.hh" +#include "arch-cpu.hh" #include "arch-setup.hh" #include #include #include "processor.hh" +#include "processor-flags.h" #include "msr.hh" #include "xen.hh" #include @@ -213,6 +215,28 @@ static inline void disable_pic() XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); processor::outb(0xff, 0xa1); }, {}); } +extern "C" void syscall_entry(void); + +// SYSCALL Enable +static const int IA32_EFER_SCE = 0x1 << 0; +// Selector shift +static const int CS_SELECTOR_SHIFT = 3; +// syscall shift +static const int IA_32_STAR_SYSCALL_SHIFT = 32; + +static void setup_syscall() +{ + unsigned long cs = gdt_cs; + processor::wrmsr(msr::IA32_STAR, (cs << CS_SELECTOR_SHIFT) << IA_32_STAR_SYSCALL_SHIFT); + // lstar is where syscall set rip so we set it to syscall_entry + processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast(syscall_entry)); + // syscall does rflag = rflag and not fmask + // we want no minimize the impact of the syscall instruction so we choose + // fmask as zero + processor::wrmsr(msr::IA32_FMASK, 0); + processor::wrmsr(msr::IA32_EFER, processor::rdmsr(msr::IA32_EFER) | IA32_EFER_SCE); +} + void arch_init_premain() { auto omb = *osv_multiboot_info; @@ -220,6 +244,7 @@ void arch_init_premain() debug_early_u64("Error reading disk (real mode): ", static_cast(omb.disk_err)); disable_pic(); + setup_syscall(); } #include "drivers/driver.hh" diff --git a/arch/x64/entry.S b/arch/x64/entry.S index b6f5abec43..0526aa741e 100644 --- a/arch/x64/entry.S +++ b/arch/x64/entry.S @@ -159,3 +159,99 @@ call_signal_handler_thunk: iretq .cfi_endproc +.align 16 +.global syscall_entry +syscall_entry: + .type syscall_entry, @function + .cfi_startproc simple + # There is no ring transition and rflags are left unchanged. 
.align 16
.global syscall_entry
syscall_entry:
    .type syscall_entry, @function
    .cfi_startproc simple
    # Target of the SYSCALL instruction (installed in IA32_LSTAR).
    # There is no ring transition and rflags are left unchanged, so this code
    # keeps running on the stack of whoever executed SYSCALL.
    #
    # From http://stackoverflow.com/questions/2535989/what-are-the-calling-conventions-for-unix-linux-system-calls-on-x86-64:
    # "User-level applications use as integer registers for passing the sequence
    #  %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface uses
    #  %rdi, %rsi, %rdx, %r10, %r8 and %r9"
    #
    # On entry: rax = syscall number, rcx = return rip, r11 = rflags.
    #
    # NOTE(review): nothing here skips the 128-byte red zone below the caller's
    # rsp or realigns rsp to 16 bytes before the call below - confirm whether
    # callers can rely on either.

    # FIXME: fpu
    # Build the stack frame by hand. pushq %rsp stores the value rsp had before
    # the push, i.e. the caller's stack pointer.
    pushq %rsp
    subq $8, %rsp # rip slot, left unwritten: rip was saved in rcx by the syscall instruction
    pushq %rax
    pushq %rbx
    pushq %rcx # contains rip before syscall instruction
    pushq %rdx
    pushq %rsi
    pushq %rdi
    pushq %r8
    pushq %r9
    pushq %r10
    pushq %r11 # contains rflags before syscall instruction
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    # Frame layout, as offsets from rsp at this point (rbp is not saved):
    #   0x00 r15   0x08 r14   0x10 r13   0x18 r12
    #   0x20 r11 (caller's rflags)       0x28 r10
    #   0x30 r9    0x38 r8    0x40 rdi   0x48 rsi   0x50 rdx
    #   0x58 rcx (caller's rip)          0x60 rbx   0x68 rax
    #   0x70 unused rip slot             0x78 caller's rsp
    .cfi_signal_frame
    .cfi_def_cfa %rsp, 0
    .cfi_undefined rcx # was overwritten with rip by the syscall instruction
    .cfi_undefined r11 # was overwritten with rflags by the syscall instruction
    .cfi_offset %r15, 0x00
    .cfi_offset %r14, 0x08
    .cfi_offset %r13, 0x10
    .cfi_offset %r12, 0x18
    .cfi_offset rflags, 0x20 # the r11 slot holds the previous rflags value
    .cfi_offset %r10, 0x28
    .cfi_offset %r9, 0x30
    .cfi_offset %r8, 0x38
    .cfi_offset %rdi, 0x40
    .cfi_offset %rsi, 0x48
    .cfi_offset %rdx, 0x50
    .cfi_offset %rip, 0x58 # the rcx slot holds the previous rip value
    .cfi_offset %rbx, 0x60
    .cfi_offset %rax, 0x68
    .cfi_offset %rsp, 0x78

    # The kernel interface uses r10 as fourth argument while the user interface
    # uses rcx, so overwrite rcx with r10.
    movq %r10, %rcx

    # Prepare the function call parameters: every existing parameter shifts by
    # one to make room for the syscall number, so r9 becomes the seventh
    # parameter and goes on the stack.
    pushq %r9
    .cfi_adjust_cfa_offset 8 # keep the frame offsets above valid across the extra push
    movq %r8, %r9
    movq %rcx, %r8
    movq %rdx, %rcx
    movq %rsi, %rdx
    movq %rdi, %rsi
    # syscall number from rax as first argument
    movq %rax, %rdi

    callq syscall_wrapper

    # drop the stack-passed seventh parameter (r9 itself is reloaded below)
    popq %r9
    .cfi_adjust_cfa_offset -8
    # In Linux, user and kernel return values are both in rax, so there is
    # nothing to do for the return value.

    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %r11
    popq %r10
    popq %r9
    popq %r8
    popq %rdi
    popq %rsi
    popq %rdx
    popq %rcx
    popq %rbx
    addq $8, %rsp # skip the rax slot (return value is in rax)
    addq $8, %rsp # skip the rip slot (rip cannot be popped)
    popq %rsp

    # Jump to rcx, where the syscall instruction put rip
    # (sysret would leave rcx clobbered, so we have nothing to do to restore it).
    jmpq *%rcx
    .cfi_endproc
// C entry point called by the syscall_entry assembly stub.
//
// number is the syscall number; the variadic part carries the six syscall
// arguments that the stub always supplies. The call is dispatched through
// syscall() and the result is converted to the kernel convention: a result in
// [-4096, -1] is reported as -errno, anything else is returned unchanged.
// The caller's errno is preserved across the call.
extern "C" long syscall_wrapper(long number, ...)
{
    // Fetch all six arguments so they actually reach syscall(). Compiler
    // builtins are used so that this block needs no additional header.
    long args[6];
    __builtin_va_list ap;
    __builtin_va_start(ap, number);
    for (auto& arg : args) {
        arg = __builtin_va_arg(ap, long);
    }
    __builtin_va_end(ap);

    const int errno_backup = errno;
    // syscall and function return value are in rax
    const long ret = syscall(number, args[0], args[1], args[2],
                             args[3], args[4], args[5]);
    const int result = -errno;
    // code executing a raw SYSCALL does not expect errno to change
    errno = errno_backup;
    if (ret < 0 && ret >= -4096) {
        return result;
    }
    return ret;
}