Skip to content

Commit

Permalink
x64: 'syscall' instruction support
Browse files Browse the repository at this point in the history
Enable "fast system calls" via the 'syscall' instruction on OSv. The
instruction is used by Go programs on Linux/x86-64 for system calls.

Fixes cloudius-systems#590

Signed-off-by: Pekka Enberg <[email protected]>
Signed-off-by: Benoît Canet <[email protected]>
Message-Id: <[email protected]>
  • Loading branch information
Benoit Canet authored and myechuri committed Jun 22, 2017
1 parent b392f90 commit 9b014ce
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 0 deletions.
25 changes: 25 additions & 0 deletions arch/x64/arch-setup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
*/

#include "arch.hh"
#include "arch-cpu.hh"
#include "arch-setup.hh"
#include <osv/mempool.hh>
#include <osv/mmu.hh>
#include "processor.hh"
#include "processor-flags.h"
#include "msr.hh"
#include "xen.hh"
#include <osv/elf.hh>
Expand Down Expand Up @@ -213,13 +215,36 @@ static inline void disable_pic()
XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); processor::outb(0xff, 0xa1); }, {});
}

extern "C" void syscall_entry(void);

// SYSCALL Enable
static const int IA32_EFER_SCE = 0x1 << 0;
// Selector shift
static const int CS_SELECTOR_SHIFT = 3;
// syscall shift
static const int IA_32_STAR_SYSCALL_SHIFT = 32;

static void setup_syscall()
{
unsigned long cs = gdt_cs;
processor::wrmsr(msr::IA32_STAR, (cs << CS_SELECTOR_SHIFT) << IA_32_STAR_SYSCALL_SHIFT);
// lstar is where syscall set rip so we set it to syscall_entry
processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast<uint64_t>(syscall_entry));
// syscall does rflag = rflag and not fmask
// we want no minimize the impact of the syscall instruction so we choose
// fmask as zero
processor::wrmsr(msr::IA32_FMASK, 0);
processor::wrmsr(msr::IA32_EFER, processor::rdmsr(msr::IA32_EFER) | IA32_EFER_SCE);
}

void arch_init_premain()
{
auto omb = *osv_multiboot_info;
if (omb.disk_err)
debug_early_u64("Error reading disk (real mode): ", static_cast<u64>(omb.disk_err));

disable_pic();
setup_syscall();
}

#include "drivers/driver.hh"
Expand Down
96 changes: 96 additions & 0 deletions arch/x64/entry.S
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,99 @@ call_signal_handler_thunk:
iretq
.cfi_endproc

.align 16
.global syscall_entry
syscall_entry:
.type syscall_entry, @function
.cfi_startproc simple
# There is no ring transition and rflags are left unchanged.

#
# From http://stackoverflow.com/questions/2535989/what-are-the-calling-conventions-for-unix-linux-system-calls-on-x86-64:
# "User-level applications use as integer registers for passing the sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface uses %rdi, %rsi, %rdx, %r10, %r8 and %r9"

# FIXME: fpu
# build the stack frame by hand
pushq %rsp
subq $8, %rsp # rip was saved in rcx by the syscall instruction
pushq %rax
pushq %rbx
pushq %rcx # contains rip before syscall instruction
pushq %rdx
pushq %rsi
pushq %rdi
pushq %r8
pushq %r9
pushq %r10
pushq %r11 # contains rflags before syscall instruction
pushq %r12
pushq %r13
pushq %r14
pushq %r15

# stack contains a signal_frame
.cfi_signal_frame
.cfi_def_cfa %rsp, 0
.cfi_register rip,rcx # rcx took previous rip value
.cfi_register rflags,r11 # r11 took previous rflags value
.cfi_undefined rcx # was overwritten with rip by the syscall instruction
.cfi_undefined r11 # was overwritten with rflags by the syscall instruction
.cfi_offset %r15, 0x00
.cfi_offset %r14, 0x08
.cfi_offset %r13, 0x10
.cfi_offset %r12, 0x18
.cfi_offset %r11, 0x20
.cfi_offset %r10, 0x28
.cfi_offset %r9, 0x30
.cfi_offset %r8, 0x38
.cfi_offset %rbp, 0x40
.cfi_offset %rdi, 0x48
.cfi_offset %rsi, 0x50
.cfi_offset %rdx, 0x58
.cfi_offset %rcx, 0x60
.cfi_offset %rbx, 0x68
.cfi_offset %rax, 0x70
.cfi_offset %rip, 0x80
.cfi_offset %rsp, 0x98

# The kernel interface use r10 as fourth argument while the user interface use rcx
# so overwrite rcx with r10
movq %r10, %rcx

# prepare function call parameter: r9 is on the stack since it's the seventh param
# because we shift existing params by one to make room for syscall number
pushq %r9
movq %r8, %r9
movq %rcx, %r8
movq %rdx, %rcx
movq %rsi, %rdx
movq %rdi, %rsi
# syscall number from rax as first argument
movq %rax, %rdi

callq syscall_wrapper

popq %r9
# in Linux user and kernel return value are in rax so we have nothing to do for return values

popq %r15
popq %r14
popq %r13
popq %r12
popq %r11
popq %r10
popq %r9
popq %r8
popq %rdi
popq %rsi
popq %rdx
popq %rcx
popq %rbx
addq $8, %rsp # skip rax emplacement (return value is in rax)
addq $8, %rsp # rip emplacement (rip cannot be popped)
popq %rsp

# jump to rcx where the syscall instruction put rip
# (sysret would leave rxc cloberred so we have nothing to do to restore it)
jmpq *%rcx
.cfi_endproc
3 changes: 3 additions & 0 deletions arch/x64/msr.hh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ enum class msr : uint32_t {

IA32_APIC_BASE = 0x0000001b,
IA32_EFER = 0xc0000080,
IA32_STAR = 0xc0000081,
IA32_LSTAR = 0xc0000082,
IA32_FMASK = 0xc0000084,
IA32_FS_BASE = 0xc0000100,

KVM_WALL_CLOCK = 0x11,
Expand Down
13 changes: 13 additions & 0 deletions linux.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,16 @@ long syscall(long number, ...)
return -1;
}
long __syscall(long number, ...) __attribute__((alias("syscall")));

extern "C" long syscall_wrapper(long number, ...)
{
int errno_backup = errno;
// syscall and function return value are in rax
auto ret = syscall(number);
int result = -errno;
errno = errno_backup;
if (ret < 0 && ret >= -4096) {
return result;
}
return ret;
}

0 comments on commit 9b014ce

Please sign in to comment.