Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

seccomp: enosys: always return -ENOSYS for setup(2) #3474

Merged
merged 2 commits into from
May 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* In case the runc binary resides on tmpfs, `runc init` no longer re-execs
itself twice. (#3342)
* Our seccomp `-ENOSYS` stub now correctly handles multiplexed syscalls on
s390 and s390x. This solves the issue where syscalls the host kernel did not
support would return `-EPERM` despite the existence of the `-ENOSYS` stub
code (this was due to how s390x does syscall multiplexing). (#3474)

## [1.1.0] - 2022-01-14

Expand Down
70 changes: 49 additions & 21 deletions libcontainer/seccomp/patchbpf/enosys_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ import "C"

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

// Assume sizeof(int) == 4 in the BPF program.
const bpfSizeofInt = 4

// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explcitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

func isAllowAction(action configs.Action) bool {
switch action {
// Trace is considered an "allow" action because a good tracer should
Expand All @@ -94,15 +102,14 @@ func isAllowAction(action configs.Action) bool {

func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
var program []bpf.RawInstruction
loop:
for {
// Read the next instruction. We have to use NativeEndian because
// seccomp_export_bpf outputs the program in *host* endian-ness.
var insn unix.SockFilter
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
if errors.Is(err, io.EOF) {
// Parsing complete.
break loop
break
}
if errors.Is(err, io.ErrUnexpectedEOF) {
// Parsing stopped mid-instruction.
Expand Down Expand Up @@ -315,19 +322,46 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// directly from the arch code so we need to do it here. Sadly we can't
// share this code between architecture branches.
section := []bpf.Instruction{
// load [0]
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
// load [0] (syscall number)
bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
}

switch len(maxSyscalls) {
case 0:
// No syscalls found for this arch -- skip it and move on.
continue
case 1:
// Get the only syscall in the map.
var sysno libseccomp.ScmpSyscall
for _, no := range maxSyscalls {
// Get the only syscall and scmpArch in the map.
var (
scmpArch libseccomp.ScmpArch
sysno libseccomp.ScmpSyscall
)
for arch, no := range maxSyscalls {
sysno = no
scmpArch = arch
}

switch scmpArch {
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
// multiplexing "large syscall number" syscalls, but if the syscall
// number is not known to the kernel then the syscall number is
// left unchanged (and because it is sysno=0, you'll end up with
// EPERM for syscalls the kernel doesn't know about).
//
// The actual setup(2) syscall is never used by userspace anymore
// (and hasn't existed for decades) outside of this multiplexing
// scheme so returning -ENOSYS is fine.
case libseccomp.ArchS390, libseccomp.ArchS390X:
section = append(section, []bpf.Instruction{
// jne [setup=0],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(s390xMultiplexSyscall),
SkipTrue: 1,
},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
}...)
}

// The simplest case just boils down to a single jgt instruction,
Expand All @@ -349,8 +383,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
sectionTail = []bpf.Instruction{
// jle [syscall],1
bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
Expand All @@ -359,12 +393,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Grab the only architecture in the map.
var scmpArch libseccomp.ScmpArch
for arch := range maxSyscalls {
scmpArch = arch
}

// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
Expand Down Expand Up @@ -440,7 +468,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jset (1<<30),1
// jgt [x86 syscall],1,2
// jle [x32 syscall],1
// ja [baseJumpEnosys+1]
// ret [ENOSYS]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
Expand All @@ -451,14 +479,14 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
Val: uint32(x86sysno),
SkipTrue: 1, SkipFalse: 2,
},
// jle [x32 syscall],[baseJumpEnosys]
// jle [x32 syscall],1
bpf.JumpIf{
Cond: bpf.JumpLessOrEqual,
Val: uint32(x32sysno),
SkipTrue: 1,
},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
Expand Down Expand Up @@ -522,8 +550,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)

// Prepend the load instruction for the architecture.
programTail = append([]bpf.Instruction{
// load [4]
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
// load [4] (architecture)
bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
}, programTail...)

cyphar marked this conversation as resolved.
Show resolved Hide resolved
// And that's all folks!
Expand Down
13 changes: 13 additions & 0 deletions libcontainer/seccomp/patchbpf/enosys_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
})
}

// If we're on s390(x) make sure you get -ENOSYS for the "setup"
// syscall (this is done to work around an issue with s390x's
// syscall multiplexing which results in unknown syscalls being a
// setup(2) invocation).
switch scmpArch {
case libseccomp.ArchS390, libseccomp.ArchS390X:
syscallTests = append(syscallTests, syscallTest{
sysno: s390xMultiplexSyscall,
syscall: "setup",
expected: retErrnoEnosys,
})
}

// Test syscalls in the explicit list.
for _, test := range syscallTests {
// Override the expected value in the two special cases.
Expand Down