Add ability in policy to allow/disallow access to stdio (#1594)

This commit adds the ability of policy at the time of create container to allow or disallow access to standard io for that container. And on the external process side, if an external process is allowed to access standard io. This is done in the same way as dropping environment variables is implemented. At policy enforcement time, policy will indicate if standard io access is allowed as part of the create being allowed. So like with environment variables where it is "allow, but only with these environment variables" now we also have "allow, but do not allow standard io access". Turning off standard io for containers in a way that didn't break some expectation within the hcs/gcs relationship turned out to be remarkably difficult. Maksim and I tried a couple different approaches before settling on the approach of creating a new transport for handling the disallowed standard io access case. One of the things we had attempted was to have special TTY and PipeRelays. However, we abandonded that approach as it resulted in a ton of duplicated code. The "devnull transport approach" that this commit implements doesn't result in duplicated code. And most importantly, has been able to pass testing and not result in bugs somewhere else in the gcs/hcs relationship. When work was started on this, we expected this to take a few days to get correct. It turned out to take several weeks because the hcs/gcs standard io relationship is filled with expectations and invariants that aren't documented and are spread throughtout the code. Maksim and I settled on this approach as we felt it had the lowest overhead for maintenance and was the least likely going forward to introduce sublte bugs while passing current testing. Signed-off-by: Sean T. Allen <[email protected]>
microsoft · Dec 7, 2022 · 9782dee · 9782dee
1 parent 3e090b0
commit 9782dee
Show file tree

Hide file tree

Showing 14 changed files with 696 additions and 215 deletions.
diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
@@ -58,8 +58,9 @@ type Host struct {
 	externalProcesses      map[int]*externalProcess
 
 	// Rtime is the Runtime interface used by the GCS core.
-	rtime runtime.Runtime
-	vsock transport.Transport
+	rtime            runtime.Runtime
+	vsock            transport.Transport
+	devNullTransport transport.Transport
 
 	// state required for the security policy enforcement
 	policyMutex               sync.Mutex
@@ -80,6 +81,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s
 		externalProcesses:         make(map[int]*externalProcess),
 		rtime:                     rtime,
 		vsock:                     vsock,
+		devNullTransport:          &transport.DevNullTransport{},
 		securityPolicyEnforcerSet: false,
 		securityPolicyEnforcer:    initialEnforcer,
 		logWriter:                 logWriter,
@@ -315,7 +317,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		}()
 	}
 
-	envToKeep, err := h.securityPolicyEnforcer.EnforceCreateContainerPolicy(
+	envToKeep, allowStdio, err := h.securityPolicyEnforcer.EnforceCreateContainerPolicy(
 		sandboxID,
 		id,
 		settings.OCISpecification.Process.Args,
@@ -327,6 +329,12 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		return nil, errors.Wrapf(err, "container creation denied due to policy")
 	}
 
+	if !allowStdio {
+		// stdio access isn't allow for this container. Switch to the /dev/null
+		// transport that will eat all input/ouput.
+		c.vsock = h.devNullTransport
+	}
+
 	if envToKeep != nil {
 		settings.OCISpecification.Process.Env = []string(envToKeep)
 	}
@@ -561,7 +569,8 @@ func (h *Host) ExecProcess(ctx context.Context, containerID string, params prot.
 	var c *Container
 	var envToKeep securitypolicy.EnvList
 	if params.IsExternal || containerID == UVMContainerID {
-		envToKeep, err = h.securityPolicyEnforcer.EnforceExecExternalProcessPolicy(
+		var allowStdioAccess bool
+		envToKeep, allowStdioAccess, err = h.securityPolicyEnforcer.EnforceExecExternalProcessPolicy(
 			params.CommandArgs,
 			processParamEnvToOCIEnv(params.Environment),
 			params.WorkingDirectory,
@@ -570,25 +579,42 @@ func (h *Host) ExecProcess(ctx context.Context, containerID string, params prot.
 			return pid, errors.Wrapf(err, "exec is denied due to policy")
 		}
 
+		// It makes no sense to allow access if stdio access is denied and the
+		// process requires a terminal.
+		if params.EmulateConsole && !allowStdioAccess {
+			return pid, errors.New("exec of process that requires terminal access denied due to policy not allowing stdio access")
+		}
+
 		if envToKeep != nil {
 			params.Environment = processOCIEnvToParam(envToKeep)
 		}
 
-		pid, err = h.runExternalProcess(ctx, params, conSettings)
+		var tport = h.vsock
+		if !allowStdioAccess {
+			tport = h.devNullTransport
+		}
+		pid, err = h.runExternalProcess(ctx, params, conSettings, tport)
 	} else if c, err = h.GetCreatedContainer(containerID); err == nil {
 		// We found a V2 container. Treat this as a V2 process.
 		if params.OCIProcess == nil {
 			// We've already done policy enforcement for creating a container so
 			// there's no policy enforcement to do for starting
 			pid, err = c.Start(ctx, conSettings)
 		} else {
+			var allowStdioAccess bool
 			// Windows uses a different field for command, there's no enforcement
 			// around this yet for Windows so this is Linux specific at the moment.
-			envToKeep, err = h.securityPolicyEnforcer.EnforceExecInContainerPolicy(containerID, params.OCIProcess.Args, params.OCIProcess.Env, params.OCIProcess.Cwd)
+			envToKeep, allowStdioAccess, err = h.securityPolicyEnforcer.EnforceExecInContainerPolicy(containerID, params.OCIProcess.Args, params.OCIProcess.Env, params.OCIProcess.Cwd)
 			if err != nil {
 				return pid, errors.Wrapf(err, "exec in container denied due to policy")
 			}
 
+			// It makes no sense to allow access if stdio access is denied and the
+			// process requires a terminal.
+			if params.OCIProcess.Terminal && !allowStdioAccess {
+				return pid, errors.New("exec in container of process that requires terminal access denied due to policy not allowing stdio access")
+			}
+
 			if envToKeep != nil {
 				params.OCIProcess.Env = envToKeep
 			}
@@ -659,9 +685,10 @@ func (h *Host) runExternalProcess(
 	ctx context.Context,
 	params prot.ProcessParameters,
 	conSettings stdio.ConnectionSettings,
+	tport transport.Transport,
 ) (_ int, err error) {
 	var stdioSet *stdio.ConnectionSet
-	stdioSet, err = stdio.Connect(h.vsock, conSettings)
+	stdioSet, err = stdio.Connect(tport, conSettings)
 	if err != nil {
 		return -1, err
 	}

diff --git a/internal/guest/transport/devnull.go b/internal/guest/transport/devnull.go
@@ -0,0 +1,132 @@
+//go:build linux
+// +build linux
+
+package transport
+
+import (
+	"os"
+
+	"github.com/sirupsen/logrus"
+)
+
+// DevNullTransport is a transport that will:
+//
+// For reads: return either closed or EOF for as appropriate.
+// For writers: return either closed or throw away the write as appropriate.
+//
+// The DevNullTransport is used as the container logging transport when stdio
+// access is denied. It is also used for non-terminal external processes (aka
+// a process run in the UVM) when stdio access is denied.
+type DevNullTransport struct{}
+
+func (t *DevNullTransport) Dial(fd uint32) (Connection, error) {
+	logrus.WithFields(logrus.Fields{
+		"fd": fd,
+	}).Info("opengcs::DevNullTransport::Dial")
+
+	return newDevNullConnection(), nil
+}
+
+// devNullConnection is the heart of our new transport. A devNullConnection
+// contains two file descriptors. One for read, and one for write. We need to
+// file descriptors as the code using a transport is written with the
+// expectation of a duplex connection where read and write can be closed
+// independent of one another. The protocol that uses the connection requires
+// a duplex connection in order to work correctly.
+//
+// The original design of devNullConnection didn't use os.File and instead
+// emulated the required behavior and only returned a os.File from the `File`
+// function. However, the amount of state required to manage that was somewhat
+// high and the logic to follow was somewhat convoluted. Using two file handles
+// one read, one write that have /dev/null open results in a much cleaner
+// design and easier to understand semantics. We want to mirror the sematics of
+// /dev/null as a duplex connection, this is the simplest, most understandable
+// way to achieve that.
+type devNullConnection struct {
+	read  *os.File
+	write *os.File
+}
+
+func newDevNullConnection() *devNullConnection {
+	r, _ := os.OpenFile(os.DevNull, os.O_RDONLY, 0)
+	w, _ := os.OpenFile(os.DevNull, os.O_WRONLY, 0)
+
+	return &devNullConnection{read: r, write: w}
+}
+
+func (c *devNullConnection) Close() error {
+	err1 := c.read.Close()
+	err2 := c.write.Close()
+
+	if err1 != nil {
+		return err1
+	}
+
+	if err2 != nil {
+		return err2
+	}
+
+	return nil
+}
+
+func (c *devNullConnection) CloseRead() error {
+	return c.read.Close()
+}
+
+func (c *devNullConnection) CloseWrite() error {
+	return c.write.Close()
+}
+
+func (c *devNullConnection) Read(buf []byte) (int, error) {
+	return c.read.Read(buf)
+}
+
+func (c *devNullConnection) Write(buf []byte) (int, error) {
+	return c.write.Write(buf)
+}
+
+// File() is where our lack of a real duplex connection is problematic. Code
+// that uses a connection sidesteps the connection abstraction and asks for
+// "the file" directly. With vsock, which is an actual duplex connection, this
+// isn't actually a problem as a vsock connection is duplex. It dups the
+// connection and returns and everything works just fine.
+//
+// Our emulating a duplex connection could be problematic depending on what
+// code using the os.File returned from File() expects it semantics to be.
+// In particular, with a dup like vsock does, if you close the os.File returned
+// from File() it closes the connection. That isn't the case with our emulated
+// duplex connection as closing the os.File that devNullConneciton.File()
+// returns has no impact on the `read` and `write` file handles that are used
+// during Read/Write calls.
+//
+// In the current usage in GCS, this isn't problematic but could becomes so.
+// If you end up here because of a bug, here is the statement from the author
+// about ways to fix this problem.
+//
+//  1. File() is a leaky abstraction. A transport should be required to be duplex
+//     so, having a single `File()` call that allows the caller to reach into the
+//     guts of a transport is bad and should be done away with. All usage of a
+//     transport/connection should go through their abstraction, not bypass it
+//     when bypassing is more convenient.
+//  2. If File() didn't return a concrete type like os.File and instead returned
+//     an interface, then a more complicated version of devNullConnection that
+//     doesn't use a file handle could be written and the object that we return
+//     could be able to correctly handle the emulation of a duplex object.
+//
+// Both of the above are somewhat large changes and were not done at the time
+// the initial version of DevNullTransport and devNullConnection were done. It
+// was decided to leave the leaky abstraction in place as the code that uses
+// transports is fiddly and error-prone based on not very well documentated
+// assumptions. If you are considering taking on either 1 or 2 from above, you
+// should have plenty of time on your hands as extensive testing will be
+// required and the introduction of bugs both subtle and obvious is a likely
+// possibility. Aka, reworking this code to remove the leaky abstraction is more
+// than a refactoring and should be done with extreme care.
+func (c *devNullConnection) File() (*os.File, error) {
+	f, err := os.OpenFile(os.DevNull, os.O_RDWR, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	return f, nil
+}
diff --git a/internal/tools/securitypolicy/helpers/helpers.go b/internal/tools/securitypolicy/helpers/helpers.go
@@ -170,6 +170,7 @@ func PolicyContainersFromConfigs(containerConfigs []securitypolicy.ContainerConf
 			containerConfig.AllowElevated,
 			containerConfig.ExecProcesses,
 			containerConfig.Signals,
+			containerConfig.AllowStdioAccess,
 		)
 		if err != nil {
 			return nil, err

diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego
@@ -157,7 +157,7 @@ valid_envs_for_all(items) := envs {
         some item in items
         envs := valid_envs_subset(item.env_rules)
     ]
-    
+
     # we want to select the most specific matches, which in this
     # case consists of those matches which require dropping the
     # fewest environment variables (i.e. the longest lists)
@@ -201,7 +201,11 @@ container_started {
 
 default create_container := {"allowed": false}
 
-create_container := {"matches": matches, "env_list": env_list, "started": started, "allowed": true} {
+create_container := {"matches": matches,
+                     "env_list": env_list,
+                     "allow_stdio_access": allow_stdio_access,
+                     "started": started,
+                     "allowed": true} {
     not container_started
     # narrow the matches based upon command, working directory, and
     # mount list
@@ -223,6 +227,16 @@ create_container := {"matches": matches, "env_list": env_list, "started": starte
     ]
 
     count(containers) > 0
+
+    # we can't do narrowing based on allowing stdio access so at this point
+    # every container from the policy that might match this create request
+    # must have the same allow stdio value otherwise, we are in an undecidable
+    # state
+    allow_stdio_access := containers[0].allow_stdio_access
+    every c in containers {
+        c.allow_stdio_access == allow_stdio_access
+    }
+
     matches := {
         "action": "update",
         "key": input.containerID,
@@ -303,7 +317,9 @@ mountList_ok(mounts, allow_elevated) {
 
 default exec_in_container := {"allowed": false}
 
-exec_in_container := {"matches": matches, "env_list": env_list, "allowed": true} {
+exec_in_container := {"matches": matches,
+                      "env_list": env_list,
+                      "allowed": true} {
     container_started
     # narrow our matches based upon the process requested
     possible_containers := [container |
@@ -432,7 +448,9 @@ external_process_ok(process) {
 
 default exec_external := {"allowed": false}
 
-exec_external := {"allowed": true, "env_list": env_list} {
+exec_external := {"allowed": true,
+                  "allow_stdio_access": allow_stdio_access,
+                  "env_list": env_list} {
     # we need to assemble a list of all possible external processes which
     # have a matching working directory and command
     policy_processes := [process |
@@ -460,6 +478,11 @@ exec_external := {"allowed": true, "env_list": env_list} {
     ]
 
     count(processes) > 0
+
+    allow_stdio_access := processes[0].allow_stdio_access
+    every p in processes {
+        p.allow_stdio_access == allow_stdio_access
+    }
 }
 
 default get_properties := {"allowed": false}
@@ -715,11 +738,11 @@ env_matches(env) {
 
 errors[envError] {
     input.rule in ["create_container", "exec_in_container", "exec_external"]
-    bad_envs := [env | 
+    bad_envs := [env |
         env := input.envList[_]
         not env_matches(env)]
     count(bad_envs) > 0
-    envError := concat(" ", ["invalid env list:", concat(",", bad_envs)])    
+    envError := concat(" ", ["invalid env list:", concat(",", bad_envs)])
 }
 
 default workingDirectory_matches := false
@@ -748,7 +771,7 @@ mount_matches(mount) {
 
 errors[mountError] {
     input.rule == "create_container"
-    bad_mounts := [mount.destination | 
+    bad_mounts := [mount.destination |
         mount := input.mounts[_]
         not mount_matches(mount)]
     count(bad_mounts) > 0

diff --git a/pkg/securitypolicy/open_door.rego b/pkg/securitypolicy/open_door.rego
@@ -4,11 +4,11 @@ api_svn := "0.10.0"
 
 mount_device := {"allowed": true}
 mount_overlay := {"allowed": true}
-create_container := {"allowed": true}
+create_container := {"allowed": true, "allow_stdio_access": true}
 unmount_device := {"allowed": true}
 unmount_overlay := {"allowed": true}
 exec_in_container := {"allowed": true}
-exec_external := {"allowed": true}
+exec_external := {"allowed": true, "allow_stdio_access": true}
 shutdown_container := {"allowed": true}
 signal_container_process := {"allowed": true}
 plan9_mount := {"allowed": true}

diff --git a/pkg/securitypolicy/opts.go b/pkg/securitypolicy/opts.go
@@ -42,3 +42,11 @@ func WithCommand(cmd []string) ContainerConfigOpt {
 		return nil
 	}
 }
+
+// WithAllowStdioAccess enables or disables container init process stdio.
+func WithAllowStdioAccess(stdio bool) ContainerConfigOpt {
+	return func(c *ContainerConfig) error {
+		c.AllowStdioAccess = stdio
+		return nil
+	}
+}