Add Config Knob to allow Pod to use different VPC subnet

aws · Sep 3, 2018 · 8bb4c71 · 8bb4c71
1 parent 5751423
commit 8bb4c71
Show file tree

Hide file tree

Showing 17 changed files with 924 additions and 35 deletions.
diff --git a/config/v1.2/aws-k8s-cni.yaml b/config/v1.2/aws-k8s-cni.yaml
@@ -0,0 +1,130 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: aws-node
+rules:
+- apiGroups:
+  - crd.k8s.amazonaws.com
+  resources:
+  - "*"
+  - namespaecs
+  verbs:
+  - "*"
+- apiGroups: [""]
+  resources:
+  - pods
+  - nodes
+  - namespaces
+  verbs: ["list", "watch", "get"]
+- apiGroups: ["extensions"]
+  resources:
+  - daemonsets
+  verbs: ["list", "watch"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: aws-node
+  namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: aws-node
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: aws-node
+subjects:
+- kind: ServiceAccount
+  name: aws-node
+  namespace: kube-system
+---
+kind: DaemonSet
+apiVersion: extensions/v1beta1
+metadata:
+  name: aws-node
+  namespace: kube-system
+  labels:
+    k8s-app: aws-node
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      k8s-app: aws-node
+  template:
+    metadata:
+      labels:
+        k8s-app: aws-node
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      serviceAccountName: aws-node
+      hostNetwork: true
+      tolerations:
+      - operator: Exists
+      containers:
+      - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/amazon-k8s-cni:1.2.0
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 60000
+          name: metrics
+        name: aws-node
+        env:
+          - name: AWS_VPC_K8S_CNI_LOGLEVEL
+            value: DEBUG
+          - name: MY_NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+          - name: WATCH_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
+        resources:
+          requests:
+            cpu: 10m
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - mountPath: /host/opt/cni/bin
+          name: cni-bin-dir
+        - mountPath: /host/etc/cni/net.d
+          name: cni-net-dir
+        - mountPath: /host/var/log
+          name: log-dir
+        - mountPath: /var/run/docker.sock
+          name: dockersock
+      volumes:
+      - name: cni-bin-dir
+        hostPath:
+          path: /opt/cni/bin
+      - name: cni-net-dir
+        hostPath:
+          path: /etc/cni/net.d
+      - name: log-dir
+        hostPath:
+          path: /var/log
+      - name: dockersock
+        hostPath:
+          path: /var/run/docker.sock
+---
+apiVersion: apiextensions.k8s.io/v1beta1
+kind: CustomResourceDefinition
+metadata:
+  name: eniconfigs.crd.k8s.amazonaws.com
+spec:
+  scope: Cluster
+  group: crd.k8s.amazonaws.com
+  version: v1alpha1
+  names:
+    scope: Cluster
+    plural: eniconfigs
+    singuar: eniconfig
+    kind: ENIConfig
+
+
diff --git a/ipamd/datastore/data_store.go b/ipamd/datastore/data_store.go
@@ -351,8 +351,12 @@ func (ds *DataStore) getDeletableENI() *ENIIPPool {
 }
 
 // GetENINeedsIP finds out the eni in datastore which failed to get secondary IP address
-func (ds *DataStore) GetENINeedsIP(maxIPperENI int64) *ENIIPPool {
+func (ds *DataStore) GetENINeedsIP(maxIPperENI int64, skipPrimary bool) *ENIIPPool {
 	for _, eni := range ds.eniIPPools {
+		if skipPrimary && eni.IsPrimary {
+			log.Debugf("Skip the primary ENI for need IP check")
+			continue
+		}
 		if int64(len(eni.IPv4Addresses)) < maxIPperENI {
 			log.Debugf("Found eni %s that have less IP address allocated: cur=%d, max=%d",
 				eni.ID, len(eni.IPv4Addresses), maxIPperENI)

diff --git a/ipamd/introspect.go b/ipamd/introspect.go
@@ -69,9 +69,11 @@ func (c *IPAMContext) SetupHTTP() {
 
 func (c *IPAMContext) setupServer() *http.Server {
 	serverFunctions := map[string]func(w http.ResponseWriter, r *http.Request){
-		"/v1/enis":         eniV1RequestHandler(c),
-		"/v1/pods":         podV1RequestHandler(c),
-		"/v1/env-settings": envV1RequestHandler(c),
+		"/v1/enis":                      eniV1RequestHandler(c),
+		"/v1/pods":                      podV1RequestHandler(c),
+		"/v1/networkutils-env-settings": networkEnvV1RequestHandler(c),
+		"/v1/ipamd-env-settings":        ipamdEnvV1RequestHandler(c),
+		"/v1/eni-configs":               eniConfigRequestHandler(c),
 	}
 	paths := make([]string, 0, len(serverFunctions))
 	for path := range serverFunctions {
@@ -134,7 +136,19 @@ func podV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Requ
 	}
 }
 
-func envV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
+func eniConfigRequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
+	return func(w http.ResponseWriter, r *http.Request) {
+		responseJSON, err := json.Marshal(ipam.eniConfig.Getter())
+		if err != nil {
+			log.Error("Failed to marshal pod data: %v", err)
+			http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
+			return
+		}
+		w.Write(responseJSON)
+	}
+}
+
+func networkEnvV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
 	return func(w http.ResponseWriter, r *http.Request) {
 		responseJSON, err := json.Marshal(networkutils.GetConfigForDebug())
 		if err != nil {
@@ -146,6 +160,18 @@ func envV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Requ
 	}
 }
 
+func ipamdEnvV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
+	return func(w http.ResponseWriter, r *http.Request) {
+		responseJSON, err := json.Marshal(GetConfigForDebug())
+		if err != nil {
+			log.Error("Failed to marshal env var data: %v", err)
+			http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
+			return
+		}
+		w.Write(responseJSON)
+	}
+}
+
 func metricsHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
 	return func(w http.ResponseWriter, r *http.Request) {
 		promhttp.Handler()

diff --git a/ipamd/ipamd.go b/ipamd/ipamd.go
@@ -14,6 +14,7 @@
 package ipamd
 
 import (
+	"fmt"
 	"net"
 	"os"
 	"strconv"
@@ -31,6 +32,7 @@ import (
 	"github.com/aws/amazon-vpc-cni-k8s/ipamd/datastore"
 	"github.com/aws/amazon-vpc-cni-k8s/pkg/awsutils"
 	"github.com/aws/amazon-vpc-cni-k8s/pkg/docker"
+	"github.com/aws/amazon-vpc-cni-k8s/pkg/eniconfig"
 	"github.com/aws/amazon-vpc-cni-k8s/pkg/k8sapi"
 	"github.com/aws/amazon-vpc-cni-k8s/pkg/networkutils"
 )
@@ -43,11 +45,40 @@ const (
 	ipPoolMonitorInterval       = 5 * time.Second
 	maxRetryCheckENI            = 5
 	eniAttachTime               = 10 * time.Second
-	defaultWarmENITarget        = 1
 	nodeIPPoolReconcileInterval = 60 * time.Second
 	maxK8SRetries               = 12
 	retryK8SInterval            = 5 * time.Second
 	noWarmIPTarget              = 0
+
+	// This environment is used to specify the desired number of free IPs always available in "warm pool"
+	// When it is not set, ipamD defaut to use the number IPs per ENI for that instance.
+	// For example, for a m4.4xlarge node,
+	//     if WARM-IP-TARGET is set to 1, and there are 9 pods running on the node, ipamD will try
+	//     to make "warm pool" to have 10 IP address with 9 being assigned to Pod and 1 free IP.
+	//
+	//     if "WARM-IP-TARGET is not set, it will be defaulted to 30 (which the number of IPs per ENI). If there are 9 pods
+	//     running on the node, ipamD will try to make "warm pool" to have 39 IPs with 9 being assigned to Pod and 30 free IPs.
+	envWarmIPTarget = "WARM_IP_TARGET"
+
+	// This environment is used to specify the desired number of free ENIs along with all of its IP addresses
+	// always available in "warm pool".
+	// When it is not set, it is default to 1.
+	//
+	// when "WARM-IP-TARGET" is defined, ipamD will use behavior defined for "WARM-IP-TARGET".
+	//
+	// For example, for a m4.4xlarget node
+	//     if WARM_ENI_TARGET is set to 2, and there are 9 pods running on the node, ipamD will try to
+	//     make "warm  pool" to have 2 extra ENIs and its IP addresses,  in another word, 90 IP addresses with 9 IPs assigne to Pod
+	//     and 81 free IPs.
+	//
+	//     if "WARM_ENI_TARGET" is not set, it is default to 1,  if there 9 pods running on the node, ipamD will try to
+	//     make "warm pool" to have 1 extra ENI, in aother word 60 IPs with 9 being assigned to Pod and 51 free IPs.
+	envWarmENITarget     = "WARM_ENI_TARGET"
+	defaultWarmENITarget = 1
+
+	// This environment is used to specify whether Pods need to use securitygroup and subnet defined in ENIConfig CRD
+	// When it is NOT set or set to false, ipamD will use primary interface security group and subnet for Pod network.
+	envCustomNetworkCfg = "AWS_VPC_K8S_CNI_CUSTOM_NETWORK_CFG"
 )
 
 var (
@@ -105,6 +136,7 @@ type IPAMContext struct {
 	awsClient     awsutils.APIs
 	dataStore     *datastore.DataStore
 	k8sClient     k8sapi.K8SAPIs
+	eniConfig     eniconfig.ENIConfig
 	dockerClient  docker.APIs
 	networkClient networkutils.NetworkAPIs
 
@@ -133,13 +165,14 @@ func prometheusRegister() {
 
 // New retrieves IP address usage information from Instance MetaData service and Kubelet
 // then initializes IP address pool data store
-func New(k8sapiClient k8sapi.K8SAPIs) (*IPAMContext, error) {
+func New(k8sapiClient k8sapi.K8SAPIs, eniConfig *eniconfig.ENIConfigController) (*IPAMContext, error) {
 	prometheusRegister()
 	c := &IPAMContext{}
 
 	c.k8sClient = k8sapiClient
 	c.networkClient = networkutils.New()
 	c.dockerClient = docker.New()
+	c.eniConfig = eniConfig
 
 	client, err := awsutils.New()
 	if err != nil {
@@ -314,7 +347,7 @@ func (c *IPAMContext) retryAllocENIIP() {
 		log.Infof("Failed to retrieve ENI IP limit: %v", err)
 		return
 	}
-	eni := c.dataStore.GetENINeedsIP(maxIPLimit)
+	eni := c.dataStore.GetENINeedsIP(maxIPLimit, useCustomNetworkCfg())
 	if eni != nil {
 		log.Debugf("Attempt again to allocate IP address for eni :%s", eni.ID)
 		var err error
@@ -392,7 +425,25 @@ func (c *IPAMContext) increaseIPPool() {
 		log.Debugf("Skipping increase IPPOOL due to max ENI already attached to the instance : %d", c.maxENI)
 		return
 	}
-	eni, err := c.awsClient.AllocENI()
+
+	var securityGroups []string
+	var subnet string
+	customNetworkCfg := useCustomNetworkCfg()
+
+	if customNetworkCfg {
+		eniCfg, err := c.eniConfig.MyENIConfig()
+
+		if err != nil {
+			log.Errorf("Failed to get pod ENI config")
+			return
+		}
+
+		log.Infof("ipamd: using custom network config: %v, %s", eniCfg.SecurityGroups, eniCfg.Subnet)
+		securityGroups = eniCfg.SecurityGroups
+		subnet = eniCfg.Subnet
+	}
+
+	eni, err := c.awsClient.AllocENI(customNetworkCfg, securityGroups, subnet)
 	if err != nil {
 		log.Errorf("Failed to increase pool size due to not able to allocate ENI %v", err)
 
@@ -546,7 +597,7 @@ func (c *IPAMContext) waitENIAttached(eni string) (awsutils.ENIMetadata, error)
 }
 
 func getWarmENITarget() int {
-	inputStr, found := os.LookupEnv("WARM_ENI_TARGET")
+	inputStr, found := os.LookupEnv(envWarmENITarget)
 
 	if !found {
 		return defaultWarmENITarget
@@ -722,8 +773,21 @@ func (c *IPAMContext) eniIPPoolReconcile(ipPool map[string]*datastore.AddressInf
 
 }
 
+func useCustomNetworkCfg() bool {
+	defaultValue := false
+	if strValue := os.Getenv(envCustomNetworkCfg); strValue != "" {
+		parsedValue, err := strconv.ParseBool(strValue)
+		if err != nil {
+			log.Error("Failed to parse "+envCustomNetworkCfg+"; using default: "+fmt.Sprint(defaultValue), err.Error())
+			return defaultValue
+		}
+		return parsedValue
+	}
+	return defaultValue
+}
+
 func getWarmIPTarget() int {
-	inputStr, found := os.LookupEnv("WARM_IP_TARGET")
+	inputStr, found := os.LookupEnv(envWarmIPTarget)
 
 	if !found {
 		return noWarmIPTarget
@@ -753,3 +817,12 @@ func (c *IPAMContext) getCurWarmIPTarget() (int, bool) {
 
 	return curTarget, true
 }
+
+// GetConfigForDebug returns the active values of the configuration env vars (for debugging purposes).
+func GetConfigForDebug() map[string]interface{} {
+	return map[string]interface{}{
+		envWarmIPTarget:     getWarmIPTarget(),
+		envWarmENITarget:    getWarmENITarget(),
+		envCustomNetworkCfg: useCustomNetworkCfg(),
+	}
+}