diff --git a/Makefile b/Makefile index f8c503349e..1f90fc1d50 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ bcs-component:kube-sche apiserver-proxy \ bcs-network:ingress-controller bcs-services:bkcmdb-synchronizer gateway \ - storage user-manager cluster-manager cluster-reporter tools k8s-watch kube-agent data-manager \ + storage user-manager cluster-manager cluster-reporter nodeagent tools k8s-watch kube-agent data-manager \ helm-manager project-manager nodegroup-manager bcs-scenarios: kourse gitops @@ -258,7 +258,12 @@ cluster-manager:pre cluster-reporter: mkdir -p ${PACKAGEPATH}/bcs-services/bcs-cluster-reporter cp -R ${BCS_CONF_SERVICES_PATH}/bcs-cluster-reporter/* ${PACKAGEPATH}/bcs-services/bcs-cluster-reporter/ - cd ${BCS_SERVICES_PATH}/bcs-cluster-reporter && go mod tidy && go build ${LDFLAG} -o ${WORKSPACE}/${PACKAGEPATH}/bcs-services/bcs-cluster-reporter/bcs-cluster-reporter ./main.go + cd ${BCS_SERVICES_PATH}/bcs-cluster-reporter/cmd/reporter && go mod tidy && go build ${LDFLAG} -o ${WORKSPACE}/${PACKAGEPATH}/bcs-services/bcs-cluster-reporter/bcs-cluster-reporter ./main.go + +nodeagent: + mkdir -p ${PACKAGEPATH}/bcs-services/bcs-nodeagent + cp -R ${BCS_CONF_SERVICES_PATH}/bcs-nodeagent/* ${PACKAGEPATH}/bcs-services/bcs-nodeagent/ + cd ${BCS_SERVICES_PATH}/bcs-cluster-reporter/cmd/nodeagent && go mod tidy && go build ${LDFLAG} -o ${WORKSPACE}/${PACKAGEPATH}/bcs-services/bcs-nodeagent/bcs-nodeagent ./main.go project-manager:pre mkdir -p ${PACKAGEPATH}/bcs-services/bcs-project-manager/swagger @@ -364,4 +369,4 @@ gamestatefulset: make gamestatefulset -f bcs-scenarios/kourse/Makefile hook-operator: - make hook-operator -f bcs-scenarios/kourse/Makefile + make hook-operator -f bcs-scenarios/kourse/Makefile \ No newline at end of file diff --git a/bcs-services/bcs-client/.golangci.yml b/bcs-services/bcs-client/.golangci.yml index b30e2625fd..9008229a75 100644 --- a/bcs-services/bcs-client/.golangci.yml +++ b/bcs-services/bcs-client/.golangci.yml @@ 
-1,44 +1,57 @@ -# Code generated by scripts/gen-lint. DO NOT EDIT. - run: timeout: 10m - + skip-dirs: + - bcs-services/bcs-upgrader + - bcs-services/bcs-service-prometheus + - bcs-network + - bcs-runtime/bcs-mesos + - bcs-runtime/bcs-k8s/bcs-component/bcs-cc-agent + - bcs-runtime/bcs-k8s/bcs-component/bcs-cpuset-device + - .*/third_party/* + - api/* + - internal/cloudprovider/component/* + - internal/cloudprovider/qcloud/api/common_* + skip-files: + - .*\.docs\.go$ + - .*\.gen\.go$ + - .*\.pb\.go$ + - .*\.pb.gw\.go$ + - .*\.pb.micro\.go$ + - .*\.pb.validate\.go$ + - .*\_test\.go$ + - internal/cloudprovider/qcloud/tasks/createNodeGroup.go + - internal/cloudprovider/qcloud/tasks/createNodeGroup.go + - internal/cloudprovider/qcloud/tasks/createClusterTask.go + - internal/remote/resource/tresource/resource.go issues: - # 显示所有 issue max-issues-per-linter: 0 max-same-issues: 0 exclude-use-default: false - linters: disable-all: true enable: - # enable by default - - errcheck - - gosimple - - govet - - ineffassign +# - errcheck +# - gosimple +# - govet +# - ineffassign - staticcheck - - unused - - # custom - - funlen - - gci - - goconst - - gocritic - - gocyclo - - gofmt - - goheader - - goimports - - gosec - - lll - - misspell - - nakedret - - revive - - unconvert - - unparam - +# - unused +# - funlen +# - gci +# - goconst +# - gocritic +# - gocyclo +# - gofmt +# - goheader +# - goimports +# - gosec +# - lll +# - misspell +# - nakedret +# - revive +# - unconvert +# - unparam linters-settings: - # 只开启特定的规则 errcheck: exclude-functions: - (*os.File).Close @@ -48,39 +61,39 @@ linters-settings: - io.Copy - os.RemoveAll lll: - line-length: 120 # widely used and popular community recommended length + line-length: 120 funlen: - lines: 90 # 函数长度 default * 1.5 - statements: -1 # 不限制语句数量, 通过圈复杂度处理 + lines: 90 + statements: -1 gocyclo: - min-complexity: 30 # 函数圈复杂度 + min-complexity: 30 govet: check-shadowing: true goimports: - local-prefixes: 
github.com/Tencent/bk-bcs/bcs-services/bcs-client + local-prefixes: github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager gci: sections: - standard - default - - prefix(github.com/Tencent/bk-bcs/bcs-services/bcs-client) + - prefix(github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager) gocritic: settings: ifElseChain: minThreshold: 3 gosec: includes: - - G201 # SQL query construction using format string - - G202 # SQL query construction using string concatenation - - G101 # Look for hard coded credentials - - G401 # Detect the usage of DES, RC4, MD5 or SHA1 - - G402 # Look for bad TLS connection settings - - G403 # Ensure minimum RSA key length of 2048 bits - - G404 # Insecure random number source (rand) - - G504 # Import blocklist: net/http/cgi + - G201 + - G202 + - G101 + - G401 + - G402 + - G403 + - G404 + - G504 goheader: values: regexp: - YEAR: 20\d\d # 头部时间变量, 如: 2019等, 新增的为当前年份即可 + YEAR: 20\d\d template: |- * Tencent is pleased to support the open source community by making Blueking Container Service available. * Copyright (C) {{ YEAR }} THL A29 Limited, a Tencent company. All rights reserved. @@ -112,6 +125,4 @@ linters-settings: - name: range - name: receiver-naming - name: time-naming - # - name: unexported-return - name: var-declaration - # - name: var-naming diff --git a/bcs-services/bcs-cluster-reporter/Dockerfile b/bcs-services/bcs-cluster-reporter/Dockerfile index c6ab4e4a16..740a14221e 100644 --- a/bcs-services/bcs-cluster-reporter/Dockerfile +++ b/bcs-services/bcs-cluster-reporter/Dockerfile @@ -6,5 +6,7 @@ RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime RUN echo "Asia/Shanghai" > /etc/timezone WORKDIR / COPY ./$SRV_NAME . +COPY ./TencentSans-W7.ttf . +COPY ./TencentSans-W3.ttf . 
#COPY ./conf ./conf ENTRYPOINT ["/bcs-cluster-reporter"] diff --git a/bcs-services/bcs-cluster-reporter/Makefile b/bcs-services/bcs-cluster-reporter/Makefile deleted file mode 100644 index 9203e8f6b8..0000000000 --- a/bcs-services/bcs-cluster-reporter/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -SRV_NAME = bcs-cluster-reporter -VER = v0.0.8 -CURRENT_VERSION = $(VER) -NAMESPACE = build/blueking -DH_URL = mirrors.tencent.com - -LDFLAG=-ldflags "-X bcs-services/bcs-common/common/static.EncryptionKey=${bcs_encryption_key} \ --X github.com/Tencent/bk-bcs/bcs-common/common/static.EncryptionKey=${bcs_encryption_key}" - -clean: - -rm ./$(SRV_NAME) - -build:clean - CGO_ENABLED=1 go build -o $(SRV_NAME) ${LDFLAG} -v ./main.go - -publish:build - docker build --build-arg SRV_NAME=$(SRV_NAME) --rm -t $(SRV_NAME):$(CURRENT_VERSION) . - docker tag $(SRV_NAME):$(CURRENT_VERSION) $(DH_URL)/${NAMESPACE}/$(SRV_NAME):$(CURRENT_VERSION) - docker push $(DH_URL)/${NAMESPACE}/$(SRV_NAME):$(CURRENT_VERSION) \ No newline at end of file diff --git a/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf b/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf new file mode 100644 index 0000000000..1e12465997 Binary files /dev/null and b/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf differ diff --git a/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf b/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf new file mode 100644 index 0000000000..386dd20f5c Binary files /dev/null and b/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf differ diff --git a/bcs-services/bcs-cluster-reporter/cmd/nodeagent/main.go b/bcs-services/bcs-cluster-reporter/cmd/nodeagent/main.go new file mode 100644 index 0000000000..5259ed0698 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/cmd/nodeagent/main.go @@ -0,0 +1,225 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package main xxx +package main + +import ( + "encoding/json" + "flag" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + _ "net/http/pprof" + "os" + "os/signal" + "path/filepath" + "runtime/debug" + "syscall" + "time" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/cmd/options" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/gin-contrib/pprof" + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/spf13/cobra" + "github.com/spf13/viper" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + "k8s.io/klog" + + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/nodeinfocheck" + _ 
"github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader" +) + +var ( + cmdOptions = options.NewNodeAgentOptions() + + rootCmd = &cobra.Command{ + Use: "bcs-nodeagent", + Short: "bcs-nodeagent", + Long: ` +Basic Commands (Beginner): + bcs-cluster-reporter +`, + Run: func(cmd *cobra.Command, args []string) { + CheckErr(Complete(cmd, args)) + + err := Run() + if err != nil { + + klog.Fatalf("bcs-cluster-reporter failed: %s", err.Error()) + } + }, + } +) + +func initConfig() {} + +func init() { + flags := rootCmd.PersistentFlags() + cmdOptions.AddFlags(flags) + + cobra.OnInitialize(initConfig) + + // init klog flag + fs := flag.NewFlagSet("", flag.PanicOnError) + klog.InitFlags(fs) + rootCmd.PersistentFlags().AddGoFlagSet(fs) + + err := viper.BindPFlags(flags) + if err != nil { + klog.Fatalf("Viper bindPFlags failed: %s", err.Error()) + } + +} + +// Run main process +func Run() error { + config, err := rest.InClusterConfig() + if err != nil { + if cmdOptions.KubeConfigPath != "" { + config, err = k8s.GetRestConfigByConfig(cmdOptions.KubeConfigPath) + if err != nil { + klog.Fatalf("Error: %s", err.Error()) + } + } + } + + clientSet, err := k8s.GetClientsetByConfig(config) + if err != nil { + klog.Fatalf("Error: %s", err.Error()) + } + + nodeName := util.GetNodeName() + node, err := clientSet.CoreV1().Nodes().Get(util.GetCtx(10*time.Second), nodeName, v1.GetOptions{ResourceVersion: "0"}) + if err != nil { + klog.Fatalf("Error: %s", err.Error()) + } + + hostPath := cmdOptions.HostPath + if hostPath == "/" { + hostPath = util.GetHostPath() + } + pluginmanager.Pm.SetConfig(&pluginmanager.Config{ + NodeConfig: pluginmanager.NodeConfig{ + Config: config, + ClientSet: clientSet, + NodeName: nodeName, + Node: node, + HostPath: 
hostPath, + }, + }) + + // 读取配置文件 + go func() { + err := pluginmanager.Pm.SetupPlugin(cmdOptions.Plugins, cmdOptions.ConfigPath, cmdOptions.RunMode) + if err != nil { + klog.Fatalf(err.Error()) + } + }() + + // listening OS shutdown singal + if cmdOptions.RunMode == pluginmanager.RunModeDaemon { + r := gin.Default() + pprof.Register(r) + + r.GET("/metrics", gin.WrapH(promhttp.Handler())) + go func() { + if err := r.Run(cmdOptions.Addr); err != nil { + klog.Fatalf(err.Error()) + } + }() + + signalChan := make(chan os.Signal, 1) + signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) + <-signalChan + } else { + pluginmanager.Pm.Ready(cmdOptions.Plugins, "node") + result := pluginmanager.Pm.GetNodeResult(cmdOptions.Plugins) + // TODO 支持json格式输出 + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + for _, item := range result { + checkItemList = append(checkItemList, item.Items...) + } + infoItemList := make([]pluginmanager.InfoItem, 0, 0) + for _, item := range result { + infoItemList = append(infoItemList, item.InfoItemList...) 
+ } + + data, _ := json.Marshal(checkItemList) + fmt.Println(string(data)) + data, _ = json.Marshal(infoItemList) + fmt.Println(string(data)) + } + + return nil +} + +// Execute rootCmd +func Execute() { + defer func() { + if r := recover(); r != nil { + klog.Fatalf("nodeagent failed: %s, stack: %v\n", r, string(debug.Stack())) + } + }() + + err := rootCmd.Execute() + if err != nil { + klog.Fatalf(err.Error()) + } +} + +// CheckErr deal with Complete error +func CheckErr(err error) { + if err != nil { + klog.Fatalf(err.Error()) + } +} + +// Complete xxx +func Complete(cmd *cobra.Command, args []string) error { + // 如果配置文件不存在则写入默认值 + _, err := os.Stat(cmdOptions.ConfigPath) + if err != nil { + if os.IsNotExist(err) { + err = os.MkdirAll(filepath.Dir(cmdOptions.ConfigPath), os.ModePerm) + if err != nil { + return err + } + + err = util.WriteConfigIfNotExist(filepath.Join(cmdOptions.ConfigPath, "config"), `interval: 86400 +pluginDir: /data/bcs/nodeagent`) + if err != nil { + return err + } + + } else { + return err + } + } + + return nil +} + +func main() { + Execute() + defer klog.Flush() +} diff --git a/bcs-services/bcs-cluster-reporter/cmd/options/options.go b/bcs-services/bcs-cluster-reporter/cmd/options/options.go index fcf386aace..0027e25313 100644 --- a/bcs-services/bcs-cluster-reporter/cmd/options/options.go +++ b/bcs-services/bcs-cluster-reporter/cmd/options/options.go @@ -42,6 +42,7 @@ type BcsClusterReporterOptions struct { InCluster bool ClusterID string BizID string + RunMode string } // NewBcsClusterReporterOptions init options @@ -68,10 +69,43 @@ func (bcro *BcsClusterReporterOptions) AddFlags(fs *pflag.FlagSet) { fs.StringSliceVar(&bcro.BcsClusterList, "bcsClusterList", []string{}, "Set the clusters id to check and report") fs.StringVarP(&bcro.KubeConfigDir, "kubeConfigDir", "", "", - "Set the kubeconfig path to load, kubeconfig file should have suffix of config") + "Set the kubeconfig path to load kubeconfig files, and the kubeconfig files’ name 
should end with \"config\"") // incluster选项 fs.BoolVarP(&bcro.InCluster, "inCluster", "", false, "Set true the reporter will work as in-cluster mode") fs.StringVarP(&bcro.ClusterID, "clusterID", "", "0", "Set clusterID") fs.StringVarP(&bcro.BizID, "bizID", "", "incluster", "Set cluster bizID") + fs.StringVar(&bcro.RunMode, "runMode", "daemon", "daemon, once") +} + +// NodeAgentOptions component options +type NodeAgentOptions struct { + HostPath string + Upstream string + ConfigPath string + Plugins string + RunMode string + PluginDir string + CMNamespace string + Addr string + KubeConfigPath string +} + +// NewNodeAgentOptions return NodeAgentOptions +func NewNodeAgentOptions() *NodeAgentOptions { + return &NodeAgentOptions{} +} + +// AddFlags xxx +func (brro *NodeAgentOptions) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&brro.Plugins, "plugins", "dnscheck,containercheck,hwcheck,processcheck,diskcheck,netcheck,timecheck,nodeinfocheck,uploader", "plugins") + fs.StringVar(&brro.KubeConfigPath, "kubeconfigPath", "/root/.kube/config", "if incluster failed, use this kubeconfig path") + + fs.StringVar(&brro.HostPath, "hostPath", "/", "set here or set HOST_PATH env") + fs.StringVar(&brro.Upstream, "upstream", "cluster", "cluster, mysql") + fs.StringVar(&brro.RunMode, "runMode", "once", "daemon, once") + fs.StringVar(&brro.PluginDir, "pluginDir", "/data/bcs/nodeagent", "/data/bcs/nodeagent") + fs.StringVar(&brro.ConfigPath, "configPath", "/data/bcs/nodeagent/", "/data/bcs/nodeagent/") + fs.StringVar(&brro.CMNamespace, "cmNamespace", "nodeagent", "namespace to store nodeagent checkresult configmap") + fs.StringVar(&brro.Addr, "addr", "0.0.0.0:6216", "addr to bind listen") } diff --git a/bcs-services/bcs-cluster-reporter/cmd/reporter/main.go b/bcs-services/bcs-cluster-reporter/cmd/reporter/main.go new file mode 100644 index 0000000000..2935f29805 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/cmd/reporter/main.go @@ -0,0 +1,555 @@ +/* + * Tencent is pleased to 
support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package main xxx +package main + +import ( + "context" + "flag" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin" + v12 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + metricsclientset "k8s.io/metrics/pkg/client/clientset/versioned" + "net/http" + "os" + "path/filepath" + "strings" + "sync" + "time" + + cmproto "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager/api/clustermanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/cmd/options" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/api/bcs" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/gin-contrib/pprof" + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/spf13/cobra" + "github.com/spf13/viper" + "gopkg.in/yaml.v2" + "k8s.io/client-go/rest" + "k8s.io/klog" + + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck" + _ 
"github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck" + _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck" +) + +var ( + bcro = options.NewBcsClusterReporterOptions() + + rootCmd = &cobra.Command{ + Use: "bcs-cluster-reporter", + Short: "bcs-cluster-reporter", + Long: ` +Basic Commands (Beginner): + bcs-cluster-reporter +`, + Run: func(cmd *cobra.Command, args []string) { + if bcro.BcsGatewayToken == "" { + bcro.BcsGatewayToken = os.Getenv("gatewayToken") + } + + if bcro.BcsClusterManagerToken == "" { + bcro.BcsClusterManagerToken = os.Getenv("gatewayToken") + } + + CheckErr(Complete(cmd, args)) + + err := Run() + if err != nil { + klog.Fatalf("bcs-cluster-reporter failed: %s", err.Error()) + } + }, + } +) + +// Run main process +func Run() error { + run(context.Background()) + + return nil +} + +func run(ctx context.Context) { + r := gin.Default() + pprof.Register(r) + go func() { + if err := r.Run(":6216"); err != nil { + klog.Fatalf(err.Error()) + } + }() + + getClusters() + + go func() { + select { + case <-ctx.Done(): + break + default: + for { + time.Sleep(time.Minute * 30) + getClusters() + } + } + }() + + // start plugins + err := pluginmanager.Pm.SetupPlugin(bcro.Plugins, bcro.PluginConfDir, bcro.RunMode) + if err != nil { + klog.Fatalf("Setup plugin failed: %s", err.Error()) + } + + klog.Info("Setup plugins success") + + // start webserver + if bcro.RunMode == pluginmanager.RunModeDaemon { + r.GET("cluster/:clusterID/pdf", func(c *gin.Context) { + clusterID := c.Param("clusterID") + pdf, reportErr := pluginmanager.GetClusterReport(clusterID, bcro.Plugins) + if reportErr != nil { + c.String(404, fmt.Sprintf("cluster %s not found", clusterID)) + return + } + err = pdf.Output(c.Writer) + if err != nil { + c.String(http.StatusInternalServerError, "Failed to generate PDF") + 
klog.Errorf(err.Error()) + return + } + c.Header("Content-Type", "application/pdf") + c.Header("Content-Disposition", "attachment; filename=output.pdf") + return + // 将PDF内容写入HTTP响应 + }) + + r.GET("biz/:bizID/pdf", func(c *gin.Context) { + bizID := c.Param("bizID") + + pdf, reportErr := pluginmanager.GetBizReport(bizID, bcro.Plugins) + if reportErr != nil { + c.String(404, fmt.Sprintf("biz %s not found", bizID)) + return + } + + err = pdf.Output(c.Writer) + if err != nil { + c.String(http.StatusInternalServerError, "Failed to generate PDF") + klog.Errorf(err.Error()) + return + } + c.Header("Content-Type", "application/pdf") + c.Header("Content-Disposition", "attachment; filename=output.pdf") + return + }) + + r.GET("cluster/:clusterID/html", func(c *gin.Context) { + clusterID := c.Param("clusterID") + + html, htmlErr := pluginmanager.GetClusterReportHtml(clusterID, bcro.Plugins) + if htmlErr != nil { + c.String(404, fmt.Sprintf("cluster %s not found", clusterID)) + return + } + + c.String(200, html) + return + }) + + r.GET("biz/:bizID/html", func(c *gin.Context) { + bizID := c.Param("bizID") + + html, htmlErr := pluginmanager.GetBizReportHtml(bizID, bcro.Plugins) + if htmlErr != nil { + c.String(404, fmt.Sprintf("biz %s not found", bizID)) + return + } + + c.String(200, html) + return + }) + + r.GET("/metrics", gin.WrapH(promhttp.Handler())) + + // config mm + metricmanager.MM.SetEngine(r) + } else if bcro.RunMode == pluginmanager.RunModeOnce { + for _, clusterConfig := range pluginmanager.Pm.GetConfig().ClusterConfigs { + result := pluginmanager.Pm.GetClusterResult(bcro.Plugins, clusterConfig.ClusterID) + data, _ := yaml.Marshal(result) + fmt.Println(string(data)) + } + return + } + + <-ctx.Done() + // 停止模块的运行 + klog.Infof("start to stop plugins") + err = pluginmanager.Pm.StopPlugin(bcro.Plugins) + if err != nil { + klog.Fatalf("Setup plugin failed: %s", err.Error()) + } + klog.Infof("done stop plugins") +} + +// Execute rootCmd +func Execute() { + err := 
rootCmd.Execute() + if err != nil { + klog.Fatalf(err.Error()) + } +} + +// CheckErr check err +func CheckErr(err error) { + if err != nil { + klog.Fatalf(err.Error()) + } +} + +// Complete check for cmd args +func Complete(cmd *cobra.Command, args []string) error { + if (bcro.BcsClusterManagerToken != "" || bcro.BcsClusterManagerApiserver != "" || bcro.BcsGatewayApiserver != "" || + bcro.BcsGatewayToken != "") && (bcro.BcsClusterManagerToken == "" || bcro.BcsClusterManagerApiserver == "" || bcro.BcsGatewayApiserver == "" || + bcro.BcsGatewayToken == "") { + return fmt.Errorf( + "bcs config missing, BcsClusterManagerToken, BcsClusterManagerApiserver, BcsGatewayApiserver, BcsGatewayToken must be set") + + } + + if (bcro.BcsGatewayApiserver != "" || bcro.BcsClusterManagerApiserver != "" || bcro.BcsGatewayToken != "" || + bcro.BcsClusterManagerToken != "") && bcro.InCluster { + return fmt.Errorf("when run in in-cluster mode, no need to set bcs params") + } + + if bcro.KubeConfigDir != "" && bcro.InCluster { + return fmt.Errorf("when run in in-cluster mode, no need to set kubeConfigDir") + } + + if bcro.InCluster && (bcro.ClusterID == "" || bcro.BizID == "") { + return fmt.Errorf("when run in in-cluster mode, need to set clusterID and bizID") + } + return nil +} + +func init() { + flags := rootCmd.PersistentFlags() + bcro.AddFlags(flags) + + cobra.OnInitialize(initConfig) + + // init klog flag + fs := flag.NewFlagSet("", flag.PanicOnError) + klog.InitFlags(fs) + rootCmd.PersistentFlags().AddGoFlagSet(fs) + + err := viper.BindPFlags(flags) + if err != nil { + klog.Fatalf("Viper bindPFlags failed: %s", err.Error()) + } + +} + +// initConfig configure viper to read config +func initConfig() {} + +func getClusters() { + clusterConfigList := make(map[string]*pluginmanager.ClusterConfig) + if pluginmanager.Pm.GetConfig() != nil { + clusterConfigList = pluginmanager.Pm.GetConfig().ClusterConfigs + } + + // 从bcs获取BCS集群配置 + if bcro.BcsGatewayApiserver != "" && 
bcro.BcsClusterManagerApiserver != "" && bcro.BcsGatewayToken != "" && + bcro.BcsClusterManagerToken != "" { + bcsClusterConfigList, err := GetClusterConfigFromBCS(bcro.BcsClusterManagerToken, + bcro.BcsClusterManagerApiserver, bcro.BcsGatewayApiserver, bcro.BcsGatewayToken, clusterConfigList) + if err != nil { + klog.Fatalf(err.Error()) + } + clusterConfigList = bcsClusterConfigList + } + + // 从文件夹获取kubeconfig的配置 + if bcro.KubeConfigDir != "" { + klog.Infof(bcro.KubeConfigDir) + fileClusterConfigList, err := GetClusterInfo(bcro.KubeConfigDir, clusterConfigList) + if err != nil { + klog.Fatalf(err.Error()) + } + for key, value := range fileClusterConfigList { + clusterConfigList[key] = value + } + } + + // Incluster模式 + if bcro.InCluster { + config, err := rest.InClusterConfig() + if err != nil { + klog.Fatalf("Error: %s", err.Error()) + return + } + clusterConfigList[bcro.ClusterID] = &pluginmanager.ClusterConfig{BusinessID: bcro.BizID, + ClusterID: bcro.ClusterID, Config: config} + pluginmanager.Pm.SetConfig(&pluginmanager.Config{ + ClusterConfigs: clusterConfigList, + InClusterConfig: pluginmanager.ClusterConfig{BusinessID: bcro.BizID, ClusterID: bcro.ClusterID, Config: config}, + }) + } else { + // 集中化模式 + pluginmanager.Pm.SetConfig(&pluginmanager.Config{ + ClusterConfigs: clusterConfigList, + }) + } +} + +// GetClusterConfigFromBCS get clusterconfig from bcs api +func GetClusterConfigFromBCS(bcsClusterManagerToken, bcsClusterManagerApiserver, bcsGatewayApiserver, bcsGatewayToken string, existClusterConfigList map[string]*pluginmanager.ClusterConfig) (map[string]*pluginmanager.ClusterConfig, error) { + clusterConfigList := make(map[string]*pluginmanager.ClusterConfig) + bcsClusterManager, err := bcs.NewClusterManager(bcsClusterManagerToken, bcsClusterManagerApiserver, + bcsGatewayApiserver, bcsGatewayToken) + if err != nil { + klog.Fatalf("NewClusterManager failed: %s", err.Error()) + return nil, err + } + + clusterList, err := 
bcsClusterManager.GetClusters([]string{}) + if err != nil { + klog.Fatalf("GetClusters failed: %s", err.Error()) + return nil, err + } + + filteredClusterList := make([]cmproto.Cluster, 0, 0) + if len(bcro.BcsClusterList) != 0 { + for _, clusterId := range bcro.BcsClusterList { + for _, cluster := range clusterList { + if clusterId == cluster.ClusterID { + filteredClusterList = append(filteredClusterList, cluster) + break + } + } + } + } else { + clusterloop: + for _, cluster := range clusterList { + if cluster.IsShared == true || cluster.Status != "RUNNING" || cluster.EngineType != "k8s" || (cluster.Environment == bcro.BcsClusterType && bcro.BcsClusterType != "") { + continue // 跳过公共集群的记录 跳过未就绪集群 跳过非K8S集群 以及匹配对应参数的集群 + } else { + // 跳过master ip不正常的集群 + if len(cluster.Master) > 0 { + for masterName, _ := range cluster.Master { + if strings.Contains(masterName, "127.0.0") { + // 跳过算力集群 + continue clusterloop + } + } + } + + if cluster.CreateTime != "" { + createTime, err := time.Parse(time.RFC3339, cluster.CreateTime) + if err != nil { + klog.Errorf("parse cluster %s createtime failed %s", cluster.ClusterID, err.Error()) + continue + } + // 创建时间超过10分钟才进行巡检 + if (time.Now().Unix() - createTime.Unix()) > 60*30 { + filteredClusterList = append(filteredClusterList, cluster) + } + } + } + } + } + + var mapLock sync.Mutex + var wg sync.WaitGroup + routinePool := util.NewRoutinePool(50) + for _, cluster := range filteredClusterList { + wg.Add(1) + routinePool.Add(1) + go func(cluster cmproto.Cluster) { + defer func() { + wg.Done() + routinePool.Done() + }() + + config := bcsClusterManager.GetKubeconfig(cluster.ClusterID) + clusterConfig, err := GetClusterConfig(cluster.ClusterID, config) + if err != nil { + klog.Errorf("GetClusterConfig %s failed: %s", cluster.ClusterID, err.Error()) + return + } + + // 已存在的集群信息则直接复用 + if existClusterConfig, ok := existClusterConfigList[cluster.ClusterID]; ok { + existClusterConfig.Config = config + existClusterConfig.ClientSet = 
clusterConfig.ClientSet + existClusterConfig.MetricSet = clusterConfig.MetricSet + clusterConfig = existClusterConfig + } + + clusterConfig.ClusterID = cluster.ClusterID + clusterConfig.BusinessID = cluster.BusinessID + clusterConfig.BCSCluster = cluster + clusterConfig.NodeInfo = make(map[string]plugin.NodeInfo) + + if strings.HasPrefix(cluster.SystemID, "cls") { + clusterConfig.ClusterType = pluginmanager.TKECluster + } + + mapLock.Lock() + clusterConfigList[clusterConfig.ClusterID] = clusterConfig + mapLock.Unlock() + }(cluster) + } + wg.Wait() + + return clusterConfigList, nil +} + +// GetClusterInfo return ClusterConfig by parsing kubeconfig file +func GetClusterInfo(kubeConfigDir string, existClusterConfigList map[string]*pluginmanager.ClusterConfig) (map[string]*pluginmanager.ClusterConfig, error) { + clusterConfigList := make(map[string]*pluginmanager.ClusterConfig) + var filePathList []string + err := filepath.Walk(kubeConfigDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + if info.Mode()&os.ModeSymlink != 0 { + return nil + } + + if !info.IsDir() && strings.HasSuffix(info.Name(), "config") { + filePathList = append(filePathList, path) + } + return nil + }) + if err != nil { + return nil, err + } + + for _, filePath := range filePathList { + filenameWithExt := filepath.Base(filePath) // 获取文件名(包括后缀) + ext := filepath.Ext(filenameWithExt) // 获取文件后缀 + + filename := strings.TrimSuffix(filenameWithExt, ext) // 移除后缀 + + config, err := k8s.GetRestConfigByConfig(filePath) + if err != nil { + return nil, err + } + + if config.CAData == nil { + config.TLSClientConfig.Insecure = true + } + + clusterConfig, err := GetClusterConfig(filename, config) + if err != nil { + klog.Errorf("GetClusterConfig %s failed: %s", filename, err.Error()) + continue + } + + if existClusterConfig, ok := existClusterConfigList[filename]; ok { + existClusterConfig.Config = config + existClusterConfig.ClientSet = clusterConfig.ClientSet + 
existClusterConfig.MetricSet = clusterConfig.MetricSet + clusterConfig = existClusterConfig + } + + clusterConfig.ClusterID = filename + clusterConfig.BusinessID = "0" + clusterConfig.NodeInfo = make(map[string]plugin.NodeInfo) + + // 读取配置文件时没配置bizid + clusterConfigList[filename] = clusterConfig + + klog.Infof("load kubeconfig success, clusterID: %s", filename) + } + + return clusterConfigList, nil +} + +// GetClusterConfig return ClusterConfig by clusterID and rest config +func GetClusterConfig(clusterID string, config *rest.Config) (*pluginmanager.ClusterConfig, error) { + clusterConfig := &pluginmanager.ClusterConfig{} + + clientSet, err := k8s.GetClientsetByConfig(config) + if err != nil { + return nil, fmt.Errorf("get clientset failed: %s, skip", err.Error()) + } + metricsClient, err := metricsclientset.NewForConfig(config) + if err != nil { + klog.Errorf("%s Get metric set failed: %s", clusterID, err.Error()) + } + + // 跳过算力集群 + apiResources, err := k8s.GetK8sApi(clientSet) + if err != nil { + klog.Errorf("get %s apiresourcelist failed: %s", clusterID, err.Error()) + } else { + for _, group := range apiResources { + if group.GroupVersion == "cluster.karmada.io/v1alpha1" { + return nil, fmt.Errorf("has karmada resource, skip") + } + } + } + + // 跳过work node为0的集群 + nodeList, err := clientSet.CoreV1().Nodes().List(util.GetCtx(time.Second*10), v1.ListOptions{ResourceVersion: "0"}) + masterList := make([]string, 0, 0) + if err != nil { + klog.Errorf("get %s node failed: %s", clusterID, err.Error()) + } else { + nodeNum := len(nodeList.Items) + for _, node := range nodeList.Items { + for key, val := range node.Labels { + if key == "node-role.kubernetes.io/master" && (val == "true" || val == "") { + nodeNum = nodeNum - 1 + for _, address := range node.Status.Addresses { + if address.Type == v12.NodeInternalIP { + masterList = append(masterList, address.Address) + } + } + + } + } + } + + // 排除没有任何工作节点的集群 + if nodeNum == 0 && strings.Contains(clusterID, "BCS-K8S-4") { 
+ return nil, fmt.Errorf("a cluster without any work nodes, skip") + } + } + + clusterConfig = &pluginmanager.ClusterConfig{ + Config: config, + ClientSet: clientSet, + MetricSet: metricsClient, + Master: masterList, + } + + clusterConfig.ClusterID = "incluster" + clusterConfig.BusinessID = "0" + clusterConfig.NodeInfo = make(map[string]plugin.NodeInfo) + return clusterConfig, nil +} + +func main() { + Execute() + defer klog.Flush() +} diff --git a/bcs-services/bcs-cluster-reporter/cmd/root.go b/bcs-services/bcs-cluster-reporter/cmd/root.go deleted file mode 100644 index 4bb6f0b1b2..0000000000 --- a/bcs-services/bcs-cluster-reporter/cmd/root.go +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package cmd -package cmd - -import ( - "context" - "flag" - "fmt" - _ "net/http/pprof" // pprof - "os" - "os/signal" - "path/filepath" - "runtime/pprof" - "strings" - "syscall" - "time" - - cmproto "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager/api/clustermanager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/cmd/options" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/api/bcs" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" - "github.com/spf13/cobra" - "github.com/spf13/viper" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/leaderelection" - "k8s.io/client-go/tools/leaderelection/resourcelock" - "k8s.io/klog" -) - -var ( - bcro = options.NewBcsClusterReporterOptions() - - rootCmd = &cobra.Command{ - Use: "tcctl", - Short: "bcs-cluster-reporter", - Long: ` -Basic Commands (Beginner): - Get Create a resource from a file or from stdin -`, - Run: func(cmd *cobra.Command, args []string) { - CheckErr(Complete(cmd, args)) - metric_manager.MM.RunPrometheusMetricsServer() - - err := Run() - if err != nil { - klog.Fatalf("bcs-cluster-reporter failed: %s", err.Error()) - } - }, - } -) - -// Run main process -func Run() error { - config, err := rest.InClusterConfig() - if err != nil { - return err - } - - client, err := kubernetes.NewForConfig(config) - if err != nil { - return err - } - - configFileBytes, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") - if err != nil { - return err - } - - id, err := os.Hostname() // os.Getenv("POD_NAME") - if err != nil { - return err - } - - leaseName := os.Getenv("DEPLOY_NAME") - if 
leaseName == "" { - leaseName = "bcs-cluster-reporter" - } - lock := &resourcelock.LeaseLock{ - LeaseMeta: metav1.ObjectMeta{ - Name: leaseName, - Namespace: string(configFileBytes), - }, - Client: client.CoordinationV1(), - LockConfig: resourcelock.ResourceLockConfig{ - Identity: id, - }, - } - - // 进行选举 - ctx, cancel := context.WithCancel(context.Background()) - - go func() { - for { - leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ - Lock: lock, - LeaseDuration: 30 * time.Second, - RenewDeadline: 15 * time.Second, - RetryPeriod: 5 * time.Second, - Callbacks: leaderelection.LeaderCallbacks{ - OnStartedLeading: run, - OnStoppedLeading: func() { - klog.Infof("leader lost: %s", id) - cancel() - ctx, cancel = context.WithCancel(context.Background()) - }, - }, - }) - } - }() - - // 退出清理 - signalChan := make(chan os.Signal, 1) - signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) - <-signalChan - // stop plugins - klog.Infof("start to shutdown bcs-cluster-reporter") - pprof.Lookup("heap").WriteTo(os.Stdout, 1) - cancel() - - return nil -} - -func run(ctx context.Context) { - getClusters() - - go func() { - select { - case <-ctx.Done(): - break - default: - for { - time.Sleep(time.Minute * 1) - getClusters() - } - } - }() - - // start plugins - err := plugin_manager.Pm.SetupPlugin(bcro.Plugins, bcro.PluginConfDir) - if err != nil { - klog.Fatalf("Setup plugin failed: %s", err.Error()) - } - - klog.Info("Setup plugins success") - - <-ctx.Done() - // 停止模块的运行 - klog.Infof("start to stop plugins") - err = plugin_manager.Pm.StopPlugin(bcro.Plugins) - if err != nil { - klog.Fatalf("Setup plugin failed: %s", err.Error()) - } - klog.Infof("done stop plugins") -} - -// Execute rootCmd -func Execute() { - err := rootCmd.Execute() - if err != nil { - klog.Fatalf(err.Error()) - } -} - -// CheckErr check err -func CheckErr(err error) { - if err != nil { - klog.Fatalf(err.Error()) - } -} - -// Complete xxx -func Complete(cmd *cobra.Command, args []string) 
error { - if bcro.BcsClusterManagerToken != "" || bcro.BcsClusterManagerApiserver != "" || bcro.BcsGatewayApiserver != "" || - bcro.BcsGatewayToken != "" { - if bcro.BcsClusterManagerToken == "" || bcro.BcsClusterManagerApiserver == "" || bcro.BcsGatewayApiserver == "" || - bcro.BcsGatewayToken == "" { - return fmt.Errorf( - "bcs config missing, BcsClusterManagerToken, BcsClusterManagerApiserver, BcsGatewayApiserver, BcsGatewayToken must be set") - } else { - bcro.BcsClusterManagerToken = util.Decode(bcro.BcsClusterManagerToken) - bcro.BcsGatewayToken = util.Decode(bcro.BcsGatewayToken) - } - } - - if (bcro.BcsGatewayApiserver != "" && bcro.BcsClusterManagerApiserver != "" && bcro.BcsGatewayToken != "" && - bcro.BcsClusterManagerToken != "") && bcro.InCluster { - return fmt.Errorf("when run in in-cluster mode, no need to set bcs params") - } - - if bcro.KubeConfigDir != "" && bcro.InCluster { - return fmt.Errorf("when run in in-cluster mode, no need to set kubeConfigDir") - } - - if bcro.InCluster && (bcro.ClusterID == "" || bcro.BizID == "") { - return fmt.Errorf("when run in in-cluster mode, need to set clusterID and bizID") - } - return nil -} - -func init() { - flags := rootCmd.PersistentFlags() - bcro.AddFlags(flags) - - cobra.OnInitialize(initConfig) - - // init klog flag - fs := flag.NewFlagSet("", flag.PanicOnError) - klog.InitFlags(fs) - rootCmd.PersistentFlags().AddGoFlagSet(fs) - - err := viper.BindPFlags(flags) - if err != nil { - klog.Fatalf("Viper bindPFlags failed: %s", err.Error()) - } - -} - -// initConfig -// configure viper to read config -func initConfig() {} - -func getClusters() { - clusterConfigList := make([]plugin_manager.ClusterConfig, 0, 0) - - // 获取BCS集群配置 - if bcro.BcsGatewayApiserver != "" && bcro.BcsClusterManagerApiserver != "" && bcro.BcsGatewayToken != "" && - bcro.BcsClusterManagerToken != "" { - bcsClusterManager, err := bcs.NewClusterManager(bcro.BcsClusterManagerToken, bcro.BcsClusterManagerApiserver, - 
bcro.BcsGatewayApiserver, bcro.BcsGatewayToken) - if err != nil { - klog.Fatalf("NewClusterManager failed: %s", err.Error()) - } - - clusterList, err := bcsClusterManager.GetClusters([]string{}) - if err != nil { - klog.Errorf("NewClusterManager failed: %s", err.Error()) - return - } - - filteredClusterList := make([]cmproto.Cluster, 0, 0) - if len(bcro.BcsClusterList) != 0 { - for _, clusterId := range bcro.BcsClusterList { - for _, cluster := range clusterList { - if clusterId == cluster.ClusterID { - filteredClusterList = append(filteredClusterList, cluster) - } - break - } - } - } else { - for _, cluster := range clusterList { - if cluster.IsShared == true { - continue // 跳过公共集群的记录 - } else if cluster.Status != "RUNNING" { - continue // 跳过未就绪集群 - } else if cluster.EngineType != "k8s" { - continue - } else { - if len(cluster.Master) > 0 { - continueFlag := false - for masterName, _ := range cluster.Master { - if strings.Contains(masterName, "127.0.0") { - // 跳过算力集群 - continueFlag = true - break - } - } - if continueFlag { - continue - } - } - - // 选取对应类型的集群 - if (cluster.Environment == bcro.BcsClusterType && bcro.BcsClusterType != "") || bcro.BcsClusterType == "" { - if cluster.CreateTime != "" { - createTime, err := time.Parse(time.RFC3339, cluster.CreateTime) - if err != nil { - klog.Errorf("parse cluster %s createtime failed %s", cluster.ClusterID, err.Error()) - continue - } - // 创建时间超过60分钟才进行巡检 - if (time.Now().Unix() - createTime.Unix()) > 60*60 { - filteredClusterList = append(filteredClusterList, cluster) - } - } - } - } - } - } - - for _, cluster := range filteredClusterList { - clusterConfigList = append(clusterConfigList, plugin_manager.ClusterConfig{ - ClusterID: cluster.ClusterID, - Config: bcsClusterManager.GetKubeconfig(cluster.ClusterID), - BusinessID: cluster.BusinessID, - }) - } - } - - // 获取kubeconfig的配置 - if bcro.KubeConfigDir != "" { - var filePathList []string - err := filepath.Walk(bcro.KubeConfigDir, func(path string, info os.FileInfo, 
err error) error { - if err != nil { - return err - } - - if info.Mode()&os.ModeSymlink != 0 { - return nil - } - - if !info.IsDir() && strings.HasSuffix(info.Name(), "config") { - filePathList = append(filePathList, path) - } - return nil - }) - if err != nil { - klog.Fatalf("Error: %s", err.Error()) - return - } - - for _, filePath := range filePathList { - config, err := k8s.GetRestConfigByConfig(filePath) - if err != nil { - klog.Fatalf("Error: %s", err.Error()) - return - } - - config.TLSClientConfig.Insecure = true - - filenameWithExt := filepath.Base(filePath) // 获取文件名(包括后缀) - ext := filepath.Ext(filenameWithExt) // 获取文件后缀 - - filename := strings.TrimSuffix(filenameWithExt, ext) // 移除后缀 - // 读取配置文件时没配置bizid - clusterConfigList = append(clusterConfigList, plugin_manager.ClusterConfig{BusinessID: "0", ClusterID: filename, - Config: config}) - klog.Infof("load kubeconfig success, clusterID: %s", filename) - } - } - - // Incluster模式 - if bcro.InCluster { - config, err := rest.InClusterConfig() - if err != nil { - klog.Fatalf("Error: %s", err.Error()) - return - } - clusterConfigList = append(clusterConfigList, plugin_manager.ClusterConfig{BusinessID: bcro.BizID, - ClusterID: bcro.ClusterID, Config: config}) - plugin_manager.Pm.SetConfig(&plugin_manager.Config{ - ClusterConfigs: clusterConfigList, - InClusterConfig: plugin_manager.ClusterConfig{BusinessID: bcro.BizID, ClusterID: bcro.ClusterID, Config: config}, - }) - } else { - // 集中化模式 - plugin_manager.Pm.SetConfig(&plugin_manager.Config{ - ClusterConfigs: clusterConfigList, - }) - } -} diff --git a/bcs-services/bcs-cluster-reporter/go.mod b/bcs-services/bcs-cluster-reporter/go.mod index 08fac13504..ee332d252c 100644 --- a/bcs-services/bcs-cluster-reporter/go.mod +++ b/bcs-services/bcs-cluster-reporter/go.mod @@ -1,162 +1,244 @@ module github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter -go 1.20 +go 1.21 replace ( - k8s.io/api => k8s.io/api v0.26.0 - k8s.io/apimachinery => k8s.io/apimachinery v0.26.0 - 
k8s.io/cli-runtime => k8s.io/cli-runtime v0.26.0 - k8s.io/client-go => k8s.io/client-go v0.26.0 + k8s.io/api => k8s.io/api v0.27.3 + k8s.io/apimachinery => k8s.io/apimachinery v0.27.3 + k8s.io/cli-runtime => k8s.io/cli-runtime v0.27.3 + k8s.io/client-go => k8s.io/client-go v0.27.3 + k8s.io/kubectl => k8s.io/kubectl v0.27.3 ) require ( - github.com/Tencent/bk-bcs/bcs-common v0.0.0-20230202032803-47379be3c4c1 - github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager v0.0.0-20230209072106-23fc6b236f83 - github.com/containerd/containerd v1.6.17 - github.com/dlclark/regexp2 v1.10.0 - github.com/prometheus/client_golang v1.14.0 - github.com/spf13/cobra v1.6.1 + github.com/PuerkitoBio/goquery v1.9.2 + github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager v0.0.0-20240913071335-8812bb30f497 + github.com/beevik/ntp v1.4.1 + github.com/containerd/containerd v1.7.11 + github.com/docker/docker v24.0.7+incompatible + github.com/gin-contrib/pprof v1.5.0 + github.com/gin-gonic/gin v1.9.1 + github.com/hashicorp/go-version v1.6.0 + github.com/jaypipes/ghw v0.12.0 + github.com/jung-kurt/gofpdf v1.16.2 + github.com/miekg/dns v1.1.50 + github.com/patrickmn/go-cache v2.1.0+incompatible + github.com/prometheus/client_golang v1.19.0 + github.com/prometheus/client_model v0.5.0 + github.com/prometheus/common v0.48.0 + github.com/shirou/gopsutil v3.21.11+incompatible + github.com/spf13/cobra v1.8.0 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.15.0 - golang.org/x/net v0.5.0 + github.com/vishvananda/netlink v1.2.1-beta.2 + golang.org/x/net v0.24.0 + golang.org/x/sys v0.20.0 gopkg.in/yaml.v2 v2.4.0 - helm.sh/helm/v3 v3.11.1 - k8s.io/api v0.26.0 - k8s.io/apimachinery v0.26.0 + helm.sh/helm/v3 v3.13.3 + k8s.io/api v0.29.2 + k8s.io/apimachinery v0.29.2 + k8s.io/cli-runtime v0.29.0 k8s.io/client-go v11.0.0+incompatible k8s.io/klog v1.0.0 - k8s.io/klog/v2 v2.80.1 + k8s.io/klog/v2 v2.120.1 + k8s.io/kubectl v0.29.0 + k8s.io/metrics v0.29.0 + k8s.io/utils 
v0.0.0-20240902221715-702e33fdd3c3 ) require ( - github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect - github.com/BurntSushi/toml v1.2.1 // indirect + github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/containerd/cgroups/v3 v3.0.3 // indirect + github.com/containerd/containerd/v2 v2.0.0-rc.1 + github.com/containerd/continuity v0.4.3 // indirect + github.com/containerd/errdefs v0.1.0 // indirect + github.com/containerd/fifo v1.1.0 // indirect + github.com/containerd/platforms v0.1.1 // indirect + github.com/containerd/plugin v0.1.0 // indirect + github.com/containerd/typeurl/v2 v2.1.1 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/moby/sys/mountinfo v0.7.1 + github.com/moby/sys/sequential v0.5.0 // indirect + github.com/moby/sys/signal v0.7.0 // indirect + github.com/moby/sys/user v0.1.0 // indirect + github.com/opencontainers/runtime-spec v1.2.0 // indirect + github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect + github.com/opencontainers/selinux v1.11.0 // indirect + github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect + go.opencensus.io v0.24.0 // indirect + tags.cncf.io/container-device-interface v0.7.2 // indirect + tags.cncf.io/container-device-interface/specs-go v0.7.0 // indirect +) + +require ( + github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect + github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/BurntSushi/toml v1.3.2 // indirect github.com/MakeNowJust/heredoc v1.0.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect - github.com/Masterminds/semver/v3 v3.2.0 // indirect + github.com/Masterminds/semver/v3 v3.2.1 // indirect github.com/Masterminds/sprig/v3 v3.2.3 // indirect - github.com/Masterminds/squirrel v1.5.3 // indirect - 
github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 // indirect + github.com/Masterminds/squirrel v1.5.4 // indirect + github.com/Microsoft/hcsshim v0.12.3 // indirect + github.com/StackExchange/wmi v1.2.1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.1.2 // indirect + github.com/bytedance/sonic v1.11.6 // indirect + github.com/bytedance/sonic/loader v0.1.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect - github.com/containerd/ttrpc v1.1.0 // indirect - github.com/cyphar/filepath-securejoin v0.2.3 // indirect + github.com/cloudwego/base64x v0.1.4 // indirect + github.com/cloudwego/iasm v0.2.0 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/containerd/ttrpc v1.2.3 // indirect + github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/docker/cli v20.10.21+incompatible // indirect - github.com/docker/distribution v2.8.1+incompatible // indirect - github.com/docker/docker v20.10.21+incompatible // indirect - github.com/docker/docker-credential-helpers v0.7.0 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/cli v24.0.7+incompatible // indirect + github.com/docker/distribution v2.8.2+incompatible // indirect + github.com/docker/docker-credential-helpers v0.8.0 // indirect github.com/docker/go-connections v0.4.0 // indirect github.com/docker/go-metrics v0.0.1 // indirect - github.com/docker/go-units v0.4.0 // indirect - github.com/emicklei/go-restful/v3 v3.9.0 // indirect - github.com/envoyproxy/protoc-gen-validate v0.4.1 // indirect - github.com/evanphx/json-patch v5.6.0+incompatible // indirect - github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect - 
github.com/fatih/color v1.13.0 // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/go-errors/errors v1.0.1 // indirect - github.com/go-gorp/gorp/v3 v3.0.2 // indirect - github.com/go-logr/logr v1.2.3 // indirect - github.com/go-openapi/jsonpointer v0.19.5 // indirect - github.com/go-openapi/jsonreference v0.20.0 // indirect - github.com/go-openapi/swag v0.19.14 // indirect + github.com/docker/go-units v0.5.0 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect + github.com/evanphx/json-patch v5.7.0+incompatible // indirect + github.com/evanphx/json-patch/v5 v5.5.0 // indirect + github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect + github.com/fatih/color v1.16.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/gabriel-vasile/mimetype v1.4.3 // indirect + github.com/ghodss/yaml v1.0.0 // indirect + github.com/gin-contrib/sse v0.1.0 // indirect + github.com/go-acme/lego/v4 v4.4.0 // indirect + github.com/go-errors/errors v1.5.1 // indirect + github.com/go-gorp/gorp/v3 v3.1.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/go-openapi/jsonpointer v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.20.4 // indirect + github.com/go-openapi/swag v0.22.7 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.1 // indirect + github.com/go-playground/validator/v10 v10.20.0 // indirect github.com/gobwas/glob v0.2.3 // indirect + github.com/gobwas/httphead v0.1.0 // indirect + github.com/gobwas/pool v0.2.1 // indirect + github.com/gobwas/ws v1.3.0 // indirect + github.com/goccy/go-json v0.10.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.2 // 
indirect - github.com/google/btree v1.0.1 // indirect - github.com/google/gnostic v0.5.7-v3refs // indirect - github.com/google/go-cmp v0.5.9 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/btree v1.1.2 // indirect + github.com/google/gnostic v0.7.0 // indirect + github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect + github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect - github.com/google/uuid v1.3.0 // indirect - github.com/gorilla/mux v1.8.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/handlers v1.5.1 // indirect + github.com/gorilla/mux v1.8.1 // indirect github.com/gosuri/uitable v0.0.4 // indirect - github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/hcl v1.0.0 // indirect - github.com/huandu/xstrings v1.3.3 // indirect - github.com/imdario/mergo v0.3.13 // indirect - github.com/inconshreveable/mousetrap v1.0.1 // indirect + github.com/huandu/xstrings v1.4.0 // indirect + github.com/imdario/mergo v0.3.16 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/jaypipes/pcidb v1.0.0 // indirect github.com/jmoiron/sqlx v1.3.5 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.13.6 // indirect + github.com/klauspost/compress v1.17.8 // indirect + github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/lib/pq v1.10.7 // 
indirect + github.com/leodido/go-urn v1.4.0 // indirect + github.com/lib/pq v1.10.9 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect github.com/magiconair/properties v1.8.7 // indirect - github.com/mailru/easyjson v0.7.6 // indirect - github.com/mattn/go-colorable v0.1.12 // indirect - github.com/mattn/go-isatty v0.0.14 // indirect - github.com/mattn/go-runewidth v0.0.9 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/micro/go-micro/v2 v2.9.1 // indirect - github.com/miekg/dns v1.1.50 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect - github.com/mitchellh/go-wordwrap v1.0.0 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/moby/locker v1.0.1 // indirect github.com/moby/spdystream v0.2.0 // indirect - github.com/moby/term v0.0.0-20221205130635-1aeaba878587 // indirect + github.com/moby/term v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/morikuni/aec v1.0.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.0-rc2 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect - github.com/patrickmn/go-cache v2.1.0+incompatible // indirect - github.com/pelletier/go-toml/v2 v2.0.6 // 
indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.37.0 // indirect - github.com/prometheus/procfs v0.8.0 // indirect - github.com/rubenv/sql-migrate v1.2.0 // indirect + github.com/prometheus/procfs v0.12.0 // indirect + github.com/rivo/uniseg v0.4.4 // indirect + github.com/rubenv/sql-migrate v1.6.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/shopspring/decimal v1.2.0 // indirect - github.com/sirupsen/logrus v1.9.0 // indirect - github.com/spf13/afero v1.9.3 // indirect - github.com/spf13/cast v1.5.0 // indirect + github.com/shopspring/decimal v1.3.1 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + github.com/spf13/afero v1.10.0 // indirect + github.com/spf13/cast v1.6.0 // indirect github.com/spf13/jwalterweatherman v1.1.0 // indirect github.com/subosito/gotenv v1.4.2 // indirect - github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect + github.com/tklauser/go-sysconf v0.3.14 // indirect + github.com/tklauser/numcpus v0.8.0 // indirect + github.com/twitchyliquid64/golang-asm v0.15.1 // indirect + github.com/ugorji/go/codec v1.2.12 // indirect + github.com/vishvananda/netns v0.0.4 // indirect + github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/xeipuuv/gojsonschema v1.2.0 // indirect - github.com/xlab/treeprint v1.1.0 // indirect - go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect - golang.org/x/crypto v0.5.0 // indirect - golang.org/x/mod v0.6.0 // indirect - golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 // indirect - golang.org/x/sync v0.1.0 // indirect - golang.org/x/sys v0.4.0 // indirect - golang.org/x/term v0.4.0 // indirect - golang.org/x/text 
v0.6.0 // indirect - golang.org/x/time v0.1.0 // indirect - golang.org/x/tools v0.2.0 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef // indirect - google.golang.org/grpc v1.52.0 // indirect - google.golang.org/protobuf v1.28.1 // indirect + github.com/xlab/treeprint v1.2.0 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go-micro.dev/v4 v4.9.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0 // indirect + go.opentelemetry.io/otel v1.25.0 // indirect + go.opentelemetry.io/otel/metric v1.25.0 // indirect + go.opentelemetry.io/otel/trace v1.25.0 // indirect + go.starlark.net v0.0.0-20231121155337-90ade8b19d09 // indirect + golang.org/x/arch v0.8.0 // indirect + golang.org/x/crypto v0.22.0 // indirect + golang.org/x/mod v0.17.0 // indirect + golang.org/x/oauth2 v0.18.0 // indirect + golang.org/x/sync v0.7.0 // indirect + golang.org/x/term v0.19.0 // indirect + golang.org/x/text v0.15.0 // indirect + golang.org/x/time v0.5.0 // indirect + golang.org/x/tools v0.16.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect + google.golang.org/grpc v1.63.2 // indirect + google.golang.org/protobuf v1.34.0 // indirect + gopkg.in/evanphx/json-patch.v5 v5.7.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.26.0 // indirect - k8s.io/apiserver v0.26.0 // indirect - k8s.io/cli-runtime v0.26.0 // indirect - k8s.io/component-base v0.26.0 // indirect - k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect - k8s.io/kubectl v0.26.0 // indirect - k8s.io/utils 
v0.0.0-20221107191617-1a15be271d1d // indirect - oras.land/oras-go v1.2.2 // indirect - sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect - sigs.k8s.io/kustomize/api v0.12.1 // indirect - sigs.k8s.io/kustomize/kyaml v0.13.9 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect - sigs.k8s.io/yaml v1.3.0 // indirect + howett.net/plist v1.0.0 // indirect + k8s.io/apiextensions-apiserver v0.29.0 // indirect + k8s.io/apiserver v0.29.2 // indirect + k8s.io/component-base v0.29.2 // indirect + k8s.io/kube-openapi v0.0.0-20231214164306-ab13479f8bf8 // indirect + oras.land/oras-go v1.2.4 // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/kustomize/api v0.16.0 // indirect + sigs.k8s.io/kustomize/kyaml v0.16.0 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/bcs-services/bcs-cluster-reporter/internal/api/bcs/cluster.go b/bcs-services/bcs-cluster-reporter/internal/api/bcs/cluster.go index 650a652db9..bb5c3682af 100644 --- a/bcs-services/bcs-cluster-reporter/internal/api/bcs/cluster.go +++ b/bcs-services/bcs-cluster-reporter/internal/api/bcs/cluster.go @@ -10,12 +10,11 @@ * limitations under the License. 
*/ -// Package bcs +// Package bcs xxx package bcs import ( "encoding/json" - "errors" "fmt" "net/http" "net/url" @@ -36,8 +35,8 @@ func (cm *ClusterManager) GetClusters(clusterIds []string) ([]cmproto.Cluster, e httpClient *http.Client ) - rt = &BcsTransport{ - token: cm.token, + rt = &rest.BcsTransport{ + Token: cm.token, } httpClient = &http.Client{Transport: rt} @@ -65,7 +64,7 @@ func (cm *ClusterManager) GetClusters(clusterIds []string) ([]cmproto.Cluster, e } if !result.Result { - e := errors.New(fmt.Sprintf("cluster response result failed: %s", result.Msg)) + e := fmt.Errorf("cluster response result failed: %s", result.Msg) klog.V(3).Info(e.Error()) return nil, e } @@ -74,15 +73,15 @@ func (cm *ClusterManager) GetClusters(clusterIds []string) ([]cmproto.Cluster, e clusterList := make([]cmproto.Cluster, 0, 0) err = json.Unmarshal(clusterData, &clusterList) if err != nil { + klog.Errorf(err.Error()) cluster := cmproto.Cluster{} err = json.Unmarshal(clusterData, &cluster) if err != nil { - e := errors.New(fmt.Sprintf("Unmarshal cluster response failed %s", err.Error())) - klog.V(3).Info(e.Error()) + e := fmt.Errorf("Unmarshal cluster response failed %s", err.Error()) return nil, e - } else { - clusterList = append(clusterList, cluster) } + + clusterList = append(clusterList, cluster) } resultList = append(resultList, clusterList...) 
} @@ -92,7 +91,7 @@ func (cm *ClusterManager) GetClusters(clusterIds []string) ([]cmproto.Cluster, e // GetNodesByClusterId get cluster nodes func (cm *ClusterManager) GetNodesByClusterId(clusterId string) ([]cmproto.Node, error) { if clusterId == "" { - return nil, errors.New("ClusterId cannot be blank") + return nil, fmt.Errorf("ClusterId cannot be blank") } svcUrl, _ := url.Parse(cm.url + fmt.Sprintf(_urlMap["GetNodesByClusterId"], clusterId)) klog.V(6).Infof("start ClusterManager request %s", svcUrl.String()) @@ -102,7 +101,7 @@ func (cm *ClusterManager) GetNodesByClusterId(clusterId string) ([]cmproto.Node, httpClient *http.Client ) - rt = &BcsTransport{token: cm.token} + rt = &rest.BcsTransport{Token: cm.token} httpClient = &http.Client{Transport: rt, Timeout: 10 * time.Second} req := rest.NewRequest(httpClient, "GET", svcUrl, nil) @@ -118,8 +117,8 @@ func (cm *ClusterManager) GetNodesByClusterId(clusterId string) ([]cmproto.Node, } if !result.Result { - e := errors.New(fmt.Sprintf("cluster response result failed: %s", result.Msg)) - klog.V(3).Info(e.Error()) + e := fmt.Errorf("cluster response result failed: %s", result.Msg) + klog.Info(e.Error()) return nil, e } @@ -127,7 +126,7 @@ func (cm *ClusterManager) GetNodesByClusterId(clusterId string) ([]cmproto.Node, nodeList := make([]cmproto.Node, 0, 0) err = json.Unmarshal(nodeData, &nodeList) if err != nil { - e := errors.New(fmt.Sprintf("Unmarshal cluster response failed %s", err.Error())) + e := fmt.Errorf("Unmarshal cluster response failed %s", err.Error()) klog.V(3).Info(e.Error()) return nil, e } @@ -144,7 +143,7 @@ func (cm *ClusterManager) GetNode(ip string) (*cmproto.Node, error) { httpClient *http.Client ) - rt = &BcsTransport{token: cm.token} + rt = &rest.BcsTransport{Token: cm.token} httpClient = &http.Client{Transport: rt} req := rest.NewRequest(httpClient, "GET", svcUrl, nil) @@ -160,8 +159,8 @@ func (cm *ClusterManager) GetNode(ip string) (*cmproto.Node, error) { } if !result.Result { - e := 
errors.New(fmt.Sprintf("getnode response result failed: %s", result.Msg)) - klog.V(3).Info(e.Error()) + e := fmt.Errorf("getnode response result failed: %s", result.Msg) + klog.Errorf(e.Error()) return nil, e } @@ -169,14 +168,14 @@ func (cm *ClusterManager) GetNode(ip string) (*cmproto.Node, error) { nodeList := make([]cmproto.Node, 0, 0) err = json.Unmarshal(nodeData, &nodeList) if err != nil { - e := errors.New(fmt.Sprintf("Unmarshal getnode response failed %s", err.Error())) - klog.V(3).Info(e.Error()) + e := fmt.Errorf("Unmarshal getnode response failed %s", err.Error()) + klog.Errorf(e.Error()) return nil, e } if len(nodeList) != 1 { - e := errors.New(fmt.Sprintf("getnode result num wrong %s", err.Error())) - klog.V(3).Info(e.Error()) + e := fmt.Errorf("getnode result num wrong %s", err.Error()) + klog.Errorf(e.Error()) return nil, e } @@ -193,6 +192,9 @@ func (cm *ClusterManager) GetKubeconfig(clusterID string) *k8srest.Config { CAFile: "", CAData: nil, }, + Timeout: time.Minute, + QPS: 30, + Burst: 60, } return config diff --git a/bcs-services/bcs-cluster-reporter/internal/api/bcs/clustermanager.go b/bcs-services/bcs-cluster-reporter/internal/api/bcs/clustermanager.go index 5daa3d1578..ddfbcea955 100644 --- a/bcs-services/bcs-cluster-reporter/internal/api/bcs/clustermanager.go +++ b/bcs-services/bcs-cluster-reporter/internal/api/bcs/clustermanager.go @@ -10,6 +10,7 @@ * limitations under the License. */ +// Package bcs xxx package bcs import ( diff --git a/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud.go b/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud.go new file mode 100644 index 0000000000..68e4a689a3 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud.go @@ -0,0 +1,72 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package qcloud xxx +package qcloud + +import ( + "fmt" + "net/http" + "net/url" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/rest" +) + +// NodeMetaData node info struct +type NodeMetaData struct { + InstanceID string + Region string + Zone string + InstanceType string + InstanceImageID string +} + +// GetQcloudNodeMetadata get cvm info +func GetQcloudNodeMetadata() (*NodeMetaData, error) { + var err error + var result = &NodeMetaData{} + + result.InstanceID, err = GetMetadata("instance-id") + if err != nil { + return nil, err + } + result.Region, err = GetMetadata("placement/region") + if err != nil { + return nil, err + } + result.Zone, err = GetMetadata("placement/zone") + if err != nil { + return nil, err + } + result.InstanceImageID, err = GetMetadata("instance/image-id") + if err != nil { + return nil, err + } + result.InstanceType, err = GetMetadata("instance/instance-type") + if err != nil { + return nil, err + } + + return result, nil +} + +// GetMetadata get cvm info by qcloud api +func GetMetadata(item string) (string, error) { + httpClient := &http.Client{} + svcUrl, _ := url.Parse(fmt.Sprintf("http://metadata.tencentyun.com/latest/meta-data/%s", item)) + req := rest.NewRequest(httpClient, "GET", svcUrl, nil) + data, err := req.Do() + if err != nil { + return "", err + } + return string(data), nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud_test.go 
b/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud_test.go new file mode 100644 index 0000000000..6151674dc4 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/api/qcloud/qcloud_test.go @@ -0,0 +1,16 @@ +package qcloud + +import ( + "fmt" + "testing" +) + +func TestGetQcloudNodeInfo(t *testing.T) { + result, err := GetQcloudNodeMetadata() + if err != nil { + t.Error(err) + } + + fmt.Println(result) + +} diff --git a/bcs-services/bcs-cluster-reporter/internal/k8s/get.go b/bcs-services/bcs-cluster-reporter/internal/k8s/get.go index 690d4cdd32..f15d4a3db1 100644 --- a/bcs-services/bcs-cluster-reporter/internal/k8s/get.go +++ b/bcs-services/bcs-cluster-reporter/internal/k8s/get.go @@ -14,38 +14,55 @@ package k8s import ( + "context" "fmt" "time" - version2 "k8s.io/apimachinery/pkg/version" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/version" "k8s.io/client-go/kubernetes" ) // GetK8sVersion get cluster k8s version func GetK8sVersion(clientSet *kubernetes.Clientset) (string, error) { - versionChann := make(chan int) - var version string + var versionStr string var versionError error - var versionInfo *version2.Info + var versionInfo *version.Info + + ctx, cancel := context.WithTimeout(context.Background(), 11*time.Second) go func() { versionInfo, versionError = clientSet.ServerVersion() - if versionError != nil { - version = "" - } else if versionInfo.GitVersion == "" { - version = "" - versionError = fmt.Errorf("get blank result") - } else { - version = versionInfo.GitVersion + if ctx.Err() != nil { + return } - versionChann <- 0 + cancel() }() select { case <-time.After(10 * time.Second): return "", fmt.Errorf("get k8s version timeout") - case <-versionChann: + case <-ctx.Done(): + } + + if versionError != nil { + versionStr = "" + } else if versionInfo.GitVersion == "" { + versionStr = "" + versionError = fmt.Errorf("get blank result") + } else { + versionStr = versionInfo.GitVersion + } + + return versionStr, 
versionError +} + +// GetK8sApi get apiserver api list +func GetK8sApi(clientSet *kubernetes.Clientset) ([]*v1.APIResourceList, error) { + apiResources, err := clientSet.Discovery().ServerPreferredResources() + if err != nil { + return nil, err } - return version, versionError + return apiResources, nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/k8s/init.go b/bcs-services/bcs-cluster-reporter/internal/k8s/init.go index d602b8398d..1673ccf844 100644 --- a/bcs-services/bcs-cluster-reporter/internal/k8s/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/k8s/init.go @@ -10,9 +10,12 @@ * limitations under the License. */ +// Package k8s xxx package k8s import ( + "os" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/client-go/discovery" "k8s.io/client-go/discovery/cached/memory" @@ -31,7 +34,7 @@ func GetClientsetByConfig(config *rest.Config) (*kubernetes.Clientset, error) { // GetRestClientGetterByConfig restClient func GetRestClientGetterByConfig(config *rest.Config) *RESTClientGetter { return &RESTClientGetter{ - clientconfig: clientcmd.NewDefaultClientConfig(BuildKubeconfig(config), nil), + clientconfig: clientcmd.NewDefaultClientConfig(BuildKubeconfig(config), &clientcmd.ConfigOverrides{Timeout: "20"}), } } @@ -41,11 +44,31 @@ func GetRestConfigByConfig(filePath string) (*rest.Config, error) { if err != nil { return nil, err } - kubeconfig.QPS = 300 - kubeconfig.Burst = 600 + kubeconfig.QPS = 30 + kubeconfig.Burst = 60 return kubeconfig, nil } +// GetRestConfig get rest config by kubeconfig file +func GetRestConfig() (*rest.Config, error) { + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + kubeconfig = os.Getenv("HOME") + "/.kube/config" + } + restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + restConfig, err = rest.InClusterConfig() + if err != nil { + restConfig = &rest.Config{ + Host: "http://localhost:8080", + } + } + } + restConfig.QPS = 30 + restConfig.Burst = 60 + return restConfig, nil +} + 
// BuildKubeconfig build cluster config func BuildKubeconfig(config *rest.Config) clientcmdapi.Config { kubeConfig := clientcmdapi.Config{ @@ -58,9 +81,10 @@ func BuildKubeconfig(config *rest.Config) clientcmdapi.Config { } kubeConfig.Clusters["default-cluster"] = &clientcmdapi.Cluster{ - Server: config.Host, - CertificateAuthority: config.CAFile, - InsecureSkipTLSVerify: config.Insecure, + Server: config.Host, + CertificateAuthority: config.CAFile, + InsecureSkipTLSVerify: config.Insecure, + CertificateAuthorityData: config.CAData, } kubeConfig.Contexts["default-context"] = &clientcmdapi.Context{ @@ -69,9 +93,11 @@ func BuildKubeconfig(config *rest.Config) clientcmdapi.Config { AuthInfo: "default", } kubeConfig.AuthInfos["default"] = &clientcmdapi.AuthInfo{ - TokenFile: config.BearerTokenFile, - Username: config.Username, - Token: config.BearerToken, + TokenFile: config.BearerTokenFile, + Username: config.Username, + Token: config.BearerToken, + ClientKeyData: config.TLSClientConfig.KeyData, + ClientCertificateData: config.TLSClientConfig.CertData, } return kubeConfig diff --git a/bcs-services/bcs-cluster-reporter/internal/k8s/k8s_test.go b/bcs-services/bcs-cluster-reporter/internal/k8s/k8s_test.go new file mode 100644 index 0000000000..afbd902351 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/k8s/k8s_test.go @@ -0,0 +1,39 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package k8s xxx +package k8s + +import ( + "fmt" + "testing" + + "k8s.io/client-go/tools/clientcmd" +) + +func TestGetK8sVersion(t *testing.T) { + config, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config") + if err != nil { + // 处理错误 + } + + clientset, err := GetClientsetByConfig(config) + if err != nil { + // 处理错误 + } + + version, err := GetK8sVersion(clientset) + if err != nil { + fmt.Println(err.Error()) + } + fmt.Println("集群版本:", version) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/k8s/pod.go b/bcs-services/bcs-cluster-reporter/internal/k8s/pod.go index f1511d880c..d4370a0937 100644 --- a/bcs-services/bcs-cluster-reporter/internal/k8s/pod.go +++ b/bcs-services/bcs-cluster-reporter/internal/k8s/pod.go @@ -1,15 +1,16 @@ /* - * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Tencent is pleased to support the open source community by making Blueking Container Service available. * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. * Licensed under the MIT License (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, + * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language governing permissions and * limitations under the License. 
*/ +// Package k8s xxx package k8s import ( @@ -33,14 +34,13 @@ func GetPods(clientSet *kubernetes.Clientset, namespace string, opts v1.ListOpti if nameRe == "" { return podList.Items, nil - } else { - re, _ := regexp.Compile(nameRe) - result := make([]corev1.Pod, 0, 0) - for _, pod := range podList.Items { - if re.MatchString(pod.Name) || strings.Contains(pod.Name, nameRe) { - result = append(result, pod) - } + } + re, _ := regexp.Compile(nameRe) + result := make([]corev1.Pod, 0, 0) + for _, pod := range podList.Items { + if re.MatchString(pod.Name) || strings.Contains(pod.Name, nameRe) { + result = append(result, pod) } - return result, nil } + return result, nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/metric_manager/metric.go b/bcs-services/bcs-cluster-reporter/internal/metricmanager/metric.go similarity index 68% rename from bcs-services/bcs-cluster-reporter/internal/metric_manager/metric.go rename to bcs-services/bcs-cluster-reporter/internal/metricmanager/metric.go index c38b83901f..544ba49df3 100644 --- a/bcs-services/bcs-cluster-reporter/internal/metric_manager/metric.go +++ b/bcs-services/bcs-cluster-reporter/internal/metricmanager/metric.go @@ -10,16 +10,16 @@ * limitations under the License. 
*/ -// Package metric_manager -package metric_manager +// Package metricmanager xxx +package metricmanager import ( "net/http" - "runtime/debug" "strings" "sync" "time" + "github.com/gin-gonic/gin" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/klog" @@ -43,8 +43,8 @@ var ( requestLatencyAPI = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "plugin_latency_time", Help: "plugin latency statistic ", - Buckets: []float64{0.1, 0.5, 0.75, 1.0, 2.0, 3.0, 5.0, 10.0, 20.0}, - }, []string{"plugin", "condition1", "condition2", "condition3"}) + Buckets: []float64{30, 60, 90, 120, 150, 180, 210, 240, 270}, + }, []string{"plugin", "target", "condition2", "condition3"}) // MM MM *MetricManger @@ -62,6 +62,7 @@ func init() { type MetricManger struct { registryMap map[string]*prometheus.Registry registryMapLock sync.Mutex + engine *gin.Engine } // NewMetricManger init metric manager @@ -71,6 +72,11 @@ func NewMetricManger() *MetricManger { } } +// SetEngine xxx +func (mm *MetricManger) SetEngine(r *gin.Engine) { + mm.engine = r +} + // SetSeperatedMetric 将指标暴露在独立于/metrics的其他路径上 /path/metrics func (mm *MetricManger) SetSeperatedMetric(path string) { if _, ok := mm.registryMap[path]; !ok { @@ -81,7 +87,7 @@ func (mm *MetricManger) SetSeperatedMetric(path string) { promhttp.HandlerOpts{}, ) - http.Handle("/"+path+"/metrics", componentAHandler) + mm.engine.GET("/"+path+"/metrics", gin.WrapH(componentAHandler)) } } @@ -104,27 +110,52 @@ type GaugeVecSet struct { Value float64 } +// RefreshMetric refresh metric +func RefreshMetric(metricVec *prometheus.GaugeVec, gaugeVecSetList []*GaugeVecSet) { + metricVec.Reset() + + metricMap := make(map[string]string) + for _, gaugeVecSet := range gaugeVecSetList { + if _, ok := metricMap[strings.Join(gaugeVecSet.Labels, "-")]; ok { + metricVec.WithLabelValues(gaugeVecSet.Labels...).Add(gaugeVecSet.Value) + } else { + metricMap[strings.Join(gaugeVecSet.Labels, "-")] = 
strings.Join(gaugeVecSet.Labels, "-") + metricVec.WithLabelValues(gaugeVecSet.Labels...).Set(gaugeVecSet.Value) + } + } +} + // SetMetric xxx func SetMetric(metricVec *prometheus.GaugeVec, gaugeVecSetList []*GaugeVecSet) { - metricVec.Reset() + //metricVec.Reset() + metricMap := make(map[string]string) for _, gaugeVecSet := range gaugeVecSetList { - metricVec.WithLabelValues(gaugeVecSet.Labels...).Set(gaugeVecSet.Value) - - defer func() { - if r := recover(); r != nil { - klog.Errorf("SetMetric failed: %s, stack: %v\n", r, string(debug.Stack())) - // klog.Errorf("SetMetric failed: %s", gaugeVecSet) - for _, gaugeVecSet1 := range gaugeVecSetList { - if gaugeVecSet1 != nil { - klog.Errorf("SetMetric failed: %s", strings.Join(gaugeVecSet1.Labels, ";")) - break - } - } + if _, ok := metricMap[strings.Join(gaugeVecSet.Labels, "-")]; ok { + metricVec.WithLabelValues(gaugeVecSet.Labels...).Add(gaugeVecSet.Value) + } else { + metricMap[strings.Join(gaugeVecSet.Labels, "-")] = strings.Join(gaugeVecSet.Labels, "-") + metricVec.WithLabelValues(gaugeVecSet.Labels...).Set(gaugeVecSet.Value) + } + } +} + +// DeleteMetric xxx +func DeleteMetric(metricVec *prometheus.GaugeVec, gaugeVecSetList []*GaugeVecSet) { + wg := sync.WaitGroup{} + for _, gaugeVecSet := range gaugeVecSetList { + wg.Add(1) + go func(gaugeVecSet *GaugeVecSet) { + result := metricVec.DeleteLabelValues(gaugeVecSet.Labels...) 
+ if !result { + klog.Error("delete metric result failed: ", result, metricVec, gaugeVecSet.Labels) } - }() + + wg.Done() + }(gaugeVecSet) } + wg.Wait() } // SetCommonDurationMetric xxx @@ -144,3 +175,8 @@ func (mm *MetricManger) RunPrometheusMetricsServer() { }() klog.Infof("run prometheus server ok") } + +// GetHttpHandler xxx +func (mm *MetricManger) GetHttpHandler() http.Handler { + return promhttp.Handler() +} diff --git a/bcs-services/bcs-cluster-reporter/internal/metric_manager/metric_test.go b/bcs-services/bcs-cluster-reporter/internal/metricmanager/metric_test.go similarity index 86% rename from bcs-services/bcs-cluster-reporter/internal/metric_manager/metric_test.go rename to bcs-services/bcs-cluster-reporter/internal/metricmanager/metric_test.go index 6c44e936fb..050c2dee9e 100644 --- a/bcs-services/bcs-cluster-reporter/internal/metric_manager/metric_test.go +++ b/bcs-services/bcs-cluster-reporter/internal/metricmanager/metric_test.go @@ -10,7 +10,8 @@ * limitations under the License. */ -package metric_manager +// Package metricmanager xxx +package metricmanager import ( "fmt" @@ -34,10 +35,6 @@ func TestSendMessage(t *testing.T) { }, }) - vec = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_availability", - Help: "cluster_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}) MM.SetSeperatedMetric("123") _, ok := MM.registryMap["123"] diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/capacitycheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/capacitycheck.go new file mode 100644 index 0000000000..3de5f2832c --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/capacitycheck.go @@ -0,0 +1,464 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package capacitycheck xxx +package capacitycheck + +import ( + "bytes" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + pluginmanager "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "math" + "net" + "strconv" + "strings" + "sync" + "time" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/prometheus/client_golang/prometheus" + io_prometheus_client "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/rest" + "k8s.io/klog" + "k8s.io/kubectl/pkg/rawhttp" +) + +// Plugin xxx +type Plugin struct { + opt *Options + testYamlString string + pluginmanager.ClusterPlugin +} + +var ( + clusterGVSMap = make(map[string][]*metricmanager.GaugeVecSet) + clusterCapacity = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: ClusterCapacityMetricName, + Help: ClusterCapacityMetricName, + }, []string{"target", "bk_biz_id", "item", "status"}) + + routinePool = util.NewRoutinePool(20) +) + +func init() { + metricmanager.Register(clusterCapacity) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if 
err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.Result = make(map[string]pluginmanager.CheckResult) + p.ReadyMap = make(map[string]bool) + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + if p.opt.Synchronization { + pluginmanager.Pm.Lock() + } + go p.Check() + } else { + klog.V(3).Infof("the former clustercheck didn't over, skip in this loop") + } + select { + case result := <-p.StopChan: + klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + return nil +} + +// Name return plugin name +func (p *Plugin) Name() string { + return pluginName +} + +// Check cluster capacity and store result +func (p *Plugin) Check() { + start := time.Now() + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + if p.opt.Synchronization { + pluginmanager.Pm.UnLock() + } + p.CheckLock.Unlock() + metricmanager.SetCommonDurationMetric([]string{"clustercheck", "", "", ""}, start) + }() + + clusterConfigs := pluginmanager.Pm.GetConfig().ClusterConfigs + + wg := sync.WaitGroup{} + + // 遍历所有集群 + for _, cluster := range clusterConfigs { + wg.Add(1) + routinePool.Add(1) + + pluginmanager.Pm.Ready("systemappcheck,nodecheck", cluster.ClusterID) + go func(cluster *pluginmanager.ClusterConfig) { + cluster.Lock() + klog.Infof("start capacitycheck for %s", cluster.ClusterID) + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = false + p.WriteLock.Unlock() + + defer func() { + wg.Done() + 
routinePool.Done() + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = true + p.WriteLock.Unlock() + cluster.Unlock() + klog.Infof("end capacitycheck for %s", cluster.ClusterID) + }() + + clusterResult := pluginmanager.CheckResult{ + Items: make([]pluginmanager.CheckItem, 0, 0), + InfoItemList: make([]pluginmanager.InfoItem, 0, 0), + } + + defer func() { + p.WriteLock.Lock() + for key, val := range clusterResult.Items { + val.ItemName = StringMap[val.ItemName] + val.ItemTarget = StringMap[val.ItemTarget] + val.Status = StringMap[val.Status] + clusterResult.Items[key] = val + } + + p.Result[cluster.ClusterID] = clusterResult + p.WriteLock.Unlock() + }() + + // 获取apiserver的metric指标 + metricFamilies, err := GetApiserverMetric(cluster.Config) + if err != nil { + klog.Errorf("get cluster %s metric failed: %s", cluster.ClusterID, err.Error()) + return + } + + // 获取集群 各类resource的object数量 + resourceList := []string{"pods", "nodes", "services", "configmaps"} + for _, resource := range resourceList { + objectNum, err := GetObjectNum(metricFamilies, resource) + if err != nil { + klog.Errorf("get cluster %s %s failed: %s", cluster.ClusterID, resource, err.Error()) + continue + } + + clusterResult.InfoItemList = append(clusterResult.InfoItemList, pluginmanager.InfoItem{ + ItemName: fmt.Sprintf("%s num", resource), + Result: objectNum, + }) + + switch resource { + case "services": + cluster.ServiceNum = objectNum + case "nodes": + cluster.NodeNum = objectNum + } + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, fmt.Sprintf("%s num", resource), NormalStatus}, + Value: float64(objectNum), + }) + } + + // 获取集群的service网段信息 + if _, _, err := net.ParseCIDR(cluster.ServiceCidr); err == nil { + mask, _ := strconv.Atoi(strings.Split(cluster.ServiceCidr, "/")[1]) + cluster.ServiceMaxNum = 1 << uint(32-mask) + + clusterResult.InfoItemList = append(clusterResult.InfoItemList, pluginmanager.InfoItem{ + ItemName: 
ServiceMaxNumCheckItemType, + Result: cluster.ServiceMaxNum, + }) + + clusterResult.InfoItemList = append(clusterResult.InfoItemList, pluginmanager.InfoItem{ + ItemName: ServiceCidrCheckItemType, + Result: cluster.ServiceCidr, + }) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, ServiceNumCheckItemType, NormalStatus}, + Value: float64(cluster.ServiceMaxNum - cluster.ServiceNum), + }) + } else { + klog.Errorf("%s parse service cidr %s failed: %s", cluster.ClusterID, cluster.ServiceCidr, err.Error()) + } + + // 获取集群还可以分配的cidr数量 + if len(cluster.Cidr) > 0 { + totalIPNum := 0 + nodePodNum := math.Pow(2, float64(32-cluster.MaskSize)) + for _, cidr := range cluster.Cidr { + mask, _ := strconv.Atoi(strings.Split(cidr, "/")[1]) + ipNum := math.Pow(2, float64(32-mask)) + totalIPNum = totalIPNum + int(ipNum) + } + totalIPNum = totalIPNum - cluster.ServiceMaxNum + + maxNodeNum := totalIPNum / int(nodePodNum) + + // cidr允许的最大节点数 + clusterResult.InfoItemList = append(clusterResult.InfoItemList, pluginmanager.InfoItem{ + ItemName: NodeCidrNumCheckItemType, + Result: maxNodeNum, + }) + + clusterResult.InfoItemList = append(clusterResult.InfoItemList, pluginmanager.InfoItem{ + ItemName: NodeMaxPodCheckItemType, + Result: nodePodNum, + }) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, NodeCidrNumCheckItemType, NormalStatus}, + Value: float64(maxNodeNum), + }) + } + + // master检查 + checkItemList, masterGVSList := GetMasterCheckResult(cluster) + gvsList = append(gvsList, masterGVSList...) + clusterResult.Items = append(clusterResult.Items, checkItemList...) + + // node检查 + infoItemList, masterGVSList := GetNodeCheckResult(cluster) + gvsList = append(gvsList, masterGVSList...) + clusterResult.InfoItemList = append(clusterResult.InfoItemList, infoItemList...) 
+ + p.WriteLock.Lock() + metricmanager.DeleteMetric(clusterCapacity, clusterGVSMap[cluster.ClusterID]) + metricmanager.SetMetric(clusterCapacity, gvsList) + clusterGVSMap[cluster.ClusterID] = gvsList + p.WriteLock.Unlock() + }(cluster) + + } + wg.Wait() + + p.WriteLock.Lock() + for clusterID, _ := range p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + p.ReadyMap[clusterID] = false + klog.Infof("delete cluster %s", clusterID) + } + } + + // 从readymap和指标中清理已删除集群 + for clusterID, _ := range p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + delete(p.ReadyMap, clusterID) + metricmanager.DeleteMetric(clusterCapacity, clusterGVSMap[clusterID]) + delete(clusterGVSMap, clusterID) + klog.Infof("delete cluster %s", clusterID) + } + } + p.WriteLock.Unlock() +} + +// GetApiserverMetric Get metric from apiserver api +func GetApiserverMetric(config *rest.Config) (map[string]*io_prometheus_client.MetricFamily, error) { + metricServer := config.Host + "/metrics" + out := &bytes.Buffer{} + o := genericclioptions.IOStreams{In: &bytes.Buffer{}, Out: out, ErrOut: &bytes.Buffer{}} + + config.GroupVersion = &schema.GroupVersion{Group: "mygroup", Version: "v1"} + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.WithoutConversionCodecFactory{CodecFactory: serializer.NewCodecFactory(runtime.NewScheme())} + c, err := rest.RESTClientFor(config) + if err != nil { + return nil, err + } + err = rawhttp.RawGet(c, o, metricServer) + if err != nil { + return nil, err + } else { + var parser expfmt.TextParser + metricFamilies, err := parser.TextToMetricFamilies(out) + if err != nil { + return nil, err + } + + return metricFamilies, nil + } + +} + +// GetObjectNum Get object number by metric data +func GetObjectNum(metricFamilies map[string]*io_prometheus_client.MetricFamily, resource string) (int, error) { + for key, metricFamily := range metricFamilies { + if key == "etcd_object_counts" || key == "apiserver_storage_objects" { + for _, metric := range 
metricFamily.Metric { + for _, label := range metric.Label { + if *label.Name == "resource" && *label.Value == resource { + return int(*metric.Gauge.Value), nil + } + } + } + } + } + + return 0, fmt.Errorf("not found %s metric", resource) + +} + +// GetMasterCheckResult Check master info and generate check result +func GetMasterCheckResult(clusterInfo *pluginmanager.ClusterConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet) { + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: MasterTarget, + Status: NormalStatus, + Normal: len(clusterInfo.Master) >= 3, + Level: pluginmanager.WARNLevel, + Tags: nil, + } + if len(clusterInfo.Master) < 3 { + checkItem.Status = MasterNumHAErrorStatus + checkItem.Detail = fmt.Sprintf(StringMap[MasterNumDetailFormart], len(clusterInfo.Master)) + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterInfo.ClusterID, clusterInfo.BusinessID, MasterNumItemType, MasterNumHAErrorStatus}, + Value: float64(len(clusterInfo.Master))}) + } else { + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterInfo.ClusterID, clusterInfo.BusinessID, MasterNumItemType, NormalStatus}, + Value: float64(len(clusterInfo.Master))}) + } + + checkItemList = append(checkItemList, checkItem) + + zoneNum := make(map[string]int) + for _, master := range clusterInfo.Master { + if clusterInfo.NodeInfo[master].Zone == "" { + continue + } + zoneNum[clusterInfo.NodeInfo[master].Zone] = zoneNum[clusterInfo.NodeInfo[master].Zone] + 1 + } + + for zone, num := range zoneNum { + if (num)*2 >= len(clusterInfo.Master) { + checkItem = pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: MasterTarget, + Status: MasterZoneHAErrorStatus, + Normal: (num)*2 < len(clusterInfo.Master), + Detail: fmt.Sprintf(StringMap[MasterZoneDetailFormart], zone, num), + Level: 
pluginmanager.WARNLevel, + Tags: nil, + } + checkItemList = append(checkItemList, checkItem) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterInfo.ClusterID, clusterInfo.BusinessID, MasterZoneItemType, MasterZoneHAErrorStatus}, + Value: 1}) + break + } + } + + return checkItemList, gvsList +} + +// GetNodeCheckResult Check node info and generate check result +func GetNodeCheckResult(clusterInfo *pluginmanager.ClusterConfig) ([]pluginmanager.InfoItem, []*metricmanager.GaugeVecSet) { + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + infoItemList := make([]pluginmanager.InfoItem, 0, 0) + + zoneNum := make(map[string]int) + for _, nodeInfo := range clusterInfo.NodeInfo { + if nodeInfo.Zone == "" { + continue + } + zoneNum[nodeInfo.Zone] = zoneNum[nodeInfo.Zone] + 1 + } + + for zone, num := range zoneNum { + infoItemList = append(infoItemList, pluginmanager.InfoItem{ + ItemName: pluginName, + Labels: map[string]string{"zone": zone}, + Result: fmt.Sprintf("%d", num), + }) + } + + typeNum := make(map[string]int) + for _, nodeInfo := range clusterInfo.NodeInfo { + if nodeInfo.Type == "" { + continue + } + typeNum[nodeInfo.Type] = typeNum[nodeInfo.Type] + 1 + } + + for nodeType, num := range typeNum { + infoItemList = append(infoItemList, pluginmanager.InfoItem{ + ItemName: pluginName, + Labels: map[string]string{"type": nodeType}, + Result: fmt.Sprintf("%d", num), + }) + } + + return infoItemList, gvsList +} + +// Ready return true if cluster check is over +func (p *Plugin) Ready(clusterID string) bool { + p.WriteLock.Lock() + defer p.WriteLock.Unlock() + return p.ReadyMap[clusterID] +} + +// GetResult return check result by cluster ID +func (p *Plugin) GetResult(clusterID string) pluginmanager.CheckResult { + return p.Result[clusterID] +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/constant.go b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/constant.go new file mode 100644 index 
0000000000..cc0655c782 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/constant.go @@ -0,0 +1,87 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under, + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package capacitycheck xxx +package capacitycheck + +const ( + initContent = `interval: 600` + pluginName = "capacitycheck" + ClusterCapacityMetricName = "cluster_capacity" + + // status + NormalStatus = "ok" + + ServiceMaxNumCheckItemType = "service max num" + ServiceNumCheckItemType = "service available num" + ServiceCidrCheckItemType = "service cidr" + NodeCidrNumCheckItemType = "cidr max node num" + NodeMaxPodCheckItemType = "node max pod num" + ObjectNumItemType = "object num" + MasterTarget = "master node" + MasterNumItemType = "master num" + MasterZoneItemType = "master zone" + NodeTypeItemType = "node type" + MasterNumDetailFormart = "master num is %d, less than 3" + MasterZoneDetailFormart = "master num in zone %s is %d, larger than half" + MasterCheckItemType = "master" + MasterZoneHAErrorStatus = "zone ha error" + MasterNumHAErrorStatus = "num ha error" +) + +var ( + ChinenseStringMap = map[string]string{ + pluginName: "集群容量检查", + ServiceMaxNumCheckItemType: "service最大数检查", + ServiceCidrCheckItemType: "service cidr", + MasterCheckItemType: MasterCheckItemType, + MasterTarget: MasterTarget, + 
MasterNumItemType: "master节点数量", + MasterZoneItemType: "master节点可用区分布", + MasterNumDetailFormart: "master数为%d,少于3个", + MasterZoneDetailFormart: "%s 的节点数为%d, 超过半数", + ObjectNumItemType: "对象实例数", + NormalStatus: "正常", + NodeTypeItemType: "节点规格", + NodeCidrNumCheckItemType: NodeCidrNumCheckItemType, + ServiceNumCheckItemType: ServiceNumCheckItemType, + NodeMaxPodCheckItemType: NodeMaxPodCheckItemType, + MasterZoneHAErrorStatus: MasterZoneHAErrorStatus, + MasterNumHAErrorStatus: MasterNumHAErrorStatus, + } + + EnglishStringMap = map[string]string{ + pluginName: pluginName, + + // status + NormalStatus: NormalStatus, + + ServiceMaxNumCheckItemType: ServiceMaxNumCheckItemType, + ServiceCidrCheckItemType: ServiceCidrCheckItemType, + MasterCheckItemType: MasterCheckItemType, + MasterTarget: MasterTarget, + MasterNumItemType: MasterNumItemType, + MasterZoneItemType: MasterZoneItemType, + MasterNumDetailFormart: MasterNumDetailFormart, + MasterZoneDetailFormart: MasterZoneDetailFormart, + ObjectNumItemType: ObjectNumItemType, + + NodeTypeItemType: NodeTypeItemType, + NodeCidrNumCheckItemType: NodeCidrNumCheckItemType, + ServiceNumCheckItemType: ServiceNumCheckItemType, + NodeMaxPodCheckItemType: NodeMaxPodCheckItemType, + MasterZoneHAErrorStatus: MasterZoneHAErrorStatus, + MasterNumHAErrorStatus: MasterNumHAErrorStatus, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/init.go similarity index 76% rename from bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/init.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/init.go index be105708ed..b8a0a31818 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/init.go @@ -10,17 +10,11 @@ * limitations under the License. 
*/ -package logrecorder +// Package capacitycheck xxx +package capacitycheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/options.go similarity index 95% rename from bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/options.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/options.go index 9dd293df79..5726812163 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/capacitycheck/options.go @@ -10,7 +10,8 @@ * limitations under the License. */ -package netcheck +// Package capacitycheck xxx +package capacitycheck // Options bcs log options type Options struct { @@ -20,6 +21,5 @@ type Options struct { // Validate validate options func (o *Options) Validate() error { - return nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercache.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercache.go new file mode 100644 index 0000000000..c6b3adaaff --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercache.go @@ -0,0 +1,22 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package plugin xxx +package plugin + +// NodeInfo node info struct +type NodeInfo struct { + //CheckItemMap map[string][]pluginmanager.CheckItem + Region string + Zone string + Type string +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck.go index da2ccd18df..b0a8147672 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck.go @@ -1,73 +1,96 @@ /* - * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Tencent is pleased to support the open source community by making Blueking Container Service available. * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. * Licensed under the MIT License (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, + * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language governing permissions and * limitations under the License. 
*/ -// Package clustercheck +// Package clustercheck xxx package clustercheck import ( "context" + "encoding/json" "fmt" - "k8s.io/client-go/kubernetes" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + pluginmanager "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "math/rand" "os" "runtime/debug" "strings" "sync" "time" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - "github.com/prometheus/client_golang/prometheus" + "gopkg.in/yaml.v2" + batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/util/json" - "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/dynamic" - "k8s.io/client-go/rest" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/restmapper" "k8s.io/klog" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" ) -// Plugin xxx +// Plugin define cluster check plugin type Plugin struct { - stopChan chan int opt *Options - checkLock sync.Mutex testYamlString string + pluginmanager.ClusterPlugin } +// define plugin vars var ( - clusterAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_availability", - Help: "cluster_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}) + clusterAvailabilityLabels = []string{"target", "bk_biz_id", "status"} + clusterCheckDurationLabels = []string{"target", "bk_biz_id", "step"} + clusterApiserverCertificateExpirationLabels = []string{"target", "bk_biz_id", "type"} + clusterVersionLabels = 
[]string{"target", "bk_biz_id", "version"} + clusterAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: ClusterAvailabilityCheckMetricName, + Help: ClusterAvailabilityCheckMetricName, + }, clusterAvailabilityLabels) + clusterVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: ClusterVersionMetricName, + Help: ClusterVersionMetricName, + }, clusterVersionLabels) clusterCheckDuration = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_check_duration_seconds", - Help: "cluster_check_duration_seconds, 1 means OK", - }, []string{"target", "target_biz", "step"}) - clusterAvailabilityMap = make(map[string][]*prometheus.GaugeVec) - unstructuredObj = &unstructured.Unstructured{} - clusterAvailabilityMapLock sync.Mutex + Name: ClusterCheckDurationMeticName, + Help: ClusterCheckDurationMeticName, + }, clusterCheckDurationLabels) + + clusterApiserverCertificateExpiration = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: ClusterApiserverCertExpirationMetricName, + Help: ClusterApiserverCertExpirationMetricName, + }, clusterApiserverCertificateExpirationLabels) + unstructuredObj = &unstructured.Unstructured{} + + clusterCheckGaugeVecSetList = make(map[string][]*metricmanager.GaugeVecSet) + clusterCheckDurationGaugeVecSetList = make(map[string][]*metricmanager.GaugeVecSet) + certificateExpirationGVSList = make(map[string][]*metricmanager.GaugeVecSet) + clusterVersionGaugeVecSetList = make(map[string][]*metricmanager.GaugeVecSet) ) func init() { - metric_manager.Register(clusterAvailability) - metric_manager.Register(clusterCheckDuration) + // register plugin metric + metricmanager.Register(clusterAvailability) + metricmanager.Register(clusterCheckDuration) + metricmanager.Register(clusterApiserverCertificateExpiration) + metricmanager.Register(clusterVersion) } // Setup xxx -func (p *Plugin) Setup(configFilePath string) error { +func (p *Plugin) Setup(configFilePath string, runMode string) error { configFileBytes, err := 
os.ReadFile(configFilePath) if err != nil { return fmt.Errorf("read clustercheck config file %s failed, err %s", configFilePath, err.Error()) @@ -83,62 +106,62 @@ func (p *Plugin) Setup(configFilePath string) error { return err } - unstructuredObj.SetUnstructuredContent(p.opt.TestYaml) + p.Result = make(map[string]pluginmanager.CheckResult) + p.ReadyMap = make(map[string]bool) + + decode := scheme.Codecs.UniversalDeserializer().Decode + yamlData, _ := yaml.Marshal(p.opt.TestYaml) + obj, gKV, _ := decode(yamlData, nil, nil) // 给测试workload添加标签 - kind := unstructuredObj.GetKind() - switch strings.ToLower(kind) { - case "replicaset": - fallthrough - case "deployment": - fallthrough - case "job": - objectMap := unstructuredObj.Object - updateNestedMap(objectMap, []string{"spec", "template", "metadata", "labels", "bcs-cluster-reporter"}, - "bcs-cluster-reporter") - //updateNestedMap(objectMap, []string{"spec", "selector", "matchLabels", "bcs-cluster-reporter"}, - // "bcs-cluster-reporter") - //updateNestedMap(objectMap, []string{"spec", "selector", "matchLabels", "bcs-cluster-reporter"}, - // "bcs-cluster-reporter") - //klog.Info(objectMap) - unstructuredObj.SetUnstructuredContent(objectMap) + switch gKV.Kind { + case "Job": + job := obj.(*batchv1.Job) + job.Spec.Template.ObjectMeta.Labels["bcs-cluster-reporter"] = "bcs-cluster-reporter" + job.ObjectMeta.Namespace = p.opt.Namespace + objMap, _ := runtime.DefaultUnstructuredConverter.ToUnstructured(job) + unstructuredObj.SetUnstructuredContent(objMap) default: klog.Fatalf("workload %s type is %s, not supported, please use job, deployment, replicaset", - unstructuredObj.GetName(), kind) + unstructuredObj.GetName(), gKV.Kind) } interval := p.opt.Interval if interval == 0 { - interval = 60 + interval = 300 } - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - if p.opt.Synchronization { - plugin_manager.Pm.Lock() + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if 
p.CheckLock.TryLock() { + p.CheckLock.Unlock() + if p.opt.Synchronization { + pluginmanager.Pm.Lock() + } + go p.Check() + } else { + klog.Infof("the former clustercheck didn't over, skip in this loop") + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue } - go p.Check() - } else { - klog.V(3).Infof("the former clustercheck didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case <-time.After(time.Duration(interval) * time.Second): - continue } - } - }() + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } return nil } // Stop xxx func (p *Plugin) Stop() error { - p.stopChan <- 1 + p.StopChan <- 1 klog.Infof("plugin %s stopped", p.Name()) return nil } @@ -150,255 +173,338 @@ func (p *Plugin) Name() string { func int64Ptr(i int64) *int64 { return &i } -// Check xxx +// Check check for cluster apiserver cert, control panael availability and store result func (p *Plugin) Check() { start := time.Now() - p.checkLock.Lock() + p.CheckLock.Lock() klog.Infof("start %s", p.Name()) defer func() { klog.Infof("end %s", p.Name()) if p.opt.Synchronization { - plugin_manager.Pm.UnLock() + pluginmanager.Pm.UnLock() } - p.checkLock.Unlock() - metric_manager.SetCommonDurationMetric([]string{"clustercheck", "", "", ""}, start) + p.CheckLock.Unlock() + metricmanager.SetCommonDurationMetric([]string{"clustercheck", "", "", ""}, start) }() - // 根据internal来调整超时时间的长短 - interval := p.opt.Interval - - namespace := unstructuredObj.GetNamespace() - if namespace == "" { - namespace = "default" - } - + clusterConfigs := pluginmanager.Pm.GetConfig().ClusterConfigs wg := sync.WaitGroup{} - clusterChecktGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - clusterCheckDurationGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 
0) - for _, cluster := range plugin_manager.Pm.GetConfig().ClusterConfigs { + + // 遍历所有集群 + for _, cluster := range clusterConfigs { wg.Add(1) - config := cluster.Config - clusterId := cluster.ClusterID - clusterbiz := cluster.BusinessID - plugin_manager.Pm.Add() - go func() { - status := "error" + pluginmanager.Pm.Add() + + go func(cluster *pluginmanager.ClusterConfig) { + defer func() { + wg.Done() + pluginmanager.Pm.Done() + }() + + clusterId := cluster.ClusterID + clusterResult := pluginmanager.CheckResult{ + Items: make([]pluginmanager.CheckItem, 0, 0), + InfoItemList: make([]pluginmanager.InfoItem, 0, 0), + } + + klog.Infof("start clustercheck for %s", clusterId) + + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = false + p.WriteLock.Unlock() + + loopClusterChecktGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopClusterCheckDurationGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopCertificateExpirationGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopClusterVersionGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + defer func() { if r := recover(); r != nil { klog.Errorf("%s clustercheck failed: %s, stack: %v\n", clusterId, r, string(debug.Stack())) - clientSet1, _ := k8s.GetClientsetByConfig(config) var responseContentType string - body, _ := clientSet1.RESTClient().Get(). + body, _ := cluster.ClientSet.RESTClient().Get(). AbsPath("/apis"). SetHeader("Accept", "application/json"). Do(context.TODO()). ContentType(&responseContentType). 
Raw() - klog.V(3).Infof("Try get apis for %s: %s", clusterId, string(body)) - status = "panic" + klog.Infof("Try get apis for %s: %s", clusterId, string(body)) } - - plugin_manager.Pm.Done() - wg.Done() }() - klog.Infof("start clustercheck for %s", clusterId) - workloadToScheduleCost, workloadToPodCost, worloadToRunningCost, err := testClusterByCreateUnstructuredObj( - unstructuredObj, config, &status, interval, clusterId) - - clusterChecktGaugeVecSetList = append(clusterChecktGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, status}, Value: float64(1)}) - clusterCheckDurationGaugeVecSetList = append(clusterCheckDurationGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, "create_pod"}, - Value: float64(workloadToPodCost) / 1000000000}) - clusterCheckDurationGaugeVecSetList = append(clusterCheckDurationGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, "schedule_pod"}, - Value: float64(workloadToScheduleCost) / 1000000000}) - clusterCheckDurationGaugeVecSetList = append(clusterCheckDurationGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, "start_pod"}, - Value: float64(worloadToRunningCost) / 1000000000}) - - // 集群单独路径的指标配置 - clusterAvailabilityMapLock.Lock() - if _, ok := clusterAvailabilityMap[clusterId]; !ok { - clusterAvailabilityMap[clusterId] = make([]*prometheus.GaugeVec, 0, 0) - clusterAvailabilityMap[clusterId] = append(clusterAvailabilityMap[clusterId], - prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_availability", - Help: "cluster_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}), - prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_check_duration_seconds", - Help: "cluster_check_duration_seconds", - }, []string{"target", "target_biz", "step"})) - - for index, _ := range clusterAvailabilityMap[clusterId] { - metric_manager.MM.RegisterSeperatedMetric(clusterId, 
clusterAvailabilityMap[clusterId][index]) - } + // check apiserver cert + getServerCertWG := sync.WaitGroup{} + if len(cluster.Master) > 0 { + getServerCertWG.Add(1) + go func() { + defer func() { + getServerCertWG.Done() + }() + checkItemList, gvsList, err := getApiserverCert(cluster) + if err != nil { + klog.Errorf("%s check apiserver cert expiration failed: %s", cluster.ClusterID, err.Error()) + } else { + clusterResult.Items = append(clusterResult.Items, checkItemList...) + loopCertificateExpirationGVSList = append(loopCertificateExpirationGVSList, gvsList...) + } + }() } - clusterAvailabilityMapLock.Unlock() - - metric_manager.SetMetric(clusterAvailabilityMap[clusterId][0], []*metric_manager.GaugeVecSet{ - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, status}, Value: float64(1)}, - }) - metric_manager.SetMetric(clusterAvailabilityMap[clusterId][1], []*metric_manager.GaugeVecSet{ - &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, "create_pod"}, Value: float64(workloadToPodCost) / 1000000000}, - &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, "schedule_pod"}, Value: float64(workloadToScheduleCost) / 1000000000}, - &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, "start_pod"}, Value: float64(worloadToRunningCost) / 1000000000}, - }) + // blackbox check + checkItemList, infoItemList, gvs, gvsList, err := testClusterByCreateUnstructuredObj(unstructuredObj, cluster) if err != nil { klog.Errorf("%s testClusterByCreateUnstructuredObj failed: %s", clusterId, err.Error()) } + + clusterResult.Items = append(clusterResult.Items, checkItemList...) + clusterResult.InfoItemList = append(clusterResult.InfoItemList, infoItemList...) + loopClusterChecktGaugeVecSetList = append(loopClusterChecktGaugeVecSetList, gvs) + loopClusterCheckDurationGaugeVecSetList = append(loopClusterCheckDurationGaugeVecSetList, gvsList...) 
+ loopClusterVersionGaugeVecSetList = append(loopClusterVersionGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterId, cluster.BusinessID, cluster.Version}, + Value: 1, + }) + klog.Infof("end clustercheck for %s", clusterId) - klog.V(6).Infof("%s clustercheck result %s", clusterId, status) - }() + + p.WriteLock.Lock() + + // delete former metric + if _, ok := clusterCheckGaugeVecSetList[cluster.ClusterID]; !ok { + clusterCheckGaugeVecSetList[clusterId] = make([]*metricmanager.GaugeVecSet, 0, 0) + clusterCheckDurationGaugeVecSetList[clusterId] = make([]*metricmanager.GaugeVecSet, 0, 0) + certificateExpirationGVSList[clusterId] = make([]*metricmanager.GaugeVecSet, 0, 0) + clusterVersionGaugeVecSetList[clusterId] = make([]*metricmanager.GaugeVecSet, 0, 0) + } else { + metricmanager.DeleteMetric(clusterAvailability, clusterCheckGaugeVecSetList[clusterId]) + metricmanager.DeleteMetric(clusterCheckDuration, clusterCheckDurationGaugeVecSetList[clusterId]) + metricmanager.DeleteMetric(clusterApiserverCertificateExpiration, certificateExpirationGVSList[clusterId]) + metricmanager.DeleteMetric(clusterVersion, clusterVersionGaugeVecSetList[clusterId]) + } + + // refresh new metric data + for key, val := range clusterResult.Items { + val.ItemName = StringMap[val.ItemName] + val.ItemTarget = StringMap[val.ItemTarget] + val.Status = StringMap[val.Status] + clusterResult.Items[key] = val + } + p.Result[clusterId] = clusterResult + + clusterCheckGaugeVecSetList[clusterId] = loopClusterChecktGaugeVecSetList + clusterCheckDurationGaugeVecSetList[clusterId] = loopClusterCheckDurationGaugeVecSetList + certificateExpirationGVSList[clusterId] = loopCertificateExpirationGVSList + clusterVersionGaugeVecSetList[clusterId] = loopClusterVersionGaugeVecSetList + + metricmanager.SetMetric(clusterAvailability, clusterCheckGaugeVecSetList[clusterId]) + metricmanager.SetMetric(clusterCheckDuration, clusterCheckDurationGaugeVecSetList[clusterId]) + 
metricmanager.SetMetric(clusterApiserverCertificateExpiration, certificateExpirationGVSList[clusterId]) + metricmanager.SetMetric(clusterVersion, clusterVersionGaugeVecSetList[clusterId]) + + p.ReadyMap[clusterId] = true + p.WriteLock.Unlock() + getServerCertWG.Wait() + }(cluster) } + wg.Wait() - metric_manager.SetMetric(clusterAvailability, clusterChecktGaugeVecSetList) - metric_manager.SetMetric(clusterCheckDuration, clusterCheckDurationGaugeVecSetList) - - // 去掉已经不存在的集群的指标 - for clusterId, _ := range clusterAvailabilityMap { - deleted := true - for _, cluster := range plugin_manager.Pm.GetConfig().ClusterConfigs { - if clusterId == cluster.ClusterID { - deleted = false - break - } + + // clean deleted cluster data + for clusterID, _ := range p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + p.ReadyMap[clusterID] = false + klog.Infof("delete cluster %s", clusterID) } - if deleted { - delete(clusterAvailabilityMap, clusterId) + } + + // 从readymap和指标中清理已删除集群 + for clusterID, _ := range p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + delete(p.ReadyMap, clusterID) + metricmanager.DeleteMetric(clusterAvailability, clusterCheckGaugeVecSetList[clusterID]) + metricmanager.DeleteMetric(clusterCheckDuration, clusterCheckDurationGaugeVecSetList[clusterID]) + metricmanager.DeleteMetric(clusterApiserverCertificateExpiration, certificateExpirationGVSList[clusterID]) + metricmanager.DeleteMetric(clusterVersion, clusterVersionGaugeVecSetList[clusterID]) + delete(clusterCheckGaugeVecSetList, clusterID) + delete(clusterCheckDurationGaugeVecSetList, clusterID) + delete(certificateExpirationGVSList, clusterID) + delete(clusterVersionGaugeVecSetList, clusterID) + delete(p.Result, clusterID) + klog.Infof("delete cluster %s", clusterID) } } + } -func testClusterByCreateUnstructuredObj(unstructuredObj *unstructured.Unstructured, config *rest.Config, status *string, - interval int, clusterID string) ( - workloadToScheduleCost, workloadToPodCost, 
worloadToRunningCost time.Duration, err error) { - workloadToScheduleCost = time.Duration(0) - workloadToPodCost = time.Duration(0) - worloadToRunningCost = time.Duration(0) - ctx, _ := context.WithTimeout(context.Background(), time.Duration(interval/6)*time.Second) - namespace := unstructuredObj.GetNamespace() - if namespace == "" { - namespace = "default" +// getApiserverCert get apsierver cert expiration through api port +func getApiserverCert(clusterConfig *pluginmanager.ClusterConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + // 检查自签证书 + index := rand.Intn(len(clusterConfig.Master)) + expiration, err := util.GetServerCert("apiserver-loopback-client", clusterConfig.Master[index], "60002") + if err != nil { + expiration, err = util.GetServerCert("apiserver-loopback-client", clusterConfig.Master[index], "6443") + if err != nil { + klog.Errorf("%s check apiserver self-signed cert expiration failed: %s", clusterConfig.ClusterID, err.Error()) + return checkItemList, gvsList, err + } } - clientSet, err := k8s.GetClientsetByConfig(config) - if err != nil { - *status = "配置失败" - err = fmt.Errorf("GetClientsetByConfig failed: %s", err.Error()) - return + checkItem := pluginmanager.CheckItem{ + ItemName: ClusterApiserverCertExpirationCheckItem, + ItemTarget: ApiserverTarget, + Normal: true, + Status: NormalStatus, + Detail: fmt.Sprintf(StringMap[AboutToExpireDetail], clusterConfig.ClusterID, expiration.Sub(time.Now())/time.Second), + Level: pluginmanager.WARNLevel, + Tags: nil, } - if clientSet == nil { - *status = "配置失败" - err = fmt.Errorf("Get clientSet failed %s", err.Error()) - return + // 时间在1周以内则返回异常 + if expiration.Sub(time.Now()) < 604800*time.Second { + checkItem.Normal = false + checkItem.Status = AboutToExpireStatus + checkItem.SetDetail(fmt.Sprintf(StringMap[AboutToExpireDetail], clusterConfig.ClusterID, 
expiration.Sub(time.Now())/time.Second)) } - clusterUnstructuredObj := unstructuredObj.DeepCopy() - clusterGVK := clusterUnstructuredObj.GroupVersionKind() + checkItemList = append(checkItemList, checkItem) - // 获取k8s集群version,确认集群是否可访问 - _, err = k8s.GetK8sVersion(clientSet) - if err != nil { - *status = "访问集群失败" // 访问集群失败 - err = fmt.Errorf("GetK8sVersion failed: %s", err.Error()) - return - } + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, "self signed"}, + Value: float64(expiration.Sub(time.Now()) / time.Second), + }) - // 确认test yaml的命名空间是否存在 - _, err = clientSet.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + // 检查apiserver证书 + expiration, err = util.GetServerCert(clusterConfig.Master[index], clusterConfig.Master[index], "60002") if err != nil { - *status = "命名空间不存在" - _, createError := clientSet.CoreV1().Namespaces().Create(ctx, &v1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: namespace, - }, - }, metav1.CreateOptions{}) - if createError != nil { - klog.Errorf("create namespace failed: %s", createError.Error()) + expiration, err = util.GetServerCert(clusterConfig.Master[index], clusterConfig.Master[index], "6443") + if err != nil { + klog.Errorf("%s check apiserver cert expiration failed: %s", clusterConfig.ClusterID, err.Error()) + return checkItemList, gvsList, err } - err = fmt.Errorf("get target resource namespace failed: %s", err.Error()) - return } - discoveryInterface := clientSet.Discovery().WithLegacy() - if discoveryInterface == nil { - *status = "配置失败" - err = fmt.Errorf("Get discoveryInterface failed %s", err.Error()) - return + checkItem = pluginmanager.CheckItem{ + ItemName: ClusterApiserverCertExpirationCheckItem, + ItemTarget: ApiserverTarget, + Normal: true, + Status: NormalStatus, + Detail: fmt.Sprintf(StringMap[AboutToExpireDetail], clusterConfig.ClusterID, expiration.Sub(time.Now())/time.Second), + Level: pluginmanager.WARNLevel, + Tags: 
nil, } - // discoveryInterface.ServerGroupsAndResources() - groupResource, err := restmapper.GetAPIGroupResources(discoveryInterface) - if err != nil { - *status = "配置失败" - err = fmt.Errorf("GetAPIGroupResources failed %s", err.Error()) + + // 时间在1周以内则返回异常 + if expiration.Sub(time.Now()) < 604800*time.Second { + checkItem.Normal = false + checkItem.Status = AboutToExpireStatus + checkItem.SetDetail(fmt.Sprintf(StringMap[AboutToExpireDetail], clusterConfig.ClusterID, expiration.Sub(time.Now())/time.Second)) } - mapper := restmapper.NewDiscoveryRESTMapper(groupResource) - mapping, err := mapper.RESTMapping(clusterGVK.GroupKind(), clusterGVK.Version) + + checkItemList = append(checkItemList, checkItem) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, "apiserver"}, + Value: float64(expiration.Sub(time.Now()) / time.Second), + }) + + return checkItemList, gvsList, err +} + +// testClusterByCreateUnstructuredObj test cluster by create a unstructuredObj workload and watch what will happen +func testClusterByCreateUnstructuredObj(unstructuredObj *unstructured.Unstructured, clusterConfig *pluginmanager.ClusterConfig, +) ([]pluginmanager.CheckItem, []pluginmanager.InfoItem, *metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + infoItemList := make([]pluginmanager.InfoItem, 0, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + var gvs *metricmanager.GaugeVecSet + + var workloadToScheduleCost time.Duration + var workloadToPodCost time.Duration + var worloadToRunningCost time.Duration + + clusterUnstructuredObj := unstructuredObj.DeepCopy() + // 随机workload名,避免重复导致的问题 + clusterUnstructuredObj.SetName("bcs-blackbox-job-" + time.Now().Format("150405")) + var status string + + checkItem := pluginmanager.CheckItem{ + ItemName: ClusterAvailabilityItem, + ItemTarget: ApiserverTarget, + Detail: "", + Level: 
pluginmanager.WARNLevel, + Tags: nil, + } + + // 获取k8s集群version,确认集群是否可访问 + version, err := k8s.GetK8sVersion(clusterConfig.ClientSet) if err != nil { - *status = "配置失败" - err = fmt.Errorf("RESTMapping failed %s", err.Error()) - return + // 如果失败则直接返回 + status = AvailabilityClusterFailStatus + gvs = &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, status}, + Value: 1, + } + checkItem.Status = status + checkItem.Normal = NormalStatus == status + if !checkItem.Normal { + checkItem.Detail = fmt.Sprintf(StringMap[ClusterAvailabilityDetail], clusterConfig.ClusterID, status) + } + err = fmt.Errorf("GetK8sVersion failed: %s", err.Error()) + return checkItemList, infoItemList, gvs, gvsList, err } - dynamicConfig, err := dynamic.NewForConfig(config) + // store version info + infoItem := pluginmanager.InfoItem{ + ItemName: ClusterVersionItem, + Result: version, + } + clusterConfig.Version = version + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, version}, + Value: 1, + }) + infoItemList = append(infoItemList, infoItem) + + // 获取dynamic resource interface + dri, err := getResourceInterface(clusterConfig, unstructuredObj, &status) if err != nil { - *status = "配置失败" - err = fmt.Errorf("%s Create dynamicConfig %s", err.Error()) - return + gvs = &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, status}, + Value: 1, + } + checkItem.Status = status + checkItem.Normal = NormalStatus == status + if !checkItem.Normal { + checkItem.Detail = fmt.Sprintf(StringMap[ClusterAvailabilityDetail], clusterConfig.ClusterID, status) + } + return checkItemList, infoItemList, gvs, gvsList, err } - clusterUnstructuredObj.SetName("bcs-blackbox-job-" + time.Now().Format("150405")) - dri := dynamicConfig.Resource(mapping.Resource).Namespace(namespace) defer func() { + // 清理残留resource go func() { backgroundDeletion := 
metav1.DeletePropagationBackground - podList, err := clientSet.CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{ + // 获取所有的匹配job,避免历史残留 + jobList, listErr := clusterConfig.ClientSet.BatchV1().Jobs(namespace).List(context.Background(), metav1.ListOptions{ ResourceVersion: "0", LabelSelector: "bcs-cluster-reporter=bcs-cluster-reporter", }) - if err != nil { - klog.Errorf("%s get pod failed %s", clusterID, err.Error()) - } else { - for _, pod := range podList.Items { - if pod.Status.Phase != "Completed" && pod.Status.Phase != "Succeeded" && time.Now(). - Unix()-pod.CreationTimestamp.Unix() < 600 { - continue - } - - klog.Infof("%s start to delete targetPod %s", clusterID, pod.Name) - err = clientSet.CoreV1().Pods(namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{ - GracePeriodSeconds: int64Ptr(0), - }) - if err != nil { - klog.Errorf("%s delete pod failed %s", clusterID, err.Error()) - } - } - } - - jobList, err := clientSet.BatchV1().Jobs(namespace).List(context.Background(), metav1.ListOptions{ - ResourceVersion: "0", - LabelSelector: "bcs-cluster-reporter=bcs-cluster-reporter", - }) - if err != nil { - klog.Errorf("%s get job failed %s", clusterID, err.Error()) + if listErr != nil { + klog.Errorf("%s get job failed %s", clusterConfig.ClusterID, listErr.Error()) } else { + // 避免过快删除导致异常事件 + time.Sleep(5 * time.Second) for _, job := range jobList.Items { - klog.Infof("%s start to delete job %s", clusterID, job.Name) - err = clientSet.BatchV1().Jobs(namespace).Delete(context.Background(), job.Name, metav1.DeleteOptions{ + klog.Infof("%s start to delete job %s", clusterConfig.ClusterID, job.Name) + err = clusterConfig.ClientSet.BatchV1().Jobs(namespace).Delete(context.Background(), job.Name, metav1.DeleteOptions{ GracePeriodSeconds: int64Ptr(5), PropagationPolicy: &backgroundDeletion, }) if err != nil { - klog.Errorf("%s delete job failed %s", clusterID, err.Error()) + klog.Errorf("%s delete job failed %s", 
clusterConfig.ClusterID, err.Error()) } } } @@ -406,24 +512,110 @@ func testClusterByCreateUnstructuredObj(unstructuredObj *unstructured.Unstructur }() }() - *status, workloadToScheduleCost, workloadToPodCost, worloadToRunningCost, err = - getWatchStatus(clientSet, clusterUnstructuredObj, dri, namespace, interval, clusterID, context.Background()) + // watch并判断创建clusterUnstructuredObj中发生的各种情况 + status, workloadToScheduleCost, workloadToPodCost, worloadToRunningCost, err = + getWatchStatus(clusterConfig.ClientSet, clusterUnstructuredObj, dri, namespace, clusterConfig.ClusterID) + + infoItemList = append(infoItemList, + pluginmanager.InfoItem{ItemName: worloadToRunningItem, Result: worloadToRunningCost}, + pluginmanager.InfoItem{ItemName: workloadToScheduleItem, Result: workloadToScheduleCost}, + pluginmanager.InfoItem{ItemName: workloadToPodItem, Result: workloadToPodCost}) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, workloadToPod}, + Value: float64(workloadToPodCost) / float64(time.Second)}, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, workloadToSchedule}, + Value: float64(workloadToScheduleCost) / float64(time.Second), + }, &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, worloadToRunning}, + Value: float64(worloadToRunningCost) / float64(time.Second), + }) + + // 集群可用性检测结果单独一个指标 + gvs = &metricmanager.GaugeVecSet{ + Labels: []string{clusterConfig.ClusterID, clusterConfig.BusinessID, status}, + Value: 1, + } + checkItem.Status = status + checkItem.Normal = NormalStatus == status + if !checkItem.Normal { + checkItem.Detail = fmt.Sprintf(StringMap[ClusterAvailabilityDetail], clusterConfig.ClusterID, status) + } + return checkItemList, infoItemList, gvs, gvsList, err +} + +// getResourceInterface get dynamic resource interface +func getResourceInterface(clusterConfig 
*pluginmanager.ClusterConfig, clusterUnstructuredObj *unstructured.Unstructured, status *string) (dynamic.ResourceInterface, error) { + clusterGVK := clusterUnstructuredObj.GroupVersionKind() + ctx := util.GetCtx(10 * time.Second) + + _, err := clusterConfig.ClientSet.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{ResourceVersion: "0"}) + if err != nil { + *status = AvailabilityNamespaceFailStatus + _, createError := clusterConfig.ClientSet.CoreV1().Namespaces().Create(ctx, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + ResourceVersion: "0", + }, + }, metav1.CreateOptions{}) + if createError != nil { + klog.Errorf("%s create namespace failed: %s", clusterConfig.ClusterID, createError.Error()) + } + err = fmt.Errorf("get target resource namespace failed: %s", err.Error()) + return nil, err + } + + discoveryInterface := clusterConfig.ClientSet.Discovery().WithLegacy() + if discoveryInterface == nil { + *status = AvailabilityConfigFailStatus + return nil, fmt.Errorf("Get discoveryInterface failed %s", err.Error()) + } + // discoveryInterface.ServerGroupsAndResources() + groupResource, err := restmapper.GetAPIGroupResources(discoveryInterface) + if err != nil { + *status = AvailabilityConfigFailStatus + return nil, fmt.Errorf("GetAPIGroupResources failed %s", err.Error()) + } + mapper := restmapper.NewDiscoveryRESTMapper(groupResource) + mapping, err := mapper.RESTMapping(clusterGVK.GroupKind(), clusterGVK.Version) + if err != nil { + *status = AvailabilityConfigFailStatus + return nil, fmt.Errorf("RESTMapping failed %s", err.Error()) + } + + dynamicConfig, err := dynamic.NewForConfig(clusterConfig.Config) + if err != nil { + *status = AvailabilityConfigFailStatus + return nil, fmt.Errorf("%s Create dynamicConfig %s", clusterConfig.ClusterID, err.Error()) + } - return + dri := dynamicConfig.Resource(mapping.Resource).Namespace(namespace) + return dri, nil } +// getWatchStatus get pod status of the workload, and return it. 
func getWatchStatus(clientSet *kubernetes.Clientset, clusterUnstructuredObj *unstructured.Unstructured, - dri dynamic.ResourceInterface, namespace string, interval int, clusterID string, ctx context.Context) (status string, + dri dynamic.ResourceInterface, namespace string, clusterID string) (status string, workloadToScheduleCost, workloadToPodCost, worloadToRunningCost time.Duration, err error) { - // TimeoutSeconds: int64Ptr(500) + startTime := time.Now() + + ctx := util.GetCtx(30 * time.Second) + // 测试集群的timeout时间缩短到10s + if strings.Contains(clusterID, "BCS-K8S-2") { + ctx = util.GetCtx(10 * time.Second) + } + + defer func() { + klog.Infof("%s getWatchStatus duration %.2f s", clusterID, float64(time.Now().Sub(startTime)/time.Second)) + }() + // 启动watch,观察对应label的pod的所有事件 watchInterface, err := clientSet.CoreV1().Pods(namespace).Watch(ctx, metav1.ListOptions{ResourceVersion: "0", - LabelSelector: "bcs-cluster-reporter=bcs-cluster-reporter", TimeoutSeconds: int64Ptr(int64(interval / 6))}) + LabelSelector: "bcs-cluster-reporter=bcs-cluster-reporter", TimeoutSeconds: int64Ptr(30)}) if err != nil { - status = "watch失败" - err = fmt.Errorf("%s watch failed %s", clusterID, err.Error()) + status = AvailabilityWatchErrorStatus + err = fmt.Errorf("%s start watch failed %s", clusterID, err.Error()) return } - watchStartTime := time.Now() defer func() { go func() { @@ -433,95 +625,88 @@ func getWatchStatus(clientSet *kubernetes.Clientset, clusterUnstructuredObj *uns }() }() + // 记录发起创建workload的时间 + createStartTime := time.Now() + + // 创建workload testObj, err := dri.Create(ctx, clusterUnstructuredObj, metav1.CreateOptions{}) if err != nil { - klog.Errorf("Create failed %s", err.Error()) + klog.Errorf("%s Create failed %s", clusterID, err.Error()) if strings.Contains(err.Error(), "already exists") { time.Sleep(5 * time.Second) - status = "workload已存在" + status = AvailabilityWorkloadExistStatus } else { - status = "创建workload失败" + status = AvailabilityCreateWorkloadErrorStatus 
} return } - createStartTime := testObj.GetCreationTimestamp().Time + // 校验testObj创建时间,以检测apiserver时间是否有偏差 + createTS := testObj.GetCreationTimestamp() + if createStartTime.Sub(createTS.Local()) > time.Second*5 || createStartTime.Sub(createTS.Local()) < 0-time.Second*5 { + status = AvailabilityTimeOffsetStatus + return + } createPodFlag := false - for { select { + // 等待watch返回 case e, ok := <-watchInterface.ResultChan(): if !ok { + // watch异常结束 + klog.Errorf("%s watch failed", clusterID) watchInterface.Stop() - if status != "ok" { - status = "watch关闭" - klog.Infof("%s restart watch", clusterID) - // may miss some events here - watchInterface, err = clientSet.CoreV1().Pods(namespace).Watch(ctx, metav1.ListOptions{ResourceVersion: "0", - LabelSelector: "bcs-cluster-reporter=bcs-cluster-reporter", TimeoutSeconds: int64Ptr(int64(interval / 6))}) - if err != nil { - status = "watch失败" - err = fmt.Errorf("%s watch failed %s", clusterID, err.Error()) - return - } - } + status = AvailabilityWatchErrorStatus + err = fmt.Errorf("%s watch failed %s", clusterID, err.Error()) + return } else if pod, ok := e.Object.(*v1.Pod); ok { + // 获取到对应pod的事件 if !createPodFlag { workloadToPodCost = pod.CreationTimestamp.Sub(createStartTime) createPodFlag = true } - if strings.Contains(pod.Name, clusterUnstructuredObj.GetName()) && createStartTime.Unix() <= pod.CreationTimestamp.Unix() { + // 判断pod是否已经成功调度 + if strings.Contains(pod.Name, clusterUnstructuredObj.GetName()) { if pod.Spec.NodeName != "" { - status = "ok" - - // pod调度成功耗时 - klog.V(6).Infof("cluster schedule pod successful") - + status = NormalStatus + // pod调度成功耗时,调度成功直接返回 + klog.Infof("cluster %s schedule pod successful", clusterID) workloadToScheduleCost, worloadToRunningCost = getPodLifeCycleTimePoint(pod, createStartTime) - } else { - for _, condition := range pod.Status.Conditions { - if strings.Contains(condition.Message, "nodes are available") { - status = "无可用节点" - return - } + return + } + + // 判断是否为调度失败的事件 + for _, 
condition := range pod.Status.Conditions { + if strings.Contains(condition.Message, "nodes are available") { + status = AvailabilityNoNodeErrorStatus + return } } } + } else { + klog.Errorf(clusterID, e) } - case <-time.After(time.Duration(interval/4) * time.Second): - if status != "ok" { - if !createPodFlag { - status = "创建pod超时" - klog.Errorf("create pod timeout") - } else { - status = "调度超时" - klog.Errorf("watch timeout") - } - } - return - } - - if time.Since(watchStartTime).Seconds() > float64(interval/6) { - if status != "ok" { - if !createPodFlag { - status = "创建pod超时" - klog.Errorf("timeout waiting pod created %s %f s", status, time.Since(watchStartTime).Seconds()) - } else { - klog.Errorf("timeout waiting pod scheduled %s %f s", status, time.Since(watchStartTime).Seconds()) - status = "调度超时" - } + case <-ctx.Done(): + // 时间到期时判断当前已获得的pod状态并返回 + if !createPodFlag { + status = AvailabilityCreatePodTimeoutStatus + klog.Errorf("%s create pod timeout", clusterID) + } else { + status = AvailabilitySchedulePodTimeoutStatus + klog.Errorf("%s wait scheduled watch event timeout", clusterID) } return } } } -func getPodLifeCycleTimePoint(pod *v1.Pod, createStartTime time.Time) (workloadToScheduleCost, worloadToRunningCost time.Duration) { - workloadToScheduleCost = 0 - worloadToRunningCost = 0 +// getPodLifeCycleTimePoint get the time costed of every stag after the workload is created +func getPodLifeCycleTimePoint(pod *v1.Pod, createStartTime time.Time) (time.Duration, time.Duration) { + var workloadToScheduleCost, worloadToRunningCost time.Duration for _, condition := range pod.Status.Conditions { + // 获取pod调度的时间 if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionTrue { if workloadToScheduleCost == 0 { workloadToScheduleCost = condition.LastTransitionTime.Sub(createStartTime) @@ -534,31 +719,17 @@ func getPodLifeCycleTimePoint(pod *v1.Pod, createStartTime time.Time) (workloadT } } - if pod.Status.Phase == "Running" { - if worloadToRunningCost == 
0 { - worloadToRunningCost = time.Since(createStartTime) - } - } else if pod.Status.Phase == "Completed" || pod.Status.Phase == "Succeeded" { - if worloadToRunningCost == 0 { - worloadToRunningCost = time.Since(createStartTime) - } - return - } - - return + return workloadToScheduleCost, worloadToRunningCost } -func updateNestedMap(obj map[string]interface{}, keyPath []string, newValue interface{}) { - if len(keyPath) == 1 { - obj[keyPath[0]] = newValue - return - } - - nestedObj, ok := obj[keyPath[0]].(map[string]interface{}) - if !ok { - nestedObj = make(map[string]interface{}) - obj[keyPath[0]] = nestedObj - } +// Ready return true if cluster check is over +func (p *Plugin) Ready(clusterID string) bool { + p.WriteLock.Lock() + defer p.WriteLock.Unlock() + return p.ReadyMap[clusterID] +} - updateNestedMap(nestedObj, keyPath[1:], newValue) +// GetResult return check result by cluster ID +func (p *Plugin) GetResult(clusterID string) pluginmanager.CheckResult { + return p.Result[clusterID] } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck_test.go new file mode 100644 index 0000000000..c130152c54 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/clustercheck_test.go @@ -0,0 +1,22 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under, + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package clustercheck xxx +package clustercheck + +import ( + "testing" +) + +func TestCheckServerCert(t *testing.T) { + +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/constant.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/constant.go new file mode 100644 index 0000000000..567e460a0f --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/constant.go @@ -0,0 +1,111 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under, + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package clustercheck xxx +package clustercheck + +const ( + pluginName = "clustercheck" + ClusterAvailabilityCheckMetricName = "cluster_availability" + ClusterVersionMetricName = "cluster_version" + ClusterCheckDurationMeticName = "cluster_check_duration_seconds" + ClusterApiserverCertExpirationMetricName = "cluster_apiserver_cert_expiration" + + // Status + NormalStatus = "ok" + AboutToExpireStatus = "expire_soon" + ClusterAvailabilityPanicStatus = "panic" + + AvailabilityConfigFailStatus = "config_fail" + AvailabilityClusterFailStatus = "connect_cluster_fail" + AvailabilityNamespaceFailStatus = "namespace_fail" + AvailabilityWorkloadExistStatus = "workload_exist" + AvailabilityCreateWorkloadErrorStatus = "create_workload_fail" + AvailabilityCreatePodTimeoutStatus = "create_pod_timeout" + AvailabilitySchedulePodTimeoutStatus = "schedule_pod_timeout" + AvailabilityTimeOffsetStatus = "time_offset" + AvailabilityWatchErrorStatus = "watch_fail" + AvailabilityNoNodeErrorStatus = "node_fail" + + // Detail + AboutToExpireDetail = "AboutToExpireDetail" + ClusterAvailabilityDetail = "ClusterAvailabilityDetail" + + ClusterVersionLabel = "ClusterVersionLabel" + ClusterVersionItem = "ClusterVersionItem" + ClusterApiserverCertExpirationCheckItem = "ClusterApiserverCertExpiration" + ClusterAvailabilityItem = "ClusterAvailabilityItem" + ClusterLatencyItem = "ClusterLatencyItem" + ApiserverTarget = "apiserver" + + workloadToPod = "create_pod" + workloadToSchedule = "schedule_pod" + worloadToRunning = "start_pod" + + workloadToPodItem = "workloadToPodTarget" + workloadToScheduleItem = "workloadToSchedule" + worloadToRunningItem = "worloadToRunning" +) + +var ( + ChinenseStringMap = map[string]string{ + // status + NormalStatus: "正常", + ClusterAvailabilityPanicStatus: ClusterAvailabilityPanicStatus, + AvailabilityWorkloadExistStatus: "workload已存在", + AvailabilityCreateWorkloadErrorStatus: "创建workload失败", + AvailabilityTimeOffsetStatus: "apiserver时间偏移", + 
AvailabilityWatchErrorStatus: "watch失败", + + pluginName: "集群控制面检查", + AboutToExpireDetail: "%s Apiserver 的证书将在 %d 秒内过期", + ClusterAvailabilityDetail: "%s 的黑盒监控检测结果异常: %s", + ClusterVersionLabel: "集群版本", + ClusterVersionItem: "集群版本", + ClusterApiserverCertExpirationCheckItem: "apiserver证书过期时间", + ClusterAvailabilityItem: "集群黑盒监控", + ClusterLatencyItem: "黑盒监控延迟", + + workloadToPodItem: "创建pod", + workloadToScheduleItem: "调度pod", + worloadToRunningItem: "执行pod", + + ApiserverTarget: ApiserverTarget, + } + + EnglishStringMap = map[string]string{ + // status + NormalStatus: NormalStatus, + ClusterAvailabilityPanicStatus: ClusterAvailabilityPanicStatus, + AvailabilityWorkloadExistStatus: AvailabilityWorkloadExistStatus, + AvailabilityCreateWorkloadErrorStatus: AvailabilityCreateWorkloadErrorStatus, + AvailabilityTimeOffsetStatus: AvailabilityTimeOffsetStatus, + AvailabilityWatchErrorStatus: AvailabilityWatchErrorStatus, + + pluginName: pluginName, + AboutToExpireDetail: "%s Apiserver cert is about to expiration in %d seconds, ", + ClusterAvailabilityDetail: "%s blackbox check result is %s", + ClusterVersionLabel: "cluster version", + ClusterVersionItem: "cluster version", + ClusterApiserverCertExpirationCheckItem: "apiserver cert expiration", + ClusterAvailabilityItem: "cluster blackbox check", + ClusterLatencyItem: "blackbox check latency", + + workloadToPodItem: "create pod", + workloadToScheduleItem: "schedule pod", + worloadToRunningItem: "excute pod", + + ApiserverTarget: ApiserverTarget, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/init.go index 32f4a0ca9c..12e9b5bcb9 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/init.go @@ -10,17 +10,12 @@ * limitations under the License. 
*/ +// Package clustercheck xxx package clustercheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" +// 在集群中创建job探测集群是否正常运行 func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/options.go index 2144867240..23d96c4276 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck/options.go @@ -10,30 +10,72 @@ * limitations under the License. */ +// Package clustercheck xxx package clustercheck import ( - "fmt" + "gopkg.in/yaml.v2" + "k8s.io/klog/v2" +) + +const ( + namespace = "bkmonitor-operator" ) // Options bcs log options type Options struct { - Interval int `json:"interval" yaml:"interval"` - TestYaml map[string]interface{} `json:"testyaml" yaml:"testyaml"` - Synchronization bool `json:"synchronization" yaml:"synchronization"` + Interval int `json:"interval" yaml:"interval"` + TestYaml interface{} `json:"testyaml" yaml:"testyaml"` + Synchronization bool `json:"synchronization" yaml:"synchronization"` + Namespace string `json:"namespace" yaml:"namespace"` } // Validate validate options func (o *Options) Validate() error { - // if len(o.KubeMaster) == 0 { - // return fmt.Errorf("kube_master cannot be empty") - // } - // if len(o.Kubeconfig) == 0 { - // return fmt.Errorf("kubeconfig cannot be empty") - // } + if o.Namespace == "" { + o.Namespace = namespace + } if o.TestYaml == nil { - return fmt.Errorf("must set testyaml file") + yamlStr := ` +apiVersion: batch/v1 +kind: Job +metadata: + name: bcs-blackbox-job + namespace: bcs-blackbox-job +spec: + backoffLimit: 1 + 
template: + metadata: + labels: + test-yaml: test-yaml + spec: + automountServiceAccountToken: false + containers: + - image: hub.bktencent.com/library/hello-world:latest + imagePullPolicy: Always + name: blackbox + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 100m + memory: 100Mi + nodeSelector: + kubernetes.io/os: linux + kubernetes.io/arch: amd64 + restartPolicy: Never + tolerations: + - effect: NoSchedule + operator: Exists +` + o.TestYaml = make(map[string]interface{}) + err := yaml.Unmarshal([]byte(yamlStr), o.TestYaml) + if err != nil { + klog.Infof(err.Error()) + } } + return nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/configcheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/configcheck.go new file mode 100644 index 0000000000..aaddf3ad67 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/configcheck.go @@ -0,0 +1,62 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package plugin xxx +package plugin + +import ( + "fmt" + "strconv" + "strings" +) + +// FloatFlag xxx +type FloatFlag struct { + Name string + // ge, le, eq, none + CompareType string + Value float64 + Needed bool +} + +// CheckFlag check flags +func CheckFlag(flagList []string, floatFlag FloatFlag) string { + checked := false + for _, flagStr := range flagList { + if strings.HasPrefix(flagStr, floatFlag.Name) { + checked = true + strList := strings.Split(flagStr, "=") + if len(strList) == 2 { + value, err := strconv.ParseFloat(strList[1], 64) + if err != nil { + return fmt.Sprintf("%s value is %s, not float64", floatFlag.Name, strList[1]) + } + + if floatFlag.CompareType == "ge" { + if floatFlag.Value > value { + return fmt.Sprintf(StringMap[CheckFlagLeDetailFormat], floatFlag.Name, strList[1], floatFlag.Value) + } + } + + } else { + return fmt.Sprintf("%s value is %s, not float64", floatFlag.Name, flagStr) + } + + } + } + + if !checked && floatFlag.Needed { + return fmt.Sprintf(StringMap[CheckFlagNotSetDetailFormat], floatFlag.Name, floatFlag.Value) + } + + return "" +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/const.go new file mode 100644 index 0000000000..84623f87f3 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/const.go @@ -0,0 +1,44 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package plugin xxx +package plugin + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + +const ( + RISKLevel = "RISK" + WARNLevel = "WARN" + SERIOUSLevel = "SERIOUS" + + CheckFlagNotSetDetailFormat = "%s not found, recommand set as %.2f" + CheckFlagLeDetailFormat = "%s value is %s, recommand >= %.2f" +) + +var ( + LevelColor = map[string]util.Color{ + RISKLevel: {Red: 60, Green: 17, Blue: 14}, + WARNLevel: {Red: 98, Green: 85, Blue: 29}, + SERIOUSLevel: {Red: 98, Green: 29, Blue: 41}, + } + ChinenseStringMap = map[string]string{ + CheckFlagNotSetDetailFormat: "未找到参数%s, 推荐设置该参数为%.2f", + CheckFlagLeDetailFormat: "%s 的值为%s, 推荐设置为%.2f", + } + + EnglishStringMap = map[string]string{ + CheckFlagNotSetDetailFormat: CheckFlagNotSetDetailFormat, + CheckFlagLeDetailFormat: CheckFlagLeDetailFormat, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/dnscheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/dnscheck.go deleted file mode 100644 index 3846f486c5..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/dnscheck.go +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. 
See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package dnscheck -package dnscheck - -import ( - "context" - "fmt" - "net" - "os" - "runtime/debug" - "sync" - "time" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/json" - "k8s.io/apimachinery/pkg/util/yaml" - "k8s.io/client-go/kubernetes" - "k8s.io/klog" -) - -// Plugin xxx -type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex - clusterId string - clientSet *kubernetes.Clientset - businessID string - dnsLock sync.Mutex - cancel context.CancelFunc -} - -var ( - dnsAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "dns_availability", - Help: "dns_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}) - - dnsLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "dns_latency", - Help: "dns_latency", - Buckets: []float64{0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2}, - }, []string{"target", "target_biz"}) -) - -func init() { - metric_manager.Register(dnsAvailability) - metric_manager.Register(dnsLatency) -} - -// Setup xxx -func (p *Plugin) Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read dnscheck config file %s failed, err %s", configFilePath, err.Error()) - } - p.opt = &Options{} - if err = json.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode dnscheck config file %s failed, err %s", configFilePath, err.Error()) - } - } - - if err = p.opt.Validate(); err != nil { - return 
err - } - - p.stopChan = make(chan int) - interval := p.opt.Interval - if interval == 0 { - interval = 60 - } - - clusterConfig := plugin_manager.Pm.GetConfig().InClusterConfig - if clusterConfig.Config == nil { - klog.Fatalf("netcheck get incluster config failed, only can run as incluster mode") - } - p.clusterId = clusterConfig.ClusterID - p.businessID = clusterConfig.BusinessID - - p.clientSet, err = k8s.GetClientsetByConfig(clusterConfig.Config) - if err != nil { - klog.Fatalf("netcheck get incluster config failed, only can run as incluster mode") - } - - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - if p.opt.Synchronization { - plugin_manager.Pm.Lock() - } - go p.Check() - } else { - klog.V(3).Infof("the former dnscheck didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case <-time.After(time.Duration(interval) * time.Second): - continue - } - } - }() - - return nil -} - -// Stop xxx -func (p *Plugin) Stop() error { - p.checkLock.Lock() - p.stopChan <- 1 - klog.Infof("plugin %s stopped", p.Name()) - p.checkLock.Unlock() - p.cancel() - return nil -} - -// Name xxx -func (p *Plugin) Name() string { - return "dnscheck" -} - -// Check xxx -func (p *Plugin) Check() { - p.checkLock.Lock() - klog.Infof("start %s", p.Name()) - defer func() { - klog.Infof("end %s", p.Name()) - if p.opt.Synchronization { - plugin_manager.Pm.UnLock() - } - p.checkLock.Unlock() - }() - - defer func() { - if r := recover(); r != nil { - klog.Errorf("%s dnscheck failed: %s, stack: %v\n", p.clusterId, r, string(debug.Stack())) - } - }() - - if p.dnsLock.TryLock() { - p.dnsLock.Unlock() - ctx, cancel := context.WithCancel(context.Background()) - p.cancel = cancel - go p.checkDNSEndpoints(ctx, p.opt.CheckDomain) - } -} - -func (p *Plugin) checkDNSEndpoints(ctx context.Context, domainList []string) { - p.dnsLock.Lock() - defer func() { - p.dnsLock.Unlock() - }() 
- - for { - start := time.Now() - select { - case <-ctx.Done(): - klog.Infof("Stop checkDNSEndpoints") - break - - default: - status := "ok" - ep, err := p.clientSet.CoreV1().Endpoints("kube-system").Get(context.Background(), "kube-dns", v1.GetOptions{}) - if err != nil { - klog.Errorf(err.Error()) - status = "getepfailed" - } - - ipList := make([]string, 0, 0) - for _, subset := range ep.Subsets { - for _, address := range subset.Addresses { - ipList = append(ipList, address.IP) - } - } - - if len(ipList) > 0 { - for ip := 0; ip < len(ipList); ip++ { - r, err := createResolver(ipList[ip]) - if err != nil { - klog.Errorf(err.Error()) - status = "setresolverfailed" - } - - for _, domain := range domainList { - latency, err := dnsLookup(r, domain) - if err != nil { - klog.Errorf(err.Error()) - status = "resolvefailed" - } - - dnsLatency.WithLabelValues(p.clusterId, p.businessID).Observe(float64(latency) / float64(time.Second)) - } - } - } else { - status = "noepfound" - klog.Errorf("No endpoints available for service") - } - - dnsAvailability.WithLabelValues(p.clusterId, p.businessID, status).Set(1) - } - - // 最快1s执行一次 - if time.Since(start) < time.Second { - <-time.After(time.Second - time.Since(start)) - } - start = time.Now() - - } - -} - -func createResolver(ip string) (*net.Resolver, error) { - r := &net.Resolver{} - // if we're supplied a null string, return an error - if len(ip) < 1 { - return r, fmt.Errorf("Need a valid ip to create Resolver") - } - // attempt to create the resolver based on the string - r = &net.Resolver{ - PreferGo: true, - Dial: func(ctx context.Context, network, address2 string) (net.Conn, error) { - d := net.Dialer{ - Timeout: time.Millisecond * time.Duration(10000), - } - return d.DialContext(ctx, "udp", ip+":53") - }, - } - return r, nil -} - -func dnsLookup(r *net.Resolver, host string) (time.Duration, error) { - start := time.Now() - addrs, err := r.LookupHost(context.Background(), host) - if err != nil { - errorMessage := "DNS 
Status check determined that " + host + " is DOWN: " + err.Error() - return 0, fmt.Errorf(errorMessage) - } - - if len(addrs) == 0 { - return 0, fmt.Errorf("No host was found") - } - - return time.Since(start), nil -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/eventrecorder.go b/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/eventrecorder.go deleted file mode 100644 index 7af78f1526..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/eventrecorder.go +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package eventrecorder -package eventrecorder - -import ( - "context" - "fmt" - "os" - "runtime/debug" - "sync" - "time" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/prometheus/client_golang/prometheus" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/yaml" - "k8s.io/client-go/informers" - "k8s.io/client-go/tools/cache" - "k8s.io/klog" -) - -// Plugin xxx -type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex - // eventChecktGaugeVecSetMap map[string]map[string]map[string]*metric_manager.GaugeVecSet - eventChecktGaugeVecSetMap map[string]map[string]*metric_manager.GaugeVecSet -} - -var ( - eventRecord = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "event_record", - Help: "event_record, count of event", - }, []string{"target", "target_biz", "resource_kind", "event_reason"}) -) - -func init() { - metric_manager.Register(eventRecord) -} - -// Setup xxx -func (p *Plugin) Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read dnscheck config file %s failed, err %s", configFilePath, err.Error()) - } - p.opt = &Options{} - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode eventrecorder config file %s failed, err %s", configFilePath, err.Error()) - } - } - - if err = p.opt.Validate(); err != nil { - return err - } - - p.stopChan = make(chan int) - - // 开始获取数据 - // p.eventChecktGaugeVecSetMap = make(map[string]map[string]map[string]*metric_manager.GaugeVecSet) - p.eventChecktGaugeVecSetMap = make(map[string]map[string]*metric_manager.GaugeVecSet) - cluster := 
plugin_manager.Pm.GetConfig().InClusterConfig - if cluster.Config == nil { - klog.Fatalf("eventrecorder get incluster config failed") - } - - go func() { - recordEvent(p.eventChecktGaugeVecSetMap, cluster, p.stopChan) - }() - - interval := p.opt.Interval - if interval == 0 { - interval = 60 - } - - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - if p.opt.Synchronization { - plugin_manager.Pm.Lock() - } - go p.Check() - } else { - klog.V(3).Infof("the former eventrecorder didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case <-time.After(time.Duration(interval) * time.Second): - continue - } - } - }() - - return nil -} - -// Stop xxx -func (p *Plugin) Stop() error { - p.checkLock.Lock() - p.stopChan <- 1 - p.stopChan <- 1 - klog.Infof("plugin %s stopped", p.Name()) - p.checkLock.Unlock() - return nil -} - -// Name xxx -func (p *Plugin) Name() string { - return "eventrecorder" -} - -// Check xxx -func (p *Plugin) Check() { - start := time.Now() - p.checkLock.Lock() - klog.Infof("start %s", p.Name()) - defer func() { - klog.Infof("end %s", p.Name()) - if p.opt.Synchronization { - plugin_manager.Pm.UnLock() - } - p.checkLock.Unlock() - metric_manager.SetCommonDurationMetric([]string{"eventrecorder", "", "", ""}, start) - }() - metric_manager.SetMetric(eventRecord, flatten(p.eventChecktGaugeVecSetMap)) -} - -func flatten(data interface{}) []*metric_manager.GaugeVecSet { - result := make([]*metric_manager.GaugeVecSet, 0, 0) - - if m, ok := data.(map[string]map[string]*metric_manager.GaugeVecSet); ok { - for _, value := range m { - result = append(result, flatten(value)...) 
- } - } else { - for _, value := range data.(map[string]*metric_manager.GaugeVecSet) { - result = append(result, value) - } - } - - return result -} - -func recordEvent(eventChecktGaugeVecSetMap map[string]map[string]*metric_manager.GaugeVecSet, - cluster plugin_manager.ClusterConfig, stopChan <-chan int) { - defer func() { - if r := recover(); r != nil { - klog.Errorf("%s eventrecorder failed: %s, stack: %v\n", cluster.ClusterID, r, string(debug.Stack())) - clientSet1, _ := k8s.GetClientsetByConfig(cluster.Config) - var responseContentType string - body, _ := clientSet1.RESTClient().Get(). - AbsPath("/apis"). - SetHeader("Accept", "application/json"). - Do(context.TODO()). - ContentType(&responseContentType). - Raw() - klog.V(3).Infof("Try get apis for %s: %s", cluster.ClusterID, string(body)) - } - }() - - clientSet, err := k8s.GetClientsetByConfig(cluster.Config) - if err != nil { - klog.Fatalf("eventrecorder GetClientsetByConfig failed: %s", err.Error()) - } - - clientSet.CoreV1().Events(metav1.NamespaceAll).Watch(context.Background(), metav1.ListOptions{}) - factory := informers.NewSharedInformerFactoryWithOptions(clientSet, time.Second*60, - informers.WithTweakListOptions(func(options *metav1.ListOptions) { - options.ResourceVersion = "0" - })) - - eventInformer := factory.Core().V1().Events().Informer() - _, err = eventInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - if event, ok := obj.(*corev1.Event); ok { - // 只记录非Normal的event - if event.Type == "Normal" { - return - } - if eventChecktGaugeVecSetMap[event.Kind] == nil { - eventChecktGaugeVecSetMap[event.Kind] = make(map[string]*metric_manager.GaugeVecSet) - } - if eventChecktGaugeVecSetMap[event.Kind][event.Reason] == nil { - eventChecktGaugeVecSetMap[event.Kind][event.Reason] = &metric_manager.GaugeVecSet{Labels: []string{ - cluster.ClusterID, cluster.BusinessID, - event.InvolvedObject.Kind, event.Reason}, Value: float64(1)} - } else { - 
eventChecktGaugeVecSetMap[event.Kind][event.Reason].Value++ - } - - } else { - klog.Infof("unknown obj: %s", obj) - } - - }, - }) - if err != nil { - klog.Fatalf("eventrecorder AddEventHandler failed: %s", err.Error()) - } - - informerStopChan := make(<-chan struct{}) - factory.Start(informerStopChan) - if !cache.WaitForCacheSync(informerStopChan, eventInformer.HasSynced) { - klog.Fatalf("Timed out waiting for event caches to sync") - } - - <-stopChan -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/logrecorder.go b/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/logrecorder.go deleted file mode 100644 index 99665ec76d..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/logrecorder.go +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package logrecorder -package logrecorder - -import ( - "bufio" - "context" - "encoding/json" - "fmt" - "os" - "strconv" - "strings" - "sync" - "time" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/prometheus/client_golang/prometheus" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/yaml" - "k8s.io/klog" -) - -// Plugin xxx -type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex -} - -var ( - etcdTookTooLongMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "etcd_took_too_long", - Help: "etcd_took_too_long", - }, []string{"target", "target_biz", "request"}) - stopFlag = false -) - -func init() { - metric_manager.Register(etcdTookTooLongMetric) -} - -// Setup xxx -func (p *Plugin) Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read dnscheck config file %s failed, err %s", configFilePath, err.Error()) - } - p.opt = &Options{} - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode logrecorder config file %s failed, err %s", configFilePath, err.Error()) - } - } - - if err = p.opt.Validate(); err != nil { - return err - } - - p.stopChan = make(chan int) - - interval := p.opt.Interval - if interval == 0 { - interval = 60 - } - - cluster := plugin_manager.Pm.GetConfig().InClusterConfig - if cluster.Config == nil { - klog.Fatalf("eventrecorder get incluster config failed") - } - - getLogRecord(cluster) - - return nil -} - -// Stop xxx -func (p *Plugin) Stop() error { - p.checkLock.Lock() - // p.stopChan <- 1 - stopFlag = true - klog.Infof("plugin %s stopped", p.Name()) - 
p.checkLock.Unlock() - return nil -} - -// Name xxx -func (p *Plugin) Name() string { - return "logrecorder" -} - -var ( - // pod container log - logCacheLock sync.RWMutex - logCache map[string]map[string][]PodLog -) - -// PodLog xxx -type PodLog struct { - LogTime time.Time - Log string -} - -func getLogRecord(cluster plugin_manager.ClusterConfig) { - logCache = make(map[string]map[string][]PodLog) - clientSet, err := k8s.GetClientsetByConfig(cluster.Config) - if err != nil { - klog.Fatalf("logrecorder getLogMetric failed: %s", err.Error()) - } - - podList, err := clientSet.CoreV1().Pods("kube-system").List(context.Background(), v1.ListOptions{ - ResourceVersion: "0", - LabelSelector: "component=etcd", - }) - - if err != nil { - klog.Fatalf("logrecorder getLogMetric failed: %s", err.Error()) - } - - // 创建日志选项 - var sinceSec int64 = 300 - logOptions := &corev1.PodLogOptions{ - Container: "etcd", - Follow: true, - Timestamps: true, - SinceSeconds: &sinceSec, - } - - // 获取容器的标准输出日志 - for _, pod := range podList.Items { - // 初始化对应pod的日志缓存 - podName := pod.Name - containerName := "etcd" - _, ok := logCache[pod.Name] - if !ok { - logCache[pod.Name] = make(map[string][]PodLog) - logCache[pod.Name]["etcd"] = make([]PodLog, 0, 0) - } - - logsStream, err := clientSet.CoreV1().Pods("kube-system").GetLogs(pod.Name, logOptions).Stream(context.Background()) - if err != nil { - klog.Fatal(err.Error()) - } - scanner := bufio.NewScanner(logsStream) - go func(podName, containerName string) { - defer func() { - logsStream.Close() - }() - - for { - if scanner.Scan() { - line := scanner.Text() - CacheLog(podName, containerName, line) - } else { - if err = scanner.Err(); err != nil { - klog.Infof(err.Error()) - } - // 重建链接 - logsStream.Close() - logsStream, err = clientSet.CoreV1().Pods("kube-system").GetLogs(pod.Name, logOptions).Stream(context.Background()) - if err != nil { - klog.Fatalf(err.Error()) - } - scanner = bufio.NewScanner(logsStream) - } - - if stopFlag { - stopFlag = 
false - return - } - - } - }(podName, containerName) - } - - go func() { - for { - select { - case <-time.After(1 * time.Minute): - LogAnalysis(plugin_manager.Pm.GetConfig().InClusterConfig) - } - } - }() -} - -func cleanPodLogCache() { - for podName, podLogs := range logCache { - for containerName, logList := range podLogs { - newLogList := make([]PodLog, 0, 0) - for _, logItem := range logList { - if !logItem.LogTime.Before(time.Now().Add(-5 * time.Minute)) { - newLogList = append(newLogList, logItem) - } - } - logCacheLock.Lock() - logCache[podName][containerName] = newLogList - logCacheLock.Unlock() - } - } -} - -// CacheLog xxx -func CacheLog(podName, containerName, log string) { - if t, err := time.Parse("2006-01-02T15:04:05.999999999Z", strings.Split(log, " ")[0]); err == nil { - logCacheLock.Lock() - logCache[podName][containerName] = append(logCache[podName][containerName], - PodLog{ - LogTime: t, - Log: strings.SplitN(log, " ", 2)[1], - }) - logCacheLock.Unlock() - } else { - klog.Infof(err.Error()) - } -} - -// LogAnalysis xxx -func LogAnalysis(cluster plugin_manager.ClusterConfig) { - // 清理日志缓存 - cleanPodLogCache() - - logCacheLock.RLock() - defer func() { - logCacheLock.RUnlock() - }() - - etcdTookTooLongGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - - tookToLongList := make(map[string]int) - - for podName, podLogs := range logCache { - for containerName, logList := range podLogs { - if strings.Contains(podName, "etcd") && containerName == "etcd" { - for _, logItem := range logList { - logMap := make(map[string]interface{}) - err := json.Unmarshal([]byte(strings.Replace(logItem.Log, "-", "", -1)), &logMap) - if err != nil { - klog.Errorf("unmarshal failed: %s, %s", err.Error(), logItem.Log) - break - } - if strings.Contains(logMap["msg"].(string), "took too long") { - var requestPath string - for _, str := range strings.Split(logMap["request"].(string), " ") { - if strings.Contains(str, "key:") { - requestPath = strings.Replace(str, 
"key:", "", -1) - requestPath = strings.Replace(requestPath, "\"", "", -1) - break - } - - } - - var responseSize string - if strings.Contains(logMap["response"].(string), " ") { - responseSize = strings.Replace( - strings.Split(logMap["response"].(string), " ")[1], "size:", "", -1) - } else { - responseSize = strings.Replace(logMap["response"].(string), "size:", "", -1) - } - - size, err := strconv.Atoi(responseSize) - if err != nil { - klog.Errorf(err.Error()) - continue - } - - if _, ok := tookToLongList[requestPath]; !ok { - tookToLongList[requestPath] += size - } - } - - } - - } - } - } - - // top 5 - for request, size := range tookToLongList { - if len(etcdTookTooLongGaugeVecSetList) == 0 { - etcdTookTooLongGaugeVecSetList = append(etcdTookTooLongGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{cluster.ClusterID, cluster.BusinessID, request}, Value: float64(size)}) - } else { - for index := len(etcdTookTooLongGaugeVecSetList) - 1; index >= 0; index-- { - if float64(size) > etcdTookTooLongGaugeVecSetList[index].Value { - if index+1 < len(etcdTookTooLongGaugeVecSetList) { - etcdTookTooLongGaugeVecSetList = append(etcdTookTooLongGaugeVecSetList[:index+1], - append( - []*metric_manager.GaugeVecSet{&metric_manager.GaugeVecSet{Labels: []string{cluster.ClusterID, - cluster.BusinessID, request}, Value: float64(size)}}, - etcdTookTooLongGaugeVecSetList[index+1:]...)...) 
- } else { - etcdTookTooLongGaugeVecSetList = append(etcdTookTooLongGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{cluster.ClusterID, cluster.BusinessID, request}, - Value: float64(size)}) - } - break - } - } - } - } - if len(etcdTookTooLongGaugeVecSetList) < 5 { - metric_manager.SetMetric(etcdTookTooLongMetric, etcdTookTooLongGaugeVecSetList) - } else { - metric_manager.SetMetric(etcdTookTooLongMetric, - etcdTookTooLongGaugeVecSetList[len(etcdTookTooLongGaugeVecSetList)-5:]) - } -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/masterpodcheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/masterpodcheck.go deleted file mode 100644 index 66ce9a9280..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/masterpodcheck.go +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package masterpodcheck -package masterpodcheck - -import ( - "bytes" - "context" - "fmt" - "os" - "reflect" - "regexp" - "sort" - "strconv" - "strings" - "sync" - "time" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/dlclark/regexp2" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/yaml.v2" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/json" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/remotecommand" - "k8s.io/klog" -) - -var ( - masterPodNameList = []string{ - "kube-apiserver", - "kube-controller-manager", - "kube-scheduler", - "etcd", - "cloud-controller-manager", - } -) - -// Plugin xxx -type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex -} - -var ( - masterPodCheck = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "master_pod_check", - Help: "the result of master pod configuration consistency check operation, 1 means ok", - }, []string{"target", "target_biz", "status", "component", "detection_item"}) - masterPodCheckMap = make(map[string]*prometheus.GaugeVec) - masterPodMapLock sync.Mutex -) - -// MetricLabel xxx -type MetricLabel struct { - Target string - TargetBiz string - Status string - Component string - DetectionItem string -} - -// ToLabelList xxx -func (l *MetricLabel) ToLabelList() []string { - result := make([]string, 0, 0) - result = append(result, l.Target) - result = append(result, l.TargetBiz) - result = append(result, l.Status) - result = append(result, l.Component) - result = append(result, l.DetectionItem) - return result -} - -func init() { - metric_manager.Register(masterPodCheck) -} - -// Setup xxx -func (p *Plugin) 
Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read systemappcheck config file %s failed, err %s", configFilePath, err.Error()) - } - p.opt = &Options{} - if err = json.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode masterpodcheck config file %s failed, err %s", configFilePath, err.Error()) - } - } - if err = p.opt.Validate(); err != nil { - return err - } - - p.stopChan = make(chan int) - interval := p.opt.Interval - if interval == 0 { - interval = 60 - } - - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - plugin_manager.Pm.Lock() - go p.Check() - } else { - klog.V(3).Infof("the former masterpodcheck didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case <-time.After(time.Duration(interval) * time.Second): - continue - } - } - }() - - return nil -} - -// Stop xxx -func (p *Plugin) Stop() error { - p.stopChan <- 1 - klog.Infof("plugin %s stopped", p.Name()) - return nil -} - -// Name xxx -func (p *Plugin) Name() string { - return "masterpodcheck" -} - -// Check xxx -func (p *Plugin) Check() { - start := time.Now() - p.checkLock.Lock() - klog.Infof("start %s", p.Name()) - defer func() { - klog.Infof("end %s", p.Name()) - plugin_manager.Pm.UnLock() - p.checkLock.Unlock() - metric_manager.SetCommonDurationMetric([]string{"masterpodcheck", "", "", ""}, start) - }() - - wg := sync.WaitGroup{} - masterPodCheckGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - // check master pod - for _, cluster := range plugin_manager.Pm.GetConfig().ClusterConfigs { - wg.Add(1) - config := cluster.Config - clusterId := cluster.ClusterID - clusterbiz := cluster.BusinessID - plugin_manager.Pm.Add() - go func() { - defer func() { - klog.V(9).Infof("end masterpodcheck for 
%s", clusterId) - wg.Done() - plugin_manager.Pm.Done() - }() - klog.V(9).Infof("start masterpodcheck for %s", clusterId) - clientSet, err := k8s.GetClientsetByConfig(config) - if err != nil { - klog.Errorf("%s GetClientsetByClusterId failed: %s", clusterId, err.Error()) - return - } - - clusterVersion, err := k8s.GetK8sVersion(clientSet) - if err != nil { - klog.Errorf("%s GetK8sVersion failed: %s", clusterId, err.Error()) - } - - ALLPodList, err := k8s.GetPods(clientSet, "kube-system", v1.ListOptions{}, "") - if err != nil { - klog.Errorf("%s GetPods failed: %s", clusterId, err.Error()) - } - - // 筛选静态pod - // 去掉IP匹配规则"(([1-9]?[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])" - podList := make([]corev1.Pod, 0, 0) - for _, pod := range ALLPodList { - if strings.Contains(pod.Name, pod.Spec.NodeName) { - podList = append(podList, pod) - } - } - - clusterResult := make([]*metric_manager.GaugeVecSet, 0, 0) - for _, masterPod := range masterPodNameList { - var nodeLabelSelector string - - if clusterVersion == "" || podList == nil { - masterPodCheckGaugeVecSetList = append(masterPodCheckGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{clusterId, clusterbiz, "访问集群失败", masterPod, "all"}, Value: 1}) - continue - } - - if masterPod == "etcd" { - nodeLabelSelector = "kubernetes.io/node-role-etcd=true" - } else if masterPod == "cloud-controller-manager" { - nodeLabelSelector = "node-role.kubernetes.io/master=true" - } else { - nodeLabelSelector = "node-role.kubernetes.io/master" - } - - result := p.checkMasterPod(clientSet, podList, masterPod, nodeLabelSelector, clusterId, clusterbiz, config) - masterPodCheckGaugeVecSetList = append(masterPodCheckGaugeVecSetList, result...) - clusterResult = append(clusterResult, result...) 
- - } - - // 集群单独路径的指标配置 - masterPodMapLock.Lock() - if _, ok := masterPodCheckMap[clusterId]; !ok { - masterPodCheckMap[clusterId] = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "master_pod_check", - Help: "the result of master pod configuration consistency check operation, 1 means ok", - }, []string{"target", "target_biz", "status", "component", "detection_item"}) - metric_manager.MM.RegisterSeperatedMetric(clusterId, masterPodCheckMap[clusterId]) - } - metric_manager.SetMetric(masterPodCheckMap[clusterId], clusterResult) - masterPodMapLock.Unlock() - }() - } - - wg.Wait() - // reset metric value - klog.Infof("length is %d", len(masterPodCheckGaugeVecSetList)) - metric_manager.SetMetric(masterPodCheck, masterPodCheckGaugeVecSetList) -} - -func (p *Plugin) checkMasterPod(clientSet *kubernetes.Clientset, podList []corev1.Pod, podName string, - nodeLabelSelector string, clusterId string, clusterBiz string, config *rest.Config) []*metric_manager.GaugeVecSet { - result := make([]*metric_manager.GaugeVecSet, 0, 0) - - metricLabel := MetricLabel{ - Target: clusterId, - TargetBiz: clusterBiz, - Component: podName, - } - var err error - - // 获取本次要检查的master pod列表 - masterPodList := make([]corev1.Pod, 0, 0) - for _, pod := range podList { - if strings.Contains(pod.Name, podName) { - masterPodList = append(masterPodList, pod) - - // 检查pod label - metricLabel.DetectionItem = "pod_label_check" - if pod.Labels == nil || len(pod.Labels) == 0 { - metricLabel.Status = "notok" - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } - } - } - - // master pod的实例数检测 - metricLabel.DetectionItem = "实例数检测" - metricLabel.Status = p.checkPodNum(clientSet, nodeLabelSelector, masterPodList, podName) - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - - // pod如果只有一个则无需检查一致性 - if len(masterPodList) > 1 { - if podName == "kube-scheduler" { - metricLabel.DetectionItem = "配置文件一致性" - 
metricLabel.Status, err = p.checkSchedulerPolicy(clientSet, config, masterPodList) - if err != nil { - klog.Errorf("%s checkSchedulerPolicy failed: %s", clusterId, err.Error()) - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } else { - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } - } - metricLabel.DetectionItem = "配置一致性" - metricLabel.Status = p.checkStaticPodConsistency(masterPodList) - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } - - // 检查配置文件中的其他检测项 - for _, checkConfig := range p.opt.CheckConfigs { - if checkConfig.Name == podName { - metricLabel.DetectionItem = checkConfig.DetectionItem - for _, pod := range masterPodList { - checkResult, err := p.checkPodConfig(pod, checkConfig.ConfigPath, checkConfig.ConfigRegex) - if err != nil { - klog.Errorf("%s %s checkPodConfig failed: %s", clusterId, podName, err.Error()) - metricLabel.Status = "检测失败" - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } else if !checkResult { - metricLabel.Status = checkConfig.Status - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - break - } else if checkResult { - metricLabel.Status = "ok" - result = append(result, - &metric_manager.GaugeVecSet{Labels: metricLabel.ToLabelList(), Value: 1}) - } - } - } - } - - return result -} - -func (p *Plugin) checkPodNum(clientSet *kubernetes.Clientset, nodeLabelSelector string, masterPodList []corev1.Pod, - podName string) string { - masterPodNum := len(masterPodList) - - // ensure number of master node - if podName != "etcd" { - ctx := context.Background() - nodeList, err := clientSet.CoreV1().Nodes().List(ctx, v1.ListOptions{ - LabelSelector: nodeLabelSelector, - ResourceVersion: "0", - }) - if err != nil { - return "访问集群失败" - } - masterNum := len(nodeList.Items) - if 
masterNum == 0 { - return "无master节点" - } else if masterNum == 1 { - return "单master节点" - } - if masterNum != masterPodNum { - return "节点pod数量不等" - } - } - return "ok" -} - -func (p *Plugin) checkStaticPodConsistency(podList []corev1.Pod) string { - podSpecList := make(map[string]corev1.PodSpec) - argsList := make(map[string]map[string][]string) - for _, pod := range podList { - spec := pod.Spec - spec.NodeName = "" - for index, container := range spec.Containers { - if argsList[pod.Name] == nil { - argsList[pod.Name] = make(map[string][]string) - } - - sort.Strings(container.Args) - argsList[pod.Name][container.Name] = container.Args - spec.Containers[index].Args = nil - } - podSpecList[pod.Name] = spec - } - - // 对比容器配置以外的spec是否一致 - var sampleSpec corev1.PodSpec - var sampleName string - for podName, spec := range podSpecList { - if reflect.DeepEqual(sampleSpec, corev1.PodSpec{}) { - sampleSpec = spec - sampleName = podName - } else { - if !reflect.DeepEqual(sampleSpec, spec) { - klog.V(9).Infof("%s: %s not equal %s: %s", podName, spec.String(), sampleName, sampleSpec.String()) - return "配置不一致" - } - } - } - - // 对比容器命令行参数是否一致 - var samplePodName string - for podName, _ := range argsList { - if samplePodName == "" { - samplePodName = podName - break - } - } - if samplePodName == "" { - return "无有效pod" - } - - for podName, containers := range argsList { - if podName == samplePodName { - continue - } - - for containerName, args := range containers { - if sampleArgs, ok := argsList[samplePodName][containerName]; !ok { - klog.Infof("pod %s doesn't have container %s", samplePodName, containerName) - return "配置不一致" - } else { - err := checkArguments(args, sampleArgs) - if err != nil { - klog.Infof("pod %s container %s doesn't equal pod %s : %s", - samplePodName, containerName, podName, err.Error()) - return "配置不一致" - } - } - } - } - - return "ok" -} - -func checkArguments(argList1 []string, argList2 []string) error { - if len(argList1) != len(argList2) { - return 
fmt.Errorf("length not equal") - } else { - for index, arg1 := range argList1 { - arg2 := argList2[index] - if arg1 != arg2 { - // exclude ip address - re, _ := regexp.Compile( - "(([1-9]?[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])") - arg1WithoutIp := re.ReplaceAllString(arg1, "") - arg2WithoutIp := re.ReplaceAllString(arg2, "") - if arg1WithoutIp != arg2WithoutIp { - return fmt.Errorf("arg %s not equal", arg1WithoutIp) - } - } - } - } - - return nil -} - -func (p *Plugin) checkSchedulerPolicy(clientSet *kubernetes.Clientset, restConfig *rest.Config, - podList []corev1.Pod) (string, error) { - if len(podList) == 0 { - klog.Infof("no kube-scheduler pods were found") - return "访问集群失败", nil - } else if len(podList) == 1 { - return "单实例", nil - } - - var filePath string - for _, item := range podList { - for _, arg := range item.Spec.Containers[0].Args { - if strings.Contains(arg, "policy-config-file") || strings.Contains(arg, "--config") { - filePath = strings.Split(arg, "=")[1] - break - } - } - } - - if filePath == "" { - return "ok", nil - } - return p.checkPodFileConsistency(restConfig, clientSet, podList, "kube-scheduler", filePath) -} - -func (p *Plugin) checkPodFileConsistency(restConfig *rest.Config, clientSet *kubernetes.Clientset, podList []corev1.Pod, - containerName string, filePath string) (string, error) { - ctx := context.Background() - - var sampleFile string - var sampleName string - // var sampleName string - for _, pod := range podList { - req := clientSet.CoreV1().RESTClient().Post().Resource("pods").Name(pod.Name). - Namespace("kube-system").SubResource("exec").Param("container", containerName). 
- VersionedParams(&corev1.PodExecOptions{ - Command: []string{"cat", filePath}, - Stdin: false, - Stdout: true, - Stderr: true, - TTY: false, - }, scheme.ParameterCodec) - - exec, err := remotecommand.NewSPDYExecutor(restConfig, "POST", req.URL()) - if err != nil { - return "访问集群失败", fmt.Errorf("NewSPDYExecutor failed: %s", err.Error()) - } - - var stdout, stderr bytes.Buffer - // StreamWithContext - if err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ - Stdin: nil, - Stdout: &stdout, - Stderr: &stderr, - Tty: false, - }); err != nil { - return "error", fmt.Errorf("Stream failed: %s %s", err.Error(), stderr.String()) - } - klog.V(6).Infof("%s %s", stdout.String(), stderr.String()) - errMsg := stderr.String() - execMsg := stdout.String() - if errMsg != "" { - klog.Infof("Exec failed: %s", errMsg) - return "访问集群失败", nil - } else if execMsg == "" { - klog.Infof("%s is blank", filePath) - return "访问集群失败", nil - } - - if sampleFile == "" { - sampleFile = execMsg - sampleName = pod.Name - } else { - if sampleFile != execMsg { - klog.Infof("pod %s policy %s doesn't equal pod %s policy %s", - sampleName, sampleFile, pod.Name, execMsg) - return "配置不一致", nil - } - } - } - return "ok", nil -} - -func (p *Plugin) checkPodConfig(obj interface{}, path string, regex string) (bool, error) { - value := reflect.ValueOf(obj) - fields := strings.Split(path, ".") - for _, field := range fields { - if value.Kind() != reflect.Struct && value.Kind() != reflect.Slice && value.Kind() != reflect.Pointer { - return false, fmt.Errorf("invalid field %s in path %s %s", field, path, value) - } - if value.Kind() == reflect.Slice { - indexStr := strings.Trim(field, "[]") - index, err := strconv.Atoi(indexStr) - if err != nil { - return false, fmt.Errorf("invalid field %s in path %s %s", field, path, value) - } - if value.Len() >= index+1 { - value = value.Index(index) - } else { - return true, nil - } - } else if value.Kind() == reflect.Struct { - // 如果当前值是结构体,就按照字段名获取对应属性值 - 
fieldValue := value.FieldByName(field) - if !fieldValue.IsValid() { - return false, fmt.Errorf("invalid field %s in path %s %s", field, path, value) - } - value = fieldValue - } else if value.Kind() == reflect.Pointer && !value.IsNil() { - value = reflect.Indirect(value) - // 如果当前值是结构体,就按照字段名获取对应属性值 - fieldValue := value.FieldByName(field) - if !fieldValue.IsValid() { - return false, fmt.Errorf("invalid field %s in path %s %s", field, path, value) - } - value = fieldValue - } else if value.IsNil() { - return true, nil - } - } - - result := value.Interface().(string) - reg, err := regexp2.Compile(regex, 0) - if err != nil { - return false, err - } - return reg.MatchString(result) -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/options.go deleted file mode 100644 index 59f6cc12b2..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/options.go +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package masterpodcheck - -// CheckConfig pod check config -type CheckConfig struct { - Name string `json:"name" yaml:"name"` - ConfigPath string `json:"configPath" yaml:"configPath"` - ConfigRegex string `json:"configRegex" yaml:"configRegex"` - Status string `json:"status" yaml:"status"` - DetectionItem string `json:"detectionItem" yaml:"detectionItem"` -} - -// Options bcs log options -type Options struct { - Interval int `json:"interval" yaml:"interval"` - CheckConfigs []CheckConfig `json:"checkConfigs" yaml:"checkConfigs"` -} - -// Validate validate options -func (o *Options) Validate() error { - // if len(o.KubeMaster) == 0 { - // return fmt.Errorf("kube_master cannot be empty") - // } - // if len(o.Kubeconfig) == 0 { - // return fmt.Errorf("kubeconfig cannot be empty") - // } - return nil -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck.go deleted file mode 100644 index dd9ae1028e..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck.go +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Package netcheck -package netcheck - -import ( - "context" - "fmt" - "net" - "os" - "runtime/debug" - "sync" - "time" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/prometheus/client_golang/prometheus" - "golang.org/x/net/icmp" - "golang.org/x/net/ipv4" - "gopkg.in/yaml.v2" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/json" - "k8s.io/client-go/kubernetes" - "k8s.io/klog" -) - -// Plugin xxx -type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex - clusterId string - clientSet *kubernetes.Clientset - businessID string - conn *icmp.PacketConn - dnsConn *icmp.PacketConn - msg []byte - svcNetLock sync.Mutex - cancel context.CancelFunc -} - -var ( - podNetAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "pod_net_availability", - Help: "pod_net_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}) - - podNetLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "pod_net_latency", - Help: "pod_net_latency", - Buckets: []float64{0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2}, - }, []string{"target", "target_biz"}) - - svcNetAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "svc_net_availability", - Help: "svc_net_availability, 1 means OK", - }, []string{"target", "target_biz", "status"}) - - svcNetLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Name: "svc_net_latency", - Help: "pod_net_latency", - Buckets: []float64{0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2}, - }, []string{"target", "target_biz"}) -) - -func init() { - metric_manager.Register(podNetAvailability) - metric_manager.Register(podNetLatency) - metric_manager.Register(svcNetAvailability) - 
metric_manager.Register(svcNetLatency) -} - -// Setup xxx -func (p *Plugin) Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read netcheck config file %s failed, err %s", configFilePath, err.Error()) - } - p.opt = &Options{} - if err = json.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode netcheck config file %s failed, err %s", configFilePath, err.Error()) - } - } - - if err = p.opt.Validate(); err != nil { - return err - } - - p.stopChan = make(chan int) - interval := p.opt.Interval - if interval == 0 { - interval = 60 - } - - clusterConfig := plugin_manager.Pm.GetConfig().InClusterConfig - if clusterConfig.Config == nil { - klog.Fatalf("netcheck get incluster config failed, only can run as incluster mode") - } - p.clusterId = clusterConfig.ClusterID - p.businessID = clusterConfig.BusinessID - - p.clientSet, err = k8s.GetClientsetByConfig(clusterConfig.Config) - if err != nil { - klog.Fatalf("netcheck get incluster config failed, only can run as incluster mode") - } - - p.conn, err = icmp.ListenPacket("ip4:icmp", "0.0.0.0") - if err != nil { - klog.Fatalf(err.Error()) - } - p.dnsConn, err = icmp.ListenPacket("ip4:icmp", "0.0.0.0") - if err != nil { - klog.Fatalf(err.Error()) - } - - msg := icmp.Message{ - Type: ipv4.ICMPTypeEcho, - Code: 0, - Body: &icmp.Echo{ - ID: os.Getpid() & 0xffff, - Seq: 1, - Data: []byte("Hello, world!"), - }, - } - - p.msg, err = msg.Marshal(nil) - if err != nil { - klog.Fatalf(err.Error()) - } - - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - if p.opt.Synchronization { - plugin_manager.Pm.Lock() - } - go p.Check() - } else { - klog.V(3).Infof("the former netcheck didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case 
<-time.After(time.Duration(interval) * time.Second): - continue - } - } - }() - - return nil -} - -// Stop xxx -func (p *Plugin) Stop() error { - p.checkLock.Lock() - p.stopChan <- 1 - klog.Infof("plugin %s stopped", p.Name()) - p.checkLock.Unlock() - - p.cancel() - return nil -} - -// Name xxx -func (p *Plugin) Name() string { - return "netcheck" -} - -// Check xxx -func (p *Plugin) Check() { - start := time.Now() - p.checkLock.Lock() - klog.Infof("start %s", p.Name()) - defer func() { - klog.Infof("end %s", p.Name()) - if p.opt.Synchronization { - plugin_manager.Pm.UnLock() - } - p.checkLock.Unlock() - metric_manager.SetCommonDurationMetric([]string{"netcheck", "", "", ""}, start) - }() - - netChecktGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - - status := "error" - defer func() { - if r := recover(); r != nil { - klog.Errorf("%s netcheck failed: %s, stack: %v\n", p.clusterId, r, string(debug.Stack())) - status = "panic" - } - }() - - p.checkPodNet(&status) - klog.Infof("%s netcheck result %s", p.clusterId, status) - netChecktGaugeVecSetList = append(netChecktGaugeVecSetList, - &metric_manager.GaugeVecSet{Labels: []string{p.clusterId, p.businessID, status}, Value: float64(1)}) - metric_manager.SetMetric(podNetAvailability, netChecktGaugeVecSetList) - - if p.svcNetLock.TryLock() { - p.svcNetLock.Unlock() - ctx, cancel := context.WithCancel(context.Background()) - p.cancel = cancel - go p.checkSVCNet(ctx) - } -} - -func (p *Plugin) checkPodNet(status *string) { - failedToPing := false - defer func() { - if !failedToPing { - *status = "ok" - } - }() - - pods, err := p.clientSet.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{ - ResourceVersion: "0", - }) - if err != nil { - *status = "getpodfailed" - klog.Errorf("%s failed to list all pods, %s", p.clusterId, err.Error()) - } - - wg := sync.WaitGroup{} - - pingChan := make(chan struct{}, 5) - for _, pod := range pods.Items { - if pod.Status.Phase != v1.PodRunning || 
pod.Spec.HostNetwork { - continue - } - - wg.Add(1) - plugin_manager.Pm.Add() - go func(pod v1.Pod) { - start := time.Now() - pingChan <- struct{}{} - defer func() { - wg.Done() - plugin_manager.Pm.Done() - }() - - conn, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0") - if err != nil { - klog.Fatalf(err.Error()) - } - err = conn.SetDeadline(start.Add(60 * time.Second)) - if err != nil { - klog.Errorf(err.Error()) - } - - podIP := pod.Status.PodIP - - _, err = conn.WriteTo(p.msg, &net.IPAddr{IP: net.ParseIP(podIP)}) - if err != nil { - failedToPing = true - klog.Errorf(err.Error()) - *status = "sendfailed" - return - } - - reply := make([]byte, 1500) - _, _, err = conn.ReadFrom(reply) - duration := time.Since(start) - if err != nil { - failedToPing = true - klog.Errorf("read reply from %s:%s:%s failed: %s", pod.Namespace, pod.Name, podIP, err.Error()) - *status = "readfailed" - return - } - - // 统一指标配置方法 - podNetLatency.WithLabelValues(p.clusterId, p.businessID).Observe(float64(duration) / float64(time.Second)) - <-pingChan - }(pod) - } - - wg.Wait() -} - -func (p *Plugin) checkSVCNet(ctx context.Context) { - p.svcNetLock.Lock() - defer func() { - p.svcNetLock.Unlock() - }() - - svc, err := p.clientSet.CoreV1().Services(metav1.NamespaceSystem).Get(context.TODO(), "kube-dns", metav1.GetOptions{}) - if err != nil { - klog.Fatalf("%s failed to list all pods, %s", p.clusterId, err.Error()) - } - - failedToPing := false - - start := time.Now() - for { - select { - case <-ctx.Done(): - klog.Infof("Stop checkSVCNet") - return - default: - err = p.dnsConn.SetDeadline(start.Add(60 * time.Second)) - if err != nil { - klog.Errorf(err.Error()) - } - - _, err = p.dnsConn.WriteTo(p.msg, &net.IPAddr{IP: net.ParseIP(svc.Spec.ClusterIP)}) - if err != nil { - failedToPing = true - klog.Errorf(err.Error()) - } - - reply := make([]byte, 1500) - _, _, err = p.dnsConn.ReadFrom(reply) - if err != nil { - failedToPing = true - klog.Errorf("read reply from %s:%s:%s failed: %s", 
svc.Namespace, svc.Name, svc.Spec.ClusterIP, err.Error()) - } - duration := time.Since(start) - svcNetLatency.WithLabelValues(p.clusterId, p.businessID).Observe(float64(duration) / float64(time.Second)) - - svcNetAvailability.Reset() - if failedToPing { - svcNetAvailability.WithLabelValues(p.clusterId, p.businessID, "notok").Set(1) - } else { - svcNetAvailability.WithLabelValues(p.clusterId, p.businessID, "ok").Set(1) - } - } - - // 最快1s执行一次 - if time.Since(start) < time.Second { - <-time.After(time.Second - time.Since(start)) - } - start = time.Now() - } -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck_test.go deleted file mode 100644 index 273c56c3e0..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck/netcheck_test.go +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package netcheck - -import ( - "fmt" - "net" - "os" - "testing" - "time" - - "golang.org/x/net/icmp" - "golang.org/x/net/ipv4" -) - -func TestCheckClusterNet(t *testing.T) { - conn, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0") - if err != nil { - t.Errorf(err.Error()) - return - } - - msg := icmp.Message{ - Type: ipv4.ICMPTypeEcho, - Code: 0, - Body: &icmp.Echo{ - ID: os.Getpid() & 0xffff, - Seq: 1, - Data: []byte("Hello, world!"), - }, - } - msgBytes, err := msg.Marshal(nil) - if err != nil { - t.Errorf(err.Error()) - return - } - - host := "127.0.0.1" - start := time.Now() - conn.SetDeadline(start.Add(5 * time.Second)) - _, err = conn.WriteTo(msgBytes, &net.IPAddr{IP: net.ParseIP(host)}) - if err != nil { - t.Errorf(err.Error()) - return - } - - reply := make([]byte, 1500) - _, _, err = conn.ReadFrom(reply) - if err != nil { - t.Errorf(err.Error()) - return - } - - duration := time.Since(start) - fmt.Printf("Ping successful, time=%v\n", duration) - -} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/const.go new file mode 100644 index 0000000000..3ec4fb7514 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/const.go @@ -0,0 +1,43 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package containercheck xxx +package containercheck + +const ( + pluginName = "containercheck" + Normalstatus = "ok" + runtimeTarget = "runtime" + initContent = `interval: 600` + readFileFailStatus = "read file failed" + dnsInconsistencyStatus = "dns inconsistency" + inconsistentStatus = "inconsistent" + getProcessFailStatus = "get process failed" + runtimeErrorStatus = "runtime error" + processNotExistStatus = "process not exist" + containerNotFoundStatus = "container not found" + inspectCoantainerError = "inspect container error" +) + +var ( + ChinenseStringMap = map[string]string{ + pluginName: "容器检查", + Normalstatus: "正常", + } + + EnglishStringMap = map[string]string{ + pluginName: pluginName, + Normalstatus: Normalstatus, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck.go new file mode 100644 index 0000000000..6311f33a5d --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck.go @@ -0,0 +1,547 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package containercheck xxx +package containercheck + +import ( + "context" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "net" + "os" + "path" + "sort" + "strings" + "time" + + "github.com/containerd/containerd/namespaces" + containerd "github.com/containerd/containerd/v2/client" + "github.com/docker/docker/api/types" + "github.com/docker/docker/client" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/types/process" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +var ( + containerStatusLabels = []string{"id", "name", "node", "status"} + containerPorcessStatusLabels = []string{"id", "name", "node", "status"} + runtimeStatusLabels = []string{"node", "status"} + containerStatusMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "container_status", + Help: "container_status", + }, containerStatusLabels) + containerPorcessStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "container_process_status", + Help: "container_process_status", + }, containerPorcessStatusLabels) + runtimeStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "runtime_status", + Help: "runtime_status", + }, runtimeStatusLabels) + + sockPathes = []string{ + "/run/docker.sock", + "/run/containerd/containerd.sock", + } +) + +func init() { + metricmanager.Register(containerStatusMetric) + metricmanager.Register(containerPorcessStatus) + metricmanager.Register(runtimeStatus) +} + +// Plugin xxx +type Plugin struct { + opt *Options + ready bool + Detail Detail + pluginmanager.NodePlugin +} + +// Detail xxx +type Detail struct { +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) 
+ if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check check container status and state +func (p *Plugin) Check() { + // 初始化变量 + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + + node := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := node.NodeName + + var runtimeErr error + + containerStatusGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + containerPidStatusGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + runtimeStatusGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + + p.ready = false + + defer func() { + p.CheckLock.Unlock() + + if runtimeErr != nil { + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Detail: fmt.Sprintf("check %s failed: %s", runtimeTarget, runtimeErr.Error()), + Normal: false, + Status: runtimeErrorStatus, + } + klog.Errorf("runtime error: %s", runtimeErr.Error()) + checkItem.Detail = fmt.Sprintf("runtime error: %s", runtimeErr.Error()) + result = append(result, checkItem) + + runtimeStatusGaugeVecSetList = append(runtimeStatusGaugeVecSetList, &metricmanager.GaugeVecSet{ + 
Labels: []string{nodeName, runtimeErrorStatus}, Value: float64(1), + }) + } + + metricmanager.RefreshMetric(containerStatusMetric, containerStatusGaugeVecSetList) + metricmanager.RefreshMetric(containerPorcessStatus, containerPidStatusGaugeVecSetList) + metricmanager.RefreshMetric(runtimeStatus, runtimeStatusGaugeVecSetList) + + p.Result = pluginmanager.CheckResult{ + Items: result, + } + p.ready = true + klog.Infof("end %s", p.Name()) + }() + + var sockList = sockPathes + var socketPath string + + if p.opt.SockPath != "" { + sockList = []string{p.opt.SockPath} + klog.Infof("sockPath param is %s, remove default sockpathes", p.opt.SockPath) + } + + // 获取可用的socket + for _, socketPath = range sockList { + conn, err := net.Dial("unix", path.Join(node.HostPath, socketPath)) + if err != nil { + socketPath = "" + klog.Errorf(err.Error()) + continue + } else { + err = conn.Close() + if err != nil { + klog.Errorf("close socket failed: %s", err.Error()) + } + break + } + } + + socketPath = path.Join(node.HostPath, socketPath) + if strings.Contains(socketPath, "docker.sock") { + checkItemList, gvsList, err := dockerCheck(socketPath, node) + if err != nil { + runtimeErr = err + return + } + result = append(result, checkItemList...) + containerStatusGaugeVecSetList = append(containerStatusGaugeVecSetList, gvsList...) + } else if strings.Contains(socketPath, "containerd.sock") { + checkItemList, gvsList, err := containerdCheck(socketPath, node) + if err != nil { + runtimeErr = err + return + } + result = append(result, checkItemList...) + containerStatusGaugeVecSetList = append(containerStatusGaugeVecSetList, gvsList...) 
+ } else { + runtimeErr = fmt.Errorf("unknown socket %s", socketPath) + return + } + + runtimeStatusGaugeVecSetList = append(runtimeStatusGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, Normalstatus}, Value: float64(1), + }) + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.WARNLevel, + Normal: true, + Status: Normalstatus, + }) +} + +// dockerCheck 检查docker容器状态 +func dockerCheck(socketPath string, node pluginmanager.NodeConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0) + nodeName := node.NodeName + // 检查docker容器状态 + cli, err := GetDockerCli(socketPath) + if err != nil { + return nil, nil, err + } + + defer func() { + _ = cli.Close() + }() + + containerList, err := cli.ContainerList(context.Background(), types.ContainerListOptions{}) + if err != nil { + return nil, nil, err + } + + // check container status + for _, container := range containerList { + klog.Infof("start check for docker container %s", container.Names) + status, containerInfo, err := DockerContainerCheck(cli, container.ID, container.State) + if err != nil { + klog.Errorf("check container %s failed: %s", container.Names, err.Error()) + } + + if status != Normalstatus { + klog.Errorf("container id: %s,inspect: %s, state: %s", container.ID, status, container.State) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{container.ID, strings.Join(container.Names, "_"), nodeName, status}, Value: float64(1), + }) + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("container %s state is %s", strings.Join(container.Names, "_"), status), + Status: inspectCoantainerError, + }) + continue + } + + // 验证dns pod中的resolv内容正确 + checkItem, status, err := 
CheckDNSContainer(containerInfo.Name, containerInfo.ResolvConfPath, nodeName, node.HostPath) + if err != nil { + klog.Errorf("check container %s failed: %s", container.Names, err.Error()) + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{container.ID, strings.Join(container.Names, "_"), nodeName, status}, Value: float64(1), + }) + checkItemList = append(checkItemList, *checkItem) + } + } + + return checkItemList, gvsList, nil +} + +// containerdCheck 检查containerd容器状态 +func containerdCheck(socketPath string, node pluginmanager.NodeConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0) + nodeName := node.NodeName + + // 连接到 containerd + cli, err := containerd.New(socketPath) + if err != nil { + return nil, nil, err + } + defer func() { + _ = cli.Close() + }() + + ctx := namespaces.WithNamespace(util.GetCtx(10*time.Second), "k8s.io") + + containerList, err := cli.Containers(ctx) + if err != nil { + return nil, nil, err + } + + // check container status + for _, container := range containerList { + klog.Infof("start check for containerd container %s", container.ID()) + status, podName, err := ContainerdContainerCheck(container, ctx) + if err != nil { + klog.Errorf("check container %s failed: %s", podName, err.Error()) + } + + if status != Normalstatus { + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{container.ID(), podName, nodeName, status}, Value: float64(1), + }) + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Status: inconsistentStatus, + Detail: fmt.Sprintf("container of %s state is %s", podName, status), + }) + continue + } + + // 验证dns pod中的resolv内容正确 + spec, err := container.Spec(ctx) + if err != nil { + klog.Errorf("check container %s failed: %s", podName, err.Error()) + continue + } + resolvConfPath := 
"" + for _, mount := range spec.Mounts { + if mount.Destination == "/etc/resolv.conf" { + resolvConfPath = mount.Source + } + } + // 验证dns pod中的resolv内容正确 + checkItem, status, err := CheckDNSContainer(podName, resolvConfPath, nodeName, node.HostPath) + if err != nil { + klog.Errorf("check container %s failed: %s", podName, err.Error()) + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{container.ID(), podName, nodeName, status}, Value: float64(1), + }) + checkItemList = append(checkItemList, *checkItem) + } + } + + return checkItemList, gvsList, nil +} + +// DockerContainerCheck 检查容器状态一致性以及进程状态 +func DockerContainerCheck(cli *client.Client, containerID string, state string) (string, types.ContainerJSON, error) { + containerInfo, err := GetContainerInfo(cli, containerID) + if err != nil { + if strings.Contains(err.Error(), "No such container") { + return containerNotFoundStatus, containerInfo, err + } else { + return inspectCoantainerError, containerInfo, err + } + } + + if containerInfo.State.Status != state { + return inconsistentStatus, containerInfo, nil + } + + if containerInfo.State.Pid == 0 { + return processNotExistStatus, containerInfo, nil + } + + pidStatus, err := GetContainerPIDStatus(containerInfo.State.Pid) + if err != nil { + return getProcessFailStatus, containerInfo, err + } + + if pidStatus == "D" || pidStatus == "Z" { + return pidStatus, containerInfo, err + } + + return Normalstatus, containerInfo, nil +} + +// ContainerdContainerCheck 检查容器状态一致性以及进程状态 +func ContainerdContainerCheck(container containerd.Container, ctx context.Context) (string, string, error) { + info, err := container.Info(ctx, containerd.WithoutRefreshedMetadata) + if err != nil { + return inspectCoantainerError, "", err + } + + podName := "" + // docker runtime的情况下,虽然containerd sock可以访问,但没有K8S的信息 + if name, ok := info.Labels["io.kubernetes.pod.name"]; ok { + podName = name + } + + task, err := container.Task(ctx, nil) + if err != nil { + return 
Normalstatus, podName, nil + } + + pid := task.Pid() + pidStatus, err := GetContainerPIDStatus(int(pid)) + if err != nil { + return inspectCoantainerError, "", err + } + + if pidStatus == "D" || pidStatus == "Z" { + return pidStatus, podName, err + } + return Normalstatus, podName, nil +} + +// CheckDNSContainer 验证dns pod中的resolv内容正确 +func CheckDNSContainer(name string, resolvConfPath string, nodeName string, hostPath string) (*pluginmanager.CheckItem, string, error) { + checkItem := &pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: true, + } + + // 判断该容器是否为kube-system下的 dns 容器 + if strings.Contains(name, "kube-system") && (strings.Contains(name, "coredns") || strings.Contains(name, "kube-dns")) && !strings.Contains(name, "k8s_POD") { + klog.Infof("check dns pod %s %s", name, resolvConfPath) + + containerPath := path.Join(hostPath, resolvConfPath) + dnsResolv, err := os.ReadFile(containerPath) + if err != nil { + checkItem.Normal = false + checkItem.Detail = fmt.Sprintf("dns container %s read %s failed: %s", name, containerPath, err.Error()) + checkItem.Status = readFileFailStatus + return checkItem, readFileFailStatus, err + } + + hostResolv, err := os.ReadFile(path.Join(hostPath, "/etc/resolv.conf")) + if err != nil { + checkItem.Detail = fmt.Sprintf("read %s failed: %s", hostPath, err.Error()) + checkItem.Status = readFileFailStatus + if err != nil { + return checkItem, readFileFailStatus, err + } + } + + dnsLines := make([]string, 0, 0) + for _, dnsLine := range strings.Split(string(dnsResolv), "\n") { + if !strings.HasPrefix(dnsLine, "nameserver") { + continue + } + dnsLines = append(dnsLines, dnsLine) + } + + hostLines := make([]string, 0, 0) + for _, hostLine := range strings.Split(string(hostResolv), "\n") { + if !strings.HasPrefix(hostLine, "nameserver") { + continue + } + hostLines = append(hostLines, hostLine) + } + + sort.Strings(dnsLines) + sort.Strings(hostLines) + + // 判断容器内的resolv文件中的nameserver配置和母机上的是否一致 + equal := 
true + if len(dnsLines) != len(hostLines) { + equal = false + } else { + for i, item := range dnsLines { + if hostLines[i] != item { + equal = false + break + } + } + } + + if !equal { + err = fmt.Errorf("content of dns %s is %s, different from %s ", containerPath, dnsLines, hostPath) + checkItem.Normal = false + checkItem.Detail = err.Error() + checkItem.Status = Normalstatus + return checkItem, dnsInconsistencyStatus, err + } + } + + return nil, Normalstatus, nil +} + +// GetDockerCli xxx +func GetDockerCli(sockPath string) (*client.Client, error) { + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation(), client.WithHost(fmt.Sprintf("unix://%s", sockPath))) + return cli, err + +} + +// GetContainerInfo xxx +func GetContainerInfo(cli *client.Client, containerID string) (types.ContainerJSON, error) { + ctx := util.GetCtx(10 * time.Second) + containerInfo, err := cli.ContainerInspect(ctx, containerID) + return containerInfo, err +} + +// GetContainerPIDStatus xxx +func GetContainerPIDStatus(pid int) (string, error) { + processInfo, err := process.GetProcess(int32(pid)) + if err != nil { + return "", err + } else { + return processInfo.Status() + } +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck_test.go new file mode 100644 index 0000000000..8895ac41a1 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/containercheck_test.go @@ -0,0 +1,46 @@ +/* + * Tencent is pleased to support the open source community by making 
Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package containercheck xxx +package containercheck + +import ( + "fmt" + "testing" + "time" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/docker/docker/api/types" +) + +func TestGetDockerCli(t *testing.T) { + cli, err := GetDockerCli("/var/run/docker.sock") + if err != nil { + t.Errorf(err.Error()) + } + + ctx := util.GetCtx(time.Second * 10) + containerList, err := cli.ContainerList(ctx, types.ContainerListOptions{}) + if err != nil { + t.Errorf(err.Error()) + } + + for _, container := range containerList { + status, err := GetContainerPIDStatus(1) + if err != nil { + t.Errorf(err.Error()) + } + fmt.Printf("%s: %s\n", container.ID, status) + + } + +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/init.go similarity index 76% rename from bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/init.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/init.go index 447ed74538..556e907d01 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/init.go @@ -10,17 +10,11 @@ * limitations under the License. 
*/ -package eventrecorder +// Package containercheck xxx +package containercheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/options.go new file mode 100644 index 0000000000..30ffaf8339 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/containercheck/options.go @@ -0,0 +1,25 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
// Options carries the containercheck plugin settings decoded from its JSON/YAML configuration.
type Options struct {
	// Interval is decoded from the "interval" key.
	Interval int `json:"interval" yaml:"interval"`
	// SockPath is decoded from the "sockPath" key.
	SockPath string `json:"sockPath" yaml:"sockPath"`
}

// Validate performs no checks at the moment: every configuration is accepted and the
// returned error is always nil.
func (o *Options) Validate() error {
	var err error
	return err
}
// Identifiers and status strings used by the diskcheck plugin.
const (
	// pluginName is the name reported by Plugin.Name and used as the check item name.
	pluginName = "diskcheck"
	// NormalStatus is the status string of a successful check.
	NormalStatus = "ok"
	// initContent is the default configuration text handed to the config loader.
	initContent = `interval: 600`
	// nodeItemTarget is the string-table key for the "node" target.
	nodeItemTarget = "node"
	// testFailStatus is the status string of a failed write probe.
	testFailStatus = "test disk fail"
)

// ChinenseStringMap translates the plugin's identifiers and statuses into Chinese.
var ChinenseStringMap = map[string]string{
	pluginName:     "磁盘检查",
	nodeItemTarget: "节点",
	NormalStatus:   "正常",
	testFailStatus: "检测磁盘失败",
}

// EnglishStringMap maps every identifier and status onto itself.
var EnglishStringMap = func() map[string]string {
	keys := []string{NormalStatus, nodeItemTarget, pluginName, testFailStatus}
	table := make(map[string]string, len(keys))
	for _, key := range keys {
		table[key] = key
	}
	return table
}()

// StringMap is the table in use; it refers to the same map value as ChinenseStringMap.
var StringMap = ChinenseStringMap
+ */ + +// Package diskcheck xxx +package diskcheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "os" + "path/filepath" + "time" + + "github.com/moby/sys/mountinfo" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + ready bool + pluginmanager.NodePlugin + Detail Detail +} + +// Detail xxx +type Detail struct { +} + +var ( + fsAvailabilityLabels = []string{"mountpoint", "node", "status"} + fsAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "fs_availability", + Help: "fs_availability, 1 means OK", + }, fsAvailabilityLabels) +) + +func init() { + metricmanager.Register(fsAvailability) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.StopChan = make(chan int) + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + if err != nil { + klog.Fatalf("%s get incluster config failed, only can run as incluster mode", p.Name()) + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p 
*Plugin) Stop() error { + p.CheckLock.Lock() + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + p.CheckLock.Unlock() + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check xxx +func (p *Plugin) Check() { + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + p.ready = false + + node := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := node.NodeName + fsGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + + mountInfoList, err := GetFSMountInfo(node.HostPath) + if err != nil { + klog.Errorf(err.Error()) + return + } + + for _, mountInfo := range mountInfoList { + err = TestFS(node.HostPath, mountInfo.Mountpoint) + if err != nil { + klog.Infof("test fs %s failed: %s", mountInfo.Mountpoint, err.Error()) + fsGaugeVecSetList = append(fsGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{mountInfo.Mountpoint, nodeName, "notok"}, Value: float64(1), + }) + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("testfs %s failed: %s", mountInfo.Mountpoint, err.Error()), + Status: testFailStatus, + Level: pluginmanager.WARNLevel, + }) + + } else { + klog.Infof("test fs %s success", mountInfo.Mountpoint) + } + } + + if len(fsGaugeVecSetList) == 0 { + fsGaugeVecSetList = append(fsGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{"/", nodeName, NormalStatus}, Value: float64(1), + }) + + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: true, + Detail: "", + Status: NormalStatus, + Level: pluginmanager.WARNLevel, + }) + } + + metricmanager.RefreshMetric(fsAvailability, fsGaugeVecSetList) + p.Result = pluginmanager.CheckResult{ + Items: result, + } + + if !p.ready { + p.ready = true + } +} + +// GetFSMountInfo get 
mount point info +func GetFSMountInfo(hostPath string) ([]*mountinfo.Info, error) { + mountInfoList, err := GetProcessMountInfo(hostPath, 1) + if err != nil { + return nil, err + } + + fsMountInfoList := make([]*mountinfo.Info, 0, 0) + for _, mountInfo := range mountInfoList { + if mountInfo.FSType == "ext4" || mountInfo.FSType == "xfs" { + if mountInfo.Root == "/" { + fsMountInfoList = append(fsMountInfoList, mountInfo) + } + } + } + return fsMountInfoList, err +} + +// TestFS check if data can be writen in fs +func TestFS(hostPath, path string) error { + rand.Seed(time.Now().UnixNano()) + fileName := filepath.Join(hostPath, path, fmt.Sprintf("%d", rand.Intn(1000))+".nodeagent") + file, err := os.Create(fileName) + defer func() { + if file != nil { + err = file.Close() + if err != nil { + klog.Error(err.Error()) + } + } + + err = os.Remove(fileName) + if err != nil { + klog.Error(err.Error()) + } + }() + + if err != nil { + return err + } + + _, err = file.WriteString("test") + if err != nil { + return err + } + + return nil +} + +// GetProcessMountInfo get process mount info +func GetProcessMountInfo(hostPath string, pid int32) ([]*mountinfo.Info, error) { + f, err := os.Open(fmt.Sprintf("%s/proc/%d/mountinfo", hostPath, pid)) + if err != nil { + return nil, err + } + + mountInfoList, err := mountinfo.GetMountsFromReader(f, nil) + if err != nil { + return nil, err + } + + return mountInfoList, nil +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/init.go new file mode 100644 index 0000000000..74813e92ea --- /dev/null +++ 
b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/init.go @@ -0,0 +1,20 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under, + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package diskcheck xxx +package diskcheck + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + +func init() { + pluginmanager.Register(&Plugin{}) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/options.go new file mode 100644 index 0000000000..4e265f1319 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck/options.go @@ -0,0 +1,25 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
// Options carries the diskcheck plugin settings decoded from its JSON/YAML configuration.
type Options struct {
	// Interval is decoded from the "interval" key.
	Interval int `json:"interval" yaml:"interval"`
}

// Validate performs no checks at the moment: every configuration is accepted and the
// returned error is always nil.
func (o *Options) Validate() error {
	var err error
	return err
}
// Identifiers and status strings used by the dnscheck plugin.
const (
	// pluginName is the name reported by Plugin.Name and used as the check item name.
	pluginName = "dnscheck"
	// NormalStatus is the status string of a successful check.
	NormalStatus = "ok"
	// ResolvFailStauts is the status string of a failed lookup.
	ResolvFailStauts = "resolvefailed"
	// initContent is the default configuration text handed to the config loader.
	initContent = `interval: 600`
)

// Keys of the check-type entries in the string tables.
const (
	clusterDNSType        = "pod dns check"
	clusterDNSclusterType = "cluster"
	clusterDNShostType    = "node"
)

// ChinenseStringMap translates the plugin's identifiers and statuses into Chinese.
var ChinenseStringMap = map[string]string{
	NormalStatus:     "正常",
	ResolvFailStauts: "解析失败",

	pluginName:            "dns检查",
	clusterDNSType:        "节点DNS检查",
	clusterDNSclusterType: "容器域名解析",
	clusterDNShostType:    "节点域名解析",
}

// EnglishStringMap maps every identifier and status onto itself.
var EnglishStringMap = func() map[string]string {
	keys := []string{
		pluginName, clusterDNSType, clusterDNSclusterType, clusterDNShostType,
		ResolvFailStauts, NormalStatus,
	}
	table := make(map[string]string, len(keys))
	for _, key := range keys {
		table[key] = key
	}
	return table
}()

// StringMap is the table in use; it refers to the same map value as ChinenseStringMap.
var StringMap = ChinenseStringMap
+ */ + +// Package dnscheck xxx +package dnscheck + +import ( + "context" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "net" + "runtime/debug" + "strings" + "sync" + "time" + + "github.com/miekg/dns" + "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + dnsLock sync.Mutex + ready bool + Detail Detail + pluginmanager.NodePlugin +} + +// DnsCheckResult xxx +type DnsCheckResult struct { + Type string `yaml:"type"` + Node string `yaml:"node"` + Status string `yaml:"status"` +} + +// Detail xxx +type Detail struct { +} + +var ( + dnsAvailabilityLabels = []string{"type", "node", "status"} + dnsAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "dns_availability", + Help: "dns_availability, 1 means OK", + }, dnsAvailabilityLabels) + + dnsLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "dns_latency", + Help: "dns_latency", + Buckets: []float64{0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2}, + }, []string{}) +) + +func init() { + metricmanager.Register(dnsAvailability) + metricmanager.Register(dnsLatency) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.StopChan = make(chan int) + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, 
skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.CheckLock.Lock() + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + p.CheckLock.Unlock() + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check xxx +func (p *Plugin) Check() { + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + node := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := node.NodeName + p.ready = false + + defer func() { + if r := recover(); r != nil { + klog.Errorf("dnscheck failed: %s, stack: %v\n", r, string(debug.Stack())) + } + }() + + dnsStatusGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + + ctx := util.GetCtx(time.Second * 10) + + status, err := p.checkDNS(ctx, append(p.opt.CheckDomain, "kubernetes.default.svc.cluster.local"), "", node.ClientSet) + dnsStatusGaugeVecSetList = append(dnsStatusGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{"pod", nodeName, status}, + Value: float64(1), + }) + if status != NormalStatus { + result = append(result, pluginmanager.CheckItem{ + // 写入configmap默认使用英文 + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: false, + Detail: fmt.Sprintf("pod cluster dns resolv failed: %s", err.Error()), + Status: status, + }) + } else { + result = append(result, pluginmanager.CheckItem{ + // 写入configmap默认使用英文 + ItemName: pluginName, + ItemTarget: nodeName, + Status: status, + Normal: true, + Level: pluginmanager.RISKLevel, + Detail: fmt.Sprintf("pod cluster dns resolv %v normally", 
append(p.opt.CheckDomain, "kubernetes.default.svc.cluster.local")), + }) + klog.Infof("cluster dns check ok") + } + + ctx = util.GetCtx(time.Second * 10) + status, err = p.checkDNS(ctx, p.opt.CheckDomain, fmt.Sprintf("%s/etc/resolv.conf", node.HostPath), node.ClientSet) + dnsStatusGaugeVecSetList = append(dnsStatusGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{"host", nodeName, status}, + Value: float64(1), + }) + + if status != NormalStatus { + if err != nil { + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: false, + Detail: fmt.Sprintf("pod cluster dns failed: %s", err.Error()), + Status: status, + }) + klog.Errorf("host dns check failed: %s %s", status, err.Error()) + } + } else { + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: true, + Status: pluginmanager.NormalStatus, + Detail: fmt.Sprintf("pod cluster dns resolv %v normally", p.opt.CheckDomain), + }) + klog.Infof("host dns check ok") + } + + p.Result = pluginmanager.CheckResult{ + Items: result, + } + metricmanager.RefreshMetric(dnsAvailability, dnsStatusGaugeVecSetList) + + if !p.ready { + p.ready = true + } +} + +func (p *Plugin) checkDNS(ctx context.Context, domainList []string, path string, clientSet *kubernetes.Clientset) (string, error) { + status := NormalStatus + select { + case <-ctx.Done(): + status = "timeout" + break + + default: + ipList := make([]string, 0, 0) + + if path != "" { + config, _ := dns.ClientConfigFromFile(path) + ipList = config.Servers + } else { + ep, err := clientSet.CoreV1().Endpoints("kube-system").Get(context.Background(), "kube-dns", v1.GetOptions{ResourceVersion: "0"}) + if err != nil { + klog.Errorf("get dns endpoint failed: %s", err.Error()) + return status, err + } + + for _, subset := range ep.Subsets { + for _, address := range subset.Addresses { + ipList = 
append(ipList, address.IP) + } + } + } + + if len(ipList) > 0 { + for ip := 0; ip < len(ipList); ip++ { + r, err := createResolver(ipList[ip]) + if err != nil { + klog.Errorf("create resolver failed: %s", err.Error()) + status = "setresolverfailed" + return status, err + } + + for _, domain := range domainList { + if path != "" && strings.Contains(domain, "svc.cluster.local") { + continue + } + + latency, err := dnsLookup(r, domain) + if err != nil { + klog.Errorf("%s resolve %s failed: %s", ipList[ip], domain, err.Error()) + status = ResolvFailStauts + return status, err + } else { + klog.Errorf("%s resolve %s success", ipList[ip], domain) + } + + dnsLatency.WithLabelValues().Observe(float64(latency) / float64(time.Second)) + } + } + } else { + status = "noserver" + err := fmt.Errorf("No available dns server") + klog.Errorf(err.Error()) + return status, err + } + } + + return status, nil +} + +func createResolver(ip string) (*net.Resolver, error) { + r := &net.Resolver{} + // if we're supplied a null string, return an error + if len(ip) < 1 { + return r, fmt.Errorf("Need a valid ip to create Resolver") + } + // attempt to create the resolver based on the string + r = &net.Resolver{ + PreferGo: true, + Dial: func(ctx context.Context, network, address2 string) (net.Conn, error) { + d := net.Dialer{ + Timeout: time.Millisecond * time.Duration(10000), + } + return d.DialContext(ctx, "udp", ip+":53") + }, + } + return r, nil +} + +func dnsLookup(r *net.Resolver, host string) (time.Duration, error) { + start := time.Now() + addrs, err := r.LookupHost(util.GetCtx(10*time.Second), host) + if err != nil { + return 0, fmt.Errorf("DNS Status check determined that %s is DOWN: %s", host, err.Error()) + } + + if len(addrs) == 0 { + return 0, fmt.Errorf("No host was found") + } + + return time.Since(start), nil +} + +// GetResult return check result by cluster ID +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// GetDetail xxx +func (p 
*Plugin) GetDetail() interface{} { + return p.Detail +} + +// Ready return true if cluster check is over +func (p *Plugin) Ready(string) bool { + return p.ready +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/init.go similarity index 78% rename from bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/init.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/init.go index 3dbc255057..bd26abe3c0 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/init.go @@ -10,17 +10,11 @@ * limitations under the License. */ +// Package dnscheck xxx package dnscheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/options.go similarity index 97% rename from bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/options.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/options.go index 699bb17d16..86903c2448 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck/options.go @@ -10,6 +10,7 @@ * limitations under the License. 
*/ +// Package dnscheck xxx package dnscheck // Options bcs log options diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/const.go new file mode 100644 index 0000000000..95359ab975 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/const.go @@ -0,0 +1,56 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
// Identifiers and status strings used by the hwcheck plugin.
const (
	// pluginName is the name reported by Plugin.Name and used as the check item name.
	pluginName = "hwcheck"
	// NormalStatus is the status string of a successful check.
	NormalStatus = "ok"
	// ffStatus is the status of a PCI device whose revision reads as ff.
	ffStatus = "ff"
	// logMatchedStatus is the status of a log file in which a keyword was found.
	logMatchedStatus = "matched"
	// initContent is the default configuration text handed to the config loader.
	initContent = `interval: 600`
)

// Check item types and targets.
const (
	DeviceCheckItemType   = "device check"
	DeviceCheckItemTarget = "device"

	LogCheckItemType   = "log check"
	LogCheckItemTarget = "log"
)

// ChinenseStringMap translates the plugin's identifiers and statuses into Chinese
// (ffStatus is kept as is).
var ChinenseStringMap = map[string]string{
	pluginName:       "硬件检查",
	NormalStatus:     "正常",
	ffStatus:         ffStatus,
	logMatchedStatus: "匹配到异常日志",

	DeviceCheckItemType:   "设备检查",
	DeviceCheckItemTarget: "设备",
	LogCheckItemType:      "日志检查",
	LogCheckItemTarget:    "日志",
}

// EnglishStringMap maps every identifier and status onto itself.
var EnglishStringMap = func() map[string]string {
	keys := []string{
		pluginName, NormalStatus, ffStatus, logMatchedStatus,
		DeviceCheckItemType, DeviceCheckItemTarget, LogCheckItemType, LogCheckItemTarget,
	}
	table := make(map[string]string, len(keys))
	for _, key := range keys {
		table[key] = key
	}
	return table
}()

// StringMap is the table in use; it refers to the same map value as ChinenseStringMap.
var StringMap = ChinenseStringMap
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package hwcheck xxx +package hwcheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "os" + "path" + "strings" + "time" + + "github.com/jaypipes/ghw" + "github.com/jaypipes/ghw/pkg/option" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +var ( + deviceStatusLabels = []string{"id", "name", "node", "revision"} + hardwareErrorLabels = []string{"type", "node"} + deviceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "device_status", + Help: "device_status", + }, deviceStatusLabels) + + hardwareError = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "hardware_error_count", + Help: "hardware_error_count", + }, hardwareErrorLabels) +) + +func init() { + metricmanager.Register(deviceStatus) + metricmanager.Register(hardwareError) +} + +// Plugin XXX +type Plugin struct { + opt *Options + ready bool + Detail Detail + pluginmanager.NodePlugin +} + +// Detail XXX +type Detail struct { +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + //err := util.ReadFromStr(p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + node := pluginmanager.Pm.GetConfig().NodeConfig + + logFileConfigList := make([]LogFileConfig, 0, 0) + for _, logFileConfig := range p.opt.LogFileConfigList { + logFileConfig.logFile = util.NewLogFile(path.Join(node.HostPath, logFileConfig.Path)) + if logFileConfig.logFile == nil { + klog.Errorf("%s no such file or directory, 
skip", logFileConfig.Path) + continue + } + + logFileConfig.logFile.SetSearchKey(logFileConfig.KeyWordList) + + logFileConfigList = append(logFileConfigList, logFileConfig) + } + + p.opt.LogFileConfigList = logFileConfigList + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check check for hardware problem +func (p *Plugin) Check() { + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + node := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := node.NodeName + p.ready = false + + deviceList, err := GetDeviceStatus(node.HostPath) + if err != nil { + klog.Errorf(err.Error()) + return + } + + deviceStatusGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + for _, device := range deviceList { + deviceStatusGaugeVecSetList = append(deviceStatusGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{device.Address, strings.Replace(device.Vendor.Name, " ", "_", -1), nodeName, device.Revision}, + Value: float64(1), + }) + if device.Revision == "ff" { + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("device %s revision is %s", device.Vendor.Name, device.Revision), + 
Status: ffStatus, + }) + } + + } + + metricmanager.SetMetric(deviceStatus, deviceStatusGaugeVecSetList) + + hardwareErrorGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + for _, logFileConfig := range p.opt.LogFileConfigList { + logList, err := logFileConfig.logFile.CheckNewEntriesOnce() + if err != nil { + klog.Errorf(err.Error()) + } else { + + for _, key := range logFileConfig.KeyWordList { + count := 0 + for _, line := range logList { + if strings.Contains(line, key) { + count++ + //hardwareError.WithLabelValues(logFileConfig.Rule, nodeName).Add(1) + //break + } + } + + hardwareErrorGVSList = append(hardwareErrorGVSList, &metricmanager.GaugeVecSet{ + Labels: []string{logFileConfig.Rule, nodeName}, + Value: float64(count), + }) + + if count > 0 { + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("%s found %s in logfile %s", logFileConfig.Rule, key, logFileConfig.Path), + Status: logMatchedStatus, + }) + } + } + } + } + metricmanager.RefreshMetric(hardwareError, hardwareErrorGVSList) + + p.Result = pluginmanager.CheckResult{ + Items: result, + } + + if !p.ready { + p.ready = true + } +} + +// GetDeviceStatus xxx +func GetDeviceStatus(hostPath string) ([]*ghw.PCIDevice, error) { + pciInfo, err := ghw.PCI(&option.Option{ + Chroot: &hostPath, + }) + if err != nil { + return nil, err + } + + deviceList := make([]*ghw.PCIDevice, 0, 0) + for _, device := range pciInfo.Devices { + file, openErr := os.Open(fmt.Sprintf("/sys/bus/pci/devices/%s/config", device.Address)) + if openErr != nil { + klog.Errorf("Error opening file: %s", openErr.Error()) + continue + } + defer file.Close() + + revision := make([]byte, 1) + _, err = file.ReadAt(revision, 8) // Revision ID is at offset 8 + if err != nil { + klog.Errorf("Error reading file: %s", err.Error()) + continue + } + + if fmt.Sprintf("%x", revision[0]) == "ff" { + device.Revision = fmt.Sprintf("%x", revision[0]) + deviceList = 
append(deviceList, device) + } + + klog.Infof("%s %s %s %s status is %x", device.Address, device.Vendor.Name, device.Class.Name, device.Product.Name, revision[0]) + } + + return deviceList, nil +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/hwcheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/hwcheck_test.go new file mode 100644 index 0000000000..1e2280d08c --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck/hwcheck_test.go @@ -0,0 +1,14 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
// init registers a zero-value hwcheck Plugin with the plugin manager when the
// package is imported.
func init() {
	pluginmanager.Register(&Plugin{})
}
// Options is the configuration of the hwcheck plugin.
type Options struct {
	// Interval is the pause between two check rounds.
	// NOTE(review): presumably seconds, as in the sibling plugins — confirm in Setup.
	Interval int `json:"interval" yaml:"interval"`
	// LogFileConfigList lists the log files to scan and the keywords to look for.
	LogFileConfigList []LogFileConfig `json:"logFileConfigs" yaml:"logFileConfigs"`
}

// LogFileConfig describes one log file scanned by the hwcheck plugin.
type LogFileConfig struct {
	// Path is the log file path; it is reported in the check item detail.
	Path string `json:"path" yaml:"path"`
	// KeyWordList holds the substrings searched for in each new log line.
	KeyWordList []string `json:"keyWordList" yaml:"keyWordList"`
	// Rule is the rule name used as metric label and in the check item detail.
	Rule string `json:"rule" yaml:"rule"`
	// logFile is the runtime handle used to read new log entries; it is not
	// part of the serialized configuration.
	logFile *util.LogFile
}

// Validate checks the options. It currently accepts every configuration and
// always returns nil.
func (o *Options) Validate() error {
	return nil
}
// Exported status and target names of the netcheck plugin.
const (
	NormalStatus        = "ok"
	PingFailedStatus    = "pingfailed"
	NoTargetPodStatus   = "notargetpod"
	NodeagentItemTarget = "node agent pod"
)

// Package-internal names of the netcheck plugin.
const (
	pluginName        = "netcheck"
	netCheckTarget    = pluginName
	errorStatus       = "err"
	devDistinctStatus = "dev_distinct"
	devCheckItemType  = "dev"

	// initContent is the default configuration content. The check includes
	// listing all pods in a namespace, so it should not run too frequently.
	initContent = `interval: 3600`
)

// ChinenseStringMap maps plugin identifiers to their Chinese display text.
var ChinenseStringMap = map[string]string{
	pluginName:          "网络检查",
	NormalStatus:        "正常",
	PingFailedStatus:    "ping失败",
	NoTargetPodStatus:   "没有可探测的pod",
	NodeagentItemTarget: NodeagentItemTarget,
	errorStatus:         errorStatus,
	devDistinctStatus:   devDistinctStatus,
	devCheckItemType:    devCheckItemType,
}

// EnglishStringMap maps plugin identifiers to their English display text.
var EnglishStringMap = map[string]string{
	pluginName:          pluginName,
	NormalStatus:        NormalStatus,
	PingFailedStatus:    PingFailedStatus,
	NoTargetPodStatus:   NoTargetPodStatus,
	NodeagentItemTarget: NodeagentItemTarget,
	errorStatus:         errorStatus,
	devDistinctStatus:   devDistinctStatus,
	devCheckItemType:    devCheckItemType,
}

// StringMap is the display text table in use; it defaults to the Chinese one.
var StringMap = ChinenseStringMap
// init registers a zero-value netcheck Plugin with the plugin manager when the
// package is imported.
func init() {
	pluginmanager.Register(&Plugin{})
}
+ */ + +// Package netcheck xxx +package netcheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + pluginmanager "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "net" + "os/exec" + "runtime/debug" + "strings" + "sync" + "syscall" + "time" + "unsafe" + + "github.com/prometheus/client_golang/prometheus" + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + dnsLock sync.Mutex + ready bool + Detail Detail + pluginmanager.NodePlugin +} + +// Detail xxx +type Detail struct { +} + +var ( + netAvailabilityLabels = []string{"node", "targetnode", "status"} + netAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "net_availability", + Help: "net_availability, 1 means OK", + }, netAvailabilityLabels) +) + +func init() { + metricmanager.Register(netAvailability) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.StopChan = make(chan int) + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + 
+ return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.CheckLock.Lock() + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + p.CheckLock.Unlock() + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return "netcheck" +} + +// Check xxx +func (p *Plugin) Check() { + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + p.ready = false + result := make([]pluginmanager.CheckItem, 0, 0) + nodeconfig := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := nodeconfig.NodeName + + defer func() { + if r := recover(); r != nil { + klog.Errorf("netcheck failed: %s, stack: %v\n", r, string(debug.Stack())) + } + p.Result = pluginmanager.CheckResult{ + Items: result, + } + + if !p.ready { + p.ready = true + } + }() + + gaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + defer func() { + metricmanager.RefreshMetric(netAvailability, gaugeVecSetList) + }() + + cidr := nodeconfig.Node.Spec.PodCIDR + for key, val := range nodeconfig.Node.Annotations { + if key == "tke.cloud.tencent.com/pod-cidrs" { + cidr = val + break + } + } + // 检测网卡配置 + devStatus, err := CheckDevIP(cidr) + if err != nil { + klog.Errorf(err.Error()) + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: false, + Detail: fmt.Sprintf("check interface failed: %s", err.Error()), + Status: devStatus, + }) + + gaugeVecSetList = append(gaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeconfig.NodeName, nodeconfig.NodeName, devStatus}, + Value: float64(1), + }) + return + } + + // 检查节点的容器网络 + // checkitem上报是否有pod ping不通 + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: true, + Detail: "ping dns pod success", + Status: NormalStatus, + } + + status, err := CheckOverLay(nodeconfig.ClientSet) + if err != nil { + 
checkItem.Normal = false + checkItem.Detail = err.Error() + checkItem.Status = status + klog.Errorf(err.Error()) + } + + result = append(result, checkItem) + gaugeVecSetList = append(gaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeconfig.NodeName, nodeconfig.NodeName, status}, + Value: float64(1), + }) +} + +// CheckOverLay xxx +func CheckOverLay(clientSet *kubernetes.Clientset) (string, error) { + //测试访问dns pod是否OK + ipList := make([]string, 0, 0) + ep, err := clientSet.CoreV1().Endpoints("kube-system").Get(util.GetCtx(10*time.Second), "kube-dns", v1.GetOptions{ResourceVersion: "0"}) + if err != nil { + return errorStatus, err + } + + for _, subset := range ep.Subsets { + for _, address := range subset.Addresses { + ipList = append(ipList, address.IP) + } + } + + for _, ip := range ipList { + pingStatus := PINGCheck(ip) + if pingStatus != NormalStatus { + return pingStatus, fmt.Errorf("ping failed: %s", ip) + } + } + + klog.Infof("olveray ping %v success", ipList) + + return NormalStatus, nil +} + +// PINGCheck xxx +func PINGCheck(ip string) string { + pingCmd := exec.Command("ping", "-c1", "-W1", ip) + output, err := pingCmd.CombinedOutput() + if err != nil { + klog.Error(string(output), err.Error()) + return PingFailedStatus + } + + return NormalStatus +} + +// GetIFAddr xxx +func GetIFAddr(ifName string) (net.IP, error) { + sock, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0) + if err != nil { + return nil, err + } + defer func() { + err = syscall.Close(sock) + if err != nil { + klog.Infof("close sock failed: %s ", err.Error()) + } + }() + + ifreq, err := unix.NewIfreq(ifName) + + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(sock), uintptr(unix.SIOCGIFADDR), uintptr(unsafe.Pointer(ifreq))) + if errno != 0 { + return nil, errno + } + + addr, err := ifreq.Inet4Addr() + if err != nil { + return nil, err + } + + ip := net.IPv4(addr[0], addr[1], addr[2], addr[3]) + return ip, err +} + +// CheckDevIP 
目前只兼容cni0与flannel的场景 +func CheckDevIP(cidr string) (string, error) { + _, subnet, _ := net.ParseCIDR(cidr) + + ip, _, linkName, err := GetLinkIp("bridge") + if err != nil { + if err.Error() == "not found" { + return NormalStatus, nil + } + return errorStatus, err + } + + if !subnet.Contains(ip) { + return devDistinctStatus, fmt.Errorf("node cidr is %s, bridge %s is %s", cidr, linkName, ip) + } else { + klog.Infof("check netinterface success, cidr: %s, bridge %s: %s", cidr, linkName, ip) + } + + vxLanIp, _, linkName, err := GetLinkIp("vxlan") + if err != nil { + if err.Error() == "not found" { + return NormalStatus, nil + } + return errorStatus, err + } + + if !subnet.Contains(vxLanIp) { + return devDistinctStatus, fmt.Errorf("node cidr is %s, vxlan %s is %s", cidr, linkName, vxLanIp) + } else { + klog.Infof("check netinterface success, cidr: %s, vxlanIP %s: %s", cidr, linkName, vxLanIp) + } + + return NormalStatus, nil +} + +// GetLinkIp xxx +func GetLinkIp(deviceType string) (net.IP, net.IPMask, string, error) { + links, err := netlink.LinkList() + if err != nil { + return nil, nil, "", err + } + + for _, link := range links { + if strings.Contains(link.Attrs().Name, "docker") { + continue + } + if link.Type() == deviceType { + + addrs, err := netlink.AddrList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, link.Attrs().Name, err + } + for _, addr := range addrs { + return addr.IP, addr.Mask, link.Attrs().Name, nil + } + } + } + + return nil, nil, "", fmt.Errorf("not found") +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/netcheck_test.go 
b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/netcheck_test.go new file mode 100644 index 0000000000..abac8bfbed --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/netcheck_test.go @@ -0,0 +1,22 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package netcheck xxx +package netcheck + +import ( + "testing" +) + +func TestGetIFAddr(t *testing.T) { + GetIFAddr("eth1") +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/options.go similarity index 74% rename from bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/options.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/options.go index ddb58960bd..4b49aabab3 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck/options.go @@ -10,18 +10,20 @@ * limitations under the License. 
// defaultLabelSelector is the label selector applied when none is configured.
const defaultLabelSelector = "name=nodeagent"

// Options is the configuration of the netcheck plugin.
type Options struct {
	// Interval is the number of seconds between two check rounds.
	Interval int `json:"interval" yaml:"interval"`
	// LabelSelector is a label selector string; Validate fills in a default
	// when it is empty.
	LabelSelector string `json:"labelSelector" yaml:"labelSelector"`
}

// Validate fills in defaults for unset options. It never fails and always
// returns nil.
func (o *Options) Validate() error {
	if len(o.LabelSelector) == 0 {
		o.LabelSelector = defaultLabelSelector
	}

	return nil
}
// Exported info item types reported by the nodeinfocheck plugin.
const (
	ZoneItemType         = "zone"
	RegionItemType       = "region"
	InstanceTypeItemType = "instance type"
)

// Package-internal names of the nodeinfocheck plugin.
const (
	pluginName     = "nodeinfocheck"
	normalStatus   = "ok"
	errorStatus    = "error"
	nodeItemTarget = "node"

	// initContent is the default configuration content.
	initContent = `interval: 600`
)

// ChinenseStringMap maps plugin identifiers to their Chinese display text.
var ChinenseStringMap = map[string]string{
	pluginName:           "节点信息检查",
	nodeItemTarget:       "节点",
	ZoneItemType:         "可用区",
	RegionItemType:       "地域",
	InstanceTypeItemType: "机型",
	normalStatus:         normalStatus,
}

// EnglishStringMap maps plugin identifiers to their English display text.
var EnglishStringMap = map[string]string{
	pluginName:           pluginName,
	nodeItemTarget:       nodeItemTarget,
	ZoneItemType:         ZoneItemType,
	RegionItemType:       RegionItemType,
	InstanceTypeItemType: InstanceTypeItemType,
	normalStatus:         normalStatus,
}

// StringMap is the display text table in use; it defaults to the Chinese one.
var StringMap = ChinenseStringMap
// init registers a zero-value nodeinfocheck Plugin with the plugin manager
// when the package is imported.
func init() {
	pluginmanager.Register(&Plugin{})
}
+ */ + +// Package nodeinfocheck xxx +package nodeinfocheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + pluginmanager "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "os" + "path" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/api/qcloud" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + ready bool + pluginmanager.NodePlugin + Detail Detail +} + +// Detail xxx +type Detail struct { +} + +var ( + nodeMetadataLabel = []string{"node", "item", "value"} + nodeMetadataMetric = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "node_metadata", + Help: "node_metadata, 1 means OK", + }, nodeMetadataLabel) +) + +func init() { + metricmanager.Register(nodeMetadataMetric) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.StopChan = make(chan int) + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + if err != nil { + klog.Fatalf("%s get incluster config failed, only can run as incluster mode", p.Name()) + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx 
+func (p *Plugin) Stop() error { + p.CheckLock.Lock() + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + p.CheckLock.Unlock() + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check for node's platform info +func (p *Plugin) Check() { + result := pluginmanager.CheckResult{ + Items: make([]pluginmanager.CheckItem, 0, 0), + InfoItemList: make([]pluginmanager.InfoItem, 0, 0), + } + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + p.ready = false + + nodeconfig := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := nodeconfig.NodeName + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + + // qcloudinfo + _, err := os.Stat(path.Join(nodeconfig.HostPath, "/etc/cloud/cloud.cfg")) + if err != nil { + klog.Error("now only support qcloud cvm get nodeinfo, skip, %s", err.Error()) + } else { + nodeMetadata, err := qcloud.GetQcloudNodeMetadata() + if err != nil { + klog.Errorf("get cvm info failed: %s", err.Error()) + if !os.IsNotExist(err) { + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("get nodeinfo failed: %s", err.Error()), + Status: errorStatus, + } + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, ZoneItemType, errorStatus}, Value: float64(1), + }) + result.Items = append(result.Items, checkItem) + } + } else { + klog.Infof("node metadata is %v", *nodeMetadata) + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, ZoneItemType, nodeMetadata.Zone}, Value: float64(1), + }) + result.InfoItemList = append(result.InfoItemList, pluginmanager.InfoItem{ + ItemName: ZoneItemType, + Labels: map[string]string{"type": ZoneItemType}, + Result: nodeMetadata.Zone, + }) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, RegionItemType, nodeMetadata.Region}, 
Value: float64(1), + }) + result.InfoItemList = append(result.InfoItemList, pluginmanager.InfoItem{ + ItemName: RegionItemType, + Labels: map[string]string{"type": RegionItemType}, + Result: nodeMetadata.Region, + }) + + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, InstanceTypeItemType, nodeMetadata.InstanceType}, Value: float64(1), + }) + result.InfoItemList = append(result.InfoItemList, pluginmanager.InfoItem{ + ItemName: InstanceTypeItemType, + Labels: map[string]string{"type": InstanceTypeItemType}, + Result: nodeMetadata.InstanceType, + }) + } + } + + metricmanager.RefreshMetric(nodeMetadataMetric, gvsList) + p.Result = result + + if !p.ready { + p.ready = true + } +} + +// GetResult return check result by cluster ID +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} + +// Ready return true if cluster check is over +func (p *Plugin) Ready(string) bool { + return p.ready +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/nodeinfocheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/nodeinfocheck/options.go new file mode 100644 index 0000000000..fe880d6956 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/nodeinfocheck/options.go @@ -0,0 +1,25 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
// Options is the configuration of the nodeinfocheck plugin.
type Options struct {
	// Interval is the number of seconds between two check rounds.
	Interval int `json:"interval" yaml:"interval"`
}

// Validate checks the options. It currently accepts every configuration and
// always returns nil.
func (o *Options) Validate() error {

	return nil
}
// Exported status and target names of the processcheck plugin.
const (
	NormalStatus  = "ok"
	ProcessTarget = "process"
)

// Package-internal names of the processcheck plugin.
const (
	pluginName                      = "processcheck"
	processStatusCheckItemType      = "process status check"
	processNotFoundStatus           = "process notfound"
	processConfigFileNotFoundStatus = "configfile_notfound"
	processOtherErrorStatus         = "error"
	zStatus                         = "z"
	dStatus                         = "d"

	// initContent is the default configuration content.
	initContent = `interval: 600`
)

// ChinenseStringMap maps plugin identifiers to their Chinese display text.
var ChinenseStringMap = map[string]string{
	pluginName:                      "进程检查",
	ProcessTarget:                   "进程",
	NormalStatus:                    "正常",
	processStatusCheckItemType:      "进程状态检查",
	processNotFoundStatus:           "进程不存在",
	processConfigFileNotFoundStatus: "配置文件不存在",
	processOtherErrorStatus:         processOtherErrorStatus,
	zStatus:                         zStatus,
	dStatus:                         dStatus,
}

// EnglishStringMap maps plugin identifiers to their English display text.
var EnglishStringMap = map[string]string{
	pluginName:                      pluginName,
	ProcessTarget:                   ProcessTarget,
	NormalStatus:                    NormalStatus,
	processStatusCheckItemType:      processStatusCheckItemType,
	processNotFoundStatus:           processNotFoundStatus,
	processConfigFileNotFoundStatus: processConfigFileNotFoundStatus,
	processOtherErrorStatus:         processOtherErrorStatus,
	zStatus:                         zStatus,
	dStatus:                         dStatus,
}

// StringMap is the display text table in use; it defaults to the Chinese one.
var StringMap = ChinenseStringMap
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package processcheck xxx +package processcheck + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + +func init() { + pluginmanager.Register(&Plugin{}) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/options.go new file mode 100644 index 0000000000..9f7bfbe68b --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/options.go @@ -0,0 +1,60 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package processcheck xxx +package processcheck + +// Options bcs log options +type Options struct { + Interval int `json:"interval" yaml:"interval"` + Processes []ProcessCheckConfig `json:"processes" yaml:"processes"` +} + +// ProcessCheckConfig xxx +type ProcessCheckConfig struct { + Name string `json:"name" yaml:"name"` + ConfigFile string `json:"configFile" yaml:"configFile"` +} + +// Validate validate options +func (o *Options) Validate() error { + if o.Processes == nil { + o.Processes = []ProcessCheckConfig{ + {Name: "kubelet"}, + {Name: "containerd", ConfigFile: "/etc/containerd/config.toml"}, + {Name: "dockerd", ConfigFile: "/etc/docker/daemon.json"}, + } + } + + o.Processes = removeDuplicates(o.Processes) + return nil +} + +func removeDuplicates(pList []ProcessCheckConfig) []ProcessCheckConfig { + result := []ProcessCheckConfig{} + + for _, p1 := range pList { + flag := false + for _, p2 := range result { + if p1.Name == p2.Name && p1.ConfigFile == p2.ConfigFile { + flag = true + break + } + } + + if !flag { + result = append(result, p1) + } + } + + return result +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/processcheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/processcheck.go new file mode 100644 index 0000000000..cbabf3a9f2 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck/processcheck.go @@ -0,0 +1,365 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package processcheck xxx +package processcheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "os" + "path" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/types/process" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +var ( + processStatusLabels = []string{"name", "status", "node"} + processStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "process_status", + Help: "process_status", + }, processStatusLabels) + processCPU = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "agent_process_cpu_seconds_total", + Help: "agent_process_cpu_seconds_total, 1 means OK", + }, []string{"name", "node"}) + abnormalProcessStatusMap = make(map[int32]process.ProcessStatus) + + processCPUCheckFlag = false + processCPUCheckMap = make(map[int32]int32) + processCPUCheckMapLock sync.Mutex +) + +func init() { + metricmanager.Register(processStatus) + metricmanager.Register(processCPU) +} + +// Plugin xxx +type Plugin struct { + opt *Options + ready bool + + // detail用来记录详细的检查信息到configmap,提供给cluster-reporter做进一步分析 + Detail Detail + pluginmanager.NodePlugin +} + +// Detail xxx +type Detail struct { + ProcessInfo []process.ProcessInfo + ProcessStatus []process.ProcessStatus +} + +// ProcessCheckResult xxx +type ProcessCheckResult struct { + Node string 
`yaml:"node"` + Status string `yaml:"status"` + Name string `yaml:"name"` +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + p.Detail = Detail{} + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 1 + processCPUCheckFlag = false + klog.Infof("plugin %s stopped", p.Name()) + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check xxx +func (p *Plugin) Check() { + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + nodeconfig := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := nodeconfig.NodeName + + processInfoList := make([]process.ProcessInfo, 0, 0) + processGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + + processStatusList, err := process.GetProcessStatus() + if err != nil { + klog.Errorf("Get process status failed: %s", err.Error()) + } + + // 检测所有进程状态 + newAbnormalProcessStatusMap := make(map[int32]process.ProcessStatus) + abnormalProcessStatusList := make([]process.ProcessStatus, 0, 0) + processCPUGVSList := make([]*metricmanager.GaugeVecSet, 0, 
0) + for _, pstatus := range processStatusList { + if pstatus.Status == "D" || pstatus.Status == "Z" { + klog.Infof("status of process %s %s is %s", pstatus.Pid, pstatus.Name, pstatus.Status) + // 避免如正常的等待IO被计入D状态进程 + newAbnormalProcessStatusMap[pstatus.Pid] = pstatus + if abnormalProcessStatus, ok := abnormalProcessStatusMap[pstatus.Pid]; ok && pstatus.Status == "D" { + if abnormalProcessStatus.Pid == pstatus.Pid && abnormalProcessStatus.CreateTime == pstatus.CreateTime && abnormalProcessStatus.CpuTime == pstatus.CpuTime { + // cputime didn't increase, means process stayed in D status in this interval + processGaugeVecSetList = append(processGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{pstatus.Name, pstatus.Status, nodeName}, Value: float64(1), + }) + + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("%s process %s status is %s", nodeName, pstatus.Name, pstatus.Status), + Level: pluginmanager.WARNLevel, + Status: dStatus, + }) + abnormalProcessStatusList = append(abnormalProcessStatusList, pstatus) + } + } else if pstatus.Status == "Z" { + processGaugeVecSetList = append(processGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{pstatus.Name, pstatus.Status, nodeName}, Value: float64(1), + }) + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("%s process %s status is %s", nodeName, pstatus.Name, pstatus.Status), + Level: pluginmanager.WARNLevel, + Status: zStatus, + }) + abnormalProcessStatusList = append(abnormalProcessStatusList, pstatus) + } + } + } + + RecordProcessCpu([]string{"kswapd"}, processStatusList) + + if processCPUCheckFlag { + go func() { + for { + processCPUCheckMapLock.Lock() + defer func() { + processCPUCheckMapLock.Unlock() + }() + if !processCPUCheckFlag { + return + } + + for _, pid := range processCPUCheckMap { + pStatus, err := 
process.GetProcessStatusByPID(pid) + if err != nil { + klog.Errorf(err.Error()) + continue + } + + processCPUGVSList = append(processCPUGVSList, &metricmanager.GaugeVecSet{ + Labels: []string{pStatus.Name, nodeName}, + Value: pStatus.CpuTime, + }) + } + + metricmanager.RefreshMetric(processCPU, processCPUGVSList) + time.Sleep(time.Second * 30) + + } + }() + } + + abnormalProcessStatusMap = newAbnormalProcessStatusMap + + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + Normal: true, + Detail: "", + Level: pluginmanager.WARNLevel, + Status: NormalStatus, + } + // status中只记录异常的进程状态 + p.Detail.ProcessStatus = abnormalProcessStatusList + + // 检测opt中指定的进程,记录进程详细信息 + for _, pcc := range p.opt.Processes { + checkItem.ItemTarget = nodeName + processInfo, processErr := process.GetProcessInfo(pcc.Name, 0) + if processErr != nil { + klog.Errorf("Get process %s info failed: %s", pcc.Name, processErr.Error()) + checkItem.Detail = fmt.Sprintf("Get process %s info failed: %s", pcc.Name, processErr.Error()) + checkItem.Normal = false + checkItem.Status = processOtherErrorStatus + + result = append(result, checkItem) + + processGaugeVecSetList = append(processGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{pcc.Name, processOtherErrorStatus, nodeName}, Value: float64(1), + }) + + continue + } + + if pcc.ConfigFile != "" { + configFile, configFileErr := getConfigFile(pcc) + if configFileErr != nil { + klog.Errorf(configFileErr.Error()) + + checkItem.Normal = false + checkItem.Detail = fmt.Sprintf("Get process %s info failed: %s", pcc.Name, configFileErr.Error()) + checkItem.ItemTarget = nodeName + checkItem.Status = processOtherErrorStatus + + result = append(result, checkItem) + + processGaugeVecSetList = append(processGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{pcc.Name, processOtherErrorStatus, nodeName}, Value: float64(1), + }) + } else if pcc.ConfigFile != "" { + processInfo.ConfigFiles[pcc.ConfigFile] = configFile + } + } + + 
if processInfo != nil { + processInfoList = append(processInfoList, *processInfo) + } + } + + if len(processGaugeVecSetList) == 0 { + checkItem.ItemTarget = nodeName + processGaugeVecSetList = append(processGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{"", NormalStatus, nodeName}, Value: float64(1), + }) + + result = append(result, checkItem) + } + + // info中记录所有指定的进程信息 + p.Detail.ProcessInfo = processInfoList + + p.Result = pluginmanager.CheckResult{ + Items: result, + } + + if !p.ready { + p.ready = true + } + + // return result + metricmanager.RefreshMetric(processStatus, processGaugeVecSetList) +} + +// RecordProcessCpu xxx +func RecordProcessCpu(processNameList []string, processStatusList []process.ProcessStatus) { + for _, pstatus := range processStatusList { + for _, name := range processNameList { + if strings.Contains(pstatus.Name, name) { + if _, ok := processCPUCheckMap[pstatus.Pid]; !ok { + processCPUCheckMapLock.Lock() + processCPUCheckMap[pstatus.Pid] = pstatus.Pid + processCPUCheckMapLock.Unlock() + } + } + } + } +} + +// GetConfigfile xxx +func getConfigFile(p ProcessCheckConfig) (string, error) { + if p.ConfigFile != "" { + data, err := os.ReadFile(path.Join(os.Getenv("HOST_PATH"), p.ConfigFile)) + if err != nil { + return "", err + } + return string(data), nil + } else { + return "", nil + } +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetString xxx +func (p *Plugin) GetString(key string) string { + return StringMap[key] +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/const.go new file mode 100644 index 0000000000..0a3b12674b 
--- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/const.go @@ -0,0 +1,42 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package timecheck xxx +package timecheck + +const ( + pluginName = "timecheck" + timeCheckTarget = pluginName + NormalStatus = "ok" + timeErrorStatus = "error" + timeOffsetErrorStatus = "hugeoffset" + initContent = `interval: 600 +timeServers: "time1.tencentyun.com,time2.tencentyun.com,time3.tencentyun.com,time4.tencentyun.com,time5.tencentyun.com"` +) + +var ( + ChinenseStringMap = map[string]string{ + pluginName: "时间检查", + timeOffsetErrorStatus: "时间偏移过大", + timeErrorStatus: "检测时间失败", + NormalStatus: "正常", + } + + EnglishStringMap = map[string]string{ + pluginName: pluginName, + timeOffsetErrorStatus: timeOffsetErrorStatus, + timeErrorStatus: timeErrorStatus, + NormalStatus: NormalStatus, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/init.go new file mode 100644 index 0000000000..e4f73a553b --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/init.go @@ -0,0 +1,20 @@ +/* + * Tencent is pleased to support the open source community by making Blueking 
Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package timecheck xxx +package timecheck + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + +func init() { + pluginmanager.Register(&Plugin{}) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/options.go new file mode 100644 index 0000000000..9e3d63f2a2 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/options.go @@ -0,0 +1,25 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package timecheck xxx +package timecheck + +// Options bcs log options +type Options struct { + Interval int `json:"interval" yaml:"interval"` + TimeServers string `json:"timeServers" yaml:"timeServers"` +} + +// Validate validate options +func (o *Options) Validate() error { + return nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck.go new file mode 100644 index 0000000000..9bac7e5be4 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck.go @@ -0,0 +1,225 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package timecheck xxx +package timecheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "strings" + "time" + + "github.com/beevik/ntp" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + ready bool + pluginmanager.NodePlugin + Detail Detail +} + +// Detail xxx +type Detail struct { +} + +var ( + ntpAvailabilityLabels = []string{"node", "status"} + ntpAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "ntp_availability", + Help: "ntp_availability, 1 means OK", + }, ntpAvailabilityLabels) +) + +func init() { + metricmanager.Register(ntpAvailability) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.StopChan = make(chan int) + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.CheckLock.Lock() + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + p.CheckLock.Unlock() + return nil +} + 
+// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check xxx +func (p *Plugin) Check() { + result := make([]pluginmanager.CheckItem, 0, 0) + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + }() + + nodeconfig := pluginmanager.Pm.GetConfig().NodeConfig + nodeName := nodeconfig.NodeName + p.ready = false + + var gaugeVecSet *metricmanager.GaugeVecSet + + servers := strings.Split(p.opt.TimeServers, ",") + offset, err := GetTimeOffset(servers[rand.Intn(len(servers)-1)]) + if err != nil { + klog.Errorf("get time offset failed: %s", err.Error()) + gaugeVecSet = &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, timeErrorStatus}, + Value: 0, + } + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Normal: false, + Detail: fmt.Sprintf("get time offset failed: %s", err.Error()), + Status: timeErrorStatus, + }) + } else { + klog.Infof("%s result is %.8fs", p.Name(), float64(offset/time.Second)) + + if offset > 3*time.Second { + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Normal: false, + Detail: fmt.Sprintf("%s offset is %v", nodeName, offset), + Status: timeOffsetErrorStatus, + }) + + gaugeVecSet = &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, timeOffsetErrorStatus}, + Value: float64(offset) / float64(time.Second), + } + } + } + + if len(result) == 0 { + gaugeVecSet = &metricmanager.GaugeVecSet{ + Labels: []string{nodeName, "ok"}, + Value: float64(offset) / float64(time.Second), + } + + result = append(result, pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: nodeName, + Level: pluginmanager.RISKLevel, + Status: pluginmanager.NormalStatus, + Normal: true, + Detail: fmt.Sprintf("%s offset is %v", nodeName, offset), + }) + + } + metricmanager.RefreshMetric(ntpAvailability, 
[]*metricmanager.GaugeVecSet{gaugeVecSet}) + p.Result = pluginmanager.CheckResult{ + Items: result, + } + + if !p.ready { + p.ready = true + } + +} + +// GetTimeOffset xxx +func GetTimeOffset(timeserver string) (time.Duration, error) { + localTime := time.Now() + + ntpServer := timeserver + + ntpTime, err := ntp.Time(ntpServer) + if err != nil { + return 0, err + } + + diff := ntpTime.Sub(localTime) + + return diff, nil +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return p.Result +} + +// Execute xxx +func (p *Plugin) Execute() { + p.Check() +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return p.Detail +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck_test.go new file mode 100644 index 0000000000..3f08eb107f --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck/timecheck_test.go @@ -0,0 +1,27 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package timecheck xxx +package timecheck + +import ( + "fmt" + "testing" +) + +func TestKubernetes(t *testing.T) { + offset, err := GetTimeOffset("0.pool.ntp.org") + if err != nil { + t.Errorf(err.Error()) + } + fmt.Println(offset) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/const.go new file mode 100644 index 0000000000..d23d4b85e9 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/const.go @@ -0,0 +1,26 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package uploader xxx +package uploader + +const ( + pluginName = "uploader" +) + +var ( + ChinenseStringMap = map[string]string{} + + EnglishStringMap = map[string]string{} + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/init.go new file mode 100644 index 0000000000..cfbb566275 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/init.go @@ -0,0 +1,20 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. 
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package uploader xxx +package uploader + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + +func init() { + pluginmanager.Register(&Plugin{}) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/options.go similarity index 65% rename from bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/options.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/options.go index ea12b69165..ff991570b4 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/options.go @@ -10,24 +10,37 @@ * limitations under the License. 
*/ -package logrecorder +// Package uploader xxx +package uploader + +const ( + initContent = `interval: 600 +` +) // Options bcs log options type Options struct { - Interval int `yaml:"interval" json:"interval"` - Synchronization bool `yaml:"synchronization" json:"synchronization"` - LogRules []LogRule `yaml:"rules" json:"rules"` -} - -// LogRule xxx -type LogRule struct { - RegexStr string `yaml:"regexStr" json:"regex_str"` - LabelList []string `yaml:"labels" json:"labels"` - Name string `yaml:"name" json:"name"` + Interval int `json:"interval" yaml:"interval"` + Type string `json:"type" yaml:"type"` + CopyNum int `json:"copyNum" yaml:"copyNum"` + Namespace string `json:"namespace" yaml:"namespace"` } // Validate validate options func (o *Options) Validate() error { + if o.Type == "" { + o.Type = "k8s" + } + + if o.Interval == 0 { + o.Interval = 300 + } + + if o.CopyNum == 0 { + o.CopyNum = 1 + } + + o.Namespace = "nodeagent" return nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/uploader.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/uploader.go new file mode 100644 index 0000000000..a29f9bd15b --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/uploader/uploader.go @@ -0,0 +1,301 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package uploader xxx +package uploader + +import ( + "os" + "sort" + "strconv" + "strings" + "time" + + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin uploader +type Plugin struct { + opt *Options + pluginmanager.NodePlugin + ready bool +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return nil +} + +// Setup plugin +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = &Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err + } + + if err = p.opt.Validate(); err != nil { + return err + } + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + // run as daemon + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } + + return nil +} + +// Check xxx +func (p *Plugin) Check() { + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + p.ready = false + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + p.ready = true + }() + + pluginstr := strings.Replace(pluginmanager.Pm.GetPluginstr(), p.Name(), "", 1) + pluginstr = strings.Replace(pluginstr, 
",,", ",", 1) + pluginmanager.Pm.Ready(pluginstr, "") + checkResult := pluginmanager.Pm.GetNodeResult(pluginmanager.Pm.GetPluginstr()) + checkDetail := pluginmanager.Pm.GetNodeDetail(pluginmanager.Pm.GetPluginstr()) + uploadResult := make(map[string]pluginmanager.PluginInfo) + + for _, name := range strings.Split(pluginmanager.Pm.GetPluginstr(), ",") { + uploadResult[name] = pluginmanager.PluginInfo{ + Result: checkResult[name], + Detail: checkDetail[name], + } + } + + nodeinfoData, _ := yaml.Marshal(uploadResult) + + nodeName := os.Getenv("NODE_NAME") + if nodeName == "" { + klog.Errorf("env NODE_NAME must be set, skip upload") + return + } + + restConfig, err := k8s.GetRestConfig() + if err != nil { + klog.Errorf(err.Error()) + return + } + + cs, err := kubernetes.NewForConfig(restConfig) + if err != nil { + klog.Errorf(err.Error()) + return + } + + if p.opt.Type == "k8s" { + ctx := util.GetCtx(time.Second * 10) + + cmList, err := cs.CoreV1().ConfigMaps(p.opt.Namespace).List(ctx, v1.ListOptions{ + ResourceVersion: "0", LabelSelector: "nodeagent=" + nodeName}) + if err != nil { + klog.Errorf(err.Error()) + return + } + + sort.Slice(cmList.Items, func(i, j int) bool { + return cmList.Items[i].Name > cmList.Items[j].Name + }) + + // 获取当前所有的configmap + versionCMList := make(map[int]corev1.ConfigMap) + for _, cm := range cmList.Items { + if !strings.Contains(cm.Name, nodeName+"-v") { + continue + } + + if len(strings.Split(cm.Name, nodeName+"-v")) != 2 { + continue + } + + version, err := strconv.Atoi(strings.Split(cm.Name, nodeName+"-v")[1]) + if err != nil { + klog.Errorf(err.Error()) + return + } + + if version > p.opt.CopyNum { + ctx = util.GetCtx(time.Second * 10) + err = cs.CoreV1().ConfigMaps(p.opt.Namespace).Delete(ctx, cm.Name, v1.DeleteOptions{}) + if err != nil { + klog.Errorf(err.Error()) + return + } + + } + versionCMList[version] = cm + } + + for version, cm := range versionCMList { + if version == p.opt.CopyNum { + //版本最大的configmap无需处理 + continue + } + 
//滚动当前configmap的内容 + if targetCM, ok := versionCMList[version+1]; ok { + //apiVersion := "v1" + //kind := "ConfigMap" + + targetCM.Data = cm.Data + + ctx = util.GetCtx(time.Second * 10) + _, err = cs.CoreV1().ConfigMaps(p.opt.Namespace).Update(ctx, &targetCM, v1.UpdateOptions{}) + if err != nil { + klog.Errorf(err.Error()) + return + } + + if err != nil { + klog.Errorf(err.Error()) + return + } + + } else { + newCm := corev1.ConfigMap{ + ObjectMeta: v1.ObjectMeta{ + Name: nodeName + "-v" + strconv.Itoa(version+1), + Namespace: p.opt.Namespace, + Labels: map[string]string{ + "nodeagent": nodeName, + }, + }, + Data: cm.Data, + } + + ctx = util.GetCtx(time.Second * 10) + _, err = cs.CoreV1().ConfigMaps(p.opt.Namespace).Create(ctx, &newCm, v1.CreateOptions{}) + if err != nil { + klog.Errorf(err.Error()) + return + } + } + + } + + _, ok := versionCMList[1] + + // 如果**-v1不存在则创建 + if len(versionCMList) == 0 || !ok { + // 新建configmap + newCm := corev1.ConfigMap{ + ObjectMeta: v1.ObjectMeta{ + Name: nodeName + "-v1", + Namespace: p.opt.Namespace, + Labels: map[string]string{ + "nodeagent": nodeName, + }, + }, + } + + newCm.Data = map[string]string{ + "nodeinfo": string(nodeinfoData), + "updateTime": time.Now().Format("2006-01-02 15:04:05.999999999 -0700 MST"), + "nodename": nodeName, + } + + ctx = util.GetCtx(time.Second * 10) + _, err = cs.CoreV1().ConfigMaps(p.opt.Namespace).Create(ctx, &newCm, v1.CreateOptions{}) + if err != nil { + klog.Errorf(err.Error()) + return + } + } else { + // 存在则直接patch + //apiVersion := "v1" + //kind := "ConfigMap" + targetCM := versionCMList[1] + + targetCM.Data = map[string]string{ + "nodeinfo": string(nodeinfoData), + "updateTime": time.Now().Format("2006-01-02 15:04:05.999999999 -0700 MST"), + "nodename": nodeName, + } + + ctx = util.GetCtx(time.Second * 10) + _, err = cs.CoreV1().ConfigMaps(p.opt.Namespace).Update(ctx, &targetCM, v1.UpdateOptions{}) + if err != nil { + klog.Errorf(err.Error()) + return + } + + if err != nil { + 
klog.Errorf(err.Error()) + return + } + } + } +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Ready xxx +func (p *Plugin) Ready(string) bool { + return p.ready +} + +// GetResult xxx +func (p *Plugin) GetResult(string) pluginmanager.CheckResult { + return pluginmanager.CheckResult{} +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 0 + return nil +} + +// GetString xxx +func (p *Plugin) GetString(key string) string { + return StringMap[key] +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/checkfunctions.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/checkfunctions.go new file mode 100644 index 0000000000..a0dfa454f7 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/checkfunctions.go @@ -0,0 +1,116 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package nodecheck xxx +package nodecheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/types/process" + "path/filepath" + "strings" +) + +func checkProcess(detail processcheck.Detail, nodeName string) []pluginmanager.CheckItem { + result := make([]pluginmanager.CheckItem, 0, 0) + + // 做一些通用的检查 + // TODO 后续考虑支持在nodeagent中配置自定义参数检查项 + // 有问题的检查项添加到result + for _, processInfo := range detail.ProcessInfo { + switch filepath.Base(processInfo.BinaryPath) { + case "dockerd": + result = append(result, checkDocker(processInfo)...) + case "kubelet": + result = append(result, checkKubelet(processInfo)...) + } + } + + for key, val := range result { + val.ItemTarget = nodeName + result[key] = val + } + + return result +} + +func checkDocker(processInfo process.ProcessInfo) []pluginmanager.CheckItem { + checkItem := pluginmanager.CheckItem{ + ItemName: processConfigCheckItem, + Status: normalStatus, + Level: pluginmanager.WARNLevel, + Detail: fmt.Sprintf(StringMap[ConfigFileDetail], "docker daemon.json"), + Normal: true, + } + + result := make([]pluginmanager.CheckItem, 0, 0) + for fileName, configfile := range processInfo.ConfigFiles { + if filepath.Base(fileName) == "daemon.json" { + if !strings.Contains(configfile, "data-root") && !strings.Contains(configfile, "\"graph\"") { + checkItem.Status = configErrorStatus + checkItem.Normal = false + checkItem.Detail = fmt.Sprintf(StringMap[flagNotSetDetail], "data-root,graph") + } + result = append(result, checkItem) + } + } + + checkItem = pluginmanager.CheckItem{ + ItemName: processConfigCheckItem, + Status: normalStatus, + Level: pluginmanager.WARNLevel, + Detail: fmt.Sprintf(StringMap[ConfigFileDetail], "docker service"), + Normal: true, + } + for fileName, serviceFile := range 
processInfo.ServiceFiles { + if strings.HasSuffix(filepath.Base(fileName), ".service") { + if strings.Contains(serviceFile, "BindsTo") { + checkItem.Status = configErrorStatus + checkItem.Detail = fmt.Sprintf(StringMap[flagNotSetDetail], "BindsTo") + } + result = append(result, checkItem) + } + } + + return result +} + +func checkKubelet(processInfo process.ProcessInfo) []pluginmanager.CheckItem { + checkItem := pluginmanager.CheckItem{ + ItemName: processConfigCheckItem, + Level: pluginmanager.WARNLevel, + Normal: true, + } + + result := make([]pluginmanager.CheckItem, 0, 0) + + flags := []string{"--root-dir", "--read-only-port=0"} + for _, param := range processInfo.Params { + for index, flag := range flags { + if strings.Contains(param, flag) && flag != "" { + flags[index] = "" + } + } + } + + for _, flag := range flags { + if flag != "" { + checkItem.Status = ConfigNotFoundStatus + checkItem.Detail = "kubelet " + fmt.Sprintf(StringMap[flagNotSetDetail], flag) + result = append(result, checkItem) + } + } + + return result +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/const.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/const.go new file mode 100644 index 0000000000..70431918ee --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/const.go @@ -0,0 +1,54 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. 
See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package nodecheck xxx +package nodecheck + +const ( + initContent = `interval: 3600` + nodeagentNamespace = "nodeagent" + pluginName = "nodecheck" + configErrorStatus = "config error" + ConfigNotFoundStatus = "confignotfound" + ConfigFileDetail = "%s config file" + + normalStatus = "ok" + + flagNotSetDetail = "%s is not set, which is recommanded" + + processConfigCheckItem = "process config check" +) + +// 统一format格式 +var ( + ChinenseStringMap = map[string]string{ + processConfigCheckItem: "进程配置检查", + configErrorStatus: "配置错误", + ConfigNotFoundStatus: "配置不存在", + flagNotSetDetail: "%s 参数没有配置,推荐配置", + normalStatus: "正常", + pluginName: "节点检查", + ConfigFileDetail: "%s 配置文件", + } + + EnglishStringMap = map[string]string{ + processConfigCheckItem: processConfigCheckItem, + configErrorStatus: configErrorStatus, + ConfigNotFoundStatus: ConfigNotFoundStatus, + flagNotSetDetail: flagNotSetDetail, + normalStatus: normalStatus, + pluginName: pluginName, + ConfigFileDetail: ConfigFileDetail, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/init.go similarity index 75% rename from bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/init.go rename to bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/init.go index 6d391a8824..69ceb932c4 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/init.go @@ -10,17 +10,12 @@ * limitations under the License. 
*/ -package masterpodcheck +// Package nodecheck xxx +package nodecheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" +// 检测nodeagent namespace下各个节点上报的configmap func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/nodecheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/nodecheck.go new file mode 100644 index 0000000000..323e143f04 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/nodecheck.go @@ -0,0 +1,403 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package nodecheck xxx +package nodecheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/diskcheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/netcheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/nodeinfocheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/timecheck" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + yaml "gopkg.in/yaml.v2" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/dnscheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/hwcheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/nodeagent/processcheck" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// Plugin xxx +type Plugin struct { + opt *Options + testYamlString string + pluginmanager.ClusterPlugin +} + +var ( + nodeAvailabilityLabels = []string{"target", "bk_biz_id", "item", "item_target", "status"} + nodeAvailability = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_node_availability", + Help: "cluster_node_availability, 1 means OK", + }, nodeAvailabilityLabels) + nodeAvailabilityGaugeVecSetMap = make(map[string][]*metricmanager.GaugeVecSet) +) + +func init() { + metricmanager.Register(nodeAvailability) +} + +// Setup xxx +func (p *Plugin) Setup(configFilePath string, runMode string) error { + p.opt = 
&Options{} + err := util.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return fmt.Errorf("read clustercheck config file %s failed, err %s", configFilePath, err.Error()) + } + + if err = p.opt.Validate(); err != nil { + return err + } + + p.Result = make(map[string]pluginmanager.CheckResult) + p.ReadyMap = make(map[string]bool) + + interval := p.opt.Interval + if interval == 0 { + interval = 60 + } + + if runMode == "daemon" { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + go p.Check() + } else { + klog.V(3).Infof("the former %s didn't over, skip in this loop", p.Name()) + } + select { + case result := <-p.StopChan: + klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } + } + }() + } else if runMode == "once" { + p.Check() + } + + return nil +} + +// Stop xxx +func (p *Plugin) Stop() error { + p.StopChan <- 1 + klog.Infof("plugin %s stopped", p.Name()) + return nil +} + +// Name xxx +func (p *Plugin) Name() string { + return pluginName +} + +// Check xxx +func (p *Plugin) Check() { + start := time.Now() + p.CheckLock.Lock() + klog.Infof("start %s", p.Name()) + defer func() { + klog.Infof("end %s", p.Name()) + p.CheckLock.Unlock() + metricmanager.SetCommonDurationMetric([]string{p.Name(), "", "", ""}, start) + }() + + clusterConfigs := pluginmanager.Pm.GetConfig().ClusterConfigs + + wg := sync.WaitGroup{} + + // 遍历所有集群 + for _, cluster := range clusterConfigs { + wg.Add(1) + pluginmanager.Pm.Add() + + go func(cluster *pluginmanager.ClusterConfig) { + cluster.Lock() + klog.Infof("start nodecheck for %s", cluster.ClusterID) + + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = false + p.WriteLock.Unlock() + + config := cluster.Config + clusterId := cluster.ClusterID + clusterbiz := cluster.BusinessID + + defer func() { + cluster.Unlock() + pluginmanager.Pm.Done() + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = 
true + p.WriteLock.Unlock() + wg.Done() + klog.Infof("end nodecheck for %s", cluster.ClusterID) + }() + clusterResult := pluginmanager.CheckResult{ + Items: make([]pluginmanager.CheckItem, 0, 0), + } + + clientSet, _ := k8s.GetClientsetByConfig(config) + cmList, err := clientSet.CoreV1().ConfigMaps(nodeagentNamespace).List(util.GetCtx(10*time.Second), v1.ListOptions{ + ResourceVersion: "0", + }) + if err != nil { + klog.Errorf("get nodeagent configmap from cluster %s failed: %s", clusterId, err.Error()) + return + } + + nodeAvailabilityGVSMap := make(map[string][]*metricmanager.GaugeVecSet) + //遍历该集群的nodeagent configmap + klog.Infof("%s nodeagent configmap num: %d", clusterId, len(cmList.Items)) + for _, configmap := range cmList.Items { + if !strings.HasSuffix(configmap.Name, "-v1") { + continue + } + if _, ok := configmap.Data["nodeinfo"]; !ok { + continue + } + + // 检查更新时间 + if _, err := time.Parse("2006-01-02 15:04:05.999999999 -0700 MST", configmap.Data["updateTime"]); err == nil { + //if time.Now().Sub(updateTime) > time.Hour*24 { + // continue + //} + } else { + continue + } + nodeName := strings.TrimSuffix(configmap.Name, "-v1") + + nodeinfo := make(map[string]pluginmanager.PluginInfo) + err = yaml.Unmarshal([]byte(configmap.Data["nodeinfo"]), nodeinfo) + if err != nil { + klog.Errorf("unmarshal %s nodeinfo %s failed: %s", clusterId, configmap.Name, err.Error()) + continue + } + + // 获取节点的checkitem并生成metric的map + nodeInfo := plugin.NodeInfo{} + checkItemList, infoItemList, nodeGVSMap := checkNodePluginResult(nodeinfo, strings.TrimSuffix(configmap.Name, "-v1"), clusterId, clusterbiz, &nodeInfo) + // 一个节点每类异常指标只能有一个 + for name, list := range nodeGVSMap { + if len(list) > 1 { + nodeGVSMap[name] = list[:1] + } + } + + clusterResult.Items = append(clusterResult.Items, checkItemList...) + clusterResult.InfoItemList = append(clusterResult.InfoItemList, infoItemList...) 
+ for key, nodeGVSList := range nodeGVSMap { + if _, ok := nodeAvailabilityGVSMap[key]; !ok { + nodeAvailabilityGVSMap[key] = make([]*metricmanager.GaugeVecSet, 0, 0) + } + nodeAvailabilityGVSMap[key] = append(nodeAvailabilityGVSMap[key], nodeGVSList...) + } + + cluster.NodeInfo[nodeName] = nodeInfo + } + + nodeAvailabilityGaugeVecSetList := make([]*metricmanager.GaugeVecSet, 0, 0) + for key, gvsList := range nodeAvailabilityGVSMap { + if len(gvsList) == 0 { + nodeAvailabilityGaugeVecSetList = append(nodeAvailabilityGaugeVecSetList, &metricmanager.GaugeVecSet{ + Labels: []string{clusterId, clusterbiz, key, "node", normalStatus}, + Value: 1, + }) + } else { + // 汇总所有节点检测异常指标,一个节点,一类异常指标只能有一条 + for _, gaugeVecSet := range gvsList { + nodeAvailabilityGaugeVecSetList = append(nodeAvailabilityGaugeVecSetList, gaugeVecSet) + } + } + } + + p.WriteLock.Lock() + metricmanager.DeleteMetric(nodeAvailability, nodeAvailabilityGaugeVecSetMap[clusterId]) + nodeAvailabilityGaugeVecSetMap[clusterId] = nodeAvailabilityGaugeVecSetList + metricmanager.SetMetric(nodeAvailability, nodeAvailabilityGaugeVecSetMap[clusterId]) + p.Result[clusterId] = clusterResult + p.WriteLock.Unlock() + }(cluster) + } + + wg.Wait() + + // clean deleted cluster data + for clusterID, _ := range p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + delete(p.ReadyMap, clusterID) + delete(nodeAvailabilityGaugeVecSetMap, clusterID) + delete(p.Result, clusterID) + metricmanager.DeleteMetric(nodeAvailability, nodeAvailabilityGaugeVecSetMap[clusterID]) + klog.Infof("delete cluster %s", clusterID) + klog.Infof("delete cluster %s", clusterID) + } + } +} + +// checkNodePluginResult 解析node check PluginInfo +func checkNodePluginResult(nodeinfo map[string]pluginmanager.PluginInfo, nodeName string, clusterId, clusterbiz string, nodeInfo *plugin.NodeInfo) ([]pluginmanager.CheckItem, []pluginmanager.InfoItem, map[string][]*metricmanager.GaugeVecSet) { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + 
infoItemList := make([]pluginmanager.InfoItem, 0, 0) + nodeGVSMap := make(map[string][]*metricmanager.GaugeVecSet) + + // 所有节点检测项,不管正常与否都应该返回对应checkitem + for name, pluginInfo := range nodeinfo { + for _, checkItem := range pluginInfo.Result.Items { + if _, ok := nodeGVSMap[checkItem.ItemName]; !ok { + nodeGVSMap[checkItem.ItemName] = make([]*metricmanager.GaugeVecSet, 0, 0) + } + + nodeGVSMap[checkItem.ItemName] = append(nodeGVSMap[checkItem.ItemName], &metricmanager.GaugeVecSet{ + Labels: []string{clusterId, clusterbiz, checkItem.ItemName, "node", checkItem.Status}, + Value: 1, + }) + } + + switch name { + case "processcheck": + pluginCheckItemList, gvsList, err := getProcessCheckResult(pluginInfo, nodeName, clusterId, clusterbiz) + if err != nil { + klog.Errorf(err.Error()) + continue + } + checkItemList = append(checkItemList, pluginCheckItemList...) + nodeGVSMap[processConfigCheckItem] = gvsList + + case "dnscheck": + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = dnscheck.StringMap[checkItem.ItemName] + checkItem.Status = dnscheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + case "timecheck": + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = timecheck.StringMap[checkItem.ItemName] + checkItem.Status = timecheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + case "netcheck": + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = netcheck.StringMap[checkItem.ItemName] + checkItem.Status = netcheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + case "diskcheck": + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = diskcheck.StringMap[checkItem.ItemName] + checkItem.Status = diskcheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + case "hwcheck": + for _, checkItem := range pluginInfo.Result.Items { + 
checkItem.ItemName = hwcheck.StringMap[checkItem.ItemName] + checkItem.Status = hwcheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + case "nodeinfocheck": + checkItemList = append(checkItemList, getNodeinfoCheckResult(pluginInfo, nodeInfo)...) + } + } + + return checkItemList, infoItemList, nodeGVSMap +} + +func getProcessCheckResult(pluginInfo pluginmanager.PluginInfo, nodeName, clusterId, clusterbiz string) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0) + nodeGVSMap := make([]*metricmanager.GaugeVecSet, 0) + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = processcheck.StringMap[checkItem.ItemName] + checkItem.Status = processcheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + detailBytes, err := yaml.Marshal(pluginInfo.Detail) + if err != nil { + return checkItemList, nodeGVSMap, err + } + + detail := processcheck.Detail{} + err = yaml.Unmarshal(detailBytes, &detail) + if err != nil { + return checkItemList, nodeGVSMap, err + } + + // 检查进程配置,生成checkitem + processResult := checkProcess(detail, nodeName) + checkItemList = append(checkItemList, processResult...) 
+ + for index, checkItem := range processResult { + nodeGVSMap = append(nodeGVSMap, &metricmanager.GaugeVecSet{ + Labels: []string{clusterId, clusterbiz, checkItem.ItemName, "node", checkItem.Status}, + Value: 1, + }) + + checkItem.ItemName = StringMap[checkItem.ItemName] + checkItem.Status = StringMap[checkItem.Status] + processResult[index] = checkItem + } + + return checkItemList, nodeGVSMap, nil +} + +func getNodeinfoCheckResult(pluginInfo pluginmanager.PluginInfo, nodeInfo *plugin.NodeInfo) []pluginmanager.CheckItem { + checkItemList := make([]pluginmanager.CheckItem, 0) + + for _, checkItem := range pluginInfo.Result.Items { + checkItem.ItemName = nodeinfocheck.StringMap[checkItem.ItemName] + checkItem.Status = nodeinfocheck.StringMap[checkItem.Status] + checkItemList = append(checkItemList, checkItem) + } + + for _, infoItem := range pluginInfo.Result.InfoItemList { + switch infoItem.ItemName { + case nodeinfocheck.ZoneItemType: + nodeInfo.Zone = infoItem.Result.(string) + case nodeinfocheck.RegionItemType: + nodeInfo.Zone = infoItem.Result.(string) + case nodeinfocheck.InstanceTypeItemType: + nodeInfo.Zone = infoItem.Result.(string) + } + } + + return checkItemList +} + +// Ready xxx +func (p *Plugin) Ready(clusterID string) bool { + p.WriteLock.Lock() + defer p.WriteLock.Unlock() + return p.ReadyMap[clusterID] +} + +// GetResult xxx +func (p *Plugin) GetResult(s string) pluginmanager.CheckResult { + return p.Result[s] +} + +// GetDetail xxx +func (p *Plugin) GetDetail() interface{} { + return nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/options.go new file mode 100644 index 0000000000..53e63e67d6 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/nodecheck/options.go @@ -0,0 +1,28 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. 
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package nodecheck xxx +package nodecheck + +// Options xxx +type Options struct { + Interval int `json:"interval" yaml:"interval"` +} + +// Validate validate options +func (o *Options) Validate() error { + if o.Interval == 0 { + o.Interval = 300 + } + + return nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/configmap.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/configmap.go new file mode 100644 index 0000000000..bd3acc95e8 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/configmap.go @@ -0,0 +1,55 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package systemappcheck xxx +package systemappcheck + +import ( + "k8s.io/klog/v2" + "strings" + "time" + + "gopkg.in/yaml.v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// CheckTKENetwork xxx +func CheckTKENetwork(cluster *pluginmanager.ClusterConfig) []pluginmanager.CheckItem { + result := make([]pluginmanager.CheckItem, 0, 0) + + cm, err := cluster.ClientSet.CoreV1().ConfigMaps("kube-system").Get(util.GetCtx(10*time.Second), "tke-network-conf", metav1.GetOptions{ResourceVersion: "0"}) + if err != nil { + if !strings.Contains(err.Error(), "not found") { + klog.Errorf("%s %s", cluster.ClusterID, err.Error()) + } + return result + } + + tkeNetworkConfig := make(map[string]interface{}) + err = yaml.Unmarshal([]byte(cm.Data["tke-network-conf.yaml"]), &tkeNetworkConfig) + if err != nil { + klog.Errorf("%s %s", cluster.ClusterID, err.Error()) + return result + } + + if cidrList, ok := tkeNetworkConfig["cidr.cluster-cidrs"].([]interface{}); ok { + for _, cidr := range cidrList { + cluster.Cidr = append(cluster.Cidr, cidr.(string)) + } + + } + + return result +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/constant.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/constant.go new file mode 100644 index 0000000000..f547a6ec6f --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/constant.go @@ -0,0 +1,127 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package systemappcheck xxx +package systemappcheck + +const ( + pluginName = "systemappcheck" + SystemAppImageVersionCheckItemName = "system_app_image_version" + SystemAppStatusCheckItemName = "system_app_status" + SystemAppChartVersionCheckItem = "system_app_chart_version" + SystemAppConfigCheckItem = "system_app_config" + + NormalStatus = "ok" + ImageStatusNeedUpgrade = "need_upgrade" + ImageStatusNiceToUpgrade = "nice_to_upgrade" + ImageStatusUnknown = "unknown" + + AppStatusNotReadyStatus = "notready" + AppStatusMemoryHighStatus = "memoryhigh" + AppStatusCpuHighStatus = "cpuhigh" + AppErrorStatus = "error" + AppMetricErrorStatus = "metric_error" + + ChartVersionNormalStatus = "deployed" + APPNotfoundAppStatus = "appnotfound" + + ConfigErrorStatus = "configerr" + NolabelStatus = "no labels" + ConfigOtherErrorStatus = "ConfigOtherErrorStatus" + UnrecommandedStatus = "UnrecommandedStatus" + ConfigNotFoundStatus = "confignotfound" + initContent = `interval: 300` + + StaticPodConfigTarget = "StaticPodConfigTarget" + SystemAppConfigTarget = "SystemAppConfigTarget" + ServiceConfigTarget = "ServiceConfigTarget" + + FlagUnsetDetailFormat = "%s is not set, which is recommanded set" + NoLabelDetailFormat = "%s has no labels, cannot be selected" + + etcdDataDiskDetail = "etcd used %s, recommand use data disk" + kubeProxyIpvsDetail = "when kube-proxy is ipvs mode, --ipvs-udp-timeout=10s is recommanded" + + lbSVCNoIpDetail = "service %s %s has no external ips." 
+ GetResourceFailedDetail = "get resource %s failed: %s" + + deployment = "Deployment" + daemonset = "Daemonset" + statefulset = "Statefulset" +) + +var ( + ChinenseStringMap = map[string]string{ + pluginName: "系统应用检查", + NormalStatus: "正常", + StaticPodConfigTarget: "静态pod配置检查", + SystemAppConfigTarget: "系统应用配置检查", + ServiceConfigTarget: "service配置检查", + + ConfigErrorStatus: "配置错误", + ConfigNotFoundStatus: "配置不存在", + ConfigOtherErrorStatus: "其它问题", + APPNotfoundAppStatus: "应用不存在", + NolabelStatus: "没有标签", + AppErrorStatus: "错误", + AppStatusMemoryHighStatus: "应用内存高", + AppStatusCpuHighStatus: "应用cpu高", + UnrecommandedStatus: "非推荐值", + GetResourceFailedDetail: "获取 %s 失败: %s", + + SystemAppImageVersionCheckItemName: "应用镜像版本检查", + SystemAppConfigCheckItem: "应用配置检查", + SystemAppStatusCheckItemName: "应用状态检查", + SystemAppChartVersionCheckItem: "应用chart版本检查", + + FlagUnsetDetailFormat: "没有配置%s 参数,推荐配置", + NoLabelDetailFormat: "%s 没有任何标签,不能被选中", + + etcdDataDiskDetail: "etcd存储在%s, 推荐存储在数据盘", + kubeProxyIpvsDetail: "kube-proxy使用ipvs时,推荐设置--ipvs-udp-timeout=10s", + + lbSVCNoIpDetail: "service %s %s没有external ip", + } + + EnglishStringMap = map[string]string{ + pluginName: pluginName, + NormalStatus: NormalStatus, + StaticPodConfigTarget: "staic pod config check", + SystemAppConfigTarget: "system app config check", + ServiceConfigTarget: "service config check", + + ConfigErrorStatus: ConfigErrorStatus, + ConfigNotFoundStatus: ConfigNotFoundStatus, + ConfigOtherErrorStatus: "other err", + APPNotfoundAppStatus: APPNotfoundAppStatus, + NolabelStatus: NolabelStatus, + AppErrorStatus: AppErrorStatus, + AppStatusMemoryHighStatus: AppStatusMemoryHighStatus, + AppStatusCpuHighStatus: AppStatusCpuHighStatus, + UnrecommandedStatus: UnrecommandedStatus, + GetResourceFailedDetail: GetResourceFailedDetail, + + SystemAppImageVersionCheckItemName: SystemAppImageVersionCheckItemName, + SystemAppConfigCheckItem: SystemAppConfigCheckItem, + SystemAppStatusCheckItemName: 
SystemAppStatusCheckItemName, + SystemAppChartVersionCheckItem: SystemAppChartVersionCheckItem, + + FlagUnsetDetailFormat: FlagUnsetDetailFormat, + NoLabelDetailFormat: NoLabelDetailFormat, + + etcdDataDiskDetail: etcdDataDiskDetail, + kubeProxyIpvsDetail: kubeProxyIpvsDetail, + lbSVCNoIpDetail: lbSVCNoIpDetail, + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/init.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/init.go index bac67ff0b0..ed82b99348 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/init.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/init.go @@ -1,26 +1,21 @@ /* - * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Tencent is pleased to support the open source community by making Blueking Container Service available. * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. * Licensed under the MIT License (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, + * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language governing permissions and * limitations under the License. 
*/ -package systempodcheck +// Package systemappcheck xxx +package systemappcheck -import ( - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" -) +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" +// 检测系统命名空间下的应用与静态pod func init() { - plugin_manager.Register(&Plugin{ - checkLock: sync.Mutex{}, - stopChan: make(chan int), - }) + pluginmanager.Register(&Plugin{}) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/options.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/options.go index c3ad08d689..68ea5b0d3e 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/options.go @@ -10,12 +10,19 @@ * limitations under the License. */ -package systempodcheck +// Package systemappcheck xxx +package systemappcheck + +import ( + "strings" +) // Options bcs log options type Options struct { - Components []Component `json:"components" yaml:"components"` - Interval int `json:"interval" yaml:"interval"` + Components []Component `json:"components" yaml:"components"` + ComponentVersionConf []ComponentVersionConf `json:"componentVersionConf" yaml:"componentVersionConf"` + Interval int `json:"interval" yaml:"interval"` + Namespaces []string `json:"namespaces" yaml:"namespaces"` } // Component xxx @@ -25,6 +32,14 @@ type Component struct { Resource string `json:"resource" yaml:"resource"` } +// ComponentVersionConf component version config +type ComponentVersionConf struct { + Name string `json:"name" yaml:"name"` + + NiceToUpgrade string `json:"niceToUpgrade" yaml:"niceToUpgrade"` + NeedUpgrade string `json:"needUpgrade" yaml:"needUpgrade"` +} + // Validate validate options func (o *Options) Validate() error { // if len(o.KubeMaster) == 0 { @@ -33,5 +48,58 @@ func (o *Options) Validate() error { // if len(o.Kubeconfig) == 0 { // return 
fmt.Errorf("kubeconfig cannot be empty") // } + + if o.Namespaces == nil || len(o.Namespaces) == 0 { + o.Namespaces = []string{"kube-system", "default", "bk-system", "bcs-system", "bkmonitor-operator"} + } + + if o.Components != nil { + components := []Component{ + { + Namespace: "kube-system", + Name: "kube-proxy", + Resource: "daemonset", + }, { + Namespace: "kube-system", + Name: "kube-dns", + Resource: "deployment", + }, { + Namespace: "kube-system", + Name: "coredns", + Resource: "deployment", + }, + } + for _, component := range components { + setFlag := false + for _, optionComponent := range o.Components { + if optionComponent.Name == component.Name && + optionComponent.Namespace == component.Namespace && strings.EqualFold(optionComponent.Resource, component.Resource) { + setFlag = true + break + } + } + + if !setFlag { + o.Components = append(o.Components, component) + } + } + } else { + o.Components = []Component{ + { + Namespace: "kube-system", + Name: "kube-proxy", + Resource: "daemonset", + }, { + Namespace: "kube-system", + Name: "kube-dns", + Resource: "deployment", + }, { + Namespace: "kube-system", + Name: "coredns", + Resource: "deployment", + }, + } + } + return nil } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/pod.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/pod.go new file mode 100644 index 0000000000..f865f3fb87 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/pod.go @@ -0,0 +1,414 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package systemappcheck xxx +package systemappcheck + +import ( + "context" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "strconv" + "strings" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// CheckStaticPod check static pod config +func CheckStaticPod(cluster *pluginmanager.ClusterConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + staticPodcache, ok := util.GetCache(cluster.ClusterID + "staticpod") + podList := make([]v1.Pod, 0, 0) + if ok { + staticPodNameList, ok1 := staticPodcache.([]string) + klog.Infof("%s has static pod caches, get pod from kube-system namespace", cluster.ClusterID) + if !ok1 { + return nil, nil, fmt.Errorf("%s get staticPodcache failed %s", cluster.ClusterID, staticPodcache) + } + + for _, staticPodName := range staticPodNameList { + pod, err := cluster.ClientSet.CoreV1().Pods("kube-system").Get(context.Background(), staticPodName, metav1.GetOptions{ResourceVersion: "0"}) + if err != nil { + if errors.IsNotFound(err) { + ok = false + } + klog.Errorf("%s get static pod failed: %s", cluster.ClusterID, err.Error()) + } else { + podList = append(podList, *pod) + } + } + } + if !ok { + 
klog.Infof("%s has no static pod caches, list from cluster kube-system namespace", cluster.ClusterID) + result, err := cluster.ClientSet.CoreV1().Pods("kube-system").List(context.Background(), metav1.ListOptions{ResourceVersion: "0"}) + if err != nil { + return nil, nil, err + } else { + podList = result.Items + } + } + + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + + newStaticPodNameList := make([]string, 0, 0) + for _, pod := range podList { + if len(pod.OwnerReferences) == 0 { + continue + } + if pod.OwnerReferences[0].Kind != "Node" { + continue + } + + newStaticPodNameList = append(newStaticPodNameList, pod.Name) + + if strings.HasPrefix(pod.Name, "cloud-controller-manager") { + continue + } + + if strings.HasPrefix(pod.Name, "kube-apiserver") { + checkItemList = append(checkItemList, CheckApiserver(&pod, cluster)...) + } + if strings.HasPrefix(pod.Name, "etcd") { + checkItemList = append(checkItemList, CheckETCD(&pod)...) + } + if strings.HasPrefix(pod.Name, "kube-controller-manager") { + checkItemList = append(checkItemList, CheckKCM(&pod, cluster)...) + } + + checkItemList = append(checkItemList, CheckLabel(&pod)...) 
+ } + + if !ok { + // 缓存集群当前static pod信息,避免频繁list pod + util.SetCacheWithTimeout(cluster.ClusterID+"staticpod", newStaticPodNameList, time.Hour*24) + } + + if len(checkItemList) == 0 { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: "static pod", + Normal: true, + Detail: "", + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + } + + // 生成异常配置指标 + okFlag := true + for index, checkItem := range checkItemList { + if !checkItem.Normal { + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, "kube-system", checkItem.ItemTarget, "pod", checkItem.Status}, + Value: 1, + }) + okFlag = false + } + checkItemList[index] = checkItem + } + + if okFlag { + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, "kube-system", "static pod", "pod", pluginmanager.NormalStatus}, + Value: 1, + }) + } + + // 静态pod参数不一致问题检查 + return checkItemList, gvsList, nil +} + +// CheckKCM check kcm config +func CheckKCM(pod *v1.Pod, cluster *pluginmanager.ClusterConfig) []pluginmanager.CheckItem { + cidrFlag := false + cidr := make([]string, 0, 0) + for _, arg := range append(pod.Spec.Containers[0].Command, pod.Spec.Containers[0].Args...) { + if strings.HasPrefix(arg, "--service-cluster-ip-range") { + cluster.ServiceCidr = strings.SplitN(arg, "=", 2)[1] + } else if strings.HasPrefix(arg, "--cluster-cidr") { + cidr = append(cidr, strings.SplitN(arg, "=", 2)[1]) + } else if arg == "--allocate-node-cidrs=true" { + cidrFlag = true + } else if strings.HasPrefix(arg, "--node-cidr-mask-size") { + cluster.MaskSize, _ = strconv.Atoi(strings.SplitN(arg, "=", 2)[1]) + } + } + + // 重置cidr + cluster.Cidr = make([]string, 0, 0) + if cidrFlag { + cluster.Cidr = append(cluster.Cidr, cidr...) 
+ } + + if cluster.MaskSize == 0 { + cluster.MaskSize = 24 + } + + return nil +} + +// CheckETCD check etcd config +func CheckETCD(pod *v1.Pod) []pluginmanager.CheckItem { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + + // 检查参数 + floatFlagList := []plugin.FloatFlag{ + {Name: "--heartbeat-interval", + CompareType: "ge", + Value: 1000, + Needed: true, + }, + } + + for _, floatFlag := range floatFlagList { + detail := plugin.CheckFlag(append(pod.Spec.Containers[0].Command, pod.Spec.Containers[0].Args...), floatFlag) + if detail != "" { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Status: UnrecommandedStatus, + Normal: false, + Detail: detail, + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + } + } + + // 磁盘配置 + checkFlag := false + for _, volume := range pod.Spec.Volumes { + if volume.HostPath.Path == "/var/lib/etcd" { + checkFlag = true + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Status: ConfigErrorStatus, + Normal: false, + Detail: fmt.Sprintf(StringMap[etcdDataDiskDetail], volume.HostPath.Path), + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + break + } + } + + if !checkFlag { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Status: pluginmanager.NormalStatus, + Normal: true, + Detail: "", + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + } + + // 检查状态 + return checkItemList +} + +// CheckApiserver check apiserver config +func CheckApiserver(pod *v1.Pod, cluster *pluginmanager.ClusterConfig) []pluginmanager.CheckItem { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + + setFlagList := []string{"--goaway-chance", "--audit-policy-file"} + + for _, arg := range append(pod.Spec.Containers[0].Command, pod.Spec.Containers[0].Args...) 
{ + for index, flag := range setFlagList { + if strings.Contains(arg, flag) && flag != "" { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Status: pluginmanager.NormalStatus, + Normal: true, + Detail: "", + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + setFlagList[index] = "" + } + } + + if strings.HasPrefix(arg, "--service-cluster-ip-range") { + cluster.ServiceCidr = strings.SplitN(arg, "=", 2)[1] + } + } + + for _, setFlag := range setFlagList { + if setFlag != "" { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Status: ConfigNotFoundStatus, + Normal: false, + Detail: fmt.Sprintf(StringMap[FlagUnsetDetailFormat], setFlag), + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + return checkItemList + } + } + + return checkItemList + + // 检查参数 + // 检查状态 +} + +// CheckLabel xxx +func CheckLabel(pod *v1.Pod) []pluginmanager.CheckItem { + checkItem := pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: pod.Name, + Detail: fmt.Sprintf(StringMap[NoLabelDetailFormat], pod.Name), + Tags: nil, + Level: pluginmanager.RISKLevel, + } + + result := make([]pluginmanager.CheckItem, 0, 0) + if len(pod.Labels) == 0 { + checkItem.Status = NolabelStatus + checkItem.Normal = false + } else { + checkItem.Status = pluginmanager.NormalStatus + checkItem.Normal = true + } + + result = append(result, checkItem) + + return result +} + +// CheckSystemWorkLoadConfig 检查系统应用配置 +func CheckSystemWorkLoadConfig(cluster *pluginmanager.ClusterConfig) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet) { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + checkItemList = append(checkItemList, CheckCoredns(cluster.ClientSet)...) + checkItemList = append(checkItemList, CheckKubeProxy(cluster.ClientSet)...) 
+ + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + + for _, checkItem := range checkItemList { + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, "kube-system", checkItem.ItemTarget, "app", checkItem.Status}, + Value: 1, + }) + } + + return checkItemList, gvsList +} + +// CheckCoredns 检查coredns config +func CheckCoredns(clientSet *kubernetes.Clientset) []pluginmanager.CheckItem { + result := make([]pluginmanager.CheckItem, 0, 0) + checkItem := pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: "coredns", + Tags: nil, + Level: pluginmanager.RISKLevel, + } + + cm, err := clientSet.CoreV1().ConfigMaps("kube-system").Get(util.GetCtx(10*time.Second), "coredns", metav1.GetOptions{ResourceVersion: "0"}) + + if err != nil { + checkItem.Normal = false + if strings.Contains(err.Error(), "not found") { + checkItem.Status = ConfigNotFoundStatus + } else { + checkItem.Status = ConfigErrorStatus + } + checkItem.Detail = fmt.Sprintf(StringMap[GetResourceFailedDetail], "coredns configmap", err.Error()) + result = append(result, checkItem) + return result + } + + // 检查coredns是否配置了健康检查端口以及lameduck配置 + flagList := []string{ + "ready", "lameduck", + } + + unSetFlagList := make([]string, 0, 0) + for _, flag := range flagList { + if !strings.Contains(cm.Data["Corefile"], flag) { + checkItem.Detail = fmt.Sprintf(StringMap[FlagUnsetDetailFormat], unSetFlagList) + checkItem.Normal = false + checkItem.Status = ConfigErrorStatus + result = append(result, checkItem) + return result + } + } + + if len(result) == 0 { + checkItem.Status = pluginmanager.NormalStatus + checkItem.Normal = true + result = append(result, checkItem) + } + + return result +} + +// CheckKubeProxy check kube-proxy config +func CheckKubeProxy(clientSet *kubernetes.Clientset) []pluginmanager.CheckItem { + result := make([]pluginmanager.CheckItem, 0, 0) + checkItem := pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, 
+ ItemTarget: "kube-proxy", + Tags: nil, + Level: pluginmanager.RISKLevel, + Normal: true, + } + + ds, err := clientSet.AppsV1().DaemonSets("kube-system").Get(util.GetCtx(10*time.Second), "kube-proxy", metav1.GetOptions{ResourceVersion: "0"}) + + if err != nil { + checkItem.Normal = false + if strings.Contains(err.Error(), "not found") { + checkItem.Status = ConfigNotFoundStatus + } else { + checkItem.Status = ConfigErrorStatus + } + checkItem.Detail = err.Error() + result = append(result, checkItem) + return result + } + + // 检查proxy模式是否为ipvs以及udp timeout是否设置 + var ipvsFlag, udpTimeoutFlag bool + for _, arg := range append(ds.Spec.Template.Spec.Containers[0].Command, ds.Spec.Template.Spec.Containers[0].Args...) { + if strings.Contains(arg, "proxy-mode=ipvs") { + ipvsFlag = true + } else if strings.Contains(arg, "ipvs-udp-timeout=10s") { + udpTimeoutFlag = true + } + } + + if ipvsFlag && !udpTimeoutFlag { + checkItem.Normal = false + checkItem.Detail = StringMap[kubeProxyIpvsDetail] + checkItem.Status = ConfigErrorStatus + } else { + checkItem.Status = pluginmanager.NormalStatus + } + + result = append(result, checkItem) + return result +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/svc.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/svc.go new file mode 100644 index 0000000000..ea33aa6138 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/svc.go @@ -0,0 +1,82 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package systemappcheck xxx +package systemappcheck + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + "sync" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" +) + +// CheckService xxx +func CheckService(cluster *pluginmanager.ClusterConfig, clusterID string) ([]pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, error) { + namespaceList, err := cluster.ClientSet.CoreV1().Namespaces().List(util.GetCtx(time.Second*10), metav1.ListOptions{ResourceVersion: "0"}) + if err != nil { + return nil, nil, err + } + + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + gvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + + var wg sync.WaitGroup + routinePool := util.NewRoutinePool(20) + for _, namespace := range namespaceList.Items { + wg.Add(1) + go func(namespace v1.Namespace) { + routinePool.Add(1) + defer func() { + wg.Done() + routinePool.Done() + }() + + serviceList, err := cluster.ClientSet.CoreV1().Services(namespace.Name).List(util.GetCtx(time.Second*10), metav1.ListOptions{ResourceVersion: "0"}) + if err != nil { + klog.Errorf("%s get service in namespace %s failed: %s", clusterID, namespace.Name, err.Error()) + return + } + + for _, svc := range serviceList.Items { + if svc.Spec.Type == "LoadBalancer" { + if len(svc.Status.LoadBalancer.Ingress) == 0 { + checkItemList = 
append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppConfigCheckItem, + ItemTarget: svc.Name, + Status: ConfigErrorStatus, + Normal: false, + Detail: fmt.Sprintf(StringMap[lbSVCNoIpDetail], svc.Namespace, svc.Name), + Tags: nil, + Level: pluginmanager.WARNLevel, + }) + gvsList = append(gvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, namespace.Name, svc.Name, "service", ConfigErrorStatus}, + Value: 1, + }) + } + } + } + }(namespace) + + } + wg.Wait() + + return checkItemList, gvsList, nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck.go index 0ac75913bd..5b9a000da3 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck.go @@ -1,499 +1,855 @@ /* - * Tencent is pleased to support the open source community by making Blueking Container Service available., + * Tencent is pleased to support the open source community by making Blueking Container Service available. * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. * Licensed under the MIT License (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, + * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language governing permissions and * limitations under the License. 
*/ -package systempodcheck +// Package systemappcheck xxx +package systemappcheck import ( - "context" "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metricmanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/pluginmanager" + utiltrace "k8s.io/utils/trace" "os" + "reflect" "regexp" - goruntime "runtime" "strings" "sync" "time" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin_manager" - - "github.com/containerd/containerd/pkg/cri/util" + "github.com/hashicorp/go-version" "github.com/prometheus/client_golang/prometheus" "gopkg.in/yaml.v2" "helm.sh/helm/v3/pkg/action" - "helm.sh/helm/v3/pkg/release" v1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/json" "k8s.io/client-go/kubernetes" "k8s.io/klog" + metricsclientset "k8s.io/metrics/pkg/client/clientset/versioned" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/k8s" + internalUtil "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" ) // Plugin xxx type Plugin struct { - stopChan chan int - opt *Options - checkLock sync.Mutex + opt *Options + pluginmanager.ClusterPlugin } var ( + systemAppChartVersionLabels = []string{"target", "bk_biz_id", "namespace", "chart", "version", "status", "rel"} + systemAppImageVersionLabels = []string{"target", "bk_biz_id", "namespace", "component", "resource", + "container", "version", "status", "rel"} + systemAppStatusLabels = []string{"target", "bk_biz_id", "namespace", "component", "resource", "status", "rel"} + systemAppConfigLabels = []string{"target", "bk_biz_id", "namespace", "component", "resource", "status"} + // 
应用release 状态检查 systemAppChartVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "system_app_chart_version", Help: "system_app_chart_version, 1 means deployed", - }, []string{"target", "target_biz", "namespace", "chart", "version", "status"}) + }, systemAppChartVersionLabels) + // 应用镜像版本检查 systemAppImageVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "system_app_image_version", Help: "system_app_image_version, 1 means ok", - }, []string{"target", "target_biz", "namespace", "chart", "component", "resource", "container", "version", "status"}) - systemAppChartMap = make(map[string]*prometheus.GaugeVec) - systemAppImageMap = make(map[string]*prometheus.GaugeVec) - systemAppMapLock sync.Mutex + }, systemAppImageVersionLabels) + // 应用部署状态 + systemAppStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "system_app_status", + Help: "system_app_status, 1 means ok", + }, systemAppStatusLabels) + // 应用配置状态 + systemAppConfig = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "system_app_config", + Help: "system_app_config, 1 means ok", + }, systemAppConfigLabels) + + systemAppChartGVSList = make(map[string][]*metricmanager.GaugeVecSet) + systemAppImageGVSList = make(map[string][]*metricmanager.GaugeVecSet) + systemAppStatusGVSList = make(map[string][]*metricmanager.GaugeVecSet) + systemAppConfigGVSList = make(map[string][]*metricmanager.GaugeVecSet) ) func init() { - metric_manager.Register(systemAppChartVersion) - metric_manager.Register(systemAppImageVersion) + // 注册指标 + metricmanager.Register(systemAppChartVersion) + metricmanager.Register(systemAppImageVersion) + metricmanager.Register(systemAppStatus) + metricmanager.Register(systemAppConfig) } // Setup xxx -func (p *Plugin) Setup(configFilePath string) error { - configFileBytes, err := os.ReadFile(configFilePath) - if err != nil { - return fmt.Errorf("read systemappcheck config file %s failed, err %s", configFilePath, err.Error()) - } +func (p *Plugin) Setup(configFilePath string, 
runMode string) error { p.opt = &Options{} - if err = json.Unmarshal(configFileBytes, p.opt); err != nil { - if err = yaml.Unmarshal(configFileBytes, p.opt); err != nil { - return fmt.Errorf("decode systemappcheck config file %s failed, err %s", configFilePath, err.Error()) - } + + err := internalUtil.ReadorInitConf(configFilePath, p.opt, initContent) + if err != nil { + return err } + if err = p.opt.Validate(); err != nil { return err } + p.Result = make(map[string]pluginmanager.CheckResult) + p.ReadyMap = make(map[string]bool) + interval := p.opt.Interval if interval == 0 { interval = 60 } - go func() { - for { - if p.checkLock.TryLock() { - p.checkLock.Unlock() - plugin_manager.Pm.Lock() - go p.Check() - } else { - klog.V(3).Infof("the former systemappcheck didn't over, skip in this loop") - } - select { - case result := <-p.stopChan: - klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) - return - case <-time.After(time.Duration(interval) * time.Second): - continue + if runMode == pluginmanager.RunModeDaemon { + go func() { + for { + if p.CheckLock.TryLock() { + p.CheckLock.Unlock() + pluginmanager.Pm.Lock() + + // 重载opt + opt := &Options{} + err = internalUtil.ReadorInitConf(configFilePath, opt, initContent) + if err != nil { + klog.Errorf("reload config failed: %s", err.Error()) + } else { + if err = p.opt.Validate(); err != nil { + klog.Errorf("validate config failed: %s", err.Error()) + } else { + if reflect.DeepEqual(&p.opt, &opt) { + p.opt = opt + klog.Info("config reload success") + } + } + } + // 执行检查 + go p.Check() + } else { + klog.V(3).Infof("the former systemappcheck didn't over, skip in this loop") + } + + select { + case result := <-p.StopChan: + klog.V(3).Infof("stop plugin %s by signal %d", p.Name(), result) + return + case <-time.After(time.Duration(interval) * time.Second): + continue + } } - } - }() + }() + } else if runMode == pluginmanager.RunModeOnce { + p.Check() + } return nil } // Stop xxx func (p *Plugin) Stop() error { - 
p.stopChan <- 1 + p.StopChan <- 1 klog.Infof("plugin %s stopped", p.Name()) return nil } // Name xxx func (p *Plugin) Name() string { - return "systemappcheck" + return pluginName } // Check xxx -// -// how to check resource version without chart -// -// informer 改造 -// -// 内存使用优化 func (p *Plugin) Check() { start := time.Now() - p.checkLock.Lock() + p.CheckLock.Lock() klog.Infof("start %s", p.Name()) defer func() { klog.Infof("end %s", p.Name()) - plugin_manager.Pm.UnLock() - p.checkLock.Unlock() - metric_manager.SetCommonDurationMetric([]string{"systemappcheck", "", "", ""}, start) + pluginmanager.Pm.UnLock() + p.CheckLock.Unlock() + metricmanager.SetCommonDurationMetric([]string{"systemappcheck", "", "", ""}, start) }() wg := sync.WaitGroup{} - chartGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - imageGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - for _, cluster := range plugin_manager.Pm.GetConfig().ClusterConfigs { + // 默认获取以下这些namespace中的应用 + namespaceList := p.opt.Namespaces + + for _, cluster := range pluginmanager.Pm.GetConfig().ClusterConfigs { wg.Add(1) - clusterId := cluster.ClusterID - clusterbiz := cluster.BusinessID - config := cluster.Config - plugin_manager.Pm.Add() - go func() { + pluginmanager.Pm.Add() + + go func(cluster *pluginmanager.ClusterConfig) { + cluster.Lock() + klog.Infof("start systemappcheck for %s", cluster.ClusterID) + + clusterId := cluster.ClusterID + clientSet := cluster.ClientSet + config := cluster.Config + clusterResult := pluginmanager.CheckResult{ + Items: make([]pluginmanager.CheckItem, 0, 0), + } + + trace := utiltrace.New("systemappcheck", utiltrace.Field{"target", clusterId}) + defer func() { - klog.V(9).Infof("end systemappcheck for %s", clusterId) + cluster.Unlock() + klog.Infof("end systemappcheck for %s", clusterId) wg.Done() - plugin_manager.Pm.Done() + pluginmanager.Pm.Done() + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = true + p.WriteLock.Unlock() + trace.LogIfLong(20 * 
time.Second) }() - klog.V(9).Infof("start systemappcheck for %s", clusterId) + p.WriteLock.Lock() + p.ReadyMap[cluster.ClusterID] = false + p.WriteLock.Unlock() - clientSet, err := k8s.GetClientsetByConfig(config) + loopSystemAppChartGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopSystemAppImageGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopSystemAppStatusGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + loopSystemAppConfigGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + + clusterVersion, err := k8s.GetK8sVersion(clientSet) if err != nil { - klog.Errorf("%s GetClientsetByClusterId failed: %s", clusterId, err.Error()) + klog.Errorf("%s GetK8sVersion failed: %s", clusterId, err.Error()) return } - _, err = k8s.GetK8sVersion(clientSet) - if err != nil { - klog.Errorf("%s GetK8sVersion failed: %s", clusterId, err.Error()) + + // don't check for v1.8 cluster + if strings.Contains(clusterVersion, "v1.8") { + klog.Infof("%s version is %s, skip", clusterId, clusterVersion) return } + // 检查静态pod配置 + podCheckItemList, staticPodGVSList, err := CheckStaticPod(cluster) + if err != nil { + klog.Errorf("%s CheckStaticPod failed %s", clusterId, err.Error()) + } else { + loopSystemAppConfigGVSList = append(loopSystemAppConfigGVSList, staticPodGVSList...) + clusterResult.Items = append(clusterResult.Items, podCheckItemList...) + } + trace.Step("check static pod") + + // 检查svc配置 + svcCheckItemList, GVSList, err := CheckService(cluster, clusterId) + if err != nil { + klog.Errorf("%s CheckService failed %s", clusterId, err.Error()) + } else { + // 生成异常配置指标 + loopSystemAppConfigGVSList = append(loopSystemAppConfigGVSList, GVSList...) + clusterResult.Items = append(clusterResult.Items, svcCheckItemList...) 
+ } + + // 检查tke服务的配置 + if cluster.ClusterType == pluginmanager.TKECluster { + // 检查TKE应用配置 + CheckTKENetwork(cluster) + } + + trace.Step("check service") + getter := k8s.GetRestClientGetterByConfig(config) - //getter := &genericclioptions.ConfigFlags{ - // APIServer: &config.Host, - // BearerToken: &config.BearerToken, - // Username: &config.Username, - // Password: &config.Password, - // Insecure: &config.Insecure, - //} // 获取配置文件中的component列表(有可能不是helm方式部署的) - var clusterComponent []Component - err = util.DeepCopy(&clusterComponent, p.opt.Components) - if err != nil { - klog.Errorf("%s DeepCopy failed: %s", clusterId, err.Error()) - return + clusterComponent := make([]Component, 0, 0) + for index, _ := range p.opt.Components { + if p.opt.Components[index].Resource != deployment && p.opt.Components[index].Resource != daemonset && p.opt.Components[index].Resource != statefulset { + klog.Errorf("unsupported resource type %s", p.opt.Components[index].Resource) + continue + } + clusterComponent = append(clusterComponent, p.opt.Components[index]) + } - // Get releases - // 默认获取以下这些namespace中的应用 - namespaceList := []string{ - "kube-system", "bcs-system", "bk-system", "default", "bkmonitor-operator"} - chartCheckResult := make([]*metric_manager.GaugeVecSet, 0, 0) - for _, namespace := range namespaceList { - actionConfig := new(action.Configuration) - if err := actionConfig.Init(getter, namespace, os.Getenv("HELM_DRIVER"), klog.Infof); err != nil { - klog.Errorf("Config helm client failed: %s", err.Error()) - return + // 遍历各个namespace下的releases,基于release进行应用状态的检测 + checkItemList, relStatusGVSList, statusGVSList, imageGVSList, configGVSList, componentList := + CheckRelease(namespaceList, getter, cluster, p.opt.ComponentVersionConf) + clusterResult.Items = append(clusterResult.Items, checkItemList...) + loopSystemAppChartGVSList = append(loopSystemAppChartGVSList, relStatusGVSList...) + loopSystemAppStatusGVSList = append(loopSystemAppStatusGVSList, statusGVSList...) 
+ loopSystemAppImageGVSList = append(loopSystemAppImageGVSList, imageGVSList...) + loopSystemAppConfigGVSList = append(loopSystemAppConfigGVSList, configGVSList...) + + // 如果component列表中的应用已经通过helm部署了,则不再需要单独确认 + for _, component := range componentList { + for index, c := range clusterComponent { + if c.Name == component.Name && + c.Namespace == component.Namespace && + c.Resource == component.Resource { + clusterComponent = append(clusterComponent[:index], clusterComponent[index+1:]...) + break + } } + } + trace.Step("check release") - // 获取release列表 - client := action.NewList(actionConfig) - client.Deployed = true - client.AllNamespaces = true - relList, err := client.Run() + // 检查指定组件 + for _, component := range clusterComponent { + componentCheckItemList, componentStatusGVSList, componentImageGVSList, componentConfigGVSList, err := + CheckComponent(component, cluster, "", p.opt.ComponentVersionConf) if err != nil { - klog.Errorf("%s helm get deployed chart failed: %s", clusterId, err.Error()) - return + klog.Errorf("%s %s CheckComponent failed: %s", clusterId, component.Name, err.Error()) } - for _, rel := range relList { - // 生成对应的metric配置 - - chartCheckResult = append(chartCheckResult, &metric_manager.GaugeVecSet{ - Labels: []string{ - clusterId, - clusterbiz, - rel.Namespace, - rel.Name, - rel.Chart.AppVersion(), - rel.Info.Status.String()}, - Value: 1}) - - manifest := rel.Manifest - // exclude non-workload resource manifest - resourceManifestList := strings.Split(manifest, "---") - workLoadManifestList := make([]string, 0, 0) - - re, _ := regexp.Compile("\nkind: Deployment|\nkind: DaemonSet|\nkind: StatefulSet") - for _, resourceManifest := range resourceManifestList { - if re.MatchString(resourceManifest) { - workLoadManifestList = append(workLoadManifestList, resourceManifest) - } - } + clusterResult.Items = append(clusterResult.Items, componentCheckItemList...) + loopSystemAppStatusGVSList = append(loopSystemAppStatusGVSList, componentStatusGVSList...) 
+ loopSystemAppImageGVSList = append(loopSystemAppImageGVSList, componentImageGVSList...) + loopSystemAppConfigGVSList = append(loopSystemAppConfigGVSList, componentConfigGVSList...) + } + trace.Step("check component") + + // 检查特定systemapp的config + workCheckItemList, workGVSList := CheckSystemWorkLoadConfig(cluster) + loopSystemAppConfigGVSList = append(loopSystemAppConfigGVSList, workGVSList...) + clusterResult.Items = append(clusterResult.Items, workCheckItemList...) + trace.Step("check config") + + // 刷新metric + // get former metric data + p.WriteLock.Lock() + // 删除上一次检查的指标 + if _, ok := systemAppChartGVSList[cluster.ClusterID]; ok { + metricmanager.DeleteMetric(systemAppChartVersion, systemAppChartGVSList[clusterId]) + metricmanager.DeleteMetric(systemAppImageVersion, systemAppImageGVSList[clusterId]) + metricmanager.DeleteMetric(systemAppStatus, systemAppStatusGVSList[clusterId]) + metricmanager.DeleteMetric(systemAppConfig, systemAppConfigGVSList[clusterId]) + } - if len(workLoadManifestList) == 0 { - klog.V(9).Infof("%s %s manifest workload is nil", clusterId, rel.Name) - continue - } + systemAppChartGVSList[clusterId] = loopSystemAppChartGVSList + systemAppImageGVSList[clusterId] = loopSystemAppImageGVSList + systemAppStatusGVSList[clusterId] = loopSystemAppStatusGVSList + systemAppConfigGVSList[clusterId] = loopSystemAppConfigGVSList + // get new metric data + p.WriteLock.Unlock() + + // 写入指标 + metricmanager.SetMetric(systemAppChartVersion, loopSystemAppChartGVSList) + metricmanager.SetMetric(systemAppImageVersion, loopSystemAppImageGVSList) + metricmanager.SetMetric(systemAppStatus, loopSystemAppStatusGVSList) + metricmanager.SetMetric(systemAppConfig, loopSystemAppConfigGVSList) + + for key, val := range clusterResult.Items { + val.ItemName = StringMap[val.ItemName] + if _, ok := StringMap[val.ItemTarget]; ok { + val.ItemTarget = StringMap[val.ItemTarget] + } + val.Status = StringMap[val.Status] + clusterResult.Items[key] = val + } + p.Result[clusterId] 
= clusterResult - // 查询对应的资源对象 - for _, worloadManifest := range workLoadManifestList { - var objMap map[string]interface{} - if err := yaml.Unmarshal([]byte(worloadManifest), &objMap); err != nil { - fmt.Printf("Error unmarshalling YAML: %v\n", err) - continue - } - if _, ok := objMap["kind"]; !ok { - klog.Errorf("%s wrong workload yaml, no kind, rel: %s, workload: %s", clusterId, rel.Name, worloadManifest) - continue - } - if _, ok := objMap["metadata"]; !ok { - klog.Errorf("%s wrong workload yaml, no name, rel: %s, workload: %s", clusterId, rel.Name, worloadManifest) - continue - } + trace.Step("refresh metric") + }(cluster) + } + wg.Wait() - kind := objMap["kind"].(string) - name := objMap["metadata"].(map[interface{}]interface{})["name"].(string) - ctx, _ := context.WithTimeout(context.Background(), time.Duration(10)*time.Second) - - var workload runtime.Object - switch kind { - case "Deployment": - deploy, err := clientSet.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) - deploy.TypeMeta.Kind = "Deployment" - if err != nil { - klog.Errorf("%s deployment %s not found in namespace %s, release %s", clusterId, name, namespace, rel.Name) - continue - } - workload = deploy.DeepCopyObject() - case "DaemonSet": - ds, err := clientSet.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) - ds.TypeMeta.Kind = "DaemonSet" - if err != nil { - klog.Errorf("%s daemonset %s not found in namespace %s, release %s", clusterId, name, namespace, rel.Name) - continue - } - workload = ds.DeepCopyObject() - case "StatefulSet": - sts, err := clientSet.AppsV1().StatefulSets(namespace).Get(ctx, name, metav1.GetOptions{}) - sts.TypeMeta.Kind = "StatefulSet" - if err != nil { - klog.Errorf("%s statefulset %s not found in namespace %s, release %s", clusterId, name, namespace, rel.Name) - continue + // clean deleted cluster metric data + clusterConfigs := pluginmanager.Pm.GetConfig().ClusterConfigs + p.WriteLock.Lock() + + for clusterID, _ := range 
p.ReadyMap { + if _, ok := clusterConfigs[clusterID]; !ok { + delete(p.ReadyMap, clusterID) + metricmanager.DeleteMetric(systemAppChartVersion, systemAppChartGVSList[clusterID]) + metricmanager.DeleteMetric(systemAppImageVersion, systemAppImageGVSList[clusterID]) + metricmanager.DeleteMetric(systemAppStatus, systemAppStatusGVSList[clusterID]) + metricmanager.DeleteMetric(systemAppConfig, systemAppConfigGVSList[clusterID]) + delete(systemAppChartGVSList, clusterID) + delete(systemAppImageGVSList, clusterID) + delete(systemAppStatusGVSList, clusterID) + delete(systemAppConfigGVSList, clusterID) + delete(p.Result, clusterID) + klog.Infof("delete cluster %s", clusterID) + } + } + p.WriteLock.Unlock() +} + +// CheckRelease 检查release详情 +func CheckRelease(namespaceList []string, getter *k8s.RESTClientGetter, cluster *pluginmanager.ClusterConfig, componentVersionConfList []ComponentVersionConf) ( + []pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, []Component) { + + var syncLock sync.Mutex + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + relStatusGvsList := make([]*metricmanager.GaugeVecSet, 0, 0) + relComponentStatusGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + relImageGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + relConfigGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + componentList := make([]Component, 0, 0) + + var wg sync.WaitGroup + + // 遍历指定的namespace + for _, namespace := range namespaceList { + wg.Add(1) + go func(namespace string) { + defer wg.Done() + + actionConfig := new(action.Configuration) + if err := actionConfig.Init(getter, namespace, os.Getenv("HELM_DRIVER"), klog.Infof); err != nil { + klog.Errorf("%s Config helm client failed: %s", cluster.ClusterID, err.Error()) + return + } + + // 获取release列表 + client := action.NewList(actionConfig) + client.Deployed = true + // client.AllNamespaces = true + relList, err := client.Run() + 
if err != nil { + klog.Errorf("%s helm get deployed chart failed: %s", cluster.ClusterID, err.Error()) + return + } + + // 基于helm release进行检查 + for _, rel := range relList { + // 生成对应的release 状态的checkitem + syncLock.Lock() + checkItem := pluginmanager.CheckItem{ + ItemName: pluginName, + ItemTarget: rel.Name, + Status: rel.Info.Status.String(), + Detail: "", + Level: pluginmanager.WARNLevel, + Normal: rel.Info.Status.String() == ChartVersionNormalStatus, + Tags: map[string]string{"component": rel.Name}, + } + + if checkItem.Normal { + checkItem.Status = NormalStatus + } + checkItemList = append(checkItemList, checkItem) + + // 生成release状态指标 + relStatusGvsList = append(relStatusGvsList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, namespace, rel.Chart.Name(), rel.Chart.AppVersion(), rel.Info.Status.String(), rel.Name}, + Value: 1, + }) + syncLock.Unlock() + + manifest := rel.Manifest + // exclude non-workload resource manifest + resourceRe := regexp.MustCompile(`(?m)^---$`) + resourceManifestList := resourceRe.Split(manifest, -1) + + workloadTypeList := []string{deployment, daemonset, statefulset} + workLoadManifestMap := make(map[string][]string) + + // 获取各个workload的yaml文件 + for _, workloadType := range workloadTypeList { + workloadRe, _ := regexp.Compile(fmt.Sprintf("\nkind: %s", workloadType)) + for _, resourceManifest := range resourceManifestList { + if workloadRe.MatchString(resourceManifest) { + if _, ok := workLoadManifestMap[workloadType]; !ok { + workLoadManifestMap[workloadType] = make([]string, 0, 0) } - workload = sts.DeepCopyObject() + workLoadManifestMap[workloadType] = append(workLoadManifestMap[workloadType], resourceManifest) } + } + } - if workload == nil { - klog.Infof("Unknown resource type: %s", kind) + // 通过workload的yaml文件 获取各个应用的状态 + for _, worloadManifestList := range workLoadManifestMap { + for _, worloadManifest := range worloadManifestList { + component := Component{Namespace: namespace} + err = 
GetComponentFromManifest(worloadManifest, &component) + if err != nil { + klog.Errorf("GetComponentFromManifest %s failed: %s", rel.Name, err.Error()) continue } - // 如果component列表中的应用已经通过helm部署了,则不再需要单独确认 - for index, component := range clusterComponent { - if component.Name == name && - component.Namespace == namespace && - strings.ToLower(component.Resource) == - strings.ToLower(kind) { - clusterComponent = append(clusterComponent[:index], clusterComponent[index+1:]...) - } + // tke daemonset告警过多,先不进行检查 + if !strings.Contains(strings.ToLower(component.Name), "bk") && + !strings.Contains(strings.ToLower(component.Name), "bcs") && component.Resource == daemonset { + continue } - resourceGaugeVecSet := GetResourceGaugeVecSet(clusterId, clusterbiz, workload, *rel) - if resourceGaugeVecSet == nil { - resourceGaugeVecSet = &metric_manager.GaugeVecSet{ - Labels: []string{ - clusterId, - clusterbiz, - rel.Namespace, - rel.Name, - name, - kind, - "", "", - "notready"}, Value: 1} + // 获取release下workload的详细信息 + relCheckItemList, statusGVSList, imageGVSList, configGVSList, err := + CheckComponent(component, cluster, rel.Name, componentVersionConfList) + if err != nil { + klog.Errorf("%s %s %s CheckComponent failed: %s", cluster.ClusterID, rel.Name, component.Name, err.Error()) } - imageGaugeVecSetList = append(imageGaugeVecSetList, resourceGaugeVecSet) + syncLock.Lock() + componentList = append(componentList, component) + checkItemList = append(checkItemList, relCheckItemList...) + relComponentStatusGVSList = append(relComponentStatusGVSList, statusGVSList...) + relImageGVSList = append(relImageGVSList, imageGVSList...) + relConfigGVSList = append(relConfigGVSList, configGVSList...) + syncLock.Unlock() } } } + }(namespace) + } - chartGaugeVecSetList = append(chartGaugeVecSetList, chartCheckResult...) 
+ wg.Wait() + return checkItemList, relStatusGvsList, relComponentStatusGVSList, relImageGVSList, relConfigGVSList, componentList +} - // get result of CheckComponents - componentImageGaugeVecSetList, _ := p.CheckComponents(clusterId, clusterbiz, clientSet, clusterComponent) - imageGaugeVecSetList = append(imageGaugeVecSetList, componentImageGaugeVecSetList...) +// GetStatus get workload ready status +func GetStatus(updatedReplicas int, availableReplicas int, replicas int) string { + // 期望副本数=可用副本数=最新副本数 + if availableReplicas == replicas && updatedReplicas == replicas { + return NormalStatus + } else { + return AppStatusNotReadyStatus + } - // 集群单独路径的指标配置 - systemAppMapLock.Lock() - if _, ok := systemAppChartMap[clusterId]; !ok { - systemAppChartMap[clusterId] = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "system_app_chart_version", - Help: "system_app_chart_version, 1 means deployed", - }, []string{"target", "target_biz", "namespace", "chart", "version", "status"}) - metric_manager.MM.RegisterSeperatedMetric(clusterId, systemAppChartMap[clusterId]) - } +} - if _, ok := systemAppImageMap[clusterId]; !ok { - systemAppImageMap[clusterId] = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "system_app_image_version", - Help: "system_app_image_version, 1 means ok", - }, []string{"target", "target_biz", "namespace", "chart", "component", "resource", "container", "version", - "status"}) - metric_manager.MM.RegisterSeperatedMetric(clusterId, systemAppImageMap[clusterId]) - } +// CheckComponent 检查应用workload详情 +func CheckComponent(component Component, cluster *pluginmanager.ClusterConfig, rel string, + componentVersionConfList []ComponentVersionConf) ( + []pluginmanager.CheckItem, []*metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, []*metricmanager.GaugeVecSet, error) { + checkItemList := make([]pluginmanager.CheckItem, 0, 0) + statusGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + imageGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + 
configGVSList := make([]*metricmanager.GaugeVecSet, 0, 0) + + // 检查workload status, 是否ready + status, containerImageList, err := getWorkLoadStatus(component.Resource, component.Name, component.Namespace, cluster.ClientSet, cluster.MetricSet) + if err != nil { + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppStatusCheckItemName, + ItemTarget: component.Name, + Status: status, + Level: pluginmanager.WARNLevel, + Detail: err.Error(), + Normal: status == NormalStatus, + Tags: map[string]string{"component": component.Name}, + }) - metric_manager.SetMetric(systemAppChartMap[clusterId], chartCheckResult) - metric_manager.SetMetric(systemAppImageMap[clusterId], componentImageGaugeVecSetList) - systemAppMapLock.Unlock() - }() + statusGVSList = append(statusGVSList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, component.Namespace, component.Name, component.Resource, status, rel}, + Value: 1, + }) + return checkItemList, statusGVSList, imageGVSList, configGVSList, err } - wg.Wait() - // reset metric value - metric_manager.SetMetric(systemAppChartVersion, chartGaugeVecSetList) - metric_manager.SetMetric(systemAppImageVersion, imageGaugeVecSetList) - goruntime.GC() -} + if status != NormalStatus { + klog.Infof("%s %s %s status is %s", cluster.ClusterID, component.Name, component.Namespace, status) + // 如果找不到对应的workload,则直接返回 + if status == APPNotfoundAppStatus { + return checkItemList, statusGVSList, imageGVSList, configGVSList, err + } + } -// GetStatus -func GetStatus(updatedReplicas int, availableReplicas int, replicas int) string { - if replicas > 0 && availableReplicas == replicas && updatedReplicas == replicas { - return "ready" - } else { - return "notready" + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppStatusCheckItemName, + ItemTarget: component.Name, + Status: status, + Detail: "", + Level: pluginmanager.WARNLevel, + Normal: status == NormalStatus, + 
Tags: map[string]string{"component": component.Name}, + }) + statusGVSList = append(statusGVSList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, component.Namespace, component.Name, component.Resource, status, rel}, + Value: 1, + }) + + // 检查workload镜像 + for _, ci := range containerImageList { + status = GetImageStatus(ci, componentVersionConfList) + checkItemList = append(checkItemList, pluginmanager.CheckItem{ + ItemName: SystemAppImageVersionCheckItemName, + ItemTarget: component.Name, + Status: NormalStatus, + Detail: "", + Level: pluginmanager.WARNLevel, + Normal: status == NormalStatus, + Tags: map[string]string{"component": component.Name}, + }) + + imageGVSList = append(imageGVSList, &metricmanager.GaugeVecSet{ + Labels: []string{cluster.ClusterID, cluster.BusinessID, component.Namespace, component.Name, component.Resource, ci.container, ci.image, status, rel}, + Value: 1, + }) } + return checkItemList, statusGVSList, imageGVSList, configGVSList, err } -// CheckComponents check specified component status -func (p *Plugin) CheckComponents( - clusterId string, clusterbiz string, clientSet *kubernetes.Clientset, componentList []Component) ( - []*metric_manager.GaugeVecSet, error) { - imageGaugeVecSetList := make([]*metric_manager.GaugeVecSet, 0, 0) - for _, component := range componentList { - var workload runtime.Object - var err error - switch component.Resource { - case "Deployment", "deployment": - workload, err = clientSet.AppsV1().Deployments(component.Namespace).Get(context.Background(), component.Name, - metav1.GetOptions{}) - if err != nil { - klog.Errorf("%s get %s %s %s failed: %s", - clusterId, component.Namespace, component.Resource, component.Name, err.Error()) - continue +// getWorkLoadStatus xxx +func getWorkLoadStatus(workloadType string, workloadName string, namespace string, clientSet *kubernetes.Clientset, metricsClient *metricsclientset.Clientset) (string, []containerImage, error) { + var 
containerImageList []containerImage + var status = NormalStatus + var err error + ctx := internalUtil.GetCtx(10 * time.Second) + switch workloadType { + case deployment: + var deploy *v1.Deployment + deploy, err = clientSet.AppsV1().Deployments(namespace).Get(ctx, workloadName, metav1.GetOptions{ + ResourceVersion: "0", + }) + + if err != nil { + status = AppErrorStatus + if strings.Contains(err.Error(), "not found") { + status = APPNotfoundAppStatus } + } else { + containerImageList, status, err = getDeployCheckResult(deploy, metricsClient) + } - case "DaemonSet", "daemonSet": - workload, err = clientSet.AppsV1().DaemonSets(component.Namespace).Get(context.Background(), component.Name, - metav1.GetOptions{}) - if err != nil { - klog.Errorf("%s get %s %s %s failed: %s", - clusterId, component.Namespace, component.Resource, component.Name, err.Error()) - continue + case daemonset: + var ds *v1.DaemonSet + ds, err = clientSet.AppsV1().DaemonSets(namespace).Get(ctx, workloadName, metav1.GetOptions{ + ResourceVersion: "0", + }) + + if err != nil { + status = AppErrorStatus + if strings.Contains(err.Error(), "not found") { + status = APPNotfoundAppStatus } - case "StatefulSet", "statefulset": - workload, err = clientSet.AppsV1().StatefulSets(component.Namespace).Get(context.Background(), component.Name, - metav1.GetOptions{}) - if err != nil { - klog.Errorf("%s get %s %s %s failed: %s", - clusterId, component.Namespace, component.Resource, component.Name, err.Error()) - continue + } else { + containerImageList, status, err = getDSCheckResult(ds, metricsClient) + } + + case statefulset: + var sts *v1.StatefulSet + sts, err = clientSet.AppsV1().StatefulSets(namespace).Get(ctx, workloadName, metav1.GetOptions{ + ResourceVersion: "0", + }) + + if err != nil { + status = AppErrorStatus + if strings.Contains(err.Error(), "not found") { + status = APPNotfoundAppStatus } + } else { + containerImageList, status, err = getSTSCheckResult(sts, metricsClient) } - if workload == nil 
{ - klog.Infof("%s Unknown resource type: %s", clusterId, component.Resource) - continue + } + + if err != nil { + return status, nil, fmt.Errorf("namespace: %s worload: %s GetWorkLoad failed: %s", namespace, workloadName, err.Error()) + } + + return status, containerImageList, nil + +} + +// GetComponentFromManifest get component by parsing workload manifest +func GetComponentFromManifest(worloadManifest string, component *Component) error { + var objMap map[string]interface{} + if err := yaml.Unmarshal([]byte(worloadManifest), &objMap); err != nil { + return fmt.Errorf("Error unmarshalling YAML: %v", err) + } + if _, ok := objMap["kind"]; !ok { + return fmt.Errorf("wrong workload yaml, no kind: %s", worloadManifest) + } + if _, ok := objMap["metadata"]; !ok { + return fmt.Errorf("wrong workload yaml, no metadata%s", worloadManifest) + } + + if _, ok := objMap["metadata"].(map[interface{}]interface{}); ok { + if _, ok = objMap["metadata"].(map[interface{}]interface{})["namespace"]; ok { + if namespace, ok := objMap["metadata"].(map[interface{}]interface{})["namespace"].(string); ok { + component.Namespace = namespace + } } + } else { + return fmt.Errorf("wrong workload yaml, wrong metadata type %s", worloadManifest) + } - // unstrObj, _ := runtime.DefaultUnstructuredConverter.ToUnstructured(workload) - resourceGaugeVecSet := GetResourceGaugeVecSet(clusterId, clusterbiz, workload, release.Release{ - Namespace: component.Namespace, - Name: "nonchart", + component.Resource = objMap["kind"].(string) + component.Name = objMap["metadata"].(map[interface{}]interface{})["name"].(string) + + return nil +} + +type containerImage struct { + container string + image string +} + +// getSTSCheckResult 检查statefulset类型workload +func getSTSCheckResult(sts *v1.StatefulSet, ms *metricsclientset.Clientset) ([]containerImage, string, error) { + containerImageList := make([]containerImage, 0, 0) + for _, container := range sts.Spec.Template.Spec.Containers { + containerImageList = 
append(containerImageList, containerImage{ + container: container.Name, + image: container.Image, }) - if resourceGaugeVecSet == nil { - resourceGaugeVecSet = &metric_manager.GaugeVecSet{ - Labels: []string{ - clusterId, clusterbiz, component.Namespace, "nonchart", component.Name, workload.GetObjectKind(). - GroupVersionKind().Kind, - "", "", - "notready"}, Value: 1} - } - imageGaugeVecSetList = append(imageGaugeVecSetList, resourceGaugeVecSet) } - return imageGaugeVecSetList, nil + // 获取workload当前是否ready + status := GetStatus( + int(sts.Status.UpdatedReplicas), + int(sts.Status.ReadyReplicas), + int(sts.Status.Replicas)) + + if status != NormalStatus { + return containerImageList, status, nil + } + + // 检测workload当前资源使用情况 + status, err := CheckPodMetric(sts.Spec.Template.Spec.Containers, ms, sts.Namespace, sts.Spec.Selector.MatchLabels) + return containerImageList, status, err } -// GetResourceGaugeVecSet generate GaugeVecSet from workload status -func GetResourceGaugeVecSet( - clusterId string, clusterbiz string, object runtime.Object, rel release.Release) *metric_manager.GaugeVecSet { - - var resourceGaugeVecSet *metric_manager.GaugeVecSet - // unstr, ok := object.(*unstructured.Unstructured) - // if !ok { - // klog.Errorf("attempt to decode non-Unstructured object: %s", object) - // return nil - // } - objectMap, _ := runtime.DefaultUnstructuredConverter.ToUnstructured(object) - unstr := &unstructured.Unstructured{Object: objectMap} - - kind := unstr.GetKind() - switch strings.ToLower(kind) { - case "deployment": - deploy := &v1.Deployment{} - err := runtime.DefaultUnstructuredConverter.FromUnstructured(objectMap, deploy) - if err != nil { - klog.Errorf("DefaultUnstructuredConverter failed: %s", err.Error()) - return nil +// getDSCheckResult 检查daemonset类型workload +func getDSCheckResult(ds *v1.DaemonSet, ms *metricsclientset.Clientset) ([]containerImage, string, error) { + containerImageList := make([]containerImage, 0, 0) + for _, container := range 
ds.Spec.Template.Spec.Containers { + containerImageList = append(containerImageList, containerImage{ + container: container.Name, + image: container.Image, + }) + } + + // 获取workload当前是否ready + status := GetStatus( + int(ds.Status.UpdatedNumberScheduled), + int(ds.Status.NumberReady), + int(ds.Status.DesiredNumberScheduled)) + if status != NormalStatus { + return containerImageList, status, nil + } + + // ds不检测资源使用情况 + //status, err := CheckPodMetric(ds.Spec.Template.Spec.Containers, ms, ds.Namespace, ds.Spec.Selector.MatchLabels) + return containerImageList, status, nil +} + +// getDeployCheckResult 检查deployment类型workload +func getDeployCheckResult(deploy *v1.Deployment, ms *metricsclientset.Clientset) ([]containerImage, string, error) { + containerImageList := make([]containerImage, 0, 0) + for _, container := range deploy.Spec.Template.Spec.Containers { + containerImageList = append(containerImageList, containerImage{ + container: container.Name, + image: container.Image, + }) + } + + // 获取workload当前是否ready + status := GetStatus( + int(deploy.Status.UpdatedReplicas), + int(deploy.Status.ReadyReplicas), + int(deploy.Status.Replicas)) + if status != NormalStatus { + return containerImageList, status, nil + } + + // 检测workload当前资源使用情况 + status, err := CheckPodMetric(deploy.Spec.Template.Spec.Containers, ms, deploy.Namespace, deploy.Spec.Selector.MatchLabels) + return containerImageList, status, err +} + +// CheckPodMetric check pod resource metric, generate high load gvs +func CheckPodMetric(containerList []corev1.Container, ms *metricsclientset.Clientset, namespace string, matchLabels map[string]string) (string, error) { + if ms == nil { + return NormalStatus, nil + } + // 基于workload的label获取pod metric + podMetricList, err := ms.MetricsV1beta1().PodMetricses(namespace).List(internalUtil.GetCtx(15*time.Second), + metav1.ListOptions{ + ResourceVersion: "0", + LabelSelector: metav1.FormatLabelSelector(&metav1.LabelSelector{MatchLabels: matchLabels})}) + + if err != 
nil { + if strings.Contains(err.Error(), "the server could not find the requested resource") { + klog.Infof(err.Error()) + return NormalStatus, nil + } else { + return AppMetricErrorStatus, err } + } - for _, container := range deploy.Spec.Template.Spec.Containers { - resourceGaugeVecSet = &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, rel.Namespace, rel.Name, deploy.Name, kind, - container.Name, container.Image, - GetStatus( - int(deploy.Status.UpdatedReplicas), - int(deploy.Status.ReadyReplicas), - int(deploy.Status.Replicas))}, Value: 1} + for _, container := range containerList { + for _, podMetric := range podMetricList.Items { + for _, containerMetric := range podMetric.Containers { + if container.Name != containerMetric.Name { + continue + } + + // 如果使用率大于95%则返回异常status + if container.Resources.Limits.Memory().MilliValue() != 0 { + memoryUsagePercent := containerMetric.Usage.Memory().MilliValue() * 100 / container.Resources.Limits.Memory().MilliValue() + if memoryUsagePercent > 95 { + return AppStatusMemoryHighStatus, nil + } + } + + if container.Resources.Limits.Cpu().MilliValue() != 0 { + cpuUsagePercent := containerMetric.Usage.Cpu().MilliValue() * 100 / container.Resources.Limits.Cpu().MilliValue() + if cpuUsagePercent > 95 { + return AppStatusCpuHighStatus, nil + } + } + } } - break - case "statefulset": - sts := &v1.StatefulSet{} - err := runtime.DefaultUnstructuredConverter.FromUnstructured(objectMap, sts) - if err != nil { - klog.Errorf("DefaultUnstructuredConverter failed: %s", err.Error()) - return nil + + } + + return NormalStatus, nil +} + +// Ready return true if cluster check is over +func (p *Plugin) Ready(clusterID string) bool { + p.WriteLock.Lock() + defer p.WriteLock.Unlock() + return p.ReadyMap[clusterID] +} + +// GetResult return check result by cluster ID +func (p *Plugin) GetResult(s string) pluginmanager.CheckResult { + return p.Result[s] +} + +// GetImageStatus check container image version +func 
GetImageStatus(image containerImage, componentVersionConfList []ComponentVersionConf) string { + for _, versionConf := range componentVersionConfList { + if versionConf.Name != image.container { + continue } - for _, container := range sts.Spec.Template.Spec.Containers { - resourceGaugeVecSet = &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, rel.Namespace, rel.Name, sts.Name, kind, - container.Name, container.Image, - GetStatus( - int(sts.Status.UpdatedReplicas), - int(sts.Status.ReadyReplicas), - int(sts.Status.Replicas))}, Value: 1} + + images := strings.Split(image.image, ":") + if len(images) != 2 { + // image format: mirrors.tencent.com/xxx: v1.29.0-alpha.1 + return ImageStatusUnknown } - break - case "daemonset": - ds := &v1.DaemonSet{} - err := runtime.DefaultUnstructuredConverter.FromUnstructured(objectMap, ds) + + imageVersion, err := version.NewVersion(images[1]) if err != nil { - klog.Errorf("DefaultUnstructuredConverter failed: %s", err.Error()) - return nil + return NormalStatus } - for _, container := range ds.Spec.Template.Spec.Containers { - resourceGaugeVecSet = &metric_manager.GaugeVecSet{ - Labels: []string{clusterId, clusterbiz, rel.Namespace, rel.Name, ds.Name, kind, - container.Name, container.Image, - GetStatus( - int(ds.Status.UpdatedNumberScheduled), - int(ds.Status.NumberReady), - int(ds.Status.DesiredNumberScheduled))}, Value: 1} + + if versionConf.NeedUpgrade != "" { + needUpgradeVersion, err := version.NewVersion(versionConf.NeedUpgrade) + if err != nil { + return ImageStatusUnknown + } + + if imageVersion.LessThan(needUpgradeVersion) { + return ImageStatusNeedUpgrade + } } - break - default: - klog.V(6).Infof("%s type is %s", unstr.GetName(), kind) + + if versionConf.NiceToUpgrade != "" { + niceToUpgradeVersion, err := version.NewVersion(versionConf.NiceToUpgrade) + if err != nil { + return ImageStatusNeedUpgrade + } + + if imageVersion.LessThan(niceToUpgradeVersion) { + return ImageStatusNiceToUpgrade + } + } + 
+ return NormalStatus } - return resourceGaugeVecSet + return NormalStatus } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck_test.go b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck_test.go index 18b6e71360..723067a1ac 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck_test.go +++ b/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck/systemappcheck_test.go @@ -10,13 +10,13 @@ * limitations under the License. */ -package systempodcheck +// Package systemappcheck xxx +package systemappcheck import ( "fmt" "testing" - "helm.sh/helm/v3/pkg/release" v1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -69,9 +69,4 @@ func TestGetResourceGaugeVecSet(t *testing.T) { fmt.Println(unstr.GetName()) fmt.Println(unstr.GetObjectKind().GroupVersionKind().Kind) fmt.Println(unstr.GetObjectKind()) - - GetResourceGaugeVecSet("aa", "bb", obj, release.Release{ - Namespace: "cc", - Name: "nonchart", - }) } diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin_manager/plugin_manager.go b/bcs-services/bcs-cluster-reporter/internal/plugin_manager/plugin_manager.go deleted file mode 100644 index 7f90504714..0000000000 --- a/bcs-services/bcs-cluster-reporter/internal/plugin_manager/plugin_manager.go +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Tencent is pleased to support the open source community by making Blueking Container Service available., - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. - * Licensed under the MIT License (the "License"); you may not use this file except - * in compliance with the License. 
You may obtain a copy of the License at - * http://opensource.org/licenses/MIT - * Unless required by applicable law or agreed to in writing, software distributed under, - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package plugin_manager -package plugin_manager - -import ( - "fmt" - "path/filepath" - "strings" - "sync" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/metric_manager" - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" - - "github.com/prometheus/client_golang/prometheus" -) - -var ( - // Pm xxx - Pm *pluginManager - clusterTotal *prometheus.GaugeVec -) - -// Plugin xxx -type Plugin interface { - Name() string - Setup(configFilePath string) error - Stop() error -} - -func init() { - Pm = NewPluginManager() - // set default metric - clusterTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_total_num", - Help: "cluster_total_num", - }, []string{}) - - prometheus.MustRegister(clusterTotal) -} - -// Register xxx -func Register(plugin Plugin) { - Pm.Register(plugin) -} - -type pluginManager struct { - plugins map[string]Plugin - config *Config - configLock sync.Mutex - concurrencyLock sync.Mutex - routinePool *util.RoutinePool -} - -func (pm *pluginManager) Register(plugin Plugin) { - pm.plugins[plugin.Name()] = plugin -} - -func (pm *pluginManager) GetPlugin(plugin string) Plugin { - if p, ok := pm.plugins[plugin]; ok { - return p - } else { - return nil - } -} - -// SetConfig configure pluginmanager by config file -func (pm *pluginManager) SetConfig(config *Config) { - pm.configLock.Lock() - defer pm.configLock.Unlock() - if config != nil { - pm.config = config - } - - for _, cluster := range config.ClusterConfigs { - metric_manager.MM.SetSeperatedMetric(cluster.ClusterID) - } - - 
clusterTotal.WithLabelValues().Set(float64(len(pm.config.ClusterConfigs))) -} - -func (pm *pluginManager) GetConfig() *Config { - pm.configLock.Lock() - defer pm.configLock.Unlock() - return pm.config -} - -func (pm *pluginManager) SetupPlugin(plugins string, pluginDir string) error { - for _, plugin := range strings.Split(plugins, ",") { - if p := pm.GetPlugin(plugin); p == nil { - return fmt.Errorf("Get Plugin %s failed, nil result", plugin) - } else { - err := p.Setup(filepath.Join(pluginDir, plugin+".conf")) - if err != nil { - return fmt.Errorf("Setup plugin %s failed: %s", p.Name(), err.Error()) - } - } - } - return nil -} - -func (pm *pluginManager) Lock() { - pm.concurrencyLock.Lock() -} - -func (pm *pluginManager) UnLock() { - pm.concurrencyLock.Unlock() -} - -func (pm *pluginManager) Add() { - pm.routinePool.Add(1) -} - -func (pm *pluginManager) Done() { - pm.routinePool.Done() -} - -func (pm *pluginManager) StopPlugin(plugins string) error { - for _, plugin := range strings.Split(plugins, ",") { - if p := pm.GetPlugin(plugin); p == nil { - return fmt.Errorf("Get Plugin %s failed, nil result", plugin) - } else { - err := p.Stop() - if err != nil { - return fmt.Errorf("StopPlugin plugin %s failed: %s", p.Name(), err.Error()) - } - } - } - return nil -} - -// NewPluginManager xxx -func NewPluginManager() *pluginManager { - return &pluginManager{ - routinePool: util.NewRoutinePool(40), - plugins: make(map[string]Plugin), - } -} diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/const.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/const.go new file mode 100644 index 0000000000..bdf04286c0 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/const.go @@ -0,0 +1,89 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package pluginmanager xxx +package pluginmanager + +import "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + +const ( + RISKLevel = "RISK" + WARNLevel = "WARN" + SERIOUSLevel = "SERIOUS" + + ClusterAvailabilityCheckItemName = "ClusterAvailabilityCheckItemName" + ClusterAvailabilityOkStatus = "ClusterAvailabilityOkStatus" + ClusterAvailabilityPanicStatus = "ClusterAvailabilityPanicStatus" + + // checkitemname + NormalStatus = "ok" + + ClusterID = "ClusterID" + CheckItemName = "CheckItemName" + CheckItemType = "CheckItemType" + CheckItemTarget = "CheckItemTarget" + CheckItemResult = "CheckItemResult" + CheckItemLevel = "CheckItemLevel" + CheckItemDetail = "CheckItemDetail" + + CheckItemSolution = "CheckItemSolution" + + promotStrFormat = "%s %s result is %s, detail: %s" +) + +var ( + LevelColor = map[string]util.Color{ + RISKLevel: {Red: 60, Green: 17, Blue: 14}, + WARNLevel: {Red: 98, Green: 85, Blue: 29}, + SERIOUSLevel: {Red: 98, Green: 29, Blue: 41}, + } + ChinenseStringMap = map[string]string{ + ClusterAvailabilityCheckItemName: "集群可用性", + ClusterAvailabilityOkStatus: "ok", + ClusterAvailabilityPanicStatus: "panic", + NormalStatus: "ok", + + CheckItemLevel: "问题等级", + CheckItemDetail: "检测详情", + CheckItemName: "检测项", + CheckItemType: "检测类型", + CheckItemTarget: "检测对象", + CheckItemResult: "检测结果", + + ClusterID: "集群ID", + + promotStrFormat: "%s针对%s的检查结果为%s, 检查详情:%s", + + CheckItemSolution: "检测详情", + } + + 
EnglishStringMap = map[string]string{ + ClusterAvailabilityCheckItemName: "cluster availability", + ClusterAvailabilityOkStatus: "ok", + ClusterAvailabilityPanicStatus: "ok", + NormalStatus: "ok", + + CheckItemName: "check item", + CheckItemDetail: "detail", + CheckItemType: "check item type", + CheckItemTarget: "check item target", + CheckItemResult: "check item result", + + ClusterID: "clusterID", + + promotStrFormat: promotStrFormat, + + CheckItemSolution: "check item solution", + } + + StringMap = ChinenseStringMap +) diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/html.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/html.go new file mode 100644 index 0000000000..9ab7fa69c0 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/html.go @@ -0,0 +1,44 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package pluginmanager xxx +package pluginmanager + +import ( + "github.com/PuerkitoBio/goquery" +) + +// GetBizReportHtml xxx +func GetBizReportHtml(bizID string, pluginStr string) (string, error) { + return "", nil +} + +// GetClusterReportHtml xxx +func GetClusterReportHtml(clusterID string, pluginStr string) (string, error) { + return "", nil +} + +// SolutionHtmlTable xxx +type SolutionHtmlTable struct { + ItemName string + ItemType string + ItemTarget string + Level string + Result string + Advise string +} + +// HTMLTable xxx +type HTMLTable struct { + doc *goquery.Document + headers []string +} diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/options.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/options.go new file mode 100644 index 0000000000..448f154acf --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/options.go @@ -0,0 +1,85 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package pluginmanager xxx +package pluginmanager + +import ( + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-manager/api/clustermanager" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + metricsclientset "k8s.io/metrics/pkg/client/clientset/versioned" + "sync" +) + +const ( + RunModeOnce = "once" + RunModeDaemon = "daemon" + + TKECluster = "tke" +) + +// Config Options bcs log options +type Config struct { + ClusterConfigs map[string]*ClusterConfig + NodeConfig NodeConfig + InClusterConfig ClusterConfig +} + +// ClusterConfig xxx +type ClusterConfig struct { + Config *rest.Config + ClusterID string + BusinessID string + Master []string + BCSCluster clustermanager.Cluster + ClusterType string + ClientSet *kubernetes.Clientset + MetricSet *metricsclientset.Clientset + Version string + + // net + ServiceCidr string + Cidr []string + MaskSize int + ServiceMaxNum int + ServiceNum int + + // node + NodeNum int + NodeInfo map[string]plugin.NodeInfo + + // mutex + sync.Mutex +} + +// NodeConfig xxx +type NodeConfig struct { + Config *rest.Config + ClientSet *kubernetes.Clientset + NodeName string + Node *v1.Node + HostPath string +} + +// Validate validate options +func (o *Config) Validate() error { + // if len(o.KubeMaster) == 0 { + // return fmt.Errorf("kube_master cannot be empty") + // } + // if len(o.Kubeconfig) == 0 { + // return fmt.Errorf("kubeconfig cannot be empty") + // } + return nil +} diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/pdf.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/pdf.go new file mode 100644 index 0000000000..b2d4e0b567 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/pdf.go @@ -0,0 +1,202 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. 
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package pluginmanager xxx +package pluginmanager + +import ( + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/jung-kurt/gofpdf" +) + +// GetBizReport xxx +func GetBizReport(bizID string, pluginStr string) (gofpdf.Pdf, error) { + pdf := gofpdf.New("P", "mm", "A3", "") + + return pdf, nil +} + +// GetClusterReport xxx +func GetClusterReport(clusterID string, pluginStr string) (gofpdf.Pdf, error) { + pdf := gofpdf.New("P", "mm", "A3", "") + for _, clusterConfig := range Pm.GetConfig().ClusterConfigs { + if clusterID != clusterConfig.ClusterID { + continue + } + result := Pm.GetClusterResult(pluginStr, clusterID) + + pdf.AddPage() + pdf.AddUTF8Font("tencent", "", "TencentSans-W3.ttf") + pdf.AddUTF8Font("tencent", "B", "TencentSans-W7.ttf") + pdf.SetFont("tencent", "", 12) + pdf.SetXY(0, 10) + + // 打印集群信息 + infoTable := ConvertInfoItemToPDFTable(InfoItem{ItemName: ClusterID, Result: clusterConfig.ClusterID}, "集群信息") + WriteClusterInfo(clusterConfig, infoTable) + util.WritePDFTable(pdf, *infoTable, true) + + // 打印节点信息 + typeNumMap := make(map[string]int) + regionNumMap := make(map[string]int) + zoneNumMap := make(map[string]int) + for _, nodeInfo := range clusterConfig.NodeInfo { + if _, ok := typeNumMap[nodeInfo.Type]; !ok { + typeNumMap[nodeInfo.Type] = 1 + } else { + typeNumMap[nodeInfo.Type] = 
typeNumMap[nodeInfo.Type] + 1 + } + + if _, ok := regionNumMap[nodeInfo.Region]; !ok { + regionNumMap[nodeInfo.Region] = 1 + } else { + regionNumMap[nodeInfo.Region] = regionNumMap[nodeInfo.Region] + 1 + } + + if _, ok := zoneNumMap[nodeInfo.Zone]; !ok { + zoneNumMap[nodeInfo.Zone] = 1 + } else { + zoneNumMap[nodeInfo.Zone] = zoneNumMap[nodeInfo.Zone] + 1 + } + } + + nodeInfoTable := NewInfoItemToPDFTable("节点信息") + for key, value := range typeNumMap { + AddInfoItemToPDFTable(InfoItem{ItemName: "机型:" + key, Result: value}, nodeInfoTable) + } + for key, value := range regionNumMap { + AddInfoItemToPDFTable(InfoItem{ItemName: "地域:" + key, Result: value}, nodeInfoTable) + } + for key, value := range zoneNumMap { + AddInfoItemToPDFTable(InfoItem{ItemName: "可用区:" + key, Result: value}, nodeInfoTable) + } + util.WritePDFTable(pdf, *nodeInfoTable, true) + + // 打印各类检查项 + for pluginName, checkItemList := range result { + checkItemTable := NewCheckItemPDFTable(pluginName) + + for _, checkItem := range checkItemList.Items { + //if checkItem.Normal { + // continue + //} + AddCheckItemToPDFTable(checkItem, checkItemTable) + } + + if checkItemTable.Line == 0 { + continue + } + util.WritePDFTable(pdf, *checkItemTable, true) + } + return pdf, nil + } + return nil, fmt.Errorf("%s not found", clusterID) +} + +// WriteClusterInfo xxx +func WriteClusterInfo(clusterConfig *ClusterConfig, infoTable *util.PDFTable) { + if clusterConfig.BCSCluster.ClusterID != "" { + AddInfoItemToPDFTable(InfoItem{ItemName: "ClusterName", Result: clusterConfig.BCSCluster.ClusterName}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Creator", Result: clusterConfig.BCSCluster.Creator}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Managetype", Result: clusterConfig.BCSCluster.ManageType}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "CreateTime", Result: clusterConfig.BCSCluster.CreateTime}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Systemid", Result: 
clusterConfig.BCSCluster.SystemID}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Vpc", Result: clusterConfig.BCSCluster.VpcID}, infoTable) + } + AddInfoItemToPDFTable(InfoItem{ItemName: "ClusterType", Result: clusterConfig.ClusterType}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "BusinessID", Result: clusterConfig.BusinessID}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Master", Result: clusterConfig.Master}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "ServiceCidr", Result: clusterConfig.ServiceCidr}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "ServiceMaxNum", Result: clusterConfig.ServiceMaxNum}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "ServiceNum", Result: clusterConfig.ServiceNum}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "Cidr", Result: clusterConfig.Cidr}, infoTable) + AddInfoItemToPDFTable(InfoItem{ItemName: "NodeNum", Result: clusterConfig.NodeNum}, infoTable) +} + +// NewInfoItemToPDFTable xxx +func NewInfoItemToPDFTable(title string) *util.PDFTable { + keys := make([]util.Column, 0, 0) + + result := &util.PDFTable{ + Header: append(keys, util.Column{Content: StringMap[CheckItemName]}, util.Column{Content: StringMap[CheckItemResult]}), + Title: util.Column{Content: title}, + Data: [][]util.Column{}, + } + + return result +} + +// ConvertInfoItemToPDFTable xxx +func ConvertInfoItemToPDFTable(item InfoItem, title string) *util.PDFTable { + keys := make([]util.Column, 0, 0) + + result := &util.PDFTable{ + Header: append(keys, util.Column{Content: StringMap[CheckItemName]}, util.Column{Content: StringMap[CheckItemResult]}), + Title: util.Column{Content: item.ItemName}, + Data: [][]util.Column{}, + } + + if title != "" { + result.Title.Content = title + } + + AddInfoItemToPDFTable(item, result) + return result +} + +// AddInfoItemToPDFTable xxx +func AddInfoItemToPDFTable(item InfoItem, table *util.PDFTable) { + values := make([]util.Column, 0, 0) + values = append(values, 
util.Column{Content: item.ItemName}, util.Column{Content: fmt.Sprintf("%v", item.Result)}) + + table.Data = append(table.Data, values) +} + +// NewCheckItemPDFTable xxx +func NewCheckItemPDFTable(title string) *util.PDFTable { + keys := make([]util.Column, 0, 0) + + result := &util.PDFTable{ + Header: append(keys, util.Column{Content: StringMap[CheckItemName]}, util.Column{Content: StringMap[CheckItemTarget]}, + util.Column{Content: StringMap[CheckItemLevel]}, util.Column{Content: StringMap[CheckItemResult]}, util.Column{Content: StringMap[CheckItemDetail]}), + Title: util.Column{Content: title}, + Data: [][]util.Column{}, + } + + return result +} + +// AddCheckItemToPDFTable xxx +func AddCheckItemToPDFTable(item CheckItem, table *util.PDFTable) { + values := make([]util.Column, 0, 0) + if !item.Normal { + values = append(values, util.Column{Content: item.ItemName}, util.Column{Content: item.ItemTarget}, util.Column{Content: item.Level}, + util.Column{Content: item.Status, Color: util.Color{Red: 238, Green: 56, Blue: 43}}, util.Column{Content: item.Detail}) + } else { + values = append(values, util.Column{Content: item.ItemName}, util.Column{Content: item.ItemTarget}, util.Column{Content: item.Level}, + util.Column{Content: item.Status}, util.Column{Content: item.Detail}) + } + table.Line++ + + table.Data = append(table.Data, values) +} + +// SolutionTable xxx +type SolutionTable struct { + ItemName util.Column + ItemType util.Column + ItemTarget util.Column + Level util.Column + Result util.Column + Advise util.Column +} diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin.go new file mode 100644 index 0000000000..0acb59079a --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin.go @@ -0,0 +1,126 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. 
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package pluginmanager xxx +package pluginmanager + +import ( + "sync" +) + +// Plugin is the interface a check plugin implements so it can be registered with the plugin manager +type Plugin interface { + // return plugin name + Name() string + // setup plugin work flow + Setup(configFilePath string, runMode string) error + // stop plugin work flow + Stop() error + // check if plugin result is ready + Ready(string) bool + // get serialized check result + GetResult(string) CheckResult + // get check details for further analysis + GetDetail() interface{} + // Check function for one time execute + Check() +} + +// CheckItem struct to store check result +type CheckItem struct { + // 检查项的名字 集群可用性 .
etc 或者说大类 + ItemName string + // label,检查项的相关信息,会输出在报告和metric中,主要用于nodeagent + Labels map[string]string + // 检查的结果 + Result interface{} +} + +// SetTags xxx +func (i CheckItem) SetTags(key, value string) CheckItem { + i.Tags[key] = value + return i +} + +// SetItemTarget xxx +func (i CheckItem) SetItemTarget(target string) CheckItem { + i.ItemTarget = target + return i +} + +// SetDetail xxx +func (i CheckItem) SetDetail(detail string) CheckItem { + i.Detail = detail + return i +} + +// SetLevel xxx +func (i CheckItem) SetLevel(level string) CheckItem { + i.Level = level + return i +} + +// CheckResult xxx +type CheckResult struct { + Items []CheckItem `yaml:"items"` + InfoItemList []InfoItem `yaml:"infoItems"` +} + +// BasePlugin xxx +type BasePlugin struct { + PluginName string + StopChan chan int + CheckLock sync.Mutex + WriteLock sync.Mutex +} + +// NodePlugin xxx +type NodePlugin struct { + BasePlugin + Result CheckResult +} + +// ClusterPlugin xxx +type ClusterPlugin struct { + BasePlugin + ReadyMap map[string]bool + Result map[string]CheckResult +} + +// GetDetail get check detail +func (p *ClusterPlugin) GetDetail() interface{} { + return false +} + +// PluginInfo xxx +type PluginInfo struct { + Result CheckResult `yaml:"result"` + Detail interface{} `yaml:"detail"` +} diff --git a/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin_manager.go b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin_manager.go new file mode 100644 index 0000000000..4346b56d36 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/pluginmanager/plugin_manager.go @@ -0,0 +1,227 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package pluginmanager xxx +package pluginmanager + +import ( + "fmt" + "k8s.io/klog" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + // Pm xxx + Pm *PluginManager + clusterTotal *prometheus.GaugeVec +) + +func init() { + Pm = NewPluginManager() + // set default metric + clusterTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_total_num", + Help: "cluster_total_num", + }, []string{}) + + prometheus.MustRegister(clusterTotal) +} + +// Register xxx +func Register(plugin Plugin) { + Pm.Register(plugin) +} + +// PluginManager xxx +type PluginManager struct { + plugins map[string]Plugin + config *Config + configLock sync.Mutex + concurrencyLock sync.Mutex + routinePool *util.RoutinePool + clusterReportList map[string]map[string]string +} + +// Register xxx +func (pm *PluginManager) Register(plugin Plugin) { + pm.plugins[plugin.Name()] = plugin +} + +// GetPlugin xxx +func (pm *PluginManager) GetPlugin(plugin string) Plugin { + if p, ok := pm.plugins[plugin]; ok { + return p + } else { + return nil + } +} + +// GetPluginstr xxx +func (pm *PluginManager) GetPluginstr() string { + result := "" + for name, _ := range pm.plugins { + result = fmt.Sprintf("%s,%s", name, result) + } + result = strings.TrimSuffix(result, ",") + return result +} + +// SetConfig configure pluginmanager by config file +func (pm *PluginManager) SetConfig(config *Config) { + pm.configLock.Lock() + defer pm.configLock.Unlock() + if 
config != nil { + pm.config = config + } + + clusterTotal.WithLabelValues().Set(float64(len(pm.config.ClusterConfigs))) +} + +// GetConfig xxx +func (pm *PluginManager) GetConfig() *Config { + pm.configLock.Lock() + defer pm.configLock.Unlock() + return pm.config +} + +// SetClusterReport xxx +func (pm *PluginManager) SetClusterReport(clusterID, name, report string) { + pm.clusterReportList[clusterID][name] = report +} + +// SetupPlugin xxx +func (pm *PluginManager) SetupPlugin(plugins string, pluginDir string, runMode string) error { + var wg sync.WaitGroup + for _, plugin := range strings.Split(plugins, ",") { + if p := pm.GetPlugin(plugin); p == nil { + return fmt.Errorf("Get Plugin %s failed, nil result", plugin) + } else { + wg.Add(1) + go func(plugin string) { + err := p.Setup(filepath.Join(pluginDir, plugin+".conf"), runMode) + if err != nil { + klog.Fatalf("Setup plugin %s failed: %s", p.Name(), err.Error()) + } + wg.Done() + }(plugin) + } + } + wg.Wait() + return nil +} + +// Lock xxx +func (pm *PluginManager) Lock() { + pm.concurrencyLock.Lock() +} + +// UnLock xxx +func (pm *PluginManager) UnLock() { + pm.concurrencyLock.Unlock() +} + +// Add xxx +func (pm *PluginManager) Add() { + pm.routinePool.Add(1) +} + +// Done xxx +func (pm *PluginManager) Done() { + pm.routinePool.Done() +} + +// StopPlugin xxx +func (pm *PluginManager) StopPlugin(plugins string) error { + for _, plugin := range strings.Split(plugins, ",") { + if p := pm.GetPlugin(plugin); p == nil { + return fmt.Errorf("Get Plugin %s failed, nil result", plugin) + } else { + err := p.Stop() + if err != nil { + return fmt.Errorf("StopPlugin plugin %s failed: %s", p.Name(), err.Error()) + } + } + } + return nil +} + +// NewPluginManager xxx +func NewPluginManager() *PluginManager { + + return &PluginManager{ + routinePool: util.NewRoutinePool(50), + plugins: make(map[string]Plugin), + clusterReportList: make(map[string]map[string]string), + } +} + +// Ready xxx +func (pm *PluginManager) 
Ready(pluginStr string, targetID string) bool { + for _, plugin := range strings.Split(pluginStr, ",") { + p := pm.GetPlugin(plugin) + if p == nil { + continue + } + for { + if p.Ready(targetID) { + break + } + klog.Infof("%s for %s is not ready", plugin, targetID) + time.Sleep(5 * time.Second) + } + } + return true +} + +// GetClusterResult waits until the listed plugins are ready for clusterID, then returns each plugin's result keyed by plugin name +func (pm *PluginManager) GetClusterResult(pluginStr string, clusterID string) map[string]CheckResult { + Pm.Ready(pluginStr, clusterID) + result := make(map[string]CheckResult) + for _, plugin := range strings.Split(pluginStr, ",") { + p := pm.GetPlugin(plugin) + result[plugin] = p.GetResult(clusterID) + } + return result +} + +// GetNodeResult returns each registered plugin's node result keyed by plugin name; unknown plugin names are skipped +func (pm *PluginManager) GetNodeResult(pluginStr string) map[string]CheckResult { + result := make(map[string]CheckResult) + for _, plugin := range strings.Split(pluginStr, ",") { + p := pm.GetPlugin(plugin) + if p == nil { + continue + } + result[plugin] = p.GetResult("") + } + return result +} + +// GetNodeDetail returns each registered plugin's check detail keyed by plugin name; unknown plugin names are skipped +func (pm *PluginManager) GetNodeDetail(pluginStr string) map[string]interface{} { + result := make(map[string]interface{}) + for _, plugin := range strings.Split(pluginStr, ",") { + p := pm.GetPlugin(plugin) + if p == nil { + continue + } + result[plugin] = p.GetDetail() + } + return result +} diff --git a/bcs-services/bcs-cluster-reporter/internal/plugin_manager/options.go b/bcs-services/bcs-cluster-reporter/internal/rest/token.go similarity index 56% rename from bcs-services/bcs-cluster-reporter/internal/plugin_manager/options.go rename to bcs-services/bcs-cluster-reporter/internal/rest/token.go index 600a284470..eae8b8186c 100644 --- a/bcs-services/bcs-cluster-reporter/internal/plugin_manager/options.go +++ b/bcs-services/bcs-cluster-reporter/internal/rest/token.go @@ -10,32 +10,32 @@ * limitations under the License.
*/ -package plugin_manager +// Package rest xxx +package rest import ( - "k8s.io/client-go/rest" + "crypto/tls" + "net/http" ) -// Config Options bcs log options -type Config struct { - ClusterConfigs []ClusterConfig - InClusterConfig ClusterConfig +// BcsTransport client +type BcsTransport struct { + Token string } -// ClusterConfig xxx -type ClusterConfig struct { - Config *rest.Config - ClusterID string - BusinessID string -} +// RoundTrip xxx +func (t *BcsTransport) RoundTrip(req *http.Request) (*http.Response, error) { + header := http.Header{} + header.Set("accept", "application/json") + header.Set("Content-Type", "application/json") + header.Set("Authorization", "Bearer "+t.Token) + req.Header = header + + tr := &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, // 设置为 true 来禁用证书验证 + }, + } -// Validate validate options -func (o *Config) Validate() error { - // if len(o.KubeMaster) == 0 { - // return fmt.Errorf("kube_master cannot be empty") - // } - // if len(o.Kubeconfig) == 0 { - // return fmt.Errorf("kubeconfig cannot be empty") - // } - return nil + return tr.RoundTrip(req) } diff --git a/bcs-services/bcs-cluster-reporter/internal/types/process/process.go b/bcs-services/bcs-cluster-reporter/internal/types/process/process.go new file mode 100644 index 0000000000..479a14c998 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/types/process/process.go @@ -0,0 +1,463 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package process xxx +package process + +import ( + "bufio" + "fmt" + "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/util" + "github.com/moby/sys/mountinfo" + "github.com/shirou/gopsutil/process" + "k8s.io/klog/v2" + "os" + "path" + "path/filepath" + "regexp" + "strings" + "syscall" +) + +// HOST_PROC + +var ( + mnt NS = "mnt" + hostPath = util.GetHostPath() + systemdPaths = []string{ + "/etc/systemd/system/", + "/usr/lib/systemd/system/", + } +) + +// GetProcessNS xxx +func GetProcessNS(pid int32, ns NS) (syscall.Stat_t, error) { + mntNSFile := fmt.Sprintf("/proc/%d/ns/%s", pid, ns) + + var stat syscall.Stat_t + err := syscall.Stat(mntNSFile, &stat) + return stat, err +} + +// GetProcessServiceConfigfiles xxx +func GetProcessServiceConfigfiles(starter string) (map[string]string, error) { + serviceFiles := make(map[string]string) + for _, systemdPath := range systemdPaths { + var err error + serviceFiles[starter], err = GetConfigfile(path.Join(hostPath, systemdPath, starter)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + + fileInfo, err := os.Stat(path.Join(hostPath, systemdPath, fmt.Sprintf("%s.d", starter))) + if err == nil { + if fileInfo.IsDir() { + files, err := os.ReadDir(path.Join(hostPath, systemdPath, fmt.Sprintf("%s.d", starter))) + if err != nil { + return nil, err + } + + for _, serviceFile := range files { + if !serviceFile.IsDir() { + serviceFiles[serviceFile.Name()], err = GetConfigfile(path.Join(hostPath, systemdPath, fmt.Sprintf("%s.d", starter), serviceFile.Name())) + if err != nil { + 
return nil, err + } + } + } + } + } + break + } + + return serviceFiles, nil +} + +// GetStarter reports how a process was started: the base name of its /system.slice/ cgroup path when ppid is 1, or crontab, container, cmdline, otherwise other +func GetStarter(pid, ppid int32) (string, error) { + starter := "other" + switch ppid { + case 1: + cgroupFile := fmt.Sprintf("%s/proc/%d/cgroup", hostPath, pid) + file, err := os.Open(cgroupFile) + if err != nil { + klog.Errorf("Get process cgroup info failed: %s", err.Error()) + return starter, err + } + defer file.Close() + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "/system.slice/") { + starter = filepath.Base(strings.Split(line, ":")[2]) + break + } + } + + default: + parent, err := process.NewProcess(ppid) + if err != nil { + klog.Errorf("Get process parent info failed: %s", err.Error()) + return starter, err + } + + parentName, err := parent.Name() + if err != nil { + klog.Errorf("Get process parentName failed: %s", err.Error()) + return starter, err + } + + if parentName == "cron" { + starter = "crontab" + } else if strings.Contains(parentName, "runc") || strings.Contains(parentName, "containerd-shim") { + starter = "container" + } else if strings.Contains(parentName, "-bash") || strings.Contains(parentName, "-sh") { + starter = "cmdline" + } + + } + + return starter, nil +} + +// GetConfigfile returns the content of the file at path as a string +func GetConfigfile(path string) (string, error) { + contentBytes, err := os.ReadFile(path) + if err != nil { + return "", err + } + return string(contentBytes), nil +} + +// GetMountInfoSourcePath returns the mount point of the first entry in PID 1's mountinfo (read under the host path) whose source equals source +func GetMountInfoSourcePath(source string) (string, error) { + f, err := os.Open(fmt.Sprintf("%s/proc/1/mountinfo", util.GetHostPath())) + if err != nil { + return "", fmt.Errorf("open path %s failed: %s", fmt.Sprintf("%s/proc/1/mountinfo", util.GetHostPath()), err.Error()) + } + + mountInfoList, err := mountinfo.GetMountsFromReader(f, nil) + if err != nil { + return "", fmt.Errorf("Get process mountinfo failed: %s", err.Error()) + } + + for _, mountInfo := 
range mountInfoList { + if mountInfo.Source == source { + return mountInfo.Mountpoint, nil + } + } + return "", fmt.Errorf("Get process mountinfo failed: notfound") + +} + +// GetConfigfileList xxx +func GetConfigfileList(params []string, pid int32) (map[string]string, error) { + configFiles := make(map[string]string) + + // 获取该进程mount信息 + f, err := os.Open(fmt.Sprintf("%s/proc/%d/mountinfo", util.GetHostPath(), pid)) + if err != nil { + klog.Errorf("Get process %d ConfigFiles failed: %s", pid, err.Error()) + } + + mountInfoList, err := mountinfo.GetMountsFromReader(f, nil) + if err != nil { + klog.Errorf("Get process %d ConfigFiles failed: %s", pid, err.Error()) + } + for _, param := range params { + if strings.Contains(param, "log") { + continue + } + + re := regexp.MustCompile(`(?:^|=)(/[a-zA-Z0-9\._-]+)+`) + //re := regexp.MustCompile(`(/[^/]+)+`) + paths := re.FindAllString(param, -1) + + for _, configFilepath := range paths { + configFilepath = strings.TrimPrefix(configFilepath, "=") + if strings.HasSuffix(configFilepath, "sock") { + continue + } + + for _, mountInfo := range mountInfoList { + // 还需要判断 /install-cni.sh 此类情况 // 获取到work目录,并找到对应的merged目录 + if mountInfo.Mountpoint == "/" || mountInfo.Source == "tmpfs" { + continue + } + if strings.Contains(configFilepath, mountInfo.Mountpoint) { + sourcePath, err := GetMountInfoSourcePath(mountInfo.Source) + if err != nil { + klog.Errorf("Get process %d ConfigFiles failed: %s", pid, err.Error()) + } + + remainingPath := strings.Replace(configFilepath, mountInfo.Mountpoint, "", -1) + if remainingPath != "" { + configFilepath = path.Join(sourcePath, mountInfo.Root, remainingPath) + } else { + configFilepath = path.Join(sourcePath, mountInfo.Root) + } + + break + } + } + + processConfigFilepath := path.Join(hostPath, configFilepath) + fileInfo, err := os.Stat(processConfigFilepath) + if err != nil { + klog.Infof("%d Get GetConfigfile %s content failed: %s", pid, configFilepath, err.Error()) + 
configFiles[configFilepath] = err.Error() + continue + } + + mode := fileInfo.Mode() + + if mode.IsDir() { + configFiles[configFilepath] = "dir" + } else if mode.IsRegular() { + if mode&0111 != 0 { + configFiles[configFilepath] = "executable" + } else if fileInfo.Size() > 1024*1024*5 { + configFiles[configFilepath] = "data" + } else { + contentBytes, err := os.ReadFile(processConfigFilepath) + if err != nil { + return nil, err + } + + configFiles[configFilepath] = string(contentBytes) + } + } else { + configFiles[configFilepath] = "other" + } + + } + + } + + return configFiles, nil + +} + +// GetProcessStatusByPID xxx +func GetProcessStatusByPID(pid int32) (ProcessStatus, error) { + var p *process.Process + var err error + var processStatus = ProcessStatus{} + + p, err = process.NewProcess(pid) + if err != nil { + klog.Errorf("Get processList failed: %s", err.Error()) + return processStatus, err + } + + processStatus.Name, err = p.Name() + if err != nil { + return processStatus, err + } + + processStatus.CreateTime, err = p.CreateTime() + if err != nil { + return processStatus, err + } + + cpustat, err := p.Times() + if err != nil { + return processStatus, err + } + processStatus.CpuTime = cpustat.User + cpustat.System + + processStatus.Pid, err = p.Ppid() + if err != nil { + return processStatus, err + } + + processStatus.Status, err = p.Status() + if err != nil { + return processStatus, err + } + + return processStatus, nil +} + +// GetProcessStatus xxx +func GetProcessStatus() ([]ProcessStatus, error) { + var processList []*process.Process + var err error + + processList, err = process.Processes() + if err != nil { + klog.Errorf("Get processList failed: %s", err.Error()) + return nil, err + } + + processStatusList := make([]ProcessStatus, 0, 0) + + for _, p := range processList { + processStatus := ProcessStatus{} + processStatus.Name, err = p.Name() + if err != nil { + continue + } + + processStatus.CreateTime, err = p.CreateTime() + if err != nil { + continue 
+ } + + cpustat, err := p.Times() + if err != nil { + continue + } + processStatus.CpuTime = cpustat.User + cpustat.System + + processStatus.Pid = p.Pid + + processStatus.Status, err = p.Status() + if err != nil { + klog.Errorf("Get process ppid status failed: %s", err.Error()) + continue + } + + processStatusList = append(processStatusList, processStatus) + } + + return processStatusList, nil +} + +// GetProcessInfo xxx +func GetProcessInfo(exe string, id int32) (*ProcessInfo, error) { + var processList []*process.Process + var err error + + processList, err = process.Processes() + if err != nil { + klog.Errorf("Get processList failed: %s", err.Error()) + return nil, err + } + + if exe == "" && id == 0 { + return nil, fmt.Errorf("exe is %s, id is %d, not valid", exe, id) + } + + processInfo := &ProcessInfo{ + ConfigFiles: make(map[string]string), + } + + for _, p := range processList { + if id != 0 && p.Pid != id { + continue + } + + name, err := p.Name() + if err != nil { + continue + } + + if exe != "" && !strings.Contains(exe, name) { + continue + } + + processInfo.Params, err = p.CmdlineSlice() + if err != nil { + klog.Errorf("Get process cmdline info failed: %s", err.Error()) + continue + } + + if len(processInfo.Params) == 0 { + continue + } + processInfo.BinaryPath = processInfo.Params[0] + + filename := filepath.Base(processInfo.BinaryPath) + if filename != exe && exe != "" { + continue + } + + ppid, err := p.Ppid() + if err != nil { + klog.Errorf("Get process ppid info failed: %s", err.Error()) + continue + } + + processInfo.Starter, err = GetStarter(p.Pid, ppid) + if err != nil { + klog.Errorf("Get process starter info failed: %s", err.Error()) + } + + //processInfo.Params = AddProcessParam(exe, processInfo.Params) + if len(processInfo.Params) > 1 { + processInfo.ConfigFiles, err = GetConfigfileList(processInfo.Params[1:], p.Pid) + if err != nil { + klog.Errorf("Get process %d ConfigFiles failed: %s", p.Pid, err.Error()) + } + } + + if 
strings.HasSuffix(processInfo.Starter, ".service") { + processInfo.ServiceFiles, err = GetProcessServiceConfigfiles(processInfo.Starter) + if err != nil { + klog.Infof("Get process %d ServiceFiles failed: %s", p.Pid, err.Error()) + } + } + + processInfo.Status, err = p.Status() + if err != nil { + klog.Infof("Get process %d status failed: %s", p.Pid, err.Error()) + } + + return processInfo, nil + } + + return nil, fmt.Errorf("%s process not found", exe) +} + +// AddProcessParam xxx +func AddProcessParam(exe string, params []string) []string { + // 有些进程会有默认读取的配置文件路径 + if strings.Contains(exe, "docker") { + configFileFlag := false + for _, param := range params { + if strings.Contains(param, "config-file") { + configFileFlag = true + break + } + } + if !configFileFlag { + params = append(params, "--config-file") + params = append(params, "/etc/docker/daemon.json") + } + } else if strings.Contains(exe, "containerd") { + configFileFlag := false + for _, param := range params { + if strings.Contains(param, "--config") { + configFileFlag = true + break + } + } + + if !configFileFlag { + params = append(params, "--config") + params = append(params, "/etc/containerd/config.toml") + } + } else if strings.Contains(exe, "coredns") { + params = append(params, "/etc/resolv.conf") + } + + return params +} + +// GetProcess xxx +func GetProcess(id int32) (*process.Process, error) { + return process.NewProcess(id) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/types/process/process_test.go b/bcs-services/bcs-cluster-reporter/internal/types/process/process_test.go new file mode 100644 index 0000000000..9920d3ef96 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/types/process/process_test.go @@ -0,0 +1,26 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package process xxx +package process + +import ( + "fmt" + "testing" +) + +func TestGetProcessStatus(t *testing.T) { + result, _ := GetProcessStatus() + for _, status := range result { + fmt.Println(status) + } +} diff --git a/bcs-services/bcs-cluster-reporter/internal/types/process/types.go b/bcs-services/bcs-cluster-reporter/internal/types/process/types.go new file mode 100644 index 0000000000..85783793f9 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/types/process/types.go @@ -0,0 +1,40 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+// Package process xxx
+package process
+
+// ProcessInfo describes one process: how it was started, its command line and related files.
+type ProcessInfo struct {
+	Starter      string // how the process was started: "other", "crontab", "container", "cmdline" or a system.slice cgroup name
+	BinaryPath   string // first element of the command line
+	Params       []string // command line split into arguments
+	Env          []string // NOTE(review): presumably the process environment; not filled in by this package - confirm
+	ConfigFiles  map[string]string // path found in the arguments -> file content, or "dir", "executable", "data", "other" or an error text
+	ServiceFiles map[string]string // systemd unit and drop-in file name -> file content
+	Status       string // process status as reported by gopsutil
+	// config file modification time, process start time,
+}
+
+// ProcessStatus is a lightweight status snapshot of one process.
+type ProcessStatus struct {
+	Name       string // process name
+	Pid        int32 // process id
+	Status     string // process status as reported by gopsutil
+	CreateTime int64 // creation time as returned by gopsutil (presumably milliseconds since the epoch - confirm)
+	CpuTime    float64 // user plus system CPU time
+
+	// config file modification time, process start time,
+}
+
+// NS is the name of a namespace entry below /proc/<pid>/ns, for example "mnt".
+type NS string
diff --git a/bcs-services/bcs-cluster-reporter/internal/util/cache.go b/bcs-services/bcs-cluster-reporter/internal/util/cache.go
new file mode 100644
index 0000000000..dbaea712f6
--- /dev/null
+++ b/bcs-services/bcs-cluster-reporter/internal/util/cache.go
@@ -0,0 +1,39 @@
+/*
+ * Tencent is pleased to support the open source community by making Blueking Container Service available.
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Licensed under the MIT License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ * http://opensource.org/licenses/MIT
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Package util xxx
+package util
+
+import (
+	"github.com/patrickmn/go-cache"
+	"time"
+)
+
+var (
+	DefaultCache = cache.New(20*time.Minute, 20*time.Minute) // process-wide in-memory cache; both go-cache durations (default expiration, cleanup interval) are 20 minutes
+)
+
+// GetCache looks key up in DefaultCache and returns the stored value and whether it was found.
+func GetCache(key string) (interface{}, bool) {
+	result, exist := DefaultCache.Get(key)
+	return result, exist
+}
+
+// SetCache stores value under key in DefaultCache with an expiration of one hour.
+func SetCache(key string, value interface{}) {
+	DefaultCache.Set(key, value, time.Hour)
+}
+
+// SetCacheWithTimeout stores value under key in DefaultCache with the given expiration duration.
+func SetCacheWithTimeout(key string, value interface{}, duration time.Duration) {
+	DefaultCache.Set(key, value, duration)
+}
diff --git a/bcs-services/bcs-cluster-reporter/internal/util/crypt.go b/bcs-services/bcs-cluster-reporter/internal/util/crypt.go
deleted file mode 100644
index 148f4dc894..0000000000
--- a/bcs-services/bcs-cluster-reporter/internal/util/crypt.go
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Tencent is pleased to support the open source community by making Blueking Container Service available.,
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
- * Licensed under the MIT License (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- * http://opensource.org/licenses/MIT
- * Unless required by applicable law or agreed to in writing, software distributed under,
- * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
- * either express or implied. See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -// Package util -package util - -import ( - "bytes" - "crypto/aes" - "crypto/cipher" - "crypto/sha1" - "encoding/base64" - "encoding/binary" - "encoding/xml" - "fmt" - "math/rand" - "runtime/debug" - "sort" - "strings" - - "github.com/Tencent/bk-bcs/bcs-common/common/encrypt" - "k8s.io/klog/v2" -) - -const letterBytes = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" - -const ( - // ValidateSignatureError - ValidateSignatureError int = -40001 - // ParseXmlError - ParseXmlError int = -40002 - // ComputeSignatureError - ComputeSignatureError int = -40003 - // IllegalAesKey - IllegalAesKey int = -40004 - // ValidateCorpidError - ValidateCorpidError int = -40005 - // EncryptAESError - EncryptAESError int = -40006 - // DecryptAESError - DecryptAESError int = -40007 - // IllegalBuffer - IllegalBuffer int = -40008 - // EncodeBase64Error - EncodeBase64Error int = -40009 - // DecodeBase64Error - DecodeBase64Error int = -40010 - // GenXmlError - GenXmlError int = -40010 - // ParseJsonError - ParseJsonError int = -40012 - // GenJsonError - GenJsonError int = -40013 - // IllegalProtocolType - IllegalProtocolType int = -40014 -) - -// ProtocolType xxx -type ProtocolType int - -const ( - // XmlType xxx - XmlType ProtocolType = 1 -) - -// CryptError xxx -type CryptError struct { - ErrCode int - ErrMsg string -} - -// NewCryptError xxx -func NewCryptError(err_code int, err_msg string) *CryptError { - return &CryptError{ErrCode: err_code, ErrMsg: err_msg} -} - -// WXBizMsg4Recv xxx -type WXBizMsg4Recv struct { - Tousername string `xml:"ToUserName"` - Encrypt string `xml:"Encrypt"` - Agentid string `xml:"AgentID"` -} - -// CDATA xxx -type CDATA struct { - Value string `xml:",cdata"` -} - -// WXBizMsg4Send xxx -type WXBizMsg4Send struct { - XMLName xml.Name `xml:"xml"` - Encrypt CDATA `xml:"Encrypt"` - Signature CDATA `xml:"MsgSignature"` - Timestamp string `xml:"TimeStamp"` - Nonce CDATA `xml:"Nonce"` -} - -// NewWXBizMsg4Send xxx -func 
NewWXBizMsg4Send(encrypt, signature, timestamp, nonce string) *WXBizMsg4Send { - return &WXBizMsg4Send{Encrypt: CDATA{Value: encrypt}, Signature: CDATA{Value: signature}, Timestamp: timestamp, - Nonce: CDATA{Value: nonce}} -} - -// ProtocolProcessor xxx -type ProtocolProcessor interface { - parse(src_data []byte) (*WXBizMsg4Recv, *CryptError) - serialize(msg_send *WXBizMsg4Send) ([]byte, *CryptError) -} - -// WXBizMsgCrypt xxx -type WXBizMsgCrypt struct { - token string - encoding_aeskey string - receiver_id string - protocol_processor ProtocolProcessor -} - -// XmlProcessor xxx -type XmlProcessor struct { -} - -func (xmlp *XmlProcessor) parse(src_data []byte) (*WXBizMsg4Recv, *CryptError) { - var msg4_recv WXBizMsg4Recv - err := xml.Unmarshal(src_data, &msg4_recv) - if nil != err { - return nil, NewCryptError(ParseXmlError, "xml to msg fail") - } - return &msg4_recv, nil -} - -func (xmlp *XmlProcessor) serialize(msg4_send *WXBizMsg4Send) ([]byte, *CryptError) { - xml_msg, err := xml.Marshal(msg4_send) - if nil != err { - return nil, NewCryptError(GenXmlError, err.Error()) - } - return xml_msg, nil -} - -// NewWXBizMsgCrypt -func NewWXBizMsgCrypt(token, encoding_aeskey, receiver_id string, protocol_type ProtocolType) *WXBizMsgCrypt { - var protocol_processor ProtocolProcessor - if protocol_type != XmlType { - panic("unsupport protocal") - } else { - protocol_processor = new(XmlProcessor) - } - - return &WXBizMsgCrypt{token: token, encoding_aeskey: (encoding_aeskey + "="), receiver_id: receiver_id, - protocol_processor: protocol_processor} -} - -func (wx *WXBizMsgCrypt) randString(n int) string { - b := make([]byte, n) - for i := range b { - b[i] = letterBytes[rand.Int63()%int64(len(letterBytes))] - } - return string(b) -} - -func (wx *WXBizMsgCrypt) pKCS7Padding(plaintext string, block_size int) []byte { - padding := block_size - (len(plaintext) % block_size) - padtext := bytes.Repeat([]byte{byte(padding)}, padding) - var buffer bytes.Buffer - 
buffer.WriteString(plaintext) - buffer.Write(padtext) - return buffer.Bytes() -} - -func (wx *WXBizMsgCrypt) pKCS7Unpadding(plaintext []byte, block_size int) ([]byte, *CryptError) { - plaintext_len := len(plaintext) - if nil == plaintext || plaintext_len == 0 { - return nil, NewCryptError(DecryptAESError, "pKCS7Unpadding error nil or zero") - } - if plaintext_len%block_size != 0 { - return nil, NewCryptError(DecryptAESError, "pKCS7Unpadding text not a multiple of the block size") - } - padding_len := int(plaintext[plaintext_len-1]) - return plaintext[:plaintext_len-padding_len], nil -} - -func (wx *WXBizMsgCrypt) cbcEncrypter(plaintext string) ([]byte, *CryptError) { - aeskey, err := base64.StdEncoding.DecodeString(wx.encoding_aeskey) - if nil != err { - return nil, NewCryptError(DecodeBase64Error, err.Error()) - } - const block_size = 32 - pad_msg := wx.pKCS7Padding(plaintext, block_size) - - block, err := aes.NewCipher(aeskey) - if err != nil { - return nil, NewCryptError(EncryptAESError, err.Error()) - } - - ciphertext := make([]byte, len(pad_msg)) - iv := aeskey[:aes.BlockSize] - - mode := cipher.NewCBCEncrypter(block, iv) - - mode.CryptBlocks(ciphertext, pad_msg) - base64_msg := make([]byte, base64.StdEncoding.EncodedLen(len(ciphertext))) - base64.StdEncoding.Encode(base64_msg, ciphertext) - - return base64_msg, nil -} - -func (wx *WXBizMsgCrypt) cbcDecrypter(base64_encrypt_msg string) ([]byte, *CryptError) { - aeskey, err := base64.StdEncoding.DecodeString(wx.encoding_aeskey) - if nil != err { - return nil, NewCryptError(DecodeBase64Error, err.Error()) - } - - encrypt_msg, err := base64.StdEncoding.DecodeString(base64_encrypt_msg) - if nil != err { - return nil, NewCryptError(DecodeBase64Error, err.Error()) - } - - block, err := aes.NewCipher(aeskey) - if err != nil { - return nil, NewCryptError(DecryptAESError, err.Error()) - } - - if len(encrypt_msg) < aes.BlockSize { - return nil, NewCryptError(DecryptAESError, "encrypt_msg size is not valid") - } - - iv 
:= aeskey[:aes.BlockSize] - - if len(encrypt_msg)%aes.BlockSize != 0 { - return nil, NewCryptError(DecryptAESError, "encrypt_msg not a multiple of the block size") - } - - mode := cipher.NewCBCDecrypter(block, iv) - - mode.CryptBlocks(encrypt_msg, encrypt_msg) - - return encrypt_msg, nil -} - -func (wx *WXBizMsgCrypt) calSignature(timestamp, nonce, data string) string { - sort_arr := []string{wx.token, timestamp, nonce, data} - sort.Strings(sort_arr) - var buffer bytes.Buffer - for _, value := range sort_arr { - buffer.WriteString(value) - } - - sha := sha1.New() - sha.Write(buffer.Bytes()) - signature := fmt.Sprintf("%x", sha.Sum(nil)) - return string(signature) -} - -// ParsePlainText xxx -func (wx *WXBizMsgCrypt) ParsePlainText(plaintext []byte) ([]byte, uint32, []byte, []byte, *CryptError) { - const block_size = 32 - plaintext, err := wx.pKCS7Unpadding(plaintext, block_size) - if nil != err { - return nil, 0, nil, nil, err - } - - text_len := uint32(len(plaintext)) - if text_len < 20 { - return nil, 0, nil, nil, NewCryptError(IllegalBuffer, "plain is to small 1") - } - random := plaintext[:16] - msg_len := binary.BigEndian.Uint32(plaintext[16:20]) - if text_len < (20 + msg_len) { - return nil, 0, nil, nil, NewCryptError(IllegalBuffer, "plain is to small 2") - } - - msg := plaintext[20 : 20+msg_len] - receiver_id := plaintext[20+msg_len:] - - return random, msg_len, msg, receiver_id, nil -} - -// VerifyURL xxx -func (wx *WXBizMsgCrypt) VerifyURL(msg_signature, timestamp, nonce, echostr string) ([]byte, *CryptError) { - signature := wx.calSignature(timestamp, nonce, echostr) - - if strings.Compare(signature, msg_signature) != 0 { - return nil, NewCryptError(ValidateSignatureError, "signature not equal") - } - - plaintext, err := wx.cbcDecrypter(echostr) - if nil != err { - return nil, err - } - - _, _, msg, receiver_id, err := wx.ParsePlainText(plaintext) - if nil != err { - return nil, err - } - - if len(wx.receiver_id) > 0 && 
strings.Compare(string(receiver_id), wx.receiver_id) != 0 { - fmt.Println(string(receiver_id), wx.receiver_id, len(receiver_id), len(wx.receiver_id)) - return nil, NewCryptError(ValidateCorpidError, "receiver_id is not equil") - } - - return msg, nil -} - -// EncryptMsg xxx -func (wx *WXBizMsgCrypt) EncryptMsg(reply_msg, timestamp, nonce string) ([]byte, *CryptError) { - rand_str := wx.randString(16) - var buffer bytes.Buffer - buffer.WriteString(rand_str) - - msg_len_buf := make([]byte, 4) - binary.BigEndian.PutUint32(msg_len_buf, uint32(len(reply_msg))) - buffer.Write(msg_len_buf) - buffer.WriteString(reply_msg) - buffer.WriteString(wx.receiver_id) - - tmp_ciphertext, err := wx.cbcEncrypter(buffer.String()) - if nil != err { - return nil, err - } - ciphertext := string(tmp_ciphertext) - - signature := wx.calSignature(timestamp, nonce, ciphertext) - - msg4_send := NewWXBizMsg4Send(ciphertext, signature, timestamp, nonce) - return wx.protocol_processor.serialize(msg4_send) -} - -// DecryptMsg xxx -func (wx *WXBizMsgCrypt) DecryptMsg(msg_signature, timestamp, nonce string, post_data []byte) ([]byte, *CryptError) { - msg4_recv, crypt_err := wx.protocol_processor.parse(post_data) - if nil != crypt_err { - return nil, crypt_err - } - - signature := wx.calSignature(timestamp, nonce, msg4_recv.Encrypt) - - if strings.Compare(signature, msg_signature) != 0 { - return nil, NewCryptError(ValidateSignatureError, "signature not equal") - } - - plaintext, crypt_err := wx.cbcDecrypter(msg4_recv.Encrypt) - if nil != crypt_err { - return nil, crypt_err - } - - _, _, msg, receiver_id, crypt_err := wx.ParsePlainText(plaintext) - if nil != crypt_err { - return nil, crypt_err - } - - if len(wx.receiver_id) > 0 && strings.Compare(string(receiver_id), wx.receiver_id) != 0 { - return nil, NewCryptError(ValidateCorpidError, "receiver_id is not equil") - } - - return msg, nil -} - -// Decode xxx -func Decode(data string) string { - defer func() { - if r := recover(); r != nil { - 
klog.Fatalf("decrypt token failed: %s, %s", r, string(debug.Stack())) - } - }() - - if data == "" { - klog.Errorf("can not be blank") - } - - decryptedData, err := encrypt.DesDecryptFromBase([]byte(data)) - if err != nil { - klog.Fatalf("decrypt token failed: %s", err.Error()) - } - - return string(decryptedData) -} - -// Encode xxx -func Encode(data string) string { - defer func() { - if r := recover(); r != nil { - klog.Fatalf("enctypt token failed: %s, %s", r, string(debug.Stack())) - } - }() - - if data == "" { - klog.Errorf("can not be blank") - } - - decryptedData, err := encrypt.DesEncryptToBase([]byte(data)) - if err != nil { - klog.Fatalf("enctypt token failed: %s", err.Error()) - } - - return string(decryptedData) -} diff --git a/bcs-services/bcs-cluster-reporter/internal/util/html.go b/bcs-services/bcs-cluster-reporter/internal/util/html.go new file mode 100644 index 0000000000..0bd4d57135 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/util/html.go @@ -0,0 +1,183 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package util xxx +package util + +import ( + "github.com/PuerkitoBio/goquery" + "golang.org/x/net/html" + "strings" +) + +var ( + HtmlEmailTemplate = ` + + + +   + + + + + +
+ +

xxxxx

+
+ +{{ range .}} +

{{ .Title }}

+ + + {{ range .Headers}} + + {{ end}} + + {{ range .Data}} + + {{ range . }} + + {{ end}} + + {{ end}} +
{{ . }}
{{ . }}
+{{ end}} + + + + + + + + + + +
11 22 + 11 22 + 11 22 +
11 22 +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Header 1Header 2Header 3Header 4
Row 1, Cell 1Row 1, Cell 2Row 1, Cell 3
Row 2, Cell 1Row 2, Cell 2 (spanning 2 columns)
Row 3, Cell 1Row 3, Cell 2Row 3, Cell 3Row 3, Cell 4
Row 4, Cell 1
+ + +` +) + +// HtmlEmail XXX +type HtmlEmail struct { + content string + doc *goquery.Document +} + +// HtmlTable xxx +type HtmlTable struct { + Title string + Headers []string + Data [][]string +} + +// NewHtmlEmail xxx +func NewHtmlEmail() *HtmlEmail { + // test goquery + doc, _ := goquery.NewDocumentFromReader(strings.NewReader(` + +   + + + + + +`)) + //doc.Find("p").AfterHtml("

Text for article #2222222

") + //doc.Find("body").AppendHtml("

Text for article #333333

") + + return &HtmlEmail{ + doc: doc, + } +} + +// GetText xxx +func (h *HtmlEmail) GetText() string { + ret, err := h.doc.Html() + if err != nil { + return err.Error() + } + return ret +} + +// Append xxx +func (h *HtmlEmail) Append(html string) { + h.doc.Find("body").AppendHtml(html) +} + +// AppendNodes xxx +func (h *HtmlEmail) AppendNodes(ns ...*html.Node) { + h.doc.Find("body").AppendNodes(ns...) +} diff --git a/bcs-services/bcs-cluster-reporter/internal/util/html_test.go b/bcs-services/bcs-cluster-reporter/internal/util/html_test.go new file mode 100644 index 0000000000..c7d868219f --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/util/html_test.go @@ -0,0 +1 @@ +package util diff --git a/bcs-services/bcs-cluster-reporter/internal/util/log.go b/bcs-services/bcs-cluster-reporter/internal/util/log.go new file mode 100644 index 0000000000..b4d7a93d17 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/util/log.go @@ -0,0 +1,203 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+// Package util xxx
+package util
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"k8s.io/klog/v2"
+	"os"
+	"strings"
+	"syscall"
+	"time"
+)
+
+// LogFile follows a log file: it remembers the read offset and the inode of the open
+// file and hands out the lines appended since the previous check.
+type LogFile struct {
+	filename      string
+	file          *os.File
+	pos           int64         // offset up to which the file has been consumed
+	ino           uint64        // inode of the open file, used to detect rotation
+	LogChann      chan string   // receives matching lines once Start has been called
+	searchKeyList []string      // when non-empty, only lines containing one of these keys are reported
+}
+
+// openLogFile opens filename and positions the reader at the current end of the file,
+// so only lines written afterwards are reported.
+func openLogFile(filename string) (*LogFile, error) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+
+	info, err := file.Stat()
+	if err != nil {
+		file.Close()
+		return nil, err
+	}
+
+	stat, ok := info.Sys().(*syscall.Stat_t)
+	if !ok {
+		file.Close()
+		return nil, fmt.Errorf("failed to get inode number for %s", filename)
+	}
+
+	return &LogFile{filename: filename, file: file, pos: info.Size(), ino: stat.Ino, LogChann: make(chan string)}, nil
+	//return &LogFile{filename: filename, file: file, pos: 0, ino: stat.Ino, LogChann: make(chan string)}, nil
+}
+
+// SetSearchKey restricts reported lines to those containing at least one of the keys.
+// An empty or nil list reports every line.
+func (f *LogFile) SetSearchKey(searchKeyList []string) {
+	f.searchKeyList = searchKeyList
+}
+
+// Start launches a goroutine that checks the file every 10 seconds and sends new
+// matching lines to LogChann. On the first error the channel is closed and the
+// goroutine exits.
+func (f *LogFile) Start() {
+	go func() {
+		for {
+			err := f.checkNewEntries()
+			if err != nil {
+				fmt.Println("Error checking file:", err)
+				close(f.LogChann)
+				break
+			}
+
+			time.Sleep(10 * time.Second)
+		}
+	}()
+}
+
+// CheckNewEntriesOnce returns the matching lines appended since the previous check.
+// The result is empty, not nil, when there is nothing new.
+func (f *LogFile) CheckNewEntriesOnce() ([]string, error) {
+	return f.readNewLines()
+}
+
+// checkNewEntries sends the matching lines appended since the previous check to LogChann.
+func (f *LogFile) checkNewEntries() error {
+	lines, err := f.readNewLines()
+	if err != nil {
+		return err
+	}
+	for _, line := range lines {
+		f.LogChann <- line
+	}
+	return nil
+}
+
+// reopenIfRotated switches to the file currently found at f.filename when it is no longer
+// the file that is open. The path is stat-ed instead of the open descriptor, because the
+// inode of an open descriptor never changes. Only the handle, inode and offset are
+// replaced, so LogChann and the search keys stay as the caller configured them.
+func (f *LogFile) reopenIfRotated() error {
+	pathInfo, err := os.Stat(f.filename)
+	if err != nil {
+		if os.IsNotExist(err) {
+			// The replacement file has not been created yet; keep the old handle.
+			return nil
+		}
+		return err
+	}
+
+	stat, ok := pathInfo.Sys().(*syscall.Stat_t)
+	if !ok {
+		return fmt.Errorf("failed to get inode number for %s", f.filename)
+	}
+	if stat.Ino == f.ino {
+		return nil
+	}
+
+	// The file has been rotated: close the old file and open the new one.
+	klog.Infof("%s file already changed, reopen it.", f.filename)
+	newFile, err := openLogFile(f.filename)
+	if err != nil {
+		return err
+	}
+	f.file.Close()
+	f.file = newFile.file
+	f.ino = newFile.ino
+	// A rotated-in file is new as a whole, so read it from the beginning.
+	f.pos = 0
+	return nil
+}
+
+// matchesSearchKey reports whether line should be reported: always when no search keys
+// are set, otherwise when it contains at least one of them.
+func (f *LogFile) matchesSearchKey(line string) bool {
+	if len(f.searchKeyList) == 0 {
+		return true
+	}
+	// mce: [Hardware Error]
+	for _, key := range f.searchKeyList {
+		if strings.Contains(line, key) {
+			return true
+		}
+	}
+	return false
+}
+
+// readNewLines reads everything between the remembered offset and the current end of the
+// file and returns the lines that match the search keys, each line at most once.
+func (f *LogFile) readNewLines() ([]string, error) {
+	if err := f.reopenIfRotated(); err != nil {
+		return nil, err
+	}
+
+	info, err := f.file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	if info.Size() < f.pos {
+		// The file was truncated in place; its content starts over from offset 0.
+		f.pos = 0
+	}
+
+	result := make([]string, 0)
+	if info.Size() <= f.pos {
+		return result, nil
+	}
+
+	if _, err := f.file.Seek(f.pos, io.SeekStart); err != nil {
+		return nil, err
+	}
+
+	scanner := bufio.NewScanner(f.file)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if f.matchesSearchKey(line) {
+			result = append(result, line)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	f.pos = info.Size()
+	return result, nil
+}
+
+// NewLogFile opens path for following. It prints the error and returns nil when the file
+// cannot be opened.
+func NewLogFile(path string) *LogFile {
+	logFile, err := openLogFile(path)
+	if err != nil {
+		fmt.Println("Error opening file:", err)
+		return nil
+	}
+
+	return logFile
+}
diff --git a/bcs-services/bcs-cluster-reporter/internal/util/pdf.go b/bcs-services/bcs-cluster-reporter/internal/util/pdf.go
new file mode 100644
index 0000000000..c96cfdac5c
--- /dev/null
+++ b/bcs-services/bcs-cluster-reporter/internal/util/pdf.go
@@ -0,0 +1,268 @@
+/*
+ * Tencent is pleased to support the open source community by making Blueking Container Service available.
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package util xxx +package util + +import ( + "github.com/jung-kurt/gofpdf" + "math" + "strings" +) + +// PDFTable xxx +type PDFTable struct { + Data [][]Column + Header []Column + Title Column + Line int +} + +// Column xxx +type Column struct { + Content string + Color Color +} + +// Color xxx +type Color struct { + Red int + Green int + Blue int +} + +const ( + maxRowLineNum = 5 + minRowLineWidth = 80 +) + +// WritePDFTable xxx +func WritePDFTable(pdf *gofpdf.Fpdf, pdfTable PDFTable, wrap bool) { + pdf.SetTextColor(0, 0, 0) // 设置表格文本颜色 + pdf.SetDrawColor(0, 0, 0) // 设置表格边框颜色 + pdf.SetLineWidth(0.2) // 设置表格边框宽度 + + // 打印标题 + WritePDFTableTtile(pdf, pdfTable.Title) + + // 打印tablebody + WritePDFTableBody(pdf, pdfTable.Header, pdfTable.Data) + + // 与后面的表格隔开 + if wrap { + pdf.Ln(-1) + } +} + +// WritePDFTableTtile xxx +func WritePDFTableTtile(pdf *gofpdf.Fpdf, title Column) { + pdf.SetDrawColor(0, 0, 0) // 设置表格边框颜色 + pdf.SetLineWidth(0.2) // 设置表格边框宽度 + pdf.SetFont("tencent", "", 12) + + pageWidth, _ := pdf.GetPageSize() + // 打印标题 + y := pdf.GetY() + + pdf.SetXY(0, y) + + pdf.SetTextColor(title.Color.Red, title.Color.Green, title.Color.Blue) // 设置表格文本颜色 + pdf.MultiCell(pageWidth, 10, title.Content, "0", "L", false) +} + +// WritePDFTableBody xxx +func WritePDFTableBody(pdf *gofpdf.Fpdf, headers []Column, rowList [][]Column) { + pdf.SetTextColor(0, 0, 0) // 设置表格文本颜色 + 
pdf.SetDrawColor(0, 0, 0) // 设置表格边框颜色 + pdf.SetLineWidth(0.2) // 设置表格边框宽度 + + pageWidth, pageHeight := pdf.GetPageSize() + var lineWidth = pageWidth - 10 + var lineHeight float64 = 5 + columnWidthList := GetcolumnWidthList(append(rowList, headers), lineWidth, pdf) + + // 计算当前页是否足够打印 + var lineNumSum float64 + for _, row := range rowList { + var lines float64 = 1 + for index, value := range row { + relLines := getStringLines(value.Content, columnWidthList[index], pdf) + if relLines > lines { + lines = relLines + } + } + lineNumSum = lineNumSum + 1 + } + + y := pdf.GetY() + pageNo := pdf.PageNo() + if (pageHeight - y) < (lineNumSum * lineHeight) { + pdf.SetPage(pageNo + 1) + } + + startX := (pageWidth - lineWidth) / 2 + if startX < 0 { + startX = 0 + } + + // 设置表头 + pdf.SetFont("tencent", "", 12) + pdf.SetFillColor(182, 215, 228) // 设置表格背景颜色 + for index, header := range headers { + // 防止自动换页时,每一列都另起一页 + if pdf.PageNo() > pageNo { + pageNo = pdf.PageNo() + pdf.SetPage(pageNo) + _, top, _, _ := pdf.GetMargins() + y = top + } + + pdf.SetXY(startX, y) + pdf.SetTextColor(header.Color.Red, header.Color.Green, header.Color.Blue) // 设置表格文本颜色 + pdf.MultiCell(columnWidthList[index], lineHeight, header.Content, "1", "L", true) + startX = startX + columnWidthList[index] + + } + + // 打印表格 + pdf.SetFont("tencent", "", 12) + pdf.SetFillColor(240, 240, 240) // 设置表格背景颜色 + for _, row := range rowList { + var lines float64 = 1 + for index, value := range row { + relLines := getStringLines(value.Content, columnWidthList[index], pdf) + if relLines > lines { + lines = relLines + } + } + + startX = (pageWidth - lineWidth) / 2 + y = pdf.GetY() + for index, value := range row { + // 防止自动换页时,每一列都另起一页 + if pdf.PageNo() > pageNo { + pageNo = pdf.PageNo() + pdf.SetPage(pageNo) + _, top, _, _ := pdf.GetMargins() + y = top + } + + if value.Content == "" { + value.Content = " " + } + pdf.SetXY(startX, y) + pdf.SetTextColor(value.Color.Red, value.Color.Green, value.Color.Blue) // 设置表格文本颜色 + 
pdf.MultiCell(columnWidthList[index], + lineHeight*(lines/getStringLines(value.Content, columnWidthList[index], pdf)), + value.Content, "1", "L", true) + startX = startX + columnWidthList[index] + } + } +} + +// WriteHorizontalPDFTableLine xxx +func WriteHorizontalPDFTableLine(columnList []Column, columnWidthList []float64, pdf *gofpdf.Fpdf) { + y := pdf.GetY() + pageWidth, _ := pdf.GetPageSize() + lineWidth := pageWidth - 10 + startX := (pageWidth - lineWidth) / 2 + var lineHeight float64 = 5 + + var lines float64 = 1 + for index, value := range columnList { + relLines := getStringLines(value.Content, columnWidthList[index], pdf) + if relLines > lines { + lines = relLines + } + } + + pageNo := pdf.PageNo() + for index, value := range columnList { + // 防止自动换页时,每一列都另起一页 + if pdf.PageNo() > pageNo { + pageNo = pdf.PageNo() + pdf.SetPage(pageNo) + _, top, _, _ := pdf.GetMargins() + y = top + } + + if value.Content == "" { + value.Content = " " + } + pdf.SetXY(startX, y) + if (index % 2) == 0 { + pdf.SetFont("tencent", "", 12) + pdf.SetFillColor(182, 215, 228) + pdf.SetTextColor(value.Color.Red, value.Color.Green, value.Color.Blue) // 设置表格文本颜色 + pdf.MultiCell(columnWidthList[index], lineHeight*(lines/getStringLines(value.Content, columnWidthList[index], pdf)), value.Content, "1", "L", true) + + } else { + pdf.SetFont("tencent", "", 12) + pdf.SetFillColor(240, 240, 240) + pdf.SetTextColor(value.Color.Red, value.Color.Green, value.Color.Blue) // 设置表格文本颜色 + pdf.MultiCell(columnWidthList[index], lineHeight*(lines/getStringLines(value.Content, columnWidthList[index], pdf)), value.Content, "1", "L", false) + } + startX = startX + columnWidthList[index] + } +} + +// 判断该字符串最终会被打印成多少行 +func getStringLines(str string, width float64, pdf *gofpdf.Fpdf) float64 { + var relLines float64 + for _, line := range strings.Split(str, "\n") { + addLines := math.Ceil((pdf.GetStringWidth(line)) / (width - 2)) // 每行实际上还有margin的宽度 + if addLines == 0 { + addLines = 1 + } + relLines = relLines + 
addLines + } + return relLines +} + +// GetcolumnWidthList xxx +func GetcolumnWidthList(lines [][]Column, lineWidth float64, pdf *gofpdf.Fpdf) []float64 { + maxWidth := lineWidth + var columnWidthList = make([]float64, len(lines[0]), len(lines[0])) + for _, row := range lines { + for index, value := range row { + // 增加5宽度做margin和冗余 + if columnWidthList[index] < pdf.GetStringWidth(value.Content)+2 { + columnWidthList[index] = pdf.GetStringWidth(value.Content) + 2 + } + // 至少需要留给后面几列的空间 + if columnWidthList[index] > (maxWidth - float64(len(row)-index-1)*30) { + columnWidthList[index] = maxWidth - float64(len(row)-index-1)*30 + } + maxWidth = maxWidth - columnWidthList[index] + } + maxWidth = lineWidth + } + + // 平均分配剩余的width + var columnWidthSum float64 + for _, columnWidth := range columnWidthList { + columnWidthSum = columnWidthSum + columnWidth + } + + if lineWidth > columnWidthSum { + for index, columnWidth := range columnWidthList { + columnWidthList[index] = columnWidth + (lineWidth-columnWidthSum)/float64(len(columnWidthList)) + } + } + + var columnWidthSum1 float64 + for _, columnWidth := range columnWidthList { + columnWidthSum1 = columnWidthSum1 + columnWidth + } + + return columnWidthList +} diff --git a/bcs-services/bcs-cluster-reporter/internal/util/routinepool.go b/bcs-services/bcs-cluster-reporter/internal/util/routinepool.go index 9b9f04b447..d1276f9398 100644 --- a/bcs-services/bcs-cluster-reporter/internal/util/routinepool.go +++ b/bcs-services/bcs-cluster-reporter/internal/util/routinepool.go @@ -10,6 +10,7 @@ * limitations under the License. 
*/ +// Package util xxx package util import "sync" diff --git a/bcs-services/bcs-cluster-reporter/internal/util/util.go b/bcs-services/bcs-cluster-reporter/internal/util/util.go new file mode 100644 index 0000000000..8cd621c136 --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/util/util.go @@ -0,0 +1,184 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Package util xxx +package util + +import ( + "context" + "crypto/tls" + "fmt" + "gopkg.in/yaml.v2" + "k8s.io/apimachinery/pkg/util/json" + "k8s.io/klog" + "net" + "net/http" + "os" + "time" +) + +// GetCtx xxx +func GetCtx(duration time.Duration) context.Context { + ctx, cancel := context.WithTimeout(context.Background(), duration) + go func() { + time.Sleep(duration) + cancel() + }() + return ctx +} + +// GetServerCert xxx +func GetServerCert(domain, ip, port string) (time.Time, error) { + // 创建自定义的Transport,用于指定IP地址 + tr := &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, // 跳过证书验证 + ServerName: domain, + }, + DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + dialer := &net.Dialer{ + Timeout: 2 * time.Second, + KeepAlive: 2 * time.Second, + } + conn, err := dialer.DialContext(ctx, "tcp", ip+":"+port) + if err != nil { + return nil, err + } + tlsConn := tls.Client(conn, &tls.Config{ + InsecureSkipVerify: true, + ServerName: domain, + }) + if err := tlsConn.Handshake(); err != nil { + return nil, err + } + return tlsConn, nil + }, + } + + // 创建自定义的Client,使用自定义的Transport + client := &http.Client{ + Transport: tr, + Timeout: 10 * time.Second, + } + + // 发起HTTPS请求 + resp, err := client.Get("https://" + domain) + if err != nil { + return time.Now(), err + } + defer resp.Body.Close() + + // 获取远程证书 + cert := resp.TLS.PeerCertificates[0] + + // 获取证书的过期时间 + expiration := cert.NotAfter + + return expiration, nil +} + +// WriteConfigIfNotExist xxx +func WriteConfigIfNotExist(filePath, content string) error { + _, err := os.Stat(filePath) + if err != nil { + if os.IsNotExist(err) { + file, openErr := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if openErr != nil { + return err + } + + defer file.Close() + + // 写入文本信息 + _, err = file.WriteString(content) + if err != nil { + return err + } + + return nil + } else { + return err + } + + } else { + return nil + } +} + +// 
ReadorInitConf xxx +func ReadorInitConf(configFilePath string, obj interface{}, initContent string) error { + _, err := os.Stat(configFilePath) + if err != nil { + if os.IsNotExist(err) { + err = WriteConfigIfNotExist(configFilePath, initContent) + if err != nil { + return err + } else { + return ReadConf(configFilePath, obj) + } + } + return err + } else { + return ReadConf(configFilePath, obj) + } +} + +// ReadFromStr xxx +func ReadFromStr(obj interface{}, initContent string) error { + if err := json.Unmarshal([]byte(initContent), obj); err != nil { + if err = yaml.Unmarshal([]byte(initContent), obj); err != nil { + return fmt.Errorf("decode config %s failed, err %s", initContent, err.Error()) + } + } + + return nil +} + +// ReadConf xxx +func ReadConf(configFilePath string, obj interface{}) error { + configFileBytes, err := os.ReadFile(configFilePath) + if err != nil { + return err + } + + if err = json.Unmarshal(configFileBytes, obj); err != nil { + if err = yaml.Unmarshal(configFileBytes, obj); err != nil { + return fmt.Errorf("decode clustercheck config file %s failed, err %s", configFilePath, err.Error()) + } + } + + return nil +} + +// GetHostPath xxx +func GetHostPath() string { + hostPath := os.Getenv("HOST_PATH") + if hostPath == "" { + hostPath = "/" + } + + return hostPath +} + +// GetNodeName xxx +func GetNodeName() string { + name := os.Getenv("NODE_NAME") + var err error + if name == "" { + name, err = os.Hostname() + if err != nil { + klog.Fatal(err.Error()) + } + } + + return name +} diff --git a/bcs-services/bcs-cluster-reporter/internal/util/util_test.go b/bcs-services/bcs-cluster-reporter/internal/util/util_test.go new file mode 100644 index 0000000000..037c28183d --- /dev/null +++ b/bcs-services/bcs-cluster-reporter/internal/util/util_test.go @@ -0,0 +1,27 @@ +/* + * Tencent is pleased to support the open source community by making Blueking Container Service available. + * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Licensed under the MIT License (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * http://opensource.org/licenses/MIT + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package util xxx +package util + +import ( + "fmt" + "testing" +) + +func TestGetServerCert(t *testing.T) { + expiration, err := GetServerCert("apiserver-loopback-client", "apiserverIP", "6443") + if err != nil { + fmt.Println(err.Error()) + } + fmt.Println("证书过期时间:", expiration) +} diff --git a/bcs-services/bcs-cluster-reporter/main.go b/bcs-services/bcs-cluster-reporter/main.go deleted file mode 100644 index 068e398886..0000000000 --- a/bcs-services/bcs-cluster-reporter/main.go +++ /dev/null @@ -1,23 +0,0 @@ -package main - -import ( - "runtime/debug" - - "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/cmd" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/clustercheck" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/dnscheck" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/eventrecorder" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/logrecorder" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/masterpodcheck" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/netcheck" - _ "github.com/Tencent/bk-bcs/bcs-services/bcs-cluster-reporter/internal/plugin/systemappcheck" - - "k8s.io/klog" -) - -func main() { - debug.SetGCPercent(100) - - cmd.Execute() - defer klog.Flush() -} diff --git 
a/install/conf/bcs-services/bcs-cluster-reporter/Dockerfile b/install/conf/bcs-services/bcs-cluster-reporter/Dockerfile index 61a321f8b2..a72462577b 100644 --- a/install/conf/bcs-services/bcs-cluster-reporter/Dockerfile +++ b/install/conf/bcs-services/bcs-cluster-reporter/Dockerfile @@ -5,5 +5,7 @@ RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime RUN echo "Asia/Shanghai" > /etc/timezone WORKDIR / COPY ./bcs-cluster-reporter . +COPY ./TencentSans-W7.ttf . +COPY ./TencentSans-W3.ttf . #COPY ./conf ./conf ENTRYPOINT ["/bcs-cluster-reporter"] diff --git a/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf b/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf new file mode 100644 index 0000000000..1e12465997 Binary files /dev/null and b/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W3.ttf differ diff --git a/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf b/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf new file mode 100644 index 0000000000..386dd20f5c Binary files /dev/null and b/install/conf/bcs-services/bcs-cluster-reporter/TencentSans-W7.ttf differ diff --git a/install/conf/bcs-services/bcs-nodeagent/Dockerfile b/install/conf/bcs-services/bcs-nodeagent/Dockerfile new file mode 100644 index 0000000000..8f4f0d5ee7 --- /dev/null +++ b/install/conf/bcs-services/bcs-nodeagent/Dockerfile @@ -0,0 +1,9 @@ +FROM centos:7 +RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo && sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo && sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo +MAINTAINER xxx xxx +RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime +RUN echo "Asia/Shanghai" > /etc/timezone +WORKDIR / +COPY ./bcs-nodeagent . +#COPY ./conf ./conf +ENTRYPOINT ["/bcs-nodeagent"]