From c471994c37c5b281642307cb324de8619b3f00f5 Mon Sep 17 00:00:00 2001 From: adityamaru Date: Tue, 2 May 2023 14:17:41 -0400 Subject: [PATCH] server: add `debug/pprof/ui/cpu/?node=all` endpoint for cluster-wide CPU profile This change adds a debug endpoint to the status server that collects a CPU profiles from all nodes in the cluster and merges them before rendering the profile. The merge is performed using `pprof.Merge` https://pkg.go.dev/github.com/google/pprof/profile#Merge. A future patch will teach the jobs page to filter out samples from this cluster-wide profile that are only labelled with the job's job ID. In this way a user can get the cluster-wide CPU profile for a particular job making it easier to identify the cause for a hotspot or a performance regression. Release note: None --- docs/generated/http/full.md | 1 + pkg/server/BUILD.bazel | 1 + pkg/server/serverpb/status.proto | 9 ++ pkg/server/status.go | 105 ++++++++++++++++++ .../views/reports/containers/debug/index.tsx | 5 + 5 files changed, 121 insertions(+) diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md index a556e20fc736..02e6b8288a54 100644 --- a/docs/generated/http/full.md +++ b/docs/generated/http/full.md @@ -3045,6 +3045,7 @@ Support status: [reserved](#support-status) | type | [ProfileRequest.Type](#cockroach.server.serverpb.ProfileRequest-cockroach.server.serverpb.ProfileRequest.Type) | | The type of profile to retrieve. | [reserved](#support-status) | | seconds | [int32](#cockroach.server.serverpb.ProfileRequest-int32) | | applies only to Type=CPU, defaults to 30 | [reserved](#support-status) | | labels | [bool](#cockroach.server.serverpb.ProfileRequest-bool) | | applies only to Type=CPU, defaults to false | [reserved](#support-status) | +| sender_server_version | [cockroach.roachpb.Version](#cockroach.server.serverpb.ProfileRequest-cockroach.roachpb.Version) | | SenderServerVersion is the server version of the node sending the Profile request. If this field is set then the node processing the request will only collect the profile if its server version is equal to the sender's server version.

Currently, this is only used when collecting profiles that will be merged using pprof.Merge as all the samples must be from the same binary version. | [reserved](#support-status) | diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 7245412f00cc..6bde905e3668 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -332,6 +332,7 @@ go_library( "@com_github_cockroachdb_redact//:redact", "@com_github_getsentry_sentry_go//:sentry-go", "@com_github_gogo_protobuf//proto", + "@com_github_google_pprof//profile", "@com_github_gorilla_mux//:mux", "@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library", "@com_github_grpc_ecosystem_grpc_gateway//utilities:go_default_library", diff --git a/pkg/server/serverpb/status.proto b/pkg/server/serverpb/status.proto index 9ce095e714db..f6eda58d2d1b 100644 --- a/pkg/server/serverpb/status.proto +++ b/pkg/server/serverpb/status.proto @@ -772,6 +772,15 @@ message ProfileRequest { int32 seconds = 6; // applies only to Type=CPU, defaults to 30 bool labels = 7; // applies only to Type=CPU, defaults to false + + // SenderServerVersion is the server version of the node sending the Profile + // request. If this field is set then the node processing the request will + // only collect the profile if its server version is equal to the sender's + // server version. + // + // Currently, this is only used when collecting profiles that will be merged + // using pprof.Merge as all the samples must be from the same binary version. + cockroach.roachpb.Version sender_server_version = 8; } message MetricsRequest { diff --git a/pkg/server/status.go b/pkg/server/status.go index 5aee4c3f7515..5715cbbf4be5 100644 --- a/pkg/server/status.go +++ b/pkg/server/status.go @@ -84,6 +84,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uint128" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" + "github.com/google/pprof/profile" gwruntime "github.com/grpc-ecosystem/grpc-gateway/runtime" raft "go.etcd.io/raft/v3" "google.golang.org/grpc" @@ -1465,6 +1466,95 @@ func (s *statusServer) Stacks( return stacksLocal(req) } +// fetchProfileFromAllNodes fetches the CPU profiles from all live nodes in the +// cluster and merges the samples across all profiles. +func (s *statusServer) fetchProfileFromAllNodes( + ctx context.Context, req *serverpb.ProfileRequest, +) (*serverpb.JSONResponse, error) { + type profData struct { + data []byte + err error + } + type profDataResponse struct { + profDataByNodeID map[roachpb.NodeID]*profData + } + response := profDataResponse{profDataByNodeID: make(map[roachpb.NodeID]*profData)} + + resp, err := s.Node(ctx, &serverpb.NodeRequest{NodeId: "local"}) + if err != nil { + return nil, err + } + senderServerVersion := resp.Desc.ServerVersion + dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) { + client, err := s.dialNode(ctx, nodeID) + return client, err + } + nodeFn := func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error) { + statusClient := client.(serverpb.StatusClient) + resp, err := statusClient.Node(ctx, &serverpb.NodeRequest{NodeId: "local"}) + if err != nil { + return nodeID, err + } + var pd *profData + err = contextutil.RunWithTimeout(ctx, "fetch cpu profile", 1*time.Minute, func(ctx context.Context) error { + log.Infof(ctx, "fetching a CPU profile for %d", resp.Desc.NodeID) + resp, err := statusClient.Profile(ctx, &serverpb.ProfileRequest{ + NodeId: fmt.Sprintf("%d", nodeID), + Type: serverpb.ProfileRequest_CPU, + Seconds: req.Seconds, + Labels: req.Labels, + SenderServerVersion: &senderServerVersion, + }) + if err != nil { + return err + } + pd = &profData{data: resp.Data} + return nil + }) + return pd, err + } + responseFn := func(nodeID roachpb.NodeID, resp interface{}) { + profResp := resp.(*profData) + response.profDataByNodeID[nodeID] = profResp + } + errorFn := func(nodeID roachpb.NodeID, err error) { + response.profDataByNodeID[nodeID] = &profData{err: err} + } + if err := s.iterateNodes(ctx, "cluster-wide CPU profile", dialFn, nodeFn, responseFn, errorFn); err != nil { + return nil, serverError(ctx, err) + } + + profs := make([]*profile.Profile, 0, len(response.profDataByNodeID)) + for nodeID, pd := range response.profDataByNodeID { + if len(pd.data) == 0 && pd.err == nil { + log.Warningf(ctx, "no profile collected for node %d", nodeID) + continue // skipped node + } + + if pd.err != nil { + log.Warningf(ctx, "failed to collect profile for node %d: %v", nodeID, pd.err) + continue + } + + p, err := profile.ParseData(pd.data) + if err != nil { + return nil, err + } + p.Comments = append(p.Comments, fmt.Sprintf("n%d", nodeID)) + profs = append(profs, p) + } + mergedProfiles, err := profile.Merge(profs) + if err != nil { + return nil, err + } + + var buf bytes.Buffer + if err := mergedProfiles.Write(&buf); err != nil { + return nil, status.Errorf(codes.Internal, err.Error()) + } + return &serverpb.JSONResponse{Data: buf.Bytes()}, nil +} + // TODO(tschottdorf): significant overlap with /debug/pprof/heap, except that // this one allows querying by NodeID. // @@ -1482,6 +1572,12 @@ func (s *statusServer) Profile( return nil, err } + // If the request is for "all" nodes then we collect profiles from all nodes + // in the cluster and merge them before returning to the user. + if req.NodeId == "all" { + return s.fetchProfileFromAllNodes(ctx, req) + } + nodeID, local, err := s.parseNodeID(req.NodeId) if err != nil { return nil, status.Errorf(codes.InvalidArgument, err.Error()) @@ -1495,6 +1591,15 @@ func (s *statusServer) Profile( return status.Profile(ctx, req) } + // If the request has a SenderVersion, then ensure the current node has the + // same server version before collecting a profile. + if req.SenderServerVersion != nil { + serverVersion := s.st.Version.BinaryVersion() + if !serverVersion.Equal(*req.SenderServerVersion) { + return nil, errors.Newf("server version of the node being profiled %s != sender version %s", + serverVersion.String(), req.SenderServerVersion.String()) + } + } return profileLocal(ctx, req, s.st) } diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx index 1c19eb6e869b..efd3a0c7497b 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx @@ -445,6 +445,11 @@ export default function Debug() { url="debug/pprof/ui/cpu/" params={{ node: nodeID, seconds: "5", labels: "true" }} /> +