diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 8a96cbf3eb85..2236c4f5e952 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -331,6 +331,7 @@ go_library( "@com_github_cockroachdb_redact//:redact", "@com_github_getsentry_sentry_go//:sentry-go", "@com_github_gogo_protobuf//proto", + "@com_github_google_pprof//profile", "@com_github_gorilla_mux//:mux", "@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library", "@com_github_grpc_ecosystem_grpc_gateway//utilities:go_default_library", diff --git a/pkg/server/status.go b/pkg/server/status.go index 611421cc078a..8e563f3ac28d 100644 --- a/pkg/server/status.go +++ b/pkg/server/status.go @@ -83,6 +83,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uint128" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" + "github.com/google/pprof/profile" gwruntime "github.com/grpc-ecosystem/grpc-gateway/runtime" raft "go.etcd.io/raft/v3" "google.golang.org/grpc" @@ -1464,6 +1465,102 @@ func (s *statusServer) Stacks( return stacksLocal(req) } +// fetchProfileFromAllNodes fetches the CPU profiles from all live nodes in the +// cluster and merges the samples across all profiles. +func (s *statusServer) fetchProfileFromAllNodes( + ctx context.Context, req *serverpb.ProfileRequest, +) (*serverpb.JSONResponse, error) { + type profData struct { + data []byte + err error + } + type profDataResponse struct { + profDataByNodeID map[roachpb.NodeID]*profData + } + response := profDataResponse{profDataByNodeID: make(map[roachpb.NodeID]*profData)} + + resp, err := s.Node(ctx, &serverpb.NodeRequest{NodeId: "local"}) + if err != nil { + return nil, err + } + senderServerVersion := resp.Desc.ServerVersion + dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) { + client, err := s.dialNode(ctx, nodeID) + return client, err + } + nodeFn := func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error) { + // pprof.Merge can only merge samples collected on the same binary version. + // So we must first check that the node is running the same binary version + // as the node that is going to merge the profiles. + statusClient := client.(serverpb.StatusClient) + resp, err := statusClient.Node(ctx, &serverpb.NodeRequest{NodeId: "local"}) + if err != nil { + return nodeID, err + } + nodeServerVersion := resp.Desc.ServerVersion + if !senderServerVersion.Equal(nodeServerVersion) { + return nil, errors.Newf("node %d has a different server version %s to the node merging CPU profiles %s", + nodeID, nodeServerVersion.String(), senderServerVersion.String()) + } + var pd *profData + err = contextutil.RunWithTimeout(ctx, "fetch cpu profile", 1*time.Minute, func(ctx context.Context) error { + log.Infof(ctx, "fetching a CPU profile for %d", resp.Desc.NodeID) + resp, err := statusClient.Profile(ctx, &serverpb.ProfileRequest{ + NodeId: fmt.Sprintf("%d", nodeID), + Type: serverpb.ProfileRequest_CPU, + Seconds: req.Seconds, + Labels: req.Labels, + }) + if err != nil { + return err + } + pd = &profData{data: resp.Data} + return nil + }) + return pd, err + } + responseFn := func(nodeID roachpb.NodeID, resp interface{}) { + profResp := resp.(*profData) + response.profDataByNodeID[nodeID] = profResp + } + errorFn := func(nodeID roachpb.NodeID, err error) { + response.profDataByNodeID[nodeID] = &profData{err: err} + } + if err := s.iterateNodes(ctx, "cluster-wide CPU profile", dialFn, nodeFn, responseFn, errorFn); err != nil { + return nil, serverError(ctx, err) + } + + profs := make([]*profile.Profile, 0, len(response.profDataByNodeID)) + for nodeID, pd := range response.profDataByNodeID { + if len(pd.data) == 0 && pd.err == nil { + log.Warningf(ctx, "no profile collected for node %d", nodeID) + continue // skipped node + } + + if pd.err != nil { + log.Warningf(ctx, "failed to collect profile for node %d: %v", nodeID, pd.err) + continue + } + + p, err := profile.ParseData(pd.data) + if err != nil { + return nil, err + } + p.Comments = append(p.Comments, fmt.Sprintf("n%d", nodeID)) + profs = append(profs, p) + } + mergedProfiles, err := profile.Merge(profs) + if err != nil { + return nil, err + } + + var buf bytes.Buffer + if err := mergedProfiles.Write(&buf); err != nil { + return nil, status.Errorf(codes.Internal, err.Error()) + } + return &serverpb.JSONResponse{Data: buf.Bytes()}, nil +} + // TODO(tschottdorf): significant overlap with /debug/pprof/heap, except that // this one allows querying by NodeID. // @@ -1481,6 +1578,12 @@ func (s *statusServer) Profile( return nil, err } + // If the request is for "all" nodes then we collect profiles from all nodes + // in the cluster and merge them before returning to the user. + if req.NodeId == "all" { + return s.fetchProfileFromAllNodes(ctx, req) + } + nodeID, local, err := s.parseNodeID(req.NodeId) if err != nil { return nil, status.Errorf(codes.InvalidArgument, err.Error()) diff --git a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx index 1c19eb6e869b..efd3a0c7497b 100644 --- a/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx +++ b/pkg/ui/workspaces/db-console/src/views/reports/containers/debug/index.tsx @@ -445,6 +445,11 @@ export default function Debug() { url="debug/pprof/ui/cpu/" params={{ node: nodeID, seconds: "5", labels: "true" }} /> +