Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: add debug/pprof/ui/cpu/?node=all endpoint for cluster-wide CPU profile #102734

Merged
merged 1 commit into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -3045,6 +3045,7 @@ Support status: [reserved](#support-status)
| type | [ProfileRequest.Type](#cockroach.server.serverpb.ProfileRequest-cockroach.server.serverpb.ProfileRequest.Type) | | The type of profile to retrieve. | [reserved](#support-status) |
| seconds | [int32](#cockroach.server.serverpb.ProfileRequest-int32) | | applies only to Type=CPU, defaults to 30 | [reserved](#support-status) |
| labels | [bool](#cockroach.server.serverpb.ProfileRequest-bool) | | applies only to Type=CPU, defaults to false | [reserved](#support-status) |
| sender_server_version | [cockroach.roachpb.Version](#cockroach.server.serverpb.ProfileRequest-cockroach.roachpb.Version) | | SenderServerVersion is the server version of the node sending the Profile request. If this field is set then the node processing the request will only collect the profile if its server version is equal to the sender's server version.<br><br>Currently, this is only used when collecting profiles that will be merged using pprof.Merge as all the samples must be from the same binary version. | [reserved](#support-status) |



Expand Down
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ go_library(
"@com_github_cockroachdb_redact//:redact",
"@com_github_getsentry_sentry_go//:sentry-go",
"@com_github_gogo_protobuf//proto",
"@com_github_google_pprof//profile",
"@com_github_gorilla_mux//:mux",
"@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library",
"@com_github_grpc_ecosystem_grpc_gateway//utilities:go_default_library",
Expand Down
9 changes: 9 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,15 @@ message ProfileRequest {

int32 seconds = 6; // applies only to Type=CPU, defaults to 30
bool labels = 7; // applies only to Type=CPU, defaults to false

// SenderServerVersion is the server version of the node sending the Profile
// request. If this field is set then the node processing the request will
// only collect the profile if its server version is equal to the sender's
// server version.
//
// Currently, this is only used when collecting profiles that will be merged
// using pprof.Merge as all the samples must be from the same binary version.
cockroach.roachpb.Version sender_server_version = 8;
}

message MetricsRequest {
Expand Down
105 changes: 105 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uint128"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/google/pprof/profile"
gwruntime "github.com/grpc-ecosystem/grpc-gateway/runtime"
raft "go.etcd.io/raft/v3"
"google.golang.org/grpc"
Expand Down Expand Up @@ -1465,6 +1466,95 @@ func (s *statusServer) Stacks(
return stacksLocal(req)
}

// fetchProfileFromAllNodes fetches the CPU profiles from all live nodes in the
// cluster and merges the samples across all profiles.
func (s *statusServer) fetchProfileFromAllNodes(
ctx context.Context, req *serverpb.ProfileRequest,
) (*serverpb.JSONResponse, error) {
type profData struct {
data []byte
err error
}
type profDataResponse struct {
profDataByNodeID map[roachpb.NodeID]*profData
}
response := profDataResponse{profDataByNodeID: make(map[roachpb.NodeID]*profData)}

resp, err := s.Node(ctx, &serverpb.NodeRequest{NodeId: "local"})
if err != nil {
return nil, err
}
senderServerVersion := resp.Desc.ServerVersion
dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) {
client, err := s.dialNode(ctx, nodeID)
return client, err
}
nodeFn := func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error) {
statusClient := client.(serverpb.StatusClient)
resp, err := statusClient.Node(ctx, &serverpb.NodeRequest{NodeId: "local"})
if err != nil {
return nodeID, err
}
var pd *profData
err = contextutil.RunWithTimeout(ctx, "fetch cpu profile", 1*time.Minute, func(ctx context.Context) error {
log.Infof(ctx, "fetching a CPU profile for %d", resp.Desc.NodeID)
resp, err := statusClient.Profile(ctx, &serverpb.ProfileRequest{
NodeId: fmt.Sprintf("%d", nodeID),
Type: serverpb.ProfileRequest_CPU,
Seconds: req.Seconds,
Labels: req.Labels,
SenderServerVersion: &senderServerVersion,
})
if err != nil {
return err
}
pd = &profData{data: resp.Data}
return nil
})
return pd, err
}
responseFn := func(nodeID roachpb.NodeID, resp interface{}) {
profResp := resp.(*profData)
response.profDataByNodeID[nodeID] = profResp
}
errorFn := func(nodeID roachpb.NodeID, err error) {
response.profDataByNodeID[nodeID] = &profData{err: err}
}
if err := s.iterateNodes(ctx, "cluster-wide CPU profile", dialFn, nodeFn, responseFn, errorFn); err != nil {
return nil, serverError(ctx, err)
}

profs := make([]*profile.Profile, 0, len(response.profDataByNodeID))
for nodeID, pd := range response.profDataByNodeID {
if len(pd.data) == 0 && pd.err == nil {
log.Warningf(ctx, "no profile collected for node %d", nodeID)
continue // skipped node
}

if pd.err != nil {
log.Warningf(ctx, "failed to collect profile for node %d: %v", nodeID, pd.err)
continue
}

p, err := profile.ParseData(pd.data)
if err != nil {
return nil, err
}
p.Comments = append(p.Comments, fmt.Sprintf("n%d", nodeID))
profs = append(profs, p)
}
mergedProfiles, err := profile.Merge(profs)
if err != nil {
return nil, err
}

var buf bytes.Buffer
if err := mergedProfiles.Write(&buf); err != nil {
return nil, status.Errorf(codes.Internal, err.Error())
}
return &serverpb.JSONResponse{Data: buf.Bytes()}, nil
}

// TODO(tschottdorf): significant overlap with /debug/pprof/heap, except that
// this one allows querying by NodeID.
//
Expand All @@ -1482,6 +1572,12 @@ func (s *statusServer) Profile(
return nil, err
}

// If the request is for "all" nodes then we collect profiles from all nodes
// in the cluster and merge them before returning to the user.
if req.NodeId == "all" {
return s.fetchProfileFromAllNodes(ctx, req)
}

nodeID, local, err := s.parseNodeID(req.NodeId)
if err != nil {
return nil, status.Errorf(codes.InvalidArgument, err.Error())
Expand All @@ -1495,6 +1591,15 @@ func (s *statusServer) Profile(
return status.Profile(ctx, req)
}

// If the request has a SenderVersion, then ensure the current node has the
// same server version before collecting a profile.
if req.SenderServerVersion != nil {
serverVersion := s.st.Version.BinaryVersion()
if !serverVersion.Equal(*req.SenderServerVersion) {
return nil, errors.Newf("server version of the node being profiled %s != sender version %s",
serverVersion.String(), req.SenderServerVersion.String())
}
}
return profileLocal(ctx, req, s.st)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,11 @@ export default function Debug() {
url="debug/pprof/ui/cpu/"
params={{ node: nodeID, seconds: "5", labels: "true" }}
/>
<DebugTableLink
name="Cluster-wide CPU Profile (profiles all nodes; MEMORY OVERHEAD)"
url="debug/pprof/ui/cpu/"
params={{ node: "all", seconds: "5", labels: "true" }}
/>
<DebugTableLink
name="Block"
url="debug/pprof/ui/block/"
Expand Down