Skip to content

Commit

Permalink
Merge #68595 #69103
Browse files Browse the repository at this point in the history
68595: ui,admission: observability improvements for admission control r=sumeerbhola a=sumeerbhola

- Trace statements for latency incurred in admission queues.
- Certain admission control metrics are now included in the
  overload dashboard. Specifically,
  - Resource bottlenecks can be identified using the
    "KV Admission Slots" and "KV Admission IO Tokens Exhausted
    Duration Per Second" graphs.
  - The rate at which admission control is admitting requests
    is in the "Admission Work Rate" graph and the corresponding
    delay rate (for all requests) is in the
    "Admission Delay Rate" graph. Dividing the latter by the former
    gives the mean admission latency.
  - The 75th percentile latency for those requests that actually
    waited for admission is in the
    "Admission Delay: 75th percentile" graph.
  When admission control is off, most of these graphs will be
  empty or zero, and the total KV admission slots will be 1.

Informs #65955

Release note (ui change): admission control metrics are added to
the Overload dashboard.

69103: sql: make TestFailureToMarkCanceledReversalLeadsToCanceledStatus faster r=sajjadrizvi a=ajwerner

This test was made slow by #66889.

Release note: None

Co-authored-by: sumeerbhola <[email protected]>
Co-authored-by: Andrew Werner <[email protected]>
  • Loading branch information
3 people committed Aug 19, 2021
3 parents a305273 + 5e44a7f + 8d35a4c commit 8d0f75e
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 17 deletions.
28 changes: 11 additions & 17 deletions pkg/sql/schema_changer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6612,30 +6612,24 @@ func TestFailureToMarkCanceledReversalLeadsToCanceledStatus(t *testing.T) {
defer jobCancellationsToFail.Unlock()
f(jobCancellationsToFail.jobs)
}
jobInterval := 100 * time.Millisecond
jobKnobs := jobs.NewTestingKnobsWithShortIntervals()
jobKnobs.BeforeUpdate = func(orig, updated jobs.JobMetadata) (err error) {
withJobsToFail(func(m map[jobspb.JobID]struct{}) {
if _, ok := m[orig.ID]; ok && updated.Status == jobs.StatusCanceled {
delete(m, orig.ID)
err = errors.Errorf("boom")
}
})
return err
}
params.Knobs = base.TestingKnobs{
SQLSchemaChanger: &sql.SchemaChangerTestingKnobs{
RunBeforeBackfill: func() error {
<-canProceed
return nil
},
},
JobsTestingKnobs: &jobs.TestingKnobs{
BeforeUpdate: func(orig, updated jobs.JobMetadata) (err error) {
withJobsToFail(func(m map[jobspb.JobID]struct{}) {
if _, ok := m[orig.ID]; ok && updated.Status == jobs.StatusCanceled {
delete(m, orig.ID)
err = errors.Errorf("boom")
}
})
return err
},
// Decrease the adopt loop interval so that retries happen quickly.
IntervalOverrides: jobs.TestingIntervalOverrides{
Adopt: &jobInterval,
Cancel: &jobInterval,
},
},
JobsTestingKnobs: jobKnobs,
}

s, sqlDB, _ := serverutils.StartServer(t, params)
Expand Down
12 changes: 12 additions & 0 deletions pkg/ui/cluster-ui/src/store/nodes/nodes.fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ export const getNodeStatus = () => {
started_at: Long.fromString("1611220033263119000"),
updated_at: Long.fromString("1611221833393566000"),
metrics: {
"admission.admitted.kv": 0,
"admission.admitted.sql-kv-response": 0,
"admission.admitted.sql-sql-response": 0,
"admission.granter.io_tokens_exhausted_duration.kv": 0,
"admission.granter.total_slots.kv": 0,
"admission.granter.used_slots.kv": 0,
"admission.wait_durations.kv-p75": 0,
"admission.wait_durations.sql-kv-response-p75": 0,
"admission.wait_durations.sql-sql-response-p75": 0,
"admission.wait_sum.kv": 0,
"admission.wait_sum.sql-kv-response": 0,
"admission.wait_sum.sql-sql-response": 0,
"build.timestamp": 1610970297,
"changefeed.buffer_entries.in": 0,
"changefeed.buffer_entries.out": 0,
Expand Down
127 changes: 127 additions & 0 deletions pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/overload.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,132 @@ export default function (props: GraphDashboardProps) {
))}
</Axis>
</LineGraph>,

<LineGraph title="KV Admission Slots" sources={nodeSources}>
<Axis label="slots">
{nodeIDs.map((nid) => (
<>
<Metric
key={nid}
name="cr.node.admission.granter.total_slots.kv"
title="Total Slots"
sources={[nid]}
/>
<Metric
key={nid}
name="cr.node.admission.granter.used_slots.kv"
title="Used Slots"
sources={[nid]}
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="KV Admission IO Tokens Exhausted Duration Per Second"
sources={nodeSources}
>
<Axis label="duration (micros/sec)">
{nodeIDs.map((nid) => (
<Metric
key={nid}
name="cr.node.admission.granter.io_tokens_exhausted_duration.kv"
title="IO Exhausted"
sources={[nid]}
/>
))}
</Axis>
</LineGraph>,

<LineGraph title="Admission Work Rate" sources={nodeSources}>
<Axis label="work rate">
{nodeIDs.map((nid) => (
<>
<Metric
key={nid}
name="cr.node.admission.admitted.kv"
title="KV request rate"
sources={[nid]}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.node.admission.admitted.sql-kv-response"
title="SQL-KV response rate"
sources={[nid]}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.node.admission.admitted.sql-sql-response"
title="SQL-SQL response rate"
sources={[nid]}
nonNegativeRate
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph title="Admission Delay Rate" sources={nodeSources}>
<Axis label="delay rate (micros/sec)">
{nodeIDs.map((nid) => (
<>
<Metric
key={nid}
name="cr.node.admission.wait_sum.kv"
title="KV"
sources={[nid]}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.node.admission.wait_sum.sql-kv-response"
title="SQL-KV response"
sources={[nid]}
nonNegativeRate
/>
<Metric
key={nid}
name="cr.node.admission.wait_sum.sql-sql-response"
title="SQL-SQL response"
sources={[nid]}
nonNegativeRate
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph title="Admission Delay: 75th percentile" sources={nodeSources}>
<Axis label="delay for requests that waited (nanos)">
{nodeIDs.map((nid) => (
<>
<Metric
key={nid}
name="cr.node.admission.wait_durations.kv-p75"
title="KV"
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.sql-kv-response-p75"
title="SQL-KV response"
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.sql-sql-response-p75"
title="SQL-SQL response"
sources={[nid]}
downsampleMax
/>
</>
))}
</Axis>
</LineGraph>,
];
}
4 changes: 4 additions & 0 deletions pkg/util/admission/work_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/metric"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
Expand Down Expand Up @@ -353,6 +354,8 @@ func (q *WorkQueue) Admit(ctx context.Context, info WorkInfo) (enabled bool, err
q.metrics.WaitDurations.RecordValue(waitDur.Nanoseconds())
q.metrics.WaitQueueLength.Dec(1)
deadline, _ := ctx.Deadline()
log.Eventf(ctx, "deadline expired, waited in %s queue for %v",
workKindString(q.workKind), waitDur)
return true,
errors.Newf("work %s deadline expired while waiting: deadline: %v, start: %v, dur: %v",
workKindString(q.workKind), deadline, startTime, waitDur)
Expand All @@ -369,6 +372,7 @@ func (q *WorkQueue) Admit(ctx context.Context, info WorkInfo) (enabled bool, err
if work.heapIndex != -1 {
panic(errors.AssertionFailedf("grantee should be removed from heap"))
}
log.Eventf(ctx, "admitted, waited in %s queue for %v", workKindString(q.workKind), waitDur)
q.granter.continueGrantChain(chainID)
return true, nil
}
Expand Down

0 comments on commit 8d0f75e

Please sign in to comment.