Skip to content

Commit

Permalink
sched/fair: Bring back select_idle_smt(), but differently
Browse files Browse the repository at this point in the history
Mel Gorman did some nice work in 9fe1f12 ("sched/fair: Merge
select_idle_core/cpu()"), resulting in the kernel being more efficient
at finding an idle CPU, and in tasks spending less time waiting to be
run, both according to the schedstats run_delay numbers, and according
to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about 30%
more), higher cache misses, higher memory bandwidth utilization, and
higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw a
consistent 1-3% increase in total CPU use on the system, due to those
increased task migrations leading to higher L2 cache miss numbers, and
higher memory utilization. The exclusive L3 cache on Skylake does us
no favors there.

On our web serving workload, that effect is usually negligible.

It appears that the increased number of CPU migrations is generally a
good thing, since it leads to lower cpu_delay numbers, reflecting the
fact that tasks get to run faster. However, the reduced locality and
the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing
select_idle_smt with a twist: when a socket has no idle cores, check
to see if the sibling of "prev" is idle, before searching all the
other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, task migrations are still
high, but L2 cache misses, memory bandwidth, and CPU time used are
back down to what they were before. The p95 and p99 response times for
the memcache type application improve by about 10% over what they were
before Mel's patches got merged.

Signed-off-by: Rik van Riel <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Mel Gorman <[email protected]>
Acked-by: Vincent Guittot <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
  • Loading branch information
rikvanriel authored and Peter Zijlstra committed Apr 9, 2021
1 parent 6db12ee commit c722f35
Showing 1 changed file with 43 additions and 12 deletions.
55 changes: 43 additions & 12 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;

if (static_branch_likely(&sched_smt_present)) {
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
}
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);

return def;
}
Expand Down Expand Up @@ -6112,6 +6110,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
return -1;
}

/*
 * Walk the SMT siblings of @target (cpu_smt_mask(target)) and return the
 * first one that @p may run on, that lies inside @sd's span, and that is
 * either idle or only running SCHED_IDLE work.
 *
 * Returns the chosen CPU number, or -1 when no sibling qualifies.
 */
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(target)) {
		/* Affinity is checked first; the domain span only if that passes. */
		bool eligible = cpumask_test_cpu(sibling, p->cpus_ptr) &&
				cpumask_test_cpu(sibling, sched_domain_span(sd));

		if (!eligible)
			continue;

		if (available_idle_cpu(sibling) || sched_idle_cpu(sibling))
			return sibling;
	}

	return -1;
}

#else /* CONFIG_SCHED_SMT */

static inline void set_idle_cores(int cpu, int val)
Expand All @@ -6128,18 +6144,22 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core);
}

/*
 * Variant built on the #else side of CONFIG_SCHED_SMT: performs no scan
 * and unconditionally returns -1, the same value the scanning variant
 * returns when it finds no suitable CPU. All three parameters are unused.
 */
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}

#endif /* CONFIG_SCHED_SMT */

/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
bool smt = test_idle_cores(target, false);
int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time;
Expand All @@ -6150,7 +6170,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t

cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

if (sched_feat(SIS_PROP) && !smt) {
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;

/*
Expand All @@ -6170,7 +6190,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}

for_each_cpu_wrap(cpu, cpus, target) {
if (smt) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
return i;
Expand All @@ -6184,10 +6204,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
}

if (smt)
if (has_idle_core)
set_idle_cores(this, false);

if (sched_feat(SIS_PROP) && !smt) {
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
update_avg(&this_sd->avg_scan_cost, time);
}
Expand Down Expand Up @@ -6242,6 +6262,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util;
int i, recent_used_cpu;
Expand Down Expand Up @@ -6321,7 +6342,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;

i = select_idle_cpu(p, sd, target);
if (sched_smt_active()) {
has_idle_core = test_idle_cores(target, false);

if (!has_idle_core && cpus_share_cache(prev, target)) {
i = select_idle_smt(p, sd, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
}

i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;

Expand Down

0 comments on commit c722f35

Please sign in to comment.