Skip to content

Commit 36528c7

Browse files
author
James Morse
committed
arm_mpam: resctrl: Add support for 'MB' resource
resctrl supports 'MB', as a percentage throttling of traffic from the L3. This is the control that mba_sc uses, so ideally the class chosen should be as close as possible to the counters used for mbm_total. If there is a single L3, it's the last cache, and the topology of the memory matches, then the traffic at the memory controller will be equivalent to that at egress of the L3. If these conditions are met, allow the memory class to back MB. MB's percentage control should be backed either with the fixed point fraction MBW_MAX or bandwidth portion bitmaps. The bandwidth portion bitmap is not used as it's tricky to pick which bits to use to avoid contention, and it may be possible to expose this as something other than a percentage in the future. Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com> Tested-by: Zeng Heng <zengheng4@huawei.com> Tested-by: Punit Agrawal <punit.agrawal@oss.qualcomm.com> Tested-by: Gavin Shan <gshan@redhat.com> Tested-by: Jesse Chick <jessechick@os.amperecomputing.com> Reviewed-by: Zeng Heng <zengheng4@huawei.com> Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com> Reviewed-by: Gavin Shan <gshan@redhat.com> Co-developed-by: Dave Martin <Dave.Martin@arm.com> Signed-off-by: Dave Martin <Dave.Martin@arm.com> Co-developed-by: Ben Horgan <ben.horgan@arm.com> Signed-off-by: Ben Horgan <ben.horgan@arm.com> Signed-off-by: James Morse <james.morse@arm.com>
1 parent 1c1e296 commit 36528c7

1 file changed

Lines changed: 280 additions & 1 deletion

File tree

drivers/resctrl/mpam_resctrl.c

Lines changed: 280 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,33 @@ static bool cache_has_usable_cpor(struct mpam_class *class)
267267
return class->props.cpbm_wd <= 32;
268268
}
269269

270+
static bool mba_class_use_mbw_max(struct mpam_props *cprops)
271+
{
272+
return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
273+
cprops->bwa_wd);
274+
}
275+
276+
/*
 * Can this class provide resctrl's 'MB' percentage control?
 * Only the MBW_MAX fixed-point fraction is used to back MB; bandwidth
 * portion bitmaps are deliberately not considered here.
 */
static bool class_has_usable_mba(struct mpam_props *cprops)
{
	bool usable = mba_class_use_mbw_max(cprops);

	return usable;
}
280+
281+
/*
282+
* Calculate the worst-case percentage change from each implemented step
283+
* in the control.
284+
*/
285+
static u32 get_mba_granularity(struct mpam_props *cprops)
286+
{
287+
if (!mba_class_use_mbw_max(cprops))
288+
return 0;
289+
290+
/*
291+
* bwa_wd is the number of bits implemented in the 0.xxx
292+
* fixed point fraction. 1 bit is 50%, 2 is 25% etc.
293+
*/
294+
return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
295+
}
296+
270297
/*
271298
* Each fixed-point hardware value architecturally represents a range
272299
* of values: the full range 0% - 100% is split contiguously into
@@ -317,6 +344,160 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
317344
return val;
318345
}
319346

347+
/* The smallest percentage that the MB control can express for this class. */
static u32 get_mba_min(struct mpam_props *cprops)
{
	/* Callers should only ask about classes that were picked to back MB. */
	if (WARN_ON_ONCE(!mba_class_use_mbw_max(cprops)))
		return 0;

	/* The all-zeroes hardware value maps to the minimum percentage. */
	return mbw_max_to_percent(0, cprops);
}
356+
357+
/* Find the L3 cache that has affinity with this CPU */
358+
static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
359+
{
360+
u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
361+
362+
lockdep_assert_cpus_held();
363+
364+
return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
365+
}
366+
367+
/*
 * topology_matches_l3() - Is the provided class the same shape as L3
 * @victim: The class we'd like to pretend is L3.
 *
 * resctrl expects all the world's a Xeon, and all counters are on the
 * L3. We allow some mapping counters on other classes. This requires
 * that the CPU->domain mapping is the same kind of shape.
 *
 * Using cacheinfo directly would make this work even if resctrl can't
 * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
 * Using the L3 resctrl domain list also depends on CPUs being online.
 * Using the mpam_class we picked for L3 so we can use its domain list
 * assumes that there are MPAM controls on the L3.
 * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
 * helper which can tell us about offline CPUs ... but getting the cache_id
 * to start with relies on at least one CPU per L3 cache being online at
 * boot.
 *
 * Walk the victim component list and compare the affinity mask with the
 * corresponding L3. The topology matches if each victim:component's affinity
 * mask is the same as the CPU's corresponding L3's. These lists/masks are
 * computed from firmware tables so don't change at runtime.
 *
 * Returns false on any mismatch, on allocation failure, or if a component
 * has no CPUs.
 */
static bool topology_matches_l3(struct mpam_class *victim)
{
	int cpu, err;
	struct mpam_component *victim_iter;

	/* find_l3_equivalent_bitmask() relies on CPUs not being unplugged. */
	lockdep_assert_cpus_held();

	/* __free(): mask is automatically freed on every return path below. */
	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
		return false;

	/* The class's component list is protected by SRCU. */
	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		if (cpumask_empty(&victim_iter->affinity)) {
			pr_debug("class %u has CPU-less component %u - can't match L3!\n",
				 victim->level, victim_iter->comp_id);
			return false;
		}

		/* Need an online CPU in this component to look up its L3. */
		cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
			return false;

		cpumask_clear(tmp_cpumask);
		err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
		if (err) {
			pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
				 victim->level, victim_iter->comp_id);
			return false;
		}

		/* Any differing bits in the affinity mask? */
		if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
			pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n"
				 "L3:%*pbl != victim:%*pbl\n",
				 victim->level, victim_iter->comp_id,
				 cpumask_pr_args(tmp_cpumask),
				 cpumask_pr_args(&victim_iter->affinity));

			return false;
		}
	}

	return true;
}
436+
437+
/*
 * Test if the traffic for a class matches that at egress from the L3. For
 * MSC at memory controllers this is only possible if there is a single L3
 * as otherwise the counters at the memory can include bandwidth from the
 * non-local L3.
 *
 * Returns true only for the L3 itself, or for a memory class on a system
 * with a single L3 that is the last level of cache, a single NUMA node,
 * and no memory-side cache.
 */
static bool traffic_matches_l3(struct mpam_class *class)
{
	int err, cpu;

	/* The cacheinfo lookups below need CPUs to stay put. */
	lockdep_assert_cpus_held();

	/* The L3 trivially matches itself. */
	if (class->type == MPAM_CLASS_CACHE && class->level == 3)
		return true;

	if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
		pr_debug("class %u is a different cache from L3\n", class->level);
		return false;
	}

	if (class->type != MPAM_CLASS_MEMORY) {
		pr_debug("class %u is neither of type cache or memory\n", class->level);
		return false;
	}

	/* __free(): mask is automatically freed on every return path below. */
	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
		pr_debug("cpumask allocation failed\n");
		return false;
	}

	/*
	 * NOTE(review): unlike topology_matches_l3(), the result of
	 * cpumask_any_and() is not checked against nr_cpu_ids here -
	 * presumably a memory class always covers an online CPU; confirm.
	 */
	cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
	err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
	if (err) {
		pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
		return false;
	}

	/* A single L3 must cover every possible CPU. */
	if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
		pr_debug("There is more than one L3\n");
		return false;
	}

	/* Be strict; the traffic might stop in the intermediate cache. */
	if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
		pr_debug("L3 isn't the last level of cache\n");
		return false;
	}

	/* Multiple nodes mean memory traffic isn't all from the local L3. */
	if (num_possible_nodes() > 1) {
		pr_debug("There is more than one numa node\n");
		return false;
	}

#ifdef CONFIG_HMEM_REPORTING
	/* A memory-side cache would absorb some of the L3's egress traffic. */
	if (node_devices[cpu_to_node(cpu)]->cache_dev) {
		pr_debug("There is a memory side cache\n");
		return false;
	}
#endif

	return true;
}
500+
320501
/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */
321502
static void mpam_resctrl_pick_caches(void)
322503
{
@@ -358,9 +539,68 @@ static void mpam_resctrl_pick_caches(void)
358539
}
359540
}
360541

542+
/*
 * Pick a class to back resctrl's 'MB' (memory bandwidth) resource.
 * Candidates must have a usable bandwidth control, cover all possible
 * CPUs, and match the L3's topology and traffic so that mba_sc's
 * comparison against mbm_total remains meaningful. Of the surviving
 * candidates, the one closest to the L3 (lowest level) wins.
 */
static void mpam_resctrl_pick_mba(void)
{
	struct mpam_class *class, *candidate_class = NULL;
	struct mpam_resctrl_res *res;

	lockdep_assert_cpus_held();

	/* The global class list is protected by SRCU. */
	guard(srcu)(&mpam_srcu);
	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
				 srcu_read_lock_held(&mpam_srcu)) {
		struct mpam_props *cprops = &class->props;

		/* Only the L3 itself or non-cache classes can proceed. */
		if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
			pr_debug("class %u is a cache but not the L3\n", class->level);
			continue;
		}

		if (!class_has_usable_mba(cprops)) {
			pr_debug("class %u has no bandwidth control\n",
				 class->level);
			continue;
		}

		/* MB must be controllable for every possible CPU. */
		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
			pr_debug("class %u has missing CPUs\n", class->level);
			continue;
		}

		if (!topology_matches_l3(class)) {
			pr_debug("class %u topology doesn't match L3\n",
				 class->level);
			continue;
		}

		if (!traffic_matches_l3(class)) {
			pr_debug("class %u traffic doesn't match L3 egress\n",
				 class->level);
			continue;
		}

		/*
		 * Pick a resource to be MBA that as close as possible to
		 * the L3. mbm_total counts the bandwidth leaving the L3
		 * cache and MBA should correspond as closely as possible
		 * for proper operation of mba_sc.
		 */
		if (!candidate_class || class->level < candidate_class->level)
			candidate_class = class;
	}

	if (candidate_class) {
		pr_debug("selected class %u to back MBA\n",
			 candidate_class->level);
		res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
		res->class = candidate_class;
	}
}
599+
361600
static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
362601
{
363602
struct mpam_class *class = res->class;
603+
struct mpam_props *cprops = &class->props;
364604
struct rdt_resource *r = &res->resctrl_res;
365605

366606
switch (r->rid) {
@@ -392,6 +632,19 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
392632
r->cache.shareable_bits = resctrl_get_default_ctrl(r);
393633
r->alloc_capable = true;
394634
break;
635+
case RDT_RESOURCE_MBA:
636+
r->schema_fmt = RESCTRL_SCHEMA_RANGE;
637+
r->ctrl_scope = RESCTRL_L3_CACHE;
638+
639+
r->membw.delay_linear = true;
640+
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
641+
r->membw.min_bw = get_mba_min(cprops);
642+
r->membw.max_bw = MAX_MBA_BW;
643+
r->membw.bw_gran = get_mba_granularity(cprops);
644+
645+
r->name = "MB";
646+
r->alloc_capable = true;
647+
break;
395648
default:
396649
return -EINVAL;
397650
}
@@ -406,7 +659,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
406659
if (class->type == MPAM_CLASS_CACHE)
407660
return comp->comp_id;
408661

409-
/* TODO: repaint domain ids to match the L3 domain ids */
662+
if (topology_matches_l3(class)) {
663+
/* Use the corresponding L3 component ID as the domain ID */
664+
int id = get_cpu_cacheinfo_id(cpu, 3);
665+
666+
/* Implies topology_matches_l3() made a mistake */
667+
if (WARN_ON_ONCE(id == -1))
668+
return comp->comp_id;
669+
670+
return id;
671+
}
672+
410673
/* Otherwise, expose the ID used by the firmware table code. */
411674
return comp->comp_id;
412675
}
@@ -446,6 +709,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
446709
case RDT_RESOURCE_L3:
447710
configured_by = mpam_feat_cpor_part;
448711
break;
712+
case RDT_RESOURCE_MBA:
713+
if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
714+
configured_by = mpam_feat_mbw_max;
715+
break;
716+
}
717+
fallthrough;
449718
default:
450719
return resctrl_get_default_ctrl(r);
451720
}
@@ -457,6 +726,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
457726
switch (configured_by) {
458727
case mpam_feat_cpor_part:
459728
return cfg->cpbm;
729+
case mpam_feat_mbw_max:
730+
return mbw_max_to_percent(cfg->mbw_max, cprops);
460731
default:
461732
return resctrl_get_default_ctrl(r);
462733
}
@@ -504,6 +775,13 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
504775
cfg.cpbm = cfg_val;
505776
mpam_set_feature(mpam_feat_cpor_part, &cfg);
506777
break;
778+
case RDT_RESOURCE_MBA:
779+
if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
780+
cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
781+
mpam_set_feature(mpam_feat_mbw_max, &cfg);
782+
break;
783+
}
784+
fallthrough;
507785
default:
508786
return -EINVAL;
509787
}
@@ -775,6 +1053,7 @@ int mpam_resctrl_setup(void)
7751053

7761054
/* Find some classes to use for controls */
7771055
mpam_resctrl_pick_caches();
1056+
mpam_resctrl_pick_mba();
7781057

7791058
/* Initialise the resctrl structures from the classes */
7801059
for_each_mpam_resctrl_control(res, rid) {

0 commit comments

Comments
 (0)