@@ -267,6 +267,33 @@ static bool cache_has_usable_cpor(struct mpam_class *class)
 	return class->props.cpbm_wd <= 32;
 }
 
+static bool mba_class_use_mbw_max(struct mpam_props *cprops)
+{
+	return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
+		cprops->bwa_wd);
+}
+
+static bool class_has_usable_mba(struct mpam_props *cprops)
+{
+	return mba_class_use_mbw_max(cprops);
+}
+
+/*
+ * Calculate the worst-case percentage change from each implemented step
+ * in the control.
+ */
+static u32 get_mba_granularity(struct mpam_props *cprops)
+{
+	if (!mba_class_use_mbw_max(cprops))
+		return 0;
+
+	/*
+	 * bwa_wd is the number of bits implemented in the 0.xxx
+	 * fixed-point fraction: 1 bit gives 50% steps, 2 bits 25%, etc.
+	 */
+	return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
+}
+
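/*
 * Worked example (not part of this patch; assumes resctrl's usual
 * MAX_MBA_BW of 100): with bwa_wd = 2 the control implements
 * 1 << 2 = 4 fixed-point steps, so get_mba_granularity() returns
 * DIV_ROUND_UP(100, 4) = 25, i.e. one step can change the allocation
 * by up to 25%. With bwa_wd = 8 there are 256 steps and the result is
 * DIV_ROUND_UP(100, 256) = 1, resctrl's finest 1% granularity.
 */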
 /*
  * Each fixed-point hardware value architecturally represents a range
  * of values: the full range 0% - 100% is split contiguously into
@@ -317,6 +344,160 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
 	return val;
 }
 
+static u32 get_mba_min(struct mpam_props *cprops)
+{
+	if (!mba_class_use_mbw_max(cprops)) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	return mbw_max_to_percent(0, cprops);
+}
+
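/*
 * Sketch of the intent (not part of this patch): resctrl's minimum
 * bandwidth value is whatever percentage the all-zeroes hardware value
 * represents. Because each fixed-point value covers a range (see the
 * range comment earlier in the file), this is non-zero: with bwa_wd = 2,
 * for example, the value 0 covers the lowest 25% band, and get_mba_min()
 * returns whichever percentage mbw_max_to_percent() resolves that band to.
 */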
+/* Find the L3 cache that has affinity with this CPU */
+static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
+{
+	u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
+
+	lockdep_assert_cpus_held();
+
+	return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
+}
+
+/*
+ * topology_matches_l3() - Is the provided class the same shape as the L3?
+ * @victim: The class we'd like to pretend is L3.
+ *
+ * resctrl expects all the world's a Xeon, with all counters on the L3.
+ * We allow counters on some other classes to be mapped to the L3, which
+ * requires that the class's CPU->domain mapping has the same shape.
+ *
+ * Using cacheinfo directly would make this work even if resctrl can't
+ * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
+ * Using the L3 resctrl domain list also depends on CPUs being online.
+ * Using the mpam_class we picked for the L3, so that its domain list
+ * can be walked, assumes that there are MPAM controls on the L3.
+ * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
+ * helper, which can tell us about offline CPUs ... but getting the cache_id
+ * to start with relies on at least one CPU per L3 cache being online at
+ * boot.
+ *
+ * Walk the victim class's component list and compare each component's
+ * affinity mask with that of the corresponding L3 cache. The topology
+ * matches if every pair of masks is equal. These lists/masks are
+ * computed from firmware tables, so they don't change at runtime.
+ */
+static bool topology_matches_l3(struct mpam_class *victim)
+{
+	int cpu, err;
+	struct mpam_component *victim_iter;
+
+	lockdep_assert_cpus_held();
+
+	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
+		return false;
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (cpumask_empty(&victim_iter->affinity)) {
+			pr_debug("class %u has CPU-less component %u - can't match L3!\n",
+				 victim->level, victim_iter->comp_id);
+			return false;
+		}
+
+		cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
+		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+			return false;
+
+		cpumask_clear(tmp_cpumask);
+		err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+		if (err) {
+			pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
+				 victim->level, victim_iter->comp_id);
+			return false;
+		}
+
+		/* Any differing bits in the affinity mask? */
+		if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
+			pr_debug("class %u component %u has mismatched CPU mask with L3 equivalent\n"
+				 "L3:%*pbl != victim:%*pbl\n",
+				 victim->level, victim_iter->comp_id,
+				 cpumask_pr_args(tmp_cpumask),
+				 cpumask_pr_args(&victim_iter->affinity));
+
+			return false;
+		}
+	}
+
+	return true;
+}
+
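/*
 * A minimal sketch (not part of this patch) of the scope-based cleanup
 * idiom used above, assuming the linux/cleanup.h helpers: __free()
 * attaches a destructor that runs when the variable goes out of scope,
 * and guard(srcu) drops the SRCU read lock on every return path, so
 * none of the early 'return false' exits leak tmp_cpumask or hold
 * mpam_srcu.
 */
static bool __maybe_unused cleanup_idiom_sketch(void)
{
	cpumask_var_t __free(free_cpumask_var) mask = CPUMASK_VAR_NULL;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return false;		/* freeing an unset mask is a no-op */

	guard(srcu)(&mpam_srcu);	/* srcu_read_unlock() runs on return */
	return true;			/* free_cpumask_var(mask) runs here */
}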
+/*
+ * Test whether the traffic for a class matches that at egress from the L3.
+ * For an MSC at a memory controller this is only possible if there is a
+ * single L3, as otherwise the counters at the memory controller can
+ * include bandwidth from a non-local L3.
+ */
+static bool traffic_matches_l3(struct mpam_class *class)
+{
+	int err, cpu;
+
+	lockdep_assert_cpus_held();
+
+	if (class->type == MPAM_CLASS_CACHE && class->level == 3)
+		return true;
+
+	if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+		pr_debug("class %u is a different cache from the L3\n", class->level);
+		return false;
+	}
+
+	if (class->type != MPAM_CLASS_MEMORY) {
+		pr_debug("class %u is neither cache nor memory\n", class->level);
+		return false;
+	}
+
+	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
+		pr_debug("cpumask allocation failed\n");
+		return false;
+	}
+
+	cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
+	err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+	if (err) {
+		pr_debug("Failed to find L3 downstream of CPU %d\n", cpu);
+		return false;
+	}
+
+	if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
+		pr_debug("There is more than one L3\n");
+		return false;
+	}
+
+	/* Be strict; the traffic might stop in an intermediate cache. */
+	if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
+		pr_debug("L3 isn't the last level of cache\n");
+		return false;
+	}
+
+	if (num_possible_nodes() > 1) {
+		pr_debug("There is more than one NUMA node\n");
+		return false;
+	}
+
+#ifdef CONFIG_HMEM_REPORTING
+	if (node_devices[cpu_to_node(cpu)]->cache_dev) {
+		pr_debug("There is a memory-side cache\n");
+		return false;
+	}
+#endif
+
+	return true;
+}
+
 /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */
 static void mpam_resctrl_pick_caches(void)
 {
@@ -358,9 +539,68 @@ static void mpam_resctrl_pick_caches(void)
 	}
 }
 
+static void mpam_resctrl_pick_mba(void)
+{
+	struct mpam_class *class, *candidate_class = NULL;
+	struct mpam_resctrl_res *res;
+
+	lockdep_assert_cpus_held();
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		struct mpam_props *cprops = &class->props;
+
+		if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
+			pr_debug("class %u is a cache but not the L3\n", class->level);
+			continue;
+		}
+
+		if (!class_has_usable_mba(cprops)) {
+			pr_debug("class %u has no bandwidth control\n",
+				 class->level);
+			continue;
+		}
+
+		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+			pr_debug("class %u has missing CPUs\n", class->level);
+			continue;
+		}
+
+		if (!topology_matches_l3(class)) {
+			pr_debug("class %u topology doesn't match L3\n",
+				 class->level);
+			continue;
+		}
+
+		if (!traffic_matches_l3(class)) {
+			pr_debug("class %u traffic doesn't match L3 egress\n",
+				 class->level);
+			continue;
+		}
+
+		/*
+		 * Pick a resource to be MBA that is as close as possible to
+		 * the L3. mbm_total counts the bandwidth leaving the L3
+		 * cache, and MBA should correspond as closely as possible
+		 * for mba_sc to operate properly.
+		 */
+		if (!candidate_class || class->level < candidate_class->level)
+			candidate_class = class;
+	}
+
+	if (candidate_class) {
+		pr_debug("selected class %u to back MBA\n",
+			 candidate_class->level);
+		res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
+		res->class = candidate_class;
+	}
+}
+
 static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
 {
 	struct mpam_class *class = res->class;
+	struct mpam_props *cprops = &class->props;
 	struct rdt_resource *r = &res->resctrl_res;
 
 	switch (r->rid) {
@@ -392,6 +632,19 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
 		r->cache.shareable_bits = resctrl_get_default_ctrl(r);
 		r->alloc_capable = true;
 		break;
+	case RDT_RESOURCE_MBA:
+		r->schema_fmt = RESCTRL_SCHEMA_RANGE;
+		r->ctrl_scope = RESCTRL_L3_CACHE;
+
+		r->membw.delay_linear = true;
+		r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+		r->membw.min_bw = get_mba_min(cprops);
+		r->membw.max_bw = MAX_MBA_BW;
+		r->membw.bw_gran = get_mba_granularity(cprops);
+
+		r->name = "MB";
+		r->alloc_capable = true;
+		break;
 	default:
 		return -EINVAL;
 	}
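/*
 * A note on how these membw fields surface to userspace (not part of
 * this patch): the MB schema takes a percentage per L3 domain, e.g.
 * "MB:0=25;1=100" written to the schemata file. The resctrl
 * filesystem's bandwidth validation is expected to reject values
 * outside [min_bw, max_bw] and round the rest up to a multiple of
 * bw_gran, so with bwa_wd = 2 (bw_gran = 25) a request of 30 reaches
 * resctrl_arch_update_one() as 50.
 */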
@@ -406,7 +659,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
 	if (class->type == MPAM_CLASS_CACHE)
 		return comp->comp_id;
 
-	/* TODO: repaint domain ids to match the L3 domain ids */
+	if (topology_matches_l3(class)) {
+		/* Use the corresponding L3 component ID as the domain ID */
+		int id = get_cpu_cacheinfo_id(cpu, 3);
+
+		/* Implies topology_matches_l3() made a mistake */
+		if (WARN_ON_ONCE(id == -1))
+			return comp->comp_id;
+
+		return id;
+	}
+
 	/* Otherwise, expose the ID used by the firmware table code. */
 	return comp->comp_id;
 }
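/*
 * Hypothetical example of why the repainting matters (not part of this
 * patch): if firmware numbers a memory-controller component 0 but the
 * CPUs it serves share the L3 with cache_id 42, the MBA domain is
 * reported as 42, so it lines up with the corresponding L3 domain in
 * resctrl's schemata.
 */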
@@ -446,6 +709,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
 	case RDT_RESOURCE_L3:
 		configured_by = mpam_feat_cpor_part;
 		break;
+	case RDT_RESOURCE_MBA:
+		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+			configured_by = mpam_feat_mbw_max;
+			break;
+		}
+		fallthrough;
 	default:
 		return resctrl_get_default_ctrl(r);
 	}
@@ -457,6 +726,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
 	switch (configured_by) {
 	case mpam_feat_cpor_part:
 		return cfg->cpbm;
+	case mpam_feat_mbw_max:
+		return mbw_max_to_percent(cfg->mbw_max, cprops);
 	default:
 		return resctrl_get_default_ctrl(r);
 	}
@@ -504,6 +775,13 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
 		cfg.cpbm = cfg_val;
 		mpam_set_feature(mpam_feat_cpor_part, &cfg);
 		break;
+	case RDT_RESOURCE_MBA:
+		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+			cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
+			mpam_set_feature(mpam_feat_mbw_max, &cfg);
+			break;
+		}
+		fallthrough;
 	default:
 		return -EINVAL;
 	}
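/*
 * A hypothetical round-trip (not part of this patch): the fixed-point
 * encoding means a write/read round-trip through resctrl need not be
 * the identity. percent_to_mbw_max() quantises the user's percentage to
 * an implemented hardware step, and resctrl_arch_get_config() maps that
 * step back to a single percentage with mbw_max_to_percent(), so the
 * schemata file may display a neighbouring value rather than the one
 * written, e.g. with bwa_wd = 2:
 */
	cfg.mbw_max = percent_to_mbw_max(30, cprops);	/* quantised step */
	pr_debug("reads back as %u%%\n",
		 mbw_max_to_percent(cfg.mbw_max, cprops));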
@@ -775,6 +1053,7 @@ int mpam_resctrl_setup(void)
 
 	/* Find some classes to use for controls */
 	mpam_resctrl_pick_caches();
+	mpam_resctrl_pick_mba();
 
 	/* Initialise the resctrl structures from the classes */
 	for_each_mpam_resctrl_control(res, rid) {