Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

arm_mpam: resctrl: Add support for 'MB' resource

resctrl supports 'MB', as a percentage throttling of traffic from the
L3. This is the control that mba_sc uses, so ideally the class chosen
should be as close as possible to the counters used for mbm_total. If there
is a single L3, it's the last cache, and the topology of the memory matches,
then the traffic at the memory controller will be equivalent to that at the
egress of the L3. If these conditions are met, allow the memory class to
back MB.

MB's percentage control should be backed either with the fixed point
fraction MBW_MAX or bandwidth portion bitmaps. The bandwidth portion
bitmaps are not used as it's tricky to pick which bits to use to avoid
contention, and it may be possible to expose this as something other than a
percentage in the future.

Tested-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Tested-by: Zeng Heng <zengheng4@huawei.com>
Tested-by: Punit Agrawal <punit.agrawal@oss.qualcomm.com>
Tested-by: Gavin Shan <gshan@redhat.com>
Tested-by: Jesse Chick <jessechick@os.amperecomputing.com>
Reviewed-by: Zeng Heng <zengheng4@huawei.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Co-developed-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Co-developed-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>

+280 -1
+280 -1
drivers/resctrl/mpam_resctrl.c
··· 267 267 return class->props.cpbm_wd <= 32; 268 268 } 269 269 270 + static bool mba_class_use_mbw_max(struct mpam_props *cprops) 271 + { 272 + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && 273 + cprops->bwa_wd); 274 + } 275 + 276 + static bool class_has_usable_mba(struct mpam_props *cprops) 277 + { 278 + return mba_class_use_mbw_max(cprops); 279 + } 280 + 281 + /* 282 + * Calculate the worst-case percentage change from each implemented step 283 + * in the control. 284 + */ 285 + static u32 get_mba_granularity(struct mpam_props *cprops) 286 + { 287 + if (!mba_class_use_mbw_max(cprops)) 288 + return 0; 289 + 290 + /* 291 + * bwa_wd is the number of bits implemented in the 0.xxx 292 + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. 293 + */ 294 + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); 295 + } 296 + 270 297 /* 271 298 * Each fixed-point hardware value architecturally represents a range 272 299 * of values: the full range 0% - 100% is split contiguously into ··· 344 317 return val; 345 318 } 346 319 320 + static u32 get_mba_min(struct mpam_props *cprops) 321 + { 322 + if (!mba_class_use_mbw_max(cprops)) { 323 + WARN_ON_ONCE(1); 324 + return 0; 325 + } 326 + 327 + return mbw_max_to_percent(0, cprops); 328 + } 329 + 330 + /* Find the L3 cache that has affinity with this CPU */ 331 + static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) 332 + { 333 + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); 334 + 335 + lockdep_assert_cpus_held(); 336 + 337 + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); 338 + } 339 + 340 + /* 341 + * topology_matches_l3() - Is the provided class the same shape as L3 342 + * @victim: The class we'd like to pretend is L3. 343 + * 344 + * resctrl expects all the world's a Xeon, and all counters are on the 345 + * L3. We allow some mapping counters on other classes. This requires 346 + * that the CPU->domain mapping is the same kind of shape. 
347 + * 348 + * Using cacheinfo directly would make this work even if resctrl can't 349 + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. 350 + * Using the L3 resctrl domain list also depends on CPUs being online. 351 + * Using the mpam_class we picked for L3 so we can use its domain list 352 + * assumes that there are MPAM controls on the L3. 353 + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() 354 + * helper which can tell us about offline CPUs ... but getting the cache_id 355 + * to start with relies on at least one CPU per L3 cache being online at 356 + * boot. 357 + * 358 + * Walk the victim component list and compare the affinity mask with the 359 + * corresponding L3. The topology matches if each victim:component's affinity 360 + * mask is the same as the CPU's corresponding L3's. These lists/masks are 361 + * computed from firmware tables so don't change at runtime. 362 + */ 363 + static bool topology_matches_l3(struct mpam_class *victim) 364 + { 365 + int cpu, err; 366 + struct mpam_component *victim_iter; 367 + 368 + lockdep_assert_cpus_held(); 369 + 370 + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; 371 + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) 372 + return false; 373 + 374 + guard(srcu)(&mpam_srcu); 375 + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, 376 + srcu_read_lock_held(&mpam_srcu)) { 377 + if (cpumask_empty(&victim_iter->affinity)) { 378 + pr_debug("class %u has CPU-less component %u - can't match L3!\n", 379 + victim->level, victim_iter->comp_id); 380 + return false; 381 + } 382 + 383 + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); 384 + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) 385 + return false; 386 + 387 + cpumask_clear(tmp_cpumask); 388 + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); 389 + if (err) { 390 + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", 391 + victim->level, 
victim_iter->comp_id); 392 + return false; 393 + } 394 + 395 + /* Any differing bits in the affinity mask? */ 396 + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { 397 + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" 398 + "L3:%*pbl != victim:%*pbl\n", 399 + victim->level, victim_iter->comp_id, 400 + cpumask_pr_args(tmp_cpumask), 401 + cpumask_pr_args(&victim_iter->affinity)); 402 + 403 + return false; 404 + } 405 + } 406 + 407 + return true; 408 + } 409 + 410 + /* 411 + * Test if the traffic for a class matches that at egress from the L3. For 412 + * MSC at memory controllers this is only possible if there is a single L3 413 + * as otherwise the counters at the memory can include bandwidth from the 414 + * non-local L3. 415 + */ 416 + static bool traffic_matches_l3(struct mpam_class *class) 417 + { 418 + int err, cpu; 419 + 420 + lockdep_assert_cpus_held(); 421 + 422 + if (class->type == MPAM_CLASS_CACHE && class->level == 3) 423 + return true; 424 + 425 + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { 426 + pr_debug("class %u is a different cache from L3\n", class->level); 427 + return false; 428 + } 429 + 430 + if (class->type != MPAM_CLASS_MEMORY) { 431 + pr_debug("class %u is neither of type cache or memory\n", class->level); 432 + return false; 433 + } 434 + 435 + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; 436 + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { 437 + pr_debug("cpumask allocation failed\n"); 438 + return false; 439 + } 440 + 441 + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); 442 + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); 443 + if (err) { 444 + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); 445 + return false; 446 + } 447 + 448 + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { 449 + pr_debug("There is more than one L3\n"); 450 + return false; 451 + } 452 + 453 + /* Be strict; the traffic might stop in the intermediate 
cache. */ 454 + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { 455 + pr_debug("L3 isn't the last level of cache\n"); 456 + return false; 457 + } 458 + 459 + if (num_possible_nodes() > 1) { 460 + pr_debug("There is more than one numa node\n"); 461 + return false; 462 + } 463 + 464 + #ifdef CONFIG_HMEM_REPORTING 465 + if (node_devices[cpu_to_node(cpu)]->cache_dev) { 466 + pr_debug("There is a memory side cache\n"); 467 + return false; 468 + } 469 + #endif 470 + 471 + return true; 472 + } 473 + 347 474 /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ 348 475 static void mpam_resctrl_pick_caches(void) 349 476 { ··· 539 358 } 540 359 } 541 360 361 + static void mpam_resctrl_pick_mba(void) 362 + { 363 + struct mpam_class *class, *candidate_class = NULL; 364 + struct mpam_resctrl_res *res; 365 + 366 + lockdep_assert_cpus_held(); 367 + 368 + guard(srcu)(&mpam_srcu); 369 + list_for_each_entry_srcu(class, &mpam_classes, classes_list, 370 + srcu_read_lock_held(&mpam_srcu)) { 371 + struct mpam_props *cprops = &class->props; 372 + 373 + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { 374 + pr_debug("class %u is a cache but not the L3\n", class->level); 375 + continue; 376 + } 377 + 378 + if (!class_has_usable_mba(cprops)) { 379 + pr_debug("class %u has no bandwidth control\n", 380 + class->level); 381 + continue; 382 + } 383 + 384 + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { 385 + pr_debug("class %u has missing CPUs\n", class->level); 386 + continue; 387 + } 388 + 389 + if (!topology_matches_l3(class)) { 390 + pr_debug("class %u topology doesn't match L3\n", 391 + class->level); 392 + continue; 393 + } 394 + 395 + if (!traffic_matches_l3(class)) { 396 + pr_debug("class %u traffic doesn't match L3 egress\n", 397 + class->level); 398 + continue; 399 + } 400 + 401 + /* 402 + * Pick a resource to be MBA that as close as possible to 403 + * the L3. 
mbm_total counts the bandwidth leaving the L3 404 + * cache and MBA should correspond as closely as possible 405 + * for proper operation of mba_sc. 406 + */ 407 + if (!candidate_class || class->level < candidate_class->level) 408 + candidate_class = class; 409 + } 410 + 411 + if (candidate_class) { 412 + pr_debug("selected class %u to back MBA\n", 413 + candidate_class->level); 414 + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; 415 + res->class = candidate_class; 416 + } 417 + } 418 + 542 419 static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) 543 420 { 544 421 struct mpam_class *class = res->class; 422 + struct mpam_props *cprops = &class->props; 545 423 struct rdt_resource *r = &res->resctrl_res; 546 424 547 425 switch (r->rid) { ··· 632 392 r->cache.shareable_bits = resctrl_get_default_ctrl(r); 633 393 r->alloc_capable = true; 634 394 break; 395 + case RDT_RESOURCE_MBA: 396 + r->schema_fmt = RESCTRL_SCHEMA_RANGE; 397 + r->ctrl_scope = RESCTRL_L3_CACHE; 398 + 399 + r->membw.delay_linear = true; 400 + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; 401 + r->membw.min_bw = get_mba_min(cprops); 402 + r->membw.max_bw = MAX_MBA_BW; 403 + r->membw.bw_gran = get_mba_granularity(cprops); 404 + 405 + r->name = "MB"; 406 + r->alloc_capable = true; 407 + break; 635 408 default: 636 409 return -EINVAL; 637 410 } ··· 659 406 if (class->type == MPAM_CLASS_CACHE) 660 407 return comp->comp_id; 661 408 662 - /* TODO: repaint domain ids to match the L3 domain ids */ 409 + if (topology_matches_l3(class)) { 410 + /* Use the corresponding L3 component ID as the domain ID */ 411 + int id = get_cpu_cacheinfo_id(cpu, 3); 412 + 413 + /* Implies topology_matches_l3() made a mistake */ 414 + if (WARN_ON_ONCE(id == -1)) 415 + return comp->comp_id; 416 + 417 + return id; 418 + } 419 + 663 420 /* Otherwise, expose the ID used by the firmware table code. 
*/ 664 421 return comp->comp_id; 665 422 } ··· 709 446 case RDT_RESOURCE_L3: 710 447 configured_by = mpam_feat_cpor_part; 711 448 break; 449 + case RDT_RESOURCE_MBA: 450 + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { 451 + configured_by = mpam_feat_mbw_max; 452 + break; 453 + } 454 + fallthrough; 712 455 default: 713 456 return resctrl_get_default_ctrl(r); 714 457 } ··· 726 457 switch (configured_by) { 727 458 case mpam_feat_cpor_part: 728 459 return cfg->cpbm; 460 + case mpam_feat_mbw_max: 461 + return mbw_max_to_percent(cfg->mbw_max, cprops); 729 462 default: 730 463 return resctrl_get_default_ctrl(r); 731 464 } ··· 775 504 cfg.cpbm = cfg_val; 776 505 mpam_set_feature(mpam_feat_cpor_part, &cfg); 777 506 break; 507 + case RDT_RESOURCE_MBA: 508 + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { 509 + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); 510 + mpam_set_feature(mpam_feat_mbw_max, &cfg); 511 + break; 512 + } 513 + fallthrough; 778 514 default: 779 515 return -EINVAL; 780 516 } ··· 1053 775 1054 776 /* Find some classes to use for controls */ 1055 777 mpam_resctrl_pick_caches(); 778 + mpam_resctrl_pick_mba(); 1056 779 1057 780 /* Initialise the resctrl structures from the classes */ 1058 781 for_each_mpam_resctrl_control(res, rid) {