Merge tag 'x86_cache_for_v6.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

-1

Documentation/arch/x86/index.rst

··· 32 32 pti 33 33 mds 34 34 microcode 35 - resctrl 36 35 tsx_async_abort 37 36 buslock 38 37 usb-legacy-support

+3 -3

Documentation/arch/x86/resctrl.rst Documentation/filesystems/resctrl.rst

··· 1 1 .. SPDX-License-Identifier: GPL-2.0 2 2 .. include:: <isonum.txt> 3 3 4 - =========================================== 5 - User Interface for Resource Control feature 6 - =========================================== 4 + ===================================================== 5 + User Interface for Resource Control feature (resctrl) 6 + ===================================================== 7 7 8 8 :Copyright: |copy| 2016 Intel Corporation 9 9 :Authors: - Fenghua Yu <fenghua.yu@intel.com>

+1

Documentation/filesystems/index.rst

··· 113 113 qnx6 114 114 ramfs-rootfs-initramfs 115 115 relay 116 + resctrl 116 117 romfs 117 118 smb/index 118 119 spufs/index

+4 -1

MAINTAINERS

··· 20501 20501 RDT - RESOURCE ALLOCATION 20502 20502 M: Tony Luck <tony.luck@intel.com> 20503 20503 M: Reinette Chatre <reinette.chatre@intel.com> 20504 + R: Dave Martin <Dave.Martin@arm.com> 20505 + R: James Morse <james.morse@arm.com> 20504 20506 L: linux-kernel@vger.kernel.org 20505 20507 S: Supported 20506 - F: Documentation/arch/x86/resctrl* 20508 + F: Documentation/filesystems/resctrl.rst 20507 20509 F: arch/x86/include/asm/resctrl.h 20508 20510 F: arch/x86/kernel/cpu/resctrl/ 20511 + F: fs/resctrl/ 20509 20512 F: include/linux/resctrl*.h 20510 20513 F: tools/testing/selftests/resctrl/ 20511 20514

+8

arch/Kconfig

··· 1518 1518 config ARCH_HAS_PHYS_TO_DMA 1519 1519 bool 1520 1520 1521 + config ARCH_HAS_CPU_RESCTRL 1522 + bool 1523 + help 1524 + An architecture selects this option to indicate that the necessary 1525 + hooks are provided to support the common memory system usage 1526 + monitoring and control interfaces provided by the 'resctrl' 1527 + filesystem (see RESCTRL_FS). 1528 + 1521 1529 config HAVE_ARCH_COMPILER_H 1522 1530 bool 1523 1531 help

+3 -8

arch/x86/Kconfig

··· 507 507 config X86_CPU_RESCTRL 508 508 bool "x86 CPU resource control support" 509 509 depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) 510 - select KERNFS 511 - select PROC_CPU_RESCTRL if PROC_FS 510 + depends on MISC_FILESYSTEMS 511 + select ARCH_HAS_CPU_RESCTRL 512 + select RESCTRL_FS 512 513 select RESCTRL_FS_PSEUDO_LOCK 513 514 help 514 515 Enable x86 CPU resource control support. ··· 526 525 Platform Quality of Service Extensions manual. 527 526 528 527 Say N if unsure. 529 - 530 - config RESCTRL_FS_PSEUDO_LOCK 531 - bool 532 - help 533 - Software mechanism to pin data in a cache portion using 534 - micro-architecture specific knowledge. 535 528 536 529 config X86_FRED 537 530 bool "Flexible Return and Event Delivery"

+8 -11

arch/x86/include/asm/resctrl.h

··· 177 177 return READ_ONCE(tsk->rmid) == rmid; 178 178 } 179 179 180 - static inline void resctrl_sched_in(struct task_struct *tsk) 180 + static inline void resctrl_arch_sched_in(struct task_struct *tsk) 181 181 { 182 182 if (static_branch_likely(&rdt_enable_key)) 183 183 __resctrl_sched_in(tsk); ··· 196 196 197 197 /* x86 can always read an rmid, nothing needs allocating */ 198 198 struct rdt_resource; 199 - static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) 199 + static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, 200 + enum resctrl_event_id evtid) 200 201 { 201 202 might_sleep(); 202 203 return NULL; 203 - }; 204 + } 204 205 205 - static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, 206 - void *ctx) { }; 206 + static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, 207 + enum resctrl_event_id evtid, 208 + void *ctx) { } 207 209 208 - u64 resctrl_arch_get_prefetch_disable_bits(void); 209 - int resctrl_arch_pseudo_lock_fn(void *_plr); 210 - int resctrl_arch_measure_cycles_lat_fn(void *_plr); 211 - int resctrl_arch_measure_l2_residency(void *_plr); 212 - int resctrl_arch_measure_l3_residency(void *_plr); 213 210 void resctrl_cpu_detect(struct cpuinfo_x86 *c); 214 211 215 212 #else 216 213 217 - static inline void resctrl_sched_in(struct task_struct *tsk) {} 214 + static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} 218 215 static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} 219 216 220 217 #endif /* CONFIG_X86_CPU_RESCTRL */

+2

arch/x86/kernel/cpu/resctrl/Makefile

··· 2 2 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o 3 3 obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o 4 4 obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o 5 + 6 + # To allow define_trace.h's recursive include: 5 7 CFLAGS_pseudo_lock.o = -I$(src)

+8 -23

arch/x86/kernel/cpu/resctrl/core.c

··· 61 61 [RDT_RESOURCE_L3] = 62 62 { 63 63 .r_resctrl = { 64 - .rid = RDT_RESOURCE_L3, 65 64 .name = "L3", 66 65 .ctrl_scope = RESCTRL_L3_CACHE, 67 66 .mon_scope = RESCTRL_L3_CACHE, ··· 74 75 [RDT_RESOURCE_L2] = 75 76 { 76 77 .r_resctrl = { 77 - .rid = RDT_RESOURCE_L2, 78 78 .name = "L2", 79 79 .ctrl_scope = RESCTRL_L2_CACHE, 80 80 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), ··· 85 87 [RDT_RESOURCE_MBA] = 86 88 { 87 89 .r_resctrl = { 88 - .rid = RDT_RESOURCE_MBA, 89 90 .name = "MB", 90 91 .ctrl_scope = RESCTRL_L3_CACHE, 91 92 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), ··· 94 97 [RDT_RESOURCE_SMBA] = 95 98 { 96 99 .r_resctrl = { 97 - .rid = RDT_RESOURCE_SMBA, 98 100 .name = "SMBA", 99 101 .ctrl_scope = RESCTRL_L3_CACHE, 100 102 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), ··· 159 163 r->alloc_capable = true; 160 164 161 165 rdt_alloc_capable = true; 162 - } 163 - 164 - bool is_mba_sc(struct rdt_resource *r) 165 - { 166 - if (!r) 167 - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 168 - 169 - /* 170 - * The software controller support is only applicable to MBA resource. 171 - * Make sure to check for resource type. 
172 - */ 173 - if (r->rid != RDT_RESOURCE_MBA) 174 - return false; 175 - 176 - return r->membw.mba_sc; 177 166 } 178 167 179 168 /* ··· 719 738 bool force_off, force_on; 720 739 }; 721 740 722 - static struct rdt_options rdt_options[] __initdata = { 741 + static struct rdt_options rdt_options[] __ro_after_init = { 723 742 RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), 724 743 RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), 725 744 RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), ··· 759 778 } 760 779 __setup("rdt", set_rdt_options); 761 780 762 - bool __init rdt_cpu_has(int flag) 781 + bool rdt_cpu_has(int flag) 763 782 { 764 783 bool ret = boot_cpu_has(flag); 765 784 struct rdt_options *o; ··· 779 798 return ret; 780 799 } 781 800 782 - __init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) 801 + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) 783 802 { 784 803 if (!rdt_cpu_has(X86_FEATURE_BMEC)) 785 804 return false; ··· 993 1012 static int __init resctrl_arch_late_init(void) 994 1013 { 995 1014 struct rdt_resource *r; 996 - int state, ret; 1015 + int state, ret, i; 1016 + 1017 + /* for_each_rdt_resource() requires all rid to be initialised. */ 1018 + for (i = 0; i < RDT_NUM_RESOURCES; i++) 1019 + rdt_resources_all[i].r_resctrl.rid = i; 997 1020 998 1021 /* 999 1022 * Initialize functions(or definitions) that are different

-635

arch/x86/kernel/cpu/resctrl/ctrlmondata.c

··· 16 16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 17 18 18 #include <linux/cpu.h> 19 - #include <linux/kernfs.h> 20 - #include <linux/seq_file.h> 21 - #include <linux/slab.h> 22 - #include <linux/tick.h> 23 19 24 20 #include "internal.h" 25 - 26 - struct rdt_parse_data { 27 - struct rdtgroup *rdtgrp; 28 - char *buf; 29 - }; 30 - 31 - typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, 32 - struct resctrl_schema *s, 33 - struct rdt_ctrl_domain *d); 34 - 35 - /* 36 - * Check whether MBA bandwidth percentage value is correct. The value is 37 - * checked against the minimum and max bandwidth values specified by the 38 - * hardware. The allocated bandwidth percentage is rounded to the next 39 - * control step available on the hardware. 40 - */ 41 - static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) 42 - { 43 - int ret; 44 - u32 bw; 45 - 46 - /* 47 - * Only linear delay values is supported for current Intel SKUs. 48 - */ 49 - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { 50 - rdt_last_cmd_puts("No support for non-linear MB domains\n"); 51 - return false; 52 - } 53 - 54 - ret = kstrtou32(buf, 10, &bw); 55 - if (ret) { 56 - rdt_last_cmd_printf("Invalid MB value %s\n", buf); 57 - return false; 58 - } 59 - 60 - /* Nothing else to do if software controller is enabled. 
*/ 61 - if (is_mba_sc(r)) { 62 - *data = bw; 63 - return true; 64 - } 65 - 66 - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { 67 - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", 68 - bw, r->membw.min_bw, r->membw.max_bw); 69 - return false; 70 - } 71 - 72 - *data = roundup(bw, (unsigned long)r->membw.bw_gran); 73 - return true; 74 - } 75 - 76 - static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, 77 - struct rdt_ctrl_domain *d) 78 - { 79 - struct resctrl_staged_config *cfg; 80 - u32 closid = data->rdtgrp->closid; 81 - struct rdt_resource *r = s->res; 82 - u32 bw_val; 83 - 84 - cfg = &d->staged_config[s->conf_type]; 85 - if (cfg->have_new_ctrl) { 86 - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 87 - return -EINVAL; 88 - } 89 - 90 - if (!bw_validate(data->buf, &bw_val, r)) 91 - return -EINVAL; 92 - 93 - if (is_mba_sc(r)) { 94 - d->mbps_val[closid] = bw_val; 95 - return 0; 96 - } 97 - 98 - cfg->new_ctrl = bw_val; 99 - cfg->have_new_ctrl = true; 100 - 101 - return 0; 102 - } 103 - 104 - /* 105 - * Check whether a cache bit mask is valid. 106 - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: 107 - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 108 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 109 - * 110 - * Haswell does not support a non-contiguous 1s value and additionally 111 - * requires at least two bits set. 112 - * AMD allows non-contiguous bitmasks. 
113 - */ 114 - static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) 115 - { 116 - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; 117 - unsigned int cbm_len = r->cache.cbm_len; 118 - unsigned long first_bit, zero_bit, val; 119 - int ret; 120 - 121 - ret = kstrtoul(buf, 16, &val); 122 - if (ret) { 123 - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); 124 - return false; 125 - } 126 - 127 - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { 128 - rdt_last_cmd_puts("Mask out of range\n"); 129 - return false; 130 - } 131 - 132 - first_bit = find_first_bit(&val, cbm_len); 133 - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 134 - 135 - /* Are non-contiguous bitmasks allowed? */ 136 - if (!r->cache.arch_has_sparse_bitmasks && 137 - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { 138 - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); 139 - return false; 140 - } 141 - 142 - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { 143 - rdt_last_cmd_printf("Need at least %d bits in the mask\n", 144 - r->cache.min_cbm_bits); 145 - return false; 146 - } 147 - 148 - *data = val; 149 - return true; 150 - } 151 - 152 - /* 153 - * Read one cache bit mask (hex). Check that it is valid for the current 154 - * resource type. 155 - */ 156 - static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, 157 - struct rdt_ctrl_domain *d) 158 - { 159 - struct rdtgroup *rdtgrp = data->rdtgrp; 160 - struct resctrl_staged_config *cfg; 161 - struct rdt_resource *r = s->res; 162 - u32 cbm_val; 163 - 164 - cfg = &d->staged_config[s->conf_type]; 165 - if (cfg->have_new_ctrl) { 166 - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 167 - return -EINVAL; 168 - } 169 - 170 - /* 171 - * Cannot set up more than one pseudo-locked region in a cache 172 - * hierarchy. 
173 - */ 174 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 175 - rdtgroup_pseudo_locked_in_hierarchy(d)) { 176 - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); 177 - return -EINVAL; 178 - } 179 - 180 - if (!cbm_validate(data->buf, &cbm_val, r)) 181 - return -EINVAL; 182 - 183 - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || 184 - rdtgrp->mode == RDT_MODE_SHAREABLE) && 185 - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { 186 - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); 187 - return -EINVAL; 188 - } 189 - 190 - /* 191 - * The CBM may not overlap with the CBM of another closid if 192 - * either is exclusive. 193 - */ 194 - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { 195 - rdt_last_cmd_puts("Overlaps with exclusive group\n"); 196 - return -EINVAL; 197 - } 198 - 199 - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { 200 - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || 201 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 202 - rdt_last_cmd_puts("Overlaps with other group\n"); 203 - return -EINVAL; 204 - } 205 - } 206 - 207 - cfg->new_ctrl = cbm_val; 208 - cfg->have_new_ctrl = true; 209 - 210 - return 0; 211 - } 212 - 213 - /* 214 - * For each domain in this resource we expect to find a series of: 215 - * id=mask 216 - * separated by ";". The "id" is in decimal, and must match one of 217 - * the "id"s for this resource. 
218 - */ 219 - static int parse_line(char *line, struct resctrl_schema *s, 220 - struct rdtgroup *rdtgrp) 221 - { 222 - enum resctrl_conf_type t = s->conf_type; 223 - ctrlval_parser_t *parse_ctrlval = NULL; 224 - struct resctrl_staged_config *cfg; 225 - struct rdt_resource *r = s->res; 226 - struct rdt_parse_data data; 227 - struct rdt_ctrl_domain *d; 228 - char *dom = NULL, *id; 229 - unsigned long dom_id; 230 - 231 - /* Walking r->domains, ensure it can't race with cpuhp */ 232 - lockdep_assert_cpus_held(); 233 - 234 - switch (r->schema_fmt) { 235 - case RESCTRL_SCHEMA_BITMAP: 236 - parse_ctrlval = &parse_cbm; 237 - break; 238 - case RESCTRL_SCHEMA_RANGE: 239 - parse_ctrlval = &parse_bw; 240 - break; 241 - } 242 - 243 - if (WARN_ON_ONCE(!parse_ctrlval)) 244 - return -EINVAL; 245 - 246 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 247 - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { 248 - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); 249 - return -EINVAL; 250 - } 251 - 252 - next: 253 - if (!line || line[0] == '\0') 254 - return 0; 255 - dom = strsep(&line, ";"); 256 - id = strsep(&dom, "="); 257 - if (!dom || kstrtoul(id, 10, &dom_id)) { 258 - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); 259 - return -EINVAL; 260 - } 261 - dom = strim(dom); 262 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 263 - if (d->hdr.id == dom_id) { 264 - data.buf = dom; 265 - data.rdtgrp = rdtgrp; 266 - if (parse_ctrlval(&data, s, d)) 267 - return -EINVAL; 268 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 269 - cfg = &d->staged_config[t]; 270 - /* 271 - * In pseudo-locking setup mode and just 272 - * parsed a valid CBM that should be 273 - * pseudo-locked. Only one locked region per 274 - * resource group and domain so just do 275 - * the required initialization for single 276 - * region and return. 
277 - */ 278 - rdtgrp->plr->s = s; 279 - rdtgrp->plr->d = d; 280 - rdtgrp->plr->cbm = cfg->new_ctrl; 281 - d->plr = rdtgrp->plr; 282 - return 0; 283 - } 284 - goto next; 285 - } 286 - } 287 - return -EINVAL; 288 - } 289 21 290 22 int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, 291 23 u32 closid, enum resctrl_conf_type t, u32 cfg_val) ··· 83 351 return 0; 84 352 } 85 353 86 - static int rdtgroup_parse_resource(char *resname, char *tok, 87 - struct rdtgroup *rdtgrp) 88 - { 89 - struct resctrl_schema *s; 90 - 91 - list_for_each_entry(s, &resctrl_schema_all, list) { 92 - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) 93 - return parse_line(tok, s, rdtgrp); 94 - } 95 - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); 96 - return -EINVAL; 97 - } 98 - 99 - ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 100 - char *buf, size_t nbytes, loff_t off) 101 - { 102 - struct resctrl_schema *s; 103 - struct rdtgroup *rdtgrp; 104 - struct rdt_resource *r; 105 - char *tok, *resname; 106 - int ret = 0; 107 - 108 - /* Valid input requires a trailing newline */ 109 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 110 - return -EINVAL; 111 - buf[nbytes - 1] = '\0'; 112 - 113 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 114 - if (!rdtgrp) { 115 - rdtgroup_kn_unlock(of->kn); 116 - return -ENOENT; 117 - } 118 - rdt_last_cmd_clear(); 119 - 120 - /* 121 - * No changes to pseudo-locked region allowed. It has to be removed 122 - * and re-created instead. 
123 - */ 124 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 125 - ret = -EINVAL; 126 - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); 127 - goto out; 128 - } 129 - 130 - rdt_staged_configs_clear(); 131 - 132 - while ((tok = strsep(&buf, "\n")) != NULL) { 133 - resname = strim(strsep(&tok, ":")); 134 - if (!tok) { 135 - rdt_last_cmd_puts("Missing ':'\n"); 136 - ret = -EINVAL; 137 - goto out; 138 - } 139 - if (tok[0] == '\0') { 140 - rdt_last_cmd_printf("Missing '%s' value\n", resname); 141 - ret = -EINVAL; 142 - goto out; 143 - } 144 - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); 145 - if (ret) 146 - goto out; 147 - } 148 - 149 - list_for_each_entry(s, &resctrl_schema_all, list) { 150 - r = s->res; 151 - 152 - /* 153 - * Writes to mba_sc resources update the software controller, 154 - * not the control MSR. 155 - */ 156 - if (is_mba_sc(r)) 157 - continue; 158 - 159 - ret = resctrl_arch_update_domains(r, rdtgrp->closid); 160 - if (ret) 161 - goto out; 162 - } 163 - 164 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 165 - /* 166 - * If pseudo-locking fails we keep the resource group in 167 - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service 168 - * active and updated for just the domain the pseudo-locked 169 - * region was requested for. 
170 - */ 171 - ret = rdtgroup_pseudo_lock_create(rdtgrp); 172 - } 173 - 174 - out: 175 - rdt_staged_configs_clear(); 176 - rdtgroup_kn_unlock(of->kn); 177 - return ret ?: nbytes; 178 - } 179 - 180 354 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, 181 355 u32 closid, enum resctrl_conf_type type) 182 356 { ··· 90 452 u32 idx = resctrl_get_config_index(closid, type); 91 453 92 454 return hw_dom->ctrl_val[idx]; 93 - } 94 - 95 - static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) 96 - { 97 - struct rdt_resource *r = schema->res; 98 - struct rdt_ctrl_domain *dom; 99 - bool sep = false; 100 - u32 ctrl_val; 101 - 102 - /* Walking r->domains, ensure it can't race with cpuhp */ 103 - lockdep_assert_cpus_held(); 104 - 105 - seq_printf(s, "%*s:", max_name_width, schema->name); 106 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 107 - if (sep) 108 - seq_puts(s, ";"); 109 - 110 - if (is_mba_sc(r)) 111 - ctrl_val = dom->mbps_val[closid]; 112 - else 113 - ctrl_val = resctrl_arch_get_config(r, dom, closid, 114 - schema->conf_type); 115 - 116 - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); 117 - sep = true; 118 - } 119 - seq_puts(s, "\n"); 120 - } 121 - 122 - int rdtgroup_schemata_show(struct kernfs_open_file *of, 123 - struct seq_file *s, void *v) 124 - { 125 - struct resctrl_schema *schema; 126 - struct rdtgroup *rdtgrp; 127 - int ret = 0; 128 - u32 closid; 129 - 130 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 131 - if (rdtgrp) { 132 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 133 - list_for_each_entry(schema, &resctrl_schema_all, list) { 134 - seq_printf(s, "%s:uninitialized\n", schema->name); 135 - } 136 - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 137 - if (!rdtgrp->plr->d) { 138 - rdt_last_cmd_clear(); 139 - rdt_last_cmd_puts("Cache domain offline\n"); 140 - ret = -ENODEV; 141 - } else { 142 - seq_printf(s, "%s:%d=%x\n", 143 - rdtgrp->plr->s->res->name, 144 - 
rdtgrp->plr->d->hdr.id, 145 - rdtgrp->plr->cbm); 146 - } 147 - } else { 148 - closid = rdtgrp->closid; 149 - list_for_each_entry(schema, &resctrl_schema_all, list) { 150 - if (closid < schema->num_closid) 151 - show_doms(s, schema, closid); 152 - } 153 - } 154 - } else { 155 - ret = -ENOENT; 156 - } 157 - rdtgroup_kn_unlock(of->kn); 158 - return ret; 159 - } 160 - 161 - static int smp_mon_event_count(void *arg) 162 - { 163 - mon_event_count(arg); 164 - 165 - return 0; 166 - } 167 - 168 - ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 169 - char *buf, size_t nbytes, loff_t off) 170 - { 171 - struct rdtgroup *rdtgrp; 172 - int ret = 0; 173 - 174 - /* Valid input requires a trailing newline */ 175 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 176 - return -EINVAL; 177 - buf[nbytes - 1] = '\0'; 178 - 179 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 180 - if (!rdtgrp) { 181 - rdtgroup_kn_unlock(of->kn); 182 - return -ENOENT; 183 - } 184 - rdt_last_cmd_clear(); 185 - 186 - if (!strcmp(buf, "mbm_local_bytes")) { 187 - if (resctrl_arch_is_mbm_local_enabled()) 188 - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; 189 - else 190 - ret = -EINVAL; 191 - } else if (!strcmp(buf, "mbm_total_bytes")) { 192 - if (resctrl_arch_is_mbm_total_enabled()) 193 - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; 194 - else 195 - ret = -EINVAL; 196 - } else { 197 - ret = -EINVAL; 198 - } 199 - 200 - if (ret) 201 - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); 202 - 203 - rdtgroup_kn_unlock(of->kn); 204 - 205 - return ret ?: nbytes; 206 - } 207 - 208 - int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 209 - struct seq_file *s, void *v) 210 - { 211 - struct rdtgroup *rdtgrp; 212 - int ret = 0; 213 - 214 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 215 - 216 - if (rdtgrp) { 217 - switch (rdtgrp->mba_mbps_event) { 218 - case QOS_L3_MBM_LOCAL_EVENT_ID: 219 - seq_puts(s, "mbm_local_bytes\n"); 220 - break; 221 - case QOS_L3_MBM_TOTAL_EVENT_ID: 222 - 
seq_puts(s, "mbm_total_bytes\n"); 223 - break; 224 - default: 225 - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); 226 - ret = -EINVAL; 227 - break; 228 - } 229 - } else { 230 - ret = -ENOENT; 231 - } 232 - 233 - rdtgroup_kn_unlock(of->kn); 234 - 235 - return ret; 236 - } 237 - 238 - struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, 239 - struct list_head **pos) 240 - { 241 - struct rdt_domain_hdr *d; 242 - struct list_head *l; 243 - 244 - list_for_each(l, h) { 245 - d = list_entry(l, struct rdt_domain_hdr, list); 246 - /* When id is found, return its domain. */ 247 - if (id == d->id) 248 - return d; 249 - /* Stop searching when finding id's position in sorted list. */ 250 - if (id < d->id) 251 - break; 252 - } 253 - 254 - if (pos) 255 - *pos = l; 256 - 257 - return NULL; 258 - } 259 - 260 - void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 261 - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 262 - cpumask_t *cpumask, int evtid, int first) 263 - { 264 - int cpu; 265 - 266 - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ 267 - lockdep_assert_cpus_held(); 268 - 269 - /* 270 - * Setup the parameters to pass to mon_event_count() to read the data. 271 - */ 272 - rr->rgrp = rdtgrp; 273 - rr->evtid = evtid; 274 - rr->r = r; 275 - rr->d = d; 276 - rr->first = first; 277 - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); 278 - if (IS_ERR(rr->arch_mon_ctx)) { 279 - rr->err = -EINVAL; 280 - return; 281 - } 282 - 283 - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); 284 - 285 - /* 286 - * cpumask_any_housekeeping() prefers housekeeping CPUs, but 287 - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. 288 - * MPAM's resctrl_arch_rmid_read() is unable to read the 289 - * counters on some platforms if its called in IRQ context. 
290 - */ 291 - if (tick_nohz_full_cpu(cpu)) 292 - smp_call_function_any(cpumask, mon_event_count, rr, 1); 293 - else 294 - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); 295 - 296 - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); 297 - } 298 - 299 - int rdtgroup_mondata_show(struct seq_file *m, void *arg) 300 - { 301 - struct kernfs_open_file *of = m->private; 302 - struct rdt_domain_hdr *hdr; 303 - struct rmid_read rr = {0}; 304 - struct rdt_mon_domain *d; 305 - u32 resid, evtid, domid; 306 - struct rdtgroup *rdtgrp; 307 - struct rdt_resource *r; 308 - union mon_data_bits md; 309 - int ret = 0; 310 - 311 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 312 - if (!rdtgrp) { 313 - ret = -ENOENT; 314 - goto out; 315 - } 316 - 317 - md.priv = of->kn->priv; 318 - resid = md.u.rid; 319 - domid = md.u.domid; 320 - evtid = md.u.evtid; 321 - r = resctrl_arch_get_resource(resid); 322 - 323 - if (md.u.sum) { 324 - /* 325 - * This file requires summing across all domains that share 326 - * the L3 cache id that was provided in the "domid" field of the 327 - * mon_data_bits union. Search all domains in the resource for 328 - * one that matches this cache id. 329 - */ 330 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 331 - if (d->ci->id == domid) { 332 - rr.ci = d->ci; 333 - mon_event_read(&rr, r, NULL, rdtgrp, 334 - &d->ci->shared_cpu_map, evtid, false); 335 - goto checkresult; 336 - } 337 - } 338 - ret = -ENOENT; 339 - goto out; 340 - } else { 341 - /* 342 - * This file provides data from a single domain. Search 343 - * the resource to find the domain with "domid". 
344 - */ 345 - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); 346 - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { 347 - ret = -ENOENT; 348 - goto out; 349 - } 350 - d = container_of(hdr, struct rdt_mon_domain, hdr); 351 - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); 352 - } 353 - 354 - checkresult: 355 - 356 - if (rr.err == -EIO) 357 - seq_puts(m, "Error\n"); 358 - else if (rr.err == -EINVAL) 359 - seq_puts(m, "Unavailable\n"); 360 - else 361 - seq_printf(m, "%llu\n", rr.val); 362 - 363 - out: 364 - rdtgroup_kn_unlock(of->kn); 365 - return ret; 366 455 }

+9 -390

arch/x86/kernel/cpu/resctrl/internal.h

··· 3 3 #define _ASM_X86_RESCTRL_INTERNAL_H 4 4 5 5 #include <linux/resctrl.h> 6 - #include <linux/sched.h> 7 - #include <linux/kernfs.h> 8 - #include <linux/fs_context.h> 9 - #include <linux/jump_label.h> 10 - #include <linux/tick.h> 11 - 12 - #include <asm/resctrl.h> 13 6 14 7 #define L3_QOS_CDP_ENABLE 0x01ULL 15 8 16 9 #define L2_QOS_CDP_ENABLE 0x01ULL 17 10 18 - #define CQM_LIMBOCHECK_INTERVAL 1000 19 - 20 11 #define MBM_CNTR_WIDTH_BASE 24 21 - #define MBM_OVERFLOW_INTERVAL 1000 22 - #define MAX_MBA_BW 100u 12 + 23 13 #define MBA_IS_LINEAR 0x4 14 + 24 15 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 25 16 26 17 #define RMID_VAL_ERROR BIT_ULL(63) 18 + 27 19 #define RMID_VAL_UNAVAIL BIT_ULL(62) 20 + 28 21 /* 29 22 * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for 30 23 * data to be returned. The counter width is discovered from the hardware 31 24 * as an offset from MBM_CNTR_WIDTH_BASE. 32 25 */ 33 26 #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) 34 - 35 - /** 36 - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that 37 - * aren't marked nohz_full 38 - * @mask: The mask to pick a CPU from. 39 - * @exclude_cpu:The CPU to avoid picking. 40 - * 41 - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping 42 - * CPUs that don't use nohz_full, these are preferred. Pass 43 - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. 44 - * 45 - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 46 - */ 47 - static inline unsigned int 48 - cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) 49 - { 50 - unsigned int cpu, hk_cpu; 51 - 52 - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) 53 - cpu = cpumask_any(mask); 54 - else 55 - cpu = cpumask_any_but(mask, exclude_cpu); 56 - 57 - /* Only continue if tick_nohz_full_mask has been initialized. */ 58 - if (!tick_nohz_full_enabled()) 59 - return cpu; 60 - 61 - /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
*/ 62 - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) 63 - return cpu; 64 - 65 - /* Try to find a CPU that isn't nohz_full to use in preference */ 66 - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); 67 - if (hk_cpu == exclude_cpu) 68 - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); 69 - 70 - if (hk_cpu < nr_cpu_ids) 71 - cpu = hk_cpu; 72 - 73 - return cpu; 74 - } 75 - 76 - struct rdt_fs_context { 77 - struct kernfs_fs_context kfc; 78 - bool enable_cdpl2; 79 - bool enable_cdpl3; 80 - bool enable_mba_mbps; 81 - bool enable_debug; 82 - }; 83 - 84 - static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) 85 - { 86 - struct kernfs_fs_context *kfc = fc->fs_private; 87 - 88 - return container_of(kfc, struct rdt_fs_context, kfc); 89 - } 90 - 91 - /** 92 - * struct mon_evt - Entry in the event list of a resource 93 - * @evtid: event id 94 - * @name: name of the event 95 - * @configurable: true if the event is configurable 96 - * @list: entry in &rdt_resource->evt_list 97 - */ 98 - struct mon_evt { 99 - enum resctrl_event_id evtid; 100 - char *name; 101 - bool configurable; 102 - struct list_head list; 103 - }; 104 - 105 - /** 106 - * union mon_data_bits - Monitoring details for each event file. 107 - * @priv: Used to store monitoring event data in @u 108 - * as kernfs private data. 109 - * @u.rid: Resource id associated with the event file. 110 - * @u.evtid: Event id associated with the event file. 111 - * @u.sum: Set when event must be summed across multiple 112 - * domains. 113 - * @u.domid: When @u.sum is zero this is the domain to which 114 - * the event file belongs. When @sum is one this 115 - * is the id of the L3 cache that all domains to be 116 - * summed share. 117 - * @u: Name of the bit fields struct. 
118 - */ 119 - union mon_data_bits { 120 - void *priv; 121 - struct { 122 - unsigned int rid : 10; 123 - enum resctrl_event_id evtid : 7; 124 - unsigned int sum : 1; 125 - unsigned int domid : 14; 126 - } u; 127 - }; 128 - 129 - /** 130 - * struct rmid_read - Data passed across smp_call*() to read event count. 131 - * @rgrp: Resource group for which the counter is being read. If it is a parent 132 - * resource group then its event count is summed with the count from all 133 - * its child resource groups. 134 - * @r: Resource describing the properties of the event being read. 135 - * @d: Domain that the counter should be read from. If NULL then sum all 136 - * domains in @r sharing L3 @ci.id 137 - * @evtid: Which monitor event to read. 138 - * @first: Initialize MBM counter when true. 139 - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. 140 - * @err: Error encountered when reading counter. 141 - * @val: Returned value of event counter. If @rgrp is a parent resource group, 142 - * @val includes the sum of event counts from its child resource groups. 143 - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, 144 - * (summed across child resource groups if @rgrp is a parent resource group). 145 - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
146 - */ 147 - struct rmid_read { 148 - struct rdtgroup *rgrp; 149 - struct rdt_resource *r; 150 - struct rdt_mon_domain *d; 151 - enum resctrl_event_id evtid; 152 - bool first; 153 - struct cacheinfo *ci; 154 - int err; 155 - u64 val; 156 - void *arch_mon_ctx; 157 - }; 158 - 159 - extern struct list_head resctrl_schema_all; 160 - extern bool resctrl_mounted; 161 - 162 - enum rdt_group_type { 163 - RDTCTRL_GROUP = 0, 164 - RDTMON_GROUP, 165 - RDT_NUM_GROUP, 166 - }; 167 - 168 - /** 169 - * enum rdtgrp_mode - Mode of a RDT resource group 170 - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations 171 - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed 172 - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking 173 - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations 174 - * allowed AND the allocations are Cache Pseudo-Locked 175 - * @RDT_NUM_MODES: Total number of modes 176 - * 177 - * The mode of a resource group enables control over the allowed overlap 178 - * between allocations associated with different resource groups (classes 179 - * of service). User is able to modify the mode of a resource group by 180 - * writing to the "mode" resctrl file associated with the resource group. 181 - * 182 - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by 183 - * writing the appropriate text to the "mode" file. A resource group enters 184 - * "pseudo-locked" mode after the schemata is written while the resource 185 - * group is in "pseudo-locksetup" mode. 186 - */ 187 - enum rdtgrp_mode { 188 - RDT_MODE_SHAREABLE = 0, 189 - RDT_MODE_EXCLUSIVE, 190 - RDT_MODE_PSEUDO_LOCKSETUP, 191 - RDT_MODE_PSEUDO_LOCKED, 192 - 193 - /* Must be last */ 194 - RDT_NUM_MODES, 195 - }; 196 - 197 - /** 198 - * struct mongroup - store mon group's data in resctrl fs. 
199 - * @mon_data_kn: kernfs node for the mon_data directory 200 - * @parent: parent rdtgrp 201 - * @crdtgrp_list: child rdtgroup node list 202 - * @rmid: rmid for this rdtgroup 203 - */ 204 - struct mongroup { 205 - struct kernfs_node *mon_data_kn; 206 - struct rdtgroup *parent; 207 - struct list_head crdtgrp_list; 208 - u32 rmid; 209 - }; 210 - 211 - /** 212 - * struct rdtgroup - store rdtgroup's data in resctrl file system. 213 - * @kn: kernfs node 214 - * @rdtgroup_list: linked list for all rdtgroups 215 - * @closid: closid for this rdtgroup 216 - * @cpu_mask: CPUs assigned to this rdtgroup 217 - * @flags: status bits 218 - * @waitcount: how many cpus expect to find this 219 - * group when they acquire rdtgroup_mutex 220 - * @type: indicates type of this rdtgroup - either 221 - * monitor only or ctrl_mon group 222 - * @mon: mongroup related data 223 - * @mode: mode of resource group 224 - * @mba_mbps_event: input monitoring event id when mba_sc is enabled 225 - * @plr: pseudo-locked region 226 - */ 227 - struct rdtgroup { 228 - struct kernfs_node *kn; 229 - struct list_head rdtgroup_list; 230 - u32 closid; 231 - struct cpumask cpu_mask; 232 - int flags; 233 - atomic_t waitcount; 234 - enum rdt_group_type type; 235 - struct mongroup mon; 236 - enum rdtgrp_mode mode; 237 - enum resctrl_event_id mba_mbps_event; 238 - struct pseudo_lock_region *plr; 239 - }; 240 - 241 - /* rdtgroup.flags */ 242 - #define RDT_DELETED 1 243 - 244 - /* rftype.flags */ 245 - #define RFTYPE_FLAGS_CPUS_LIST 1 246 - 247 - /* 248 - * Define the file type flags for base and info directories. 
249 - */ 250 - #define RFTYPE_INFO BIT(0) 251 - #define RFTYPE_BASE BIT(1) 252 - #define RFTYPE_CTRL BIT(4) 253 - #define RFTYPE_MON BIT(5) 254 - #define RFTYPE_TOP BIT(6) 255 - #define RFTYPE_RES_CACHE BIT(8) 256 - #define RFTYPE_RES_MB BIT(9) 257 - #define RFTYPE_DEBUG BIT(10) 258 - #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) 259 - #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) 260 - #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) 261 - #define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) 262 - #define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) 263 - 264 - /* List of all resource groups */ 265 - extern struct list_head rdt_all_groups; 266 - 267 - extern int max_name_width; 268 - 269 - /** 270 - * struct rftype - describe each file in the resctrl file system 271 - * @name: File name 272 - * @mode: Access mode 273 - * @kf_ops: File operations 274 - * @flags: File specific RFTYPE_FLAGS_* flags 275 - * @fflags: File specific RFTYPE_* flags 276 - * @seq_show: Show content of the file 277 - * @write: Write to the file 278 - */ 279 - struct rftype { 280 - char *name; 281 - umode_t mode; 282 - const struct kernfs_ops *kf_ops; 283 - unsigned long flags; 284 - unsigned long fflags; 285 - 286 - int (*seq_show)(struct kernfs_open_file *of, 287 - struct seq_file *sf, void *v); 288 - /* 289 - * write() is the generic write callback which maps directly to 290 - * kernfs write operation and overrides all other operations. 291 - * Maximum write size is determined by ->max_write_len. 
292 - */ 293 - ssize_t (*write)(struct kernfs_open_file *of, 294 - char *buf, size_t nbytes, loff_t off); 295 - }; 296 - 297 - /** 298 - * struct mbm_state - status for each MBM counter in each domain 299 - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation 300 - * @prev_bw: The most recent bandwidth in MBps 301 - */ 302 - struct mbm_state { 303 - u64 prev_bw_bytes; 304 - u32 prev_bw; 305 - }; 306 27 307 28 /** 308 29 * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s ··· 122 401 return container_of(r, struct rdt_hw_resource, r_resctrl); 123 402 } 124 403 125 - extern struct mutex rdtgroup_mutex; 126 - 127 - static inline const char *rdt_kn_name(const struct kernfs_node *kn) 128 - { 129 - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); 130 - } 131 - 132 404 extern struct rdt_hw_resource rdt_resources_all[]; 133 - extern struct rdtgroup rdtgroup_default; 134 - extern struct dentry *debugfs_resctrl; 135 - extern enum resctrl_event_id mba_mbps_default_event; 136 - 137 - static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) 138 - { 139 - return rdt_resources_all[l].cdp_enabled; 140 - } 141 - 142 - int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); 143 405 144 406 void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); 145 407 ··· 159 455 unsigned int full; 160 456 }; 161 457 162 - void rdt_last_cmd_clear(void); 163 - void rdt_last_cmd_puts(const char *s); 164 - __printf(1, 2) 165 - void rdt_last_cmd_printf(const char *fmt, ...); 166 - 167 458 void rdt_ctrl_update(void *arg); 168 - struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); 169 - void rdtgroup_kn_unlock(struct kernfs_node *kn); 170 - int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); 171 - int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 172 - umode_t mask); 173 - ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 174 - 
char *buf, size_t nbytes, loff_t off); 175 - int rdtgroup_schemata_show(struct kernfs_open_file *of, 176 - struct seq_file *s, void *v); 177 - ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 178 - char *buf, size_t nbytes, loff_t off); 179 - int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 180 - struct seq_file *s, void *v); 181 - bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 182 - unsigned long cbm, int closid, bool exclusive); 183 - unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, 184 - unsigned long cbm); 185 - enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); 186 - int rdtgroup_tasks_assigned(struct rdtgroup *r); 187 - int closids_supported(void); 188 - void closid_free(int closid); 189 - int alloc_rmid(u32 closid); 190 - void free_rmid(u32 closid, u32 rmid); 459 + 191 460 int rdt_get_mon_l3_config(struct rdt_resource *r); 192 - void resctrl_mon_resource_exit(void); 193 - bool __init rdt_cpu_has(int flag); 194 - void mon_event_count(void *info); 195 - int rdtgroup_mondata_show(struct seq_file *m, void *arg); 196 - void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 197 - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 198 - cpumask_t *cpumask, int evtid, int first); 199 - int __init resctrl_mon_resource_init(void); 200 - void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, 201 - unsigned long delay_ms, 202 - int exclude_cpu); 203 - void mbm_handle_overflow(struct work_struct *work); 461 + 462 + bool rdt_cpu_has(int flag); 463 + 204 464 void __init intel_rdt_mbm_apply_quirk(void); 205 - bool is_mba_sc(struct rdt_resource *r); 206 - void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 207 - int exclude_cpu); 208 - void cqm_handle_limbo(struct work_struct *work); 209 - bool has_busy_rmid(struct rdt_mon_domain *d); 210 - void __check_limbo(struct rdt_mon_domain *d, bool force_free); 465 + 211 466 void 
rdt_domain_reconfigure_cdp(struct rdt_resource *r); 212 - void resctrl_file_fflags_init(const char *config, unsigned long fflags); 213 - void rdt_staged_configs_clear(void); 214 - bool closid_allocated(unsigned int closid); 215 - int resctrl_find_cleanest_closid(void); 216 - 217 - #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 218 - int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); 219 - int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); 220 - bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); 221 - bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); 222 - int rdt_pseudo_lock_init(void); 223 - void rdt_pseudo_lock_release(void); 224 - int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); 225 - void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); 226 - #else 227 - static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 228 - { 229 - return -EOPNOTSUPP; 230 - } 231 - 232 - static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 233 - { 234 - return -EOPNOTSUPP; 235 - } 236 - 237 - static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 238 - { 239 - return false; 240 - } 241 - 242 - static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 243 - { 244 - return false; 245 - } 246 - 247 - static inline int rdt_pseudo_lock_init(void) { return 0; } 248 - static inline void rdt_pseudo_lock_release(void) { } 249 - static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 250 - { 251 - return -EOPNOTSUPP; 252 - } 253 - 254 - static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } 255 - #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 256 467 257 468 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */

+7 -911

arch/x86/kernel/cpu/resctrl/monitor.c

··· 18 18 #define pr_fmt(fmt) "resctrl: " fmt 19 19 20 20 #include <linux/cpu.h> 21 - #include <linux/module.h> 22 - #include <linux/sizes.h> 23 - #include <linux/slab.h> 21 + #include <linux/resctrl.h> 24 22 25 23 #include <asm/cpu_device_id.h> 26 24 #include <asm/msr.h> 27 - #include <asm/resctrl.h> 28 25 29 26 #include "internal.h" 30 - #include "trace.h" 31 - 32 - /** 33 - * struct rmid_entry - dirty tracking for all RMID. 34 - * @closid: The CLOSID for this entry. 35 - * @rmid: The RMID for this entry. 36 - * @busy: The number of domains with cached data using this RMID. 37 - * @list: Member of the rmid_free_lru list when busy == 0. 38 - * 39 - * Depending on the architecture the correct monitor is accessed using 40 - * both @closid and @rmid, or @rmid only. 41 - * 42 - * Take the rdtgroup_mutex when accessing. 43 - */ 44 - struct rmid_entry { 45 - u32 closid; 46 - u32 rmid; 47 - int busy; 48 - struct list_head list; 49 - }; 50 - 51 - /* 52 - * @rmid_free_lru - A least recently used list of free RMIDs 53 - * These RMIDs are guaranteed to have an occupancy less than the 54 - * threshold occupancy 55 - */ 56 - static LIST_HEAD(rmid_free_lru); 57 - 58 - /* 59 - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. 60 - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. 61 - * Indexed by CLOSID. Protected by rdtgroup_mutex. 62 - */ 63 - static u32 *closid_num_dirty_rmid; 64 - 65 - /* 66 - * @rmid_limbo_count - count of currently unused but (potentially) 67 - * dirty RMIDs. 68 - * This counts RMIDs that no one is currently using but that 69 - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can 70 - * change the threshold occupancy value. 71 - */ 72 - static unsigned int rmid_limbo_count; 73 - 74 - /* 75 - * @rmid_entry - The entry in the limbo and free lists. 
76 - */ 77 - static struct rmid_entry *rmid_ptrs; 78 27 79 28 /* 80 29 * Global boolean for rdt_monitor which is true if any ··· 36 87 */ 37 88 unsigned int rdt_mon_features; 38 89 39 - /* 40 - * This is the threshold cache occupancy in bytes at which we will consider an 41 - * RMID available for re-allocation. 42 - */ 43 - unsigned int resctrl_rmid_realloc_threshold; 44 - 45 - /* 46 - * This is the maximum value for the reallocation threshold, in bytes. 47 - */ 48 - unsigned int resctrl_rmid_realloc_limit; 49 - 50 90 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) 51 91 52 92 static int snc_nodes_per_l3_cache = 1; 53 93 54 94 /* 55 - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. 95 + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. 56 96 * If rmid > rmid threshold, MBM total and local values should be multiplied 57 97 * by the correction factor. 58 98 * ··· 90 152 }; 91 153 92 154 static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; 155 + 93 156 static u64 mbm_cf __read_mostly; 94 157 95 158 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) ··· 100 161 val = (val * mbm_cf) >> 20; 101 162 102 163 return val; 103 - } 104 - 105 - /* 106 - * x86 and arm64 differ in their handling of monitoring. 107 - * x86's RMID are independent numbers, there is only one source of traffic 108 - * with an RMID value of '1'. 109 - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of 110 - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID 111 - * value is no longer unique. 112 - * To account for this, resctrl uses an index. On x86 this is just the RMID, 113 - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. 114 - * 115 - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code 116 - * must accept an attempt to read every index. 
117 - */ 118 - static inline struct rmid_entry *__rmid_entry(u32 idx) 119 - { 120 - struct rmid_entry *entry; 121 - u32 closid, rmid; 122 - 123 - entry = &rmid_ptrs[idx]; 124 - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); 125 - 126 - WARN_ON_ONCE(entry->closid != closid); 127 - WARN_ON_ONCE(entry->rmid != rmid); 128 - 129 - return entry; 130 164 } 131 165 132 166 /* ··· 173 261 return &hw_dom->arch_mbm_total[rmid]; 174 262 case QOS_L3_MBM_LOCAL_EVENT_ID: 175 263 return &hw_dom->arch_mbm_local[rmid]; 264 + default: 265 + /* Never expect to get here */ 266 + WARN_ON_ONCE(1); 267 + return NULL; 176 268 } 177 - 178 - /* Never expect to get here */ 179 - WARN_ON_ONCE(1); 180 - 181 - return NULL; 182 269 } 183 270 184 271 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, ··· 256 345 *val = chunks * hw_res->mon_scale; 257 346 258 347 return 0; 259 - } 260 - 261 - static void limbo_release_entry(struct rmid_entry *entry) 262 - { 263 - lockdep_assert_held(&rdtgroup_mutex); 264 - 265 - rmid_limbo_count--; 266 - list_add_tail(&entry->list, &rmid_free_lru); 267 - 268 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 269 - closid_num_dirty_rmid[entry->closid]--; 270 - } 271 - 272 - /* 273 - * Check the RMIDs that are marked as busy for this domain. If the 274 - * reported LLC occupancy is below the threshold clear the busy bit and 275 - * decrement the count. 
If the busy count gets to zero on an RMID, we 276 - * free the RMID 277 - */ 278 - void __check_limbo(struct rdt_mon_domain *d, bool force_free) 279 - { 280 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 281 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 282 - struct rmid_entry *entry; 283 - u32 idx, cur_idx = 1; 284 - void *arch_mon_ctx; 285 - bool rmid_dirty; 286 - u64 val = 0; 287 - 288 - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); 289 - if (IS_ERR(arch_mon_ctx)) { 290 - pr_warn_ratelimited("Failed to allocate monitor context: %ld", 291 - PTR_ERR(arch_mon_ctx)); 292 - return; 293 - } 294 - 295 - /* 296 - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that 297 - * are marked as busy for occupancy < threshold. If the occupancy 298 - * is less than the threshold decrement the busy counter of the 299 - * RMID and move it to the free list when the counter reaches 0. 300 - */ 301 - for (;;) { 302 - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); 303 - if (idx >= idx_limit) 304 - break; 305 - 306 - entry = __rmid_entry(idx); 307 - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, 308 - QOS_L3_OCCUP_EVENT_ID, &val, 309 - arch_mon_ctx)) { 310 - rmid_dirty = true; 311 - } else { 312 - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); 313 - 314 - /* 315 - * x86's CLOSID and RMID are independent numbers, so the entry's 316 - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the 317 - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't 318 - * used to select the configuration. It is thus necessary to track both 319 - * CLOSID and RMID because there may be dependencies between them 320 - * on some architectures. 
321 - */ 322 - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); 323 - } 324 - 325 - if (force_free || !rmid_dirty) { 326 - clear_bit(idx, d->rmid_busy_llc); 327 - if (!--entry->busy) 328 - limbo_release_entry(entry); 329 - } 330 - cur_idx = idx + 1; 331 - } 332 - 333 - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); 334 - } 335 - 336 - bool has_busy_rmid(struct rdt_mon_domain *d) 337 - { 338 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 339 - 340 - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; 341 - } 342 - 343 - static struct rmid_entry *resctrl_find_free_rmid(u32 closid) 344 - { 345 - struct rmid_entry *itr; 346 - u32 itr_idx, cmp_idx; 347 - 348 - if (list_empty(&rmid_free_lru)) 349 - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); 350 - 351 - list_for_each_entry(itr, &rmid_free_lru, list) { 352 - /* 353 - * Get the index of this free RMID, and the index it would need 354 - * to be if it were used with this CLOSID. 355 - * If the CLOSID is irrelevant on this architecture, the two 356 - * index values are always the same on every entry and thus the 357 - * very first entry will be returned. 358 - */ 359 - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); 360 - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); 361 - 362 - if (itr_idx == cmp_idx) 363 - return itr; 364 - } 365 - 366 - return ERR_PTR(-ENOSPC); 367 - } 368 - 369 - /** 370 - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated 371 - * RMID are clean, or the CLOSID that has 372 - * the most clean RMID. 373 - * 374 - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID 375 - * may not be able to allocate clean RMID. To avoid this the allocator will 376 - * choose the CLOSID with the most clean RMID. 377 - * 378 - * When the CLOSID and RMID are independent numbers, the first free CLOSID will 379 - * be returned. 
380 - */ 381 - int resctrl_find_cleanest_closid(void) 382 - { 383 - u32 cleanest_closid = ~0; 384 - int i = 0; 385 - 386 - lockdep_assert_held(&rdtgroup_mutex); 387 - 388 - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 389 - return -EIO; 390 - 391 - for (i = 0; i < closids_supported(); i++) { 392 - int num_dirty; 393 - 394 - if (closid_allocated(i)) 395 - continue; 396 - 397 - num_dirty = closid_num_dirty_rmid[i]; 398 - if (num_dirty == 0) 399 - return i; 400 - 401 - if (cleanest_closid == ~0) 402 - cleanest_closid = i; 403 - 404 - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) 405 - cleanest_closid = i; 406 - } 407 - 408 - if (cleanest_closid == ~0) 409 - return -ENOSPC; 410 - 411 - return cleanest_closid; 412 - } 413 - 414 - /* 415 - * For MPAM the RMID value is not unique, and has to be considered with 416 - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which 417 - * allows all domains to be managed by a single free list. 418 - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 419 - */ 420 - int alloc_rmid(u32 closid) 421 - { 422 - struct rmid_entry *entry; 423 - 424 - lockdep_assert_held(&rdtgroup_mutex); 425 - 426 - entry = resctrl_find_free_rmid(closid); 427 - if (IS_ERR(entry)) 428 - return PTR_ERR(entry); 429 - 430 - list_del(&entry->list); 431 - return entry->rmid; 432 - } 433 - 434 - static void add_rmid_to_limbo(struct rmid_entry *entry) 435 - { 436 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 437 - struct rdt_mon_domain *d; 438 - u32 idx; 439 - 440 - lockdep_assert_held(&rdtgroup_mutex); 441 - 442 - /* Walking r->domains, ensure it can't race with cpuhp */ 443 - lockdep_assert_cpus_held(); 444 - 445 - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); 446 - 447 - entry->busy = 0; 448 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 449 - /* 450 - * For the first limbo RMID in the domain, 451 - * setup up the limbo worker. 
452 - */ 453 - if (!has_busy_rmid(d)) 454 - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, 455 - RESCTRL_PICK_ANY_CPU); 456 - set_bit(idx, d->rmid_busy_llc); 457 - entry->busy++; 458 - } 459 - 460 - rmid_limbo_count++; 461 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 462 - closid_num_dirty_rmid[entry->closid]++; 463 - } 464 - 465 - void free_rmid(u32 closid, u32 rmid) 466 - { 467 - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 468 - struct rmid_entry *entry; 469 - 470 - lockdep_assert_held(&rdtgroup_mutex); 471 - 472 - /* 473 - * Do not allow the default rmid to be free'd. Comparing by index 474 - * allows architectures that ignore the closid parameter to avoid an 475 - * unnecessary check. 476 - */ 477 - if (!resctrl_arch_mon_capable() || 478 - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 479 - RESCTRL_RESERVED_RMID)) 480 - return; 481 - 482 - entry = __rmid_entry(idx); 483 - 484 - if (resctrl_arch_is_llc_occupancy_enabled()) 485 - add_rmid_to_limbo(entry); 486 - else 487 - list_add_tail(&entry->list, &rmid_free_lru); 488 - } 489 - 490 - static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, 491 - u32 rmid, enum resctrl_event_id evtid) 492 - { 493 - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 494 - 495 - switch (evtid) { 496 - case QOS_L3_MBM_TOTAL_EVENT_ID: 497 - return &d->mbm_total[idx]; 498 - case QOS_L3_MBM_LOCAL_EVENT_ID: 499 - return &d->mbm_local[idx]; 500 - default: 501 - return NULL; 502 - } 503 - } 504 - 505 - static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) 506 - { 507 - int cpu = smp_processor_id(); 508 - struct rdt_mon_domain *d; 509 - struct mbm_state *m; 510 - int err, ret; 511 - u64 tval = 0; 512 - 513 - if (rr->first) { 514 - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); 515 - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 516 - if (m) 517 - memset(m, 0, sizeof(struct mbm_state)); 518 - return 0; 519 - } 520 - 521 - if (rr->d) { 
522 - /* Reading a single domain, must be on a CPU in that domain. */ 523 - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) 524 - return -EINVAL; 525 - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, 526 - rr->evtid, &tval, rr->arch_mon_ctx); 527 - if (rr->err) 528 - return rr->err; 529 - 530 - rr->val += tval; 531 - 532 - return 0; 533 - } 534 - 535 - /* Summing domains that share a cache, must be on a CPU for that cache. */ 536 - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) 537 - return -EINVAL; 538 - 539 - /* 540 - * Legacy files must report the sum of an event across all 541 - * domains that share the same L3 cache instance. 542 - * Report success if a read from any domain succeeds, -EINVAL 543 - * (translated to "Unavailable" for user space) if reading from 544 - * all domains fail for any reason. 545 - */ 546 - ret = -EINVAL; 547 - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { 548 - if (d->ci->id != rr->ci->id) 549 - continue; 550 - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, 551 - rr->evtid, &tval, rr->arch_mon_ctx); 552 - if (!err) { 553 - rr->val += tval; 554 - ret = 0; 555 - } 556 - } 557 - 558 - if (ret) 559 - rr->err = ret; 560 - 561 - return ret; 562 - } 563 - 564 - /* 565 - * mbm_bw_count() - Update bw count from values previously read by 566 - * __mon_event_count(). 567 - * @closid: The closid used to identify the cached mbm_state. 568 - * @rmid: The rmid used to identify the cached mbm_state. 569 - * @rr: The struct rmid_read populated by __mon_event_count(). 570 - * 571 - * Supporting function to calculate the memory bandwidth 572 - * and delta bandwidth in MBps. The chunks value previously read by 573 - * __mon_event_count() is compared with the chunks value from the previous 574 - * invocation. This must be called once per second to maintain values in MBps. 
575 - */ 576 - static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) 577 - { 578 - u64 cur_bw, bytes, cur_bytes; 579 - struct mbm_state *m; 580 - 581 - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 582 - if (WARN_ON_ONCE(!m)) 583 - return; 584 - 585 - cur_bytes = rr->val; 586 - bytes = cur_bytes - m->prev_bw_bytes; 587 - m->prev_bw_bytes = cur_bytes; 588 - 589 - cur_bw = bytes / SZ_1M; 590 - 591 - m->prev_bw = cur_bw; 592 - } 593 - 594 - /* 595 - * This is scheduled by mon_event_read() to read the CQM/MBM counters 596 - * on a domain. 597 - */ 598 - void mon_event_count(void *info) 599 - { 600 - struct rdtgroup *rdtgrp, *entry; 601 - struct rmid_read *rr = info; 602 - struct list_head *head; 603 - int ret; 604 - 605 - rdtgrp = rr->rgrp; 606 - 607 - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); 608 - 609 - /* 610 - * For Ctrl groups read data from child monitor groups and 611 - * add them together. Count events which are read successfully. 612 - * Discard the rmid_read's reporting errors. 613 - */ 614 - head = &rdtgrp->mon.crdtgrp_list; 615 - 616 - if (rdtgrp->type == RDTCTRL_GROUP) { 617 - list_for_each_entry(entry, head, mon.crdtgrp_list) { 618 - if (__mon_event_count(entry->closid, entry->mon.rmid, 619 - rr) == 0) 620 - ret = 0; 621 - } 622 - } 623 - 624 - /* 625 - * __mon_event_count() calls for newly created monitor groups may 626 - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. 627 - * Discard error if any of the monitor event reads succeeded. 
628 - */ 629 - if (ret == 0) 630 - rr->err = 0; 631 - } 632 - 633 - static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, 634 - struct rdt_resource *r) 635 - { 636 - struct rdt_ctrl_domain *d; 637 - 638 - lockdep_assert_cpus_held(); 639 - 640 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 641 - /* Find the domain that contains this CPU */ 642 - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 643 - return d; 644 - } 645 - 646 - return NULL; 647 - } 648 - 649 - /* 650 - * Feedback loop for MBA software controller (mba_sc) 651 - * 652 - * mba_sc is a feedback loop where we periodically read MBM counters and 653 - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so 654 - * that: 655 - * 656 - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) 657 - * 658 - * This uses the MBM counters to measure the bandwidth and MBA throttle 659 - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the 660 - * fact that resctrl rdtgroups have both monitoring and control. 661 - * 662 - * The frequency of the checks is 1s and we just tag along the MBM overflow 663 - * timer. Having 1s interval makes the calculation of bandwidth simpler. 664 - * 665 - * Although MBA's goal is to restrict the bandwidth to a maximum, there may 666 - * be a need to increase the bandwidth to avoid unnecessarily restricting 667 - * the L2 <-> L3 traffic. 668 - * 669 - * Since MBA controls the L2 external bandwidth where as MBM measures the 670 - * L3 external bandwidth the following sequence could lead to such a 671 - * situation. 672 - * 673 - * Consider an rdtgroup which had high L3 <-> memory traffic in initial 674 - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but 675 - * after some time rdtgroup has mostly L2 <-> L3 traffic. 676 - * 677 - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its 678 - * throttle MSRs already have low percentage values. 
To avoid 679 - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 680 - */ 681 - static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) 682 - { 683 - u32 closid, rmid, cur_msr_val, new_msr_val; 684 - struct mbm_state *pmbm_data, *cmbm_data; 685 - struct rdt_ctrl_domain *dom_mba; 686 - enum resctrl_event_id evt_id; 687 - struct rdt_resource *r_mba; 688 - struct list_head *head; 689 - struct rdtgroup *entry; 690 - u32 cur_bw, user_bw; 691 - 692 - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 693 - evt_id = rgrp->mba_mbps_event; 694 - 695 - closid = rgrp->closid; 696 - rmid = rgrp->mon.rmid; 697 - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); 698 - if (WARN_ON_ONCE(!pmbm_data)) 699 - return; 700 - 701 - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); 702 - if (!dom_mba) { 703 - pr_warn_once("Failure to get domain for MBA update\n"); 704 - return; 705 - } 706 - 707 - cur_bw = pmbm_data->prev_bw; 708 - user_bw = dom_mba->mbps_val[closid]; 709 - 710 - /* MBA resource doesn't support CDP */ 711 - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); 712 - 713 - /* 714 - * For Ctrl groups read data from child monitor groups. 715 - */ 716 - head = &rgrp->mon.crdtgrp_list; 717 - list_for_each_entry(entry, head, mon.crdtgrp_list) { 718 - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); 719 - if (WARN_ON_ONCE(!cmbm_data)) 720 - return; 721 - cur_bw += cmbm_data->prev_bw; 722 - } 723 - 724 - /* 725 - * Scale up/down the bandwidth linearly for the ctrl group. The 726 - * bandwidth step is the bandwidth granularity specified by the 727 - * hardware. 728 - * Always increase throttling if current bandwidth is above the 729 - * target set by user. 730 - * But avoid thrashing up and down on every poll by checking 731 - * whether a decrease in throttling is likely to push the group 732 - * back over target. E.g. 
if currently throttling to 30% of bandwidth 733 - * on a system with 10% granularity steps, check whether moving to 734 - * 40% would go past the limit by multiplying current bandwidth by 735 - * "(30 + 10) / 30". 736 - */ 737 - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { 738 - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; 739 - } else if (cur_msr_val < MAX_MBA_BW && 740 - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { 741 - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; 742 - } else { 743 - return; 744 - } 745 - 746 - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); 747 - } 748 - 749 - static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, 750 - u32 closid, u32 rmid, enum resctrl_event_id evtid) 751 - { 752 - struct rmid_read rr = {0}; 753 - 754 - rr.r = r; 755 - rr.d = d; 756 - rr.evtid = evtid; 757 - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 758 - if (IS_ERR(rr.arch_mon_ctx)) { 759 - pr_warn_ratelimited("Failed to allocate monitor context: %ld", 760 - PTR_ERR(rr.arch_mon_ctx)); 761 - return; 762 - } 763 - 764 - __mon_event_count(closid, rmid, &rr); 765 - 766 - /* 767 - * If the software controller is enabled, compute the 768 - * bandwidth for this event id. 769 - */ 770 - if (is_mba_sc(NULL)) 771 - mbm_bw_count(closid, rmid, &rr); 772 - 773 - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 774 - } 775 - 776 - static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, 777 - u32 closid, u32 rmid) 778 - { 779 - /* 780 - * This is protected from concurrent reads from user as both 781 - * the user and overflow handler hold the global mutex. 
782 - */ 783 - if (resctrl_arch_is_mbm_total_enabled()) 784 - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); 785 - 786 - if (resctrl_arch_is_mbm_local_enabled()) 787 - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); 788 - } 789 - 790 - /* 791 - * Handler to scan the limbo list and move the RMIDs 792 - * to free list whose occupancy < threshold_occupancy. 793 - */ 794 - void cqm_handle_limbo(struct work_struct *work) 795 - { 796 - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); 797 - struct rdt_mon_domain *d; 798 - 799 - cpus_read_lock(); 800 - mutex_lock(&rdtgroup_mutex); 801 - 802 - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); 803 - 804 - __check_limbo(d, false); 805 - 806 - if (has_busy_rmid(d)) { 807 - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 808 - RESCTRL_PICK_ANY_CPU); 809 - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, 810 - delay); 811 - } 812 - 813 - mutex_unlock(&rdtgroup_mutex); 814 - cpus_read_unlock(); 815 - } 816 - 817 - /** 818 - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this 819 - * domain. 820 - * @dom: The domain the limbo handler should run for. 821 - * @delay_ms: How far in the future the handler should run. 822 - * @exclude_cpu: Which CPU the handler should not run on, 823 - * RESCTRL_PICK_ANY_CPU to pick any CPU. 
824 - */ 825 - void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 826 - int exclude_cpu) 827 - { 828 - unsigned long delay = msecs_to_jiffies(delay_ms); 829 - int cpu; 830 - 831 - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 832 - dom->cqm_work_cpu = cpu; 833 - 834 - if (cpu < nr_cpu_ids) 835 - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); 836 - } 837 - 838 - void mbm_handle_overflow(struct work_struct *work) 839 - { 840 - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); 841 - struct rdtgroup *prgrp, *crgrp; 842 - struct rdt_mon_domain *d; 843 - struct list_head *head; 844 - struct rdt_resource *r; 845 - 846 - cpus_read_lock(); 847 - mutex_lock(&rdtgroup_mutex); 848 - 849 - /* 850 - * If the filesystem has been unmounted this work no longer needs to 851 - * run. 852 - */ 853 - if (!resctrl_mounted || !resctrl_arch_mon_capable()) 854 - goto out_unlock; 855 - 856 - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 857 - d = container_of(work, struct rdt_mon_domain, mbm_over.work); 858 - 859 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 860 - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); 861 - 862 - head = &prgrp->mon.crdtgrp_list; 863 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) 864 - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); 865 - 866 - if (is_mba_sc(NULL)) 867 - update_mba_bw(prgrp, d); 868 - } 869 - 870 - /* 871 - * Re-check for housekeeping CPUs. This allows the overflow handler to 872 - * move off a nohz_full CPU quickly. 873 - */ 874 - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 875 - RESCTRL_PICK_ANY_CPU); 876 - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); 877 - 878 - out_unlock: 879 - mutex_unlock(&rdtgroup_mutex); 880 - cpus_read_unlock(); 881 - } 882 - 883 - /** 884 - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this 885 - * domain. 
886 - * @dom: The domain the overflow handler should run for. 887 - * @delay_ms: How far in the future the handler should run. 888 - * @exclude_cpu: Which CPU the handler should not run on, 889 - * RESCTRL_PICK_ANY_CPU to pick any CPU. 890 - */ 891 - void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 892 - int exclude_cpu) 893 - { 894 - unsigned long delay = msecs_to_jiffies(delay_ms); 895 - int cpu; 896 - 897 - /* 898 - * When a domain comes online there is no guarantee the filesystem is 899 - * mounted. If not, there is no need to catch counter overflow. 900 - */ 901 - if (!resctrl_mounted || !resctrl_arch_mon_capable()) 902 - return; 903 - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 904 - dom->mbm_work_cpu = cpu; 905 - 906 - if (cpu < nr_cpu_ids) 907 - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); 908 - } 909 - 910 - static int dom_data_init(struct rdt_resource *r) 911 - { 912 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 913 - u32 num_closid = resctrl_arch_get_num_closid(r); 914 - struct rmid_entry *entry = NULL; 915 - int err = 0, i; 916 - u32 idx; 917 - 918 - mutex_lock(&rdtgroup_mutex); 919 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 920 - u32 *tmp; 921 - 922 - /* 923 - * If the architecture hasn't provided a sanitised value here, 924 - * this may result in larger arrays than necessary. Resctrl will 925 - * use a smaller system wide value based on the resources in 926 - * use. 
927 - */ 928 - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); 929 - if (!tmp) { 930 - err = -ENOMEM; 931 - goto out_unlock; 932 - } 933 - 934 - closid_num_dirty_rmid = tmp; 935 - } 936 - 937 - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); 938 - if (!rmid_ptrs) { 939 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 940 - kfree(closid_num_dirty_rmid); 941 - closid_num_dirty_rmid = NULL; 942 - } 943 - err = -ENOMEM; 944 - goto out_unlock; 945 - } 946 - 947 - for (i = 0; i < idx_limit; i++) { 948 - entry = &rmid_ptrs[i]; 949 - INIT_LIST_HEAD(&entry->list); 950 - 951 - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); 952 - list_add_tail(&entry->list, &rmid_free_lru); 953 - } 954 - 955 - /* 956 - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and 957 - * are always allocated. These are used for the rdtgroup_default 958 - * control group, which will be setup later in resctrl_init(). 959 - */ 960 - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 961 - RESCTRL_RESERVED_RMID); 962 - entry = __rmid_entry(idx); 963 - list_del(&entry->list); 964 - 965 - out_unlock: 966 - mutex_unlock(&rdtgroup_mutex); 967 - 968 - return err; 969 - } 970 - 971 - static void dom_data_exit(struct rdt_resource *r) 972 - { 973 - mutex_lock(&rdtgroup_mutex); 974 - 975 - if (!r->mon_capable) 976 - goto out_unlock; 977 - 978 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 979 - kfree(closid_num_dirty_rmid); 980 - closid_num_dirty_rmid = NULL; 981 - } 982 - 983 - kfree(rmid_ptrs); 984 - rmid_ptrs = NULL; 985 - 986 - out_unlock: 987 - mutex_unlock(&rdtgroup_mutex); 988 - } 989 - 990 - static struct mon_evt llc_occupancy_event = { 991 - .name = "llc_occupancy", 992 - .evtid = QOS_L3_OCCUP_EVENT_ID, 993 - }; 994 - 995 - static struct mon_evt mbm_total_event = { 996 - .name = "mbm_total_bytes", 997 - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, 998 - }; 999 - 1000 - static struct mon_evt mbm_local_event = { 1001 - .name = 
"mbm_local_bytes", 1002 - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, 1003 - }; 1004 - 1005 - /* 1006 - * Initialize the event list for the resource. 1007 - * 1008 - * Note that MBM events are also part of RDT_RESOURCE_L3 resource 1009 - * because as per the SDM the total and local memory bandwidth 1010 - * are enumerated as part of L3 monitoring. 1011 - */ 1012 - static void l3_mon_evt_init(struct rdt_resource *r) 1013 - { 1014 - INIT_LIST_HEAD(&r->evt_list); 1015 - 1016 - if (resctrl_arch_is_llc_occupancy_enabled()) 1017 - list_add_tail(&llc_occupancy_event.list, &r->evt_list); 1018 - if (resctrl_arch_is_mbm_total_enabled()) 1019 - list_add_tail(&mbm_total_event.list, &r->evt_list); 1020 - if (resctrl_arch_is_mbm_local_enabled()) 1021 - list_add_tail(&mbm_local_event.list, &r->evt_list); 1022 348 } 1023 349 1024 350 /* ··· 341 1193 return ret; 342 1194 } 343 1195 344 - /** 345 - * resctrl_mon_resource_init() - Initialise global monitoring structures. 346 - * 347 - * Allocate and initialise global monitor resources that do not belong to a 348 - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 349 - * Called once during boot after the struct rdt_resource's have been configured 350 - * but before the filesystem is mounted. 351 - * Resctrl's cpuhp callbacks may be called before this point to bring a domain 352 - * online. 353 - * 354 - * Returns 0 for success, or -ENOMEM. 
355 - */ 356 - int __init resctrl_mon_resource_init(void) 357 - { 358 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 359 - int ret; 360 - 361 - if (!r->mon_capable) 362 - return 0; 363 - 364 - ret = dom_data_init(r); 365 - if (ret) 366 - return ret; 367 - 368 - l3_mon_evt_init(r); 369 - 370 - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { 371 - mbm_total_event.configurable = true; 372 - resctrl_file_fflags_init("mbm_total_bytes_config", 373 - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 374 - } 375 - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { 376 - mbm_local_event.configurable = true; 377 - resctrl_file_fflags_init("mbm_local_bytes_config", 378 - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 379 - } 380 - 381 - if (resctrl_arch_is_mbm_local_enabled()) 382 - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; 383 - else if (resctrl_arch_is_mbm_total_enabled()) 384 - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; 385 - 386 - return 0; 387 - } 388 - 389 1196 int __init rdt_get_mon_l3_config(struct rdt_resource *r) 390 1197 { 391 1198 unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; ··· 386 1283 r->mon_capable = true; 387 1284 388 1285 return 0; 389 - } 390 - 391 - void resctrl_mon_resource_exit(void) 392 - { 393 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 394 - 395 - dom_data_exit(r); 396 1286 } 397 1287 398 1288 void __init intel_rdt_mbm_apply_quirk(void)

+4 -1088

arch/x86/kernel/cpu/resctrl/pseudo_lock.c

··· 11 11 12 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 13 14 + #include <linux/cacheflush.h> 14 15 #include <linux/cpu.h> 15 - #include <linux/cpumask.h> 16 - #include <linux/debugfs.h> 17 - #include <linux/kthread.h> 18 - #include <linux/mman.h> 19 16 #include <linux/perf_event.h> 20 17 #include <linux/pm_qos.h> 21 - #include <linux/slab.h> 22 - #include <linux/uaccess.h> 18 + #include <linux/resctrl.h> 23 19 24 - #include <asm/cacheflush.h> 25 20 #include <asm/cpu_device_id.h> 26 - #include <asm/resctrl.h> 27 21 #include <asm/perf_event.h> 28 22 #include <asm/msr.h> 29 23 ··· 25 31 #include "internal.h" 26 32 27 33 #define CREATE_TRACE_POINTS 28 - #include "trace.h" 34 + 35 + #include "pseudo_lock_trace.h" 29 36 30 37 /* 31 38 * The bits needed to disable hardware prefetching varies based on the 32 39 * platform. During initialization we will discover which bits to use. 33 40 */ 34 41 static u64 prefetch_disable_bits; 35 - 36 - /* 37 - * Major number assigned to and shared by all devices exposing 38 - * pseudo-locked regions. 
39 - */ 40 - static unsigned int pseudo_lock_major; 41 - static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); 42 - 43 - static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) 44 - { 45 - const struct rdtgroup *rdtgrp; 46 - 47 - rdtgrp = dev_get_drvdata(dev); 48 - if (mode) 49 - *mode = 0600; 50 - guard(mutex)(&rdtgroup_mutex); 51 - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); 52 - } 53 - 54 - static const struct class pseudo_lock_class = { 55 - .name = "pseudo_lock", 56 - .devnode = pseudo_lock_devnode, 57 - }; 58 42 59 43 /** 60 44 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported ··· 92 120 } 93 121 94 122 return prefetch_disable_bits; 95 - } 96 - 97 - /** 98 - * pseudo_lock_minor_get - Obtain available minor number 99 - * @minor: Pointer to where new minor number will be stored 100 - * 101 - * A bitmask is used to track available minor numbers. Here the next free 102 - * minor number is marked as unavailable and returned. 103 - * 104 - * Return: 0 on success, <0 on failure. 
105 - */ 106 - static int pseudo_lock_minor_get(unsigned int *minor) 107 - { 108 - unsigned long first_bit; 109 - 110 - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); 111 - 112 - if (first_bit == MINORBITS) 113 - return -ENOSPC; 114 - 115 - __clear_bit(first_bit, &pseudo_lock_minor_avail); 116 - *minor = first_bit; 117 - 118 - return 0; 119 - } 120 - 121 - /** 122 - * pseudo_lock_minor_release - Return minor number to available 123 - * @minor: The minor number made available 124 - */ 125 - static void pseudo_lock_minor_release(unsigned int minor) 126 - { 127 - __set_bit(minor, &pseudo_lock_minor_avail); 128 - } 129 - 130 - /** 131 - * region_find_by_minor - Locate a pseudo-lock region by inode minor number 132 - * @minor: The minor number of the device representing pseudo-locked region 133 - * 134 - * When the character device is accessed we need to determine which 135 - * pseudo-locked region it belongs to. This is done by matching the minor 136 - * number of the device to the pseudo-locked region it belongs. 137 - * 138 - * Minor numbers are assigned at the time a pseudo-locked region is associated 139 - * with a cache instance. 140 - * 141 - * Return: On success return pointer to resource group owning the pseudo-locked 142 - * region, NULL on failure. 
143 - */ 144 - static struct rdtgroup *region_find_by_minor(unsigned int minor) 145 - { 146 - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; 147 - 148 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 149 - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { 150 - rdtgrp_match = rdtgrp; 151 - break; 152 - } 153 - } 154 - return rdtgrp_match; 155 - } 156 - 157 - /** 158 - * struct pseudo_lock_pm_req - A power management QoS request list entry 159 - * @list: Entry within the @pm_reqs list for a pseudo-locked region 160 - * @req: PM QoS request 161 - */ 162 - struct pseudo_lock_pm_req { 163 - struct list_head list; 164 - struct dev_pm_qos_request req; 165 - }; 166 - 167 - static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) 168 - { 169 - struct pseudo_lock_pm_req *pm_req, *next; 170 - 171 - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { 172 - dev_pm_qos_remove_request(&pm_req->req); 173 - list_del(&pm_req->list); 174 - kfree(pm_req); 175 - } 176 - } 177 - 178 - /** 179 - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 180 - * @plr: Pseudo-locked region 181 - * 182 - * To prevent the cache from being affected by power management entering 183 - * C6 has to be avoided. This is accomplished by requesting a latency 184 - * requirement lower than lowest C6 exit latency of all supported 185 - * platforms as found in the cpuidle state tables in the intel_idle driver. 186 - * At this time it is possible to do so with a single latency requirement 187 - * for all supported platforms. 188 - * 189 - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, 190 - * the ACPI latencies need to be considered while keeping in mind that C2 191 - * may be set to map to deeper sleep states. In this case the latency 192 - * requirement needs to prevent entering C2 also. 
193 - * 194 - * Return: 0 on success, <0 on failure 195 - */ 196 - static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) 197 - { 198 - struct pseudo_lock_pm_req *pm_req; 199 - int cpu; 200 - int ret; 201 - 202 - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { 203 - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); 204 - if (!pm_req) { 205 - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); 206 - ret = -ENOMEM; 207 - goto out_err; 208 - } 209 - ret = dev_pm_qos_add_request(get_cpu_device(cpu), 210 - &pm_req->req, 211 - DEV_PM_QOS_RESUME_LATENCY, 212 - 30); 213 - if (ret < 0) { 214 - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", 215 - cpu); 216 - kfree(pm_req); 217 - ret = -1; 218 - goto out_err; 219 - } 220 - list_add(&pm_req->list, &plr->pm_reqs); 221 - } 222 - 223 - return 0; 224 - 225 - out_err: 226 - pseudo_lock_cstates_relax(plr); 227 - return ret; 228 - } 229 - 230 - /** 231 - * pseudo_lock_region_clear - Reset pseudo-lock region data 232 - * @plr: pseudo-lock region 233 - * 234 - * All content of the pseudo-locked region is reset - any memory allocated 235 - * freed. 236 - * 237 - * Return: void 238 - */ 239 - static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) 240 - { 241 - plr->size = 0; 242 - plr->line_size = 0; 243 - kfree(plr->kmem); 244 - plr->kmem = NULL; 245 - plr->s = NULL; 246 - if (plr->d) 247 - plr->d->plr = NULL; 248 - plr->d = NULL; 249 - plr->cbm = 0; 250 - plr->debugfs_dir = NULL; 251 - } 252 - 253 - /** 254 - * pseudo_lock_region_init - Initialize pseudo-lock region information 255 - * @plr: pseudo-lock region 256 - * 257 - * Called after user provided a schemata to be pseudo-locked. From the 258 - * schemata the &struct pseudo_lock_region is on entry already initialized 259 - * with the resource, domain, and capacity bitmask. Here the information 260 - * required for pseudo-locking is deduced from this data and &struct 261 - * pseudo_lock_region initialized further. 
This information includes: 262 - * - size in bytes of the region to be pseudo-locked 263 - * - cache line size to know the stride with which data needs to be accessed 264 - * to be pseudo-locked 265 - * - a cpu associated with the cache instance on which the pseudo-locking 266 - * flow can be executed 267 - * 268 - * Return: 0 on success, <0 on failure. Descriptive error will be written 269 - * to last_cmd_status buffer. 270 - */ 271 - static int pseudo_lock_region_init(struct pseudo_lock_region *plr) 272 - { 273 - enum resctrl_scope scope = plr->s->res->ctrl_scope; 274 - struct cacheinfo *ci; 275 - int ret; 276 - 277 - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) 278 - return -ENODEV; 279 - 280 - /* Pick the first cpu we find that is associated with the cache. */ 281 - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); 282 - 283 - if (!cpu_online(plr->cpu)) { 284 - rdt_last_cmd_printf("CPU %u associated with cache not online\n", 285 - plr->cpu); 286 - ret = -ENODEV; 287 - goto out_region; 288 - } 289 - 290 - ci = get_cpu_cacheinfo_level(plr->cpu, scope); 291 - if (ci) { 292 - plr->line_size = ci->coherency_line_size; 293 - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); 294 - return 0; 295 - } 296 - 297 - ret = -1; 298 - rdt_last_cmd_puts("Unable to determine cache line size\n"); 299 - out_region: 300 - pseudo_lock_region_clear(plr); 301 - return ret; 302 - } 303 - 304 - /** 305 - * pseudo_lock_init - Initialize a pseudo-lock region 306 - * @rdtgrp: resource group to which new pseudo-locked region will belong 307 - * 308 - * A pseudo-locked region is associated with a resource group. When this 309 - * association is created the pseudo-locked region is initialized. The 310 - * details of the pseudo-locked region are not known at this time so only 311 - * allocation is done and association established. 
312 - * 313 - * Return: 0 on success, <0 on failure 314 - */ 315 - static int pseudo_lock_init(struct rdtgroup *rdtgrp) 316 - { 317 - struct pseudo_lock_region *plr; 318 - 319 - plr = kzalloc(sizeof(*plr), GFP_KERNEL); 320 - if (!plr) 321 - return -ENOMEM; 322 - 323 - init_waitqueue_head(&plr->lock_thread_wq); 324 - INIT_LIST_HEAD(&plr->pm_reqs); 325 - rdtgrp->plr = plr; 326 - return 0; 327 - } 328 - 329 - /** 330 - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked 331 - * @plr: pseudo-lock region 332 - * 333 - * Initialize the details required to set up the pseudo-locked region and 334 - * allocate the contiguous memory that will be pseudo-locked to the cache. 335 - * 336 - * Return: 0 on success, <0 on failure. Descriptive error will be written 337 - * to last_cmd_status buffer. 338 - */ 339 - static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) 340 - { 341 - int ret; 342 - 343 - ret = pseudo_lock_region_init(plr); 344 - if (ret < 0) 345 - return ret; 346 - 347 - /* 348 - * We do not yet support contiguous regions larger than 349 - * KMALLOC_MAX_SIZE. 350 - */ 351 - if (plr->size > KMALLOC_MAX_SIZE) { 352 - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); 353 - ret = -E2BIG; 354 - goto out_region; 355 - } 356 - 357 - plr->kmem = kzalloc(plr->size, GFP_KERNEL); 358 - if (!plr->kmem) { 359 - rdt_last_cmd_puts("Unable to allocate memory\n"); 360 - ret = -ENOMEM; 361 - goto out_region; 362 - } 363 - 364 - ret = 0; 365 - goto out; 366 - out_region: 367 - pseudo_lock_region_clear(plr); 368 - out: 369 - return ret; 370 - } 371 - 372 - /** 373 - * pseudo_lock_free - Free a pseudo-locked region 374 - * @rdtgrp: resource group to which pseudo-locked region belonged 375 - * 376 - * The pseudo-locked region's resources have already been released, or not 377 - * yet created at this point. Now it can be freed and disassociated from the 378 - * resource group. 
379 - * 380 - * Return: void 381 - */ 382 - static void pseudo_lock_free(struct rdtgroup *rdtgrp) 383 - { 384 - pseudo_lock_region_clear(rdtgrp->plr); 385 - kfree(rdtgrp->plr); 386 - rdtgrp->plr = NULL; 387 123 } 388 124 389 125 /** ··· 221 541 plr->thread_done = 1; 222 542 wake_up_interruptible(&plr->lock_thread_wq); 223 543 return 0; 224 - } 225 - 226 - /** 227 - * rdtgroup_monitor_in_progress - Test if monitoring in progress 228 - * @rdtgrp: resource group being queried 229 - * 230 - * Return: 1 if monitor groups have been created for this resource 231 - * group, 0 otherwise. 232 - */ 233 - static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) 234 - { 235 - return !list_empty(&rdtgrp->mon.crdtgrp_list); 236 - } 237 - 238 - /** 239 - * rdtgroup_locksetup_user_restrict - Restrict user access to group 240 - * @rdtgrp: resource group needing access restricted 241 - * 242 - * A resource group used for cache pseudo-locking cannot have cpus or tasks 243 - * assigned to it. This is communicated to the user by restricting access 244 - * to all the files that can be used to make such changes. 245 - * 246 - * Permissions restored with rdtgroup_locksetup_user_restore() 247 - * 248 - * Return: 0 on success, <0 on failure. If a failure occurs during the 249 - * restriction of access an attempt will be made to restore permissions but 250 - * the state of the mode of these files will be uncertain when a failure 251 - * occurs. 
252 - */ 253 - static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) 254 - { 255 - int ret; 256 - 257 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 258 - if (ret) 259 - return ret; 260 - 261 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 262 - if (ret) 263 - goto err_tasks; 264 - 265 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 266 - if (ret) 267 - goto err_cpus; 268 - 269 - if (resctrl_arch_mon_capable()) { 270 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); 271 - if (ret) 272 - goto err_cpus_list; 273 - } 274 - 275 - ret = 0; 276 - goto out; 277 - 278 - err_cpus_list: 279 - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 280 - err_cpus: 281 - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 282 - err_tasks: 283 - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 284 - out: 285 - return ret; 286 - } 287 - 288 - /** 289 - * rdtgroup_locksetup_user_restore - Restore user access to group 290 - * @rdtgrp: resource group needing access restored 291 - * 292 - * Restore all file access previously removed using 293 - * rdtgroup_locksetup_user_restrict() 294 - * 295 - * Return: 0 on success, <0 on failure. If a failure occurs during the 296 - * restoration of access an attempt will be made to restrict permissions 297 - * again but the state of the mode of these files will be uncertain when 298 - * a failure occurs. 
299 - */ 300 - static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) 301 - { 302 - int ret; 303 - 304 - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 305 - if (ret) 306 - return ret; 307 - 308 - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 309 - if (ret) 310 - goto err_tasks; 311 - 312 - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 313 - if (ret) 314 - goto err_cpus; 315 - 316 - if (resctrl_arch_mon_capable()) { 317 - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); 318 - if (ret) 319 - goto err_cpus_list; 320 - } 321 - 322 - ret = 0; 323 - goto out; 324 - 325 - err_cpus_list: 326 - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 327 - err_cpus: 328 - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 329 - err_tasks: 330 - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 331 - out: 332 - return ret; 333 - } 334 - 335 - /** 336 - * rdtgroup_locksetup_enter - Resource group enters locksetup mode 337 - * @rdtgrp: resource group requested to enter locksetup mode 338 - * 339 - * A resource group enters locksetup mode to reflect that it would be used 340 - * to represent a pseudo-locked region and is in the process of being set 341 - * up to do so. A resource group used for a pseudo-locked region would 342 - * lose the closid associated with it so we cannot allow it to have any 343 - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the 344 - * future. Monitoring of a pseudo-locked region is not allowed either. 345 - * 346 - * The above and more restrictions on a pseudo-locked region are checked 347 - * for and enforced before the resource group enters the locksetup mode. 348 - * 349 - * Returns: 0 if the resource group successfully entered locksetup mode, <0 350 - * on failure. On failure the last_cmd_status buffer is updated with text to 351 - * communicate details of failure to the user. 
352 - */ 353 - int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 354 - { 355 - int ret; 356 - 357 - /* 358 - * The default resource group can neither be removed nor lose the 359 - * default closid associated with it. 360 - */ 361 - if (rdtgrp == &rdtgroup_default) { 362 - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); 363 - return -EINVAL; 364 - } 365 - 366 - /* 367 - * Cache Pseudo-locking not supported when CDP is enabled. 368 - * 369 - * Some things to consider if you would like to enable this 370 - * support (using L3 CDP as example): 371 - * - When CDP is enabled two separate resources are exposed, 372 - * L3DATA and L3CODE, but they are actually on the same cache. 373 - * The implication for pseudo-locking is that if a 374 - * pseudo-locked region is created on a domain of one 375 - * resource (eg. L3CODE), then a pseudo-locked region cannot 376 - * be created on that same domain of the other resource 377 - * (eg. L3DATA). This is because the creation of a 378 - * pseudo-locked region involves a call to wbinvd that will 379 - * affect all cache allocations on particular domain. 380 - * - Considering the previous, it may be possible to only 381 - * expose one of the CDP resources to pseudo-locking and 382 - * hide the other. For example, we could consider to only 383 - * expose L3DATA and since the L3 cache is unified it is 384 - * still possible to place instructions there are execute it. 385 - * - If only one region is exposed to pseudo-locking we should 386 - * still keep in mind that availability of a portion of cache 387 - * for pseudo-locking should take into account both resources. 388 - * Similarly, if a pseudo-locked region is created in one 389 - * resource, the portion of cache used by it should be made 390 - * unavailable to all future allocations from both resources. 
391 - */ 392 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || 393 - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { 394 - rdt_last_cmd_puts("CDP enabled\n"); 395 - return -EINVAL; 396 - } 397 - 398 - /* 399 - * Not knowing the bits to disable prefetching implies that this 400 - * platform does not support Cache Pseudo-Locking. 401 - */ 402 - if (resctrl_arch_get_prefetch_disable_bits() == 0) { 403 - rdt_last_cmd_puts("Pseudo-locking not supported\n"); 404 - return -EINVAL; 405 - } 406 - 407 - if (rdtgroup_monitor_in_progress(rdtgrp)) { 408 - rdt_last_cmd_puts("Monitoring in progress\n"); 409 - return -EINVAL; 410 - } 411 - 412 - if (rdtgroup_tasks_assigned(rdtgrp)) { 413 - rdt_last_cmd_puts("Tasks assigned to resource group\n"); 414 - return -EINVAL; 415 - } 416 - 417 - if (!cpumask_empty(&rdtgrp->cpu_mask)) { 418 - rdt_last_cmd_puts("CPUs assigned to resource group\n"); 419 - return -EINVAL; 420 - } 421 - 422 - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { 423 - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); 424 - return -EIO; 425 - } 426 - 427 - ret = pseudo_lock_init(rdtgrp); 428 - if (ret) { 429 - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); 430 - goto out_release; 431 - } 432 - 433 - /* 434 - * If this system is capable of monitoring a rmid would have been 435 - * allocated when the control group was created. This is not needed 436 - * anymore when this group would be used for pseudo-locking. This 437 - * is safe to call on platforms not capable of monitoring. 438 - */ 439 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 440 - 441 - ret = 0; 442 - goto out; 443 - 444 - out_release: 445 - rdtgroup_locksetup_user_restore(rdtgrp); 446 - out: 447 - return ret; 448 - } 449 - 450 - /** 451 - * rdtgroup_locksetup_exit - resource group exist locksetup mode 452 - * @rdtgrp: resource group 453 - * 454 - * When a resource group exits locksetup mode the earlier restrictions are 455 - * lifted. 
456 - * 457 - * Return: 0 on success, <0 on failure 458 - */ 459 - int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 460 - { 461 - int ret; 462 - 463 - if (resctrl_arch_mon_capable()) { 464 - ret = alloc_rmid(rdtgrp->closid); 465 - if (ret < 0) { 466 - rdt_last_cmd_puts("Out of RMIDs\n"); 467 - return ret; 468 - } 469 - rdtgrp->mon.rmid = ret; 470 - } 471 - 472 - ret = rdtgroup_locksetup_user_restore(rdtgrp); 473 - if (ret) { 474 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 475 - return ret; 476 - } 477 - 478 - pseudo_lock_free(rdtgrp); 479 - return 0; 480 - } 481 - 482 - /** 483 - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked 484 - * @d: RDT domain 485 - * @cbm: CBM to test 486 - * 487 - * @d represents a cache instance and @cbm a capacity bitmask that is 488 - * considered for it. Determine if @cbm overlaps with any existing 489 - * pseudo-locked region on @d. 490 - * 491 - * @cbm is unsigned long, even if only 32 bits are used, to make the 492 - * bitmap functions work correctly. 493 - * 494 - * Return: true if @cbm overlaps with pseudo-locked region on @d, false 495 - * otherwise. 496 - */ 497 - bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 498 - { 499 - unsigned int cbm_len; 500 - unsigned long cbm_b; 501 - 502 - if (d->plr) { 503 - cbm_len = d->plr->s->res->cache.cbm_len; 504 - cbm_b = d->plr->cbm; 505 - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) 506 - return true; 507 - } 508 - return false; 509 - } 510 - 511 - /** 512 - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy 513 - * @d: RDT domain under test 514 - * 515 - * The setup of a pseudo-locked region affects all cache instances within 516 - * the hierarchy of the region. It is thus essential to know if any 517 - * pseudo-locked regions exist within a cache hierarchy to prevent any 518 - * attempts to create new pseudo-locked regions in the same hierarchy. 
519 - * 520 - * Return: true if a pseudo-locked region exists in the hierarchy of @d or 521 - * if it is not possible to test due to memory allocation issue, 522 - * false otherwise. 523 - */ 524 - bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 525 - { 526 - struct rdt_ctrl_domain *d_i; 527 - cpumask_var_t cpu_with_psl; 528 - struct rdt_resource *r; 529 - bool ret = false; 530 - 531 - /* Walking r->domains, ensure it can't race with cpuhp */ 532 - lockdep_assert_cpus_held(); 533 - 534 - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) 535 - return true; 536 - 537 - /* 538 - * First determine which cpus have pseudo-locked regions 539 - * associated with them. 540 - */ 541 - for_each_alloc_capable_rdt_resource(r) { 542 - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { 543 - if (d_i->plr) 544 - cpumask_or(cpu_with_psl, cpu_with_psl, 545 - &d_i->hdr.cpu_mask); 546 - } 547 - } 548 - 549 - /* 550 - * Next test if new pseudo-locked region would intersect with 551 - * existing region. 552 - */ 553 - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) 554 - ret = true; 555 - 556 - free_cpumask_var(cpu_with_psl); 557 - return ret; 558 544 } 559 545 560 546 /** ··· 514 1168 plr->thread_done = 1; 515 1169 wake_up_interruptible(&plr->lock_thread_wq); 516 1170 return 0; 517 - } 518 - 519 - /** 520 - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region 521 - * @rdtgrp: Resource group to which the pseudo-locked region belongs. 522 - * @sel: Selector of which measurement to perform on a pseudo-locked region. 523 - * 524 - * The measurement of latency to access a pseudo-locked region should be 525 - * done from a cpu that is associated with that pseudo-locked region. 526 - * Determine which cpu is associated with this region and start a thread on 527 - * that cpu to perform the measurement, wait for that thread to complete. 
528 - * 529 - * Return: 0 on success, <0 on failure 530 - */ 531 - static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) 532 - { 533 - struct pseudo_lock_region *plr = rdtgrp->plr; 534 - struct task_struct *thread; 535 - unsigned int cpu; 536 - int ret = -1; 537 - 538 - cpus_read_lock(); 539 - mutex_lock(&rdtgroup_mutex); 540 - 541 - if (rdtgrp->flags & RDT_DELETED) { 542 - ret = -ENODEV; 543 - goto out; 544 - } 545 - 546 - if (!plr->d) { 547 - ret = -ENODEV; 548 - goto out; 549 - } 550 - 551 - plr->thread_done = 0; 552 - cpu = cpumask_first(&plr->d->hdr.cpu_mask); 553 - if (!cpu_online(cpu)) { 554 - ret = -ENODEV; 555 - goto out; 556 - } 557 - 558 - plr->cpu = cpu; 559 - 560 - if (sel == 1) 561 - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, 562 - plr, cpu, "pseudo_lock_measure/%u"); 563 - else if (sel == 2) 564 - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, 565 - plr, cpu, "pseudo_lock_measure/%u"); 566 - else if (sel == 3) 567 - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, 568 - plr, cpu, "pseudo_lock_measure/%u"); 569 - else 570 - goto out; 571 - 572 - if (IS_ERR(thread)) { 573 - ret = PTR_ERR(thread); 574 - goto out; 575 - } 576 - 577 - ret = wait_event_interruptible(plr->lock_thread_wq, 578 - plr->thread_done == 1); 579 - if (ret < 0) 580 - goto out; 581 - 582 - ret = 0; 583 - 584 - out: 585 - mutex_unlock(&rdtgroup_mutex); 586 - cpus_read_unlock(); 587 - return ret; 588 - } 589 - 590 - static ssize_t pseudo_lock_measure_trigger(struct file *file, 591 - const char __user *user_buf, 592 - size_t count, loff_t *ppos) 593 - { 594 - struct rdtgroup *rdtgrp = file->private_data; 595 - size_t buf_size; 596 - char buf[32]; 597 - int ret; 598 - int sel; 599 - 600 - buf_size = min(count, (sizeof(buf) - 1)); 601 - if (copy_from_user(buf, user_buf, buf_size)) 602 - return -EFAULT; 603 - 604 - buf[buf_size] = '\0'; 605 - ret = kstrtoint(buf, 10, &sel); 606 - if (ret == 0) { 607 - if (sel != 1 && sel 
!= 2 && sel != 3) 608 - return -EINVAL; 609 - ret = debugfs_file_get(file->f_path.dentry); 610 - if (ret) 611 - return ret; 612 - ret = pseudo_lock_measure_cycles(rdtgrp, sel); 613 - if (ret == 0) 614 - ret = count; 615 - debugfs_file_put(file->f_path.dentry); 616 - } 617 - 618 - return ret; 619 - } 620 - 621 - static const struct file_operations pseudo_measure_fops = { 622 - .write = pseudo_lock_measure_trigger, 623 - .open = simple_open, 624 - .llseek = default_llseek, 625 - }; 626 - 627 - /** 628 - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region 629 - * @rdtgrp: resource group to which pseudo-lock region belongs 630 - * 631 - * Called when a resource group in the pseudo-locksetup mode receives a 632 - * valid schemata that should be pseudo-locked. Since the resource group is 633 - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been 634 - * allocated and initialized with the essential information. If a failure 635 - * occurs the resource group remains in the pseudo-locksetup mode with the 636 - * &struct pseudo_lock_region associated with it, but cleared from all 637 - * information and ready for the user to re-attempt pseudo-locking by 638 - * writing the schemata again. 639 - * 640 - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 641 - * on failure. Descriptive error will be written to last_cmd_status buffer. 
642 - */ 643 - int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 644 - { 645 - struct pseudo_lock_region *plr = rdtgrp->plr; 646 - struct task_struct *thread; 647 - unsigned int new_minor; 648 - struct device *dev; 649 - char *kn_name __free(kfree) = NULL; 650 - int ret; 651 - 652 - ret = pseudo_lock_region_alloc(plr); 653 - if (ret < 0) 654 - return ret; 655 - 656 - ret = pseudo_lock_cstates_constrain(plr); 657 - if (ret < 0) { 658 - ret = -EINVAL; 659 - goto out_region; 660 - } 661 - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); 662 - if (!kn_name) { 663 - ret = -ENOMEM; 664 - goto out_cstates; 665 - } 666 - 667 - plr->thread_done = 0; 668 - 669 - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, 670 - plr->cpu, "pseudo_lock/%u"); 671 - if (IS_ERR(thread)) { 672 - ret = PTR_ERR(thread); 673 - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); 674 - goto out_cstates; 675 - } 676 - 677 - ret = wait_event_interruptible(plr->lock_thread_wq, 678 - plr->thread_done == 1); 679 - if (ret < 0) { 680 - /* 681 - * If the thread does not get on the CPU for whatever 682 - * reason and the process which sets up the region is 683 - * interrupted then this will leave the thread in runnable 684 - * state and once it gets on the CPU it will dereference 685 - * the cleared, but not freed, plr struct resulting in an 686 - * empty pseudo-locking loop. 687 - */ 688 - rdt_last_cmd_puts("Locking thread interrupted\n"); 689 - goto out_cstates; 690 - } 691 - 692 - ret = pseudo_lock_minor_get(&new_minor); 693 - if (ret < 0) { 694 - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); 695 - goto out_cstates; 696 - } 697 - 698 - /* 699 - * Unlock access but do not release the reference. The 700 - * pseudo-locked region will still be here on return. 
701 - * 702 - * The mutex has to be released temporarily to avoid a potential 703 - * deadlock with the mm->mmap_lock which is obtained in the 704 - * device_create() and debugfs_create_dir() callpath below as well as 705 - * before the mmap() callback is called. 706 - */ 707 - mutex_unlock(&rdtgroup_mutex); 708 - 709 - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { 710 - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); 711 - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) 712 - debugfs_create_file("pseudo_lock_measure", 0200, 713 - plr->debugfs_dir, rdtgrp, 714 - &pseudo_measure_fops); 715 - } 716 - 717 - dev = device_create(&pseudo_lock_class, NULL, 718 - MKDEV(pseudo_lock_major, new_minor), 719 - rdtgrp, "%s", kn_name); 720 - 721 - mutex_lock(&rdtgroup_mutex); 722 - 723 - if (IS_ERR(dev)) { 724 - ret = PTR_ERR(dev); 725 - rdt_last_cmd_printf("Failed to create character device: %d\n", 726 - ret); 727 - goto out_debugfs; 728 - } 729 - 730 - /* We released the mutex - check if group was removed while we did so */ 731 - if (rdtgrp->flags & RDT_DELETED) { 732 - ret = -ENODEV; 733 - goto out_device; 734 - } 735 - 736 - plr->minor = new_minor; 737 - 738 - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; 739 - closid_free(rdtgrp->closid); 740 - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); 741 - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); 742 - 743 - ret = 0; 744 - goto out; 745 - 746 - out_device: 747 - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); 748 - out_debugfs: 749 - debugfs_remove_recursive(plr->debugfs_dir); 750 - pseudo_lock_minor_release(new_minor); 751 - out_cstates: 752 - pseudo_lock_cstates_relax(plr); 753 - out_region: 754 - pseudo_lock_region_clear(plr); 755 - out: 756 - return ret; 757 - } 758 - 759 - /** 760 - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region 761 - * @rdtgrp: resource group to which the pseudo-locked region belongs 762 - * 763 - * The removal of a pseudo-locked region can be initiated when the 
resource 764 - * group is removed from user space via a "rmdir" from userspace or the 765 - * unmount of the resctrl filesystem. On removal the resource group does 766 - * not go back to pseudo-locksetup mode before it is removed, instead it is 767 - * removed directly. There is thus asymmetry with the creation where the 768 - * &struct pseudo_lock_region is removed here while it was not created in 769 - * rdtgroup_pseudo_lock_create(). 770 - * 771 - * Return: void 772 - */ 773 - void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) 774 - { 775 - struct pseudo_lock_region *plr = rdtgrp->plr; 776 - 777 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 778 - /* 779 - * Default group cannot be a pseudo-locked region so we can 780 - * free closid here. 781 - */ 782 - closid_free(rdtgrp->closid); 783 - goto free; 784 - } 785 - 786 - pseudo_lock_cstates_relax(plr); 787 - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); 788 - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); 789 - pseudo_lock_minor_release(plr->minor); 790 - 791 - free: 792 - pseudo_lock_free(rdtgrp); 793 - } 794 - 795 - static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) 796 - { 797 - struct rdtgroup *rdtgrp; 798 - 799 - mutex_lock(&rdtgroup_mutex); 800 - 801 - rdtgrp = region_find_by_minor(iminor(inode)); 802 - if (!rdtgrp) { 803 - mutex_unlock(&rdtgroup_mutex); 804 - return -ENODEV; 805 - } 806 - 807 - filp->private_data = rdtgrp; 808 - atomic_inc(&rdtgrp->waitcount); 809 - /* Perform a non-seekable open - llseek is not supported */ 810 - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 811 - 812 - mutex_unlock(&rdtgroup_mutex); 813 - 814 - return 0; 815 - } 816 - 817 - static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) 818 - { 819 - struct rdtgroup *rdtgrp; 820 - 821 - mutex_lock(&rdtgroup_mutex); 822 - rdtgrp = filp->private_data; 823 - WARN_ON(!rdtgrp); 824 - if (!rdtgrp) { 825 - mutex_unlock(&rdtgroup_mutex); 
826 - return -ENODEV; 827 - } 828 - filp->private_data = NULL; 829 - atomic_dec(&rdtgrp->waitcount); 830 - mutex_unlock(&rdtgroup_mutex); 831 - return 0; 832 - } 833 - 834 - static int pseudo_lock_dev_mremap(struct vm_area_struct *area) 835 - { 836 - /* Not supported */ 837 - return -EINVAL; 838 - } 839 - 840 - static const struct vm_operations_struct pseudo_mmap_ops = { 841 - .mremap = pseudo_lock_dev_mremap, 842 - }; 843 - 844 - static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) 845 - { 846 - unsigned long vsize = vma->vm_end - vma->vm_start; 847 - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 848 - struct pseudo_lock_region *plr; 849 - struct rdtgroup *rdtgrp; 850 - unsigned long physical; 851 - unsigned long psize; 852 - 853 - mutex_lock(&rdtgroup_mutex); 854 - 855 - rdtgrp = filp->private_data; 856 - WARN_ON(!rdtgrp); 857 - if (!rdtgrp) { 858 - mutex_unlock(&rdtgroup_mutex); 859 - return -ENODEV; 860 - } 861 - 862 - plr = rdtgrp->plr; 863 - 864 - if (!plr->d) { 865 - mutex_unlock(&rdtgroup_mutex); 866 - return -ENODEV; 867 - } 868 - 869 - /* 870 - * Task is required to run with affinity to the cpus associated 871 - * with the pseudo-locked region. If this is not the case the task 872 - * may be scheduled elsewhere and invalidate entries in the 873 - * pseudo-locked region. 874 - */ 875 - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { 876 - mutex_unlock(&rdtgroup_mutex); 877 - return -EINVAL; 878 - } 879 - 880 - physical = __pa(plr->kmem) >> PAGE_SHIFT; 881 - psize = plr->size - off; 882 - 883 - if (off > plr->size) { 884 - mutex_unlock(&rdtgroup_mutex); 885 - return -ENOSPC; 886 - } 887 - 888 - /* 889 - * Ensure changes are carried directly to the memory being mapped, 890 - * do not allow copy-on-write mapping. 
891 - */ 892 - if (!(vma->vm_flags & VM_SHARED)) { 893 - mutex_unlock(&rdtgroup_mutex); 894 - return -EINVAL; 895 - } 896 - 897 - if (vsize > psize) { 898 - mutex_unlock(&rdtgroup_mutex); 899 - return -ENOSPC; 900 - } 901 - 902 - memset(plr->kmem + off, 0, vsize); 903 - 904 - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, 905 - vsize, vma->vm_page_prot)) { 906 - mutex_unlock(&rdtgroup_mutex); 907 - return -EAGAIN; 908 - } 909 - vma->vm_ops = &pseudo_mmap_ops; 910 - mutex_unlock(&rdtgroup_mutex); 911 - return 0; 912 - } 913 - 914 - static const struct file_operations pseudo_lock_dev_fops = { 915 - .owner = THIS_MODULE, 916 - .read = NULL, 917 - .write = NULL, 918 - .open = pseudo_lock_dev_open, 919 - .release = pseudo_lock_dev_release, 920 - .mmap = pseudo_lock_dev_mmap, 921 - }; 922 - 923 - int rdt_pseudo_lock_init(void) 924 - { 925 - int ret; 926 - 927 - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); 928 - if (ret < 0) 929 - return ret; 930 - 931 - pseudo_lock_major = ret; 932 - 933 - ret = class_register(&pseudo_lock_class); 934 - if (ret) { 935 - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 936 - return ret; 937 - } 938 - 939 - return 0; 940 - } 941 - 942 - void rdt_pseudo_lock_release(void) 943 - { 944 - class_unregister(&pseudo_lock_class); 945 - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 946 - pseudo_lock_major = 0; 947 1171 }

+7 -4157

arch/x86/kernel/cpu/resctrl/rdtgroup.c

··· 18 18 #include <linux/fs_parser.h> 19 19 #include <linux/sysfs.h> 20 20 #include <linux/kernfs.h> 21 + #include <linux/resctrl.h> 21 22 #include <linux/seq_buf.h> 22 23 #include <linux/seq_file.h> 23 24 #include <linux/sched/signal.h> ··· 30 29 #include <uapi/linux/magic.h> 31 30 32 31 #include <asm/msr.h> 33 - #include <asm/resctrl.h> 34 32 #include "internal.h" 35 33 36 34 DEFINE_STATIC_KEY_FALSE(rdt_enable_key); 35 + 37 36 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); 37 + 38 38 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); 39 39 40 - /* Mutex to protect rdtgroup access. */ 41 - DEFINE_MUTEX(rdtgroup_mutex); 42 - 43 - static struct kernfs_root *rdt_root; 44 - struct rdtgroup rdtgroup_default; 45 - LIST_HEAD(rdt_all_groups); 46 - 47 - /* list of entries for the schemata file */ 48 - LIST_HEAD(resctrl_schema_all); 49 - 50 - /* The filesystem can only be mounted once. */ 51 - bool resctrl_mounted; 52 - 53 - /* Kernel fs node for "info" directory under root */ 54 - static struct kernfs_node *kn_info; 55 - 56 - /* Kernel fs node for "mon_groups" directory under root */ 57 - static struct kernfs_node *kn_mongrp; 58 - 59 - /* Kernel fs node for "mon_data" directory under root */ 60 - static struct kernfs_node *kn_mondata; 61 - 62 40 /* 63 - * Used to store the max resource name width to display the schemata names in 64 - * a tabular format. 65 - */ 66 - int max_name_width; 67 - 68 - static struct seq_buf last_cmd_status; 69 - static char last_cmd_status_buf[512]; 70 - 71 - static int rdtgroup_setup_root(struct rdt_fs_context *ctx); 72 - static void rdtgroup_destroy_root(void); 73 - 74 - struct dentry *debugfs_resctrl; 75 - 76 - /* 77 - * Memory bandwidth monitoring event to use for the default CTRL_MON group 78 - * and each new CTRL_MON group created by the user. Only relevant when 79 - * the filesystem is mounted with the "mba_MBps" option so it does not 80 - * matter that it remains uninitialized on systems that do not support 81 - * the "mba_MBps" option. 
82 - */ 83 - enum resctrl_event_id mba_mbps_default_event; 84 - 85 - static bool resctrl_debug; 86 - 87 - void rdt_last_cmd_clear(void) 88 - { 89 - lockdep_assert_held(&rdtgroup_mutex); 90 - seq_buf_clear(&last_cmd_status); 91 - } 92 - 93 - void rdt_last_cmd_puts(const char *s) 94 - { 95 - lockdep_assert_held(&rdtgroup_mutex); 96 - seq_buf_puts(&last_cmd_status, s); 97 - } 98 - 99 - void rdt_last_cmd_printf(const char *fmt, ...) 100 - { 101 - va_list ap; 102 - 103 - va_start(ap, fmt); 104 - lockdep_assert_held(&rdtgroup_mutex); 105 - seq_buf_vprintf(&last_cmd_status, fmt, ap); 106 - va_end(ap); 107 - } 108 - 109 - void rdt_staged_configs_clear(void) 110 - { 111 - struct rdt_ctrl_domain *dom; 112 - struct rdt_resource *r; 113 - 114 - lockdep_assert_held(&rdtgroup_mutex); 115 - 116 - for_each_alloc_capable_rdt_resource(r) { 117 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) 118 - memset(dom->staged_config, 0, sizeof(dom->staged_config)); 119 - } 120 - } 121 - 122 - static bool resctrl_is_mbm_enabled(void) 123 - { 124 - return (resctrl_arch_is_mbm_total_enabled() || 125 - resctrl_arch_is_mbm_local_enabled()); 126 - } 127 - 128 - static bool resctrl_is_mbm_event(int e) 129 - { 130 - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && 131 - e <= QOS_L3_MBM_LOCAL_EVENT_ID); 132 - } 133 - 134 - /* 135 - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, 136 - * we can keep a bitmap of free CLOSIDs in a single integer. 137 - * 138 - * Using a global CLOSID across all resources has some advantages and 139 - * some drawbacks: 140 - * + We can simply set current's closid to assign a task to a resource 141 - * group. 142 - * + Context switch code can avoid extra memory references deciding which 143 - * CLOSID to load into the PQR_ASSOC MSR 144 - * - We give up some options in configuring resource groups across multi-socket 145 - * systems. 
146 - * - Our choices on how to configure each resource become progressively more 147 - * limited as the number of resources grows. 148 - */ 149 - static unsigned long closid_free_map; 150 - static int closid_free_map_len; 151 - 152 - int closids_supported(void) 153 - { 154 - return closid_free_map_len; 155 - } 156 - 157 - static void closid_init(void) 158 - { 159 - struct resctrl_schema *s; 160 - u32 rdt_min_closid = 32; 161 - 162 - /* Compute rdt_min_closid across all resources */ 163 - list_for_each_entry(s, &resctrl_schema_all, list) 164 - rdt_min_closid = min(rdt_min_closid, s->num_closid); 165 - 166 - closid_free_map = BIT_MASK(rdt_min_closid) - 1; 167 - 168 - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ 169 - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); 170 - closid_free_map_len = rdt_min_closid; 171 - } 172 - 173 - static int closid_alloc(void) 174 - { 175 - int cleanest_closid; 176 - u32 closid; 177 - 178 - lockdep_assert_held(&rdtgroup_mutex); 179 - 180 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && 181 - resctrl_arch_is_llc_occupancy_enabled()) { 182 - cleanest_closid = resctrl_find_cleanest_closid(); 183 - if (cleanest_closid < 0) 184 - return cleanest_closid; 185 - closid = cleanest_closid; 186 - } else { 187 - closid = ffs(closid_free_map); 188 - if (closid == 0) 189 - return -ENOSPC; 190 - closid--; 191 - } 192 - __clear_bit(closid, &closid_free_map); 193 - 194 - return closid; 195 - } 196 - 197 - void closid_free(int closid) 198 - { 199 - lockdep_assert_held(&rdtgroup_mutex); 200 - 201 - __set_bit(closid, &closid_free_map); 202 - } 203 - 204 - /** 205 - * closid_allocated - test if provided closid is in use 206 - * @closid: closid to be tested 207 - * 208 - * Return: true if @closid is currently associated with a resource group, 209 - * false if @closid is free 210 - */ 211 - bool closid_allocated(unsigned int closid) 212 - { 213 - lockdep_assert_held(&rdtgroup_mutex); 214 - 215 - return 
!test_bit(closid, &closid_free_map); 216 - } 217 - 218 - /** 219 - * rdtgroup_mode_by_closid - Return mode of resource group with closid 220 - * @closid: closid if the resource group 221 - * 222 - * Each resource group is associated with a @closid. Here the mode 223 - * of a resource group can be queried by searching for it using its closid. 224 - * 225 - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid 226 - */ 227 - enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) 228 - { 229 - struct rdtgroup *rdtgrp; 230 - 231 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 232 - if (rdtgrp->closid == closid) 233 - return rdtgrp->mode; 234 - } 235 - 236 - return RDT_NUM_MODES; 237 - } 238 - 239 - static const char * const rdt_mode_str[] = { 240 - [RDT_MODE_SHAREABLE] = "shareable", 241 - [RDT_MODE_EXCLUSIVE] = "exclusive", 242 - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", 243 - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", 244 - }; 245 - 246 - /** 247 - * rdtgroup_mode_str - Return the string representation of mode 248 - * @mode: the resource group mode as &enum rdtgroup_mode 249 - * 250 - * Return: string representation of valid mode, "unknown" otherwise 251 - */ 252 - static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) 253 - { 254 - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) 255 - return "unknown"; 256 - 257 - return rdt_mode_str[mode]; 258 - } 259 - 260 - /* set uid and gid of rdtgroup dirs and files to that of the creator */ 261 - static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) 262 - { 263 - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 264 - .ia_uid = current_fsuid(), 265 - .ia_gid = current_fsgid(), }; 266 - 267 - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 268 - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 269 - return 0; 270 - 271 - return kernfs_setattr(kn, &iattr); 272 - } 273 - 274 - static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) 275 - { 276 - struct 
kernfs_node *kn; 277 - int ret; 278 - 279 - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, 280 - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 281 - 0, rft->kf_ops, rft, NULL, NULL); 282 - if (IS_ERR(kn)) 283 - return PTR_ERR(kn); 284 - 285 - ret = rdtgroup_kn_set_ugid(kn); 286 - if (ret) { 287 - kernfs_remove(kn); 288 - return ret; 289 - } 290 - 291 - return 0; 292 - } 293 - 294 - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 295 - { 296 - struct kernfs_open_file *of = m->private; 297 - struct rftype *rft = of->kn->priv; 298 - 299 - if (rft->seq_show) 300 - return rft->seq_show(of, m, arg); 301 - return 0; 302 - } 303 - 304 - static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, 305 - size_t nbytes, loff_t off) 306 - { 307 - struct rftype *rft = of->kn->priv; 308 - 309 - if (rft->write) 310 - return rft->write(of, buf, nbytes, off); 311 - 312 - return -EINVAL; 313 - } 314 - 315 - static const struct kernfs_ops rdtgroup_kf_single_ops = { 316 - .atomic_write_len = PAGE_SIZE, 317 - .write = rdtgroup_file_write, 318 - .seq_show = rdtgroup_seqfile_show, 319 - }; 320 - 321 - static const struct kernfs_ops kf_mondata_ops = { 322 - .atomic_write_len = PAGE_SIZE, 323 - .seq_show = rdtgroup_mondata_show, 324 - }; 325 - 326 - static bool is_cpu_list(struct kernfs_open_file *of) 327 - { 328 - struct rftype *rft = of->kn->priv; 329 - 330 - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; 331 - } 332 - 333 - static int rdtgroup_cpus_show(struct kernfs_open_file *of, 334 - struct seq_file *s, void *v) 335 - { 336 - struct rdtgroup *rdtgrp; 337 - struct cpumask *mask; 338 - int ret = 0; 339 - 340 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 341 - 342 - if (rdtgrp) { 343 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 344 - if (!rdtgrp->plr->d) { 345 - rdt_last_cmd_clear(); 346 - rdt_last_cmd_puts("Cache domain offline\n"); 347 - ret = -ENODEV; 348 - } else { 349 - mask = &rdtgrp->plr->d->hdr.cpu_mask; 350 - seq_printf(s, is_cpu_list(of) ? 
351 - "%*pbl\n" : "%*pb\n", 352 - cpumask_pr_args(mask)); 353 - } 354 - } else { 355 - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", 356 - cpumask_pr_args(&rdtgrp->cpu_mask)); 357 - } 358 - } else { 359 - ret = -ENOENT; 360 - } 361 - rdtgroup_kn_unlock(of->kn); 362 - 363 - return ret; 364 - } 365 - 366 - /* 367 - * This is safe against resctrl_sched_in() called from __switch_to() 41 + * This is safe against resctrl_arch_sched_in() called from __switch_to() 368 42 * because __switch_to() is executed with interrupts disabled. A local call 369 43 * from update_closid_rmid() is protected against __switch_to() because 370 44 * preemption is disabled. ··· 58 382 * executing task might have its own closid selected. Just reuse 59 383 * the context switch code. 60 384 */ 61 - resctrl_sched_in(current); 62 - } 63 - 64 - /* 65 - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 66 - * 67 - * Per task closids/rmids must have been set up before calling this function. 68 - * @r may be NULL. 
69 - */ 70 - static void 71 - update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) 72 - { 73 - struct resctrl_cpu_defaults defaults, *p = NULL; 74 - 75 - if (r) { 76 - defaults.closid = r->closid; 77 - defaults.rmid = r->mon.rmid; 78 - p = &defaults; 79 - } 80 - 81 - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); 82 - } 83 - 84 - static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 85 - cpumask_var_t tmpmask) 86 - { 87 - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; 88 - struct list_head *head; 89 - 90 - /* Check whether cpus belong to parent ctrl group */ 91 - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); 92 - if (!cpumask_empty(tmpmask)) { 93 - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); 94 - return -EINVAL; 95 - } 96 - 97 - /* Check whether cpus are dropped from this group */ 98 - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 99 - if (!cpumask_empty(tmpmask)) { 100 - /* Give any dropped cpus to parent rdtgroup */ 101 - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); 102 - update_closid_rmid(tmpmask, prgrp); 103 - } 104 - 105 - /* 106 - * If we added cpus, remove them from previous group that owned them 107 - * and update per-cpu rmid 108 - */ 109 - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 110 - if (!cpumask_empty(tmpmask)) { 111 - head = &prgrp->mon.crdtgrp_list; 112 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 113 - if (crgrp == rdtgrp) 114 - continue; 115 - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, 116 - tmpmask); 117 - } 118 - update_closid_rmid(tmpmask, rdtgrp); 119 - } 120 - 121 - /* Done pushing/pulling - update this group with new mask */ 122 - cpumask_copy(&rdtgrp->cpu_mask, newmask); 123 - 124 - return 0; 125 - } 126 - 127 - static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) 128 - { 129 - struct rdtgroup *crgrp; 130 - 131 - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); 132 - /* 
update the child mon group masks as well*/ 133 - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) 134 - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); 135 - } 136 - 137 - static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 138 - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) 139 - { 140 - struct rdtgroup *r, *crgrp; 141 - struct list_head *head; 142 - 143 - /* Check whether cpus are dropped from this group */ 144 - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 145 - if (!cpumask_empty(tmpmask)) { 146 - /* Can't drop from default group */ 147 - if (rdtgrp == &rdtgroup_default) { 148 - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); 149 - return -EINVAL; 150 - } 151 - 152 - /* Give any dropped cpus to rdtgroup_default */ 153 - cpumask_or(&rdtgroup_default.cpu_mask, 154 - &rdtgroup_default.cpu_mask, tmpmask); 155 - update_closid_rmid(tmpmask, &rdtgroup_default); 156 - } 157 - 158 - /* 159 - * If we added cpus, remove them from previous group and 160 - * the prev group's child groups that owned them 161 - * and update per-cpu closid/rmid. 162 - */ 163 - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 164 - if (!cpumask_empty(tmpmask)) { 165 - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { 166 - if (r == rdtgrp) 167 - continue; 168 - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); 169 - if (!cpumask_empty(tmpmask1)) 170 - cpumask_rdtgrp_clear(r, tmpmask1); 171 - } 172 - update_closid_rmid(tmpmask, rdtgrp); 173 - } 174 - 175 - /* Done pushing/pulling - update this group with new mask */ 176 - cpumask_copy(&rdtgrp->cpu_mask, newmask); 177 - 178 - /* 179 - * Clear child mon group masks since there is a new parent mask 180 - * now and update the rmid for the cpus the child lost. 
181 - */ 182 - head = &rdtgrp->mon.crdtgrp_list; 183 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 184 - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); 185 - update_closid_rmid(tmpmask, rdtgrp); 186 - cpumask_clear(&crgrp->cpu_mask); 187 - } 188 - 189 - return 0; 190 - } 191 - 192 - static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 193 - char *buf, size_t nbytes, loff_t off) 194 - { 195 - cpumask_var_t tmpmask, newmask, tmpmask1; 196 - struct rdtgroup *rdtgrp; 197 - int ret; 198 - 199 - if (!buf) 200 - return -EINVAL; 201 - 202 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 203 - return -ENOMEM; 204 - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { 205 - free_cpumask_var(tmpmask); 206 - return -ENOMEM; 207 - } 208 - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { 209 - free_cpumask_var(tmpmask); 210 - free_cpumask_var(newmask); 211 - return -ENOMEM; 212 - } 213 - 214 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 215 - if (!rdtgrp) { 216 - ret = -ENOENT; 217 - goto unlock; 218 - } 219 - 220 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 221 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 222 - ret = -EINVAL; 223 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 224 - goto unlock; 225 - } 226 - 227 - if (is_cpu_list(of)) 228 - ret = cpulist_parse(buf, newmask); 229 - else 230 - ret = cpumask_parse(buf, newmask); 231 - 232 - if (ret) { 233 - rdt_last_cmd_puts("Bad CPU list/mask\n"); 234 - goto unlock; 235 - } 236 - 237 - /* check that user didn't specify any offline cpus */ 238 - cpumask_andnot(tmpmask, newmask, cpu_online_mask); 239 - if (!cpumask_empty(tmpmask)) { 240 - ret = -EINVAL; 241 - rdt_last_cmd_puts("Can only assign online CPUs\n"); 242 - goto unlock; 243 - } 244 - 245 - if (rdtgrp->type == RDTCTRL_GROUP) 246 - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); 247 - else if (rdtgrp->type == RDTMON_GROUP) 248 - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); 249 - else 250 - ret = -EINVAL; 251 - 252 - unlock: 
253 - rdtgroup_kn_unlock(of->kn); 254 - free_cpumask_var(tmpmask); 255 - free_cpumask_var(newmask); 256 - free_cpumask_var(tmpmask1); 257 - 258 - return ret ?: nbytes; 259 - } 260 - 261 - /** 262 - * rdtgroup_remove - the helper to remove resource group safely 263 - * @rdtgrp: resource group to remove 264 - * 265 - * On resource group creation via a mkdir, an extra kernfs_node reference is 266 - * taken to ensure that the rdtgroup structure remains accessible for the 267 - * rdtgroup_kn_unlock() calls where it is removed. 268 - * 269 - * Drop the extra reference here, then free the rdtgroup structure. 270 - * 271 - * Return: void 272 - */ 273 - static void rdtgroup_remove(struct rdtgroup *rdtgrp) 274 - { 275 - kernfs_put(rdtgrp->kn); 276 - kfree(rdtgrp); 277 - } 278 - 279 - static void _update_task_closid_rmid(void *task) 280 - { 281 - /* 282 - * If the task is still current on this CPU, update PQR_ASSOC MSR. 283 - * Otherwise, the MSR is updated when the task is scheduled in. 284 - */ 285 - if (task == current) 286 - resctrl_sched_in(task); 287 - } 288 - 289 - static void update_task_closid_rmid(struct task_struct *t) 290 - { 291 - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) 292 - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); 293 - else 294 - _update_task_closid_rmid(t); 295 - } 296 - 297 - static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) 298 - { 299 - u32 closid, rmid = rdtgrp->mon.rmid; 300 - 301 - if (rdtgrp->type == RDTCTRL_GROUP) 302 - closid = rdtgrp->closid; 303 - else if (rdtgrp->type == RDTMON_GROUP) 304 - closid = rdtgrp->mon.parent->closid; 305 - else 306 - return false; 307 - 308 - return resctrl_arch_match_closid(tsk, closid) && 309 - resctrl_arch_match_rmid(tsk, closid, rmid); 310 - } 311 - 312 - static int __rdtgroup_move_task(struct task_struct *tsk, 313 - struct rdtgroup *rdtgrp) 314 - { 315 - /* If the task is already in rdtgrp, no need to move the task. 
*/ 316 - if (task_in_rdtgroup(tsk, rdtgrp)) 317 - return 0; 318 - 319 - /* 320 - * Set the task's closid/rmid before the PQR_ASSOC MSR can be 321 - * updated by them. 322 - * 323 - * For ctrl_mon groups, move both closid and rmid. 324 - * For monitor groups, can move the tasks only from 325 - * their parent CTRL group. 326 - */ 327 - if (rdtgrp->type == RDTMON_GROUP && 328 - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { 329 - rdt_last_cmd_puts("Can't move task to different control group\n"); 330 - return -EINVAL; 331 - } 332 - 333 - if (rdtgrp->type == RDTMON_GROUP) 334 - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, 335 - rdtgrp->mon.rmid); 336 - else 337 - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, 338 - rdtgrp->mon.rmid); 339 - 340 - /* 341 - * Ensure the task's closid and rmid are written before determining if 342 - * the task is current that will decide if it will be interrupted. 343 - * This pairs with the full barrier between the rq->curr update and 344 - * resctrl_sched_in() during context switch. 345 - */ 346 - smp_mb(); 347 - 348 - /* 349 - * By now, the task's closid and rmid are set. If the task is current 350 - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource 351 - * group go into effect. If the task is not current, the MSR will be 352 - * updated when the task is scheduled in. 
353 - */ 354 - update_task_closid_rmid(tsk); 355 - 356 - return 0; 357 - } 358 - 359 - static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) 360 - { 361 - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && 362 - resctrl_arch_match_closid(t, r->closid)); 363 - } 364 - 365 - static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) 366 - { 367 - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && 368 - resctrl_arch_match_rmid(t, r->mon.parent->closid, 369 - r->mon.rmid)); 370 - } 371 - 372 - /** 373 - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group 374 - * @r: Resource group 375 - * 376 - * Return: 1 if tasks have been assigned to @r, 0 otherwise 377 - */ 378 - int rdtgroup_tasks_assigned(struct rdtgroup *r) 379 - { 380 - struct task_struct *p, *t; 381 - int ret = 0; 382 - 383 - lockdep_assert_held(&rdtgroup_mutex); 384 - 385 - rcu_read_lock(); 386 - for_each_process_thread(p, t) { 387 - if (is_closid_match(t, r) || is_rmid_match(t, r)) { 388 - ret = 1; 389 - break; 390 - } 391 - } 392 - rcu_read_unlock(); 393 - 394 - return ret; 395 - } 396 - 397 - static int rdtgroup_task_write_permission(struct task_struct *task, 398 - struct kernfs_open_file *of) 399 - { 400 - const struct cred *tcred = get_task_cred(task); 401 - const struct cred *cred = current_cred(); 402 - int ret = 0; 403 - 404 - /* 405 - * Even if we're attaching all tasks in the thread group, we only 406 - * need to check permissions on one of them. 
407 - */ 408 - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 409 - !uid_eq(cred->euid, tcred->uid) && 410 - !uid_eq(cred->euid, tcred->suid)) { 411 - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); 412 - ret = -EPERM; 413 - } 414 - 415 - put_cred(tcred); 416 - return ret; 417 - } 418 - 419 - static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, 420 - struct kernfs_open_file *of) 421 - { 422 - struct task_struct *tsk; 423 - int ret; 424 - 425 - rcu_read_lock(); 426 - if (pid) { 427 - tsk = find_task_by_vpid(pid); 428 - if (!tsk) { 429 - rcu_read_unlock(); 430 - rdt_last_cmd_printf("No task %d\n", pid); 431 - return -ESRCH; 432 - } 433 - } else { 434 - tsk = current; 435 - } 436 - 437 - get_task_struct(tsk); 438 - rcu_read_unlock(); 439 - 440 - ret = rdtgroup_task_write_permission(tsk, of); 441 - if (!ret) 442 - ret = __rdtgroup_move_task(tsk, rdtgrp); 443 - 444 - put_task_struct(tsk); 445 - return ret; 446 - } 447 - 448 - static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, 449 - char *buf, size_t nbytes, loff_t off) 450 - { 451 - struct rdtgroup *rdtgrp; 452 - char *pid_str; 453 - int ret = 0; 454 - pid_t pid; 455 - 456 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 457 - if (!rdtgrp) { 458 - rdtgroup_kn_unlock(of->kn); 459 - return -ENOENT; 460 - } 461 - rdt_last_cmd_clear(); 462 - 463 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 464 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 465 - ret = -EINVAL; 466 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 467 - goto unlock; 468 - } 469 - 470 - while (buf && buf[0] != '\0' && buf[0] != '\n') { 471 - pid_str = strim(strsep(&buf, ",")); 472 - 473 - if (kstrtoint(pid_str, 0, &pid)) { 474 - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); 475 - ret = -EINVAL; 476 - break; 477 - } 478 - 479 - if (pid < 0) { 480 - rdt_last_cmd_printf("Invalid pid %d\n", pid); 481 - ret = -EINVAL; 482 - break; 483 - } 484 - 485 - ret = rdtgroup_move_task(pid, rdtgrp, of); 486 - if 
(ret) { 487 - rdt_last_cmd_printf("Error while processing task %d\n", pid); 488 - break; 489 - } 490 - } 491 - 492 - unlock: 493 - rdtgroup_kn_unlock(of->kn); 494 - 495 - return ret ?: nbytes; 496 - } 497 - 498 - static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) 499 - { 500 - struct task_struct *p, *t; 501 - pid_t pid; 502 - 503 - rcu_read_lock(); 504 - for_each_process_thread(p, t) { 505 - if (is_closid_match(t, r) || is_rmid_match(t, r)) { 506 - pid = task_pid_vnr(t); 507 - if (pid) 508 - seq_printf(s, "%d\n", pid); 509 - } 510 - } 511 - rcu_read_unlock(); 512 - } 513 - 514 - static int rdtgroup_tasks_show(struct kernfs_open_file *of, 515 - struct seq_file *s, void *v) 516 - { 517 - struct rdtgroup *rdtgrp; 518 - int ret = 0; 519 - 520 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 521 - if (rdtgrp) 522 - show_rdt_tasks(rdtgrp, s); 523 - else 524 - ret = -ENOENT; 525 - rdtgroup_kn_unlock(of->kn); 526 - 527 - return ret; 528 - } 529 - 530 - static int rdtgroup_closid_show(struct kernfs_open_file *of, 531 - struct seq_file *s, void *v) 532 - { 533 - struct rdtgroup *rdtgrp; 534 - int ret = 0; 535 - 536 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 537 - if (rdtgrp) 538 - seq_printf(s, "%u\n", rdtgrp->closid); 539 - else 540 - ret = -ENOENT; 541 - rdtgroup_kn_unlock(of->kn); 542 - 543 - return ret; 544 - } 545 - 546 - static int rdtgroup_rmid_show(struct kernfs_open_file *of, 547 - struct seq_file *s, void *v) 548 - { 549 - struct rdtgroup *rdtgrp; 550 - int ret = 0; 551 - 552 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 553 - if (rdtgrp) 554 - seq_printf(s, "%u\n", rdtgrp->mon.rmid); 555 - else 556 - ret = -ENOENT; 557 - rdtgroup_kn_unlock(of->kn); 558 - 559 - return ret; 560 - } 561 - 562 - #ifdef CONFIG_PROC_CPU_RESCTRL 563 - 564 - /* 565 - * A task can only be part of one resctrl control group and of one monitor 566 - * group which is associated to that control group. 567 - * 568 - * 1) res: 569 - * mon: 570 - * 571 - * resctrl is not available. 
572 - * 573 - * 2) res:/ 574 - * mon: 575 - * 576 - * Task is part of the root resctrl control group, and it is not associated 577 - * to any monitor group. 578 - * 579 - * 3) res:/ 580 - * mon:mon0 581 - * 582 - * Task is part of the root resctrl control group and monitor group mon0. 583 - * 584 - * 4) res:group0 585 - * mon: 586 - * 587 - * Task is part of resctrl control group group0, and it is not associated 588 - * to any monitor group. 589 - * 590 - * 5) res:group0 591 - * mon:mon1 592 - * 593 - * Task is part of resctrl control group group0 and monitor group mon1. 594 - */ 595 - int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, 596 - struct pid *pid, struct task_struct *tsk) 597 - { 598 - struct rdtgroup *rdtg; 599 - int ret = 0; 600 - 601 - mutex_lock(&rdtgroup_mutex); 602 - 603 - /* Return empty if resctrl has not been mounted. */ 604 - if (!resctrl_mounted) { 605 - seq_puts(s, "res:\nmon:\n"); 606 - goto unlock; 607 - } 608 - 609 - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { 610 - struct rdtgroup *crg; 611 - 612 - /* 613 - * Task information is only relevant for shareable 614 - * and exclusive groups. 615 - */ 616 - if (rdtg->mode != RDT_MODE_SHAREABLE && 617 - rdtg->mode != RDT_MODE_EXCLUSIVE) 618 - continue; 619 - 620 - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) 621 - continue; 622 - 623 - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", 624 - rdt_kn_name(rdtg->kn)); 625 - seq_puts(s, "mon:"); 626 - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, 627 - mon.crdtgrp_list) { 628 - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, 629 - crg->mon.rmid)) 630 - continue; 631 - seq_printf(s, "%s", rdt_kn_name(crg->kn)); 632 - break; 633 - } 634 - seq_putc(s, '\n'); 635 - goto unlock; 636 - } 637 - /* 638 - * The above search should succeed. Otherwise return 639 - * with an error. 
640 - */ 641 - ret = -ENOENT; 642 - unlock: 643 - mutex_unlock(&rdtgroup_mutex); 644 - 645 - return ret; 646 - } 647 - #endif 648 - 649 - static int rdt_last_cmd_status_show(struct kernfs_open_file *of, 650 - struct seq_file *seq, void *v) 651 - { 652 - int len; 653 - 654 - mutex_lock(&rdtgroup_mutex); 655 - len = seq_buf_used(&last_cmd_status); 656 - if (len) 657 - seq_printf(seq, "%.*s", len, last_cmd_status_buf); 658 - else 659 - seq_puts(seq, "ok\n"); 660 - mutex_unlock(&rdtgroup_mutex); 661 - return 0; 662 - } 663 - 664 - static void *rdt_kn_parent_priv(struct kernfs_node *kn) 665 - { 666 - /* 667 - * The parent pointer is only valid within RCU section since it can be 668 - * replaced. 669 - */ 670 - guard(rcu)(); 671 - return rcu_dereference(kn->__parent)->priv; 672 - } 673 - 674 - static int rdt_num_closids_show(struct kernfs_open_file *of, 675 - struct seq_file *seq, void *v) 676 - { 677 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 678 - 679 - seq_printf(seq, "%u\n", s->num_closid); 680 - return 0; 681 - } 682 - 683 - static int rdt_default_ctrl_show(struct kernfs_open_file *of, 684 - struct seq_file *seq, void *v) 685 - { 686 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 687 - struct rdt_resource *r = s->res; 688 - 689 - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); 690 - return 0; 691 - } 692 - 693 - static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, 694 - struct seq_file *seq, void *v) 695 - { 696 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 697 - struct rdt_resource *r = s->res; 698 - 699 - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); 700 - return 0; 701 - } 702 - 703 - static int rdt_shareable_bits_show(struct kernfs_open_file *of, 704 - struct seq_file *seq, void *v) 705 - { 706 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 707 - struct rdt_resource *r = s->res; 708 - 709 - seq_printf(seq, "%x\n", r->cache.shareable_bits); 710 - return 0; 711 - } 712 - 713 - /* 714 - * 
rdt_bit_usage_show - Display current usage of resources 715 - * 716 - * A domain is a shared resource that can now be allocated differently. Here 717 - * we display the current regions of the domain as an annotated bitmask. 718 - * For each domain of this resource its allocation bitmask 719 - * is annotated as below to indicate the current usage of the corresponding bit: 720 - * 0 - currently unused 721 - * X - currently available for sharing and used by software and hardware 722 - * H - currently used by hardware only but available for software use 723 - * S - currently used and shareable by software only 724 - * E - currently used exclusively by one resource group 725 - * P - currently pseudo-locked by one resource group 726 - */ 727 - static int rdt_bit_usage_show(struct kernfs_open_file *of, 728 - struct seq_file *seq, void *v) 729 - { 730 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 731 - /* 732 - * Use unsigned long even though only 32 bits are used to ensure 733 - * test_bit() is used safely. 
734 - */ 735 - unsigned long sw_shareable = 0, hw_shareable = 0; 736 - unsigned long exclusive = 0, pseudo_locked = 0; 737 - struct rdt_resource *r = s->res; 738 - struct rdt_ctrl_domain *dom; 739 - int i, hwb, swb, excl, psl; 740 - enum rdtgrp_mode mode; 741 - bool sep = false; 742 - u32 ctrl_val; 743 - 744 - cpus_read_lock(); 745 - mutex_lock(&rdtgroup_mutex); 746 - hw_shareable = r->cache.shareable_bits; 747 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 748 - if (sep) 749 - seq_putc(seq, ';'); 750 - sw_shareable = 0; 751 - exclusive = 0; 752 - seq_printf(seq, "%d=", dom->hdr.id); 753 - for (i = 0; i < closids_supported(); i++) { 754 - if (!closid_allocated(i)) 755 - continue; 756 - ctrl_val = resctrl_arch_get_config(r, dom, i, 757 - s->conf_type); 758 - mode = rdtgroup_mode_by_closid(i); 759 - switch (mode) { 760 - case RDT_MODE_SHAREABLE: 761 - sw_shareable |= ctrl_val; 762 - break; 763 - case RDT_MODE_EXCLUSIVE: 764 - exclusive |= ctrl_val; 765 - break; 766 - case RDT_MODE_PSEUDO_LOCKSETUP: 767 - /* 768 - * RDT_MODE_PSEUDO_LOCKSETUP is possible 769 - * here but not included since the CBM 770 - * associated with this CLOSID in this mode 771 - * is not initialized and no task or cpu can be 772 - * assigned this CLOSID. 773 - */ 774 - break; 775 - case RDT_MODE_PSEUDO_LOCKED: 776 - case RDT_NUM_MODES: 777 - WARN(1, 778 - "invalid mode for closid %d\n", i); 779 - break; 780 - } 781 - } 782 - for (i = r->cache.cbm_len - 1; i >= 0; i--) { 783 - pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; 784 - hwb = test_bit(i, &hw_shareable); 785 - swb = test_bit(i, &sw_shareable); 786 - excl = test_bit(i, &exclusive); 787 - psl = test_bit(i, &pseudo_locked); 788 - if (hwb && swb) 789 - seq_putc(seq, 'X'); 790 - else if (hwb && !swb) 791 - seq_putc(seq, 'H'); 792 - else if (!hwb && swb) 793 - seq_putc(seq, 'S'); 794 - else if (excl) 795 - seq_putc(seq, 'E'); 796 - else if (psl) 797 - seq_putc(seq, 'P'); 798 - else /* Unused bits remain */ 799 - seq_putc(seq, '0'); 800 - } 801 - sep = true; 802 - } 803 - seq_putc(seq, '\n'); 804 - mutex_unlock(&rdtgroup_mutex); 805 - cpus_read_unlock(); 806 - return 0; 807 - } 808 - 809 - static int rdt_min_bw_show(struct kernfs_open_file *of, 810 - struct seq_file *seq, void *v) 811 - { 812 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 813 - struct rdt_resource *r = s->res; 814 - 815 - seq_printf(seq, "%u\n", r->membw.min_bw); 816 - return 0; 817 - } 818 - 819 - static int rdt_num_rmids_show(struct kernfs_open_file *of, 820 - struct seq_file *seq, void *v) 821 - { 822 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 823 - 824 - seq_printf(seq, "%d\n", r->num_rmid); 825 - 826 - return 0; 827 - } 828 - 829 - static int rdt_mon_features_show(struct kernfs_open_file *of, 830 - struct seq_file *seq, void *v) 831 - { 832 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 833 - struct mon_evt *mevt; 834 - 835 - list_for_each_entry(mevt, &r->evt_list, list) { 836 - seq_printf(seq, "%s\n", mevt->name); 837 - if (mevt->configurable) 838 - seq_printf(seq, "%s_config\n", mevt->name); 839 - } 840 - 841 - return 0; 842 - } 843 - 844 - static int rdt_bw_gran_show(struct kernfs_open_file *of, 845 - struct seq_file *seq, void *v) 846 - { 847 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 848 - struct rdt_resource *r = s->res; 849 - 850 - seq_printf(seq, "%u\n", r->membw.bw_gran); 851 - return 0; 852 - } 853 - 854 - static int rdt_delay_linear_show(struct kernfs_open_file *of, 855 - struct seq_file 
*seq, void *v) 856 - { 857 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 858 - struct rdt_resource *r = s->res; 859 - 860 - seq_printf(seq, "%u\n", r->membw.delay_linear); 861 - return 0; 862 - } 863 - 864 - static int max_threshold_occ_show(struct kernfs_open_file *of, 865 - struct seq_file *seq, void *v) 866 - { 867 - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); 868 - 869 - return 0; 870 - } 871 - 872 - static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, 873 - struct seq_file *seq, void *v) 874 - { 875 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 876 - struct rdt_resource *r = s->res; 877 - 878 - switch (r->membw.throttle_mode) { 879 - case THREAD_THROTTLE_PER_THREAD: 880 - seq_puts(seq, "per-thread\n"); 881 - return 0; 882 - case THREAD_THROTTLE_MAX: 883 - seq_puts(seq, "max\n"); 884 - return 0; 885 - case THREAD_THROTTLE_UNDEFINED: 886 - seq_puts(seq, "undefined\n"); 887 - return 0; 888 - } 889 - 890 - WARN_ON_ONCE(1); 891 - 892 - return 0; 893 - } 894 - 895 - static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, 896 - char *buf, size_t nbytes, loff_t off) 897 - { 898 - unsigned int bytes; 899 - int ret; 900 - 901 - ret = kstrtouint(buf, 0, &bytes); 902 - if (ret) 903 - return ret; 904 - 905 - if (bytes > resctrl_rmid_realloc_limit) 906 - return -EINVAL; 907 - 908 - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); 909 - 910 - return nbytes; 911 - } 912 - 913 - /* 914 - * rdtgroup_mode_show - Display mode of this resource group 915 - */ 916 - static int rdtgroup_mode_show(struct kernfs_open_file *of, 917 - struct seq_file *s, void *v) 918 - { 919 - struct rdtgroup *rdtgrp; 920 - 921 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 922 - if (!rdtgrp) { 923 - rdtgroup_kn_unlock(of->kn); 924 - return -ENOENT; 925 - } 926 - 927 - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); 928 - 929 - rdtgroup_kn_unlock(of->kn); 930 - return 0; 931 - } 932 - 933 - static enum resctrl_conf_type 
resctrl_peer_type(enum resctrl_conf_type my_type) 934 - { 935 - switch (my_type) { 936 - case CDP_CODE: 937 - return CDP_DATA; 938 - case CDP_DATA: 939 - return CDP_CODE; 940 - default: 941 - case CDP_NONE: 942 - return CDP_NONE; 943 - } 944 - } 945 - 946 - static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, 947 - struct seq_file *seq, void *v) 948 - { 949 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 950 - struct rdt_resource *r = s->res; 951 - 952 - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); 953 - 954 - return 0; 955 - } 956 - 957 - /** 958 - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other 959 - * @r: Resource to which domain instance @d belongs. 960 - * @d: The domain instance for which @closid is being tested. 961 - * @cbm: Capacity bitmask being tested. 962 - * @closid: Intended closid for @cbm. 963 - * @type: CDP type of @r. 964 - * @exclusive: Only check if overlaps with exclusive resource groups 965 - * 966 - * Checks if provided @cbm intended to be used for @closid on domain 967 - * @d overlaps with any other closids or other hardware usage associated 968 - * with this domain. If @exclusive is true then only overlaps with 969 - * resource groups in exclusive mode will be considered. If @exclusive 970 - * is false then overlaps with any resource group or hardware entities 971 - * will be considered. 972 - * 973 - * @cbm is unsigned long, even if only 32 bits are used, to make the 974 - * bitmap functions work correctly. 975 - * 976 - * Return: false if CBM does not overlap, true if it does. 
977 - */ 978 - static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, 979 - unsigned long cbm, int closid, 980 - enum resctrl_conf_type type, bool exclusive) 981 - { 982 - enum rdtgrp_mode mode; 983 - unsigned long ctrl_b; 984 - int i; 985 - 986 - /* Check for any overlap with regions used by hardware directly */ 987 - if (!exclusive) { 988 - ctrl_b = r->cache.shareable_bits; 989 - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) 990 - return true; 991 - } 992 - 993 - /* Check for overlap with other resource groups */ 994 - for (i = 0; i < closids_supported(); i++) { 995 - ctrl_b = resctrl_arch_get_config(r, d, i, type); 996 - mode = rdtgroup_mode_by_closid(i); 997 - if (closid_allocated(i) && i != closid && 998 - mode != RDT_MODE_PSEUDO_LOCKSETUP) { 999 - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { 1000 - if (exclusive) { 1001 - if (mode == RDT_MODE_EXCLUSIVE) 1002 - return true; 1003 - continue; 1004 - } 1005 - return true; 1006 - } 1007 - } 1008 - } 1009 - 1010 - return false; 1011 - } 1012 - 1013 - /** 1014 - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware 1015 - * @s: Schema for the resource to which domain instance @d belongs. 1016 - * @d: The domain instance for which @closid is being tested. 1017 - * @cbm: Capacity bitmask being tested. 1018 - * @closid: Intended closid for @cbm. 1019 - * @exclusive: Only check if overlaps with exclusive resource groups 1020 - * 1021 - * Resources that can be allocated using a CBM can use the CBM to control 1022 - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test 1023 - * for overlap. Overlap test is not limited to the specific resource for 1024 - * which the CBM is intended though - when dealing with CDP resources that 1025 - * share the underlying hardware the overlap check should be performed on 1026 - * the CDP resource sharing the hardware also. 
1027 - * 1028 - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the 1029 - * overlap test. 1030 - * 1031 - * Return: true if CBM overlap detected, false if there is no overlap 1032 - */ 1033 - bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 1034 - unsigned long cbm, int closid, bool exclusive) 1035 - { 1036 - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 1037 - struct rdt_resource *r = s->res; 1038 - 1039 - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, 1040 - exclusive)) 1041 - return true; 1042 - 1043 - if (!resctrl_arch_get_cdp_enabled(r->rid)) 1044 - return false; 1045 - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); 1046 - } 1047 - 1048 - /** 1049 - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive 1050 - * @rdtgrp: Resource group identified through its closid. 1051 - * 1052 - * An exclusive resource group implies that there should be no sharing of 1053 - * its allocated resources. At the time this group is considered to be 1054 - * exclusive this test can determine if its current schemata supports this 1055 - * setting by testing for overlap with all other resource groups. 1056 - * 1057 - * Return: true if resource group can be exclusive, false if there is overlap 1058 - * with allocations of other resource groups and thus this resource group 1059 - * cannot be exclusive. 
1060 - */ 1061 - static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) 1062 - { 1063 - int closid = rdtgrp->closid; 1064 - struct rdt_ctrl_domain *d; 1065 - struct resctrl_schema *s; 1066 - struct rdt_resource *r; 1067 - bool has_cache = false; 1068 - u32 ctrl; 1069 - 1070 - /* Walking r->domains, ensure it can't race with cpuhp */ 1071 - lockdep_assert_cpus_held(); 1072 - 1073 - list_for_each_entry(s, &resctrl_schema_all, list) { 1074 - r = s->res; 1075 - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) 1076 - continue; 1077 - has_cache = true; 1078 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1079 - ctrl = resctrl_arch_get_config(r, d, closid, 1080 - s->conf_type); 1081 - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { 1082 - rdt_last_cmd_puts("Schemata overlaps\n"); 1083 - return false; 1084 - } 1085 - } 1086 - } 1087 - 1088 - if (!has_cache) { 1089 - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); 1090 - return false; 1091 - } 1092 - 1093 - return true; 1094 - } 1095 - 1096 - /* 1097 - * rdtgroup_mode_write - Modify the resource group's mode 1098 - */ 1099 - static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, 1100 - char *buf, size_t nbytes, loff_t off) 1101 - { 1102 - struct rdtgroup *rdtgrp; 1103 - enum rdtgrp_mode mode; 1104 - int ret = 0; 1105 - 1106 - /* Valid input requires a trailing newline */ 1107 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 1108 - return -EINVAL; 1109 - buf[nbytes - 1] = '\0'; 1110 - 1111 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 1112 - if (!rdtgrp) { 1113 - rdtgroup_kn_unlock(of->kn); 1114 - return -ENOENT; 1115 - } 1116 - 1117 - rdt_last_cmd_clear(); 1118 - 1119 - mode = rdtgrp->mode; 1120 - 1121 - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || 1122 - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || 1123 - (!strcmp(buf, "pseudo-locksetup") && 1124 - mode == RDT_MODE_PSEUDO_LOCKSETUP) || 1125 - (!strcmp(buf, "pseudo-locked") && mode == 
RDT_MODE_PSEUDO_LOCKED)) 1126 - goto out; 1127 - 1128 - if (mode == RDT_MODE_PSEUDO_LOCKED) { 1129 - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); 1130 - ret = -EINVAL; 1131 - goto out; 1132 - } 1133 - 1134 - if (!strcmp(buf, "shareable")) { 1135 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1136 - ret = rdtgroup_locksetup_exit(rdtgrp); 1137 - if (ret) 1138 - goto out; 1139 - } 1140 - rdtgrp->mode = RDT_MODE_SHAREABLE; 1141 - } else if (!strcmp(buf, "exclusive")) { 1142 - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { 1143 - ret = -EINVAL; 1144 - goto out; 1145 - } 1146 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1147 - ret = rdtgroup_locksetup_exit(rdtgrp); 1148 - if (ret) 1149 - goto out; 1150 - } 1151 - rdtgrp->mode = RDT_MODE_EXCLUSIVE; 1152 - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && 1153 - !strcmp(buf, "pseudo-locksetup")) { 1154 - ret = rdtgroup_locksetup_enter(rdtgrp); 1155 - if (ret) 1156 - goto out; 1157 - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; 1158 - } else { 1159 - rdt_last_cmd_puts("Unknown or unsupported mode\n"); 1160 - ret = -EINVAL; 1161 - } 1162 - 1163 - out: 1164 - rdtgroup_kn_unlock(of->kn); 1165 - return ret ?: nbytes; 1166 - } 1167 - 1168 - /** 1169 - * rdtgroup_cbm_to_size - Translate CBM to size in bytes 1170 - * @r: RDT resource to which @d belongs. 1171 - * @d: RDT domain instance. 1172 - * @cbm: bitmask for which the size should be computed. 1173 - * 1174 - * The bitmask provided associated with the RDT domain instance @d will be 1175 - * translated into how many bytes it represents. The size in bytes is 1176 - * computed by first dividing the total cache size by the CBM length to 1177 - * determine how many bytes each bit in the bitmask represents. The result 1178 - * is multiplied with the number of bits set in the bitmask. 1179 - * 1180 - * @cbm is unsigned long, even if only 32 bits are used to make the 1181 - * bitmap functions work correctly. 
1182 - */ 1183 - unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, 1184 - struct rdt_ctrl_domain *d, unsigned long cbm) 1185 - { 1186 - unsigned int size = 0; 1187 - struct cacheinfo *ci; 1188 - int num_b; 1189 - 1190 - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) 1191 - return size; 1192 - 1193 - num_b = bitmap_weight(&cbm, r->cache.cbm_len); 1194 - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); 1195 - if (ci) 1196 - size = ci->size / r->cache.cbm_len * num_b; 1197 - 1198 - return size; 1199 - } 1200 - 1201 - /* 1202 - * rdtgroup_size_show - Display size in bytes of allocated regions 1203 - * 1204 - * The "size" file mirrors the layout of the "schemata" file, printing the 1205 - * size in bytes of each region instead of the capacity bitmask. 1206 - */ 1207 - static int rdtgroup_size_show(struct kernfs_open_file *of, 1208 - struct seq_file *s, void *v) 1209 - { 1210 - struct resctrl_schema *schema; 1211 - enum resctrl_conf_type type; 1212 - struct rdt_ctrl_domain *d; 1213 - struct rdtgroup *rdtgrp; 1214 - struct rdt_resource *r; 1215 - unsigned int size; 1216 - int ret = 0; 1217 - u32 closid; 1218 - bool sep; 1219 - u32 ctrl; 1220 - 1221 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 1222 - if (!rdtgrp) { 1223 - rdtgroup_kn_unlock(of->kn); 1224 - return -ENOENT; 1225 - } 1226 - 1227 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1228 - if (!rdtgrp->plr->d) { 1229 - rdt_last_cmd_clear(); 1230 - rdt_last_cmd_puts("Cache domain offline\n"); 1231 - ret = -ENODEV; 1232 - } else { 1233 - seq_printf(s, "%*s:", max_name_width, 1234 - rdtgrp->plr->s->name); 1235 - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, 1236 - rdtgrp->plr->d, 1237 - rdtgrp->plr->cbm); 1238 - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); 1239 - } 1240 - goto out; 1241 - } 1242 - 1243 - closid = rdtgrp->closid; 1244 - 1245 - list_for_each_entry(schema, &resctrl_schema_all, list) { 1246 - r = schema->res; 1247 - 
type = schema->conf_type; 1248 - sep = false; 1249 - seq_printf(s, "%*s:", max_name_width, schema->name); 1250 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1251 - if (sep) 1252 - seq_putc(s, ';'); 1253 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1254 - size = 0; 1255 - } else { 1256 - if (is_mba_sc(r)) 1257 - ctrl = d->mbps_val[closid]; 1258 - else 1259 - ctrl = resctrl_arch_get_config(r, d, 1260 - closid, 1261 - type); 1262 - if (r->rid == RDT_RESOURCE_MBA || 1263 - r->rid == RDT_RESOURCE_SMBA) 1264 - size = ctrl; 1265 - else 1266 - size = rdtgroup_cbm_to_size(r, d, ctrl); 1267 - } 1268 - seq_printf(s, "%d=%u", d->hdr.id, size); 1269 - sep = true; 1270 - } 1271 - seq_putc(s, '\n'); 1272 - } 1273 - 1274 - out: 1275 - rdtgroup_kn_unlock(of->kn); 1276 - 1277 - return ret; 385 + resctrl_arch_sched_in(current); 1278 386 } 1279 387 1280 388 #define INVALID_CONFIG_INDEX UINT_MAX ··· 102 1642 config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; 103 1643 } 104 1644 105 - static void mondata_config_read(struct resctrl_mon_config_info *mon_info) 106 - { 107 - smp_call_function_any(&mon_info->d->hdr.cpu_mask, 108 - resctrl_arch_mon_event_config_read, mon_info, 1); 109 - } 110 - 111 - static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) 112 - { 113 - struct resctrl_mon_config_info mon_info; 114 - struct rdt_mon_domain *dom; 115 - bool sep = false; 116 - 117 - cpus_read_lock(); 118 - mutex_lock(&rdtgroup_mutex); 119 - 120 - list_for_each_entry(dom, &r->mon_domains, hdr.list) { 121 - if (sep) 122 - seq_puts(s, ";"); 123 - 124 - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); 125 - mon_info.r = r; 126 - mon_info.d = dom; 127 - mon_info.evtid = evtid; 128 - mondata_config_read(&mon_info); 129 - 130 - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); 131 - sep = true; 132 - } 133 - seq_puts(s, "\n"); 134 - 135 - mutex_unlock(&rdtgroup_mutex); 136 - cpus_read_unlock(); 137 - 138 - return 0; 139 - } 140 - 141 
- static int mbm_total_bytes_config_show(struct kernfs_open_file *of, 142 - struct seq_file *seq, void *v) 143 - { 144 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 145 - 146 - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); 147 - 148 - return 0; 149 - } 150 - 151 - static int mbm_local_bytes_config_show(struct kernfs_open_file *of, 152 - struct seq_file *seq, void *v) 153 - { 154 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 155 - 156 - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); 157 - 158 - return 0; 159 - } 160 - 161 1645 void resctrl_arch_mon_event_config_write(void *_config_info) 162 1646 { 163 1647 struct resctrl_mon_config_info *config_info = _config_info; ··· 113 1709 return; 114 1710 } 115 1711 wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); 116 - } 117 - 118 - static void mbm_config_write_domain(struct rdt_resource *r, 119 - struct rdt_mon_domain *d, u32 evtid, u32 val) 120 - { 121 - struct resctrl_mon_config_info mon_info = {0}; 122 - 123 - /* 124 - * Read the current config value first. If both are the same then 125 - * no need to write it again. 126 - */ 127 - mon_info.r = r; 128 - mon_info.d = d; 129 - mon_info.evtid = evtid; 130 - mondata_config_read(&mon_info); 131 - if (mon_info.mon_config == val) 132 - return; 133 - 134 - mon_info.mon_config = val; 135 - 136 - /* 137 - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the 138 - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE 139 - * are scoped at the domain level. Writing any of these MSRs 140 - * on one CPU is observed by all the CPUs in the domain. 141 - */ 142 - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, 143 - &mon_info, 1); 144 - 145 - /* 146 - * When an Event Configuration is changed, the bandwidth counters 147 - * for all RMIDs and Events will be cleared by the hardware. 
The 148 - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for 149 - * every RMID on the next read to any event for every RMID. 150 - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) 151 - * cleared while it is tracked by the hardware. Clear the 152 - * mbm_local and mbm_total counts for all the RMIDs. 153 - */ 154 - resctrl_arch_reset_rmid_all(r, d); 155 - } 156 - 157 - static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) 158 - { 159 - char *dom_str = NULL, *id_str; 160 - unsigned long dom_id, val; 161 - struct rdt_mon_domain *d; 162 - 163 - /* Walking r->domains, ensure it can't race with cpuhp */ 164 - lockdep_assert_cpus_held(); 165 - 166 - next: 167 - if (!tok || tok[0] == '\0') 168 - return 0; 169 - 170 - /* Start processing the strings for each domain */ 171 - dom_str = strim(strsep(&tok, ";")); 172 - id_str = strsep(&dom_str, "="); 173 - 174 - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { 175 - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); 176 - return -EINVAL; 177 - } 178 - 179 - if (!dom_str || kstrtoul(dom_str, 16, &val)) { 180 - rdt_last_cmd_puts("Non-numeric event configuration value\n"); 181 - return -EINVAL; 182 - } 183 - 184 - /* Value from user cannot be more than the supported set of events */ 185 - if ((val & r->mbm_cfg_mask) != val) { 186 - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", 187 - r->mbm_cfg_mask); 188 - return -EINVAL; 189 - } 190 - 191 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 192 - if (d->hdr.id == dom_id) { 193 - mbm_config_write_domain(r, d, evtid, val); 194 - goto next; 195 - } 196 - } 197 - 198 - return -EINVAL; 199 - } 200 - 201 - static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, 202 - char *buf, size_t nbytes, 203 - loff_t off) 204 - { 205 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 206 - int ret; 207 - 208 - /* Valid input requires a trailing newline */ 209 - if (nbytes == 0 || 
buf[nbytes - 1] != '\n') 210 - return -EINVAL; 211 - 212 - cpus_read_lock(); 213 - mutex_lock(&rdtgroup_mutex); 214 - 215 - rdt_last_cmd_clear(); 216 - 217 - buf[nbytes - 1] = '\0'; 218 - 219 - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); 220 - 221 - mutex_unlock(&rdtgroup_mutex); 222 - cpus_read_unlock(); 223 - 224 - return ret ?: nbytes; 225 - } 226 - 227 - static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, 228 - char *buf, size_t nbytes, 229 - loff_t off) 230 - { 231 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 232 - int ret; 233 - 234 - /* Valid input requires a trailing newline */ 235 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 236 - return -EINVAL; 237 - 238 - cpus_read_lock(); 239 - mutex_lock(&rdtgroup_mutex); 240 - 241 - rdt_last_cmd_clear(); 242 - 243 - buf[nbytes - 1] = '\0'; 244 - 245 - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); 246 - 247 - mutex_unlock(&rdtgroup_mutex); 248 - cpus_read_unlock(); 249 - 250 - return ret ?: nbytes; 251 - } 252 - 253 - /* rdtgroup information files for one cache resource. 
*/ 254 - static struct rftype res_common_files[] = { 255 - { 256 - .name = "last_cmd_status", 257 - .mode = 0444, 258 - .kf_ops = &rdtgroup_kf_single_ops, 259 - .seq_show = rdt_last_cmd_status_show, 260 - .fflags = RFTYPE_TOP_INFO, 261 - }, 262 - { 263 - .name = "num_closids", 264 - .mode = 0444, 265 - .kf_ops = &rdtgroup_kf_single_ops, 266 - .seq_show = rdt_num_closids_show, 267 - .fflags = RFTYPE_CTRL_INFO, 268 - }, 269 - { 270 - .name = "mon_features", 271 - .mode = 0444, 272 - .kf_ops = &rdtgroup_kf_single_ops, 273 - .seq_show = rdt_mon_features_show, 274 - .fflags = RFTYPE_MON_INFO, 275 - }, 276 - { 277 - .name = "num_rmids", 278 - .mode = 0444, 279 - .kf_ops = &rdtgroup_kf_single_ops, 280 - .seq_show = rdt_num_rmids_show, 281 - .fflags = RFTYPE_MON_INFO, 282 - }, 283 - { 284 - .name = "cbm_mask", 285 - .mode = 0444, 286 - .kf_ops = &rdtgroup_kf_single_ops, 287 - .seq_show = rdt_default_ctrl_show, 288 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 289 - }, 290 - { 291 - .name = "min_cbm_bits", 292 - .mode = 0444, 293 - .kf_ops = &rdtgroup_kf_single_ops, 294 - .seq_show = rdt_min_cbm_bits_show, 295 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 296 - }, 297 - { 298 - .name = "shareable_bits", 299 - .mode = 0444, 300 - .kf_ops = &rdtgroup_kf_single_ops, 301 - .seq_show = rdt_shareable_bits_show, 302 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 303 - }, 304 - { 305 - .name = "bit_usage", 306 - .mode = 0444, 307 - .kf_ops = &rdtgroup_kf_single_ops, 308 - .seq_show = rdt_bit_usage_show, 309 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 310 - }, 311 - { 312 - .name = "min_bandwidth", 313 - .mode = 0444, 314 - .kf_ops = &rdtgroup_kf_single_ops, 315 - .seq_show = rdt_min_bw_show, 316 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 317 - }, 318 - { 319 - .name = "bandwidth_gran", 320 - .mode = 0444, 321 - .kf_ops = &rdtgroup_kf_single_ops, 322 - .seq_show = rdt_bw_gran_show, 323 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 324 - }, 325 - { 326 - .name = 
"delay_linear", 327 - .mode = 0444, 328 - .kf_ops = &rdtgroup_kf_single_ops, 329 - .seq_show = rdt_delay_linear_show, 330 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 331 - }, 332 - /* 333 - * Platform specific which (if any) capabilities are provided by 334 - * thread_throttle_mode. Defer "fflags" initialization to platform 335 - * discovery. 336 - */ 337 - { 338 - .name = "thread_throttle_mode", 339 - .mode = 0444, 340 - .kf_ops = &rdtgroup_kf_single_ops, 341 - .seq_show = rdt_thread_throttle_mode_show, 342 - }, 343 - { 344 - .name = "max_threshold_occupancy", 345 - .mode = 0644, 346 - .kf_ops = &rdtgroup_kf_single_ops, 347 - .write = max_threshold_occ_write, 348 - .seq_show = max_threshold_occ_show, 349 - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, 350 - }, 351 - { 352 - .name = "mbm_total_bytes_config", 353 - .mode = 0644, 354 - .kf_ops = &rdtgroup_kf_single_ops, 355 - .seq_show = mbm_total_bytes_config_show, 356 - .write = mbm_total_bytes_config_write, 357 - }, 358 - { 359 - .name = "mbm_local_bytes_config", 360 - .mode = 0644, 361 - .kf_ops = &rdtgroup_kf_single_ops, 362 - .seq_show = mbm_local_bytes_config_show, 363 - .write = mbm_local_bytes_config_write, 364 - }, 365 - { 366 - .name = "cpus", 367 - .mode = 0644, 368 - .kf_ops = &rdtgroup_kf_single_ops, 369 - .write = rdtgroup_cpus_write, 370 - .seq_show = rdtgroup_cpus_show, 371 - .fflags = RFTYPE_BASE, 372 - }, 373 - { 374 - .name = "cpus_list", 375 - .mode = 0644, 376 - .kf_ops = &rdtgroup_kf_single_ops, 377 - .write = rdtgroup_cpus_write, 378 - .seq_show = rdtgroup_cpus_show, 379 - .flags = RFTYPE_FLAGS_CPUS_LIST, 380 - .fflags = RFTYPE_BASE, 381 - }, 382 - { 383 - .name = "tasks", 384 - .mode = 0644, 385 - .kf_ops = &rdtgroup_kf_single_ops, 386 - .write = rdtgroup_tasks_write, 387 - .seq_show = rdtgroup_tasks_show, 388 - .fflags = RFTYPE_BASE, 389 - }, 390 - { 391 - .name = "mon_hw_id", 392 - .mode = 0444, 393 - .kf_ops = &rdtgroup_kf_single_ops, 394 - .seq_show = rdtgroup_rmid_show, 395 - .fflags = 
RFTYPE_MON_BASE | RFTYPE_DEBUG, 396 - }, 397 - { 398 - .name = "schemata", 399 - .mode = 0644, 400 - .kf_ops = &rdtgroup_kf_single_ops, 401 - .write = rdtgroup_schemata_write, 402 - .seq_show = rdtgroup_schemata_show, 403 - .fflags = RFTYPE_CTRL_BASE, 404 - }, 405 - { 406 - .name = "mba_MBps_event", 407 - .mode = 0644, 408 - .kf_ops = &rdtgroup_kf_single_ops, 409 - .write = rdtgroup_mba_mbps_event_write, 410 - .seq_show = rdtgroup_mba_mbps_event_show, 411 - }, 412 - { 413 - .name = "mode", 414 - .mode = 0644, 415 - .kf_ops = &rdtgroup_kf_single_ops, 416 - .write = rdtgroup_mode_write, 417 - .seq_show = rdtgroup_mode_show, 418 - .fflags = RFTYPE_CTRL_BASE, 419 - }, 420 - { 421 - .name = "size", 422 - .mode = 0444, 423 - .kf_ops = &rdtgroup_kf_single_ops, 424 - .seq_show = rdtgroup_size_show, 425 - .fflags = RFTYPE_CTRL_BASE, 426 - }, 427 - { 428 - .name = "sparse_masks", 429 - .mode = 0444, 430 - .kf_ops = &rdtgroup_kf_single_ops, 431 - .seq_show = rdt_has_sparse_bitmasks_show, 432 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 433 - }, 434 - { 435 - .name = "ctrl_hw_id", 436 - .mode = 0444, 437 - .kf_ops = &rdtgroup_kf_single_ops, 438 - .seq_show = rdtgroup_closid_show, 439 - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, 440 - }, 441 - 442 - }; 443 - 444 - static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) 445 - { 446 - struct rftype *rfts, *rft; 447 - int ret, len; 448 - 449 - rfts = res_common_files; 450 - len = ARRAY_SIZE(res_common_files); 451 - 452 - lockdep_assert_held(&rdtgroup_mutex); 453 - 454 - if (resctrl_debug) 455 - fflags |= RFTYPE_DEBUG; 456 - 457 - for (rft = rfts; rft < rfts + len; rft++) { 458 - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { 459 - ret = rdtgroup_add_file(kn, rft); 460 - if (ret) 461 - goto error; 462 - } 463 - } 464 - 465 - return 0; 466 - error: 467 - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); 468 - while (--rft >= rfts) { 469 - if ((fflags & rft->fflags) == rft->fflags) 470 - 
kernfs_remove_by_name(kn, rft->name); 471 - } 472 - return ret; 473 - } 474 - 475 - static struct rftype *rdtgroup_get_rftype_by_name(const char *name) 476 - { 477 - struct rftype *rfts, *rft; 478 - int len; 479 - 480 - rfts = res_common_files; 481 - len = ARRAY_SIZE(res_common_files); 482 - 483 - for (rft = rfts; rft < rfts + len; rft++) { 484 - if (!strcmp(rft->name, name)) 485 - return rft; 486 - } 487 - 488 - return NULL; 489 - } 490 - 491 - static void thread_throttle_mode_init(void) 492 - { 493 - enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; 494 - struct rdt_resource *r_mba, *r_smba; 495 - 496 - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 497 - if (r_mba->alloc_capable && 498 - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 499 - throttle_mode = r_mba->membw.throttle_mode; 500 - 501 - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); 502 - if (r_smba->alloc_capable && 503 - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 504 - throttle_mode = r_smba->membw.throttle_mode; 505 - 506 - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) 507 - return; 508 - 509 - resctrl_file_fflags_init("thread_throttle_mode", 510 - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); 511 - } 512 - 513 - void resctrl_file_fflags_init(const char *config, unsigned long fflags) 514 - { 515 - struct rftype *rft; 516 - 517 - rft = rdtgroup_get_rftype_by_name(config); 518 - if (rft) 519 - rft->fflags = fflags; 520 - } 521 - 522 - /** 523 - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file 524 - * @r: The resource group with which the file is associated. 525 - * @name: Name of the file 526 - * 527 - * The permissions of named resctrl file, directory, or link are modified 528 - * to not allow read, write, or execute by any user. 
529 - * 530 - * WARNING: This function is intended to communicate to the user that the 531 - * resctrl file has been locked down - that it is not relevant to the 532 - * particular state the system finds itself in. It should not be relied 533 - * on to protect from user access because after the file's permissions 534 - * are restricted the user can still change the permissions using chmod 535 - * from the command line. 536 - * 537 - * Return: 0 on success, <0 on failure. 538 - */ 539 - int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) 540 - { 541 - struct iattr iattr = {.ia_valid = ATTR_MODE,}; 542 - struct kernfs_node *kn; 543 - int ret = 0; 544 - 545 - kn = kernfs_find_and_get_ns(r->kn, name, NULL); 546 - if (!kn) 547 - return -ENOENT; 548 - 549 - switch (kernfs_type(kn)) { 550 - case KERNFS_DIR: 551 - iattr.ia_mode = S_IFDIR; 552 - break; 553 - case KERNFS_FILE: 554 - iattr.ia_mode = S_IFREG; 555 - break; 556 - case KERNFS_LINK: 557 - iattr.ia_mode = S_IFLNK; 558 - break; 559 - } 560 - 561 - ret = kernfs_setattr(kn, &iattr); 562 - kernfs_put(kn); 563 - return ret; 564 - } 565 - 566 - /** 567 - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file 568 - * @r: The resource group with which the file is associated. 569 - * @name: Name of the file 570 - * @mask: Mask of permissions that should be restored 571 - * 572 - * Restore the permissions of the named file. If @name is a directory the 573 - * permissions of its parent will be used. 574 - * 575 - * Return: 0 on success, <0 on failure. 
576 - */ 577 - int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 578 - umode_t mask) 579 - { 580 - struct iattr iattr = {.ia_valid = ATTR_MODE,}; 581 - struct kernfs_node *kn, *parent; 582 - struct rftype *rfts, *rft; 583 - int ret, len; 584 - 585 - rfts = res_common_files; 586 - len = ARRAY_SIZE(res_common_files); 587 - 588 - for (rft = rfts; rft < rfts + len; rft++) { 589 - if (!strcmp(rft->name, name)) 590 - iattr.ia_mode = rft->mode & mask; 591 - } 592 - 593 - kn = kernfs_find_and_get_ns(r->kn, name, NULL); 594 - if (!kn) 595 - return -ENOENT; 596 - 597 - switch (kernfs_type(kn)) { 598 - case KERNFS_DIR: 599 - parent = kernfs_get_parent(kn); 600 - if (parent) { 601 - iattr.ia_mode |= parent->mode; 602 - kernfs_put(parent); 603 - } 604 - iattr.ia_mode |= S_IFDIR; 605 - break; 606 - case KERNFS_FILE: 607 - iattr.ia_mode |= S_IFREG; 608 - break; 609 - case KERNFS_LINK: 610 - iattr.ia_mode |= S_IFLNK; 611 - break; 612 - } 613 - 614 - ret = kernfs_setattr(kn, &iattr); 615 - kernfs_put(kn); 616 - return ret; 617 - } 618 - 619 - static int rdtgroup_mkdir_info_resdir(void *priv, char *name, 620 - unsigned long fflags) 621 - { 622 - struct kernfs_node *kn_subdir; 623 - int ret; 624 - 625 - kn_subdir = kernfs_create_dir(kn_info, name, 626 - kn_info->mode, priv); 627 - if (IS_ERR(kn_subdir)) 628 - return PTR_ERR(kn_subdir); 629 - 630 - ret = rdtgroup_kn_set_ugid(kn_subdir); 631 - if (ret) 632 - return ret; 633 - 634 - ret = rdtgroup_add_files(kn_subdir, fflags); 635 - if (!ret) 636 - kernfs_activate(kn_subdir); 637 - 638 - return ret; 639 - } 640 - 641 - static unsigned long fflags_from_resource(struct rdt_resource *r) 642 - { 643 - switch (r->rid) { 644 - case RDT_RESOURCE_L3: 645 - case RDT_RESOURCE_L2: 646 - return RFTYPE_RES_CACHE; 647 - case RDT_RESOURCE_MBA: 648 - case RDT_RESOURCE_SMBA: 649 - return RFTYPE_RES_MB; 650 - } 651 - 652 - return WARN_ON_ONCE(1); 653 - } 654 - 655 - static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 656 - 
{ 657 - struct resctrl_schema *s; 658 - struct rdt_resource *r; 659 - unsigned long fflags; 660 - char name[32]; 661 - int ret; 662 - 663 - /* create the directory */ 664 - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 665 - if (IS_ERR(kn_info)) 666 - return PTR_ERR(kn_info); 667 - 668 - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); 669 - if (ret) 670 - goto out_destroy; 671 - 672 - /* loop over enabled controls, these are all alloc_capable */ 673 - list_for_each_entry(s, &resctrl_schema_all, list) { 674 - r = s->res; 675 - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; 676 - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); 677 - if (ret) 678 - goto out_destroy; 679 - } 680 - 681 - for_each_mon_capable_rdt_resource(r) { 682 - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; 683 - sprintf(name, "%s_MON", r->name); 684 - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); 685 - if (ret) 686 - goto out_destroy; 687 - } 688 - 689 - ret = rdtgroup_kn_set_ugid(kn_info); 690 - if (ret) 691 - goto out_destroy; 692 - 693 - kernfs_activate(kn_info); 694 - 695 - return 0; 696 - 697 - out_destroy: 698 - kernfs_remove(kn_info); 699 - return ret; 700 - } 701 - 702 - static int 703 - mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, 704 - char *name, struct kernfs_node **dest_kn) 705 - { 706 - struct kernfs_node *kn; 707 - int ret; 708 - 709 - /* create the directory */ 710 - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 711 - if (IS_ERR(kn)) 712 - return PTR_ERR(kn); 713 - 714 - if (dest_kn) 715 - *dest_kn = kn; 716 - 717 - ret = rdtgroup_kn_set_ugid(kn); 718 - if (ret) 719 - goto out_destroy; 720 - 721 - kernfs_activate(kn); 722 - 723 - return 0; 724 - 725 - out_destroy: 726 - kernfs_remove(kn); 727 - return ret; 728 1712 } 729 1713 730 1714 static void l3_qos_cfg_update(void *arg) ··· 127 2335 bool *enable = arg; 128 2336 129 2337 wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? 
L2_QOS_CDP_ENABLE : 0ULL); 130 - } 131 - 132 - static inline bool is_mba_linear(void) 133 - { 134 - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; 135 2338 } 136 2339 137 2340 static int set_cache_qos_cfg(int level, bool enable) ··· 184 2397 l3_qos_cfg_update(&hw_res->cdp_enabled); 185 2398 } 186 2399 187 - static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) 188 - { 189 - u32 num_closid = resctrl_arch_get_num_closid(r); 190 - int cpu = cpumask_any(&d->hdr.cpu_mask); 191 - int i; 192 - 193 - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), 194 - GFP_KERNEL, cpu_to_node(cpu)); 195 - if (!d->mbps_val) 196 - return -ENOMEM; 197 - 198 - for (i = 0; i < num_closid; i++) 199 - d->mbps_val[i] = MBA_MAX_MBPS; 200 - 201 - return 0; 202 - } 203 - 204 - static void mba_sc_domain_destroy(struct rdt_resource *r, 205 - struct rdt_ctrl_domain *d) 206 - { 207 - kfree(d->mbps_val); 208 - d->mbps_val = NULL; 209 - } 210 - 211 - /* 212 - * MBA software controller is supported only if 213 - * MBM is supported and MBA is in linear scale, 214 - * and the MBM monitor scope is the same as MBA 215 - * control scope. 216 - */ 217 - static bool supports_mba_mbps(void) 218 - { 219 - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); 220 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 221 - 222 - return (resctrl_is_mbm_enabled() && 223 - r->alloc_capable && is_mba_linear() && 224 - r->ctrl_scope == rmbm->mon_scope); 225 - } 226 - 227 - /* 228 - * Enable or disable the MBA software controller 229 - * which helps user specify bandwidth in MBps. 
230 - */ 231 - static int set_mba_sc(bool mba_sc) 232 - { 233 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 234 - u32 num_closid = resctrl_arch_get_num_closid(r); 235 - struct rdt_ctrl_domain *d; 236 - unsigned long fflags; 237 - int i; 238 - 239 - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) 240 - return -EINVAL; 241 - 242 - r->membw.mba_sc = mba_sc; 243 - 244 - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; 245 - 246 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 247 - for (i = 0; i < num_closid; i++) 248 - d->mbps_val[i] = MBA_MAX_MBPS; 249 - } 250 - 251 - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; 252 - resctrl_file_fflags_init("mba_MBps_event", fflags); 253 - 254 - return 0; 255 - } 256 - 257 2400 static int cdp_enable(int level) 258 2401 { 259 2402 struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; ··· 224 2507 return 0; 225 2508 } 226 2509 227 - /* 228 - * We don't allow rdtgroup directories to be created anywhere 229 - * except the root directory. Thus when looking for the rdtgroup 230 - * structure for a kernfs node we are either looking at a directory, 231 - * in which case the rdtgroup structure is pointed at by the "priv" 232 - * field, otherwise we have a file, and need only look to the parent 233 - * to find the rdtgroup. 234 - */ 235 - static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) 2510 + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) 236 2511 { 237 - if (kernfs_type(kn) == KERNFS_DIR) { 238 - /* 239 - * All the resource directories use "kn->priv" 240 - * to point to the "struct rdtgroup" for the 241 - * resource. "info" and its subdirectories don't 242 - * have rdtgroup structures, so return NULL here. 
243 - */ 244 - if (kn == kn_info || 245 - rcu_access_pointer(kn->__parent) == kn_info) 246 - return NULL; 247 - else 248 - return kn->priv; 249 - } else { 250 - return rdt_kn_parent_priv(kn); 251 - } 252 - } 253 - 254 - static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 255 - { 256 - atomic_inc(&rdtgrp->waitcount); 257 - kernfs_break_active_protection(kn); 258 - } 259 - 260 - static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 261 - { 262 - if (atomic_dec_and_test(&rdtgrp->waitcount) && 263 - (rdtgrp->flags & RDT_DELETED)) { 264 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 265 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 266 - rdtgroup_pseudo_lock_remove(rdtgrp); 267 - kernfs_unbreak_active_protection(kn); 268 - rdtgroup_remove(rdtgrp); 269 - } else { 270 - kernfs_unbreak_active_protection(kn); 271 - } 272 - } 273 - 274 - struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) 275 - { 276 - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 277 - 278 - if (!rdtgrp) 279 - return NULL; 280 - 281 - rdtgroup_kn_get(rdtgrp, kn); 282 - 283 - cpus_read_lock(); 284 - mutex_lock(&rdtgroup_mutex); 285 - 286 - /* Was this group deleted while we waited? 
*/ 287 - if (rdtgrp->flags & RDT_DELETED) 288 - return NULL; 289 - 290 - return rdtgrp; 291 - } 292 - 293 - void rdtgroup_kn_unlock(struct kernfs_node *kn) 294 - { 295 - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 296 - 297 - if (!rdtgrp) 298 - return; 299 - 300 - mutex_unlock(&rdtgroup_mutex); 301 - cpus_read_unlock(); 302 - 303 - rdtgroup_kn_put(rdtgrp, kn); 304 - } 305 - 306 - static int mkdir_mondata_all(struct kernfs_node *parent_kn, 307 - struct rdtgroup *prgrp, 308 - struct kernfs_node **mon_data_kn); 309 - 310 - static void rdt_disable_ctx(void) 311 - { 312 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 313 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 314 - set_mba_sc(false); 315 - 316 - resctrl_debug = false; 317 - } 318 - 319 - static int rdt_enable_ctx(struct rdt_fs_context *ctx) 320 - { 321 - int ret = 0; 322 - 323 - if (ctx->enable_cdpl2) { 324 - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); 325 - if (ret) 326 - goto out_done; 327 - } 328 - 329 - if (ctx->enable_cdpl3) { 330 - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); 331 - if (ret) 332 - goto out_cdpl2; 333 - } 334 - 335 - if (ctx->enable_mba_mbps) { 336 - ret = set_mba_sc(true); 337 - if (ret) 338 - goto out_cdpl3; 339 - } 340 - 341 - if (ctx->enable_debug) 342 - resctrl_debug = true; 343 - 344 - return 0; 345 - 346 - out_cdpl3: 347 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 348 - out_cdpl2: 349 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 350 - out_done: 351 - return ret; 352 - } 353 - 354 - static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) 355 - { 356 - struct resctrl_schema *s; 357 - const char *suffix = ""; 358 - int ret, cl; 359 - 360 - s = kzalloc(sizeof(*s), GFP_KERNEL); 361 - if (!s) 362 - return -ENOMEM; 363 - 364 - s->res = r; 365 - s->num_closid = resctrl_arch_get_num_closid(r); 366 - if (resctrl_arch_get_cdp_enabled(r->rid)) 367 - s->num_closid /= 2; 368 - 369 - s->conf_type = 
type; 370 - switch (type) { 371 - case CDP_CODE: 372 - suffix = "CODE"; 373 - break; 374 - case CDP_DATA: 375 - suffix = "DATA"; 376 - break; 377 - case CDP_NONE: 378 - suffix = ""; 379 - break; 380 - } 381 - 382 - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); 383 - if (ret >= sizeof(s->name)) { 384 - kfree(s); 385 - return -EINVAL; 386 - } 387 - 388 - cl = strlen(s->name); 389 - 390 - /* 391 - * If CDP is supported by this resource, but not enabled, 392 - * include the suffix. This ensures the tabular format of the 393 - * schemata file does not change between mounts of the filesystem. 394 - */ 395 - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) 396 - cl += 4; 397 - 398 - if (cl > max_name_width) 399 - max_name_width = cl; 400 - 401 - switch (r->schema_fmt) { 402 - case RESCTRL_SCHEMA_BITMAP: 403 - s->fmt_str = "%d=%x"; 404 - break; 405 - case RESCTRL_SCHEMA_RANGE: 406 - s->fmt_str = "%d=%u"; 407 - break; 408 - } 409 - 410 - if (WARN_ON_ONCE(!s->fmt_str)) { 411 - kfree(s); 412 - return -EINVAL; 413 - } 414 - 415 - INIT_LIST_HEAD(&s->list); 416 - list_add(&s->list, &resctrl_schema_all); 417 - 418 - return 0; 419 - } 420 - 421 - static int schemata_list_create(void) 422 - { 423 - struct rdt_resource *r; 424 - int ret = 0; 425 - 426 - for_each_alloc_capable_rdt_resource(r) { 427 - if (resctrl_arch_get_cdp_enabled(r->rid)) { 428 - ret = schemata_list_add(r, CDP_CODE); 429 - if (ret) 430 - break; 431 - 432 - ret = schemata_list_add(r, CDP_DATA); 433 - } else { 434 - ret = schemata_list_add(r, CDP_NONE); 435 - } 436 - 437 - if (ret) 438 - break; 439 - } 440 - 441 - return ret; 442 - } 443 - 444 - static void schemata_list_destroy(void) 445 - { 446 - struct resctrl_schema *s, *tmp; 447 - 448 - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { 449 - list_del(&s->list); 450 - kfree(s); 451 - } 452 - } 453 - 454 - static int rdt_get_tree(struct fs_context *fc) 455 - { 456 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 457 
- unsigned long flags = RFTYPE_CTRL_BASE; 458 - struct rdt_mon_domain *dom; 459 - struct rdt_resource *r; 460 - int ret; 461 - 462 - cpus_read_lock(); 463 - mutex_lock(&rdtgroup_mutex); 464 - /* 465 - * resctrl file system can only be mounted once. 466 - */ 467 - if (resctrl_mounted) { 468 - ret = -EBUSY; 469 - goto out; 470 - } 471 - 472 - ret = rdtgroup_setup_root(ctx); 473 - if (ret) 474 - goto out; 475 - 476 - ret = rdt_enable_ctx(ctx); 477 - if (ret) 478 - goto out_root; 479 - 480 - ret = schemata_list_create(); 481 - if (ret) { 482 - schemata_list_destroy(); 483 - goto out_ctx; 484 - } 485 - 486 - closid_init(); 487 - 488 - if (resctrl_arch_mon_capable()) 489 - flags |= RFTYPE_MON; 490 - 491 - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); 492 - if (ret) 493 - goto out_schemata_free; 494 - 495 - kernfs_activate(rdtgroup_default.kn); 496 - 497 - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); 498 - if (ret < 0) 499 - goto out_schemata_free; 500 - 501 - if (resctrl_arch_mon_capable()) { 502 - ret = mongroup_create_dir(rdtgroup_default.kn, 503 - &rdtgroup_default, "mon_groups", 504 - &kn_mongrp); 505 - if (ret < 0) 506 - goto out_info; 507 - 508 - ret = mkdir_mondata_all(rdtgroup_default.kn, 509 - &rdtgroup_default, &kn_mondata); 510 - if (ret < 0) 511 - goto out_mongrp; 512 - rdtgroup_default.mon.mon_data_kn = kn_mondata; 513 - } 514 - 515 - ret = rdt_pseudo_lock_init(); 516 - if (ret) 517 - goto out_mondata; 518 - 519 - ret = kernfs_get_tree(fc); 520 - if (ret < 0) 521 - goto out_psl; 522 - 523 - if (resctrl_arch_alloc_capable()) 524 - resctrl_arch_enable_alloc(); 525 - if (resctrl_arch_mon_capable()) 526 - resctrl_arch_enable_mon(); 527 - 528 - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) 529 - resctrl_mounted = true; 530 - 531 - if (resctrl_is_mbm_enabled()) { 532 - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 533 - list_for_each_entry(dom, &r->mon_domains, hdr.list) 534 - mbm_setup_overflow_handler(dom, 
MBM_OVERFLOW_INTERVAL, 535 - RESCTRL_PICK_ANY_CPU); 536 - } 537 - 538 - goto out; 539 - 540 - out_psl: 541 - rdt_pseudo_lock_release(); 542 - out_mondata: 543 - if (resctrl_arch_mon_capable()) 544 - kernfs_remove(kn_mondata); 545 - out_mongrp: 546 - if (resctrl_arch_mon_capable()) 547 - kernfs_remove(kn_mongrp); 548 - out_info: 549 - kernfs_remove(kn_info); 550 - out_schemata_free: 551 - schemata_list_destroy(); 552 - out_ctx: 553 - rdt_disable_ctx(); 554 - out_root: 555 - rdtgroup_destroy_root(); 556 - out: 557 - rdt_last_cmd_clear(); 558 - mutex_unlock(&rdtgroup_mutex); 559 - cpus_read_unlock(); 560 - return ret; 561 - } 562 - 563 - enum rdt_param { 564 - Opt_cdp, 565 - Opt_cdpl2, 566 - Opt_mba_mbps, 567 - Opt_debug, 568 - nr__rdt_params 569 - }; 570 - 571 - static const struct fs_parameter_spec rdt_fs_parameters[] = { 572 - fsparam_flag("cdp", Opt_cdp), 573 - fsparam_flag("cdpl2", Opt_cdpl2), 574 - fsparam_flag("mba_MBps", Opt_mba_mbps), 575 - fsparam_flag("debug", Opt_debug), 576 - {} 577 - }; 578 - 579 - static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) 580 - { 581 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 582 - struct fs_parse_result result; 583 - const char *msg; 584 - int opt; 585 - 586 - opt = fs_parse(fc, rdt_fs_parameters, param, &result); 587 - if (opt < 0) 588 - return opt; 589 - 590 - switch (opt) { 591 - case Opt_cdp: 592 - ctx->enable_cdpl3 = true; 593 - return 0; 594 - case Opt_cdpl2: 595 - ctx->enable_cdpl2 = true; 596 - return 0; 597 - case Opt_mba_mbps: 598 - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; 599 - if (!supports_mba_mbps()) 600 - return invalfc(fc, msg); 601 - ctx->enable_mba_mbps = true; 602 - return 0; 603 - case Opt_debug: 604 - ctx->enable_debug = true; 605 - return 0; 606 - } 607 - 608 - return -EINVAL; 609 - } 610 - 611 - static void rdt_fs_context_free(struct fs_context *fc) 612 - { 613 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 614 - 615 - 
kernfs_free_fs_context(fc); 616 - kfree(ctx); 617 - } 618 - 619 - static const struct fs_context_operations rdt_fs_context_ops = { 620 - .free = rdt_fs_context_free, 621 - .parse_param = rdt_parse_param, 622 - .get_tree = rdt_get_tree, 623 - }; 624 - 625 - static int rdt_init_fs_context(struct fs_context *fc) 626 - { 627 - struct rdt_fs_context *ctx; 628 - 629 - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); 630 - if (!ctx) 631 - return -ENOMEM; 632 - 633 - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; 634 - fc->fs_private = &ctx->kfc; 635 - fc->ops = &rdt_fs_context_ops; 636 - put_user_ns(fc->user_ns); 637 - fc->user_ns = get_user_ns(&init_user_ns); 638 - fc->global = true; 639 - return 0; 2512 + return rdt_resources_all[l].cdp_enabled; 640 2513 } 641 2514 642 2515 void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) ··· 259 2952 } 260 2953 261 2954 return; 262 - } 263 - 264 - /* 265 - * Move tasks from one to the other group. If @from is NULL, then all tasks 266 - * in the systems are moved unconditionally (used for teardown). 267 - * 268 - * If @mask is not NULL the cpus on which moved tasks are running are set 269 - * in that mask so the update smp function call is restricted to affected 270 - * cpus. 271 - */ 272 - static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, 273 - struct cpumask *mask) 274 - { 275 - struct task_struct *p, *t; 276 - 277 - read_lock(&tasklist_lock); 278 - for_each_process_thread(p, t) { 279 - if (!from || is_closid_match(t, from) || 280 - is_rmid_match(t, from)) { 281 - resctrl_arch_set_closid_rmid(t, to->closid, 282 - to->mon.rmid); 283 - 284 - /* 285 - * Order the closid/rmid stores above before the loads 286 - * in task_curr(). This pairs with the full barrier 287 - * between the rq->curr update and resctrl_sched_in() 288 - * during context switch. 289 - */ 290 - smp_mb(); 291 - 292 - /* 293 - * If the task is on a CPU, set the CPU in the mask. 
294 - * The detection is inaccurate as tasks might move or 295 - * schedule before the smp function call takes place. 296 - * In such a case the function call is pointless, but 297 - * there is no other side effect. 298 - */ 299 - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) 300 - cpumask_set_cpu(task_cpu(t), mask); 301 - } 302 - } 303 - read_unlock(&tasklist_lock); 304 - } 305 - 306 - static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) 307 - { 308 - struct rdtgroup *sentry, *stmp; 309 - struct list_head *head; 310 - 311 - head = &rdtgrp->mon.crdtgrp_list; 312 - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { 313 - free_rmid(sentry->closid, sentry->mon.rmid); 314 - list_del(&sentry->mon.crdtgrp_list); 315 - 316 - if (atomic_read(&sentry->waitcount) != 0) 317 - sentry->flags = RDT_DELETED; 318 - else 319 - rdtgroup_remove(sentry); 320 - } 321 - } 322 - 323 - /* 324 - * Forcibly remove all of subdirectories under root. 325 - */ 326 - static void rmdir_all_sub(void) 327 - { 328 - struct rdtgroup *rdtgrp, *tmp; 329 - 330 - /* Move all tasks to the default resource group */ 331 - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 332 - 333 - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 334 - /* Free any child rmids */ 335 - free_all_child_rdtgrp(rdtgrp); 336 - 337 - /* Remove each rdtgroup other than root */ 338 - if (rdtgrp == &rdtgroup_default) 339 - continue; 340 - 341 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 342 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 343 - rdtgroup_pseudo_lock_remove(rdtgrp); 344 - 345 - /* 346 - * Give any CPUs back to the default group. We cannot copy 347 - * cpu_online_mask because a CPU might have executed the 348 - * offline callback already, but is still marked online. 
349 - */ 350 - cpumask_or(&rdtgroup_default.cpu_mask, 351 - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 352 - 353 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 354 - 355 - kernfs_remove(rdtgrp->kn); 356 - list_del(&rdtgrp->rdtgroup_list); 357 - 358 - if (atomic_read(&rdtgrp->waitcount) != 0) 359 - rdtgrp->flags = RDT_DELETED; 360 - else 361 - rdtgroup_remove(rdtgrp); 362 - } 363 - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 364 - update_closid_rmid(cpu_online_mask, &rdtgroup_default); 365 - 366 - kernfs_remove(kn_info); 367 - kernfs_remove(kn_mongrp); 368 - kernfs_remove(kn_mondata); 369 - } 370 - 371 - static void rdt_kill_sb(struct super_block *sb) 372 - { 373 - struct rdt_resource *r; 374 - 375 - cpus_read_lock(); 376 - mutex_lock(&rdtgroup_mutex); 377 - 378 - rdt_disable_ctx(); 379 - 380 - /* Put everything back to default values. */ 381 - for_each_alloc_capable_rdt_resource(r) 382 - resctrl_arch_reset_all_ctrls(r); 383 - 384 - rmdir_all_sub(); 385 - rdt_pseudo_lock_release(); 386 - rdtgroup_default.mode = RDT_MODE_SHAREABLE; 387 - schemata_list_destroy(); 388 - rdtgroup_destroy_root(); 389 - if (resctrl_arch_alloc_capable()) 390 - resctrl_arch_disable_alloc(); 391 - if (resctrl_arch_mon_capable()) 392 - resctrl_arch_disable_mon(); 393 - resctrl_mounted = false; 394 - kernfs_kill_sb(sb); 395 - mutex_unlock(&rdtgroup_mutex); 396 - cpus_read_unlock(); 397 - } 398 - 399 - static struct file_system_type rdt_fs_type = { 400 - .name = "resctrl", 401 - .init_fs_context = rdt_init_fs_context, 402 - .parameters = rdt_fs_parameters, 403 - .kill_sb = rdt_kill_sb, 404 - }; 405 - 406 - static int mon_addfile(struct kernfs_node *parent_kn, const char *name, 407 - void *priv) 408 - { 409 - struct kernfs_node *kn; 410 - int ret = 0; 411 - 412 - kn = __kernfs_create_file(parent_kn, name, 0444, 413 - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, 414 - &kf_mondata_ops, priv, NULL, NULL); 415 - if (IS_ERR(kn)) 416 - return PTR_ERR(kn); 417 - 418 - ret = 
rdtgroup_kn_set_ugid(kn); 419 - if (ret) { 420 - kernfs_remove(kn); 421 - return ret; 422 - } 423 - 424 - return ret; 425 - } 426 - 427 - static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) 428 - { 429 - struct kernfs_node *kn; 430 - 431 - kn = kernfs_find_and_get(pkn, name); 432 - if (!kn) 433 - return; 434 - kernfs_put(kn); 435 - 436 - if (kn->dir.subdirs <= 1) 437 - kernfs_remove(kn); 438 - else 439 - kernfs_remove_by_name(kn, subname); 440 - } 441 - 442 - /* 443 - * Remove all subdirectories of mon_data of ctrl_mon groups 444 - * and monitor groups for the given domain. 445 - * Remove files and directories containing "sum" of domain data 446 - * when last domain being summed is removed. 447 - */ 448 - static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 449 - struct rdt_mon_domain *d) 450 - { 451 - struct rdtgroup *prgrp, *crgrp; 452 - char subname[32]; 453 - bool snc_mode; 454 - char name[32]; 455 - 456 - snc_mode = r->mon_scope == RESCTRL_L3_NODE; 457 - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 458 - if (snc_mode) 459 - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); 460 - 461 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 462 - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); 463 - 464 - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) 465 - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); 466 - } 467 - } 468 - 469 - static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, 470 - struct rdt_resource *r, struct rdtgroup *prgrp, 471 - bool do_sum) 472 - { 473 - struct rmid_read rr = {0}; 474 - union mon_data_bits priv; 475 - struct mon_evt *mevt; 476 - int ret; 477 - 478 - if (WARN_ON(list_empty(&r->evt_list))) 479 - return -EPERM; 480 - 481 - priv.u.rid = r->rid; 482 - priv.u.domid = do_sum ? 
d->ci->id : d->hdr.id; 483 - priv.u.sum = do_sum; 484 - list_for_each_entry(mevt, &r->evt_list, list) { 485 - priv.u.evtid = mevt->evtid; 486 - ret = mon_addfile(kn, mevt->name, priv.priv); 487 - if (ret) 488 - return ret; 489 - 490 - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) 491 - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); 492 - } 493 - 494 - return 0; 495 - } 496 - 497 - static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, 498 - struct rdt_mon_domain *d, 499 - struct rdt_resource *r, struct rdtgroup *prgrp) 500 - { 501 - struct kernfs_node *kn, *ckn; 502 - char name[32]; 503 - bool snc_mode; 504 - int ret = 0; 505 - 506 - lockdep_assert_held(&rdtgroup_mutex); 507 - 508 - snc_mode = r->mon_scope == RESCTRL_L3_NODE; 509 - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 510 - kn = kernfs_find_and_get(parent_kn, name); 511 - if (kn) { 512 - /* 513 - * rdtgroup_mutex will prevent this directory from being 514 - * removed. No need to keep this hold. 
515 - */ 516 - kernfs_put(kn); 517 - } else { 518 - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 519 - if (IS_ERR(kn)) 520 - return PTR_ERR(kn); 521 - 522 - ret = rdtgroup_kn_set_ugid(kn); 523 - if (ret) 524 - goto out_destroy; 525 - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); 526 - if (ret) 527 - goto out_destroy; 528 - } 529 - 530 - if (snc_mode) { 531 - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); 532 - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); 533 - if (IS_ERR(ckn)) { 534 - ret = -EINVAL; 535 - goto out_destroy; 536 - } 537 - 538 - ret = rdtgroup_kn_set_ugid(ckn); 539 - if (ret) 540 - goto out_destroy; 541 - 542 - ret = mon_add_all_files(ckn, d, r, prgrp, false); 543 - if (ret) 544 - goto out_destroy; 545 - } 546 - 547 - kernfs_activate(kn); 548 - return 0; 549 - 550 - out_destroy: 551 - kernfs_remove(kn); 552 - return ret; 553 - } 554 - 555 - /* 556 - * Add all subdirectories of mon_data for "ctrl_mon" groups 557 - * and "monitor" groups with given domain id. 
558 - */ 559 - static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 560 - struct rdt_mon_domain *d) 561 - { 562 - struct kernfs_node *parent_kn; 563 - struct rdtgroup *prgrp, *crgrp; 564 - struct list_head *head; 565 - 566 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 567 - parent_kn = prgrp->mon.mon_data_kn; 568 - mkdir_mondata_subdir(parent_kn, d, r, prgrp); 569 - 570 - head = &prgrp->mon.crdtgrp_list; 571 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 572 - parent_kn = crgrp->mon.mon_data_kn; 573 - mkdir_mondata_subdir(parent_kn, d, r, crgrp); 574 - } 575 - } 576 - } 577 - 578 - static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, 579 - struct rdt_resource *r, 580 - struct rdtgroup *prgrp) 581 - { 582 - struct rdt_mon_domain *dom; 583 - int ret; 584 - 585 - /* Walking r->domains, ensure it can't race with cpuhp */ 586 - lockdep_assert_cpus_held(); 587 - 588 - list_for_each_entry(dom, &r->mon_domains, hdr.list) { 589 - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); 590 - if (ret) 591 - return ret; 592 - } 593 - 594 - return 0; 595 - } 596 - 597 - /* 598 - * This creates a directory mon_data which contains the monitored data. 599 - * 600 - * mon_data has one directory for each domain which are named 601 - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data 602 - * with L3 domain looks as below: 603 - * ./mon_data: 604 - * mon_L3_00 605 - * mon_L3_01 606 - * mon_L3_02 607 - * ... 608 - * 609 - * Each domain directory has one file per event: 610 - * ./mon_L3_00/: 611 - * llc_occupancy 612 - * 613 - */ 614 - static int mkdir_mondata_all(struct kernfs_node *parent_kn, 615 - struct rdtgroup *prgrp, 616 - struct kernfs_node **dest_kn) 617 - { 618 - struct rdt_resource *r; 619 - struct kernfs_node *kn; 620 - int ret; 621 - 622 - /* 623 - * Create the mon_data directory first. 
624 - */ 625 - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); 626 - if (ret) 627 - return ret; 628 - 629 - if (dest_kn) 630 - *dest_kn = kn; 631 - 632 - /* 633 - * Create the subdirectories for each domain. Note that all events 634 - * in a domain like L3 are grouped into a resource whose domain is L3 635 - */ 636 - for_each_mon_capable_rdt_resource(r) { 637 - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); 638 - if (ret) 639 - goto out_destroy; 640 - } 641 - 642 - return 0; 643 - 644 - out_destroy: 645 - kernfs_remove(kn); 646 - return ret; 647 - } 648 - 649 - /** 650 - * cbm_ensure_valid - Enforce validity on provided CBM 651 - * @_val: Candidate CBM 652 - * @r: RDT resource to which the CBM belongs 653 - * 654 - * The provided CBM represents all cache portions available for use. This 655 - * may be represented by a bitmap that does not consist of contiguous ones 656 - * and thus be an invalid CBM. 657 - * Here the provided CBM is forced to be a valid CBM by only considering 658 - * the first set of contiguous bits as valid and clearing all bits. 659 - * The intention here is to provide a valid default CBM with which a new 660 - * resource group is initialized. The user can follow this with a 661 - * modification to the CBM if the default does not satisfy the 662 - * requirements. 663 - */ 664 - static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) 665 - { 666 - unsigned int cbm_len = r->cache.cbm_len; 667 - unsigned long first_bit, zero_bit; 668 - unsigned long val = _val; 669 - 670 - if (!val) 671 - return 0; 672 - 673 - first_bit = find_first_bit(&val, cbm_len); 674 - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 675 - 676 - /* Clear any remaining bits to ensure contiguous region */ 677 - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); 678 - return (u32)val; 679 - } 680 - 681 - /* 682 - * Initialize cache resources per RDT domain 683 - * 684 - * Set the RDT domain up to start off with all usable allocations. 
That is, 685 - * all shareable and unused bits. All-zero CBM is invalid. 686 - */ 687 - static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, 688 - u32 closid) 689 - { 690 - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 691 - enum resctrl_conf_type t = s->conf_type; 692 - struct resctrl_staged_config *cfg; 693 - struct rdt_resource *r = s->res; 694 - u32 used_b = 0, unused_b = 0; 695 - unsigned long tmp_cbm; 696 - enum rdtgrp_mode mode; 697 - u32 peer_ctl, ctrl_val; 698 - int i; 699 - 700 - cfg = &d->staged_config[t]; 701 - cfg->have_new_ctrl = false; 702 - cfg->new_ctrl = r->cache.shareable_bits; 703 - used_b = r->cache.shareable_bits; 704 - for (i = 0; i < closids_supported(); i++) { 705 - if (closid_allocated(i) && i != closid) { 706 - mode = rdtgroup_mode_by_closid(i); 707 - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) 708 - /* 709 - * ctrl values for locksetup aren't relevant 710 - * until the schemata is written, and the mode 711 - * becomes RDT_MODE_PSEUDO_LOCKED. 712 - */ 713 - continue; 714 - /* 715 - * If CDP is active include peer domain's 716 - * usage to ensure there is no overlap 717 - * with an exclusive group. 718 - */ 719 - if (resctrl_arch_get_cdp_enabled(r->rid)) 720 - peer_ctl = resctrl_arch_get_config(r, d, i, 721 - peer_type); 722 - else 723 - peer_ctl = 0; 724 - ctrl_val = resctrl_arch_get_config(r, d, i, 725 - s->conf_type); 726 - used_b |= ctrl_val | peer_ctl; 727 - if (mode == RDT_MODE_SHAREABLE) 728 - cfg->new_ctrl |= ctrl_val | peer_ctl; 729 - } 730 - } 731 - if (d->plr && d->plr->cbm > 0) 732 - used_b |= d->plr->cbm; 733 - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); 734 - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; 735 - cfg->new_ctrl |= unused_b; 736 - /* 737 - * Force the initial CBM to be valid, user can 738 - * modify the CBM based on system availability. 
739 - */ 740 - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); 741 - /* 742 - * Assign the u32 CBM to an unsigned long to ensure that 743 - * bitmap_weight() does not access out-of-bound memory. 744 - */ 745 - tmp_cbm = cfg->new_ctrl; 746 - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { 747 - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); 748 - return -ENOSPC; 749 - } 750 - cfg->have_new_ctrl = true; 751 - 752 - return 0; 753 - } 754 - 755 - /* 756 - * Initialize cache resources with default values. 757 - * 758 - * A new RDT group is being created on an allocation capable (CAT) 759 - * supporting system. Set this group up to start off with all usable 760 - * allocations. 761 - * 762 - * If there are no more shareable bits available on any domain then 763 - * the entire allocation will fail. 764 - */ 765 - static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) 766 - { 767 - struct rdt_ctrl_domain *d; 768 - int ret; 769 - 770 - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { 771 - ret = __init_one_rdt_domain(d, s, closid); 772 - if (ret < 0) 773 - return ret; 774 - } 775 - 776 - return 0; 777 - } 778 - 779 - /* Initialize MBA resource with default values. */ 780 - static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) 781 - { 782 - struct resctrl_staged_config *cfg; 783 - struct rdt_ctrl_domain *d; 784 - 785 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 786 - if (is_mba_sc(r)) { 787 - d->mbps_val[closid] = MBA_MAX_MBPS; 788 - continue; 789 - } 790 - 791 - cfg = &d->staged_config[CDP_NONE]; 792 - cfg->new_ctrl = resctrl_get_default_ctrl(r); 793 - cfg->have_new_ctrl = true; 794 - } 795 - } 796 - 797 - /* Initialize the RDT group's allocations. 
*/ 798 - static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) 799 - { 800 - struct resctrl_schema *s; 801 - struct rdt_resource *r; 802 - int ret = 0; 803 - 804 - rdt_staged_configs_clear(); 805 - 806 - list_for_each_entry(s, &resctrl_schema_all, list) { 807 - r = s->res; 808 - if (r->rid == RDT_RESOURCE_MBA || 809 - r->rid == RDT_RESOURCE_SMBA) { 810 - rdtgroup_init_mba(r, rdtgrp->closid); 811 - if (is_mba_sc(r)) 812 - continue; 813 - } else { 814 - ret = rdtgroup_init_cat(s, rdtgrp->closid); 815 - if (ret < 0) 816 - goto out; 817 - } 818 - 819 - ret = resctrl_arch_update_domains(r, rdtgrp->closid); 820 - if (ret < 0) { 821 - rdt_last_cmd_puts("Failed to initialize allocations\n"); 822 - goto out; 823 - } 824 - 825 - } 826 - 827 - rdtgrp->mode = RDT_MODE_SHAREABLE; 828 - 829 - out: 830 - rdt_staged_configs_clear(); 831 - return ret; 832 - } 833 - 834 - static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) 835 - { 836 - int ret; 837 - 838 - if (!resctrl_arch_mon_capable()) 839 - return 0; 840 - 841 - ret = alloc_rmid(rdtgrp->closid); 842 - if (ret < 0) { 843 - rdt_last_cmd_puts("Out of RMIDs\n"); 844 - return ret; 845 - } 846 - rdtgrp->mon.rmid = ret; 847 - 848 - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); 849 - if (ret) { 850 - rdt_last_cmd_puts("kernfs subdir error\n"); 851 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 852 - return ret; 853 - } 854 - 855 - return 0; 856 - } 857 - 858 - static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) 859 - { 860 - if (resctrl_arch_mon_capable()) 861 - free_rmid(rgrp->closid, rgrp->mon.rmid); 862 - } 863 - 864 - /* 865 - * We allow creating mon groups only with in a directory called "mon_groups" 866 - * which is present in every ctrl_mon group. Check if this is a valid 867 - * "mon_groups" directory. 868 - * 869 - * 1. The directory should be named "mon_groups". 870 - * 2. The mon group itself should "not" be named "mon_groups". 
871 - * This makes sure "mon_groups" directory always has a ctrl_mon group 872 - * as parent. 873 - */ 874 - static bool is_mon_groups(struct kernfs_node *kn, const char *name) 875 - { 876 - return (!strcmp(rdt_kn_name(kn), "mon_groups") && 877 - strcmp(name, "mon_groups")); 878 - } 879 - 880 - static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, 881 - const char *name, umode_t mode, 882 - enum rdt_group_type rtype, struct rdtgroup **r) 883 - { 884 - struct rdtgroup *prdtgrp, *rdtgrp; 885 - unsigned long files = 0; 886 - struct kernfs_node *kn; 887 - int ret; 888 - 889 - prdtgrp = rdtgroup_kn_lock_live(parent_kn); 890 - if (!prdtgrp) { 891 - ret = -ENODEV; 892 - goto out_unlock; 893 - } 894 - 895 - /* 896 - * Check that the parent directory for a monitor group is a "mon_groups" 897 - * directory. 898 - */ 899 - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { 900 - ret = -EPERM; 901 - goto out_unlock; 902 - } 903 - 904 - if (rtype == RDTMON_GROUP && 905 - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 906 - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { 907 - ret = -EINVAL; 908 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 909 - goto out_unlock; 910 - } 911 - 912 - /* allocate the rdtgroup. */ 913 - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 914 - if (!rdtgrp) { 915 - ret = -ENOSPC; 916 - rdt_last_cmd_puts("Kernel out of memory\n"); 917 - goto out_unlock; 918 - } 919 - *r = rdtgrp; 920 - rdtgrp->mon.parent = prdtgrp; 921 - rdtgrp->type = rtype; 922 - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); 923 - 924 - /* kernfs creates the directory for rdtgrp */ 925 - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); 926 - if (IS_ERR(kn)) { 927 - ret = PTR_ERR(kn); 928 - rdt_last_cmd_puts("kernfs create error\n"); 929 - goto out_free_rgrp; 930 - } 931 - rdtgrp->kn = kn; 932 - 933 - /* 934 - * kernfs_remove() will drop the reference count on "kn" which 935 - * will free it. 
But we still need it to stick around for the 936 - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, 937 - * which will be dropped by kernfs_put() in rdtgroup_remove(). 938 - */ 939 - kernfs_get(kn); 940 - 941 - ret = rdtgroup_kn_set_ugid(kn); 942 - if (ret) { 943 - rdt_last_cmd_puts("kernfs perm error\n"); 944 - goto out_destroy; 945 - } 946 - 947 - if (rtype == RDTCTRL_GROUP) { 948 - files = RFTYPE_BASE | RFTYPE_CTRL; 949 - if (resctrl_arch_mon_capable()) 950 - files |= RFTYPE_MON; 951 - } else { 952 - files = RFTYPE_BASE | RFTYPE_MON; 953 - } 954 - 955 - ret = rdtgroup_add_files(kn, files); 956 - if (ret) { 957 - rdt_last_cmd_puts("kernfs fill error\n"); 958 - goto out_destroy; 959 - } 960 - 961 - /* 962 - * The caller unlocks the parent_kn upon success. 963 - */ 964 - return 0; 965 - 966 - out_destroy: 967 - kernfs_put(rdtgrp->kn); 968 - kernfs_remove(rdtgrp->kn); 969 - out_free_rgrp: 970 - kfree(rdtgrp); 971 - out_unlock: 972 - rdtgroup_kn_unlock(parent_kn); 973 - return ret; 974 - } 975 - 976 - static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) 977 - { 978 - kernfs_remove(rgrp->kn); 979 - rdtgroup_remove(rgrp); 980 - } 981 - 982 - /* 983 - * Create a monitor group under "mon_groups" directory of a control 984 - * and monitor group(ctrl_mon). This is a resource group 985 - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 
986 - */ 987 - static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, 988 - const char *name, umode_t mode) 989 - { 990 - struct rdtgroup *rdtgrp, *prgrp; 991 - int ret; 992 - 993 - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); 994 - if (ret) 995 - return ret; 996 - 997 - prgrp = rdtgrp->mon.parent; 998 - rdtgrp->closid = prgrp->closid; 999 - 1000 - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 1001 - if (ret) { 1002 - mkdir_rdt_prepare_clean(rdtgrp); 1003 - goto out_unlock; 1004 - } 1005 - 1006 - kernfs_activate(rdtgrp->kn); 1007 - 1008 - /* 1009 - * Add the rdtgrp to the list of rdtgrps the parent 1010 - * ctrl_mon group has to track. 1011 - */ 1012 - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); 1013 - 1014 - out_unlock: 1015 - rdtgroup_kn_unlock(parent_kn); 1016 - return ret; 1017 - } 1018 - 1019 - /* 1020 - * These are rdtgroups created under the root directory. Can be used 1021 - * to allocate and monitor resources. 1022 - */ 1023 - static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, 1024 - const char *name, umode_t mode) 1025 - { 1026 - struct rdtgroup *rdtgrp; 1027 - struct kernfs_node *kn; 1028 - u32 closid; 1029 - int ret; 1030 - 1031 - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); 1032 - if (ret) 1033 - return ret; 1034 - 1035 - kn = rdtgrp->kn; 1036 - ret = closid_alloc(); 1037 - if (ret < 0) { 1038 - rdt_last_cmd_puts("Out of CLOSIDs\n"); 1039 - goto out_common_fail; 1040 - } 1041 - closid = ret; 1042 - ret = 0; 1043 - 1044 - rdtgrp->closid = closid; 1045 - 1046 - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 1047 - if (ret) 1048 - goto out_closid_free; 1049 - 1050 - kernfs_activate(rdtgrp->kn); 1051 - 1052 - ret = rdtgroup_init_alloc(rdtgrp); 1053 - if (ret < 0) 1054 - goto out_rmid_free; 1055 - 1056 - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 1057 - 1058 - if (resctrl_arch_mon_capable()) { 1059 - /* 1060 - * Create an empty mon_groups directory to hold 
the subset 1061 - * of tasks and cpus to monitor. 1062 - */ 1063 - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); 1064 - if (ret) { 1065 - rdt_last_cmd_puts("kernfs subdir error\n"); 1066 - goto out_del_list; 1067 - } 1068 - if (is_mba_sc(NULL)) 1069 - rdtgrp->mba_mbps_event = mba_mbps_default_event; 1070 - } 1071 - 1072 - goto out_unlock; 1073 - 1074 - out_del_list: 1075 - list_del(&rdtgrp->rdtgroup_list); 1076 - out_rmid_free: 1077 - mkdir_rdt_prepare_rmid_free(rdtgrp); 1078 - out_closid_free: 1079 - closid_free(closid); 1080 - out_common_fail: 1081 - mkdir_rdt_prepare_clean(rdtgrp); 1082 - out_unlock: 1083 - rdtgroup_kn_unlock(parent_kn); 1084 - return ret; 1085 - } 1086 - 1087 - static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 1088 - umode_t mode) 1089 - { 1090 - /* Do not accept '\n' to avoid unparsable situation. */ 1091 - if (strchr(name, '\n')) 1092 - return -EINVAL; 1093 - 1094 - /* 1095 - * If the parent directory is the root directory and RDT 1096 - * allocation is supported, add a control and monitoring 1097 - * subdirectory 1098 - */ 1099 - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) 1100 - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); 1101 - 1102 - /* Else, attempt to add a monitoring subdirectory. */ 1103 - if (resctrl_arch_mon_capable()) 1104 - return rdtgroup_mkdir_mon(parent_kn, name, mode); 1105 - 1106 - return -EPERM; 1107 - } 1108 - 1109 - static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 1110 - { 1111 - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 1112 - u32 closid, rmid; 1113 - int cpu; 1114 - 1115 - /* Give any tasks back to the parent group */ 1116 - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); 1117 - 1118 - /* 1119 - * Update per cpu closid/rmid of the moved CPUs first. 1120 - * Note: the closid will not change, but the arch code still needs it. 
1121 - */ 1122 - closid = prdtgrp->closid; 1123 - rmid = prdtgrp->mon.rmid; 1124 - for_each_cpu(cpu, &rdtgrp->cpu_mask) 1125 - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 1126 - 1127 - /* 1128 - * Update the MSR on moved CPUs and CPUs which have moved 1129 - * task running on them. 1130 - */ 1131 - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1132 - update_closid_rmid(tmpmask, NULL); 1133 - 1134 - rdtgrp->flags = RDT_DELETED; 1135 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 1136 - 1137 - /* 1138 - * Remove the rdtgrp from the parent ctrl_mon group's list 1139 - */ 1140 - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 1141 - list_del(&rdtgrp->mon.crdtgrp_list); 1142 - 1143 - kernfs_remove(rdtgrp->kn); 1144 - 1145 - return 0; 1146 - } 1147 - 1148 - static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) 1149 - { 1150 - rdtgrp->flags = RDT_DELETED; 1151 - list_del(&rdtgrp->rdtgroup_list); 1152 - 1153 - kernfs_remove(rdtgrp->kn); 1154 - return 0; 1155 - } 1156 - 1157 - static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 1158 - { 1159 - u32 closid, rmid; 1160 - int cpu; 1161 - 1162 - /* Give any tasks back to the default group */ 1163 - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 1164 - 1165 - /* Give any CPUs back to the default group */ 1166 - cpumask_or(&rdtgroup_default.cpu_mask, 1167 - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 1168 - 1169 - /* Update per cpu closid and rmid of the moved CPUs first */ 1170 - closid = rdtgroup_default.closid; 1171 - rmid = rdtgroup_default.mon.rmid; 1172 - for_each_cpu(cpu, &rdtgrp->cpu_mask) 1173 - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 1174 - 1175 - /* 1176 - * Update the MSR on moved CPUs and CPUs which have moved 1177 - * task running on them. 
1178 - */ 1179 - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1180 - update_closid_rmid(tmpmask, NULL); 1181 - 1182 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 1183 - closid_free(rdtgrp->closid); 1184 - 1185 - rdtgroup_ctrl_remove(rdtgrp); 1186 - 1187 - /* 1188 - * Free all the child monitor group rmids. 1189 - */ 1190 - free_all_child_rdtgrp(rdtgrp); 1191 - 1192 - return 0; 1193 - } 1194 - 1195 - static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) 1196 - { 1197 - /* 1198 - * Valid within the RCU section it was obtained or while rdtgroup_mutex 1199 - * is held. 1200 - */ 1201 - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); 1202 - } 1203 - 1204 - static int rdtgroup_rmdir(struct kernfs_node *kn) 1205 - { 1206 - struct kernfs_node *parent_kn; 1207 - struct rdtgroup *rdtgrp; 1208 - cpumask_var_t tmpmask; 1209 - int ret = 0; 1210 - 1211 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 1212 - return -ENOMEM; 1213 - 1214 - rdtgrp = rdtgroup_kn_lock_live(kn); 1215 - if (!rdtgrp) { 1216 - ret = -EPERM; 1217 - goto out; 1218 - } 1219 - parent_kn = rdt_kn_parent(kn); 1220 - 1221 - /* 1222 - * If the rdtgroup is a ctrl_mon group and parent directory 1223 - * is the root directory, remove the ctrl_mon group. 1224 - * 1225 - * If the rdtgroup is a mon group and parent directory 1226 - * is a valid "mon_groups" directory, remove the mon group. 
1227 - */ 1228 - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && 1229 - rdtgrp != &rdtgroup_default) { 1230 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 1231 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1232 - ret = rdtgroup_ctrl_remove(rdtgrp); 1233 - } else { 1234 - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); 1235 - } 1236 - } else if (rdtgrp->type == RDTMON_GROUP && 1237 - is_mon_groups(parent_kn, rdt_kn_name(kn))) { 1238 - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); 1239 - } else { 1240 - ret = -EPERM; 1241 - } 1242 - 1243 - out: 1244 - rdtgroup_kn_unlock(kn); 1245 - free_cpumask_var(tmpmask); 1246 - return ret; 1247 - } 1248 - 1249 - /** 1250 - * mongrp_reparent() - replace parent CTRL_MON group of a MON group 1251 - * @rdtgrp: the MON group whose parent should be replaced 1252 - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp 1253 - * @cpus: cpumask provided by the caller for use during this call 1254 - * 1255 - * Replaces the parent CTRL_MON group for a MON group, resulting in all member 1256 - * tasks' CLOSID immediately changing to that of the new parent group. 1257 - * Monitoring data for the group is unaffected by this operation. 1258 - */ 1259 - static void mongrp_reparent(struct rdtgroup *rdtgrp, 1260 - struct rdtgroup *new_prdtgrp, 1261 - cpumask_var_t cpus) 1262 - { 1263 - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 1264 - 1265 - WARN_ON(rdtgrp->type != RDTMON_GROUP); 1266 - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); 1267 - 1268 - /* Nothing to do when simply renaming a MON group. */ 1269 - if (prdtgrp == new_prdtgrp) 1270 - return; 1271 - 1272 - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 1273 - list_move_tail(&rdtgrp->mon.crdtgrp_list, 1274 - &new_prdtgrp->mon.crdtgrp_list); 1275 - 1276 - rdtgrp->mon.parent = new_prdtgrp; 1277 - rdtgrp->closid = new_prdtgrp->closid; 1278 - 1279 - /* Propagate updated closid to all tasks in this group. 
*/ 1280 - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); 1281 - 1282 - update_closid_rmid(cpus, NULL); 1283 - } 1284 - 1285 - static int rdtgroup_rename(struct kernfs_node *kn, 1286 - struct kernfs_node *new_parent, const char *new_name) 1287 - { 1288 - struct kernfs_node *kn_parent; 1289 - struct rdtgroup *new_prdtgrp; 1290 - struct rdtgroup *rdtgrp; 1291 - cpumask_var_t tmpmask; 1292 - int ret; 1293 - 1294 - rdtgrp = kernfs_to_rdtgroup(kn); 1295 - new_prdtgrp = kernfs_to_rdtgroup(new_parent); 1296 - if (!rdtgrp || !new_prdtgrp) 1297 - return -ENOENT; 1298 - 1299 - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ 1300 - rdtgroup_kn_get(rdtgrp, kn); 1301 - rdtgroup_kn_get(new_prdtgrp, new_parent); 1302 - 1303 - mutex_lock(&rdtgroup_mutex); 1304 - 1305 - rdt_last_cmd_clear(); 1306 - 1307 - /* 1308 - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if 1309 - * either kernfs_node is a file. 1310 - */ 1311 - if (kernfs_type(kn) != KERNFS_DIR || 1312 - kernfs_type(new_parent) != KERNFS_DIR) { 1313 - rdt_last_cmd_puts("Source and destination must be directories"); 1314 - ret = -EPERM; 1315 - goto out; 1316 - } 1317 - 1318 - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { 1319 - ret = -ENOENT; 1320 - goto out; 1321 - } 1322 - 1323 - kn_parent = rdt_kn_parent(kn); 1324 - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || 1325 - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { 1326 - rdt_last_cmd_puts("Source must be a MON group\n"); 1327 - ret = -EPERM; 1328 - goto out; 1329 - } 1330 - 1331 - if (!is_mon_groups(new_parent, new_name)) { 1332 - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); 1333 - ret = -EPERM; 1334 - goto out; 1335 - } 1336 - 1337 - /* 1338 - * If the MON group is monitoring CPUs, the CPUs must be assigned to the 1339 - * current parent CTRL_MON group and therefore cannot be assigned to 1340 - * the new parent, making the move illegal. 
1341 - */ 1342 - if (!cpumask_empty(&rdtgrp->cpu_mask) && 1343 - rdtgrp->mon.parent != new_prdtgrp) { 1344 - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); 1345 - ret = -EPERM; 1346 - goto out; 1347 - } 1348 - 1349 - /* 1350 - * Allocate the cpumask for use in mongrp_reparent() to avoid the 1351 - * possibility of failing to allocate it after kernfs_rename() has 1352 - * succeeded. 1353 - */ 1354 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { 1355 - ret = -ENOMEM; 1356 - goto out; 1357 - } 1358 - 1359 - /* 1360 - * Perform all input validation and allocations needed to ensure 1361 - * mongrp_reparent() will succeed before calling kernfs_rename(), 1362 - * otherwise it would be necessary to revert this call if 1363 - * mongrp_reparent() failed. 1364 - */ 1365 - ret = kernfs_rename(kn, new_parent, new_name); 1366 - if (!ret) 1367 - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); 1368 - 1369 - free_cpumask_var(tmpmask); 1370 - 1371 - out: 1372 - mutex_unlock(&rdtgroup_mutex); 1373 - rdtgroup_kn_put(rdtgrp, kn); 1374 - rdtgroup_kn_put(new_prdtgrp, new_parent); 1375 - return ret; 1376 - } 1377 - 1378 - static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 1379 - { 1380 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) 1381 - seq_puts(seq, ",cdp"); 1382 - 1383 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) 1384 - seq_puts(seq, ",cdpl2"); 1385 - 1386 - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) 1387 - seq_puts(seq, ",mba_MBps"); 1388 - 1389 - if (resctrl_debug) 1390 - seq_puts(seq, ",debug"); 1391 - 1392 - return 0; 1393 - } 1394 - 1395 - static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { 1396 - .mkdir = rdtgroup_mkdir, 1397 - .rmdir = rdtgroup_rmdir, 1398 - .rename = rdtgroup_rename, 1399 - .show_options = rdtgroup_show_options, 1400 - }; 1401 - 1402 - static int rdtgroup_setup_root(struct rdt_fs_context *ctx) 1403 - { 1404 - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, 1405 - 
KERNFS_ROOT_CREATE_DEACTIVATED | 1406 - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, 1407 - &rdtgroup_default); 1408 - if (IS_ERR(rdt_root)) 1409 - return PTR_ERR(rdt_root); 1410 - 1411 - ctx->kfc.root = rdt_root; 1412 - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); 1413 - 1414 - return 0; 1415 - } 1416 - 1417 - static void rdtgroup_destroy_root(void) 1418 - { 1419 - kernfs_destroy_root(rdt_root); 1420 - rdtgroup_default.kn = NULL; 1421 - } 1422 - 1423 - static void __init rdtgroup_setup_default(void) 1424 - { 1425 - mutex_lock(&rdtgroup_mutex); 1426 - 1427 - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; 1428 - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; 1429 - rdtgroup_default.type = RDTCTRL_GROUP; 1430 - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); 1431 - 1432 - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 1433 - 1434 - mutex_unlock(&rdtgroup_mutex); 1435 - } 1436 - 1437 - static void domain_destroy_mon_state(struct rdt_mon_domain *d) 1438 - { 1439 - bitmap_free(d->rmid_busy_llc); 1440 - kfree(d->mbm_total); 1441 - kfree(d->mbm_local); 1442 - } 1443 - 1444 - void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 1445 - { 1446 - mutex_lock(&rdtgroup_mutex); 1447 - 1448 - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) 1449 - mba_sc_domain_destroy(r, d); 1450 - 1451 - mutex_unlock(&rdtgroup_mutex); 1452 - } 1453 - 1454 - void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 1455 - { 1456 - mutex_lock(&rdtgroup_mutex); 1457 - 1458 - /* 1459 - * If resctrl is mounted, remove all the 1460 - * per domain monitor data directories. 
1461 - */ 1462 - if (resctrl_mounted && resctrl_arch_mon_capable()) 1463 - rmdir_mondata_subdir_allrdtgrp(r, d); 1464 - 1465 - if (resctrl_is_mbm_enabled()) 1466 - cancel_delayed_work(&d->mbm_over); 1467 - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { 1468 - /* 1469 - * When a package is going down, forcefully 1470 - * decrement rmid->ebusy. There is no way to know 1471 - * that the L3 was flushed and hence may lead to 1472 - * incorrect counts in rare scenarios, but leaving 1473 - * the RMID as busy creates RMID leaks if the 1474 - * package never comes back. 1475 - */ 1476 - __check_limbo(d, true); 1477 - cancel_delayed_work(&d->cqm_limbo); 1478 - } 1479 - 1480 - domain_destroy_mon_state(d); 1481 - 1482 - mutex_unlock(&rdtgroup_mutex); 1483 - } 1484 - 1485 - /** 1486 - * domain_setup_mon_state() - Initialise domain monitoring structures. 1487 - * @r: The resource for the newly online domain. 1488 - * @d: The newly online domain. 1489 - * 1490 - * Allocate monitor resources that belong to this domain. 1491 - * Called when the first CPU of a domain comes online, regardless of whether 1492 - * the filesystem is mounted. 1493 - * During boot this may be called before global allocations have been made by 1494 - * resctrl_mon_resource_init(). 1495 - * 1496 - * Returns 0 for success, or -ENOMEM. 
1497 - */ 1498 - static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) 1499 - { 1500 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 1501 - size_t tsize; 1502 - 1503 - if (resctrl_arch_is_llc_occupancy_enabled()) { 1504 - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); 1505 - if (!d->rmid_busy_llc) 1506 - return -ENOMEM; 1507 - } 1508 - if (resctrl_arch_is_mbm_total_enabled()) { 1509 - tsize = sizeof(*d->mbm_total); 1510 - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); 1511 - if (!d->mbm_total) { 1512 - bitmap_free(d->rmid_busy_llc); 1513 - return -ENOMEM; 1514 - } 1515 - } 1516 - if (resctrl_arch_is_mbm_local_enabled()) { 1517 - tsize = sizeof(*d->mbm_local); 1518 - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); 1519 - if (!d->mbm_local) { 1520 - bitmap_free(d->rmid_busy_llc); 1521 - kfree(d->mbm_total); 1522 - return -ENOMEM; 1523 - } 1524 - } 1525 - 1526 - return 0; 1527 - } 1528 - 1529 - int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 1530 - { 1531 - int err = 0; 1532 - 1533 - mutex_lock(&rdtgroup_mutex); 1534 - 1535 - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { 1536 - /* RDT_RESOURCE_MBA is never mon_capable */ 1537 - err = mba_sc_domain_allocate(r, d); 1538 - } 1539 - 1540 - mutex_unlock(&rdtgroup_mutex); 1541 - 1542 - return err; 1543 - } 1544 - 1545 - int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 1546 - { 1547 - int err; 1548 - 1549 - mutex_lock(&rdtgroup_mutex); 1550 - 1551 - err = domain_setup_mon_state(r, d); 1552 - if (err) 1553 - goto out_unlock; 1554 - 1555 - if (resctrl_is_mbm_enabled()) { 1556 - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); 1557 - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, 1558 - RESCTRL_PICK_ANY_CPU); 1559 - } 1560 - 1561 - if (resctrl_arch_is_llc_occupancy_enabled()) 1562 - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); 1563 - 1564 - /* 1565 - * If the filesystem is not 
mounted then only the default resource group 1566 - * exists. Creation of its directories is deferred until mount time 1567 - * by rdt_get_tree() calling mkdir_mondata_all(). 1568 - * If resctrl is mounted, add per domain monitor data directories. 1569 - */ 1570 - if (resctrl_mounted && resctrl_arch_mon_capable()) 1571 - mkdir_mondata_subdir_allrdtgrp(r, d); 1572 - 1573 - out_unlock: 1574 - mutex_unlock(&rdtgroup_mutex); 1575 - 1576 - return err; 1577 - } 1578 - 1579 - void resctrl_online_cpu(unsigned int cpu) 1580 - { 1581 - mutex_lock(&rdtgroup_mutex); 1582 - /* The CPU is set in default rdtgroup after online. */ 1583 - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 1584 - mutex_unlock(&rdtgroup_mutex); 1585 - } 1586 - 1587 - static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) 1588 - { 1589 - struct rdtgroup *cr; 1590 - 1591 - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { 1592 - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) 1593 - break; 1594 - } 1595 - } 1596 - 1597 - static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, 1598 - struct rdt_resource *r) 1599 - { 1600 - struct rdt_mon_domain *d; 1601 - 1602 - lockdep_assert_cpus_held(); 1603 - 1604 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 1605 - /* Find the domain that contains this CPU */ 1606 - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 1607 - return d; 1608 - } 1609 - 1610 - return NULL; 1611 - } 1612 - 1613 - void resctrl_offline_cpu(unsigned int cpu) 1614 - { 1615 - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); 1616 - struct rdt_mon_domain *d; 1617 - struct rdtgroup *rdtgrp; 1618 - 1619 - mutex_lock(&rdtgroup_mutex); 1620 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 1621 - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { 1622 - clear_childcpus(rdtgrp, cpu); 1623 - break; 1624 - } 1625 - } 1626 - 1627 - if (!l3->mon_capable) 1628 - goto out_unlock; 1629 - 1630 - d = get_mon_domain_from_cpu(cpu, 
l3); 1631 - if (d) { 1632 - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { 1633 - cancel_delayed_work(&d->mbm_over); 1634 - mbm_setup_overflow_handler(d, 0, cpu); 1635 - } 1636 - if (resctrl_arch_is_llc_occupancy_enabled() && 1637 - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { 1638 - cancel_delayed_work(&d->cqm_limbo); 1639 - cqm_setup_limbo_handler(d, 0, cpu); 1640 - } 1641 - } 1642 - 1643 - out_unlock: 1644 - mutex_unlock(&rdtgroup_mutex); 1645 - } 1646 - 1647 - /* 1648 - * resctrl_init - resctrl filesystem initialization 1649 - * 1650 - * Setup resctrl file system including set up root, create mount point, 1651 - * register resctrl filesystem, and initialize files under root directory. 1652 - * 1653 - * Return: 0 on success or -errno 1654 - */ 1655 - int __init resctrl_init(void) 1656 - { 1657 - int ret = 0; 1658 - 1659 - seq_buf_init(&last_cmd_status, last_cmd_status_buf, 1660 - sizeof(last_cmd_status_buf)); 1661 - 1662 - rdtgroup_setup_default(); 1663 - 1664 - thread_throttle_mode_init(); 1665 - 1666 - ret = resctrl_mon_resource_init(); 1667 - if (ret) 1668 - return ret; 1669 - 1670 - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); 1671 - if (ret) { 1672 - resctrl_mon_resource_exit(); 1673 - return ret; 1674 - } 1675 - 1676 - ret = register_filesystem(&rdt_fs_type); 1677 - if (ret) 1678 - goto cleanup_mountpoint; 1679 - 1680 - /* 1681 - * Adding the resctrl debugfs directory here may not be ideal since 1682 - * it would let the resctrl debugfs directory appear on the debugfs 1683 - * filesystem before the resctrl filesystem is mounted. 1684 - * It may also be ok since that would enable debugging of RDT before 1685 - * resctrl is mounted. 1686 - * The reason why the debugfs directory is created here and not in 1687 - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and 1688 - * during the debugfs directory creation also &sb->s_type->i_mutex_key 1689 - * (the lockdep class of inode->i_rwsem). 
Other filesystem 1690 - * interactions (eg. SyS_getdents) have the lock ordering: 1691 - * &sb->s_type->i_mutex_key --> &mm->mmap_lock 1692 - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex 1693 - * is taken, thus creating dependency: 1694 - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause 1695 - * issues considering the other two lock dependencies. 1696 - * By creating the debugfs directory here we avoid a dependency 1697 - * that may cause deadlock (even though file operations cannot 1698 - * occur until the filesystem is mounted, but I do not know how to 1699 - * tell lockdep that). 1700 - */ 1701 - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); 1702 - 1703 - return 0; 1704 - 1705 - cleanup_mountpoint: 1706 - sysfs_remove_mount_point(fs_kobj, "resctrl"); 1707 - resctrl_mon_resource_exit(); 1708 - 1709 - return ret; 1710 - } 1711 - 1712 - void __exit resctrl_exit(void) 1713 - { 1714 - debugfs_remove_recursive(debugfs_resctrl); 1715 - unregister_filesystem(&rdt_fs_type); 1716 - sysfs_remove_mount_point(fs_kobj, "resctrl"); 1717 - 1718 - resctrl_mon_resource_exit(); 1719 2955 }

+6 -20

arch/x86/kernel/cpu/resctrl/trace.h arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h

··· 2 2 #undef TRACE_SYSTEM 3 3 #define TRACE_SYSTEM resctrl 4 4 5 - #if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) 6 - #define _TRACE_RESCTRL_H 5 + #if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) 6 + #define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H 7 7 8 8 #include <linux/tracepoint.h> 9 9 ··· 35 35 TP_printk("hits=%llu miss=%llu", 36 36 __entry->l3_hits, __entry->l3_miss)); 37 37 38 - TRACE_EVENT(mon_llc_occupancy_limbo, 39 - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), 40 - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), 41 - TP_STRUCT__entry(__field(u32, ctrl_hw_id) 42 - __field(u32, mon_hw_id) 43 - __field(int, domain_id) 44 - __field(u64, llc_occupancy_bytes)), 45 - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; 46 - __entry->mon_hw_id = mon_hw_id; 47 - __entry->domain_id = domain_id; 48 - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), 49 - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", 50 - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, 51 - __entry->llc_occupancy_bytes) 52 - ); 53 - 54 - #endif /* _TRACE_RESCTRL_H */ 38 + #endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ 55 39 56 40 #undef TRACE_INCLUDE_PATH 57 41 #define TRACE_INCLUDE_PATH . 58 - #define TRACE_INCLUDE_FILE trace 42 + 43 + #define TRACE_INCLUDE_FILE pseudo_lock_trace 44 + 59 45 #include <trace/define_trace.h>

+1 -1

arch/x86/kernel/process_32.c

··· 208 208 raw_cpu_write(current_task, next_p); 209 209 210 210 /* Load the Intel cache allocation PQR MSR. */ 211 - resctrl_sched_in(next_p); 211 + resctrl_arch_sched_in(next_p); 212 212 213 213 return prev_p; 214 214 }

+1 -1

arch/x86/kernel/process_64.c

··· 705 705 } 706 706 707 707 /* Load the Intel cache allocation PQR MSR. */ 708 - resctrl_sched_in(next_p); 708 + resctrl_arch_sched_in(next_p); 709 709 710 710 return prev_p; 711 711 }

+1

fs/Kconfig

··· 335 335 source "fs/hpfs/Kconfig" 336 336 source "fs/qnx4/Kconfig" 337 337 source "fs/qnx6/Kconfig" 338 + source "fs/resctrl/Kconfig" 338 339 source "fs/romfs/Kconfig" 339 340 source "fs/pstore/Kconfig" 340 341 source "fs/ufs/Kconfig"

+1

fs/Makefile

··· 128 128 obj-$(CONFIG_VBOXSF_FS) += vboxsf/ 129 129 obj-$(CONFIG_ZONEFS_FS) += zonefs/ 130 130 obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o 131 + obj-$(CONFIG_RESCTRL_FS) += resctrl/

+39

fs/resctrl/Kconfig

··· 1 + config RESCTRL_FS 2 + bool "CPU Resource Control Filesystem (resctrl)" 3 + depends on ARCH_HAS_CPU_RESCTRL 4 + select KERNFS 5 + select PROC_CPU_RESCTRL if PROC_FS 6 + help 7 + Some architectures provide hardware facilities to group tasks and 8 + monitor and control their usage of memory system resources such as 9 + caches and memory bandwidth. Examples of such facilities include 10 + Intel's Resource Director Technology (Intel(R) RDT) and AMD's 11 + Platform Quality of Service (AMD QoS). 12 + 13 + If your system has the necessary support and you want to be able to 14 + assign tasks to groups and manipulate the associated resource 15 + monitors and controls from userspace, say Y here to get a mountable 16 + 'resctrl' filesystem that lets you do just that. 17 + 18 + If nothing mounts or prods the 'resctrl' filesystem, resource 19 + controls and monitors are left in a quiescent, permissive state. 20 + 21 + On architectures where this can be disabled independently, it is 22 + safe to say N. 23 + 24 + See <file:Documentation/filesystems/resctrl.rst> for more information. 25 + 26 + config RESCTRL_FS_PSEUDO_LOCK 27 + bool 28 + depends on RESCTRL_FS 29 + help 30 + Software mechanism to pin data in a cache portion using 31 + micro-architecture specific knowledge. 32 + 33 + config RESCTRL_RMID_DEPENDS_ON_CLOSID 34 + bool 35 + depends on RESCTRL_FS 36 + help 37 + Enabled by the architecture when the RMID values depend on the CLOSID. 38 + This causes the CLOSID allocator to search for CLOSID with clean 39 + RMID.

+6

fs/resctrl/Makefile

··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o 3 + obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o 4 + 5 + # To allow define_trace.h's recursive include: 6 + CFLAGS_monitor.o = -I$(src)

+661

fs/resctrl/ctrlmondata.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Resource Director Technology(RDT) 4 + * - Cache Allocation code. 5 + * 6 + * Copyright (C) 2016 Intel Corporation 7 + * 8 + * Authors: 9 + * Fenghua Yu <fenghua.yu@intel.com> 10 + * Tony Luck <tony.luck@intel.com> 11 + * 12 + * More information about RDT be found in the Intel (R) x86 Architecture 13 + * Software Developer Manual June 2016, volume 3, section 17.17. 14 + */ 15 + 16 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 + 18 + #include <linux/cpu.h> 19 + #include <linux/kernfs.h> 20 + #include <linux/seq_file.h> 21 + #include <linux/slab.h> 22 + #include <linux/tick.h> 23 + 24 + #include "internal.h" 25 + 26 + struct rdt_parse_data { 27 + struct rdtgroup *rdtgrp; 28 + char *buf; 29 + }; 30 + 31 + typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, 32 + struct resctrl_schema *s, 33 + struct rdt_ctrl_domain *d); 34 + 35 + /* 36 + * Check whether MBA bandwidth percentage value is correct. The value is 37 + * checked against the minimum and max bandwidth values specified by the 38 + * hardware. The allocated bandwidth percentage is rounded to the next 39 + * control step available on the hardware. 40 + */ 41 + static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) 42 + { 43 + int ret; 44 + u32 bw; 45 + 46 + /* 47 + * Only linear delay values is supported for current Intel SKUs. 48 + */ 49 + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { 50 + rdt_last_cmd_puts("No support for non-linear MB domains\n"); 51 + return false; 52 + } 53 + 54 + ret = kstrtou32(buf, 10, &bw); 55 + if (ret) { 56 + rdt_last_cmd_printf("Invalid MB value %s\n", buf); 57 + return false; 58 + } 59 + 60 + /* Nothing else to do if software controller is enabled. 
*/ 61 + if (is_mba_sc(r)) { 62 + *data = bw; 63 + return true; 64 + } 65 + 66 + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { 67 + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", 68 + bw, r->membw.min_bw, r->membw.max_bw); 69 + return false; 70 + } 71 + 72 + *data = roundup(bw, (unsigned long)r->membw.bw_gran); 73 + return true; 74 + } 75 + 76 + static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, 77 + struct rdt_ctrl_domain *d) 78 + { 79 + struct resctrl_staged_config *cfg; 80 + u32 closid = data->rdtgrp->closid; 81 + struct rdt_resource *r = s->res; 82 + u32 bw_val; 83 + 84 + cfg = &d->staged_config[s->conf_type]; 85 + if (cfg->have_new_ctrl) { 86 + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 87 + return -EINVAL; 88 + } 89 + 90 + if (!bw_validate(data->buf, &bw_val, r)) 91 + return -EINVAL; 92 + 93 + if (is_mba_sc(r)) { 94 + d->mbps_val[closid] = bw_val; 95 + return 0; 96 + } 97 + 98 + cfg->new_ctrl = bw_val; 99 + cfg->have_new_ctrl = true; 100 + 101 + return 0; 102 + } 103 + 104 + /* 105 + * Check whether a cache bit mask is valid. 106 + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: 107 + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 108 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 109 + * 110 + * Haswell does not support a non-contiguous 1s value and additionally 111 + * requires at least two bits set. 112 + * AMD allows non-contiguous bitmasks. 
113 + */ 114 + static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) 115 + { 116 + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; 117 + unsigned int cbm_len = r->cache.cbm_len; 118 + unsigned long first_bit, zero_bit, val; 119 + int ret; 120 + 121 + ret = kstrtoul(buf, 16, &val); 122 + if (ret) { 123 + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); 124 + return false; 125 + } 126 + 127 + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { 128 + rdt_last_cmd_puts("Mask out of range\n"); 129 + return false; 130 + } 131 + 132 + first_bit = find_first_bit(&val, cbm_len); 133 + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 134 + 135 + /* Are non-contiguous bitmasks allowed? */ 136 + if (!r->cache.arch_has_sparse_bitmasks && 137 + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { 138 + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); 139 + return false; 140 + } 141 + 142 + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { 143 + rdt_last_cmd_printf("Need at least %d bits in the mask\n", 144 + r->cache.min_cbm_bits); 145 + return false; 146 + } 147 + 148 + *data = val; 149 + return true; 150 + } 151 + 152 + /* 153 + * Read one cache bit mask (hex). Check that it is valid for the current 154 + * resource type. 155 + */ 156 + static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, 157 + struct rdt_ctrl_domain *d) 158 + { 159 + struct rdtgroup *rdtgrp = data->rdtgrp; 160 + struct resctrl_staged_config *cfg; 161 + struct rdt_resource *r = s->res; 162 + u32 cbm_val; 163 + 164 + cfg = &d->staged_config[s->conf_type]; 165 + if (cfg->have_new_ctrl) { 166 + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 167 + return -EINVAL; 168 + } 169 + 170 + /* 171 + * Cannot set up more than one pseudo-locked region in a cache 172 + * hierarchy. 
173 + */ 174 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 175 + rdtgroup_pseudo_locked_in_hierarchy(d)) { 176 + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); 177 + return -EINVAL; 178 + } 179 + 180 + if (!cbm_validate(data->buf, &cbm_val, r)) 181 + return -EINVAL; 182 + 183 + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || 184 + rdtgrp->mode == RDT_MODE_SHAREABLE) && 185 + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { 186 + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); 187 + return -EINVAL; 188 + } 189 + 190 + /* 191 + * The CBM may not overlap with the CBM of another closid if 192 + * either is exclusive. 193 + */ 194 + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { 195 + rdt_last_cmd_puts("Overlaps with exclusive group\n"); 196 + return -EINVAL; 197 + } 198 + 199 + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { 200 + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || 201 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 202 + rdt_last_cmd_puts("Overlaps with other group\n"); 203 + return -EINVAL; 204 + } 205 + } 206 + 207 + cfg->new_ctrl = cbm_val; 208 + cfg->have_new_ctrl = true; 209 + 210 + return 0; 211 + } 212 + 213 + /* 214 + * For each domain in this resource we expect to find a series of: 215 + * id=mask 216 + * separated by ";". The "id" is in decimal, and must match one of 217 + * the "id"s for this resource. 
218 + */ 219 + static int parse_line(char *line, struct resctrl_schema *s, 220 + struct rdtgroup *rdtgrp) 221 + { 222 + enum resctrl_conf_type t = s->conf_type; 223 + ctrlval_parser_t *parse_ctrlval = NULL; 224 + struct resctrl_staged_config *cfg; 225 + struct rdt_resource *r = s->res; 226 + struct rdt_parse_data data; 227 + struct rdt_ctrl_domain *d; 228 + char *dom = NULL, *id; 229 + unsigned long dom_id; 230 + 231 + /* Walking r->domains, ensure it can't race with cpuhp */ 232 + lockdep_assert_cpus_held(); 233 + 234 + switch (r->schema_fmt) { 235 + case RESCTRL_SCHEMA_BITMAP: 236 + parse_ctrlval = &parse_cbm; 237 + break; 238 + case RESCTRL_SCHEMA_RANGE: 239 + parse_ctrlval = &parse_bw; 240 + break; 241 + } 242 + 243 + if (WARN_ON_ONCE(!parse_ctrlval)) 244 + return -EINVAL; 245 + 246 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 247 + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { 248 + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); 249 + return -EINVAL; 250 + } 251 + 252 + next: 253 + if (!line || line[0] == '\0') 254 + return 0; 255 + dom = strsep(&line, ";"); 256 + id = strsep(&dom, "="); 257 + if (!dom || kstrtoul(id, 10, &dom_id)) { 258 + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); 259 + return -EINVAL; 260 + } 261 + dom = strim(dom); 262 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 263 + if (d->hdr.id == dom_id) { 264 + data.buf = dom; 265 + data.rdtgrp = rdtgrp; 266 + if (parse_ctrlval(&data, s, d)) 267 + return -EINVAL; 268 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 269 + cfg = &d->staged_config[t]; 270 + /* 271 + * In pseudo-locking setup mode and just 272 + * parsed a valid CBM that should be 273 + * pseudo-locked. Only one locked region per 274 + * resource group and domain so just do 275 + * the required initialization for single 276 + * region and return. 
277 + */ 278 + rdtgrp->plr->s = s; 279 + rdtgrp->plr->d = d; 280 + rdtgrp->plr->cbm = cfg->new_ctrl; 281 + d->plr = rdtgrp->plr; 282 + return 0; 283 + } 284 + goto next; 285 + } 286 + } 287 + return -EINVAL; 288 + } 289 + 290 + static int rdtgroup_parse_resource(char *resname, char *tok, 291 + struct rdtgroup *rdtgrp) 292 + { 293 + struct resctrl_schema *s; 294 + 295 + list_for_each_entry(s, &resctrl_schema_all, list) { 296 + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) 297 + return parse_line(tok, s, rdtgrp); 298 + } 299 + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); 300 + return -EINVAL; 301 + } 302 + 303 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 304 + char *buf, size_t nbytes, loff_t off) 305 + { 306 + struct resctrl_schema *s; 307 + struct rdtgroup *rdtgrp; 308 + struct rdt_resource *r; 309 + char *tok, *resname; 310 + int ret = 0; 311 + 312 + /* Valid input requires a trailing newline */ 313 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 314 + return -EINVAL; 315 + buf[nbytes - 1] = '\0'; 316 + 317 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 318 + if (!rdtgrp) { 319 + rdtgroup_kn_unlock(of->kn); 320 + return -ENOENT; 321 + } 322 + rdt_last_cmd_clear(); 323 + 324 + /* 325 + * No changes to pseudo-locked region allowed. It has to be removed 326 + * and re-created instead. 
327 + */ 328 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 329 + ret = -EINVAL; 330 + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); 331 + goto out; 332 + } 333 + 334 + rdt_staged_configs_clear(); 335 + 336 + while ((tok = strsep(&buf, "\n")) != NULL) { 337 + resname = strim(strsep(&tok, ":")); 338 + if (!tok) { 339 + rdt_last_cmd_puts("Missing ':'\n"); 340 + ret = -EINVAL; 341 + goto out; 342 + } 343 + if (tok[0] == '\0') { 344 + rdt_last_cmd_printf("Missing '%s' value\n", resname); 345 + ret = -EINVAL; 346 + goto out; 347 + } 348 + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); 349 + if (ret) 350 + goto out; 351 + } 352 + 353 + list_for_each_entry(s, &resctrl_schema_all, list) { 354 + r = s->res; 355 + 356 + /* 357 + * Writes to mba_sc resources update the software controller, 358 + * not the control MSR. 359 + */ 360 + if (is_mba_sc(r)) 361 + continue; 362 + 363 + ret = resctrl_arch_update_domains(r, rdtgrp->closid); 364 + if (ret) 365 + goto out; 366 + } 367 + 368 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 369 + /* 370 + * If pseudo-locking fails we keep the resource group in 371 + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service 372 + * active and updated for just the domain the pseudo-locked 373 + * region was requested for. 
374 + */ 375 + ret = rdtgroup_pseudo_lock_create(rdtgrp); 376 + } 377 + 378 + out: 379 + rdt_staged_configs_clear(); 380 + rdtgroup_kn_unlock(of->kn); 381 + return ret ?: nbytes; 382 + } 383 + 384 + static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) 385 + { 386 + struct rdt_resource *r = schema->res; 387 + struct rdt_ctrl_domain *dom; 388 + bool sep = false; 389 + u32 ctrl_val; 390 + 391 + /* Walking r->domains, ensure it can't race with cpuhp */ 392 + lockdep_assert_cpus_held(); 393 + 394 + seq_printf(s, "%*s:", max_name_width, schema->name); 395 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 396 + if (sep) 397 + seq_puts(s, ";"); 398 + 399 + if (is_mba_sc(r)) 400 + ctrl_val = dom->mbps_val[closid]; 401 + else 402 + ctrl_val = resctrl_arch_get_config(r, dom, closid, 403 + schema->conf_type); 404 + 405 + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); 406 + sep = true; 407 + } 408 + seq_puts(s, "\n"); 409 + } 410 + 411 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 412 + struct seq_file *s, void *v) 413 + { 414 + struct resctrl_schema *schema; 415 + struct rdtgroup *rdtgrp; 416 + int ret = 0; 417 + u32 closid; 418 + 419 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 420 + if (rdtgrp) { 421 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 422 + list_for_each_entry(schema, &resctrl_schema_all, list) { 423 + seq_printf(s, "%s:uninitialized\n", schema->name); 424 + } 425 + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 426 + if (!rdtgrp->plr->d) { 427 + rdt_last_cmd_clear(); 428 + rdt_last_cmd_puts("Cache domain offline\n"); 429 + ret = -ENODEV; 430 + } else { 431 + seq_printf(s, "%s:%d=%x\n", 432 + rdtgrp->plr->s->res->name, 433 + rdtgrp->plr->d->hdr.id, 434 + rdtgrp->plr->cbm); 435 + } 436 + } else { 437 + closid = rdtgrp->closid; 438 + list_for_each_entry(schema, &resctrl_schema_all, list) { 439 + if (closid < schema->num_closid) 440 + show_doms(s, schema, closid); 441 + } 442 + } 443 + } else { 444 + 
ret = -ENOENT; 445 + } 446 + rdtgroup_kn_unlock(of->kn); 447 + return ret; 448 + } 449 + 450 + static int smp_mon_event_count(void *arg) 451 + { 452 + mon_event_count(arg); 453 + 454 + return 0; 455 + } 456 + 457 + ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 458 + char *buf, size_t nbytes, loff_t off) 459 + { 460 + struct rdtgroup *rdtgrp; 461 + int ret = 0; 462 + 463 + /* Valid input requires a trailing newline */ 464 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 465 + return -EINVAL; 466 + buf[nbytes - 1] = '\0'; 467 + 468 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 469 + if (!rdtgrp) { 470 + rdtgroup_kn_unlock(of->kn); 471 + return -ENOENT; 472 + } 473 + rdt_last_cmd_clear(); 474 + 475 + if (!strcmp(buf, "mbm_local_bytes")) { 476 + if (resctrl_arch_is_mbm_local_enabled()) 477 + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; 478 + else 479 + ret = -EINVAL; 480 + } else if (!strcmp(buf, "mbm_total_bytes")) { 481 + if (resctrl_arch_is_mbm_total_enabled()) 482 + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; 483 + else 484 + ret = -EINVAL; 485 + } else { 486 + ret = -EINVAL; 487 + } 488 + 489 + if (ret) 490 + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); 491 + 492 + rdtgroup_kn_unlock(of->kn); 493 + 494 + return ret ?: nbytes; 495 + } 496 + 497 + int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 498 + struct seq_file *s, void *v) 499 + { 500 + struct rdtgroup *rdtgrp; 501 + int ret = 0; 502 + 503 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 504 + 505 + if (rdtgrp) { 506 + switch (rdtgrp->mba_mbps_event) { 507 + case QOS_L3_MBM_LOCAL_EVENT_ID: 508 + seq_puts(s, "mbm_local_bytes\n"); 509 + break; 510 + case QOS_L3_MBM_TOTAL_EVENT_ID: 511 + seq_puts(s, "mbm_total_bytes\n"); 512 + break; 513 + default: 514 + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); 515 + ret = -EINVAL; 516 + break; 517 + } 518 + } else { 519 + ret = -ENOENT; 520 + } 521 + 522 + rdtgroup_kn_unlock(of->kn); 523 + 524 + return ret; 525 + } 
526 + 527 + struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, 528 + struct list_head **pos) 529 + { 530 + struct rdt_domain_hdr *d; 531 + struct list_head *l; 532 + 533 + list_for_each(l, h) { 534 + d = list_entry(l, struct rdt_domain_hdr, list); 535 + /* When id is found, return its domain. */ 536 + if (id == d->id) 537 + return d; 538 + /* Stop searching when finding id's position in sorted list. */ 539 + if (id < d->id) 540 + break; 541 + } 542 + 543 + if (pos) 544 + *pos = l; 545 + 546 + return NULL; 547 + } 548 + 549 + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 550 + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 551 + cpumask_t *cpumask, int evtid, int first) 552 + { 553 + int cpu; 554 + 555 + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ 556 + lockdep_assert_cpus_held(); 557 + 558 + /* 559 + * Setup the parameters to pass to mon_event_count() to read the data. 560 + */ 561 + rr->rgrp = rdtgrp; 562 + rr->evtid = evtid; 563 + rr->r = r; 564 + rr->d = d; 565 + rr->first = first; 566 + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); 567 + if (IS_ERR(rr->arch_mon_ctx)) { 568 + rr->err = -EINVAL; 569 + return; 570 + } 571 + 572 + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); 573 + 574 + /* 575 + * cpumask_any_housekeeping() prefers housekeeping CPUs, but 576 + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. 577 + * MPAM's resctrl_arch_rmid_read() is unable to read the 578 + * counters on some platforms if its called in IRQ context. 
579 + */ 580 + if (tick_nohz_full_cpu(cpu)) 581 + smp_call_function_any(cpumask, mon_event_count, rr, 1); 582 + else 583 + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); 584 + 585 + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); 586 + } 587 + 588 + int rdtgroup_mondata_show(struct seq_file *m, void *arg) 589 + { 590 + struct kernfs_open_file *of = m->private; 591 + enum resctrl_res_level resid; 592 + enum resctrl_event_id evtid; 593 + struct rdt_domain_hdr *hdr; 594 + struct rmid_read rr = {0}; 595 + struct rdt_mon_domain *d; 596 + struct rdtgroup *rdtgrp; 597 + struct rdt_resource *r; 598 + struct mon_data *md; 599 + int domid, ret = 0; 600 + 601 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 602 + if (!rdtgrp) { 603 + ret = -ENOENT; 604 + goto out; 605 + } 606 + 607 + md = of->kn->priv; 608 + if (WARN_ON_ONCE(!md)) { 609 + ret = -EIO; 610 + goto out; 611 + } 612 + 613 + resid = md->rid; 614 + domid = md->domid; 615 + evtid = md->evtid; 616 + r = resctrl_arch_get_resource(resid); 617 + 618 + if (md->sum) { 619 + /* 620 + * This file requires summing across all domains that share 621 + * the L3 cache id that was provided in the "domid" field of the 622 + * struct mon_data. Search all domains in the resource for 623 + * one that matches this cache id. 624 + */ 625 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 626 + if (d->ci->id == domid) { 627 + rr.ci = d->ci; 628 + mon_event_read(&rr, r, NULL, rdtgrp, 629 + &d->ci->shared_cpu_map, evtid, false); 630 + goto checkresult; 631 + } 632 + } 633 + ret = -ENOENT; 634 + goto out; 635 + } else { 636 + /* 637 + * This file provides data from a single domain. Search 638 + * the resource to find the domain with "domid". 
639 + */ 640 + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); 641 + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { 642 + ret = -ENOENT; 643 + goto out; 644 + } 645 + d = container_of(hdr, struct rdt_mon_domain, hdr); 646 + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); 647 + } 648 + 649 + checkresult: 650 + 651 + if (rr.err == -EIO) 652 + seq_puts(m, "Error\n"); 653 + else if (rr.err == -EINVAL) 654 + seq_puts(m, "Unavailable\n"); 655 + else 656 + seq_printf(m, "%llu\n", rr.val); 657 + 658 + out: 659 + rdtgroup_kn_unlock(of->kn); 660 + return ret; 661 + }

+426

fs/resctrl/internal.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _FS_RESCTRL_INTERNAL_H 3 + #define _FS_RESCTRL_INTERNAL_H 4 + 5 + #include <linux/resctrl.h> 6 + #include <linux/kernfs.h> 7 + #include <linux/fs_context.h> 8 + #include <linux/tick.h> 9 + 10 + #define CQM_LIMBOCHECK_INTERVAL 1000 11 + 12 + /** 13 + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that 14 + * aren't marked nohz_full 15 + * @mask: The mask to pick a CPU from. 16 + * @exclude_cpu:The CPU to avoid picking. 17 + * 18 + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping 19 + * CPUs that don't use nohz_full, these are preferred. Pass 20 + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. 21 + * 22 + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 23 + */ 24 + static inline unsigned int 25 + cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) 26 + { 27 + unsigned int cpu; 28 + 29 + /* Try to find a CPU that isn't nohz_full to use in preference */ 30 + if (tick_nohz_full_enabled()) { 31 + cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu); 32 + if (cpu < nr_cpu_ids) 33 + return cpu; 34 + } 35 + 36 + return cpumask_any_but(mask, exclude_cpu); 37 + } 38 + 39 + struct rdt_fs_context { 40 + struct kernfs_fs_context kfc; 41 + bool enable_cdpl2; 42 + bool enable_cdpl3; 43 + bool enable_mba_mbps; 44 + bool enable_debug; 45 + }; 46 + 47 + static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) 48 + { 49 + struct kernfs_fs_context *kfc = fc->fs_private; 50 + 51 + return container_of(kfc, struct rdt_fs_context, kfc); 52 + } 53 + 54 + /** 55 + * struct mon_evt - Entry in the event list of a resource 56 + * @evtid: event id 57 + * @name: name of the event 58 + * @configurable: true if the event is configurable 59 + * @list: entry in &rdt_resource->evt_list 60 + */ 61 + struct mon_evt { 62 + enum resctrl_event_id evtid; 63 + char *name; 64 + bool configurable; 65 + struct list_head 
list; 66 + }; 67 + 68 + /** 69 + * struct mon_data - Monitoring details for each event file. 70 + * @list: Member of the global @mon_data_kn_priv_list list. 71 + * @rid: Resource id associated with the event file. 72 + * @evtid: Event id associated with the event file. 73 + * @sum: Set when event must be summed across multiple 74 + * domains. 75 + * @domid: When @sum is zero this is the domain to which 76 + * the event file belongs. When @sum is one this 77 + * is the id of the L3 cache that all domains to be 78 + * summed share. 79 + * 80 + * Pointed to by the kernfs kn->priv field of monitoring event files. 81 + * Readers and writers must hold rdtgroup_mutex. 82 + */ 83 + struct mon_data { 84 + struct list_head list; 85 + enum resctrl_res_level rid; 86 + enum resctrl_event_id evtid; 87 + int domid; 88 + bool sum; 89 + }; 90 + 91 + /** 92 + * struct rmid_read - Data passed across smp_call*() to read event count. 93 + * @rgrp: Resource group for which the counter is being read. If it is a parent 94 + * resource group then its event count is summed with the count from all 95 + * its child resource groups. 96 + * @r: Resource describing the properties of the event being read. 97 + * @d: Domain that the counter should be read from. If NULL then sum all 98 + * domains in @r sharing L3 @ci.id 99 + * @evtid: Which monitor event to read. 100 + * @first: Initialize MBM counter when true. 101 + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. 102 + * @err: Error encountered when reading counter. 103 + * @val: Returned value of event counter. If @rgrp is a parent resource group, 104 + * @val includes the sum of event counts from its child resource groups. 105 + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, 106 + * (summed across child resource groups if @rgrp is a parent resource group). 107 + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
108 + */ 109 + struct rmid_read { 110 + struct rdtgroup *rgrp; 111 + struct rdt_resource *r; 112 + struct rdt_mon_domain *d; 113 + enum resctrl_event_id evtid; 114 + bool first; 115 + struct cacheinfo *ci; 116 + int err; 117 + u64 val; 118 + void *arch_mon_ctx; 119 + }; 120 + 121 + extern struct list_head resctrl_schema_all; 122 + 123 + extern bool resctrl_mounted; 124 + 125 + enum rdt_group_type { 126 + RDTCTRL_GROUP = 0, 127 + RDTMON_GROUP, 128 + RDT_NUM_GROUP, 129 + }; 130 + 131 + /** 132 + * enum rdtgrp_mode - Mode of a RDT resource group 133 + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations 134 + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed 135 + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking 136 + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations 137 + * allowed AND the allocations are Cache Pseudo-Locked 138 + * @RDT_NUM_MODES: Total number of modes 139 + * 140 + * The mode of a resource group enables control over the allowed overlap 141 + * between allocations associated with different resource groups (classes 142 + * of service). User is able to modify the mode of a resource group by 143 + * writing to the "mode" resctrl file associated with the resource group. 144 + * 145 + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by 146 + * writing the appropriate text to the "mode" file. A resource group enters 147 + * "pseudo-locked" mode after the schemata is written while the resource 148 + * group is in "pseudo-locksetup" mode. 149 + */ 150 + enum rdtgrp_mode { 151 + RDT_MODE_SHAREABLE = 0, 152 + RDT_MODE_EXCLUSIVE, 153 + RDT_MODE_PSEUDO_LOCKSETUP, 154 + RDT_MODE_PSEUDO_LOCKED, 155 + 156 + /* Must be last */ 157 + RDT_NUM_MODES, 158 + }; 159 + 160 + /** 161 + * struct mongroup - store mon group's data in resctrl fs. 
162 + * @mon_data_kn: kernfs node for the mon_data directory 163 + * @parent: parent rdtgrp 164 + * @crdtgrp_list: child rdtgroup node list 165 + * @rmid: rmid for this rdtgroup 166 + */ 167 + struct mongroup { 168 + struct kernfs_node *mon_data_kn; 169 + struct rdtgroup *parent; 170 + struct list_head crdtgrp_list; 171 + u32 rmid; 172 + }; 173 + 174 + /** 175 + * struct rdtgroup - store rdtgroup's data in resctrl file system. 176 + * @kn: kernfs node 177 + * @rdtgroup_list: linked list for all rdtgroups 178 + * @closid: closid for this rdtgroup 179 + * @cpu_mask: CPUs assigned to this rdtgroup 180 + * @flags: status bits 181 + * @waitcount: how many cpus expect to find this 182 + * group when they acquire rdtgroup_mutex 183 + * @type: indicates type of this rdtgroup - either 184 + * monitor only or ctrl_mon group 185 + * @mon: mongroup related data 186 + * @mode: mode of resource group 187 + * @mba_mbps_event: input monitoring event id when mba_sc is enabled 188 + * @plr: pseudo-locked region 189 + */ 190 + struct rdtgroup { 191 + struct kernfs_node *kn; 192 + struct list_head rdtgroup_list; 193 + u32 closid; 194 + struct cpumask cpu_mask; 195 + int flags; 196 + atomic_t waitcount; 197 + enum rdt_group_type type; 198 + struct mongroup mon; 199 + enum rdtgrp_mode mode; 200 + enum resctrl_event_id mba_mbps_event; 201 + struct pseudo_lock_region *plr; 202 + }; 203 + 204 + /* rdtgroup.flags */ 205 + #define RDT_DELETED 1 206 + 207 + /* rftype.flags */ 208 + #define RFTYPE_FLAGS_CPUS_LIST 1 209 + 210 + /* 211 + * Define the file type flags for base and info directories. 
212 + */ 213 + #define RFTYPE_INFO BIT(0) 214 + 215 + #define RFTYPE_BASE BIT(1) 216 + 217 + #define RFTYPE_CTRL BIT(4) 218 + 219 + #define RFTYPE_MON BIT(5) 220 + 221 + #define RFTYPE_TOP BIT(6) 222 + 223 + #define RFTYPE_RES_CACHE BIT(8) 224 + 225 + #define RFTYPE_RES_MB BIT(9) 226 + 227 + #define RFTYPE_DEBUG BIT(10) 228 + 229 + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) 230 + 231 + #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) 232 + 233 + #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) 234 + 235 + #define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) 236 + 237 + #define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) 238 + 239 + /* List of all resource groups */ 240 + extern struct list_head rdt_all_groups; 241 + 242 + extern int max_name_width; 243 + 244 + /** 245 + * struct rftype - describe each file in the resctrl file system 246 + * @name: File name 247 + * @mode: Access mode 248 + * @kf_ops: File operations 249 + * @flags: File specific RFTYPE_FLAGS_* flags 250 + * @fflags: File specific RFTYPE_* flags 251 + * @seq_show: Show content of the file 252 + * @write: Write to the file 253 + */ 254 + struct rftype { 255 + char *name; 256 + umode_t mode; 257 + const struct kernfs_ops *kf_ops; 258 + unsigned long flags; 259 + unsigned long fflags; 260 + 261 + int (*seq_show)(struct kernfs_open_file *of, 262 + struct seq_file *sf, void *v); 263 + /* 264 + * write() is the generic write callback which maps directly to 265 + * kernfs write operation and overrides all other operations. 266 + * Maximum write size is determined by ->max_write_len. 
267 + */ 268 + ssize_t (*write)(struct kernfs_open_file *of, 269 + char *buf, size_t nbytes, loff_t off); 270 + }; 271 + 272 + /** 273 + * struct mbm_state - status for each MBM counter in each domain 274 + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation 275 + * @prev_bw: The most recent bandwidth in MBps 276 + */ 277 + struct mbm_state { 278 + u64 prev_bw_bytes; 279 + u32 prev_bw; 280 + }; 281 + 282 + extern struct mutex rdtgroup_mutex; 283 + 284 + static inline const char *rdt_kn_name(const struct kernfs_node *kn) 285 + { 286 + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); 287 + } 288 + 289 + extern struct rdtgroup rdtgroup_default; 290 + 291 + extern struct dentry *debugfs_resctrl; 292 + 293 + extern enum resctrl_event_id mba_mbps_default_event; 294 + 295 + void rdt_last_cmd_clear(void); 296 + 297 + void rdt_last_cmd_puts(const char *s); 298 + 299 + __printf(1, 2) 300 + void rdt_last_cmd_printf(const char *fmt, ...); 301 + 302 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); 303 + 304 + void rdtgroup_kn_unlock(struct kernfs_node *kn); 305 + 306 + int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); 307 + 308 + int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 309 + umode_t mask); 310 + 311 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 312 + char *buf, size_t nbytes, loff_t off); 313 + 314 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 315 + struct seq_file *s, void *v); 316 + 317 + ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 318 + char *buf, size_t nbytes, loff_t off); 319 + 320 + int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 321 + struct seq_file *s, void *v); 322 + 323 + bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 324 + unsigned long cbm, int closid, bool exclusive); 325 + 326 + unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain 
*d, 327 + unsigned long cbm); 328 + 329 + enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); 330 + 331 + int rdtgroup_tasks_assigned(struct rdtgroup *r); 332 + 333 + int closids_supported(void); 334 + 335 + void closid_free(int closid); 336 + 337 + int alloc_rmid(u32 closid); 338 + 339 + void free_rmid(u32 closid, u32 rmid); 340 + 341 + void resctrl_mon_resource_exit(void); 342 + 343 + void mon_event_count(void *info); 344 + 345 + int rdtgroup_mondata_show(struct seq_file *m, void *arg); 346 + 347 + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 348 + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 349 + cpumask_t *cpumask, int evtid, int first); 350 + 351 + int resctrl_mon_resource_init(void); 352 + 353 + void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, 354 + unsigned long delay_ms, 355 + int exclude_cpu); 356 + 357 + void mbm_handle_overflow(struct work_struct *work); 358 + 359 + bool is_mba_sc(struct rdt_resource *r); 360 + 361 + void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 362 + int exclude_cpu); 363 + 364 + void cqm_handle_limbo(struct work_struct *work); 365 + 366 + bool has_busy_rmid(struct rdt_mon_domain *d); 367 + 368 + void __check_limbo(struct rdt_mon_domain *d, bool force_free); 369 + 370 + void resctrl_file_fflags_init(const char *config, unsigned long fflags); 371 + 372 + void rdt_staged_configs_clear(void); 373 + 374 + bool closid_allocated(unsigned int closid); 375 + 376 + int resctrl_find_cleanest_closid(void); 377 + 378 + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 379 + int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); 380 + 381 + int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); 382 + 383 + bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); 384 + 385 + bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); 386 + 387 + int rdt_pseudo_lock_init(void); 388 + 389 + void rdt_pseudo_lock_release(void); 390 + 391 + int 
rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); 392 + 393 + void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); 394 + 395 + #else 396 + static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 397 + { 398 + return -EOPNOTSUPP; 399 + } 400 + 401 + static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 402 + { 403 + return -EOPNOTSUPP; 404 + } 405 + 406 + static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 407 + { 408 + return false; 409 + } 410 + 411 + static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 412 + { 413 + return false; 414 + } 415 + 416 + static inline int rdt_pseudo_lock_init(void) { return 0; } 417 + static inline void rdt_pseudo_lock_release(void) { } 418 + static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 419 + { 420 + return -EOPNOTSUPP; 421 + } 422 + 423 + static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } 424 + #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 425 + 426 + #endif /* _FS_RESCTRL_INTERNAL_H */

+929

fs/resctrl/monitor.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Resource Director Technology(RDT) 4 + * - Monitoring code 5 + * 6 + * Copyright (C) 2017 Intel Corporation 7 + * 8 + * Author: 9 + * Vikas Shivappa <vikas.shivappa@intel.com> 10 + * 11 + * This replaces the cqm.c based on perf but we reuse a lot of 12 + * code and datastructures originally from Peter Zijlstra and Matt Fleming. 13 + * 14 + * More information about RDT be found in the Intel (R) x86 Architecture 15 + * Software Developer Manual June 2016, volume 3, section 17.17. 16 + */ 17 + 18 + #define pr_fmt(fmt) "resctrl: " fmt 19 + 20 + #include <linux/cpu.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/sizes.h> 23 + #include <linux/slab.h> 24 + 25 + #include "internal.h" 26 + 27 + #define CREATE_TRACE_POINTS 28 + 29 + #include "monitor_trace.h" 30 + 31 + /** 32 + * struct rmid_entry - dirty tracking for all RMID. 33 + * @closid: The CLOSID for this entry. 34 + * @rmid: The RMID for this entry. 35 + * @busy: The number of domains with cached data using this RMID. 36 + * @list: Member of the rmid_free_lru list when busy == 0. 37 + * 38 + * Depending on the architecture the correct monitor is accessed using 39 + * both @closid and @rmid, or @rmid only. 40 + * 41 + * Take the rdtgroup_mutex when accessing. 42 + */ 43 + struct rmid_entry { 44 + u32 closid; 45 + u32 rmid; 46 + int busy; 47 + struct list_head list; 48 + }; 49 + 50 + /* 51 + * @rmid_free_lru - A least recently used list of free RMIDs 52 + * These RMIDs are guaranteed to have an occupancy less than the 53 + * threshold occupancy 54 + */ 55 + static LIST_HEAD(rmid_free_lru); 56 + 57 + /* 58 + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. 59 + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. 60 + * Indexed by CLOSID. Protected by rdtgroup_mutex. 
61 + */ 62 + static u32 *closid_num_dirty_rmid; 63 + 64 + /* 65 + * @rmid_limbo_count - count of currently unused but (potentially) 66 + * dirty RMIDs. 67 + * This counts RMIDs that no one is currently using but that 68 + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can 69 + * change the threshold occupancy value. 70 + */ 71 + static unsigned int rmid_limbo_count; 72 + 73 + /* 74 + * @rmid_entry - The entry in the limbo and free lists. 75 + */ 76 + static struct rmid_entry *rmid_ptrs; 77 + 78 + /* 79 + * This is the threshold cache occupancy in bytes at which we will consider an 80 + * RMID available for re-allocation. 81 + */ 82 + unsigned int resctrl_rmid_realloc_threshold; 83 + 84 + /* 85 + * This is the maximum value for the reallocation threshold, in bytes. 86 + */ 87 + unsigned int resctrl_rmid_realloc_limit; 88 + 89 + /* 90 + * x86 and arm64 differ in their handling of monitoring. 91 + * x86's RMID are independent numbers, there is only one source of traffic 92 + * with an RMID value of '1'. 93 + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of 94 + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID 95 + * value is no longer unique. 96 + * To account for this, resctrl uses an index. On x86 this is just the RMID, 97 + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. 98 + * 99 + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code 100 + * must accept an attempt to read every index. 
101 + */ 102 + static inline struct rmid_entry *__rmid_entry(u32 idx) 103 + { 104 + struct rmid_entry *entry; 105 + u32 closid, rmid; 106 + 107 + entry = &rmid_ptrs[idx]; 108 + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); 109 + 110 + WARN_ON_ONCE(entry->closid != closid); 111 + WARN_ON_ONCE(entry->rmid != rmid); 112 + 113 + return entry; 114 + } 115 + 116 + static void limbo_release_entry(struct rmid_entry *entry) 117 + { 118 + lockdep_assert_held(&rdtgroup_mutex); 119 + 120 + rmid_limbo_count--; 121 + list_add_tail(&entry->list, &rmid_free_lru); 122 + 123 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 124 + closid_num_dirty_rmid[entry->closid]--; 125 + } 126 + 127 + /* 128 + * Check the RMIDs that are marked as busy for this domain. If the 129 + * reported LLC occupancy is below the threshold clear the busy bit and 130 + * decrement the count. If the busy count gets to zero on an RMID, we 131 + * free the RMID 132 + */ 133 + void __check_limbo(struct rdt_mon_domain *d, bool force_free) 134 + { 135 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 136 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 137 + struct rmid_entry *entry; 138 + u32 idx, cur_idx = 1; 139 + void *arch_mon_ctx; 140 + bool rmid_dirty; 141 + u64 val = 0; 142 + 143 + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); 144 + if (IS_ERR(arch_mon_ctx)) { 145 + pr_warn_ratelimited("Failed to allocate monitor context: %ld", 146 + PTR_ERR(arch_mon_ctx)); 147 + return; 148 + } 149 + 150 + /* 151 + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that 152 + * are marked as busy for occupancy < threshold. If the occupancy 153 + * is less than the threshold decrement the busy counter of the 154 + * RMID and move it to the free list when the counter reaches 0. 
155 + */ 156 + for (;;) { 157 + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); 158 + if (idx >= idx_limit) 159 + break; 160 + 161 + entry = __rmid_entry(idx); 162 + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, 163 + QOS_L3_OCCUP_EVENT_ID, &val, 164 + arch_mon_ctx)) { 165 + rmid_dirty = true; 166 + } else { 167 + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); 168 + 169 + /* 170 + * x86's CLOSID and RMID are independent numbers, so the entry's 171 + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the 172 + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't 173 + * used to select the configuration. It is thus necessary to track both 174 + * CLOSID and RMID because there may be dependencies between them 175 + * on some architectures. 176 + */ 177 + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); 178 + } 179 + 180 + if (force_free || !rmid_dirty) { 181 + clear_bit(idx, d->rmid_busy_llc); 182 + if (!--entry->busy) 183 + limbo_release_entry(entry); 184 + } 185 + cur_idx = idx + 1; 186 + } 187 + 188 + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); 189 + } 190 + 191 + bool has_busy_rmid(struct rdt_mon_domain *d) 192 + { 193 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 194 + 195 + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; 196 + } 197 + 198 + static struct rmid_entry *resctrl_find_free_rmid(u32 closid) 199 + { 200 + struct rmid_entry *itr; 201 + u32 itr_idx, cmp_idx; 202 + 203 + if (list_empty(&rmid_free_lru)) 204 + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); 205 + 206 + list_for_each_entry(itr, &rmid_free_lru, list) { 207 + /* 208 + * Get the index of this free RMID, and the index it would need 209 + * to be if it were used with this CLOSID. 
210 + * If the CLOSID is irrelevant on this architecture, the two 211 + * index values are always the same on every entry and thus the 212 + * very first entry will be returned. 213 + */ 214 + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); 215 + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); 216 + 217 + if (itr_idx == cmp_idx) 218 + return itr; 219 + } 220 + 221 + return ERR_PTR(-ENOSPC); 222 + } 223 + 224 + /** 225 + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated 226 + * RMID are clean, or the CLOSID that has 227 + * the most clean RMID. 228 + * 229 + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID 230 + * may not be able to allocate clean RMID. To avoid this the allocator will 231 + * choose the CLOSID with the most clean RMID. 232 + * 233 + * When the CLOSID and RMID are independent numbers, the first free CLOSID will 234 + * be returned. 235 + */ 236 + int resctrl_find_cleanest_closid(void) 237 + { 238 + u32 cleanest_closid = ~0; 239 + int i = 0; 240 + 241 + lockdep_assert_held(&rdtgroup_mutex); 242 + 243 + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 244 + return -EIO; 245 + 246 + for (i = 0; i < closids_supported(); i++) { 247 + int num_dirty; 248 + 249 + if (closid_allocated(i)) 250 + continue; 251 + 252 + num_dirty = closid_num_dirty_rmid[i]; 253 + if (num_dirty == 0) 254 + return i; 255 + 256 + if (cleanest_closid == ~0) 257 + cleanest_closid = i; 258 + 259 + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) 260 + cleanest_closid = i; 261 + } 262 + 263 + if (cleanest_closid == ~0) 264 + return -ENOSPC; 265 + 266 + return cleanest_closid; 267 + } 268 + 269 + /* 270 + * For MPAM the RMID value is not unique, and has to be considered with 271 + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which 272 + * allows all domains to be managed by a single free list. 
273 + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 274 + */ 275 + int alloc_rmid(u32 closid) 276 + { 277 + struct rmid_entry *entry; 278 + 279 + lockdep_assert_held(&rdtgroup_mutex); 280 + 281 + entry = resctrl_find_free_rmid(closid); 282 + if (IS_ERR(entry)) 283 + return PTR_ERR(entry); 284 + 285 + list_del(&entry->list); 286 + return entry->rmid; 287 + } 288 + 289 + static void add_rmid_to_limbo(struct rmid_entry *entry) 290 + { 291 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 292 + struct rdt_mon_domain *d; 293 + u32 idx; 294 + 295 + lockdep_assert_held(&rdtgroup_mutex); 296 + 297 + /* Walking r->domains, ensure it can't race with cpuhp */ 298 + lockdep_assert_cpus_held(); 299 + 300 + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); 301 + 302 + entry->busy = 0; 303 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 304 + /* 305 + * For the first limbo RMID in the domain, 306 + * setup up the limbo worker. 307 + */ 308 + if (!has_busy_rmid(d)) 309 + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, 310 + RESCTRL_PICK_ANY_CPU); 311 + set_bit(idx, d->rmid_busy_llc); 312 + entry->busy++; 313 + } 314 + 315 + rmid_limbo_count++; 316 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 317 + closid_num_dirty_rmid[entry->closid]++; 318 + } 319 + 320 + void free_rmid(u32 closid, u32 rmid) 321 + { 322 + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 323 + struct rmid_entry *entry; 324 + 325 + lockdep_assert_held(&rdtgroup_mutex); 326 + 327 + /* 328 + * Do not allow the default rmid to be free'd. Comparing by index 329 + * allows architectures that ignore the closid parameter to avoid an 330 + * unnecessary check. 
331 + */ 332 + if (!resctrl_arch_mon_capable() || 333 + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 334 + RESCTRL_RESERVED_RMID)) 335 + return; 336 + 337 + entry = __rmid_entry(idx); 338 + 339 + if (resctrl_arch_is_llc_occupancy_enabled()) 340 + add_rmid_to_limbo(entry); 341 + else 342 + list_add_tail(&entry->list, &rmid_free_lru); 343 + } 344 + 345 + static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, 346 + u32 rmid, enum resctrl_event_id evtid) 347 + { 348 + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 349 + 350 + switch (evtid) { 351 + case QOS_L3_MBM_TOTAL_EVENT_ID: 352 + return &d->mbm_total[idx]; 353 + case QOS_L3_MBM_LOCAL_EVENT_ID: 354 + return &d->mbm_local[idx]; 355 + default: 356 + return NULL; 357 + } 358 + } 359 + 360 + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) 361 + { 362 + int cpu = smp_processor_id(); 363 + struct rdt_mon_domain *d; 364 + struct mbm_state *m; 365 + int err, ret; 366 + u64 tval = 0; 367 + 368 + if (rr->first) { 369 + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); 370 + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 371 + if (m) 372 + memset(m, 0, sizeof(struct mbm_state)); 373 + return 0; 374 + } 375 + 376 + if (rr->d) { 377 + /* Reading a single domain, must be on a CPU in that domain. */ 378 + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) 379 + return -EINVAL; 380 + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, 381 + rr->evtid, &tval, rr->arch_mon_ctx); 382 + if (rr->err) 383 + return rr->err; 384 + 385 + rr->val += tval; 386 + 387 + return 0; 388 + } 389 + 390 + /* Summing domains that share a cache, must be on a CPU for that cache. */ 391 + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) 392 + return -EINVAL; 393 + 394 + /* 395 + * Legacy files must report the sum of an event across all 396 + * domains that share the same L3 cache instance. 
397 + * Report success if a read from any domain succeeds, -EINVAL 398 + * (translated to "Unavailable" for user space) if reading from 399 + * all domains fail for any reason. 400 + */ 401 + ret = -EINVAL; 402 + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { 403 + if (d->ci->id != rr->ci->id) 404 + continue; 405 + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, 406 + rr->evtid, &tval, rr->arch_mon_ctx); 407 + if (!err) { 408 + rr->val += tval; 409 + ret = 0; 410 + } 411 + } 412 + 413 + if (ret) 414 + rr->err = ret; 415 + 416 + return ret; 417 + } 418 + 419 + /* 420 + * mbm_bw_count() - Update bw count from values previously read by 421 + * __mon_event_count(). 422 + * @closid: The closid used to identify the cached mbm_state. 423 + * @rmid: The rmid used to identify the cached mbm_state. 424 + * @rr: The struct rmid_read populated by __mon_event_count(). 425 + * 426 + * Supporting function to calculate the memory bandwidth 427 + * and delta bandwidth in MBps. The chunks value previously read by 428 + * __mon_event_count() is compared with the chunks value from the previous 429 + * invocation. This must be called once per second to maintain values in MBps. 430 + */ 431 + static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) 432 + { 433 + u64 cur_bw, bytes, cur_bytes; 434 + struct mbm_state *m; 435 + 436 + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 437 + if (WARN_ON_ONCE(!m)) 438 + return; 439 + 440 + cur_bytes = rr->val; 441 + bytes = cur_bytes - m->prev_bw_bytes; 442 + m->prev_bw_bytes = cur_bytes; 443 + 444 + cur_bw = bytes / SZ_1M; 445 + 446 + m->prev_bw = cur_bw; 447 + } 448 + 449 + /* 450 + * This is scheduled by mon_event_read() to read the CQM/MBM counters 451 + * on a domain. 
452 + */ 453 + void mon_event_count(void *info) 454 + { 455 + struct rdtgroup *rdtgrp, *entry; 456 + struct rmid_read *rr = info; 457 + struct list_head *head; 458 + int ret; 459 + 460 + rdtgrp = rr->rgrp; 461 + 462 + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); 463 + 464 + /* 465 + * For Ctrl groups read data from child monitor groups and 466 + * add them together. Count events which are read successfully. 467 + * Discard the rmid_read's reporting errors. 468 + */ 469 + head = &rdtgrp->mon.crdtgrp_list; 470 + 471 + if (rdtgrp->type == RDTCTRL_GROUP) { 472 + list_for_each_entry(entry, head, mon.crdtgrp_list) { 473 + if (__mon_event_count(entry->closid, entry->mon.rmid, 474 + rr) == 0) 475 + ret = 0; 476 + } 477 + } 478 + 479 + /* 480 + * __mon_event_count() calls for newly created monitor groups may 481 + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. 482 + * Discard error if any of the monitor event reads succeeded. 483 + */ 484 + if (ret == 0) 485 + rr->err = 0; 486 + } 487 + 488 + static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, 489 + struct rdt_resource *r) 490 + { 491 + struct rdt_ctrl_domain *d; 492 + 493 + lockdep_assert_cpus_held(); 494 + 495 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 496 + /* Find the domain that contains this CPU */ 497 + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 498 + return d; 499 + } 500 + 501 + return NULL; 502 + } 503 + 504 + /* 505 + * Feedback loop for MBA software controller (mba_sc) 506 + * 507 + * mba_sc is a feedback loop where we periodically read MBM counters and 508 + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so 509 + * that: 510 + * 511 + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) 512 + * 513 + * This uses the MBM counters to measure the bandwidth and MBA throttle 514 + * MSRs to control the bandwidth for a particular rdtgrp. 
It builds on the 515 + * fact that resctrl rdtgroups have both monitoring and control. 516 + * 517 + * The frequency of the checks is 1s and we just tag along the MBM overflow 518 + * timer. Having 1s interval makes the calculation of bandwidth simpler. 519 + * 520 + * Although MBA's goal is to restrict the bandwidth to a maximum, there may 521 + * be a need to increase the bandwidth to avoid unnecessarily restricting 522 + * the L2 <-> L3 traffic. 523 + * 524 + * Since MBA controls the L2 external bandwidth where as MBM measures the 525 + * L3 external bandwidth the following sequence could lead to such a 526 + * situation. 527 + * 528 + * Consider an rdtgroup which had high L3 <-> memory traffic in initial 529 + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but 530 + * after some time rdtgroup has mostly L2 <-> L3 traffic. 531 + * 532 + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its 533 + * throttle MSRs already have low percentage values. To avoid 534 + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
535 + */ 536 + static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) 537 + { 538 + u32 closid, rmid, cur_msr_val, new_msr_val; 539 + struct mbm_state *pmbm_data, *cmbm_data; 540 + struct rdt_ctrl_domain *dom_mba; 541 + enum resctrl_event_id evt_id; 542 + struct rdt_resource *r_mba; 543 + struct list_head *head; 544 + struct rdtgroup *entry; 545 + u32 cur_bw, user_bw; 546 + 547 + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 548 + evt_id = rgrp->mba_mbps_event; 549 + 550 + closid = rgrp->closid; 551 + rmid = rgrp->mon.rmid; 552 + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); 553 + if (WARN_ON_ONCE(!pmbm_data)) 554 + return; 555 + 556 + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); 557 + if (!dom_mba) { 558 + pr_warn_once("Failure to get domain for MBA update\n"); 559 + return; 560 + } 561 + 562 + cur_bw = pmbm_data->prev_bw; 563 + user_bw = dom_mba->mbps_val[closid]; 564 + 565 + /* MBA resource doesn't support CDP */ 566 + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); 567 + 568 + /* 569 + * For Ctrl groups read data from child monitor groups. 570 + */ 571 + head = &rgrp->mon.crdtgrp_list; 572 + list_for_each_entry(entry, head, mon.crdtgrp_list) { 573 + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); 574 + if (WARN_ON_ONCE(!cmbm_data)) 575 + return; 576 + cur_bw += cmbm_data->prev_bw; 577 + } 578 + 579 + /* 580 + * Scale up/down the bandwidth linearly for the ctrl group. The 581 + * bandwidth step is the bandwidth granularity specified by the 582 + * hardware. 583 + * Always increase throttling if current bandwidth is above the 584 + * target set by user. 585 + * But avoid thrashing up and down on every poll by checking 586 + * whether a decrease in throttling is likely to push the group 587 + * back over target. E.g. 
if currently throttling to 30% of bandwidth 588 + * on a system with 10% granularity steps, check whether moving to 589 + * 40% would go past the limit by multiplying current bandwidth by 590 + * "(30 + 10) / 30". 591 + */ 592 + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { 593 + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; 594 + } else if (cur_msr_val < MAX_MBA_BW && 595 + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { 596 + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; 597 + } else { 598 + return; 599 + } 600 + 601 + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); 602 + } 603 + 604 + static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, 605 + u32 closid, u32 rmid, enum resctrl_event_id evtid) 606 + { 607 + struct rmid_read rr = {0}; 608 + 609 + rr.r = r; 610 + rr.d = d; 611 + rr.evtid = evtid; 612 + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 613 + if (IS_ERR(rr.arch_mon_ctx)) { 614 + pr_warn_ratelimited("Failed to allocate monitor context: %ld", 615 + PTR_ERR(rr.arch_mon_ctx)); 616 + return; 617 + } 618 + 619 + __mon_event_count(closid, rmid, &rr); 620 + 621 + /* 622 + * If the software controller is enabled, compute the 623 + * bandwidth for this event id. 624 + */ 625 + if (is_mba_sc(NULL)) 626 + mbm_bw_count(closid, rmid, &rr); 627 + 628 + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 629 + } 630 + 631 + static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, 632 + u32 closid, u32 rmid) 633 + { 634 + /* 635 + * This is protected from concurrent reads from user as both 636 + * the user and overflow handler hold the global mutex. 
637 + */ 638 + if (resctrl_arch_is_mbm_total_enabled()) 639 + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); 640 + 641 + if (resctrl_arch_is_mbm_local_enabled()) 642 + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); 643 + } 644 + 645 + /* 646 + * Handler to scan the limbo list and move the RMIDs 647 + * to free list whose occupancy < threshold_occupancy. 648 + */ 649 + void cqm_handle_limbo(struct work_struct *work) 650 + { 651 + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); 652 + struct rdt_mon_domain *d; 653 + 654 + cpus_read_lock(); 655 + mutex_lock(&rdtgroup_mutex); 656 + 657 + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); 658 + 659 + __check_limbo(d, false); 660 + 661 + if (has_busy_rmid(d)) { 662 + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 663 + RESCTRL_PICK_ANY_CPU); 664 + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, 665 + delay); 666 + } 667 + 668 + mutex_unlock(&rdtgroup_mutex); 669 + cpus_read_unlock(); 670 + } 671 + 672 + /** 673 + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this 674 + * domain. 675 + * @dom: The domain the limbo handler should run for. 676 + * @delay_ms: How far in the future the handler should run. 677 + * @exclude_cpu: Which CPU the handler should not run on, 678 + * RESCTRL_PICK_ANY_CPU to pick any CPU. 
679 + */ 680 + void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 681 + int exclude_cpu) 682 + { 683 + unsigned long delay = msecs_to_jiffies(delay_ms); 684 + int cpu; 685 + 686 + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 687 + dom->cqm_work_cpu = cpu; 688 + 689 + if (cpu < nr_cpu_ids) 690 + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); 691 + } 692 + 693 + void mbm_handle_overflow(struct work_struct *work) 694 + { 695 + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); 696 + struct rdtgroup *prgrp, *crgrp; 697 + struct rdt_mon_domain *d; 698 + struct list_head *head; 699 + struct rdt_resource *r; 700 + 701 + cpus_read_lock(); 702 + mutex_lock(&rdtgroup_mutex); 703 + 704 + /* 705 + * If the filesystem has been unmounted this work no longer needs to 706 + * run. 707 + */ 708 + if (!resctrl_mounted || !resctrl_arch_mon_capable()) 709 + goto out_unlock; 710 + 711 + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 712 + d = container_of(work, struct rdt_mon_domain, mbm_over.work); 713 + 714 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 715 + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); 716 + 717 + head = &prgrp->mon.crdtgrp_list; 718 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) 719 + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); 720 + 721 + if (is_mba_sc(NULL)) 722 + update_mba_bw(prgrp, d); 723 + } 724 + 725 + /* 726 + * Re-check for housekeeping CPUs. This allows the overflow handler to 727 + * move off a nohz_full CPU quickly. 728 + */ 729 + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 730 + RESCTRL_PICK_ANY_CPU); 731 + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); 732 + 733 + out_unlock: 734 + mutex_unlock(&rdtgroup_mutex); 735 + cpus_read_unlock(); 736 + } 737 + 738 + /** 739 + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this 740 + * domain. 
741 + * @dom: The domain the overflow handler should run for. 742 + * @delay_ms: How far in the future the handler should run. 743 + * @exclude_cpu: Which CPU the handler should not run on, 744 + * RESCTRL_PICK_ANY_CPU to pick any CPU. 745 + */ 746 + void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 747 + int exclude_cpu) 748 + { 749 + unsigned long delay = msecs_to_jiffies(delay_ms); 750 + int cpu; 751 + 752 + /* 753 + * When a domain comes online there is no guarantee the filesystem is 754 + * mounted. If not, there is no need to catch counter overflow. 755 + */ 756 + if (!resctrl_mounted || !resctrl_arch_mon_capable()) 757 + return; 758 + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 759 + dom->mbm_work_cpu = cpu; 760 + 761 + if (cpu < nr_cpu_ids) 762 + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); 763 + } 764 + 765 + static int dom_data_init(struct rdt_resource *r) 766 + { 767 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 768 + u32 num_closid = resctrl_arch_get_num_closid(r); 769 + struct rmid_entry *entry = NULL; 770 + int err = 0, i; 771 + u32 idx; 772 + 773 + mutex_lock(&rdtgroup_mutex); 774 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 775 + u32 *tmp; 776 + 777 + /* 778 + * If the architecture hasn't provided a sanitised value here, 779 + * this may result in larger arrays than necessary. Resctrl will 780 + * use a smaller system wide value based on the resources in 781 + * use. 
782 + */ 783 + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); 784 + if (!tmp) { 785 + err = -ENOMEM; 786 + goto out_unlock; 787 + } 788 + 789 + closid_num_dirty_rmid = tmp; 790 + } 791 + 792 + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); 793 + if (!rmid_ptrs) { 794 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 795 + kfree(closid_num_dirty_rmid); 796 + closid_num_dirty_rmid = NULL; 797 + } 798 + err = -ENOMEM; 799 + goto out_unlock; 800 + } 801 + 802 + for (i = 0; i < idx_limit; i++) { 803 + entry = &rmid_ptrs[i]; 804 + INIT_LIST_HEAD(&entry->list); 805 + 806 + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); 807 + list_add_tail(&entry->list, &rmid_free_lru); 808 + } 809 + 810 + /* 811 + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and 812 + * are always allocated. These are used for the rdtgroup_default 813 + * control group, which will be setup later in resctrl_init(). 814 + */ 815 + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 816 + RESCTRL_RESERVED_RMID); 817 + entry = __rmid_entry(idx); 818 + list_del(&entry->list); 819 + 820 + out_unlock: 821 + mutex_unlock(&rdtgroup_mutex); 822 + 823 + return err; 824 + } 825 + 826 + static void dom_data_exit(struct rdt_resource *r) 827 + { 828 + mutex_lock(&rdtgroup_mutex); 829 + 830 + if (!r->mon_capable) 831 + goto out_unlock; 832 + 833 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 834 + kfree(closid_num_dirty_rmid); 835 + closid_num_dirty_rmid = NULL; 836 + } 837 + 838 + kfree(rmid_ptrs); 839 + rmid_ptrs = NULL; 840 + 841 + out_unlock: 842 + mutex_unlock(&rdtgroup_mutex); 843 + } 844 + 845 + static struct mon_evt llc_occupancy_event = { 846 + .name = "llc_occupancy", 847 + .evtid = QOS_L3_OCCUP_EVENT_ID, 848 + }; 849 + 850 + static struct mon_evt mbm_total_event = { 851 + .name = "mbm_total_bytes", 852 + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, 853 + }; 854 + 855 + static struct mon_evt mbm_local_event = { 856 + .name = 
"mbm_local_bytes", 857 + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, 858 + }; 859 + 860 + /* 861 + * Initialize the event list for the resource. 862 + * 863 + * Note that MBM events are also part of RDT_RESOURCE_L3 resource 864 + * because as per the SDM the total and local memory bandwidth 865 + * are enumerated as part of L3 monitoring. 866 + */ 867 + static void l3_mon_evt_init(struct rdt_resource *r) 868 + { 869 + INIT_LIST_HEAD(&r->evt_list); 870 + 871 + if (resctrl_arch_is_llc_occupancy_enabled()) 872 + list_add_tail(&llc_occupancy_event.list, &r->evt_list); 873 + if (resctrl_arch_is_mbm_total_enabled()) 874 + list_add_tail(&mbm_total_event.list, &r->evt_list); 875 + if (resctrl_arch_is_mbm_local_enabled()) 876 + list_add_tail(&mbm_local_event.list, &r->evt_list); 877 + } 878 + 879 + /** 880 + * resctrl_mon_resource_init() - Initialise global monitoring structures. 881 + * 882 + * Allocate and initialise global monitor resources that do not belong to a 883 + * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 884 + * Called once during boot after the struct rdt_resource's have been configured 885 + * but before the filesystem is mounted. 886 + * Resctrl's cpuhp callbacks may be called before this point to bring a domain 887 + * online. 888 + * 889 + * Returns 0 for success, or -ENOMEM. 
890 + */ 891 + int resctrl_mon_resource_init(void) 892 + { 893 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 894 + int ret; 895 + 896 + if (!r->mon_capable) 897 + return 0; 898 + 899 + ret = dom_data_init(r); 900 + if (ret) 901 + return ret; 902 + 903 + l3_mon_evt_init(r); 904 + 905 + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { 906 + mbm_total_event.configurable = true; 907 + resctrl_file_fflags_init("mbm_total_bytes_config", 908 + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 909 + } 910 + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { 911 + mbm_local_event.configurable = true; 912 + resctrl_file_fflags_init("mbm_local_bytes_config", 913 + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 914 + } 915 + 916 + if (resctrl_arch_is_mbm_local_enabled()) 917 + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; 918 + else if (resctrl_arch_is_mbm_total_enabled()) 919 + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; 920 + 921 + return 0; 922 + } 923 + 924 + void resctrl_mon_resource_exit(void) 925 + { 926 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 927 + 928 + dom_data_exit(r); 929 + }

+33

fs/resctrl/monitor_trace.h

/* SPDX-License-Identifier: GPL-2.0 */
/* Tracepoint definitions for the "resctrl" trace system (monitoring side). */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl

/*
 * The include guard is deliberately bypassed when TRACE_HEADER_MULTI_READ is
 * defined, so the header can be read more than once.
 */
#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _FS_RESCTRL_MONITOR_TRACE_H

#include <linux/tracepoint.h>

/*
 * mon_llc_occupancy_limbo - records ctrl_hw_id, mon_hw_id, domain_id and
 * llc_occupancy_bytes, and prints them as "name=value" pairs.
 */
TRACE_EVENT(mon_llc_occupancy_limbo,
	    TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
	    TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
	    TP_STRUCT__entry(__field(u32, ctrl_hw_id)
			     __field(u32, mon_hw_id)
			     __field(int, domain_id)
			     __field(u64, llc_occupancy_bytes)),
	    TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
			   __entry->mon_hw_id = mon_hw_id;
			   __entry->domain_id = domain_id;
			   __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
	    TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
		      __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
		      __entry->llc_occupancy_bytes)
	   );

#endif /* _FS_RESCTRL_MONITOR_TRACE_H */

/* Must stay outside the guard: path and file name consumed by define_trace.h. */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .

#define TRACE_INCLUDE_FILE monitor_trace

#include <trace/define_trace.h>

+1105

fs/resctrl/pseudo_lock.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Resource Director Technology (RDT) 4 + * 5 + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) 6 + * 7 + * Copyright (C) 2018 Intel Corporation 8 + * 9 + * Author: Reinette Chatre <reinette.chatre@intel.com> 10 + */ 11 + 12 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 + 14 + #include <linux/cacheinfo.h> 15 + #include <linux/cpu.h> 16 + #include <linux/cpumask.h> 17 + #include <linux/debugfs.h> 18 + #include <linux/kthread.h> 19 + #include <linux/mman.h> 20 + #include <linux/pm_qos.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/slab.h> 23 + #include <linux/uaccess.h> 24 + 25 + #include "internal.h" 26 + 27 + /* 28 + * Major number assigned to and shared by all devices exposing 29 + * pseudo-locked regions. 30 + */ 31 + static unsigned int pseudo_lock_major; 32 + 33 + static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); 34 + 35 + static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) 36 + { 37 + const struct rdtgroup *rdtgrp; 38 + 39 + rdtgrp = dev_get_drvdata(dev); 40 + if (mode) 41 + *mode = 0600; 42 + guard(mutex)(&rdtgroup_mutex); 43 + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); 44 + } 45 + 46 + static const struct class pseudo_lock_class = { 47 + .name = "pseudo_lock", 48 + .devnode = pseudo_lock_devnode, 49 + }; 50 + 51 + /** 52 + * pseudo_lock_minor_get - Obtain available minor number 53 + * @minor: Pointer to where new minor number will be stored 54 + * 55 + * A bitmask is used to track available minor numbers. Here the next free 56 + * minor number is marked as unavailable and returned. 57 + * 58 + * Return: 0 on success, <0 on failure. 
59 + */ 60 + static int pseudo_lock_minor_get(unsigned int *minor) 61 + { 62 + unsigned long first_bit; 63 + 64 + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); 65 + 66 + if (first_bit == MINORBITS) 67 + return -ENOSPC; 68 + 69 + __clear_bit(first_bit, &pseudo_lock_minor_avail); 70 + *minor = first_bit; 71 + 72 + return 0; 73 + } 74 + 75 + /** 76 + * pseudo_lock_minor_release - Return minor number to available 77 + * @minor: The minor number made available 78 + */ 79 + static void pseudo_lock_minor_release(unsigned int minor) 80 + { 81 + __set_bit(minor, &pseudo_lock_minor_avail); 82 + } 83 + 84 + /** 85 + * region_find_by_minor - Locate a pseudo-lock region by inode minor number 86 + * @minor: The minor number of the device representing pseudo-locked region 87 + * 88 + * When the character device is accessed we need to determine which 89 + * pseudo-locked region it belongs to. This is done by matching the minor 90 + * number of the device to the pseudo-locked region it belongs. 91 + * 92 + * Minor numbers are assigned at the time a pseudo-locked region is associated 93 + * with a cache instance. 94 + * 95 + * Return: On success return pointer to resource group owning the pseudo-locked 96 + * region, NULL on failure. 
97 + */ 98 + static struct rdtgroup *region_find_by_minor(unsigned int minor) 99 + { 100 + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; 101 + 102 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 103 + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { 104 + rdtgrp_match = rdtgrp; 105 + break; 106 + } 107 + } 108 + return rdtgrp_match; 109 + } 110 + 111 + /** 112 + * struct pseudo_lock_pm_req - A power management QoS request list entry 113 + * @list: Entry within the @pm_reqs list for a pseudo-locked region 114 + * @req: PM QoS request 115 + */ 116 + struct pseudo_lock_pm_req { 117 + struct list_head list; 118 + struct dev_pm_qos_request req; 119 + }; 120 + 121 + static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) 122 + { 123 + struct pseudo_lock_pm_req *pm_req, *next; 124 + 125 + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { 126 + dev_pm_qos_remove_request(&pm_req->req); 127 + list_del(&pm_req->list); 128 + kfree(pm_req); 129 + } 130 + } 131 + 132 + /** 133 + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 134 + * @plr: Pseudo-locked region 135 + * 136 + * To prevent the cache from being affected by power management entering 137 + * C6 has to be avoided. This is accomplished by requesting a latency 138 + * requirement lower than lowest C6 exit latency of all supported 139 + * platforms as found in the cpuidle state tables in the intel_idle driver. 140 + * At this time it is possible to do so with a single latency requirement 141 + * for all supported platforms. 142 + * 143 + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, 144 + * the ACPI latencies need to be considered while keeping in mind that C2 145 + * may be set to map to deeper sleep states. In this case the latency 146 + * requirement needs to prevent entering C2 also. 
147 + * 148 + * Return: 0 on success, <0 on failure 149 + */ 150 + static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) 151 + { 152 + struct pseudo_lock_pm_req *pm_req; 153 + int cpu; 154 + int ret; 155 + 156 + for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { 157 + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); 158 + if (!pm_req) { 159 + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); 160 + ret = -ENOMEM; 161 + goto out_err; 162 + } 163 + ret = dev_pm_qos_add_request(get_cpu_device(cpu), 164 + &pm_req->req, 165 + DEV_PM_QOS_RESUME_LATENCY, 166 + 30); 167 + if (ret < 0) { 168 + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", 169 + cpu); 170 + kfree(pm_req); 171 + ret = -1; 172 + goto out_err; 173 + } 174 + list_add(&pm_req->list, &plr->pm_reqs); 175 + } 176 + 177 + return 0; 178 + 179 + out_err: 180 + pseudo_lock_cstates_relax(plr); 181 + return ret; 182 + } 183 + 184 + /** 185 + * pseudo_lock_region_clear - Reset pseudo-lock region data 186 + * @plr: pseudo-lock region 187 + * 188 + * All content of the pseudo-locked region is reset - any memory allocated 189 + * freed. 190 + * 191 + * Return: void 192 + */ 193 + static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) 194 + { 195 + plr->size = 0; 196 + plr->line_size = 0; 197 + kfree(plr->kmem); 198 + plr->kmem = NULL; 199 + plr->s = NULL; 200 + if (plr->d) 201 + plr->d->plr = NULL; 202 + plr->d = NULL; 203 + plr->cbm = 0; 204 + plr->debugfs_dir = NULL; 205 + } 206 + 207 + /** 208 + * pseudo_lock_region_init - Initialize pseudo-lock region information 209 + * @plr: pseudo-lock region 210 + * 211 + * Called after user provided a schemata to be pseudo-locked. From the 212 + * schemata the &struct pseudo_lock_region is on entry already initialized 213 + * with the resource, domain, and capacity bitmask. Here the information 214 + * required for pseudo-locking is deduced from this data and &struct 215 + * pseudo_lock_region initialized further. 
This information includes: 216 + * - size in bytes of the region to be pseudo-locked 217 + * - cache line size to know the stride with which data needs to be accessed 218 + * to be pseudo-locked 219 + * - a cpu associated with the cache instance on which the pseudo-locking 220 + * flow can be executed 221 + * 222 + * Return: 0 on success, <0 on failure. Descriptive error will be written 223 + * to last_cmd_status buffer. 224 + */ 225 + static int pseudo_lock_region_init(struct pseudo_lock_region *plr) 226 + { 227 + enum resctrl_scope scope = plr->s->res->ctrl_scope; 228 + struct cacheinfo *ci; 229 + int ret; 230 + 231 + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) 232 + return -ENODEV; 233 + 234 + /* Pick the first cpu we find that is associated with the cache. */ 235 + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); 236 + 237 + if (!cpu_online(plr->cpu)) { 238 + rdt_last_cmd_printf("CPU %u associated with cache not online\n", 239 + plr->cpu); 240 + ret = -ENODEV; 241 + goto out_region; 242 + } 243 + 244 + ci = get_cpu_cacheinfo_level(plr->cpu, scope); 245 + if (ci) { 246 + plr->line_size = ci->coherency_line_size; 247 + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); 248 + return 0; 249 + } 250 + 251 + ret = -1; 252 + rdt_last_cmd_puts("Unable to determine cache line size\n"); 253 + out_region: 254 + pseudo_lock_region_clear(plr); 255 + return ret; 256 + } 257 + 258 + /** 259 + * pseudo_lock_init - Initialize a pseudo-lock region 260 + * @rdtgrp: resource group to which new pseudo-locked region will belong 261 + * 262 + * A pseudo-locked region is associated with a resource group. When this 263 + * association is created the pseudo-locked region is initialized. The 264 + * details of the pseudo-locked region are not known at this time so only 265 + * allocation is done and association established. 
266 + * 267 + * Return: 0 on success, <0 on failure 268 + */ 269 + static int pseudo_lock_init(struct rdtgroup *rdtgrp) 270 + { 271 + struct pseudo_lock_region *plr; 272 + 273 + plr = kzalloc(sizeof(*plr), GFP_KERNEL); 274 + if (!plr) 275 + return -ENOMEM; 276 + 277 + init_waitqueue_head(&plr->lock_thread_wq); 278 + INIT_LIST_HEAD(&plr->pm_reqs); 279 + rdtgrp->plr = plr; 280 + return 0; 281 + } 282 + 283 + /** 284 + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked 285 + * @plr: pseudo-lock region 286 + * 287 + * Initialize the details required to set up the pseudo-locked region and 288 + * allocate the contiguous memory that will be pseudo-locked to the cache. 289 + * 290 + * Return: 0 on success, <0 on failure. Descriptive error will be written 291 + * to last_cmd_status buffer. 292 + */ 293 + static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) 294 + { 295 + int ret; 296 + 297 + ret = pseudo_lock_region_init(plr); 298 + if (ret < 0) 299 + return ret; 300 + 301 + /* 302 + * We do not yet support contiguous regions larger than 303 + * KMALLOC_MAX_SIZE. 304 + */ 305 + if (plr->size > KMALLOC_MAX_SIZE) { 306 + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); 307 + ret = -E2BIG; 308 + goto out_region; 309 + } 310 + 311 + plr->kmem = kzalloc(plr->size, GFP_KERNEL); 312 + if (!plr->kmem) { 313 + rdt_last_cmd_puts("Unable to allocate memory\n"); 314 + ret = -ENOMEM; 315 + goto out_region; 316 + } 317 + 318 + ret = 0; 319 + goto out; 320 + out_region: 321 + pseudo_lock_region_clear(plr); 322 + out: 323 + return ret; 324 + } 325 + 326 + /** 327 + * pseudo_lock_free - Free a pseudo-locked region 328 + * @rdtgrp: resource group to which pseudo-locked region belonged 329 + * 330 + * The pseudo-locked region's resources have already been released, or not 331 + * yet created at this point. Now it can be freed and disassociated from the 332 + * resource group. 
333 + * 334 + * Return: void 335 + */ 336 + static void pseudo_lock_free(struct rdtgroup *rdtgrp) 337 + { 338 + pseudo_lock_region_clear(rdtgrp->plr); 339 + kfree(rdtgrp->plr); 340 + rdtgrp->plr = NULL; 341 + } 342 + 343 + /** 344 + * rdtgroup_monitor_in_progress - Test if monitoring in progress 345 + * @rdtgrp: resource group being queried 346 + * 347 + * Return: 1 if monitor groups have been created for this resource 348 + * group, 0 otherwise. 349 + */ 350 + static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) 351 + { 352 + return !list_empty(&rdtgrp->mon.crdtgrp_list); 353 + } 354 + 355 + /** 356 + * rdtgroup_locksetup_user_restrict - Restrict user access to group 357 + * @rdtgrp: resource group needing access restricted 358 + * 359 + * A resource group used for cache pseudo-locking cannot have cpus or tasks 360 + * assigned to it. This is communicated to the user by restricting access 361 + * to all the files that can be used to make such changes. 362 + * 363 + * Permissions restored with rdtgroup_locksetup_user_restore() 364 + * 365 + * Return: 0 on success, <0 on failure. If a failure occurs during the 366 + * restriction of access an attempt will be made to restore permissions but 367 + * the state of the mode of these files will be uncertain when a failure 368 + * occurs. 
369 + */ 370 + static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) 371 + { 372 + int ret; 373 + 374 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 375 + if (ret) 376 + return ret; 377 + 378 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 379 + if (ret) 380 + goto err_tasks; 381 + 382 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 383 + if (ret) 384 + goto err_cpus; 385 + 386 + if (resctrl_arch_mon_capable()) { 387 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); 388 + if (ret) 389 + goto err_cpus_list; 390 + } 391 + 392 + ret = 0; 393 + goto out; 394 + 395 + err_cpus_list: 396 + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 397 + err_cpus: 398 + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 399 + err_tasks: 400 + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 401 + out: 402 + return ret; 403 + } 404 + 405 + /** 406 + * rdtgroup_locksetup_user_restore - Restore user access to group 407 + * @rdtgrp: resource group needing access restored 408 + * 409 + * Restore all file access previously removed using 410 + * rdtgroup_locksetup_user_restrict() 411 + * 412 + * Return: 0 on success, <0 on failure. If a failure occurs during the 413 + * restoration of access an attempt will be made to restrict permissions 414 + * again but the state of the mode of these files will be uncertain when 415 + * a failure occurs. 
416 + */ 417 + static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) 418 + { 419 + int ret; 420 + 421 + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 422 + if (ret) 423 + return ret; 424 + 425 + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 426 + if (ret) 427 + goto err_tasks; 428 + 429 + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 430 + if (ret) 431 + goto err_cpus; 432 + 433 + if (resctrl_arch_mon_capable()) { 434 + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); 435 + if (ret) 436 + goto err_cpus_list; 437 + } 438 + 439 + ret = 0; 440 + goto out; 441 + 442 + err_cpus_list: 443 + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 444 + err_cpus: 445 + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 446 + err_tasks: 447 + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 448 + out: 449 + return ret; 450 + } 451 + 452 + /** 453 + * rdtgroup_locksetup_enter - Resource group enters locksetup mode 454 + * @rdtgrp: resource group requested to enter locksetup mode 455 + * 456 + * A resource group enters locksetup mode to reflect that it would be used 457 + * to represent a pseudo-locked region and is in the process of being set 458 + * up to do so. A resource group used for a pseudo-locked region would 459 + * lose the closid associated with it so we cannot allow it to have any 460 + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the 461 + * future. Monitoring of a pseudo-locked region is not allowed either. 462 + * 463 + * The above and more restrictions on a pseudo-locked region are checked 464 + * for and enforced before the resource group enters the locksetup mode. 465 + * 466 + * Returns: 0 if the resource group successfully entered locksetup mode, <0 467 + * on failure. On failure the last_cmd_status buffer is updated with text to 468 + * communicate details of failure to the user. 
469 + */ 470 + int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 471 + { 472 + int ret; 473 + 474 + /* 475 + * The default resource group can neither be removed nor lose the 476 + * default closid associated with it. 477 + */ 478 + if (rdtgrp == &rdtgroup_default) { 479 + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); 480 + return -EINVAL; 481 + } 482 + 483 + /* 484 + * Cache Pseudo-locking not supported when CDP is enabled. 485 + * 486 + * Some things to consider if you would like to enable this 487 + * support (using L3 CDP as example): 488 + * - When CDP is enabled two separate resources are exposed, 489 + * L3DATA and L3CODE, but they are actually on the same cache. 490 + * The implication for pseudo-locking is that if a 491 + * pseudo-locked region is created on a domain of one 492 + * resource (eg. L3CODE), then a pseudo-locked region cannot 493 + * be created on that same domain of the other resource 494 + * (eg. L3DATA). This is because the creation of a 495 + * pseudo-locked region involves a call to wbinvd that will 496 + * affect all cache allocations on particular domain. 497 + * - Considering the previous, it may be possible to only 498 + * expose one of the CDP resources to pseudo-locking and 499 + * hide the other. For example, we could consider to only 500 + * expose L3DATA and since the L3 cache is unified it is 501 + * still possible to place instructions there and execute it. 502 + * - If only one region is exposed to pseudo-locking we should 503 + * still keep in mind that availability of a portion of cache 504 + * for pseudo-locking should take into account both resources. 505 + * Similarly, if a pseudo-locked region is created in one 506 + * resource, the portion of cache used by it should be made 507 + * unavailable to all future allocations from both resources. 
508 + */ 509 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || 510 + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { 511 + rdt_last_cmd_puts("CDP enabled\n"); 512 + return -EINVAL; 513 + } 514 + 515 + /* 516 + * Not knowing the bits to disable prefetching implies that this 517 + * platform does not support Cache Pseudo-Locking. 518 + */ 519 + if (resctrl_arch_get_prefetch_disable_bits() == 0) { 520 + rdt_last_cmd_puts("Pseudo-locking not supported\n"); 521 + return -EINVAL; 522 + } 523 + 524 + if (rdtgroup_monitor_in_progress(rdtgrp)) { 525 + rdt_last_cmd_puts("Monitoring in progress\n"); 526 + return -EINVAL; 527 + } 528 + 529 + if (rdtgroup_tasks_assigned(rdtgrp)) { 530 + rdt_last_cmd_puts("Tasks assigned to resource group\n"); 531 + return -EINVAL; 532 + } 533 + 534 + if (!cpumask_empty(&rdtgrp->cpu_mask)) { 535 + rdt_last_cmd_puts("CPUs assigned to resource group\n"); 536 + return -EINVAL; 537 + } 538 + 539 + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { 540 + rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); 541 + return -EIO; 542 + } 543 + 544 + ret = pseudo_lock_init(rdtgrp); 545 + if (ret) { 546 + rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); 547 + goto out_release; 548 + } 549 + 550 + /* 551 + * If this system is capable of monitoring a rmid would have been 552 + * allocated when the control group was created. This is not needed 553 + * anymore when this group would be used for pseudo-locking. This 554 + * is safe to call on platforms not capable of monitoring. 555 + */ 556 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 557 + 558 + ret = 0; 559 + goto out; 560 + 561 + out_release: 562 + rdtgroup_locksetup_user_restore(rdtgrp); 563 + out: 564 + return ret; 565 + } 566 + 567 + /** 568 + * rdtgroup_locksetup_exit - resource group exits locksetup mode 569 + * @rdtgrp: resource group 570 + * 571 + * When a resource group exits locksetup mode the earlier restrictions are 572 + * lifted. When monitoring is supported a new RMID is allocated first, replacing the one freed by rdtgroup_locksetup_enter(). 
573 + * 574 + * Return: 0 on success, <0 on failure 575 + */ 576 + int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 577 + { 578 + int ret; 579 + 580 + if (resctrl_arch_mon_capable()) { 581 + ret = alloc_rmid(rdtgrp->closid); 582 + if (ret < 0) { 583 + rdt_last_cmd_puts("Out of RMIDs\n"); 584 + return ret; 585 + } 586 + rdtgrp->mon.rmid = ret; 587 + } 588 + 589 + ret = rdtgroup_locksetup_user_restore(rdtgrp); 590 + if (ret) { 591 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 592 + return ret; 593 + } 594 + 595 + pseudo_lock_free(rdtgrp); 596 + return 0; 597 + } 598 + 599 + /** 600 + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked 601 + * @d: RDT domain 602 + * @cbm: CBM to test 603 + * 604 + * @d represents a cache instance and @cbm a capacity bitmask that is 605 + * considered for it. Determine if @cbm overlaps with any existing 606 + * pseudo-locked region on @d. 607 + * 608 + * @cbm is unsigned long, even if only 32 bits are used, to make the 609 + * bitmap functions work correctly. 610 + * 611 + * Return: true if @cbm overlaps with pseudo-locked region on @d, false 612 + * otherwise. 613 + */ 614 + bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 615 + { 616 + unsigned int cbm_len; 617 + unsigned long cbm_b; 618 + 619 + if (d->plr) { 620 + cbm_len = d->plr->s->res->cache.cbm_len; 621 + cbm_b = d->plr->cbm; 622 + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) 623 + return true; 624 + } 625 + return false; 626 + } 627 + 628 + /** 629 + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy 630 + * @d: RDT domain under test 631 + * 632 + * The setup of a pseudo-locked region affects all cache instances within 633 + * the hierarchy of the region. It is thus essential to know if any 634 + * pseudo-locked regions exist within a cache hierarchy to prevent any 635 + * attempts to create new pseudo-locked regions in the same hierarchy. 
636 + * 637 + * Return: true if a pseudo-locked region exists in the hierarchy of @d or 638 + * if it is not possible to test due to memory allocation issue, 639 + * false otherwise. 640 + */ 641 + bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 642 + { 643 + struct rdt_ctrl_domain *d_i; 644 + cpumask_var_t cpu_with_psl; 645 + struct rdt_resource *r; 646 + bool ret = false; 647 + 648 + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ 649 + lockdep_assert_cpus_held(); 650 + 651 + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) 652 + return true; 653 + 654 + /* 655 + * First determine which cpus have pseudo-locked regions 656 + * associated with them. 657 + */ 658 + for_each_alloc_capable_rdt_resource(r) { 659 + list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { 660 + if (d_i->plr) 661 + cpumask_or(cpu_with_psl, cpu_with_psl, 662 + &d_i->hdr.cpu_mask); 663 + } 664 + } 665 + 666 + /* 667 + * Next test if new pseudo-locked region would intersect with 668 + * existing region. 669 + */ 670 + if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) 671 + ret = true; 672 + 673 + free_cpumask_var(cpu_with_psl); 674 + return ret; 675 + } 676 + 677 + /** 678 + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region 679 + * @rdtgrp: Resource group to which the pseudo-locked region belongs. 680 + * @sel: Selector of which measurement to perform on a pseudo-locked region. 681 + * 682 + * The measurement of latency to access a pseudo-locked region should be 683 + * done from a cpu that is associated with that pseudo-locked region. 684 + * Determine which cpu is associated with this region and start a thread on 685 + * that cpu to perform the measurement, wait for that thread to complete. 
686 + * 687 + * Return: 0 on success, <0 on failure 688 + */ 689 + static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) 690 + { 691 + struct pseudo_lock_region *plr = rdtgrp->plr; 692 + struct task_struct *thread; 693 + unsigned int cpu; 694 + int ret = -1; /* returned unchanged if @sel is not 1, 2 or 3 (final else branch below) */ 695 + 696 + cpus_read_lock(); 697 + mutex_lock(&rdtgroup_mutex); 698 + 699 + if (rdtgrp->flags & RDT_DELETED) { 700 + ret = -ENODEV; 701 + goto out; 702 + } 703 + 704 + if (!plr->d) { 705 + ret = -ENODEV; 706 + goto out; 707 + } 708 + 709 + plr->thread_done = 0; 710 + cpu = cpumask_first(&plr->d->hdr.cpu_mask); 711 + if (!cpu_online(cpu)) { 712 + ret = -ENODEV; 713 + goto out; 714 + } 715 + 716 + plr->cpu = cpu; 717 + 718 + if (sel == 1) 719 + thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, 720 + plr, cpu, "pseudo_lock_measure/%u"); 721 + else if (sel == 2) 722 + thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, 723 + plr, cpu, "pseudo_lock_measure/%u"); 724 + else if (sel == 3) 725 + thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, 726 + plr, cpu, "pseudo_lock_measure/%u"); 727 + else 728 + goto out; 729 + 730 + if (IS_ERR(thread)) { 731 + ret = PTR_ERR(thread); 732 + goto out; 733 + } 734 + 735 + ret = wait_event_interruptible(plr->lock_thread_wq, 736 + plr->thread_done == 1); 737 + if (ret < 0) 738 + goto out; 739 + 740 + ret = 0; 741 + 742 + out: 743 + mutex_unlock(&rdtgroup_mutex); 744 + cpus_read_unlock(); 745 + return ret; 746 + } 747 + /* Write handler of pseudo_measure_fops: parses a decimal selector (1, 2 or 3) from user space and runs the matching measurement. */ 748 + static ssize_t pseudo_lock_measure_trigger(struct file *file, 749 + const char __user *user_buf, 750 + size_t count, loff_t *ppos) 751 + { 752 + struct rdtgroup *rdtgrp = file->private_data; 753 + size_t buf_size; 754 + char buf[32]; 755 + int ret; 756 + int sel; 757 + 758 + buf_size = min(count, (sizeof(buf) - 1)); 759 + if (copy_from_user(buf, user_buf, buf_size)) 760 + return -EFAULT; 761 + 762 + buf[buf_size] = '\0'; 763 + ret = kstrtoint(buf, 10, &sel); 764 + if (ret == 0) { 765 + if (sel != 1 && sel 
!= 2 && sel != 3) 766 + return -EINVAL; 767 + ret = debugfs_file_get(file->f_path.dentry); 768 + if (ret) 769 + return ret; 770 + ret = pseudo_lock_measure_cycles(rdtgrp, sel); 771 + if (ret == 0) /* success: report the whole write as consumed */ 772 + ret = count; 773 + debugfs_file_put(file->f_path.dentry); 774 + } 775 + 776 + return ret; 777 + } 778 + 779 + static const struct file_operations pseudo_measure_fops = { 780 + .write = pseudo_lock_measure_trigger, 781 + .open = simple_open, 782 + .llseek = default_llseek, 783 + }; 784 + 785 + /** 786 + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region 787 + * @rdtgrp: resource group to which pseudo-lock region belongs 788 + * 789 + * Called when a resource group in the pseudo-locksetup mode receives a 790 + * valid schemata that should be pseudo-locked. Since the resource group is 791 + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been 792 + * allocated and initialized with the essential information. If a failure 793 + * occurs the resource group remains in the pseudo-locksetup mode with the 794 + * &struct pseudo_lock_region associated with it, but cleared from all 795 + * information and ready for the user to re-attempt pseudo-locking by 796 + * writing the schemata again. 797 + * 798 + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 799 + * on failure. Descriptive error will be written to last_cmd_status buffer. 
800 + */ 801 + int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 802 + { 803 + struct pseudo_lock_region *plr = rdtgrp->plr; 804 + struct task_struct *thread; 805 + unsigned int new_minor; 806 + struct device *dev; 807 + char *kn_name __free(kfree) = NULL; 808 + int ret; 809 + 810 + ret = pseudo_lock_region_alloc(plr); 811 + if (ret < 0) 812 + return ret; 813 + 814 + ret = pseudo_lock_cstates_constrain(plr); 815 + if (ret < 0) { 816 + ret = -EINVAL; 817 + goto out_region; 818 + } 819 + /* Private copy of the group name, used below to name the debugfs directory and the character device. */ kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); 820 + if (!kn_name) { 821 + ret = -ENOMEM; 822 + goto out_cstates; 823 + } 824 + 825 + plr->thread_done = 0; 826 + 827 + thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, 828 + plr->cpu, "pseudo_lock/%u"); 829 + if (IS_ERR(thread)) { 830 + ret = PTR_ERR(thread); 831 + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); 832 + goto out_cstates; 833 + } 834 + 835 + ret = wait_event_interruptible(plr->lock_thread_wq, 836 + plr->thread_done == 1); 837 + if (ret < 0) { 838 + /* 839 + * If the thread does not get on the CPU for whatever 840 + * reason and the process which sets up the region is 841 + * interrupted then this will leave the thread in runnable 842 + * state and once it gets on the CPU it will dereference 843 + * the cleared, but not freed, plr struct resulting in an 844 + * empty pseudo-locking loop. 845 + */ 846 + rdt_last_cmd_puts("Locking thread interrupted\n"); 847 + goto out_cstates; 848 + } 849 + 850 + ret = pseudo_lock_minor_get(&new_minor); 851 + if (ret < 0) { 852 + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); 853 + goto out_cstates; 854 + } 855 + 856 + /* 857 + * Unlock access but do not release the reference. The 858 + * pseudo-locked region will still be here on return. 
859 + * 860 + * The mutex has to be released temporarily to avoid a potential 861 + * deadlock with the mm->mmap_lock which is obtained in the 862 + * device_create() and debugfs_create_dir() callpath below as well as 863 + * before the mmap() callback is called. 864 + */ 865 + mutex_unlock(&rdtgroup_mutex); 866 + 867 + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { 868 + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); 869 + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) 870 + debugfs_create_file("pseudo_lock_measure", 0200, 871 + plr->debugfs_dir, rdtgrp, 872 + &pseudo_measure_fops); 873 + } 874 + 875 + dev = device_create(&pseudo_lock_class, NULL, 876 + MKDEV(pseudo_lock_major, new_minor), 877 + rdtgrp, "%s", kn_name); 878 + 879 + mutex_lock(&rdtgroup_mutex); 880 + 881 + if (IS_ERR(dev)) { 882 + ret = PTR_ERR(dev); 883 + rdt_last_cmd_printf("Failed to create character device: %d\n", 884 + ret); 885 + goto out_debugfs; 886 + } 887 + 888 + /* We released the mutex - check if group was removed while we did so */ 889 + if (rdtgrp->flags & RDT_DELETED) { 890 + ret = -ENODEV; 891 + goto out_device; 892 + } 893 + 894 + plr->minor = new_minor; 895 + 896 + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; /* the group is pseudo-locked from here on and gives up its closid */ 897 + closid_free(rdtgrp->closid); 898 + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); 899 + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); 900 + 901 + ret = 0; 902 + goto out; 903 + /* Error unwind: each label undoes one setup step and falls through to the labels for the earlier steps. */ 904 + out_device: 905 + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); 906 + out_debugfs: 907 + debugfs_remove_recursive(plr->debugfs_dir); 908 + pseudo_lock_minor_release(new_minor); 909 + out_cstates: 910 + pseudo_lock_cstates_relax(plr); 911 + out_region: 912 + pseudo_lock_region_clear(plr); 913 + out: 914 + return ret; 915 + } 916 + 917 + /** 918 + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region 919 + * @rdtgrp: resource group to which the pseudo-locked region belongs 920 + * 921 + * The removal of a pseudo-locked region can be initiated when the 
resource 922 + * group is removed from user space via a "rmdir" from userspace or the 923 + * unmount of the resctrl filesystem. On removal the resource group does 924 + * not go back to pseudo-locksetup mode before it is removed, instead it is 925 + * removed directly. There is thus asymmetry with the creation where the 926 + * &struct pseudo_lock_region is removed here while it was not created in 927 + * rdtgroup_pseudo_lock_create(). 928 + * 929 + * Return: void 930 + */ 931 + void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) 932 + { 933 + struct pseudo_lock_region *plr = rdtgrp->plr; 934 + 935 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 936 + /* 937 + * Default group cannot be a pseudo-locked region so we can 938 + * free closid here. 939 + */ 940 + closid_free(rdtgrp->closid); 941 + goto free; 942 + } 943 + 944 + pseudo_lock_cstates_relax(plr); 945 + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); 946 + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); 947 + pseudo_lock_minor_release(plr->minor); 948 + 949 + free: 950 + pseudo_lock_free(rdtgrp); 951 + } 952 + 953 + static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) 954 + { 955 + struct rdtgroup *rdtgrp; 956 + 957 + mutex_lock(&rdtgroup_mutex); 958 + 959 + rdtgrp = region_find_by_minor(iminor(inode)); 960 + if (!rdtgrp) { 961 + mutex_unlock(&rdtgroup_mutex); 962 + return -ENODEV; 963 + } 964 + 965 + filp->private_data = rdtgrp; /* the waitcount taken next is dropped again in pseudo_lock_dev_release() */ 966 + atomic_inc(&rdtgrp->waitcount); 967 + /* Perform a non-seekable open - llseek is not supported */ 968 + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 969 + 970 + mutex_unlock(&rdtgroup_mutex); 971 + 972 + return 0; 973 + } 974 + 975 + static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) 976 + { 977 + struct rdtgroup *rdtgrp; 978 + 979 + mutex_lock(&rdtgroup_mutex); 980 + rdtgrp = filp->private_data; 981 + WARN_ON(!rdtgrp); 982 + if (!rdtgrp) { 983 + mutex_unlock(&rdtgroup_mutex); 
984 + return -ENODEV; 985 + } 986 + filp->private_data = NULL; 987 + atomic_dec(&rdtgrp->waitcount); 988 + mutex_unlock(&rdtgroup_mutex); 989 + return 0; 990 + } 991 + 992 + static int pseudo_lock_dev_mremap(struct vm_area_struct *area) 993 + { 994 + /* Not supported */ 995 + return -EINVAL; 996 + } 997 + 998 + static const struct vm_operations_struct pseudo_mmap_ops = { 999 + .mremap = pseudo_lock_dev_mremap, 1000 + }; 1001 + 1002 + static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) 1003 + { 1004 + unsigned long vsize = vma->vm_end - vma->vm_start; 1005 + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 1006 + struct pseudo_lock_region *plr; 1007 + struct rdtgroup *rdtgrp; 1008 + unsigned long physical; 1009 + unsigned long psize; 1010 + 1011 + mutex_lock(&rdtgroup_mutex); 1012 + 1013 + rdtgrp = filp->private_data; 1014 + WARN_ON(!rdtgrp); 1015 + if (!rdtgrp) { 1016 + mutex_unlock(&rdtgroup_mutex); 1017 + return -ENODEV; 1018 + } 1019 + 1020 + plr = rdtgrp->plr; 1021 + 1022 + if (!plr->d) { 1023 + mutex_unlock(&rdtgroup_mutex); 1024 + return -ENODEV; 1025 + } 1026 + 1027 + /* 1028 + * Task is required to run with affinity to the cpus associated 1029 + * with the pseudo-locked region. If this is not the case the task 1030 + * may be scheduled elsewhere and invalidate entries in the 1031 + * pseudo-locked region. 1032 + */ 1033 + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { 1034 + mutex_unlock(&rdtgroup_mutex); 1035 + return -EINVAL; 1036 + } 1037 + 1038 + physical = __pa(plr->kmem) >> PAGE_SHIFT; 1039 + psize = plr->size - off; /* psize is only used after the off > plr->size check below has passed */ 1040 + 1041 + if (off > plr->size) { 1042 + mutex_unlock(&rdtgroup_mutex); 1043 + return -ENOSPC; 1044 + } 1045 + 1046 + /* 1047 + * Ensure changes are carried directly to the memory being mapped, 1048 + * do not allow copy-on-write mapping. 
1049 + */ 1050 + if (!(vma->vm_flags & VM_SHARED)) { 1051 + mutex_unlock(&rdtgroup_mutex); 1052 + return -EINVAL; 1053 + } 1054 + 1055 + if (vsize > psize) { 1056 + mutex_unlock(&rdtgroup_mutex); 1057 + return -ENOSPC; 1058 + } 1059 + /* Zero the part of the region that is about to be mapped. */ 1060 + memset(plr->kmem + off, 0, vsize); 1061 + 1062 + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, 1063 + vsize, vma->vm_page_prot)) { 1064 + mutex_unlock(&rdtgroup_mutex); 1065 + return -EAGAIN; 1066 + } 1067 + vma->vm_ops = &pseudo_mmap_ops; 1068 + mutex_unlock(&rdtgroup_mutex); 1069 + return 0; 1070 + } 1071 + 1072 + static const struct file_operations pseudo_lock_dev_fops = { 1073 + .owner = THIS_MODULE, 1074 + .read = NULL, 1075 + .write = NULL, 1076 + .open = pseudo_lock_dev_open, 1077 + .release = pseudo_lock_dev_release, 1078 + .mmap = pseudo_lock_dev_mmap, 1079 + }; 1080 + /* Register the "pseudo_lock" character device, keep the returned major in pseudo_lock_major, then register the class; the chrdev is unregistered again if the class fails. */ 1081 + int rdt_pseudo_lock_init(void) 1082 + { 1083 + int ret; 1084 + 1085 + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); 1086 + if (ret < 0) 1087 + return ret; 1088 + 1089 + pseudo_lock_major = ret; 1090 + 1091 + ret = class_register(&pseudo_lock_class); 1092 + if (ret) { 1093 + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 1094 + return ret; 1095 + } 1096 + 1097 + return 0; 1098 + } 1099 + 1100 + void rdt_pseudo_lock_release(void) 1101 + { 1102 + class_unregister(&pseudo_lock_class); 1103 + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 1104 + pseudo_lock_major = 0; 1105 + }

+4353

fs/resctrl/rdtgroup.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * User interface for Resource Allocation in Resource Director Technology (RDT) 4 + * 5 + * Copyright (C) 2016 Intel Corporation 6 + * 7 + * Author: Fenghua Yu <fenghua.yu@intel.com> 8 + * 9 + * More information about RDT can be found in the Intel (R) x86 Architecture 10 + * Software Developer Manual. 11 + */ 12 + 13 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 + 15 + #include <linux/cpu.h> 16 + #include <linux/debugfs.h> 17 + #include <linux/fs.h> 18 + #include <linux/fs_parser.h> 19 + #include <linux/sysfs.h> 20 + #include <linux/kernfs.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/seq_buf.h> 23 + #include <linux/seq_file.h> 24 + #include <linux/sched/task.h> 25 + #include <linux/slab.h> 26 + #include <linux/user_namespace.h> 27 + 28 + #include <uapi/linux/magic.h> 29 + 30 + #include "internal.h" 31 + 32 + /* Mutex to protect rdtgroup access. */ 33 + DEFINE_MUTEX(rdtgroup_mutex); 34 + 35 + static struct kernfs_root *rdt_root; 36 + 37 + struct rdtgroup rdtgroup_default; 38 + 39 + LIST_HEAD(rdt_all_groups); 40 + 41 + /* list of entries for the schemata file */ 42 + LIST_HEAD(resctrl_schema_all); 43 + 44 + /* 45 + * List of struct mon_data containing private data of event files for use by 46 + * rdtgroup_mondata_show(). Protected by rdtgroup_mutex. 47 + */ 48 + static LIST_HEAD(mon_data_kn_priv_list); 49 + 50 + /* The filesystem can only be mounted once. */ 51 + bool resctrl_mounted; 52 + 53 + /* Kernel fs node for "info" directory under root */ 54 + static struct kernfs_node *kn_info; 55 + 56 + /* Kernel fs node for "mon_groups" directory under root */ 57 + static struct kernfs_node *kn_mongrp; 58 + 59 + /* Kernel fs node for "mon_data" directory under root */ 60 + static struct kernfs_node *kn_mondata; 61 + 62 + /* 63 + * Used to store the max resource name width to display the schemata names in 64 + * a tabular format. 
65 + */ 66 + int max_name_width; 67 + 68 + static struct seq_buf last_cmd_status; 69 + 70 + static char last_cmd_status_buf[512]; 71 + 72 + static int rdtgroup_setup_root(struct rdt_fs_context *ctx); 73 + 74 + static void rdtgroup_destroy_root(void); 75 + 76 + struct dentry *debugfs_resctrl; 77 + 78 + /* 79 + * Memory bandwidth monitoring event to use for the default CTRL_MON group 80 + * and each new CTRL_MON group created by the user. Only relevant when 81 + * the filesystem is mounted with the "mba_MBps" option so it does not 82 + * matter that it remains uninitialized on systems that do not support 83 + * the "mba_MBps" option. 84 + */ 85 + enum resctrl_event_id mba_mbps_default_event; 86 + 87 + static bool resctrl_debug; 88 + /* Empty the last_cmd_status buffer. Caller must hold rdtgroup_mutex. */ 89 + void rdt_last_cmd_clear(void) 90 + { 91 + lockdep_assert_held(&rdtgroup_mutex); 92 + seq_buf_clear(&last_cmd_status); 93 + } 94 + /* Append string @s to the last_cmd_status buffer. Caller must hold rdtgroup_mutex. */ 95 + void rdt_last_cmd_puts(const char *s) 96 + { 97 + lockdep_assert_held(&rdtgroup_mutex); 98 + seq_buf_puts(&last_cmd_status, s); 99 + } 100 + /* Append printf-style formatted text to the last_cmd_status buffer. Caller must hold rdtgroup_mutex. */ 101 + void rdt_last_cmd_printf(const char *fmt, ...) 102 + { 103 + va_list ap; 104 + 105 + va_start(ap, fmt); 106 + lockdep_assert_held(&rdtgroup_mutex); 107 + seq_buf_vprintf(&last_cmd_status, fmt, ap); 108 + va_end(ap); 109 + } 110 + /* Zero the staged_config of every control domain of every alloc capable resource. Caller must hold rdtgroup_mutex. */ 111 + void rdt_staged_configs_clear(void) 112 + { 113 + struct rdt_ctrl_domain *dom; 114 + struct rdt_resource *r; 115 + 116 + lockdep_assert_held(&rdtgroup_mutex); 117 + 118 + for_each_alloc_capable_rdt_resource(r) { 119 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) 120 + memset(dom->staged_config, 0, sizeof(dom->staged_config)); 121 + } 122 + } 123 + 124 + static bool resctrl_is_mbm_enabled(void) 125 + { 126 + return (resctrl_arch_is_mbm_total_enabled() || 127 + resctrl_arch_is_mbm_local_enabled()); 128 + } 129 + /* True when @e lies in the inclusive range QOS_L3_MBM_TOTAL_EVENT_ID..QOS_L3_MBM_LOCAL_EVENT_ID. */ 130 + static bool resctrl_is_mbm_event(int e) 131 + { 132 + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && 133 + e <= QOS_L3_MBM_LOCAL_EVENT_ID); 134 + } 135 + 136 + /* 137 + * Trivial allocator for CLOSIDs. 
Use BITMAP APIs to manipulate a bitmap 138 + * of free CLOSIDs. 139 + * 140 + * Using a global CLOSID across all resources has some advantages and 141 + * some drawbacks: 142 + * + We can simply set current's closid to assign a task to a resource 143 + * group. 144 + * + Context switch code can avoid extra memory references deciding which 145 + * CLOSID to load into the PQR_ASSOC MSR 146 + * - We give up some options in configuring resource groups across multi-socket 147 + * systems. 148 + * - Our choices on how to configure each resource become progressively more 149 + * limited as the number of resources grows. 150 + */ 151 + static unsigned long *closid_free_map; 152 + 153 + static int closid_free_map_len; 154 + 155 + int closids_supported(void) 156 + { 157 + return closid_free_map_len; 158 + } 159 + 160 + static int closid_init(void) 161 + { 162 + struct resctrl_schema *s; 163 + u32 rdt_min_closid = ~0; 164 + 165 + /* Monitor only platforms still call closid_init() */ 166 + if (list_empty(&resctrl_schema_all)) 167 + return 0; 168 + 169 + /* Compute rdt_min_closid, the smallest num_closid across all schemata */ 170 + list_for_each_entry(s, &resctrl_schema_all, list) 171 + rdt_min_closid = min(rdt_min_closid, s->num_closid); 172 + 173 + closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL); 174 + if (!closid_free_map) 175 + return -ENOMEM; 176 + bitmap_fill(closid_free_map, rdt_min_closid); 177 + 178 + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ 179 + __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map); 180 + closid_free_map_len = rdt_min_closid; 181 + 182 + return 0; 183 + } 184 + 185 + static void closid_exit(void) 186 + { 187 + bitmap_free(closid_free_map); 188 + closid_free_map = NULL; 189 + } 190 + 191 + static int closid_alloc(void) 192 + { 193 + int cleanest_closid; 194 + u32 closid; 195 + 196 + lockdep_assert_held(&rdtgroup_mutex); 197 + 198 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && 199 + resctrl_arch_is_llc_occupancy_enabled()) 
{ 200 + cleanest_closid = resctrl_find_cleanest_closid(); 201 + if (cleanest_closid < 0) 202 + return cleanest_closid; 203 + closid = cleanest_closid; 204 + } else { 205 + closid = find_first_bit(closid_free_map, closid_free_map_len); 206 + if (closid == closid_free_map_len) 207 + return -ENOSPC; 208 + } 209 + __clear_bit(closid, closid_free_map); 210 + 211 + return closid; 212 + } 213 + 214 + void closid_free(int closid) 215 + { 216 + lockdep_assert_held(&rdtgroup_mutex); 217 + 218 + __set_bit(closid, closid_free_map); 219 + } 220 + 221 + /** 222 + * closid_allocated - test if provided closid is in use 223 + * @closid: closid to be tested 224 + * 225 + * Return: true if @closid is currently associated with a resource group, 226 + * false if @closid is free 227 + */ 228 + bool closid_allocated(unsigned int closid) 229 + { 230 + lockdep_assert_held(&rdtgroup_mutex); 231 + 232 + return !test_bit(closid, closid_free_map); 233 + } 234 + 235 + /** 236 + * rdtgroup_mode_by_closid - Return mode of resource group with closid 237 + * @closid: closid of the resource group 238 + * 239 + * Each resource group is associated with a @closid. Here the mode 240 + * of a resource group can be queried by searching for it using its closid. 
241 + * 242 + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid, or RDT_NUM_MODES if no resource group uses @closid 243 + */ 244 + enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) 245 + { 246 + struct rdtgroup *rdtgrp; 247 + 248 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 249 + if (rdtgrp->closid == closid) 250 + return rdtgrp->mode; 251 + } 252 + 253 + return RDT_NUM_MODES; 254 + } 255 + 256 + static const char * const rdt_mode_str[] = { 257 + [RDT_MODE_SHAREABLE] = "shareable", 258 + [RDT_MODE_EXCLUSIVE] = "exclusive", 259 + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", 260 + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", 261 + }; 262 + 263 + /** 264 + * rdtgroup_mode_str - Return the string representation of mode 265 + * @mode: the resource group mode as &enum rdtgrp_mode 266 + * 267 + * Return: string representation of valid mode, "unknown" otherwise 268 + */ 269 + static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) 270 + { 271 + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) 272 + return "unknown"; 273 + 274 + return rdt_mode_str[mode]; 275 + } 276 + 277 + /* set uid and gid of rdtgroup dirs and files to that of the creator (nothing to do when both are root) */ 278 + static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) 279 + { 280 + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 281 + .ia_uid = current_fsuid(), 282 + .ia_gid = current_fsgid(), }; 283 + 284 + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 285 + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 286 + return 0; 287 + 288 + return kernfs_setattr(kn, &iattr); 289 + } 290 + 291 + static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) 292 + { 293 + struct kernfs_node *kn; 294 + int ret; 295 + 296 + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, 297 + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 298 + 0, rft->kf_ops, rft, NULL, NULL); 299 + if (IS_ERR(kn)) 300 + return PTR_ERR(kn); 301 + 302 + ret = rdtgroup_kn_set_ugid(kn); 303 + if (ret) { 304 + kernfs_remove(kn); 305 + return ret; 306 + } 307 
+ 308 + return 0; 309 + } 310 + 311 + static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 312 + { 313 + struct kernfs_open_file *of = m->private; 314 + struct rftype *rft = of->kn->priv; 315 + 316 + if (rft->seq_show) 317 + return rft->seq_show(of, m, arg); 318 + return 0; 319 + } 320 + 321 + static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, 322 + size_t nbytes, loff_t off) 323 + { 324 + struct rftype *rft = of->kn->priv; 325 + 326 + if (rft->write) 327 + return rft->write(of, buf, nbytes, off); 328 + 329 + return -EINVAL; 330 + } 331 + 332 + static const struct kernfs_ops rdtgroup_kf_single_ops = { 333 + .atomic_write_len = PAGE_SIZE, 334 + .write = rdtgroup_file_write, 335 + .seq_show = rdtgroup_seqfile_show, 336 + }; 337 + 338 + static const struct kernfs_ops kf_mondata_ops = { 339 + .atomic_write_len = PAGE_SIZE, 340 + .seq_show = rdtgroup_mondata_show, 341 + }; 342 + 343 + static bool is_cpu_list(struct kernfs_open_file *of) 344 + { 345 + struct rftype *rft = of->kn->priv; 346 + 347 + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; 348 + } 349 + 350 + static int rdtgroup_cpus_show(struct kernfs_open_file *of, 351 + struct seq_file *s, void *v) 352 + { 353 + struct rdtgroup *rdtgrp; 354 + struct cpumask *mask; 355 + int ret = 0; 356 + 357 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 358 + 359 + if (rdtgrp) { 360 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 361 + if (!rdtgrp->plr->d) { 362 + rdt_last_cmd_clear(); 363 + rdt_last_cmd_puts("Cache domain offline\n"); 364 + ret = -ENODEV; 365 + } else { 366 + mask = &rdtgrp->plr->d->hdr.cpu_mask; 367 + seq_printf(s, is_cpu_list(of) ? 368 + "%*pbl\n" : "%*pb\n", 369 + cpumask_pr_args(mask)); 370 + } 371 + } else { 372 + seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", 373 + cpumask_pr_args(&rdtgrp->cpu_mask)); 374 + } 375 + } else { 376 + ret = -ENOENT; 377 + } 378 + rdtgroup_kn_unlock(of->kn); 379 + 380 + return ret; 381 + } 382 + 383 + /* 384 + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 385 + * 386 + * Per task closids/rmids must have been set up before calling this function. 387 + * @r may be NULL. 388 + */ 389 + static void 390 + update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) 391 + { 392 + struct resctrl_cpu_defaults defaults, *p = NULL; 393 + 394 + if (r) { 395 + defaults.closid = r->closid; 396 + defaults.rmid = r->mon.rmid; 397 + p = &defaults; 398 + } 399 + 400 + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); 401 + } 402 + 403 + static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 404 + cpumask_var_t tmpmask) 405 + { 406 + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; 407 + struct list_head *head; 408 + 409 + /* Check whether cpus belong to parent ctrl group */ 410 + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); 411 + if (!cpumask_empty(tmpmask)) { 412 + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); 413 + return -EINVAL; 414 + } 415 + 416 + /* Check whether cpus are dropped from this group */ 417 + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 418 + if (!cpumask_empty(tmpmask)) { 419 + /* Give any dropped cpus to parent rdtgroup */ 420 + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); 421 + update_closid_rmid(tmpmask, prgrp); 422 + } 423 + 424 + /* 425 + * If we added cpus, remove them from previous group that owned them 426 + * and update per-cpu rmid 427 + */ 428 + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 429 + if (!cpumask_empty(tmpmask)) { 430 + head = &prgrp->mon.crdtgrp_list; 431 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 432 + if (crgrp == rdtgrp) 433 + continue; 434 + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, 435 + tmpmask); 
436 + } 437 + update_closid_rmid(tmpmask, rdtgrp); 438 + } 439 + 440 + /* Done pushing/pulling - update this group with new mask */ 441 + cpumask_copy(&rdtgrp->cpu_mask, newmask); 442 + 443 + return 0; 444 + } 445 + 446 + static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) 447 + { 448 + struct rdtgroup *crgrp; 449 + 450 + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); 451 + /* update the child mon group masks as well*/ 452 + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) 453 + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); 454 + } 455 + 456 + static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 457 + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) 458 + { 459 + struct rdtgroup *r, *crgrp; 460 + struct list_head *head; 461 + 462 + /* Check whether cpus are dropped from this group */ 463 + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 464 + if (!cpumask_empty(tmpmask)) { 465 + /* Can't drop from default group */ 466 + if (rdtgrp == &rdtgroup_default) { 467 + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); 468 + return -EINVAL; 469 + } 470 + 471 + /* Give any dropped cpus to rdtgroup_default */ 472 + cpumask_or(&rdtgroup_default.cpu_mask, 473 + &rdtgroup_default.cpu_mask, tmpmask); 474 + update_closid_rmid(tmpmask, &rdtgroup_default); 475 + } 476 + 477 + /* 478 + * If we added cpus, remove them from previous group and 479 + * the prev group's child groups that owned them 480 + * and update per-cpu closid/rmid. 
481 + */ 482 + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 483 + if (!cpumask_empty(tmpmask)) { 484 + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { 485 + if (r == rdtgrp) 486 + continue; 487 + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); 488 + if (!cpumask_empty(tmpmask1)) 489 + cpumask_rdtgrp_clear(r, tmpmask1); 490 + } 491 + update_closid_rmid(tmpmask, rdtgrp); 492 + } 493 + 494 + /* Done pushing/pulling - update this group with new mask */ 495 + cpumask_copy(&rdtgrp->cpu_mask, newmask); 496 + 497 + /* 498 + * Clear child mon group masks since there is a new parent mask 499 + * now and update the rmid for the cpus the child lost. 500 + */ 501 + head = &rdtgrp->mon.crdtgrp_list; 502 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 503 + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); 504 + update_closid_rmid(tmpmask, rdtgrp); 505 + cpumask_clear(&crgrp->cpu_mask); 506 + } 507 + 508 + return 0; 509 + } 510 + 511 + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 512 + char *buf, size_t nbytes, loff_t off) 513 + { 514 + cpumask_var_t tmpmask, newmask, tmpmask1; 515 + struct rdtgroup *rdtgrp; 516 + int ret; 517 + 518 + if (!buf) 519 + return -EINVAL; 520 + 521 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 522 + return -ENOMEM; 523 + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { 524 + free_cpumask_var(tmpmask); 525 + return -ENOMEM; 526 + } 527 + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { 528 + free_cpumask_var(tmpmask); 529 + free_cpumask_var(newmask); 530 + return -ENOMEM; 531 + } 532 + 533 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 534 + if (!rdtgrp) { 535 + ret = -ENOENT; 536 + goto unlock; 537 + } 538 + rdt_last_cmd_clear(); /* drop any stale status left by an earlier command */ 539 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 540 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 541 + ret = -EINVAL; 542 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 543 + goto unlock; 544 + } 545 + 546 + if (is_cpu_list(of)) 547 + ret = cpulist_parse(buf, newmask); 548 + else 549 + 
ret = cpumask_parse(buf, newmask); 550 + 551 + if (ret) { 552 + rdt_last_cmd_puts("Bad CPU list/mask\n"); 553 + goto unlock; 554 + } 555 + 556 + /* check that user didn't specify any offline cpus */ 557 + cpumask_andnot(tmpmask, newmask, cpu_online_mask); 558 + if (!cpumask_empty(tmpmask)) { 559 + ret = -EINVAL; 560 + rdt_last_cmd_puts("Can only assign online CPUs\n"); 561 + goto unlock; 562 + } 563 + 564 + if (rdtgrp->type == RDTCTRL_GROUP) 565 + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); 566 + else if (rdtgrp->type == RDTMON_GROUP) 567 + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); 568 + else 569 + ret = -EINVAL; 570 + 571 + unlock: 572 + rdtgroup_kn_unlock(of->kn); 573 + free_cpumask_var(tmpmask); 574 + free_cpumask_var(newmask); 575 + free_cpumask_var(tmpmask1); 576 + 577 + return ret ?: nbytes; 578 + } 579 + 580 + /** 581 + * rdtgroup_remove - the helper to remove resource group safely 582 + * @rdtgrp: resource group to remove 583 + * 584 + * On resource group creation via a mkdir, an extra kernfs_node reference is 585 + * taken to ensure that the rdtgroup structure remains accessible for the 586 + * rdtgroup_kn_unlock() calls where it is removed. 587 + * 588 + * Drop the extra reference here, then free the rdtgroup structure. 589 + * 590 + * Return: void 591 + */ 592 + static void rdtgroup_remove(struct rdtgroup *rdtgrp) 593 + { 594 + kernfs_put(rdtgrp->kn); 595 + kfree(rdtgrp); 596 + } 597 + 598 + static void _update_task_closid_rmid(void *task) 599 + { 600 + /* 601 + * If the task is still current on this CPU, update PQR_ASSOC MSR. 602 + * Otherwise, the MSR is updated when the task is scheduled in. 
603 + */ 604 + if (task == current) 605 + resctrl_arch_sched_in(task); 606 + } 607 + 608 + static void update_task_closid_rmid(struct task_struct *t) 609 + { 610 + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) 611 + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); 612 + else 613 + _update_task_closid_rmid(t); 614 + } 615 + 616 + static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) 617 + { 618 + u32 closid, rmid = rdtgrp->mon.rmid; 619 + 620 + if (rdtgrp->type == RDTCTRL_GROUP) 621 + closid = rdtgrp->closid; 622 + else if (rdtgrp->type == RDTMON_GROUP) 623 + closid = rdtgrp->mon.parent->closid; 624 + else 625 + return false; 626 + 627 + return resctrl_arch_match_closid(tsk, closid) && 628 + resctrl_arch_match_rmid(tsk, closid, rmid); 629 + } 630 + 631 + static int __rdtgroup_move_task(struct task_struct *tsk, 632 + struct rdtgroup *rdtgrp) 633 + { 634 + /* If the task is already in rdtgrp, no need to move the task. */ 635 + if (task_in_rdtgroup(tsk, rdtgrp)) 636 + return 0; 637 + 638 + /* 639 + * Set the task's closid/rmid before the PQR_ASSOC MSR can be 640 + * updated by them. 641 + * 642 + * For ctrl_mon groups, move both closid and rmid. 643 + * For monitor groups, can move the tasks only from 644 + * their parent CTRL group. 645 + */ 646 + if (rdtgrp->type == RDTMON_GROUP && 647 + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { 648 + rdt_last_cmd_puts("Can't move task to different control group\n"); 649 + return -EINVAL; 650 + } 651 + 652 + if (rdtgrp->type == RDTMON_GROUP) 653 + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, 654 + rdtgrp->mon.rmid); 655 + else 656 + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, 657 + rdtgrp->mon.rmid); 658 + 659 + /* 660 + * Ensure the task's closid and rmid are written before determining if 661 + * the task is current that will decide if it will be interrupted. 
662 + * This pairs with the full barrier between the rq->curr update and 663 + * resctrl_arch_sched_in() during context switch. 664 + */ 665 + smp_mb(); 666 + 667 + /* 668 + * By now, the task's closid and rmid are set. If the task is current 669 + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource 670 + * group go into effect. If the task is not current, the MSR will be 671 + * updated when the task is scheduled in. 672 + */ 673 + update_task_closid_rmid(tsk); 674 + 675 + return 0; 676 + } 677 + 678 + static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) 679 + { 680 + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && 681 + resctrl_arch_match_closid(t, r->closid)); 682 + } 683 + 684 + static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) 685 + { 686 + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && 687 + resctrl_arch_match_rmid(t, r->mon.parent->closid, 688 + r->mon.rmid)); 689 + } 690 + 691 + /** 692 + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group 693 + * @r: Resource group 694 + * 695 + * Return: 1 if tasks have been assigned to @r, 0 otherwise 696 + */ 697 + int rdtgroup_tasks_assigned(struct rdtgroup *r) 698 + { 699 + struct task_struct *p, *t; 700 + int ret = 0; 701 + 702 + lockdep_assert_held(&rdtgroup_mutex); 703 + 704 + rcu_read_lock(); 705 + for_each_process_thread(p, t) { 706 + if (is_closid_match(t, r) || is_rmid_match(t, r)) { 707 + ret = 1; 708 + break; 709 + } 710 + } 711 + rcu_read_unlock(); 712 + 713 + return ret; 714 + } 715 + 716 + static int rdtgroup_task_write_permission(struct task_struct *task, 717 + struct kernfs_open_file *of) 718 + { 719 + const struct cred *tcred = get_task_cred(task); 720 + const struct cred *cred = current_cred(); 721 + int ret = 0; 722 + 723 + /* 724 + * Even if we're attaching all tasks in the thread group, we only 725 + * need to check permissions on one of them. 
726 + */ 727 + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 728 + !uid_eq(cred->euid, tcred->uid) && 729 + !uid_eq(cred->euid, tcred->suid)) { 730 + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); 731 + ret = -EPERM; 732 + } 733 + 734 + put_cred(tcred); 735 + return ret; 736 + } 737 + 738 + static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, 739 + struct kernfs_open_file *of) 740 + { 741 + struct task_struct *tsk; 742 + int ret; 743 + 744 + rcu_read_lock(); 745 + if (pid) { 746 + tsk = find_task_by_vpid(pid); 747 + if (!tsk) { 748 + rcu_read_unlock(); 749 + rdt_last_cmd_printf("No task %d\n", pid); 750 + return -ESRCH; 751 + } 752 + } else { 753 + tsk = current; 754 + } 755 + 756 + get_task_struct(tsk); 757 + rcu_read_unlock(); 758 + 759 + ret = rdtgroup_task_write_permission(tsk, of); 760 + if (!ret) 761 + ret = __rdtgroup_move_task(tsk, rdtgrp); 762 + 763 + put_task_struct(tsk); 764 + return ret; 765 + } 766 + 767 + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, 768 + char *buf, size_t nbytes, loff_t off) 769 + { 770 + struct rdtgroup *rdtgrp; 771 + char *pid_str; 772 + int ret = 0; 773 + pid_t pid; 774 + 775 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 776 + if (!rdtgrp) { 777 + rdtgroup_kn_unlock(of->kn); 778 + return -ENOENT; 779 + } 780 + rdt_last_cmd_clear(); 781 + 782 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 783 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 784 + ret = -EINVAL; 785 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 786 + goto unlock; 787 + } 788 + 789 + while (buf && buf[0] != '\0' && buf[0] != '\n') { 790 + pid_str = strim(strsep(&buf, ",")); 791 + 792 + if (kstrtoint(pid_str, 0, &pid)) { 793 + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); 794 + ret = -EINVAL; 795 + break; 796 + } 797 + 798 + if (pid < 0) { 799 + rdt_last_cmd_printf("Invalid pid %d\n", pid); 800 + ret = -EINVAL; 801 + break; 802 + } 803 + 804 + ret = rdtgroup_move_task(pid, rdtgrp, of); 805 + if 
(ret) { 806 + rdt_last_cmd_printf("Error while processing task %d\n", pid); 807 + break; 808 + } 809 + } 810 + 811 + unlock: 812 + rdtgroup_kn_unlock(of->kn); 813 + 814 + return ret ?: nbytes; 815 + } 816 + 817 + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) 818 + { 819 + struct task_struct *p, *t; 820 + pid_t pid; 821 + 822 + rcu_read_lock(); 823 + for_each_process_thread(p, t) { 824 + if (is_closid_match(t, r) || is_rmid_match(t, r)) { 825 + pid = task_pid_vnr(t); 826 + if (pid) 827 + seq_printf(s, "%d\n", pid); 828 + } 829 + } 830 + rcu_read_unlock(); 831 + } 832 + 833 + static int rdtgroup_tasks_show(struct kernfs_open_file *of, 834 + struct seq_file *s, void *v) 835 + { 836 + struct rdtgroup *rdtgrp; 837 + int ret = 0; 838 + 839 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 840 + if (rdtgrp) 841 + show_rdt_tasks(rdtgrp, s); 842 + else 843 + ret = -ENOENT; 844 + rdtgroup_kn_unlock(of->kn); 845 + 846 + return ret; 847 + } 848 + 849 + static int rdtgroup_closid_show(struct kernfs_open_file *of, 850 + struct seq_file *s, void *v) 851 + { 852 + struct rdtgroup *rdtgrp; 853 + int ret = 0; 854 + 855 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 856 + if (rdtgrp) 857 + seq_printf(s, "%u\n", rdtgrp->closid); 858 + else 859 + ret = -ENOENT; 860 + rdtgroup_kn_unlock(of->kn); 861 + 862 + return ret; 863 + } 864 + 865 + static int rdtgroup_rmid_show(struct kernfs_open_file *of, 866 + struct seq_file *s, void *v) 867 + { 868 + struct rdtgroup *rdtgrp; 869 + int ret = 0; 870 + 871 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 872 + if (rdtgrp) 873 + seq_printf(s, "%u\n", rdtgrp->mon.rmid); 874 + else 875 + ret = -ENOENT; 876 + rdtgroup_kn_unlock(of->kn); 877 + 878 + return ret; 879 + } 880 + 881 + #ifdef CONFIG_PROC_CPU_RESCTRL 882 + /* 883 + * A task can only be part of one resctrl control group and of one monitor 884 + * group which is associated to that control group. 885 + * 886 + * 1) res: 887 + * mon: 888 + * 889 + * resctrl is not available. 
890 + * 891 + * 2) res:/ 892 + * mon: 893 + * 894 + * Task is part of the root resctrl control group, and it is not associated 895 + * to any monitor group. 896 + * 897 + * 3) res:/ 898 + * mon:mon0 899 + * 900 + * Task is part of the root resctrl control group and monitor group mon0. 901 + * 902 + * 4) res:group0 903 + * mon: 904 + * 905 + * Task is part of resctrl control group group0, and it is not associated 906 + * to any monitor group. 907 + * 908 + * 5) res:group0 909 + * mon:mon1 910 + * 911 + * Task is part of resctrl control group group0 and monitor group mon1. 912 + */ 913 + int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, 914 + struct pid *pid, struct task_struct *tsk) 915 + { 916 + struct rdtgroup *rdtg; 917 + int ret = 0; 918 + 919 + mutex_lock(&rdtgroup_mutex); 920 + 921 + /* Return empty if resctrl has not been mounted. */ 922 + if (!resctrl_mounted) { 923 + seq_puts(s, "res:\nmon:\n"); 924 + goto unlock; 925 + } 926 + 927 + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { 928 + struct rdtgroup *crg; 929 + 930 + /* 931 + * Task information is only relevant for shareable 932 + * and exclusive groups. 933 + */ 934 + if (rdtg->mode != RDT_MODE_SHAREABLE && 935 + rdtg->mode != RDT_MODE_EXCLUSIVE) 936 + continue; 937 + 938 + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) 939 + continue; 940 + 941 + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", 942 + rdt_kn_name(rdtg->kn)); 943 + seq_puts(s, "mon:"); 944 + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, 945 + mon.crdtgrp_list) { 946 + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, 947 + crg->mon.rmid)) 948 + continue; 949 + seq_printf(s, "%s", rdt_kn_name(crg->kn)); 950 + break; 951 + } 952 + seq_putc(s, '\n'); 953 + goto unlock; 954 + } 955 + /* 956 + * The above search should succeed. Otherwise return 957 + * with an error. 
958 + */ 959 + ret = -ENOENT; 960 + unlock: 961 + mutex_unlock(&rdtgroup_mutex); 962 + 963 + return ret; 964 + } 965 + #endif 966 + 967 + static int rdt_last_cmd_status_show(struct kernfs_open_file *of, 968 + struct seq_file *seq, void *v) 969 + { 970 + int len; 971 + 972 + mutex_lock(&rdtgroup_mutex); 973 + len = seq_buf_used(&last_cmd_status); 974 + if (len) 975 + seq_printf(seq, "%.*s", len, last_cmd_status_buf); 976 + else 977 + seq_puts(seq, "ok\n"); 978 + mutex_unlock(&rdtgroup_mutex); 979 + return 0; 980 + } 981 + 982 + static void *rdt_kn_parent_priv(struct kernfs_node *kn) 983 + { 984 + /* 985 + * The parent pointer is only valid within RCU section since it can be 986 + * replaced. 987 + */ 988 + guard(rcu)(); 989 + return rcu_dereference(kn->__parent)->priv; 990 + } 991 + 992 + static int rdt_num_closids_show(struct kernfs_open_file *of, 993 + struct seq_file *seq, void *v) 994 + { 995 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 996 + 997 + seq_printf(seq, "%u\n", s->num_closid); 998 + return 0; 999 + } 1000 + 1001 + static int rdt_default_ctrl_show(struct kernfs_open_file *of, 1002 + struct seq_file *seq, void *v) 1003 + { 1004 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1005 + struct rdt_resource *r = s->res; 1006 + 1007 + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); 1008 + return 0; 1009 + } 1010 + 1011 + static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, 1012 + struct seq_file *seq, void *v) 1013 + { 1014 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1015 + struct rdt_resource *r = s->res; 1016 + 1017 + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); 1018 + return 0; 1019 + } 1020 + 1021 + static int rdt_shareable_bits_show(struct kernfs_open_file *of, 1022 + struct seq_file *seq, void *v) 1023 + { 1024 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1025 + struct rdt_resource *r = s->res; 1026 + 1027 + seq_printf(seq, "%x\n", r->cache.shareable_bits); 1028 + return 0; 1029 + } 1030 + 
1031 + /* 1032 + * rdt_bit_usage_show - Display current usage of resources 1033 + * 1034 + * A domain is a shared resource that can now be allocated differently. Here 1035 + * we display the current regions of the domain as an annotated bitmask. 1036 + * For each domain of this resource its allocation bitmask 1037 + * is annotated as below to indicate the current usage of the corresponding bit: 1038 + * 0 - currently unused 1039 + * X - currently available for sharing and used by software and hardware 1040 + * H - currently used by hardware only but available for software use 1041 + * S - currently used and shareable by software only 1042 + * E - currently used exclusively by one resource group 1043 + * P - currently pseudo-locked by one resource group 1044 + */ 1045 + static int rdt_bit_usage_show(struct kernfs_open_file *of, 1046 + struct seq_file *seq, void *v) 1047 + { 1048 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1049 + /* 1050 + * Use unsigned long even though only 32 bits are used to ensure 1051 + * test_bit() is used safely. 
1052 + */ 1053 + unsigned long sw_shareable = 0, hw_shareable = 0; 1054 + unsigned long exclusive = 0, pseudo_locked = 0; 1055 + struct rdt_resource *r = s->res; 1056 + struct rdt_ctrl_domain *dom; 1057 + int i, hwb, swb, excl, psl; 1058 + enum rdtgrp_mode mode; 1059 + bool sep = false; 1060 + u32 ctrl_val; 1061 + 1062 + cpus_read_lock(); 1063 + mutex_lock(&rdtgroup_mutex); 1064 + hw_shareable = r->cache.shareable_bits; 1065 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 1066 + if (sep) 1067 + seq_putc(seq, ';'); 1068 + sw_shareable = 0; 1069 + exclusive = 0; 1070 + seq_printf(seq, "%d=", dom->hdr.id); 1071 + for (i = 0; i < closids_supported(); i++) { 1072 + if (!closid_allocated(i)) 1073 + continue; 1074 + ctrl_val = resctrl_arch_get_config(r, dom, i, 1075 + s->conf_type); 1076 + mode = rdtgroup_mode_by_closid(i); 1077 + switch (mode) { 1078 + case RDT_MODE_SHAREABLE: 1079 + sw_shareable |= ctrl_val; 1080 + break; 1081 + case RDT_MODE_EXCLUSIVE: 1082 + exclusive |= ctrl_val; 1083 + break; 1084 + case RDT_MODE_PSEUDO_LOCKSETUP: 1085 + /* 1086 + * RDT_MODE_PSEUDO_LOCKSETUP is possible 1087 + * here but not included since the CBM 1088 + * associated with this CLOSID in this mode 1089 + * is not initialized and no task or cpu can be 1090 + * assigned this CLOSID. 1091 + */ 1092 + break; 1093 + case RDT_MODE_PSEUDO_LOCKED: 1094 + case RDT_NUM_MODES: 1095 + WARN(1, 1096 + "invalid mode for closid %d\n", i); 1097 + break; 1098 + } 1099 + } 1100 + for (i = r->cache.cbm_len - 1; i >= 0; i--) { 1101 + pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; 1102 + hwb = test_bit(i, &hw_shareable); 1103 + swb = test_bit(i, &sw_shareable); 1104 + excl = test_bit(i, &exclusive); 1105 + psl = test_bit(i, &pseudo_locked); 1106 + if (hwb && swb) 1107 + seq_putc(seq, 'X'); 1108 + else if (hwb && !swb) 1109 + seq_putc(seq, 'H'); 1110 + else if (!hwb && swb) 1111 + seq_putc(seq, 'S'); 1112 + else if (excl) 1113 + seq_putc(seq, 'E'); 1114 + else if (psl) 1115 + seq_putc(seq, 'P'); 1116 + else /* Unused bits remain */ 1117 + seq_putc(seq, '0'); 1118 + } 1119 + sep = true; 1120 + } 1121 + seq_putc(seq, '\n'); 1122 + mutex_unlock(&rdtgroup_mutex); 1123 + cpus_read_unlock(); 1124 + return 0; 1125 + } 1126 + 1127 + static int rdt_min_bw_show(struct kernfs_open_file *of, 1128 + struct seq_file *seq, void *v) 1129 + { 1130 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1131 + struct rdt_resource *r = s->res; 1132 + 1133 + seq_printf(seq, "%u\n", r->membw.min_bw); 1134 + return 0; 1135 + } 1136 + 1137 + static int rdt_num_rmids_show(struct kernfs_open_file *of, 1138 + struct seq_file *seq, void *v) 1139 + { 1140 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1141 + 1142 + seq_printf(seq, "%d\n", r->num_rmid); 1143 + 1144 + return 0; 1145 + } 1146 + 1147 + static int rdt_mon_features_show(struct kernfs_open_file *of, 1148 + struct seq_file *seq, void *v) 1149 + { 1150 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1151 + struct mon_evt *mevt; 1152 + 1153 + list_for_each_entry(mevt, &r->evt_list, list) { 1154 + seq_printf(seq, "%s\n", mevt->name); 1155 + if (mevt->configurable) 1156 + seq_printf(seq, "%s_config\n", mevt->name); 1157 + } 1158 + 1159 + return 0; 1160 + } 1161 + 1162 + static int rdt_bw_gran_show(struct kernfs_open_file *of, 1163 + struct seq_file *seq, void *v) 1164 + { 1165 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1166 + struct rdt_resource *r = s->res; 1167 + 1168 + seq_printf(seq, "%u\n", r->membw.bw_gran); 1169 + return 0; 1170 + } 1171 + 1172 + static int 
rdt_delay_linear_show(struct kernfs_open_file *of, 1173 + struct seq_file *seq, void *v) 1174 + { 1175 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1176 + struct rdt_resource *r = s->res; 1177 + 1178 + seq_printf(seq, "%u\n", r->membw.delay_linear); 1179 + return 0; 1180 + } 1181 + 1182 + static int max_threshold_occ_show(struct kernfs_open_file *of, 1183 + struct seq_file *seq, void *v) 1184 + { 1185 + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); 1186 + 1187 + return 0; 1188 + } 1189 + 1190 + static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, 1191 + struct seq_file *seq, void *v) 1192 + { 1193 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1194 + struct rdt_resource *r = s->res; 1195 + 1196 + switch (r->membw.throttle_mode) { 1197 + case THREAD_THROTTLE_PER_THREAD: 1198 + seq_puts(seq, "per-thread\n"); 1199 + return 0; 1200 + case THREAD_THROTTLE_MAX: 1201 + seq_puts(seq, "max\n"); 1202 + return 0; 1203 + case THREAD_THROTTLE_UNDEFINED: 1204 + seq_puts(seq, "undefined\n"); 1205 + return 0; 1206 + } 1207 + 1208 + WARN_ON_ONCE(1); 1209 + 1210 + return 0; 1211 + } 1212 + 1213 + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, 1214 + char *buf, size_t nbytes, loff_t off) 1215 + { 1216 + unsigned int bytes; 1217 + int ret; 1218 + 1219 + ret = kstrtouint(buf, 0, &bytes); 1220 + if (ret) 1221 + return ret; 1222 + 1223 + if (bytes > resctrl_rmid_realloc_limit) 1224 + return -EINVAL; 1225 + 1226 + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); 1227 + 1228 + return nbytes; 1229 + } 1230 + 1231 + /* 1232 + * rdtgroup_mode_show - Display mode of this resource group 1233 + */ 1234 + static int rdtgroup_mode_show(struct kernfs_open_file *of, 1235 + struct seq_file *s, void *v) 1236 + { 1237 + struct rdtgroup *rdtgrp; 1238 + 1239 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1240 + if (!rdtgrp) { 1241 + rdtgroup_kn_unlock(of->kn); 1242 + return -ENOENT; 1243 + } 1244 + 1245 + seq_printf(s, 
"%s\n", rdtgroup_mode_str(rdtgrp->mode)); 1246 + 1247 + rdtgroup_kn_unlock(of->kn); 1248 + return 0; 1249 + } 1250 + 1251 + static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) 1252 + { 1253 + switch (my_type) { 1254 + case CDP_CODE: 1255 + return CDP_DATA; 1256 + case CDP_DATA: 1257 + return CDP_CODE; 1258 + default: 1259 + case CDP_NONE: 1260 + return CDP_NONE; 1261 + } 1262 + } 1263 + 1264 + static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, 1265 + struct seq_file *seq, void *v) 1266 + { 1267 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1268 + struct rdt_resource *r = s->res; 1269 + 1270 + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); 1271 + 1272 + return 0; 1273 + } 1274 + 1275 + /** 1276 + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other 1277 + * @r: Resource to which domain instance @d belongs. 1278 + * @d: The domain instance for which @closid is being tested. 1279 + * @cbm: Capacity bitmask being tested. 1280 + * @closid: Intended closid for @cbm. 1281 + * @type: CDP type of @r. 1282 + * @exclusive: Only check if overlaps with exclusive resource groups 1283 + * 1284 + * Checks if provided @cbm intended to be used for @closid on domain 1285 + * @d overlaps with any other closids or other hardware usage associated 1286 + * with this domain. If @exclusive is true then only overlaps with 1287 + * resource groups in exclusive mode will be considered. If @exclusive 1288 + * is false then overlaps with any resource group or hardware entities 1289 + * will be considered. 1290 + * 1291 + * @cbm is unsigned long, even if only 32 bits are used, to make the 1292 + * bitmap functions work correctly. 1293 + * 1294 + * Return: false if CBM does not overlap, true if it does. 
1295 + */ 1296 + static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, 1297 + unsigned long cbm, int closid, 1298 + enum resctrl_conf_type type, bool exclusive) 1299 + { 1300 + enum rdtgrp_mode mode; 1301 + unsigned long ctrl_b; 1302 + int i; 1303 + 1304 + /* Check for any overlap with regions used by hardware directly */ 1305 + if (!exclusive) { 1306 + ctrl_b = r->cache.shareable_bits; 1307 + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) 1308 + return true; 1309 + } 1310 + 1311 + /* Check for overlap with other resource groups */ 1312 + for (i = 0; i < closids_supported(); i++) { 1313 + ctrl_b = resctrl_arch_get_config(r, d, i, type); 1314 + mode = rdtgroup_mode_by_closid(i); 1315 + if (closid_allocated(i) && i != closid && 1316 + mode != RDT_MODE_PSEUDO_LOCKSETUP) { 1317 + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { 1318 + if (exclusive) { 1319 + if (mode == RDT_MODE_EXCLUSIVE) 1320 + return true; 1321 + continue; 1322 + } 1323 + return true; 1324 + } 1325 + } 1326 + } 1327 + 1328 + return false; 1329 + } 1330 + 1331 + /** 1332 + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware 1333 + * @s: Schema for the resource to which domain instance @d belongs. 1334 + * @d: The domain instance for which @closid is being tested. 1335 + * @cbm: Capacity bitmask being tested. 1336 + * @closid: Intended closid for @cbm. 1337 + * @exclusive: Only check if overlaps with exclusive resource groups 1338 + * 1339 + * Resources that can be allocated using a CBM can use the CBM to control 1340 + * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test 1341 + * for overlap. Overlap test is not limited to the specific resource for 1342 + * which the CBM is intended though - when dealing with CDP resources that 1343 + * share the underlying hardware the overlap check should be performed on 1344 + * the CDP resource sharing the hardware also. 
1345 + * 1346 + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the 1347 + * overlap test. 1348 + * 1349 + * Return: true if CBM overlap detected, false if there is no overlap 1350 + */ 1351 + bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 1352 + unsigned long cbm, int closid, bool exclusive) 1353 + { 1354 + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 1355 + struct rdt_resource *r = s->res; 1356 + 1357 + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, 1358 + exclusive)) 1359 + return true; 1360 + 1361 + if (!resctrl_arch_get_cdp_enabled(r->rid)) 1362 + return false; 1363 + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); 1364 + } 1365 + 1366 + /** 1367 + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive 1368 + * @rdtgrp: Resource group identified through its closid. 1369 + * 1370 + * An exclusive resource group implies that there should be no sharing of 1371 + * its allocated resources. At the time this group is considered to be 1372 + * exclusive this test can determine if its current schemata supports this 1373 + * setting by testing for overlap with all other resource groups. 1374 + * 1375 + * Return: true if resource group can be exclusive, false if there is overlap 1376 + * with allocations of other resource groups and thus this resource group 1377 + * cannot be exclusive. 
1378 + */ 1379 + static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) 1380 + { 1381 + int closid = rdtgrp->closid; 1382 + struct rdt_ctrl_domain *d; 1383 + struct resctrl_schema *s; 1384 + struct rdt_resource *r; 1385 + bool has_cache = false; 1386 + u32 ctrl; 1387 + 1388 + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ 1389 + lockdep_assert_cpus_held(); 1390 + 1391 + list_for_each_entry(s, &resctrl_schema_all, list) { 1392 + r = s->res; 1393 + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) 1394 + continue; 1395 + has_cache = true; 1396 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1397 + ctrl = resctrl_arch_get_config(r, d, closid, 1398 + s->conf_type); 1399 + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { 1400 + rdt_last_cmd_puts("Schemata overlaps\n"); 1401 + return false; 1402 + } 1403 + } 1404 + } 1405 + 1406 + if (!has_cache) { 1407 + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); 1408 + return false; 1409 + } 1410 + 1411 + return true; 1412 + } 1413 + 1414 + /* 1415 + * rdtgroup_mode_write - Modify the resource group's mode 1416 + */ 1417 + static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, 1418 + char *buf, size_t nbytes, loff_t off) 1419 + { 1420 + struct rdtgroup *rdtgrp; 1421 + enum rdtgrp_mode mode; 1422 + int ret = 0; 1423 + 1424 + /* Valid input requires a trailing newline */ 1425 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1426 + return -EINVAL; 1427 + buf[nbytes - 1] = '\0'; 1428 + 1429 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1430 + if (!rdtgrp) { 1431 + rdtgroup_kn_unlock(of->kn); 1432 + return -ENOENT; 1433 + } 1434 + 1435 + rdt_last_cmd_clear(); 1436 + 1437 + mode = rdtgrp->mode; 1438 + 1439 + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || 1440 + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || 1441 + (!strcmp(buf, "pseudo-locksetup") && 1442 + mode == RDT_MODE_PSEUDO_LOCKSETUP) || 1443 + (!strcmp(buf, "pseudo-locked") && mode == 
RDT_MODE_PSEUDO_LOCKED)) 1444 + goto out; 1445 + 1446 + if (mode == RDT_MODE_PSEUDO_LOCKED) { 1447 + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); 1448 + ret = -EINVAL; 1449 + goto out; 1450 + } 1451 + 1452 + if (!strcmp(buf, "shareable")) { 1453 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1454 + ret = rdtgroup_locksetup_exit(rdtgrp); 1455 + if (ret) 1456 + goto out; 1457 + } 1458 + rdtgrp->mode = RDT_MODE_SHAREABLE; 1459 + } else if (!strcmp(buf, "exclusive")) { 1460 + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { 1461 + ret = -EINVAL; 1462 + goto out; 1463 + } 1464 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1465 + ret = rdtgroup_locksetup_exit(rdtgrp); 1466 + if (ret) 1467 + goto out; 1468 + } 1469 + rdtgrp->mode = RDT_MODE_EXCLUSIVE; 1470 + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && 1471 + !strcmp(buf, "pseudo-locksetup")) { 1472 + ret = rdtgroup_locksetup_enter(rdtgrp); 1473 + if (ret) 1474 + goto out; 1475 + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; 1476 + } else { 1477 + rdt_last_cmd_puts("Unknown or unsupported mode\n"); 1478 + ret = -EINVAL; 1479 + } 1480 + 1481 + out: 1482 + rdtgroup_kn_unlock(of->kn); 1483 + return ret ?: nbytes; 1484 + } 1485 + 1486 + /** 1487 + * rdtgroup_cbm_to_size - Translate CBM to size in bytes 1488 + * @r: RDT resource to which @d belongs. 1489 + * @d: RDT domain instance. 1490 + * @cbm: bitmask for which the size should be computed. 1491 + * 1492 + * The bitmask provided associated with the RDT domain instance @d will be 1493 + * translated into how many bytes it represents. The size in bytes is 1494 + * computed by first dividing the total cache size by the CBM length to 1495 + * determine how many bytes each bit in the bitmask represents. The result 1496 + * is multiplied with the number of bits set in the bitmask. 1497 + * 1498 + * @cbm is unsigned long, even if only 32 bits are used to make the 1499 + * bitmap functions work correctly. 
1500 + */ 1501 + unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, 1502 + struct rdt_ctrl_domain *d, unsigned long cbm) 1503 + { 1504 + unsigned int size = 0; 1505 + struct cacheinfo *ci; 1506 + int num_b; 1507 + 1508 + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) 1509 + return size; 1510 + 1511 + num_b = bitmap_weight(&cbm, r->cache.cbm_len); 1512 + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); 1513 + if (ci) 1514 + size = ci->size / r->cache.cbm_len * num_b; 1515 + 1516 + return size; 1517 + } 1518 + 1519 + bool is_mba_sc(struct rdt_resource *r) 1520 + { 1521 + if (!r) 1522 + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 1523 + 1524 + /* 1525 + * The software controller support is only applicable to MBA resource. 1526 + * Make sure to check for resource type. 1527 + */ 1528 + if (r->rid != RDT_RESOURCE_MBA) 1529 + return false; 1530 + 1531 + return r->membw.mba_sc; 1532 + } 1533 + 1534 + /* 1535 + * rdtgroup_size_show - Display size in bytes of allocated regions 1536 + * 1537 + * The "size" file mirrors the layout of the "schemata" file, printing the 1538 + * size in bytes of each region instead of the capacity bitmask. 
1539 + */ 1540 + static int rdtgroup_size_show(struct kernfs_open_file *of, 1541 + struct seq_file *s, void *v) 1542 + { 1543 + struct resctrl_schema *schema; 1544 + enum resctrl_conf_type type; 1545 + struct rdt_ctrl_domain *d; 1546 + struct rdtgroup *rdtgrp; 1547 + struct rdt_resource *r; 1548 + unsigned int size; 1549 + int ret = 0; 1550 + u32 closid; 1551 + bool sep; 1552 + u32 ctrl; 1553 + 1554 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1555 + if (!rdtgrp) { 1556 + rdtgroup_kn_unlock(of->kn); 1557 + return -ENOENT; 1558 + } 1559 + 1560 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1561 + if (!rdtgrp->plr->d) { 1562 + rdt_last_cmd_clear(); 1563 + rdt_last_cmd_puts("Cache domain offline\n"); 1564 + ret = -ENODEV; 1565 + } else { 1566 + seq_printf(s, "%*s:", max_name_width, 1567 + rdtgrp->plr->s->name); 1568 + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, 1569 + rdtgrp->plr->d, 1570 + rdtgrp->plr->cbm); 1571 + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); 1572 + } 1573 + goto out; 1574 + } 1575 + 1576 + closid = rdtgrp->closid; 1577 + 1578 + list_for_each_entry(schema, &resctrl_schema_all, list) { 1579 + r = schema->res; 1580 + type = schema->conf_type; 1581 + sep = false; 1582 + seq_printf(s, "%*s:", max_name_width, schema->name); 1583 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1584 + if (sep) 1585 + seq_putc(s, ';'); 1586 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1587 + size = 0; 1588 + } else { 1589 + if (is_mba_sc(r)) 1590 + ctrl = d->mbps_val[closid]; 1591 + else 1592 + ctrl = resctrl_arch_get_config(r, d, 1593 + closid, 1594 + type); 1595 + if (r->rid == RDT_RESOURCE_MBA || 1596 + r->rid == RDT_RESOURCE_SMBA) 1597 + size = ctrl; 1598 + else 1599 + size = rdtgroup_cbm_to_size(r, d, ctrl); 1600 + } 1601 + seq_printf(s, "%d=%u", d->hdr.id, size); 1602 + sep = true; 1603 + } 1604 + seq_putc(s, '\n'); 1605 + } 1606 + 1607 + out: 1608 + rdtgroup_kn_unlock(of->kn); 1609 + 1610 + return ret; 1611 + } 1612 + 1613 + static void 
mondata_config_read(struct resctrl_mon_config_info *mon_info) 1614 + { 1615 + smp_call_function_any(&mon_info->d->hdr.cpu_mask, 1616 + resctrl_arch_mon_event_config_read, mon_info, 1); 1617 + } 1618 + 1619 + static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) 1620 + { 1621 + struct resctrl_mon_config_info mon_info; 1622 + struct rdt_mon_domain *dom; 1623 + bool sep = false; 1624 + 1625 + cpus_read_lock(); 1626 + mutex_lock(&rdtgroup_mutex); 1627 + 1628 + list_for_each_entry(dom, &r->mon_domains, hdr.list) { 1629 + if (sep) 1630 + seq_puts(s, ";"); 1631 + 1632 + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); 1633 + mon_info.r = r; 1634 + mon_info.d = dom; 1635 + mon_info.evtid = evtid; 1636 + mondata_config_read(&mon_info); 1637 + 1638 + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); 1639 + sep = true; 1640 + } 1641 + seq_puts(s, "\n"); 1642 + 1643 + mutex_unlock(&rdtgroup_mutex); 1644 + cpus_read_unlock(); 1645 + 1646 + return 0; 1647 + } 1648 + 1649 + static int mbm_total_bytes_config_show(struct kernfs_open_file *of, 1650 + struct seq_file *seq, void *v) 1651 + { 1652 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1653 + 1654 + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); 1655 + 1656 + return 0; 1657 + } 1658 + 1659 + static int mbm_local_bytes_config_show(struct kernfs_open_file *of, 1660 + struct seq_file *seq, void *v) 1661 + { 1662 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1663 + 1664 + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); 1665 + 1666 + return 0; 1667 + } 1668 + 1669 + static void mbm_config_write_domain(struct rdt_resource *r, 1670 + struct rdt_mon_domain *d, u32 evtid, u32 val) 1671 + { 1672 + struct resctrl_mon_config_info mon_info = {0}; 1673 + 1674 + /* 1675 + * Read the current config value first. If both are the same then 1676 + * no need to write it again. 
1677 + */ 1678 + mon_info.r = r; 1679 + mon_info.d = d; 1680 + mon_info.evtid = evtid; 1681 + mondata_config_read(&mon_info); 1682 + if (mon_info.mon_config == val) 1683 + return; 1684 + 1685 + mon_info.mon_config = val; 1686 + 1687 + /* 1688 + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the 1689 + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE 1690 + * are scoped at the domain level. Writing any of these MSRs 1691 + * on one CPU is observed by all the CPUs in the domain. 1692 + */ 1693 + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, 1694 + &mon_info, 1); 1695 + 1696 + /* 1697 + * When an Event Configuration is changed, the bandwidth counters 1698 + * for all RMIDs and Events will be cleared by the hardware. The 1699 + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for 1700 + * every RMID on the next read to any event for every RMID. 1701 + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) 1702 + * cleared while it is tracked by the hardware. Clear the 1703 + * mbm_local and mbm_total counts for all the RMIDs. 
1704 + */ 1705 + resctrl_arch_reset_rmid_all(r, d); 1706 + } 1707 + 1708 + static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) 1709 + { 1710 + char *dom_str = NULL, *id_str; 1711 + unsigned long dom_id, val; 1712 + struct rdt_mon_domain *d; 1713 + 1714 + /* Walking r->domains, ensure it can't race with cpuhp */ 1715 + lockdep_assert_cpus_held(); 1716 + 1717 + next: 1718 + if (!tok || tok[0] == '\0') 1719 + return 0; 1720 + 1721 + /* Start processing the strings for each domain */ 1722 + dom_str = strim(strsep(&tok, ";")); 1723 + id_str = strsep(&dom_str, "="); 1724 + 1725 + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { 1726 + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); 1727 + return -EINVAL; 1728 + } 1729 + 1730 + if (!dom_str || kstrtoul(dom_str, 16, &val)) { 1731 + rdt_last_cmd_puts("Non-numeric event configuration value\n"); 1732 + return -EINVAL; 1733 + } 1734 + 1735 + /* Value from user cannot be more than the supported set of events */ 1736 + if ((val & r->mbm_cfg_mask) != val) { 1737 + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", 1738 + r->mbm_cfg_mask); 1739 + return -EINVAL; 1740 + } 1741 + 1742 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 1743 + if (d->hdr.id == dom_id) { 1744 + mbm_config_write_domain(r, d, evtid, val); 1745 + goto next; 1746 + } 1747 + } 1748 + 1749 + return -EINVAL; 1750 + } 1751 + 1752 + static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, 1753 + char *buf, size_t nbytes, 1754 + loff_t off) 1755 + { 1756 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1757 + int ret; 1758 + 1759 + /* Valid input requires a trailing newline */ 1760 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1761 + return -EINVAL; 1762 + 1763 + cpus_read_lock(); 1764 + mutex_lock(&rdtgroup_mutex); 1765 + 1766 + rdt_last_cmd_clear(); 1767 + 1768 + buf[nbytes - 1] = '\0'; 1769 + 1770 + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); 1771 + 1772 + 
mutex_unlock(&rdtgroup_mutex); 1773 + cpus_read_unlock(); 1774 + 1775 + return ret ?: nbytes; 1776 + } 1777 + 1778 + static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, 1779 + char *buf, size_t nbytes, 1780 + loff_t off) 1781 + { 1782 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1783 + int ret; 1784 + 1785 + /* Valid input requires a trailing newline */ 1786 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1787 + return -EINVAL; 1788 + 1789 + cpus_read_lock(); 1790 + mutex_lock(&rdtgroup_mutex); 1791 + 1792 + rdt_last_cmd_clear(); 1793 + 1794 + buf[nbytes - 1] = '\0'; 1795 + 1796 + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); 1797 + 1798 + mutex_unlock(&rdtgroup_mutex); 1799 + cpus_read_unlock(); 1800 + 1801 + return ret ?: nbytes; 1802 + } 1803 + 1804 + /* rdtgroup information files for one cache resource. */ 1805 + static struct rftype res_common_files[] = { 1806 + { 1807 + .name = "last_cmd_status", 1808 + .mode = 0444, 1809 + .kf_ops = &rdtgroup_kf_single_ops, 1810 + .seq_show = rdt_last_cmd_status_show, 1811 + .fflags = RFTYPE_TOP_INFO, 1812 + }, 1813 + { 1814 + .name = "num_closids", 1815 + .mode = 0444, 1816 + .kf_ops = &rdtgroup_kf_single_ops, 1817 + .seq_show = rdt_num_closids_show, 1818 + .fflags = RFTYPE_CTRL_INFO, 1819 + }, 1820 + { 1821 + .name = "mon_features", 1822 + .mode = 0444, 1823 + .kf_ops = &rdtgroup_kf_single_ops, 1824 + .seq_show = rdt_mon_features_show, 1825 + .fflags = RFTYPE_MON_INFO, 1826 + }, 1827 + { 1828 + .name = "num_rmids", 1829 + .mode = 0444, 1830 + .kf_ops = &rdtgroup_kf_single_ops, 1831 + .seq_show = rdt_num_rmids_show, 1832 + .fflags = RFTYPE_MON_INFO, 1833 + }, 1834 + { 1835 + .name = "cbm_mask", 1836 + .mode = 0444, 1837 + .kf_ops = &rdtgroup_kf_single_ops, 1838 + .seq_show = rdt_default_ctrl_show, 1839 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1840 + }, 1841 + { 1842 + .name = "min_cbm_bits", 1843 + .mode = 0444, 1844 + .kf_ops = &rdtgroup_kf_single_ops, 1845 + .seq_show = 
rdt_min_cbm_bits_show, 1846 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1847 + }, 1848 + { 1849 + .name = "shareable_bits", 1850 + .mode = 0444, 1851 + .kf_ops = &rdtgroup_kf_single_ops, 1852 + .seq_show = rdt_shareable_bits_show, 1853 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1854 + }, 1855 + { 1856 + .name = "bit_usage", 1857 + .mode = 0444, 1858 + .kf_ops = &rdtgroup_kf_single_ops, 1859 + .seq_show = rdt_bit_usage_show, 1860 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1861 + }, 1862 + { 1863 + .name = "min_bandwidth", 1864 + .mode = 0444, 1865 + .kf_ops = &rdtgroup_kf_single_ops, 1866 + .seq_show = rdt_min_bw_show, 1867 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1868 + }, 1869 + { 1870 + .name = "bandwidth_gran", 1871 + .mode = 0444, 1872 + .kf_ops = &rdtgroup_kf_single_ops, 1873 + .seq_show = rdt_bw_gran_show, 1874 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1875 + }, 1876 + { 1877 + .name = "delay_linear", 1878 + .mode = 0444, 1879 + .kf_ops = &rdtgroup_kf_single_ops, 1880 + .seq_show = rdt_delay_linear_show, 1881 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1882 + }, 1883 + /* 1884 + * Platform specific which (if any) capabilities are provided by 1885 + * thread_throttle_mode. Defer "fflags" initialization to platform 1886 + * discovery. 
1887 + */ 1888 + { 1889 + .name = "thread_throttle_mode", 1890 + .mode = 0444, 1891 + .kf_ops = &rdtgroup_kf_single_ops, 1892 + .seq_show = rdt_thread_throttle_mode_show, 1893 + }, 1894 + { 1895 + .name = "max_threshold_occupancy", 1896 + .mode = 0644, 1897 + .kf_ops = &rdtgroup_kf_single_ops, 1898 + .write = max_threshold_occ_write, 1899 + .seq_show = max_threshold_occ_show, 1900 + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, 1901 + }, 1902 + { 1903 + .name = "mbm_total_bytes_config", 1904 + .mode = 0644, 1905 + .kf_ops = &rdtgroup_kf_single_ops, 1906 + .seq_show = mbm_total_bytes_config_show, 1907 + .write = mbm_total_bytes_config_write, 1908 + }, 1909 + { 1910 + .name = "mbm_local_bytes_config", 1911 + .mode = 0644, 1912 + .kf_ops = &rdtgroup_kf_single_ops, 1913 + .seq_show = mbm_local_bytes_config_show, 1914 + .write = mbm_local_bytes_config_write, 1915 + }, 1916 + { 1917 + .name = "cpus", 1918 + .mode = 0644, 1919 + .kf_ops = &rdtgroup_kf_single_ops, 1920 + .write = rdtgroup_cpus_write, 1921 + .seq_show = rdtgroup_cpus_show, 1922 + .fflags = RFTYPE_BASE, 1923 + }, 1924 + { 1925 + .name = "cpus_list", 1926 + .mode = 0644, 1927 + .kf_ops = &rdtgroup_kf_single_ops, 1928 + .write = rdtgroup_cpus_write, 1929 + .seq_show = rdtgroup_cpus_show, 1930 + .flags = RFTYPE_FLAGS_CPUS_LIST, 1931 + .fflags = RFTYPE_BASE, 1932 + }, 1933 + { 1934 + .name = "tasks", 1935 + .mode = 0644, 1936 + .kf_ops = &rdtgroup_kf_single_ops, 1937 + .write = rdtgroup_tasks_write, 1938 + .seq_show = rdtgroup_tasks_show, 1939 + .fflags = RFTYPE_BASE, 1940 + }, 1941 + { 1942 + .name = "mon_hw_id", 1943 + .mode = 0444, 1944 + .kf_ops = &rdtgroup_kf_single_ops, 1945 + .seq_show = rdtgroup_rmid_show, 1946 + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, 1947 + }, 1948 + { 1949 + .name = "schemata", 1950 + .mode = 0644, 1951 + .kf_ops = &rdtgroup_kf_single_ops, 1952 + .write = rdtgroup_schemata_write, 1953 + .seq_show = rdtgroup_schemata_show, 1954 + .fflags = RFTYPE_CTRL_BASE, 1955 + }, 1956 + { 1957 + 
.name = "mba_MBps_event", 1958 + .mode = 0644, 1959 + .kf_ops = &rdtgroup_kf_single_ops, 1960 + .write = rdtgroup_mba_mbps_event_write, 1961 + .seq_show = rdtgroup_mba_mbps_event_show, 1962 + }, 1963 + { 1964 + .name = "mode", 1965 + .mode = 0644, 1966 + .kf_ops = &rdtgroup_kf_single_ops, 1967 + .write = rdtgroup_mode_write, 1968 + .seq_show = rdtgroup_mode_show, 1969 + .fflags = RFTYPE_CTRL_BASE, 1970 + }, 1971 + { 1972 + .name = "size", 1973 + .mode = 0444, 1974 + .kf_ops = &rdtgroup_kf_single_ops, 1975 + .seq_show = rdtgroup_size_show, 1976 + .fflags = RFTYPE_CTRL_BASE, 1977 + }, 1978 + { 1979 + .name = "sparse_masks", 1980 + .mode = 0444, 1981 + .kf_ops = &rdtgroup_kf_single_ops, 1982 + .seq_show = rdt_has_sparse_bitmasks_show, 1983 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1984 + }, 1985 + { 1986 + .name = "ctrl_hw_id", 1987 + .mode = 0444, 1988 + .kf_ops = &rdtgroup_kf_single_ops, 1989 + .seq_show = rdtgroup_closid_show, 1990 + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, 1991 + }, 1992 + }; 1993 + 1994 + static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) 1995 + { 1996 + struct rftype *rfts, *rft; 1997 + int ret, len; 1998 + 1999 + rfts = res_common_files; 2000 + len = ARRAY_SIZE(res_common_files); 2001 + 2002 + lockdep_assert_held(&rdtgroup_mutex); 2003 + 2004 + if (resctrl_debug) 2005 + fflags |= RFTYPE_DEBUG; 2006 + 2007 + for (rft = rfts; rft < rfts + len; rft++) { 2008 + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { 2009 + ret = rdtgroup_add_file(kn, rft); 2010 + if (ret) 2011 + goto error; 2012 + } 2013 + } 2014 + 2015 + return 0; 2016 + error: 2017 + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); 2018 + while (--rft >= rfts) { 2019 + if ((fflags & rft->fflags) == rft->fflags) 2020 + kernfs_remove_by_name(kn, rft->name); 2021 + } 2022 + return ret; 2023 + } 2024 + 2025 + static struct rftype *rdtgroup_get_rftype_by_name(const char *name) 2026 + { 2027 + struct rftype *rfts, *rft; 2028 + int len; 2029 
+ 2030 + rfts = res_common_files; 2031 + len = ARRAY_SIZE(res_common_files); 2032 + 2033 + for (rft = rfts; rft < rfts + len; rft++) { 2034 + if (!strcmp(rft->name, name)) 2035 + return rft; 2036 + } 2037 + 2038 + return NULL; 2039 + } 2040 + 2041 + static void thread_throttle_mode_init(void) 2042 + { 2043 + enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; 2044 + struct rdt_resource *r_mba, *r_smba; 2045 + 2046 + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2047 + if (r_mba->alloc_capable && 2048 + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 2049 + throttle_mode = r_mba->membw.throttle_mode; 2050 + 2051 + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); 2052 + if (r_smba->alloc_capable && 2053 + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 2054 + throttle_mode = r_smba->membw.throttle_mode; 2055 + 2056 + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) 2057 + return; 2058 + 2059 + resctrl_file_fflags_init("thread_throttle_mode", 2060 + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); 2061 + } 2062 + 2063 + void resctrl_file_fflags_init(const char *config, unsigned long fflags) 2064 + { 2065 + struct rftype *rft; 2066 + 2067 + rft = rdtgroup_get_rftype_by_name(config); 2068 + if (rft) 2069 + rft->fflags = fflags; 2070 + } 2071 + 2072 + /** 2073 + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file 2074 + * @r: The resource group with which the file is associated. 2075 + * @name: Name of the file 2076 + * 2077 + * The permissions of named resctrl file, directory, or link are modified 2078 + * to not allow read, write, or execute by any user. 2079 + * 2080 + * WARNING: This function is intended to communicate to the user that the 2081 + * resctrl file has been locked down - that it is not relevant to the 2082 + * particular state the system finds itself in. 
It should not be relied 2083 + * on to protect from user access because after the file's permissions 2084 + * are restricted the user can still change the permissions using chmod 2085 + * from the command line. 2086 + * 2087 + * Return: 0 on success, <0 on failure. 2088 + */ 2089 + int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) 2090 + { 2091 + struct iattr iattr = {.ia_valid = ATTR_MODE,}; 2092 + struct kernfs_node *kn; 2093 + int ret = 0; 2094 + 2095 + kn = kernfs_find_and_get_ns(r->kn, name, NULL); 2096 + if (!kn) 2097 + return -ENOENT; 2098 + 2099 + switch (kernfs_type(kn)) { 2100 + case KERNFS_DIR: 2101 + iattr.ia_mode = S_IFDIR; 2102 + break; 2103 + case KERNFS_FILE: 2104 + iattr.ia_mode = S_IFREG; 2105 + break; 2106 + case KERNFS_LINK: 2107 + iattr.ia_mode = S_IFLNK; 2108 + break; 2109 + } 2110 + 2111 + ret = kernfs_setattr(kn, &iattr); 2112 + kernfs_put(kn); 2113 + return ret; 2114 + } 2115 + 2116 + /** 2117 + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file 2118 + * @r: The resource group with which the file is associated. 2119 + * @name: Name of the file 2120 + * @mask: Mask of permissions that should be restored 2121 + * 2122 + * Restore the permissions of the named file. If @name is a directory the 2123 + * permissions of its parent will be used. 2124 + * 2125 + * Return: 0 on success, <0 on failure. 
2126 + */ 2127 + int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 2128 + umode_t mask) 2129 + { 2130 + struct iattr iattr = {.ia_valid = ATTR_MODE,}; 2131 + struct kernfs_node *kn, *parent; 2132 + struct rftype *rfts, *rft; 2133 + int ret, len; 2134 + 2135 + rfts = res_common_files; 2136 + len = ARRAY_SIZE(res_common_files); 2137 + 2138 + for (rft = rfts; rft < rfts + len; rft++) { 2139 + if (!strcmp(rft->name, name)) 2140 + iattr.ia_mode = rft->mode & mask; 2141 + } 2142 + 2143 + kn = kernfs_find_and_get_ns(r->kn, name, NULL); 2144 + if (!kn) 2145 + return -ENOENT; 2146 + 2147 + switch (kernfs_type(kn)) { 2148 + case KERNFS_DIR: 2149 + parent = kernfs_get_parent(kn); 2150 + if (parent) { 2151 + iattr.ia_mode |= parent->mode; 2152 + kernfs_put(parent); 2153 + } 2154 + iattr.ia_mode |= S_IFDIR; 2155 + break; 2156 + case KERNFS_FILE: 2157 + iattr.ia_mode |= S_IFREG; 2158 + break; 2159 + case KERNFS_LINK: 2160 + iattr.ia_mode |= S_IFLNK; 2161 + break; 2162 + } 2163 + 2164 + ret = kernfs_setattr(kn, &iattr); 2165 + kernfs_put(kn); 2166 + return ret; 2167 + } 2168 + 2169 + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, 2170 + unsigned long fflags) 2171 + { 2172 + struct kernfs_node *kn_subdir; 2173 + int ret; 2174 + 2175 + kn_subdir = kernfs_create_dir(kn_info, name, 2176 + kn_info->mode, priv); 2177 + if (IS_ERR(kn_subdir)) 2178 + return PTR_ERR(kn_subdir); 2179 + 2180 + ret = rdtgroup_kn_set_ugid(kn_subdir); 2181 + if (ret) 2182 + return ret; 2183 + 2184 + ret = rdtgroup_add_files(kn_subdir, fflags); 2185 + if (!ret) 2186 + kernfs_activate(kn_subdir); 2187 + 2188 + return ret; 2189 + } 2190 + 2191 + static unsigned long fflags_from_resource(struct rdt_resource *r) 2192 + { 2193 + switch (r->rid) { 2194 + case RDT_RESOURCE_L3: 2195 + case RDT_RESOURCE_L2: 2196 + return RFTYPE_RES_CACHE; 2197 + case RDT_RESOURCE_MBA: 2198 + case RDT_RESOURCE_SMBA: 2199 + return RFTYPE_RES_MB; 2200 + } 2201 + 2202 + return WARN_ON_ONCE(1); 2203 + } 2204 + 
2205 + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 2206 + { 2207 + struct resctrl_schema *s; 2208 + struct rdt_resource *r; 2209 + unsigned long fflags; 2210 + char name[32]; 2211 + int ret; 2212 + 2213 + /* create the directory */ 2214 + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 2215 + if (IS_ERR(kn_info)) 2216 + return PTR_ERR(kn_info); 2217 + 2218 + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); 2219 + if (ret) 2220 + goto out_destroy; 2221 + 2222 + /* loop over enabled controls, these are all alloc_capable */ 2223 + list_for_each_entry(s, &resctrl_schema_all, list) { 2224 + r = s->res; 2225 + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; 2226 + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); 2227 + if (ret) 2228 + goto out_destroy; 2229 + } 2230 + 2231 + for_each_mon_capable_rdt_resource(r) { 2232 + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; 2233 + sprintf(name, "%s_MON", r->name); 2234 + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); 2235 + if (ret) 2236 + goto out_destroy; 2237 + } 2238 + 2239 + ret = rdtgroup_kn_set_ugid(kn_info); 2240 + if (ret) 2241 + goto out_destroy; 2242 + 2243 + kernfs_activate(kn_info); 2244 + 2245 + return 0; 2246 + 2247 + out_destroy: 2248 + kernfs_remove(kn_info); 2249 + return ret; 2250 + } 2251 + 2252 + static int 2253 + mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, 2254 + char *name, struct kernfs_node **dest_kn) 2255 + { 2256 + struct kernfs_node *kn; 2257 + int ret; 2258 + 2259 + /* create the directory */ 2260 + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 2261 + if (IS_ERR(kn)) 2262 + return PTR_ERR(kn); 2263 + 2264 + if (dest_kn) 2265 + *dest_kn = kn; 2266 + 2267 + ret = rdtgroup_kn_set_ugid(kn); 2268 + if (ret) 2269 + goto out_destroy; 2270 + 2271 + kernfs_activate(kn); 2272 + 2273 + return 0; 2274 + 2275 + out_destroy: 2276 + kernfs_remove(kn); 2277 + return ret; 2278 + } 2279 + 2280 + static 
inline bool is_mba_linear(void) 2281 + { 2282 + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; 2283 + } 2284 + 2285 + static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) 2286 + { 2287 + u32 num_closid = resctrl_arch_get_num_closid(r); 2288 + int cpu = cpumask_any(&d->hdr.cpu_mask); 2289 + int i; 2290 + 2291 + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), 2292 + GFP_KERNEL, cpu_to_node(cpu)); 2293 + if (!d->mbps_val) 2294 + return -ENOMEM; 2295 + 2296 + for (i = 0; i < num_closid; i++) 2297 + d->mbps_val[i] = MBA_MAX_MBPS; 2298 + 2299 + return 0; 2300 + } 2301 + 2302 + static void mba_sc_domain_destroy(struct rdt_resource *r, 2303 + struct rdt_ctrl_domain *d) 2304 + { 2305 + kfree(d->mbps_val); 2306 + d->mbps_val = NULL; 2307 + } 2308 + 2309 + /* 2310 + * MBA software controller is supported only if 2311 + * MBM is supported and MBA is in linear scale, 2312 + * and the MBM monitor scope is the same as MBA 2313 + * control scope. 2314 + */ 2315 + static bool supports_mba_mbps(void) 2316 + { 2317 + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); 2318 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2319 + 2320 + return (resctrl_is_mbm_enabled() && 2321 + r->alloc_capable && is_mba_linear() && 2322 + r->ctrl_scope == rmbm->mon_scope); 2323 + } 2324 + 2325 + /* 2326 + * Enable or disable the MBA software controller 2327 + * which helps user specify bandwidth in MBps. 
2328 + */ 2329 + static int set_mba_sc(bool mba_sc) 2330 + { 2331 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2332 + u32 num_closid = resctrl_arch_get_num_closid(r); 2333 + struct rdt_ctrl_domain *d; 2334 + unsigned long fflags; 2335 + int i; 2336 + 2337 + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) 2338 + return -EINVAL; 2339 + 2340 + r->membw.mba_sc = mba_sc; 2341 + 2342 + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; 2343 + 2344 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 2345 + for (i = 0; i < num_closid; i++) 2346 + d->mbps_val[i] = MBA_MAX_MBPS; 2347 + } 2348 + 2349 + fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; 2350 + resctrl_file_fflags_init("mba_MBps_event", fflags); 2351 + 2352 + return 0; 2353 + } 2354 + 2355 + /* 2356 + * We don't allow rdtgroup directories to be created anywhere 2357 + * except the root directory. Thus when looking for the rdtgroup 2358 + * structure for a kernfs node we are either looking at a directory, 2359 + * in which case the rdtgroup structure is pointed at by the "priv" 2360 + * field, otherwise we have a file, and need only look to the parent 2361 + * to find the rdtgroup. 2362 + */ 2363 + static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) 2364 + { 2365 + if (kernfs_type(kn) == KERNFS_DIR) { 2366 + /* 2367 + * All the resource directories use "kn->priv" 2368 + * to point to the "struct rdtgroup" for the 2369 + * resource. "info" and its subdirectories don't 2370 + * have rdtgroup structures, so return NULL here. 
2371 + */ 2372 + if (kn == kn_info || 2373 + rcu_access_pointer(kn->__parent) == kn_info) 2374 + return NULL; 2375 + else 2376 + return kn->priv; 2377 + } else { 2378 + return rdt_kn_parent_priv(kn); 2379 + } 2380 + } 2381 + 2382 + static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 2383 + { 2384 + atomic_inc(&rdtgrp->waitcount); 2385 + kernfs_break_active_protection(kn); 2386 + } 2387 + 2388 + static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 2389 + { 2390 + if (atomic_dec_and_test(&rdtgrp->waitcount) && 2391 + (rdtgrp->flags & RDT_DELETED)) { 2392 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 2393 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 2394 + rdtgroup_pseudo_lock_remove(rdtgrp); 2395 + kernfs_unbreak_active_protection(kn); 2396 + rdtgroup_remove(rdtgrp); 2397 + } else { 2398 + kernfs_unbreak_active_protection(kn); 2399 + } 2400 + } 2401 + 2402 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) 2403 + { 2404 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 2405 + 2406 + if (!rdtgrp) 2407 + return NULL; 2408 + 2409 + rdtgroup_kn_get(rdtgrp, kn); 2410 + 2411 + cpus_read_lock(); 2412 + mutex_lock(&rdtgroup_mutex); 2413 + 2414 + /* Was this group deleted while we waited? 
*/ 2415 + if (rdtgrp->flags & RDT_DELETED) 2416 + return NULL; 2417 + 2418 + return rdtgrp; 2419 + } 2420 + 2421 + void rdtgroup_kn_unlock(struct kernfs_node *kn) 2422 + { 2423 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 2424 + 2425 + if (!rdtgrp) 2426 + return; 2427 + 2428 + mutex_unlock(&rdtgroup_mutex); 2429 + cpus_read_unlock(); 2430 + 2431 + rdtgroup_kn_put(rdtgrp, kn); 2432 + } 2433 + 2434 + static int mkdir_mondata_all(struct kernfs_node *parent_kn, 2435 + struct rdtgroup *prgrp, 2436 + struct kernfs_node **mon_data_kn); 2437 + 2438 + static void rdt_disable_ctx(void) 2439 + { 2440 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 2441 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 2442 + set_mba_sc(false); 2443 + 2444 + resctrl_debug = false; 2445 + } 2446 + 2447 + static int rdt_enable_ctx(struct rdt_fs_context *ctx) 2448 + { 2449 + int ret = 0; 2450 + 2451 + if (ctx->enable_cdpl2) { 2452 + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); 2453 + if (ret) 2454 + goto out_done; 2455 + } 2456 + 2457 + if (ctx->enable_cdpl3) { 2458 + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); 2459 + if (ret) 2460 + goto out_cdpl2; 2461 + } 2462 + 2463 + if (ctx->enable_mba_mbps) { 2464 + ret = set_mba_sc(true); 2465 + if (ret) 2466 + goto out_cdpl3; 2467 + } 2468 + 2469 + if (ctx->enable_debug) 2470 + resctrl_debug = true; 2471 + 2472 + return 0; 2473 + 2474 + out_cdpl3: 2475 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 2476 + out_cdpl2: 2477 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 2478 + out_done: 2479 + return ret; 2480 + } 2481 + 2482 + static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) 2483 + { 2484 + struct resctrl_schema *s; 2485 + const char *suffix = ""; 2486 + int ret, cl; 2487 + 2488 + s = kzalloc(sizeof(*s), GFP_KERNEL); 2489 + if (!s) 2490 + return -ENOMEM; 2491 + 2492 + s->res = r; 2493 + s->num_closid = resctrl_arch_get_num_closid(r); 2494 + if 
(resctrl_arch_get_cdp_enabled(r->rid)) 2495 + s->num_closid /= 2; 2496 + 2497 + s->conf_type = type; 2498 + switch (type) { 2499 + case CDP_CODE: 2500 + suffix = "CODE"; 2501 + break; 2502 + case CDP_DATA: 2503 + suffix = "DATA"; 2504 + break; 2505 + case CDP_NONE: 2506 + suffix = ""; 2507 + break; 2508 + } 2509 + 2510 + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); 2511 + if (ret >= sizeof(s->name)) { 2512 + kfree(s); 2513 + return -EINVAL; 2514 + } 2515 + 2516 + cl = strlen(s->name); 2517 + 2518 + /* 2519 + * If CDP is supported by this resource, but not enabled, 2520 + * include the suffix. This ensures the tabular format of the 2521 + * schemata file does not change between mounts of the filesystem. 2522 + */ 2523 + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) 2524 + cl += 4; 2525 + 2526 + if (cl > max_name_width) 2527 + max_name_width = cl; 2528 + 2529 + switch (r->schema_fmt) { 2530 + case RESCTRL_SCHEMA_BITMAP: 2531 + s->fmt_str = "%d=%x"; 2532 + break; 2533 + case RESCTRL_SCHEMA_RANGE: 2534 + s->fmt_str = "%d=%u"; 2535 + break; 2536 + } 2537 + 2538 + if (WARN_ON_ONCE(!s->fmt_str)) { 2539 + kfree(s); 2540 + return -EINVAL; 2541 + } 2542 + 2543 + INIT_LIST_HEAD(&s->list); 2544 + list_add(&s->list, &resctrl_schema_all); 2545 + 2546 + return 0; 2547 + } 2548 + 2549 + static int schemata_list_create(void) 2550 + { 2551 + struct rdt_resource *r; 2552 + int ret = 0; 2553 + 2554 + for_each_alloc_capable_rdt_resource(r) { 2555 + if (resctrl_arch_get_cdp_enabled(r->rid)) { 2556 + ret = schemata_list_add(r, CDP_CODE); 2557 + if (ret) 2558 + break; 2559 + 2560 + ret = schemata_list_add(r, CDP_DATA); 2561 + } else { 2562 + ret = schemata_list_add(r, CDP_NONE); 2563 + } 2564 + 2565 + if (ret) 2566 + break; 2567 + } 2568 + 2569 + return ret; 2570 + } 2571 + 2572 + static void schemata_list_destroy(void) 2573 + { 2574 + struct resctrl_schema *s, *tmp; 2575 + 2576 + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { 2577 + 
list_del(&s->list); 2578 + kfree(s); 2579 + } 2580 + } 2581 + 2582 + static int rdt_get_tree(struct fs_context *fc) 2583 + { 2584 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2585 + unsigned long flags = RFTYPE_CTRL_BASE; 2586 + struct rdt_mon_domain *dom; 2587 + struct rdt_resource *r; 2588 + int ret; 2589 + 2590 + cpus_read_lock(); 2591 + mutex_lock(&rdtgroup_mutex); 2592 + /* 2593 + * resctrl file system can only be mounted once. 2594 + */ 2595 + if (resctrl_mounted) { 2596 + ret = -EBUSY; 2597 + goto out; 2598 + } 2599 + 2600 + ret = rdtgroup_setup_root(ctx); 2601 + if (ret) 2602 + goto out; 2603 + 2604 + ret = rdt_enable_ctx(ctx); 2605 + if (ret) 2606 + goto out_root; 2607 + 2608 + ret = schemata_list_create(); 2609 + if (ret) { 2610 + schemata_list_destroy(); 2611 + goto out_ctx; 2612 + } 2613 + 2614 + ret = closid_init(); 2615 + if (ret) 2616 + goto out_schemata_free; 2617 + 2618 + if (resctrl_arch_mon_capable()) 2619 + flags |= RFTYPE_MON; 2620 + 2621 + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); 2622 + if (ret) 2623 + goto out_closid_exit; 2624 + 2625 + kernfs_activate(rdtgroup_default.kn); 2626 + 2627 + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); 2628 + if (ret < 0) 2629 + goto out_closid_exit; 2630 + 2631 + if (resctrl_arch_mon_capable()) { 2632 + ret = mongroup_create_dir(rdtgroup_default.kn, 2633 + &rdtgroup_default, "mon_groups", 2634 + &kn_mongrp); 2635 + if (ret < 0) 2636 + goto out_info; 2637 + 2638 + ret = mkdir_mondata_all(rdtgroup_default.kn, 2639 + &rdtgroup_default, &kn_mondata); 2640 + if (ret < 0) 2641 + goto out_mongrp; 2642 + rdtgroup_default.mon.mon_data_kn = kn_mondata; 2643 + } 2644 + 2645 + ret = rdt_pseudo_lock_init(); 2646 + if (ret) 2647 + goto out_mondata; 2648 + 2649 + ret = kernfs_get_tree(fc); 2650 + if (ret < 0) 2651 + goto out_psl; 2652 + 2653 + if (resctrl_arch_alloc_capable()) 2654 + resctrl_arch_enable_alloc(); 2655 + if (resctrl_arch_mon_capable()) 2656 + resctrl_arch_enable_mon(); 2657 + 2658 + if 
(resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) 2659 + resctrl_mounted = true; 2660 + 2661 + if (resctrl_is_mbm_enabled()) { 2662 + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 2663 + list_for_each_entry(dom, &r->mon_domains, hdr.list) 2664 + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, 2665 + RESCTRL_PICK_ANY_CPU); 2666 + } 2667 + 2668 + goto out; 2669 + 2670 + out_psl: 2671 + rdt_pseudo_lock_release(); 2672 + out_mondata: 2673 + if (resctrl_arch_mon_capable()) 2674 + kernfs_remove(kn_mondata); 2675 + out_mongrp: 2676 + if (resctrl_arch_mon_capable()) 2677 + kernfs_remove(kn_mongrp); 2678 + out_info: 2679 + kernfs_remove(kn_info); 2680 + out_closid_exit: 2681 + closid_exit(); 2682 + out_schemata_free: 2683 + schemata_list_destroy(); 2684 + out_ctx: 2685 + rdt_disable_ctx(); 2686 + out_root: 2687 + rdtgroup_destroy_root(); 2688 + out: 2689 + rdt_last_cmd_clear(); 2690 + mutex_unlock(&rdtgroup_mutex); 2691 + cpus_read_unlock(); 2692 + return ret; 2693 + } 2694 + 2695 + enum rdt_param { 2696 + Opt_cdp, 2697 + Opt_cdpl2, 2698 + Opt_mba_mbps, 2699 + Opt_debug, 2700 + nr__rdt_params 2701 + }; 2702 + 2703 + static const struct fs_parameter_spec rdt_fs_parameters[] = { 2704 + fsparam_flag("cdp", Opt_cdp), 2705 + fsparam_flag("cdpl2", Opt_cdpl2), 2706 + fsparam_flag("mba_MBps", Opt_mba_mbps), 2707 + fsparam_flag("debug", Opt_debug), 2708 + {} 2709 + }; 2710 + 2711 + static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) 2712 + { 2713 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2714 + struct fs_parse_result result; 2715 + const char *msg; 2716 + int opt; 2717 + 2718 + opt = fs_parse(fc, rdt_fs_parameters, param, &result); 2719 + if (opt < 0) 2720 + return opt; 2721 + 2722 + switch (opt) { 2723 + case Opt_cdp: 2724 + ctx->enable_cdpl3 = true; 2725 + return 0; 2726 + case Opt_cdpl2: 2727 + ctx->enable_cdpl2 = true; 2728 + return 0; 2729 + case Opt_mba_mbps: 2730 + msg = "mba_MBps requires MBM and linear scale MBA at L3 
scope"; 2731 + if (!supports_mba_mbps()) 2732 + return invalfc(fc, msg); 2733 + ctx->enable_mba_mbps = true; 2734 + return 0; 2735 + case Opt_debug: 2736 + ctx->enable_debug = true; 2737 + return 0; 2738 + } 2739 + 2740 + return -EINVAL; 2741 + } 2742 + 2743 + static void rdt_fs_context_free(struct fs_context *fc) 2744 + { 2745 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2746 + 2747 + kernfs_free_fs_context(fc); 2748 + kfree(ctx); 2749 + } 2750 + 2751 + static const struct fs_context_operations rdt_fs_context_ops = { 2752 + .free = rdt_fs_context_free, 2753 + .parse_param = rdt_parse_param, 2754 + .get_tree = rdt_get_tree, 2755 + }; 2756 + 2757 + static int rdt_init_fs_context(struct fs_context *fc) 2758 + { 2759 + struct rdt_fs_context *ctx; 2760 + 2761 + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 2762 + if (!ctx) 2763 + return -ENOMEM; 2764 + 2765 + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; 2766 + fc->fs_private = &ctx->kfc; 2767 + fc->ops = &rdt_fs_context_ops; 2768 + put_user_ns(fc->user_ns); 2769 + fc->user_ns = get_user_ns(&init_user_ns); 2770 + fc->global = true; 2771 + return 0; 2772 + } 2773 + 2774 + /* 2775 + * Move tasks from one to the other group. If @from is NULL, then all tasks 2776 + * in the systems are moved unconditionally (used for teardown). 2777 + * 2778 + * If @mask is not NULL the cpus on which moved tasks are running are set 2779 + * in that mask so the update smp function call is restricted to affected 2780 + * cpus. 2781 + */ 2782 + static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, 2783 + struct cpumask *mask) 2784 + { 2785 + struct task_struct *p, *t; 2786 + 2787 + read_lock(&tasklist_lock); 2788 + for_each_process_thread(p, t) { 2789 + if (!from || is_closid_match(t, from) || 2790 + is_rmid_match(t, from)) { 2791 + resctrl_arch_set_closid_rmid(t, to->closid, 2792 + to->mon.rmid); 2793 + 2794 + /* 2795 + * Order the closid/rmid stores above before the loads 2796 + * in task_curr(). 
This pairs with the full barrier 2797 + * between the rq->curr update and 2798 + * resctrl_arch_sched_in() during context switch. 2799 + */ 2800 + smp_mb(); 2801 + 2802 + /* 2803 + * If the task is on a CPU, set the CPU in the mask. 2804 + * The detection is inaccurate as tasks might move or 2805 + * schedule before the smp function call takes place. 2806 + * In such a case the function call is pointless, but 2807 + * there is no other side effect. 2808 + */ 2809 + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) 2810 + cpumask_set_cpu(task_cpu(t), mask); 2811 + } 2812 + } 2813 + read_unlock(&tasklist_lock); 2814 + } 2815 + 2816 + static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) 2817 + { 2818 + struct rdtgroup *sentry, *stmp; 2819 + struct list_head *head; 2820 + 2821 + head = &rdtgrp->mon.crdtgrp_list; 2822 + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { 2823 + free_rmid(sentry->closid, sentry->mon.rmid); 2824 + list_del(&sentry->mon.crdtgrp_list); 2825 + 2826 + if (atomic_read(&sentry->waitcount) != 0) 2827 + sentry->flags = RDT_DELETED; 2828 + else 2829 + rdtgroup_remove(sentry); 2830 + } 2831 + } 2832 + 2833 + /* 2834 + * Forcibly remove all of subdirectories under root. 2835 + */ 2836 + static void rmdir_all_sub(void) 2837 + { 2838 + struct rdtgroup *rdtgrp, *tmp; 2839 + 2840 + /* Move all tasks to the default resource group */ 2841 + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 2842 + 2843 + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 2844 + /* Free any child rmids */ 2845 + free_all_child_rdtgrp(rdtgrp); 2846 + 2847 + /* Remove each rdtgroup other than root */ 2848 + if (rdtgrp == &rdtgroup_default) 2849 + continue; 2850 + 2851 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 2852 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 2853 + rdtgroup_pseudo_lock_remove(rdtgrp); 2854 + 2855 + /* 2856 + * Give any CPUs back to the default group. 
We cannot copy 2857 + * cpu_online_mask because a CPU might have executed the 2858 + * offline callback already, but is still marked online. 2859 + */ 2860 + cpumask_or(&rdtgroup_default.cpu_mask, 2861 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 2862 + 2863 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 2864 + 2865 + kernfs_remove(rdtgrp->kn); 2866 + list_del(&rdtgrp->rdtgroup_list); 2867 + 2868 + if (atomic_read(&rdtgrp->waitcount) != 0) 2869 + rdtgrp->flags = RDT_DELETED; 2870 + else 2871 + rdtgroup_remove(rdtgrp); 2872 + } 2873 + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 2874 + update_closid_rmid(cpu_online_mask, &rdtgroup_default); 2875 + 2876 + kernfs_remove(kn_info); 2877 + kernfs_remove(kn_mongrp); 2878 + kernfs_remove(kn_mondata); 2879 + } 2880 + 2881 + /** 2882 + * mon_get_kn_priv() - Get the mon_data priv data for this event. 2883 + * 2884 + * The same values are used across the mon_data directories of all control and 2885 + * monitor groups for the same event in the same domain. Keep a list of 2886 + * allocated structures and re-use an existing one with the same values for 2887 + * @rid, @domid, etc. 2888 + * 2889 + * @rid: The resource id for the event file being created. 2890 + * @domid: The domain id for the event file being created. 2891 + * @mevt: The type of event file being created. 2892 + * @do_sum: Whether SNC summing monitors are being created. 
2893 + */ 2894 + static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, 2895 + struct mon_evt *mevt, 2896 + bool do_sum) 2897 + { 2898 + struct mon_data *priv; 2899 + 2900 + lockdep_assert_held(&rdtgroup_mutex); 2901 + 2902 + list_for_each_entry(priv, &mon_data_kn_priv_list, list) { 2903 + if (priv->rid == rid && priv->domid == domid && 2904 + priv->sum == do_sum && priv->evtid == mevt->evtid) 2905 + return priv; 2906 + } 2907 + 2908 + priv = kzalloc(sizeof(*priv), GFP_KERNEL); 2909 + if (!priv) 2910 + return NULL; 2911 + 2912 + priv->rid = rid; 2913 + priv->domid = domid; 2914 + priv->sum = do_sum; 2915 + priv->evtid = mevt->evtid; 2916 + list_add_tail(&priv->list, &mon_data_kn_priv_list); 2917 + 2918 + return priv; 2919 + } 2920 + 2921 + /** 2922 + * mon_put_kn_priv() - Free all allocated mon_data structures. 2923 + * 2924 + * Called when resctrl file system is unmounted. 2925 + */ 2926 + static void mon_put_kn_priv(void) 2927 + { 2928 + struct mon_data *priv, *tmp; 2929 + 2930 + lockdep_assert_held(&rdtgroup_mutex); 2931 + 2932 + list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) { 2933 + list_del(&priv->list); 2934 + kfree(priv); 2935 + } 2936 + } 2937 + 2938 + static void resctrl_fs_teardown(void) 2939 + { 2940 + lockdep_assert_held(&rdtgroup_mutex); 2941 + 2942 + /* Cleared by rdtgroup_destroy_root() */ 2943 + if (!rdtgroup_default.kn) 2944 + return; 2945 + 2946 + rmdir_all_sub(); 2947 + mon_put_kn_priv(); 2948 + rdt_pseudo_lock_release(); 2949 + rdtgroup_default.mode = RDT_MODE_SHAREABLE; 2950 + closid_exit(); 2951 + schemata_list_destroy(); 2952 + rdtgroup_destroy_root(); 2953 + } 2954 + 2955 + static void rdt_kill_sb(struct super_block *sb) 2956 + { 2957 + struct rdt_resource *r; 2958 + 2959 + cpus_read_lock(); 2960 + mutex_lock(&rdtgroup_mutex); 2961 + 2962 + rdt_disable_ctx(); 2963 + 2964 + /* Put everything back to default values. 
*/ 2965 + for_each_alloc_capable_rdt_resource(r) 2966 + resctrl_arch_reset_all_ctrls(r); 2967 + 2968 + resctrl_fs_teardown(); 2969 + if (resctrl_arch_alloc_capable()) 2970 + resctrl_arch_disable_alloc(); 2971 + if (resctrl_arch_mon_capable()) 2972 + resctrl_arch_disable_mon(); 2973 + resctrl_mounted = false; 2974 + kernfs_kill_sb(sb); 2975 + mutex_unlock(&rdtgroup_mutex); 2976 + cpus_read_unlock(); 2977 + } 2978 + 2979 + static struct file_system_type rdt_fs_type = { 2980 + .name = "resctrl", 2981 + .init_fs_context = rdt_init_fs_context, 2982 + .parameters = rdt_fs_parameters, 2983 + .kill_sb = rdt_kill_sb, 2984 + }; 2985 + 2986 + static int mon_addfile(struct kernfs_node *parent_kn, const char *name, 2987 + void *priv) 2988 + { 2989 + struct kernfs_node *kn; 2990 + int ret = 0; 2991 + 2992 + kn = __kernfs_create_file(parent_kn, name, 0444, 2993 + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, 2994 + &kf_mondata_ops, priv, NULL, NULL); 2995 + if (IS_ERR(kn)) 2996 + return PTR_ERR(kn); 2997 + 2998 + ret = rdtgroup_kn_set_ugid(kn); 2999 + if (ret) { 3000 + kernfs_remove(kn); 3001 + return ret; 3002 + } 3003 + 3004 + return ret; 3005 + } 3006 + 3007 + static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) 3008 + { 3009 + struct kernfs_node *kn; 3010 + 3011 + kn = kernfs_find_and_get(pkn, name); 3012 + if (!kn) 3013 + return; 3014 + kernfs_put(kn); 3015 + 3016 + if (kn->dir.subdirs <= 1) 3017 + kernfs_remove(kn); 3018 + else 3019 + kernfs_remove_by_name(kn, subname); 3020 + } 3021 + 3022 + /* 3023 + * Remove all subdirectories of mon_data of ctrl_mon groups 3024 + * and monitor groups for the given domain. 3025 + * Remove files and directories containing "sum" of domain data 3026 + * when last domain being summed is removed. 
3027 + */ 3028 + static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 3029 + struct rdt_mon_domain *d) 3030 + { 3031 + struct rdtgroup *prgrp, *crgrp; 3032 + char subname[32]; 3033 + bool snc_mode; 3034 + char name[32]; 3035 + 3036 + snc_mode = r->mon_scope == RESCTRL_L3_NODE; 3037 + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 3038 + if (snc_mode) 3039 + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); 3040 + 3041 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 3042 + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); 3043 + 3044 + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) 3045 + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); 3046 + } 3047 + } 3048 + 3049 + static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, 3050 + struct rdt_resource *r, struct rdtgroup *prgrp, 3051 + bool do_sum) 3052 + { 3053 + struct rmid_read rr = {0}; 3054 + struct mon_data *priv; 3055 + struct mon_evt *mevt; 3056 + int ret, domid; 3057 + 3058 + if (WARN_ON(list_empty(&r->evt_list))) 3059 + return -EPERM; 3060 + 3061 + list_for_each_entry(mevt, &r->evt_list, list) { 3062 + domid = do_sum ? 
d->ci->id : d->hdr.id; 3063 + priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); 3064 + if (WARN_ON_ONCE(!priv)) 3065 + return -EINVAL; 3066 + 3067 + ret = mon_addfile(kn, mevt->name, priv); 3068 + if (ret) 3069 + return ret; 3070 + 3071 + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) 3072 + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); 3073 + } 3074 + 3075 + return 0; 3076 + } 3077 + 3078 + static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, 3079 + struct rdt_mon_domain *d, 3080 + struct rdt_resource *r, struct rdtgroup *prgrp) 3081 + { 3082 + struct kernfs_node *kn, *ckn; 3083 + char name[32]; 3084 + bool snc_mode; 3085 + int ret = 0; 3086 + 3087 + lockdep_assert_held(&rdtgroup_mutex); 3088 + 3089 + snc_mode = r->mon_scope == RESCTRL_L3_NODE; 3090 + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 3091 + kn = kernfs_find_and_get(parent_kn, name); 3092 + if (kn) { 3093 + /* 3094 + * rdtgroup_mutex will prevent this directory from being 3095 + * removed. No need to keep this hold. 
3096 + */ 3097 + kernfs_put(kn); 3098 + } else { 3099 + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 3100 + if (IS_ERR(kn)) 3101 + return PTR_ERR(kn); 3102 + 3103 + ret = rdtgroup_kn_set_ugid(kn); 3104 + if (ret) 3105 + goto out_destroy; 3106 + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); 3107 + if (ret) 3108 + goto out_destroy; 3109 + } 3110 + 3111 + if (snc_mode) { 3112 + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); 3113 + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); 3114 + if (IS_ERR(ckn)) { 3115 + ret = -EINVAL; 3116 + goto out_destroy; 3117 + } 3118 + 3119 + ret = rdtgroup_kn_set_ugid(ckn); 3120 + if (ret) 3121 + goto out_destroy; 3122 + 3123 + ret = mon_add_all_files(ckn, d, r, prgrp, false); 3124 + if (ret) 3125 + goto out_destroy; 3126 + } 3127 + 3128 + kernfs_activate(kn); 3129 + return 0; 3130 + 3131 + out_destroy: 3132 + kernfs_remove(kn); 3133 + return ret; 3134 + } 3135 + 3136 + /* 3137 + * Add all subdirectories of mon_data for "ctrl_mon" groups 3138 + * and "monitor" groups with given domain id. 
3139 + */ 3140 + static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 3141 + struct rdt_mon_domain *d) 3142 + { 3143 + struct kernfs_node *parent_kn; 3144 + struct rdtgroup *prgrp, *crgrp; 3145 + struct list_head *head; 3146 + 3147 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 3148 + parent_kn = prgrp->mon.mon_data_kn; 3149 + mkdir_mondata_subdir(parent_kn, d, r, prgrp); 3150 + 3151 + head = &prgrp->mon.crdtgrp_list; 3152 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 3153 + parent_kn = crgrp->mon.mon_data_kn; 3154 + mkdir_mondata_subdir(parent_kn, d, r, crgrp); 3155 + } 3156 + } 3157 + } 3158 + 3159 + static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, 3160 + struct rdt_resource *r, 3161 + struct rdtgroup *prgrp) 3162 + { 3163 + struct rdt_mon_domain *dom; 3164 + int ret; 3165 + 3166 + /* Walking r->domains, ensure it can't race with cpuhp */ 3167 + lockdep_assert_cpus_held(); 3168 + 3169 + list_for_each_entry(dom, &r->mon_domains, hdr.list) { 3170 + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); 3171 + if (ret) 3172 + return ret; 3173 + } 3174 + 3175 + return 0; 3176 + } 3177 + 3178 + /* 3179 + * This creates a directory mon_data which contains the monitored data. 3180 + * 3181 + * mon_data has one directory for each domain which are named 3182 + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data 3183 + * with L3 domain looks as below: 3184 + * ./mon_data: 3185 + * mon_L3_00 3186 + * mon_L3_01 3187 + * mon_L3_02 3188 + * ... 3189 + * 3190 + * Each domain directory has one file per event: 3191 + * ./mon_L3_00/: 3192 + * llc_occupancy 3193 + * 3194 + */ 3195 + static int mkdir_mondata_all(struct kernfs_node *parent_kn, 3196 + struct rdtgroup *prgrp, 3197 + struct kernfs_node **dest_kn) 3198 + { 3199 + struct rdt_resource *r; 3200 + struct kernfs_node *kn; 3201 + int ret; 3202 + 3203 + /* 3204 + * Create the mon_data directory first. 
3205 + */ 3206 + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); 3207 + if (ret) 3208 + return ret; 3209 + 3210 + if (dest_kn) 3211 + *dest_kn = kn; 3212 + 3213 + /* 3214 + * Create the subdirectories for each domain. Note that all events 3215 + * in a domain like L3 are grouped into a resource whose domain is L3 3216 + */ 3217 + for_each_mon_capable_rdt_resource(r) { 3218 + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); 3219 + if (ret) 3220 + goto out_destroy; 3221 + } 3222 + 3223 + return 0; 3224 + 3225 + out_destroy: 3226 + kernfs_remove(kn); 3227 + return ret; 3228 + } 3229 + 3230 + /** 3231 + * cbm_ensure_valid - Enforce validity on provided CBM 3232 + * @_val: Candidate CBM 3233 + * @r: RDT resource to which the CBM belongs 3234 + * 3235 + * The provided CBM represents all cache portions available for use. This 3236 + * may be represented by a bitmap that does not consist of contiguous ones 3237 + * and thus be an invalid CBM. 3238 + * Here the provided CBM is forced to be a valid CBM by only considering 3239 + * the first set of contiguous bits as valid and clearing all bits. 3240 + * The intention here is to provide a valid default CBM with which a new 3241 + * resource group is initialized. The user can follow this with a 3242 + * modification to the CBM if the default does not satisfy the 3243 + * requirements. 
3244 + */ 3245 + static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) 3246 + { 3247 + unsigned int cbm_len = r->cache.cbm_len; 3248 + unsigned long first_bit, zero_bit; 3249 + unsigned long val = _val; 3250 + 3251 + if (!val) 3252 + return 0; 3253 + 3254 + first_bit = find_first_bit(&val, cbm_len); 3255 + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 3256 + 3257 + /* Clear any remaining bits to ensure contiguous region */ 3258 + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); 3259 + return (u32)val; 3260 + } 3261 + 3262 + /* 3263 + * Initialize cache resources per RDT domain 3264 + * 3265 + * Set the RDT domain up to start off with all usable allocations. That is, 3266 + * all shareable and unused bits. All-zero CBM is invalid. 3267 + */ 3268 + static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, 3269 + u32 closid) 3270 + { 3271 + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 3272 + enum resctrl_conf_type t = s->conf_type; 3273 + struct resctrl_staged_config *cfg; 3274 + struct rdt_resource *r = s->res; 3275 + u32 used_b = 0, unused_b = 0; 3276 + unsigned long tmp_cbm; 3277 + enum rdtgrp_mode mode; 3278 + u32 peer_ctl, ctrl_val; 3279 + int i; 3280 + 3281 + cfg = &d->staged_config[t]; 3282 + cfg->have_new_ctrl = false; 3283 + cfg->new_ctrl = r->cache.shareable_bits; 3284 + used_b = r->cache.shareable_bits; 3285 + for (i = 0; i < closids_supported(); i++) { 3286 + if (closid_allocated(i) && i != closid) { 3287 + mode = rdtgroup_mode_by_closid(i); 3288 + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) 3289 + /* 3290 + * ctrl values for locksetup aren't relevant 3291 + * until the schemata is written, and the mode 3292 + * becomes RDT_MODE_PSEUDO_LOCKED. 3293 + */ 3294 + continue; 3295 + /* 3296 + * If CDP is active include peer domain's 3297 + * usage to ensure there is no overlap 3298 + * with an exclusive group. 
3299 + */ 3300 + if (resctrl_arch_get_cdp_enabled(r->rid)) 3301 + peer_ctl = resctrl_arch_get_config(r, d, i, 3302 + peer_type); 3303 + else 3304 + peer_ctl = 0; 3305 + ctrl_val = resctrl_arch_get_config(r, d, i, 3306 + s->conf_type); 3307 + used_b |= ctrl_val | peer_ctl; 3308 + if (mode == RDT_MODE_SHAREABLE) 3309 + cfg->new_ctrl |= ctrl_val | peer_ctl; 3310 + } 3311 + } 3312 + if (d->plr && d->plr->cbm > 0) 3313 + used_b |= d->plr->cbm; 3314 + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); 3315 + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; 3316 + cfg->new_ctrl |= unused_b; 3317 + /* 3318 + * Force the initial CBM to be valid, user can 3319 + * modify the CBM based on system availability. 3320 + */ 3321 + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); 3322 + /* 3323 + * Assign the u32 CBM to an unsigned long to ensure that 3324 + * bitmap_weight() does not access out-of-bound memory. 3325 + */ 3326 + tmp_cbm = cfg->new_ctrl; 3327 + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { 3328 + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); 3329 + return -ENOSPC; 3330 + } 3331 + cfg->have_new_ctrl = true; 3332 + 3333 + return 0; 3334 + } 3335 + 3336 + /* 3337 + * Initialize cache resources with default values. 3338 + * 3339 + * A new RDT group is being created on an allocation capable (CAT) 3340 + * supporting system. Set this group up to start off with all usable 3341 + * allocations. 3342 + * 3343 + * If there are no more shareable bits available on any domain then 3344 + * the entire allocation will fail. 3345 + */ 3346 + static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) 3347 + { 3348 + struct rdt_ctrl_domain *d; 3349 + int ret; 3350 + 3351 + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { 3352 + ret = __init_one_rdt_domain(d, s, closid); 3353 + if (ret < 0) 3354 + return ret; 3355 + } 3356 + 3357 + return 0; 3358 + } 3359 + 3360 + /* Initialize MBA resource with default values. 
*/ 3361 + static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) 3362 + { 3363 + struct resctrl_staged_config *cfg; 3364 + struct rdt_ctrl_domain *d; 3365 + 3366 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 3367 + if (is_mba_sc(r)) { 3368 + d->mbps_val[closid] = MBA_MAX_MBPS; 3369 + continue; 3370 + } 3371 + 3372 + cfg = &d->staged_config[CDP_NONE]; 3373 + cfg->new_ctrl = resctrl_get_default_ctrl(r); 3374 + cfg->have_new_ctrl = true; 3375 + } 3376 + } 3377 + 3378 + /* Initialize the RDT group's allocations. */ 3379 + static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) 3380 + { 3381 + struct resctrl_schema *s; 3382 + struct rdt_resource *r; 3383 + int ret = 0; 3384 + 3385 + rdt_staged_configs_clear(); 3386 + 3387 + list_for_each_entry(s, &resctrl_schema_all, list) { 3388 + r = s->res; 3389 + if (r->rid == RDT_RESOURCE_MBA || 3390 + r->rid == RDT_RESOURCE_SMBA) { 3391 + rdtgroup_init_mba(r, rdtgrp->closid); 3392 + if (is_mba_sc(r)) 3393 + continue; 3394 + } else { 3395 + ret = rdtgroup_init_cat(s, rdtgrp->closid); 3396 + if (ret < 0) 3397 + goto out; 3398 + } 3399 + 3400 + ret = resctrl_arch_update_domains(r, rdtgrp->closid); 3401 + if (ret < 0) { 3402 + rdt_last_cmd_puts("Failed to initialize allocations\n"); 3403 + goto out; 3404 + } 3405 + } 3406 + 3407 + rdtgrp->mode = RDT_MODE_SHAREABLE; 3408 + 3409 + out: 3410 + rdt_staged_configs_clear(); 3411 + return ret; 3412 + } 3413 + 3414 + static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) 3415 + { 3416 + int ret; 3417 + 3418 + if (!resctrl_arch_mon_capable()) 3419 + return 0; 3420 + 3421 + ret = alloc_rmid(rdtgrp->closid); 3422 + if (ret < 0) { 3423 + rdt_last_cmd_puts("Out of RMIDs\n"); 3424 + return ret; 3425 + } 3426 + rdtgrp->mon.rmid = ret; 3427 + 3428 + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); 3429 + if (ret) { 3430 + rdt_last_cmd_puts("kernfs subdir error\n"); 3431 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3432 + return ret; 3433 + } 3434 
+ 3435 + return 0; 3436 + } 3437 + 3438 + static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) 3439 + { 3440 + if (resctrl_arch_mon_capable()) 3441 + free_rmid(rgrp->closid, rgrp->mon.rmid); 3442 + } 3443 + 3444 + /* 3445 + * We allow creating mon groups only with in a directory called "mon_groups" 3446 + * which is present in every ctrl_mon group. Check if this is a valid 3447 + * "mon_groups" directory. 3448 + * 3449 + * 1. The directory should be named "mon_groups". 3450 + * 2. The mon group itself should "not" be named "mon_groups". 3451 + * This makes sure "mon_groups" directory always has a ctrl_mon group 3452 + * as parent. 3453 + */ 3454 + static bool is_mon_groups(struct kernfs_node *kn, const char *name) 3455 + { 3456 + return (!strcmp(rdt_kn_name(kn), "mon_groups") && 3457 + strcmp(name, "mon_groups")); 3458 + } 3459 + 3460 + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, 3461 + const char *name, umode_t mode, 3462 + enum rdt_group_type rtype, struct rdtgroup **r) 3463 + { 3464 + struct rdtgroup *prdtgrp, *rdtgrp; 3465 + unsigned long files = 0; 3466 + struct kernfs_node *kn; 3467 + int ret; 3468 + 3469 + prdtgrp = rdtgroup_kn_lock_live(parent_kn); 3470 + if (!prdtgrp) { 3471 + ret = -ENODEV; 3472 + goto out_unlock; 3473 + } 3474 + 3475 + /* 3476 + * Check that the parent directory for a monitor group is a "mon_groups" 3477 + * directory. 3478 + */ 3479 + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { 3480 + ret = -EPERM; 3481 + goto out_unlock; 3482 + } 3483 + 3484 + if (rtype == RDTMON_GROUP && 3485 + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 3486 + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { 3487 + ret = -EINVAL; 3488 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 3489 + goto out_unlock; 3490 + } 3491 + 3492 + /* allocate the rdtgroup. 
*/ 3493 + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 3494 + if (!rdtgrp) { 3495 + ret = -ENOSPC; 3496 + rdt_last_cmd_puts("Kernel out of memory\n"); 3497 + goto out_unlock; 3498 + } 3499 + *r = rdtgrp; 3500 + rdtgrp->mon.parent = prdtgrp; 3501 + rdtgrp->type = rtype; 3502 + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); 3503 + 3504 + /* kernfs creates the directory for rdtgrp */ 3505 + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); 3506 + if (IS_ERR(kn)) { 3507 + ret = PTR_ERR(kn); 3508 + rdt_last_cmd_puts("kernfs create error\n"); 3509 + goto out_free_rgrp; 3510 + } 3511 + rdtgrp->kn = kn; 3512 + 3513 + /* 3514 + * kernfs_remove() will drop the reference count on "kn" which 3515 + * will free it. But we still need it to stick around for the 3516 + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, 3517 + * which will be dropped by kernfs_put() in rdtgroup_remove(). 3518 + */ 3519 + kernfs_get(kn); 3520 + 3521 + ret = rdtgroup_kn_set_ugid(kn); 3522 + if (ret) { 3523 + rdt_last_cmd_puts("kernfs perm error\n"); 3524 + goto out_destroy; 3525 + } 3526 + 3527 + if (rtype == RDTCTRL_GROUP) { 3528 + files = RFTYPE_BASE | RFTYPE_CTRL; 3529 + if (resctrl_arch_mon_capable()) 3530 + files |= RFTYPE_MON; 3531 + } else { 3532 + files = RFTYPE_BASE | RFTYPE_MON; 3533 + } 3534 + 3535 + ret = rdtgroup_add_files(kn, files); 3536 + if (ret) { 3537 + rdt_last_cmd_puts("kernfs fill error\n"); 3538 + goto out_destroy; 3539 + } 3540 + 3541 + /* 3542 + * The caller unlocks the parent_kn upon success. 
3543 + */ 3544 + return 0; 3545 + 3546 + out_destroy: 3547 + kernfs_put(rdtgrp->kn); 3548 + kernfs_remove(rdtgrp->kn); 3549 + out_free_rgrp: 3550 + kfree(rdtgrp); 3551 + out_unlock: 3552 + rdtgroup_kn_unlock(parent_kn); 3553 + return ret; 3554 + } 3555 + 3556 + static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) 3557 + { 3558 + kernfs_remove(rgrp->kn); 3559 + rdtgroup_remove(rgrp); 3560 + } 3561 + 3562 + /* 3563 + * Create a monitor group under "mon_groups" directory of a control 3564 + * and monitor group(ctrl_mon). This is a resource group 3565 + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 3566 + */ 3567 + static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, 3568 + const char *name, umode_t mode) 3569 + { 3570 + struct rdtgroup *rdtgrp, *prgrp; 3571 + int ret; 3572 + 3573 + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); 3574 + if (ret) 3575 + return ret; 3576 + 3577 + prgrp = rdtgrp->mon.parent; 3578 + rdtgrp->closid = prgrp->closid; 3579 + 3580 + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 3581 + if (ret) { 3582 + mkdir_rdt_prepare_clean(rdtgrp); 3583 + goto out_unlock; 3584 + } 3585 + 3586 + kernfs_activate(rdtgrp->kn); 3587 + 3588 + /* 3589 + * Add the rdtgrp to the list of rdtgrps the parent 3590 + * ctrl_mon group has to track. 3591 + */ 3592 + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); 3593 + 3594 + out_unlock: 3595 + rdtgroup_kn_unlock(parent_kn); 3596 + return ret; 3597 + } 3598 + 3599 + /* 3600 + * These are rdtgroups created under the root directory. Can be used 3601 + * to allocate and monitor resources. 
3602 + */ 3603 + static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, 3604 + const char *name, umode_t mode) 3605 + { 3606 + struct rdtgroup *rdtgrp; 3607 + struct kernfs_node *kn; 3608 + u32 closid; 3609 + int ret; 3610 + 3611 + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); 3612 + if (ret) 3613 + return ret; 3614 + 3615 + kn = rdtgrp->kn; 3616 + ret = closid_alloc(); 3617 + if (ret < 0) { 3618 + rdt_last_cmd_puts("Out of CLOSIDs\n"); 3619 + goto out_common_fail; 3620 + } 3621 + closid = ret; 3622 + ret = 0; 3623 + 3624 + rdtgrp->closid = closid; 3625 + 3626 + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 3627 + if (ret) 3628 + goto out_closid_free; 3629 + 3630 + kernfs_activate(rdtgrp->kn); 3631 + 3632 + ret = rdtgroup_init_alloc(rdtgrp); 3633 + if (ret < 0) 3634 + goto out_rmid_free; 3635 + 3636 + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 3637 + 3638 + if (resctrl_arch_mon_capable()) { 3639 + /* 3640 + * Create an empty mon_groups directory to hold the subset 3641 + * of tasks and cpus to monitor. 3642 + */ 3643 + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); 3644 + if (ret) { 3645 + rdt_last_cmd_puts("kernfs subdir error\n"); 3646 + goto out_del_list; 3647 + } 3648 + if (is_mba_sc(NULL)) 3649 + rdtgrp->mba_mbps_event = mba_mbps_default_event; 3650 + } 3651 + 3652 + goto out_unlock; 3653 + 3654 + out_del_list: 3655 + list_del(&rdtgrp->rdtgroup_list); 3656 + out_rmid_free: 3657 + mkdir_rdt_prepare_rmid_free(rdtgrp); 3658 + out_closid_free: 3659 + closid_free(closid); 3660 + out_common_fail: 3661 + mkdir_rdt_prepare_clean(rdtgrp); 3662 + out_unlock: 3663 + rdtgroup_kn_unlock(parent_kn); 3664 + return ret; 3665 + } 3666 + 3667 + static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 3668 + umode_t mode) 3669 + { 3670 + /* Do not accept '\n' to avoid unparsable situation. 
*/ 3671 + if (strchr(name, '\n')) 3672 + return -EINVAL; 3673 + 3674 + /* 3675 + * If the parent directory is the root directory and RDT 3676 + * allocation is supported, add a control and monitoring 3677 + * subdirectory 3678 + */ 3679 + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) 3680 + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); 3681 + 3682 + /* Else, attempt to add a monitoring subdirectory. */ 3683 + if (resctrl_arch_mon_capable()) 3684 + return rdtgroup_mkdir_mon(parent_kn, name, mode); 3685 + 3686 + return -EPERM; 3687 + } 3688 + 3689 + static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 3690 + { 3691 + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 3692 + u32 closid, rmid; 3693 + int cpu; 3694 + 3695 + /* Give any tasks back to the parent group */ 3696 + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); 3697 + 3698 + /* 3699 + * Update per cpu closid/rmid of the moved CPUs first. 3700 + * Note: the closid will not change, but the arch code still needs it. 3701 + */ 3702 + closid = prdtgrp->closid; 3703 + rmid = prdtgrp->mon.rmid; 3704 + for_each_cpu(cpu, &rdtgrp->cpu_mask) 3705 + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 3706 + 3707 + /* 3708 + * Update the MSR on moved CPUs and CPUs which have moved 3709 + * task running on them. 
3710 + */ 3711 + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 3712 + update_closid_rmid(tmpmask, NULL); 3713 + 3714 + rdtgrp->flags = RDT_DELETED; 3715 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3716 + 3717 + /* 3718 + * Remove the rdtgrp from the parent ctrl_mon group's list 3719 + */ 3720 + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 3721 + list_del(&rdtgrp->mon.crdtgrp_list); 3722 + 3723 + kernfs_remove(rdtgrp->kn); 3724 + 3725 + return 0; 3726 + } 3727 + 3728 + static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) 3729 + { 3730 + rdtgrp->flags = RDT_DELETED; 3731 + list_del(&rdtgrp->rdtgroup_list); 3732 + 3733 + kernfs_remove(rdtgrp->kn); 3734 + return 0; 3735 + } 3736 + 3737 + static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 3738 + { 3739 + u32 closid, rmid; 3740 + int cpu; 3741 + 3742 + /* Give any tasks back to the default group */ 3743 + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 3744 + 3745 + /* Give any CPUs back to the default group */ 3746 + cpumask_or(&rdtgroup_default.cpu_mask, 3747 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 3748 + 3749 + /* Update per cpu closid and rmid of the moved CPUs first */ 3750 + closid = rdtgroup_default.closid; 3751 + rmid = rdtgroup_default.mon.rmid; 3752 + for_each_cpu(cpu, &rdtgrp->cpu_mask) 3753 + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 3754 + 3755 + /* 3756 + * Update the MSR on moved CPUs and CPUs which have moved 3757 + * task running on them. 3758 + */ 3759 + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 3760 + update_closid_rmid(tmpmask, NULL); 3761 + 3762 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3763 + closid_free(rdtgrp->closid); 3764 + 3765 + rdtgroup_ctrl_remove(rdtgrp); 3766 + 3767 + /* 3768 + * Free all the child monitor group rmids. 
3769 + */ 3770 + free_all_child_rdtgrp(rdtgrp); 3771 + 3772 + return 0; 3773 + } 3774 + 3775 + static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) 3776 + { 3777 + /* 3778 + * Valid within the RCU section it was obtained or while rdtgroup_mutex 3779 + * is held. 3780 + */ 3781 + return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); 3782 + } 3783 + 3784 + static int rdtgroup_rmdir(struct kernfs_node *kn) 3785 + { 3786 + struct kernfs_node *parent_kn; 3787 + struct rdtgroup *rdtgrp; 3788 + cpumask_var_t tmpmask; 3789 + int ret = 0; 3790 + 3791 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 3792 + return -ENOMEM; 3793 + 3794 + rdtgrp = rdtgroup_kn_lock_live(kn); 3795 + if (!rdtgrp) { 3796 + ret = -EPERM; 3797 + goto out; 3798 + } 3799 + parent_kn = rdt_kn_parent(kn); 3800 + 3801 + /* 3802 + * If the rdtgroup is a ctrl_mon group and parent directory 3803 + * is the root directory, remove the ctrl_mon group. 3804 + * 3805 + * If the rdtgroup is a mon group and parent directory 3806 + * is a valid "mon_groups" directory, remove the mon group. 
3807 + */ 3808 + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && 3809 + rdtgrp != &rdtgroup_default) { 3810 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 3811 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 3812 + ret = rdtgroup_ctrl_remove(rdtgrp); 3813 + } else { 3814 + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); 3815 + } 3816 + } else if (rdtgrp->type == RDTMON_GROUP && 3817 + is_mon_groups(parent_kn, rdt_kn_name(kn))) { 3818 + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); 3819 + } else { 3820 + ret = -EPERM; 3821 + } 3822 + 3823 + out: 3824 + rdtgroup_kn_unlock(kn); 3825 + free_cpumask_var(tmpmask); 3826 + return ret; 3827 + } 3828 + 3829 + /** 3830 + * mongrp_reparent() - replace parent CTRL_MON group of a MON group 3831 + * @rdtgrp: the MON group whose parent should be replaced 3832 + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp 3833 + * @cpus: cpumask provided by the caller for use during this call 3834 + * 3835 + * Replaces the parent CTRL_MON group for a MON group, resulting in all member 3836 + * tasks' CLOSID immediately changing to that of the new parent group. 3837 + * Monitoring data for the group is unaffected by this operation. 3838 + */ 3839 + static void mongrp_reparent(struct rdtgroup *rdtgrp, 3840 + struct rdtgroup *new_prdtgrp, 3841 + cpumask_var_t cpus) 3842 + { 3843 + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 3844 + 3845 + WARN_ON(rdtgrp->type != RDTMON_GROUP); 3846 + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); 3847 + 3848 + /* Nothing to do when simply renaming a MON group. */ 3849 + if (prdtgrp == new_prdtgrp) 3850 + return; 3851 + 3852 + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 3853 + list_move_tail(&rdtgrp->mon.crdtgrp_list, 3854 + &new_prdtgrp->mon.crdtgrp_list); 3855 + 3856 + rdtgrp->mon.parent = new_prdtgrp; 3857 + rdtgrp->closid = new_prdtgrp->closid; 3858 + 3859 + /* Propagate updated closid to all tasks in this group. 
*/ 3860 + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); 3861 + 3862 + update_closid_rmid(cpus, NULL); 3863 + } 3864 + 3865 + static int rdtgroup_rename(struct kernfs_node *kn, 3866 + struct kernfs_node *new_parent, const char *new_name) 3867 + { 3868 + struct kernfs_node *kn_parent; 3869 + struct rdtgroup *new_prdtgrp; 3870 + struct rdtgroup *rdtgrp; 3871 + cpumask_var_t tmpmask; 3872 + int ret; 3873 + 3874 + rdtgrp = kernfs_to_rdtgroup(kn); 3875 + new_prdtgrp = kernfs_to_rdtgroup(new_parent); 3876 + if (!rdtgrp || !new_prdtgrp) 3877 + return -ENOENT; 3878 + 3879 + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ 3880 + rdtgroup_kn_get(rdtgrp, kn); 3881 + rdtgroup_kn_get(new_prdtgrp, new_parent); 3882 + 3883 + mutex_lock(&rdtgroup_mutex); 3884 + 3885 + rdt_last_cmd_clear(); 3886 + 3887 + /* 3888 + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if 3889 + * either kernfs_node is a file. 3890 + */ 3891 + if (kernfs_type(kn) != KERNFS_DIR || 3892 + kernfs_type(new_parent) != KERNFS_DIR) { 3893 + rdt_last_cmd_puts("Source and destination must be directories"); 3894 + ret = -EPERM; 3895 + goto out; 3896 + } 3897 + 3898 + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { 3899 + ret = -ENOENT; 3900 + goto out; 3901 + } 3902 + 3903 + kn_parent = rdt_kn_parent(kn); 3904 + if (rdtgrp->type != RDTMON_GROUP || !kn_parent || 3905 + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { 3906 + rdt_last_cmd_puts("Source must be a MON group\n"); 3907 + ret = -EPERM; 3908 + goto out; 3909 + } 3910 + 3911 + if (!is_mon_groups(new_parent, new_name)) { 3912 + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); 3913 + ret = -EPERM; 3914 + goto out; 3915 + } 3916 + 3917 + /* 3918 + * If the MON group is monitoring CPUs, the CPUs must be assigned to the 3919 + * current parent CTRL_MON group and therefore cannot be assigned to 3920 + * the new parent, making the move illegal. 
3921 + */ 3922 + if (!cpumask_empty(&rdtgrp->cpu_mask) && 3923 + rdtgrp->mon.parent != new_prdtgrp) { 3924 + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); 3925 + ret = -EPERM; 3926 + goto out; 3927 + } 3928 + 3929 + /* 3930 + * Allocate the cpumask for use in mongrp_reparent() to avoid the 3931 + * possibility of failing to allocate it after kernfs_rename() has 3932 + * succeeded. 3933 + */ 3934 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { 3935 + ret = -ENOMEM; 3936 + goto out; 3937 + } 3938 + 3939 + /* 3940 + * Perform all input validation and allocations needed to ensure 3941 + * mongrp_reparent() will succeed before calling kernfs_rename(), 3942 + * otherwise it would be necessary to revert this call if 3943 + * mongrp_reparent() failed. 3944 + */ 3945 + ret = kernfs_rename(kn, new_parent, new_name); 3946 + if (!ret) 3947 + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); 3948 + 3949 + free_cpumask_var(tmpmask); 3950 + 3951 + out: 3952 + mutex_unlock(&rdtgroup_mutex); 3953 + rdtgroup_kn_put(rdtgrp, kn); 3954 + rdtgroup_kn_put(new_prdtgrp, new_parent); 3955 + return ret; 3956 + } 3957 + 3958 + static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 3959 + { 3960 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) 3961 + seq_puts(seq, ",cdp"); 3962 + 3963 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) 3964 + seq_puts(seq, ",cdpl2"); 3965 + 3966 + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) 3967 + seq_puts(seq, ",mba_MBps"); 3968 + 3969 + if (resctrl_debug) 3970 + seq_puts(seq, ",debug"); 3971 + 3972 + return 0; 3973 + } 3974 + 3975 + static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { 3976 + .mkdir = rdtgroup_mkdir, 3977 + .rmdir = rdtgroup_rmdir, 3978 + .rename = rdtgroup_rename, 3979 + .show_options = rdtgroup_show_options, 3980 + }; 3981 + 3982 + static int rdtgroup_setup_root(struct rdt_fs_context *ctx) 3983 + { 3984 + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, 3985 + 
KERNFS_ROOT_CREATE_DEACTIVATED | 3986 + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, 3987 + &rdtgroup_default); 3988 + if (IS_ERR(rdt_root)) 3989 + return PTR_ERR(rdt_root); 3990 + 3991 + ctx->kfc.root = rdt_root; 3992 + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); 3993 + 3994 + return 0; 3995 + } 3996 + 3997 + static void rdtgroup_destroy_root(void) 3998 + { 3999 + lockdep_assert_held(&rdtgroup_mutex); 4000 + 4001 + kernfs_destroy_root(rdt_root); 4002 + rdtgroup_default.kn = NULL; 4003 + } 4004 + 4005 + static void rdtgroup_setup_default(void) 4006 + { 4007 + mutex_lock(&rdtgroup_mutex); 4008 + 4009 + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; 4010 + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; 4011 + rdtgroup_default.type = RDTCTRL_GROUP; 4012 + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); 4013 + 4014 + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 4015 + 4016 + mutex_unlock(&rdtgroup_mutex); 4017 + } 4018 + 4019 + static void domain_destroy_mon_state(struct rdt_mon_domain *d) 4020 + { 4021 + bitmap_free(d->rmid_busy_llc); 4022 + kfree(d->mbm_total); 4023 + kfree(d->mbm_local); 4024 + } 4025 + 4026 + void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 4027 + { 4028 + mutex_lock(&rdtgroup_mutex); 4029 + 4030 + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) 4031 + mba_sc_domain_destroy(r, d); 4032 + 4033 + mutex_unlock(&rdtgroup_mutex); 4034 + } 4035 + 4036 + void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 4037 + { 4038 + mutex_lock(&rdtgroup_mutex); 4039 + 4040 + /* 4041 + * If resctrl is mounted, remove all the 4042 + * per domain monitor data directories. 
4043 + */ 4044 + if (resctrl_mounted && resctrl_arch_mon_capable()) 4045 + rmdir_mondata_subdir_allrdtgrp(r, d); 4046 + 4047 + if (resctrl_is_mbm_enabled()) 4048 + cancel_delayed_work(&d->mbm_over); 4049 + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { 4050 + /* 4051 + * When a package is going down, forcefully 4052 + * decrement rmid->ebusy. There is no way to know 4053 + * that the L3 was flushed and hence may lead to 4054 + * incorrect counts in rare scenarios, but leaving 4055 + * the RMID as busy creates RMID leaks if the 4056 + * package never comes back. 4057 + */ 4058 + __check_limbo(d, true); 4059 + cancel_delayed_work(&d->cqm_limbo); 4060 + } 4061 + 4062 + domain_destroy_mon_state(d); 4063 + 4064 + mutex_unlock(&rdtgroup_mutex); 4065 + } 4066 + 4067 + /** 4068 + * domain_setup_mon_state() - Initialise domain monitoring structures. 4069 + * @r: The resource for the newly online domain. 4070 + * @d: The newly online domain. 4071 + * 4072 + * Allocate monitor resources that belong to this domain. 4073 + * Called when the first CPU of a domain comes online, regardless of whether 4074 + * the filesystem is mounted. 4075 + * During boot this may be called before global allocations have been made by 4076 + * resctrl_mon_resource_init(). 4077 + * 4078 + * Returns 0 for success, or -ENOMEM. 
4079 + */ 4080 + static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) 4081 + { 4082 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 4083 + size_t tsize; 4084 + 4085 + if (resctrl_arch_is_llc_occupancy_enabled()) { 4086 + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); 4087 + if (!d->rmid_busy_llc) 4088 + return -ENOMEM; 4089 + } 4090 + if (resctrl_arch_is_mbm_total_enabled()) { 4091 + tsize = sizeof(*d->mbm_total); 4092 + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); 4093 + if (!d->mbm_total) { 4094 + bitmap_free(d->rmid_busy_llc); 4095 + return -ENOMEM; 4096 + } 4097 + } 4098 + if (resctrl_arch_is_mbm_local_enabled()) { 4099 + tsize = sizeof(*d->mbm_local); 4100 + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); 4101 + if (!d->mbm_local) { 4102 + bitmap_free(d->rmid_busy_llc); 4103 + kfree(d->mbm_total); 4104 + return -ENOMEM; 4105 + } 4106 + } 4107 + 4108 + return 0; 4109 + } 4110 + 4111 + int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 4112 + { 4113 + int err = 0; 4114 + 4115 + mutex_lock(&rdtgroup_mutex); 4116 + 4117 + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { 4118 + /* RDT_RESOURCE_MBA is never mon_capable */ 4119 + err = mba_sc_domain_allocate(r, d); 4120 + } 4121 + 4122 + mutex_unlock(&rdtgroup_mutex); 4123 + 4124 + return err; 4125 + } 4126 + 4127 + int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 4128 + { 4129 + int err; 4130 + 4131 + mutex_lock(&rdtgroup_mutex); 4132 + 4133 + err = domain_setup_mon_state(r, d); 4134 + if (err) 4135 + goto out_unlock; 4136 + 4137 + if (resctrl_is_mbm_enabled()) { 4138 + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); 4139 + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, 4140 + RESCTRL_PICK_ANY_CPU); 4141 + } 4142 + 4143 + if (resctrl_arch_is_llc_occupancy_enabled()) 4144 + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); 4145 + 4146 + /* 4147 + * If the filesystem is not 
mounted then only the default resource group 4148 + * exists. Creation of its directories is deferred until mount time 4149 + * by rdt_get_tree() calling mkdir_mondata_all(). 4150 + * If resctrl is mounted, add per domain monitor data directories. 4151 + */ 4152 + if (resctrl_mounted && resctrl_arch_mon_capable()) 4153 + mkdir_mondata_subdir_allrdtgrp(r, d); 4154 + 4155 + out_unlock: 4156 + mutex_unlock(&rdtgroup_mutex); 4157 + 4158 + return err; 4159 + } 4160 + 4161 + void resctrl_online_cpu(unsigned int cpu) 4162 + { 4163 + mutex_lock(&rdtgroup_mutex); 4164 + /* The CPU is set in default rdtgroup after online. */ 4165 + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 4166 + mutex_unlock(&rdtgroup_mutex); 4167 + } 4168 + 4169 + static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) 4170 + { 4171 + struct rdtgroup *cr; 4172 + 4173 + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { 4174 + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) 4175 + break; 4176 + } 4177 + } 4178 + 4179 + static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, 4180 + struct rdt_resource *r) 4181 + { 4182 + struct rdt_mon_domain *d; 4183 + 4184 + lockdep_assert_cpus_held(); 4185 + 4186 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 4187 + /* Find the domain that contains this CPU */ 4188 + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 4189 + return d; 4190 + } 4191 + 4192 + return NULL; 4193 + } 4194 + 4195 + void resctrl_offline_cpu(unsigned int cpu) 4196 + { 4197 + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); 4198 + struct rdt_mon_domain *d; 4199 + struct rdtgroup *rdtgrp; 4200 + 4201 + mutex_lock(&rdtgroup_mutex); 4202 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 4203 + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { 4204 + clear_childcpus(rdtgrp, cpu); 4205 + break; 4206 + } 4207 + } 4208 + 4209 + if (!l3->mon_capable) 4210 + goto out_unlock; 4211 + 4212 + d = get_mon_domain_from_cpu(cpu, 
l3); 4213 + if (d) { 4214 + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { 4215 + cancel_delayed_work(&d->mbm_over); 4216 + mbm_setup_overflow_handler(d, 0, cpu); 4217 + } 4218 + if (resctrl_arch_is_llc_occupancy_enabled() && 4219 + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { 4220 + cancel_delayed_work(&d->cqm_limbo); 4221 + cqm_setup_limbo_handler(d, 0, cpu); 4222 + } 4223 + } 4224 + 4225 + out_unlock: 4226 + mutex_unlock(&rdtgroup_mutex); 4227 + } 4228 + 4229 + /* 4230 + * resctrl_init - resctrl filesystem initialization 4231 + * 4232 + * Setup resctrl file system including set up root, create mount point, 4233 + * register resctrl filesystem, and initialize files under root directory. 4234 + * 4235 + * Return: 0 on success or -errno 4236 + */ 4237 + int resctrl_init(void) 4238 + { 4239 + int ret = 0; 4240 + 4241 + seq_buf_init(&last_cmd_status, last_cmd_status_buf, 4242 + sizeof(last_cmd_status_buf)); 4243 + 4244 + rdtgroup_setup_default(); 4245 + 4246 + thread_throttle_mode_init(); 4247 + 4248 + ret = resctrl_mon_resource_init(); 4249 + if (ret) 4250 + return ret; 4251 + 4252 + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); 4253 + if (ret) { 4254 + resctrl_mon_resource_exit(); 4255 + return ret; 4256 + } 4257 + 4258 + ret = register_filesystem(&rdt_fs_type); 4259 + if (ret) 4260 + goto cleanup_mountpoint; 4261 + 4262 + /* 4263 + * Adding the resctrl debugfs directory here may not be ideal since 4264 + * it would let the resctrl debugfs directory appear on the debugfs 4265 + * filesystem before the resctrl filesystem is mounted. 4266 + * It may also be ok since that would enable debugging of RDT before 4267 + * resctrl is mounted. 4268 + * The reason why the debugfs directory is created here and not in 4269 + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and 4270 + * during the debugfs directory creation also &sb->s_type->i_mutex_key 4271 + * (the lockdep class of inode->i_rwsem). Other filesystem 4272 + * interactions (eg. 
SyS_getdents) have the lock ordering: 4273 + * &sb->s_type->i_mutex_key --> &mm->mmap_lock 4274 + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex 4275 + * is taken, thus creating dependency: 4276 + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause 4277 + * issues considering the other two lock dependencies. 4278 + * By creating the debugfs directory here we avoid a dependency 4279 + * that may cause deadlock (even though file operations cannot 4280 + * occur until the filesystem is mounted, but I do not know how to 4281 + * tell lockdep that). 4282 + */ 4283 + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); 4284 + 4285 + return 0; 4286 + 4287 + cleanup_mountpoint: 4288 + sysfs_remove_mount_point(fs_kobj, "resctrl"); 4289 + resctrl_mon_resource_exit(); 4290 + 4291 + return ret; 4292 + } 4293 + 4294 + static bool resctrl_online_domains_exist(void) 4295 + { 4296 + struct rdt_resource *r; 4297 + 4298 + /* 4299 + * Only walk capable resources to allow resctrl_arch_get_resource() 4300 + * to return dummy 'not capable' resources. 4301 + */ 4302 + for_each_alloc_capable_rdt_resource(r) { 4303 + if (!list_empty(&r->ctrl_domains)) 4304 + return true; 4305 + } 4306 + 4307 + for_each_mon_capable_rdt_resource(r) { 4308 + if (!list_empty(&r->mon_domains)) 4309 + return true; 4310 + } 4311 + 4312 + return false; 4313 + } 4314 + 4315 + /** 4316 + * resctrl_exit() - Remove the resctrl filesystem and free resources. 4317 + * 4318 + * Called by the architecture code in response to a fatal error. 4319 + * Removes resctrl files and structures from kernfs to prevent further 4320 + * configuration. 4321 + * 4322 + * When called by the architecture code, all CPUs and resctrl domains must be 4323 + * offline. This ensures the limbo and overflow handlers are not scheduled to 4324 + * run, meaning the data structures they access can be freed by 4325 + * resctrl_mon_resource_exit(). 
4326 + * 4327 + * After resctrl_exit() returns, the architecture code should return an 4328 + * error from all resctrl_arch_ functions that can do this. 4329 + * resctrl_arch_get_resource() must continue to return struct rdt_resources 4330 + * with the correct rid field to ensure the filesystem can be unmounted. 4331 + */ 4332 + void resctrl_exit(void) 4333 + { 4334 + cpus_read_lock(); 4335 + WARN_ON_ONCE(resctrl_online_domains_exist()); 4336 + 4337 + mutex_lock(&rdtgroup_mutex); 4338 + resctrl_fs_teardown(); 4339 + mutex_unlock(&rdtgroup_mutex); 4340 + 4341 + cpus_read_unlock(); 4342 + 4343 + debugfs_remove_recursive(debugfs_resctrl); 4344 + debugfs_resctrl = NULL; 4345 + unregister_filesystem(&rdt_fs_type); 4346 + 4347 + /* 4348 + * Do not remove the sysfs mount point added by resctrl_init() so that 4349 + * it can be used to umount resctrl. 4350 + */ 4351 + 4352 + resctrl_mon_resource_exit(); 4353 + }

+71 -4

include/linux/cpumask.h

··· 179 179 } 180 180 181 181 /** 182 + * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 183 + * @srcp1: the first input 184 + * @srcp2: the second input 185 + * 186 + * Return: >= nr_cpu_ids if no such cpu found. 187 + */ 188 + static __always_inline 189 + unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) 190 + { 191 + return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); 192 + } 193 + 194 + /** 182 195 * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 183 196 * @srcp1: the first input 184 197 * @srcp2: the second input ··· 294 281 if (n != -1) 295 282 cpumask_check(n); 296 283 return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), 284 + small_cpumask_bits, n + 1); 285 + } 286 + 287 + /** 288 + * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p 289 + * @n: the cpu prior to the place to search (i.e. return will be > @n) 290 + * @src1p: the first cpumask pointer 291 + * @src2p: the second cpumask pointer 292 + * 293 + * Return: >= nr_cpu_ids if no further cpus are set in *src1p & ~*src2p. 294 + */ 295 + static __always_inline 296 + unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, 297 + const struct cpumask *src2p) 298 + { 299 + /* -1 is a legal arg here. */ 300 + if (n != -1) 301 + cpumask_check(n); 302 + return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), 297 303 small_cpumask_bits, n + 1); 298 304 } 299 305 ··· 445 413 * @cpu: the cpu to ignore. 446 414 * 447 415 * Often used to find any cpu but smp_processor_id() in a mask. 416 + * If @cpu == -1, the function is equivalent to cpumask_any(). 448 417 * Return: >= nr_cpu_ids if no cpus set. 
449 418 */ 450 419 static __always_inline 451 - unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) 420 + unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) 452 421 { 453 422 unsigned int i; 454 423 455 - cpumask_check(cpu); 424 + /* -1 is a legal arg here. */ 425 + if (cpu != -1) 426 + cpumask_check(cpu); 427 + 456 428 for_each_cpu(i, mask) 457 429 if (i != cpu) 458 430 break; ··· 469 433 * @mask2: the second input cpumask 470 434 * @cpu: the cpu to ignore 471 435 * 436 + * If @cpu == -1, the function is equivalent to cpumask_any_and(). 472 437 * Returns >= nr_cpu_ids if no cpus set. 473 438 */ 474 439 static __always_inline 475 440 unsigned int cpumask_any_and_but(const struct cpumask *mask1, 476 441 const struct cpumask *mask2, 477 - unsigned int cpu) 442 + int cpu) 478 443 { 479 444 unsigned int i; 480 445 481 - cpumask_check(cpu); 446 + /* -1 is a legal arg here. */ 447 + if (cpu != -1) 448 + cpumask_check(cpu); 449 + 482 450 i = cpumask_first_and(mask1, mask2); 483 451 if (i != cpu) 484 452 return i; 485 453 486 454 return cpumask_next_and(cpu, mask1, mask2); 455 + } 456 + 457 + /** 458 + * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. 459 + * @mask1: the first input cpumask 460 + * @mask2: the second input cpumask 461 + * @cpu: the cpu to ignore 462 + * 463 + * If @cpu == -1, the function returns the first matching cpu. 464 + * Returns >= nr_cpu_ids if no cpus set. 465 + */ 466 + static __always_inline 467 + unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, 468 + const struct cpumask *mask2, 469 + int cpu) 470 + { 471 + unsigned int i; 472 + 473 + /* -1 is a legal arg here. */ 474 + if (cpu != -1) 475 + cpumask_check(cpu); 476 + 477 + i = cpumask_first_andnot(mask1, mask2); 478 + if (i != cpu) 479 + return i; 480 + 481 + return cpumask_next_andnot(cpu, mask1, mask2); 487 482 } 488 483 489 484 /**

+25

include/linux/find.h

··· 29 29 unsigned long n); 30 30 extern unsigned long _find_first_and_bit(const unsigned long *addr1, 31 31 const unsigned long *addr2, unsigned long size); 32 + unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, 33 + unsigned long size); 32 34 unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, 33 35 const unsigned long *addr3, unsigned long size); 34 36 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); ··· 348 346 return _find_first_and_bit(addr1, addr2, size); 349 347 } 350 348 #endif 349 + 350 + /** 351 + * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd 352 + * @addr1: The first address to base the search on 353 + * @addr2: The second address to base the search on 354 + * @size: The bitmap size in bits 355 + * 356 + * Returns the bit number for the first set bit 357 + * If no bits are set, returns >= @size. 358 + */ 359 + static __always_inline 360 + unsigned long find_first_andnot_bit(const unsigned long *addr1, 361 + const unsigned long *addr2, 362 + unsigned long size) 363 + { 364 + if (small_const_nbits(size)) { 365 + unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); 366 + 367 + return val ? __ffs(val) : size; 368 + } 369 + 370 + return _find_first_andnot_bit(addr1, addr2, size); 371 + } 351 372 352 373 /** 353 374 * find_first_and_and_bit - find the first set bit in 3 memory regions

+33 -3

include/linux/resctrl.h

··· 8 8 #include <linux/pid.h> 9 9 #include <linux/resctrl_types.h> 10 10 11 + #ifdef CONFIG_ARCH_HAS_CPU_RESCTRL 12 + #include <asm/resctrl.h> 13 + #endif 14 + 11 15 /* CLOSID, RMID value used by the default control group */ 12 16 #define RESCTRL_RESERVED_CLOSID 0 13 17 #define RESCTRL_RESERVED_RMID 0 ··· 47 43 #define for_each_mon_capable_rdt_resource(r) \ 48 44 for_each_rdt_resource((r)) \ 49 45 if ((r)->mon_capable) 46 + 47 + enum resctrl_res_level { 48 + RDT_RESOURCE_L3, 49 + RDT_RESOURCE_L2, 50 + RDT_RESOURCE_MBA, 51 + RDT_RESOURCE_SMBA, 52 + 53 + /* Must be the last */ 54 + RDT_NUM_RESOURCES, 55 + }; 50 56 51 57 /** 52 58 * enum resctrl_conf_type - The type of configuration. ··· 372 358 u32 resctrl_arch_system_num_rmid_idx(void); 373 359 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); 374 360 375 - __init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); 361 + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); 376 362 377 363 /** 378 364 * resctrl_arch_mon_event_config_write() - Write the config for an event. ··· 412 398 return closid * 2; 413 399 } 414 400 } 401 + 402 + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); 403 + int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); 415 404 416 405 /* 417 406 * Update the ctrl_val and apply this config right now. 
··· 531 514 extern unsigned int resctrl_rmid_realloc_threshold; 532 515 extern unsigned int resctrl_rmid_realloc_limit; 533 516 534 - int __init resctrl_init(void); 535 - void __exit resctrl_exit(void); 517 + int resctrl_init(void); 518 + void resctrl_exit(void); 536 519 520 + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 521 + u64 resctrl_arch_get_prefetch_disable_bits(void); 522 + int resctrl_arch_pseudo_lock_fn(void *_plr); 523 + int resctrl_arch_measure_cycles_lat_fn(void *_plr); 524 + int resctrl_arch_measure_l2_residency(void *_plr); 525 + int resctrl_arch_measure_l3_residency(void *_plr); 526 + #else 527 + static inline u64 resctrl_arch_get_prefetch_disable_bits(void) { return 0; } 528 + static inline int resctrl_arch_pseudo_lock_fn(void *_plr) { return 0; } 529 + static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } 530 + static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } 531 + static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } 532 + #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 537 533 #endif /* _RESCTRL_H */

+6 -10

include/linux/resctrl_types.h

··· 7 7 #ifndef __LINUX_RESCTRL_TYPES_H 8 8 #define __LINUX_RESCTRL_TYPES_H 9 9 10 + #define MAX_MBA_BW 100u 11 + #define MBM_OVERFLOW_INTERVAL 1000 12 + 10 13 /* Reads to Local DRAM Memory */ 11 14 #define READS_TO_LOCAL_MEM BIT(0) 12 15 ··· 34 31 /* Max event bits supported */ 35 32 #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) 36 33 37 - enum resctrl_res_level { 38 - RDT_RESOURCE_L3, 39 - RDT_RESOURCE_L2, 40 - RDT_RESOURCE_MBA, 41 - RDT_RESOURCE_SMBA, 42 - 43 - /* Must be the last */ 44 - RDT_NUM_RESOURCES, 45 - }; 46 - 47 34 /* 48 35 * Event IDs, the values match those used to program IA32_QM_EVTSEL before 49 36 * reading IA32_QM_CTR on RDT systems. ··· 42 49 QOS_L3_OCCUP_EVENT_ID = 0x01, 43 50 QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, 44 51 QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, 52 + 53 + /* Must be the last */ 54 + QOS_NUM_EVENTS, 45 55 }; 46 56 47 57 #endif /* __LINUX_RESCTRL_TYPES_H */

+11

lib/find_bit.c

··· 117 117 #endif 118 118 119 119 /* 120 + * Find the first bit set in 1st memory region and unset in 2nd. 121 + */ 122 + unsigned long _find_first_andnot_bit(const unsigned long *addr1, 123 + const unsigned long *addr2, 124 + unsigned long size) 125 + { 126 + return FIND_FIRST_BIT(addr1[idx] & ~addr2[idx], /* nop */, size); 127 + } 128 + EXPORT_SYMBOL(_find_first_andnot_bit); 129 + 130 + /* 120 131 * Find the first set bit in three memory regions. 121 132 */ 122 133 unsigned long _find_first_and_and_bit(const unsigned long *addr1,

Configure Feed

Configure Feed