Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86_cache_for_v6.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 resource control updates from Borislav Petkov:
"Carve out the resctrl filesystem-related code into fs/resctrl/ so that
multiple architectures can share the fs API for manipulating their
respective hardware resource control implementations.

This is the second step in the work towards sharing the resctrl
filesystem interface, the next one being plugging ARM's MPAM into the
aforementioned fs API"

* tag 'x86_cache_for_v6.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (25 commits)
MAINTAINERS: Add reviewers for fs/resctrl
x86,fs/resctrl: Move the resctrl filesystem code to live in /fs/resctrl
x86/resctrl: Always initialise rid field in rdt_resources_all[]
x86/resctrl: Relax some asm #includes
x86/resctrl: Prefer alloc(sizeof(*foo)) idiom in rdt_init_fs_context()
x86/resctrl: Squelch whitespace anomalies in resctrl core code
x86/resctrl: Move pseudo lock prototypes to include/linux/resctrl.h
x86/resctrl: Fix types in resctrl_arch_mon_ctx_{alloc,free}() stubs
x86/resctrl: Move enum resctrl_event_id to resctrl.h
x86/resctrl: Move the filesystem bits to headers visible to fs/resctrl
fs/resctrl: Add boiler plate for external resctrl code
x86/resctrl: Add 'resctrl' to the title of the resctrl documentation
x86/resctrl: Split trace.h
x86/resctrl: Expand the width of domid by replacing mon_data_bits
x86/resctrl: Add end-marker to the resctrl_event_id enum
x86/resctrl: Move is_mba_sc() out of core.c
x86/resctrl: Drop __init/__exit on assorted symbols
x86/resctrl: Resctrl_exit() teardown resctrl but leave the mount point
x86/resctrl: Check all domains are offline in resctrl_exit()
x86/resctrl: Rename resctrl_sched_in() to begin with "resctrl_arch_"
...

+7772 -7267
-1
Documentation/arch/x86/index.rst
··· 32 32 pti 33 33 mds 34 34 microcode 35 - resctrl 36 35 tsx_async_abort 37 36 buslock 38 37 usb-legacy-support
+3 -3
Documentation/arch/x86/resctrl.rst Documentation/filesystems/resctrl.rst
··· 1 1 .. SPDX-License-Identifier: GPL-2.0 2 2 .. include:: <isonum.txt> 3 3 4 - =========================================== 5 - User Interface for Resource Control feature 6 - =========================================== 4 + ===================================================== 5 + User Interface for Resource Control feature (resctrl) 6 + ===================================================== 7 7 8 8 :Copyright: |copy| 2016 Intel Corporation 9 9 :Authors: - Fenghua Yu <fenghua.yu@intel.com>
+1
Documentation/filesystems/index.rst
··· 113 113 qnx6 114 114 ramfs-rootfs-initramfs 115 115 relay 116 + resctrl 116 117 romfs 117 118 smb/index 118 119 spufs/index
+4 -1
MAINTAINERS
··· 20501 20501 RDT - RESOURCE ALLOCATION 20502 20502 M: Tony Luck <tony.luck@intel.com> 20503 20503 M: Reinette Chatre <reinette.chatre@intel.com> 20504 + R: Dave Martin <Dave.Martin@arm.com> 20505 + R: James Morse <james.morse@arm.com> 20504 20506 L: linux-kernel@vger.kernel.org 20505 20507 S: Supported 20506 - F: Documentation/arch/x86/resctrl* 20508 + F: Documentation/filesystems/resctrl.rst 20507 20509 F: arch/x86/include/asm/resctrl.h 20508 20510 F: arch/x86/kernel/cpu/resctrl/ 20511 + F: fs/resctrl/ 20509 20512 F: include/linux/resctrl*.h 20510 20513 F: tools/testing/selftests/resctrl/ 20511 20514
+8
arch/Kconfig
··· 1518 1518 config ARCH_HAS_PHYS_TO_DMA 1519 1519 bool 1520 1520 1521 + config ARCH_HAS_CPU_RESCTRL 1522 + bool 1523 + help 1524 + An architecture selects this option to indicate that the necessary 1525 + hooks are provided to support the common memory system usage 1526 + monitoring and control interfaces provided by the 'resctrl' 1527 + filesystem (see RESCTRL_FS). 1528 + 1521 1529 config HAVE_ARCH_COMPILER_H 1522 1530 bool 1523 1531 help
+3 -8
arch/x86/Kconfig
··· 507 507 config X86_CPU_RESCTRL 508 508 bool "x86 CPU resource control support" 509 509 depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) 510 - select KERNFS 511 - select PROC_CPU_RESCTRL if PROC_FS 510 + depends on MISC_FILESYSTEMS 511 + select ARCH_HAS_CPU_RESCTRL 512 + select RESCTRL_FS 512 513 select RESCTRL_FS_PSEUDO_LOCK 513 514 help 514 515 Enable x86 CPU resource control support. ··· 526 525 Platform Quality of Service Extensions manual. 527 526 528 527 Say N if unsure. 529 - 530 - config RESCTRL_FS_PSEUDO_LOCK 531 - bool 532 - help 533 - Software mechanism to pin data in a cache portion using 534 - micro-architecture specific knowledge. 535 528 536 529 config X86_FRED 537 530 bool "Flexible Return and Event Delivery"
+8 -11
arch/x86/include/asm/resctrl.h
··· 177 177 return READ_ONCE(tsk->rmid) == rmid; 178 178 } 179 179 180 - static inline void resctrl_sched_in(struct task_struct *tsk) 180 + static inline void resctrl_arch_sched_in(struct task_struct *tsk) 181 181 { 182 182 if (static_branch_likely(&rdt_enable_key)) 183 183 __resctrl_sched_in(tsk); ··· 196 196 197 197 /* x86 can always read an rmid, nothing needs allocating */ 198 198 struct rdt_resource; 199 - static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) 199 + static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, 200 + enum resctrl_event_id evtid) 200 201 { 201 202 might_sleep(); 202 203 return NULL; 203 - }; 204 + } 204 205 205 - static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, 206 - void *ctx) { }; 206 + static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, 207 + enum resctrl_event_id evtid, 208 + void *ctx) { } 207 209 208 - u64 resctrl_arch_get_prefetch_disable_bits(void); 209 - int resctrl_arch_pseudo_lock_fn(void *_plr); 210 - int resctrl_arch_measure_cycles_lat_fn(void *_plr); 211 - int resctrl_arch_measure_l2_residency(void *_plr); 212 - int resctrl_arch_measure_l3_residency(void *_plr); 213 210 void resctrl_cpu_detect(struct cpuinfo_x86 *c); 214 211 215 212 #else 216 213 217 - static inline void resctrl_sched_in(struct task_struct *tsk) {} 214 + static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} 218 215 static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} 219 216 220 217 #endif /* CONFIG_X86_CPU_RESCTRL */
+2
arch/x86/kernel/cpu/resctrl/Makefile
··· 2 2 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o 3 3 obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o 4 4 obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o 5 + 6 + # To allow define_trace.h's recursive include: 5 7 CFLAGS_pseudo_lock.o = -I$(src)
+8 -23
arch/x86/kernel/cpu/resctrl/core.c
··· 61 61 [RDT_RESOURCE_L3] = 62 62 { 63 63 .r_resctrl = { 64 - .rid = RDT_RESOURCE_L3, 65 64 .name = "L3", 66 65 .ctrl_scope = RESCTRL_L3_CACHE, 67 66 .mon_scope = RESCTRL_L3_CACHE, ··· 74 75 [RDT_RESOURCE_L2] = 75 76 { 76 77 .r_resctrl = { 77 - .rid = RDT_RESOURCE_L2, 78 78 .name = "L2", 79 79 .ctrl_scope = RESCTRL_L2_CACHE, 80 80 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), ··· 85 87 [RDT_RESOURCE_MBA] = 86 88 { 87 89 .r_resctrl = { 88 - .rid = RDT_RESOURCE_MBA, 89 90 .name = "MB", 90 91 .ctrl_scope = RESCTRL_L3_CACHE, 91 92 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), ··· 94 97 [RDT_RESOURCE_SMBA] = 95 98 { 96 99 .r_resctrl = { 97 - .rid = RDT_RESOURCE_SMBA, 98 100 .name = "SMBA", 99 101 .ctrl_scope = RESCTRL_L3_CACHE, 100 102 .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), ··· 159 163 r->alloc_capable = true; 160 164 161 165 rdt_alloc_capable = true; 162 - } 163 - 164 - bool is_mba_sc(struct rdt_resource *r) 165 - { 166 - if (!r) 167 - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 168 - 169 - /* 170 - * The software controller support is only applicable to MBA resource. 171 - * Make sure to check for resource type. 
172 - */ 173 - if (r->rid != RDT_RESOURCE_MBA) 174 - return false; 175 - 176 - return r->membw.mba_sc; 177 166 } 178 167 179 168 /* ··· 719 738 bool force_off, force_on; 720 739 }; 721 740 722 - static struct rdt_options rdt_options[] __initdata = { 741 + static struct rdt_options rdt_options[] __ro_after_init = { 723 742 RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), 724 743 RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), 725 744 RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), ··· 759 778 } 760 779 __setup("rdt", set_rdt_options); 761 780 762 - bool __init rdt_cpu_has(int flag) 781 + bool rdt_cpu_has(int flag) 763 782 { 764 783 bool ret = boot_cpu_has(flag); 765 784 struct rdt_options *o; ··· 779 798 return ret; 780 799 } 781 800 782 - __init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) 801 + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) 783 802 { 784 803 if (!rdt_cpu_has(X86_FEATURE_BMEC)) 785 804 return false; ··· 993 1012 static int __init resctrl_arch_late_init(void) 994 1013 { 995 1014 struct rdt_resource *r; 996 - int state, ret; 1015 + int state, ret, i; 1016 + 1017 + /* for_each_rdt_resource() requires all rid to be initialised. */ 1018 + for (i = 0; i < RDT_NUM_RESOURCES; i++) 1019 + rdt_resources_all[i].r_resctrl.rid = i; 997 1020 998 1021 /* 999 1022 * Initialize functions(or definitions) that are different
-635
arch/x86/kernel/cpu/resctrl/ctrlmondata.c
··· 16 16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 17 18 18 #include <linux/cpu.h> 19 - #include <linux/kernfs.h> 20 - #include <linux/seq_file.h> 21 - #include <linux/slab.h> 22 - #include <linux/tick.h> 23 19 24 20 #include "internal.h" 25 - 26 - struct rdt_parse_data { 27 - struct rdtgroup *rdtgrp; 28 - char *buf; 29 - }; 30 - 31 - typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, 32 - struct resctrl_schema *s, 33 - struct rdt_ctrl_domain *d); 34 - 35 - /* 36 - * Check whether MBA bandwidth percentage value is correct. The value is 37 - * checked against the minimum and max bandwidth values specified by the 38 - * hardware. The allocated bandwidth percentage is rounded to the next 39 - * control step available on the hardware. 40 - */ 41 - static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) 42 - { 43 - int ret; 44 - u32 bw; 45 - 46 - /* 47 - * Only linear delay values is supported for current Intel SKUs. 48 - */ 49 - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { 50 - rdt_last_cmd_puts("No support for non-linear MB domains\n"); 51 - return false; 52 - } 53 - 54 - ret = kstrtou32(buf, 10, &bw); 55 - if (ret) { 56 - rdt_last_cmd_printf("Invalid MB value %s\n", buf); 57 - return false; 58 - } 59 - 60 - /* Nothing else to do if software controller is enabled. 
*/ 61 - if (is_mba_sc(r)) { 62 - *data = bw; 63 - return true; 64 - } 65 - 66 - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { 67 - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", 68 - bw, r->membw.min_bw, r->membw.max_bw); 69 - return false; 70 - } 71 - 72 - *data = roundup(bw, (unsigned long)r->membw.bw_gran); 73 - return true; 74 - } 75 - 76 - static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, 77 - struct rdt_ctrl_domain *d) 78 - { 79 - struct resctrl_staged_config *cfg; 80 - u32 closid = data->rdtgrp->closid; 81 - struct rdt_resource *r = s->res; 82 - u32 bw_val; 83 - 84 - cfg = &d->staged_config[s->conf_type]; 85 - if (cfg->have_new_ctrl) { 86 - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 87 - return -EINVAL; 88 - } 89 - 90 - if (!bw_validate(data->buf, &bw_val, r)) 91 - return -EINVAL; 92 - 93 - if (is_mba_sc(r)) { 94 - d->mbps_val[closid] = bw_val; 95 - return 0; 96 - } 97 - 98 - cfg->new_ctrl = bw_val; 99 - cfg->have_new_ctrl = true; 100 - 101 - return 0; 102 - } 103 - 104 - /* 105 - * Check whether a cache bit mask is valid. 106 - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: 107 - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 108 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 109 - * 110 - * Haswell does not support a non-contiguous 1s value and additionally 111 - * requires at least two bits set. 112 - * AMD allows non-contiguous bitmasks. 
113 - */ 114 - static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) 115 - { 116 - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; 117 - unsigned int cbm_len = r->cache.cbm_len; 118 - unsigned long first_bit, zero_bit, val; 119 - int ret; 120 - 121 - ret = kstrtoul(buf, 16, &val); 122 - if (ret) { 123 - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); 124 - return false; 125 - } 126 - 127 - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { 128 - rdt_last_cmd_puts("Mask out of range\n"); 129 - return false; 130 - } 131 - 132 - first_bit = find_first_bit(&val, cbm_len); 133 - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 134 - 135 - /* Are non-contiguous bitmasks allowed? */ 136 - if (!r->cache.arch_has_sparse_bitmasks && 137 - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { 138 - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); 139 - return false; 140 - } 141 - 142 - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { 143 - rdt_last_cmd_printf("Need at least %d bits in the mask\n", 144 - r->cache.min_cbm_bits); 145 - return false; 146 - } 147 - 148 - *data = val; 149 - return true; 150 - } 151 - 152 - /* 153 - * Read one cache bit mask (hex). Check that it is valid for the current 154 - * resource type. 155 - */ 156 - static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, 157 - struct rdt_ctrl_domain *d) 158 - { 159 - struct rdtgroup *rdtgrp = data->rdtgrp; 160 - struct resctrl_staged_config *cfg; 161 - struct rdt_resource *r = s->res; 162 - u32 cbm_val; 163 - 164 - cfg = &d->staged_config[s->conf_type]; 165 - if (cfg->have_new_ctrl) { 166 - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 167 - return -EINVAL; 168 - } 169 - 170 - /* 171 - * Cannot set up more than one pseudo-locked region in a cache 172 - * hierarchy. 
173 - */ 174 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 175 - rdtgroup_pseudo_locked_in_hierarchy(d)) { 176 - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); 177 - return -EINVAL; 178 - } 179 - 180 - if (!cbm_validate(data->buf, &cbm_val, r)) 181 - return -EINVAL; 182 - 183 - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || 184 - rdtgrp->mode == RDT_MODE_SHAREABLE) && 185 - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { 186 - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); 187 - return -EINVAL; 188 - } 189 - 190 - /* 191 - * The CBM may not overlap with the CBM of another closid if 192 - * either is exclusive. 193 - */ 194 - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { 195 - rdt_last_cmd_puts("Overlaps with exclusive group\n"); 196 - return -EINVAL; 197 - } 198 - 199 - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { 200 - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || 201 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 202 - rdt_last_cmd_puts("Overlaps with other group\n"); 203 - return -EINVAL; 204 - } 205 - } 206 - 207 - cfg->new_ctrl = cbm_val; 208 - cfg->have_new_ctrl = true; 209 - 210 - return 0; 211 - } 212 - 213 - /* 214 - * For each domain in this resource we expect to find a series of: 215 - * id=mask 216 - * separated by ";". The "id" is in decimal, and must match one of 217 - * the "id"s for this resource. 
218 - */ 219 - static int parse_line(char *line, struct resctrl_schema *s, 220 - struct rdtgroup *rdtgrp) 221 - { 222 - enum resctrl_conf_type t = s->conf_type; 223 - ctrlval_parser_t *parse_ctrlval = NULL; 224 - struct resctrl_staged_config *cfg; 225 - struct rdt_resource *r = s->res; 226 - struct rdt_parse_data data; 227 - struct rdt_ctrl_domain *d; 228 - char *dom = NULL, *id; 229 - unsigned long dom_id; 230 - 231 - /* Walking r->domains, ensure it can't race with cpuhp */ 232 - lockdep_assert_cpus_held(); 233 - 234 - switch (r->schema_fmt) { 235 - case RESCTRL_SCHEMA_BITMAP: 236 - parse_ctrlval = &parse_cbm; 237 - break; 238 - case RESCTRL_SCHEMA_RANGE: 239 - parse_ctrlval = &parse_bw; 240 - break; 241 - } 242 - 243 - if (WARN_ON_ONCE(!parse_ctrlval)) 244 - return -EINVAL; 245 - 246 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 247 - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { 248 - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); 249 - return -EINVAL; 250 - } 251 - 252 - next: 253 - if (!line || line[0] == '\0') 254 - return 0; 255 - dom = strsep(&line, ";"); 256 - id = strsep(&dom, "="); 257 - if (!dom || kstrtoul(id, 10, &dom_id)) { 258 - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); 259 - return -EINVAL; 260 - } 261 - dom = strim(dom); 262 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 263 - if (d->hdr.id == dom_id) { 264 - data.buf = dom; 265 - data.rdtgrp = rdtgrp; 266 - if (parse_ctrlval(&data, s, d)) 267 - return -EINVAL; 268 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 269 - cfg = &d->staged_config[t]; 270 - /* 271 - * In pseudo-locking setup mode and just 272 - * parsed a valid CBM that should be 273 - * pseudo-locked. Only one locked region per 274 - * resource group and domain so just do 275 - * the required initialization for single 276 - * region and return. 
277 - */ 278 - rdtgrp->plr->s = s; 279 - rdtgrp->plr->d = d; 280 - rdtgrp->plr->cbm = cfg->new_ctrl; 281 - d->plr = rdtgrp->plr; 282 - return 0; 283 - } 284 - goto next; 285 - } 286 - } 287 - return -EINVAL; 288 - } 289 21 290 22 int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, 291 23 u32 closid, enum resctrl_conf_type t, u32 cfg_val) ··· 83 351 return 0; 84 352 } 85 353 86 - static int rdtgroup_parse_resource(char *resname, char *tok, 87 - struct rdtgroup *rdtgrp) 88 - { 89 - struct resctrl_schema *s; 90 - 91 - list_for_each_entry(s, &resctrl_schema_all, list) { 92 - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) 93 - return parse_line(tok, s, rdtgrp); 94 - } 95 - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); 96 - return -EINVAL; 97 - } 98 - 99 - ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 100 - char *buf, size_t nbytes, loff_t off) 101 - { 102 - struct resctrl_schema *s; 103 - struct rdtgroup *rdtgrp; 104 - struct rdt_resource *r; 105 - char *tok, *resname; 106 - int ret = 0; 107 - 108 - /* Valid input requires a trailing newline */ 109 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 110 - return -EINVAL; 111 - buf[nbytes - 1] = '\0'; 112 - 113 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 114 - if (!rdtgrp) { 115 - rdtgroup_kn_unlock(of->kn); 116 - return -ENOENT; 117 - } 118 - rdt_last_cmd_clear(); 119 - 120 - /* 121 - * No changes to pseudo-locked region allowed. It has to be removed 122 - * and re-created instead. 
123 - */ 124 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 125 - ret = -EINVAL; 126 - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); 127 - goto out; 128 - } 129 - 130 - rdt_staged_configs_clear(); 131 - 132 - while ((tok = strsep(&buf, "\n")) != NULL) { 133 - resname = strim(strsep(&tok, ":")); 134 - if (!tok) { 135 - rdt_last_cmd_puts("Missing ':'\n"); 136 - ret = -EINVAL; 137 - goto out; 138 - } 139 - if (tok[0] == '\0') { 140 - rdt_last_cmd_printf("Missing '%s' value\n", resname); 141 - ret = -EINVAL; 142 - goto out; 143 - } 144 - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); 145 - if (ret) 146 - goto out; 147 - } 148 - 149 - list_for_each_entry(s, &resctrl_schema_all, list) { 150 - r = s->res; 151 - 152 - /* 153 - * Writes to mba_sc resources update the software controller, 154 - * not the control MSR. 155 - */ 156 - if (is_mba_sc(r)) 157 - continue; 158 - 159 - ret = resctrl_arch_update_domains(r, rdtgrp->closid); 160 - if (ret) 161 - goto out; 162 - } 163 - 164 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 165 - /* 166 - * If pseudo-locking fails we keep the resource group in 167 - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service 168 - * active and updated for just the domain the pseudo-locked 169 - * region was requested for. 
170 - */ 171 - ret = rdtgroup_pseudo_lock_create(rdtgrp); 172 - } 173 - 174 - out: 175 - rdt_staged_configs_clear(); 176 - rdtgroup_kn_unlock(of->kn); 177 - return ret ?: nbytes; 178 - } 179 - 180 354 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, 181 355 u32 closid, enum resctrl_conf_type type) 182 356 { ··· 90 452 u32 idx = resctrl_get_config_index(closid, type); 91 453 92 454 return hw_dom->ctrl_val[idx]; 93 - } 94 - 95 - static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) 96 - { 97 - struct rdt_resource *r = schema->res; 98 - struct rdt_ctrl_domain *dom; 99 - bool sep = false; 100 - u32 ctrl_val; 101 - 102 - /* Walking r->domains, ensure it can't race with cpuhp */ 103 - lockdep_assert_cpus_held(); 104 - 105 - seq_printf(s, "%*s:", max_name_width, schema->name); 106 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 107 - if (sep) 108 - seq_puts(s, ";"); 109 - 110 - if (is_mba_sc(r)) 111 - ctrl_val = dom->mbps_val[closid]; 112 - else 113 - ctrl_val = resctrl_arch_get_config(r, dom, closid, 114 - schema->conf_type); 115 - 116 - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); 117 - sep = true; 118 - } 119 - seq_puts(s, "\n"); 120 - } 121 - 122 - int rdtgroup_schemata_show(struct kernfs_open_file *of, 123 - struct seq_file *s, void *v) 124 - { 125 - struct resctrl_schema *schema; 126 - struct rdtgroup *rdtgrp; 127 - int ret = 0; 128 - u32 closid; 129 - 130 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 131 - if (rdtgrp) { 132 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 133 - list_for_each_entry(schema, &resctrl_schema_all, list) { 134 - seq_printf(s, "%s:uninitialized\n", schema->name); 135 - } 136 - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 137 - if (!rdtgrp->plr->d) { 138 - rdt_last_cmd_clear(); 139 - rdt_last_cmd_puts("Cache domain offline\n"); 140 - ret = -ENODEV; 141 - } else { 142 - seq_printf(s, "%s:%d=%x\n", 143 - rdtgrp->plr->s->res->name, 144 - 
rdtgrp->plr->d->hdr.id, 145 - rdtgrp->plr->cbm); 146 - } 147 - } else { 148 - closid = rdtgrp->closid; 149 - list_for_each_entry(schema, &resctrl_schema_all, list) { 150 - if (closid < schema->num_closid) 151 - show_doms(s, schema, closid); 152 - } 153 - } 154 - } else { 155 - ret = -ENOENT; 156 - } 157 - rdtgroup_kn_unlock(of->kn); 158 - return ret; 159 - } 160 - 161 - static int smp_mon_event_count(void *arg) 162 - { 163 - mon_event_count(arg); 164 - 165 - return 0; 166 - } 167 - 168 - ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 169 - char *buf, size_t nbytes, loff_t off) 170 - { 171 - struct rdtgroup *rdtgrp; 172 - int ret = 0; 173 - 174 - /* Valid input requires a trailing newline */ 175 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 176 - return -EINVAL; 177 - buf[nbytes - 1] = '\0'; 178 - 179 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 180 - if (!rdtgrp) { 181 - rdtgroup_kn_unlock(of->kn); 182 - return -ENOENT; 183 - } 184 - rdt_last_cmd_clear(); 185 - 186 - if (!strcmp(buf, "mbm_local_bytes")) { 187 - if (resctrl_arch_is_mbm_local_enabled()) 188 - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; 189 - else 190 - ret = -EINVAL; 191 - } else if (!strcmp(buf, "mbm_total_bytes")) { 192 - if (resctrl_arch_is_mbm_total_enabled()) 193 - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; 194 - else 195 - ret = -EINVAL; 196 - } else { 197 - ret = -EINVAL; 198 - } 199 - 200 - if (ret) 201 - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); 202 - 203 - rdtgroup_kn_unlock(of->kn); 204 - 205 - return ret ?: nbytes; 206 - } 207 - 208 - int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 209 - struct seq_file *s, void *v) 210 - { 211 - struct rdtgroup *rdtgrp; 212 - int ret = 0; 213 - 214 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 215 - 216 - if (rdtgrp) { 217 - switch (rdtgrp->mba_mbps_event) { 218 - case QOS_L3_MBM_LOCAL_EVENT_ID: 219 - seq_puts(s, "mbm_local_bytes\n"); 220 - break; 221 - case QOS_L3_MBM_TOTAL_EVENT_ID: 222 - 
seq_puts(s, "mbm_total_bytes\n"); 223 - break; 224 - default: 225 - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); 226 - ret = -EINVAL; 227 - break; 228 - } 229 - } else { 230 - ret = -ENOENT; 231 - } 232 - 233 - rdtgroup_kn_unlock(of->kn); 234 - 235 - return ret; 236 - } 237 - 238 - struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, 239 - struct list_head **pos) 240 - { 241 - struct rdt_domain_hdr *d; 242 - struct list_head *l; 243 - 244 - list_for_each(l, h) { 245 - d = list_entry(l, struct rdt_domain_hdr, list); 246 - /* When id is found, return its domain. */ 247 - if (id == d->id) 248 - return d; 249 - /* Stop searching when finding id's position in sorted list. */ 250 - if (id < d->id) 251 - break; 252 - } 253 - 254 - if (pos) 255 - *pos = l; 256 - 257 - return NULL; 258 - } 259 - 260 - void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 261 - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 262 - cpumask_t *cpumask, int evtid, int first) 263 - { 264 - int cpu; 265 - 266 - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ 267 - lockdep_assert_cpus_held(); 268 - 269 - /* 270 - * Setup the parameters to pass to mon_event_count() to read the data. 271 - */ 272 - rr->rgrp = rdtgrp; 273 - rr->evtid = evtid; 274 - rr->r = r; 275 - rr->d = d; 276 - rr->first = first; 277 - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); 278 - if (IS_ERR(rr->arch_mon_ctx)) { 279 - rr->err = -EINVAL; 280 - return; 281 - } 282 - 283 - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); 284 - 285 - /* 286 - * cpumask_any_housekeeping() prefers housekeeping CPUs, but 287 - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. 288 - * MPAM's resctrl_arch_rmid_read() is unable to read the 289 - * counters on some platforms if its called in IRQ context. 
290 - */ 291 - if (tick_nohz_full_cpu(cpu)) 292 - smp_call_function_any(cpumask, mon_event_count, rr, 1); 293 - else 294 - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); 295 - 296 - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); 297 - } 298 - 299 - int rdtgroup_mondata_show(struct seq_file *m, void *arg) 300 - { 301 - struct kernfs_open_file *of = m->private; 302 - struct rdt_domain_hdr *hdr; 303 - struct rmid_read rr = {0}; 304 - struct rdt_mon_domain *d; 305 - u32 resid, evtid, domid; 306 - struct rdtgroup *rdtgrp; 307 - struct rdt_resource *r; 308 - union mon_data_bits md; 309 - int ret = 0; 310 - 311 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 312 - if (!rdtgrp) { 313 - ret = -ENOENT; 314 - goto out; 315 - } 316 - 317 - md.priv = of->kn->priv; 318 - resid = md.u.rid; 319 - domid = md.u.domid; 320 - evtid = md.u.evtid; 321 - r = resctrl_arch_get_resource(resid); 322 - 323 - if (md.u.sum) { 324 - /* 325 - * This file requires summing across all domains that share 326 - * the L3 cache id that was provided in the "domid" field of the 327 - * mon_data_bits union. Search all domains in the resource for 328 - * one that matches this cache id. 329 - */ 330 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 331 - if (d->ci->id == domid) { 332 - rr.ci = d->ci; 333 - mon_event_read(&rr, r, NULL, rdtgrp, 334 - &d->ci->shared_cpu_map, evtid, false); 335 - goto checkresult; 336 - } 337 - } 338 - ret = -ENOENT; 339 - goto out; 340 - } else { 341 - /* 342 - * This file provides data from a single domain. Search 343 - * the resource to find the domain with "domid". 
344 - */ 345 - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); 346 - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { 347 - ret = -ENOENT; 348 - goto out; 349 - } 350 - d = container_of(hdr, struct rdt_mon_domain, hdr); 351 - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); 352 - } 353 - 354 - checkresult: 355 - 356 - if (rr.err == -EIO) 357 - seq_puts(m, "Error\n"); 358 - else if (rr.err == -EINVAL) 359 - seq_puts(m, "Unavailable\n"); 360 - else 361 - seq_printf(m, "%llu\n", rr.val); 362 - 363 - out: 364 - rdtgroup_kn_unlock(of->kn); 365 - return ret; 366 455 }
+9 -390
arch/x86/kernel/cpu/resctrl/internal.h
··· 3 3 #define _ASM_X86_RESCTRL_INTERNAL_H 4 4 5 5 #include <linux/resctrl.h> 6 - #include <linux/sched.h> 7 - #include <linux/kernfs.h> 8 - #include <linux/fs_context.h> 9 - #include <linux/jump_label.h> 10 - #include <linux/tick.h> 11 - 12 - #include <asm/resctrl.h> 13 6 14 7 #define L3_QOS_CDP_ENABLE 0x01ULL 15 8 16 9 #define L2_QOS_CDP_ENABLE 0x01ULL 17 10 18 - #define CQM_LIMBOCHECK_INTERVAL 1000 19 - 20 11 #define MBM_CNTR_WIDTH_BASE 24 21 - #define MBM_OVERFLOW_INTERVAL 1000 22 - #define MAX_MBA_BW 100u 12 + 23 13 #define MBA_IS_LINEAR 0x4 14 + 24 15 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 25 16 26 17 #define RMID_VAL_ERROR BIT_ULL(63) 18 + 27 19 #define RMID_VAL_UNAVAIL BIT_ULL(62) 20 + 28 21 /* 29 22 * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for 30 23 * data to be returned. The counter width is discovered from the hardware 31 24 * as an offset from MBM_CNTR_WIDTH_BASE. 32 25 */ 33 26 #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) 34 - 35 - /** 36 - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that 37 - * aren't marked nohz_full 38 - * @mask: The mask to pick a CPU from. 39 - * @exclude_cpu:The CPU to avoid picking. 40 - * 41 - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping 42 - * CPUs that don't use nohz_full, these are preferred. Pass 43 - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. 44 - * 45 - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 46 - */ 47 - static inline unsigned int 48 - cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) 49 - { 50 - unsigned int cpu, hk_cpu; 51 - 52 - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) 53 - cpu = cpumask_any(mask); 54 - else 55 - cpu = cpumask_any_but(mask, exclude_cpu); 56 - 57 - /* Only continue if tick_nohz_full_mask has been initialized. */ 58 - if (!tick_nohz_full_enabled()) 59 - return cpu; 60 - 61 - /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
*/ 62 - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) 63 - return cpu; 64 - 65 - /* Try to find a CPU that isn't nohz_full to use in preference */ 66 - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); 67 - if (hk_cpu == exclude_cpu) 68 - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); 69 - 70 - if (hk_cpu < nr_cpu_ids) 71 - cpu = hk_cpu; 72 - 73 - return cpu; 74 - } 75 - 76 - struct rdt_fs_context { 77 - struct kernfs_fs_context kfc; 78 - bool enable_cdpl2; 79 - bool enable_cdpl3; 80 - bool enable_mba_mbps; 81 - bool enable_debug; 82 - }; 83 - 84 - static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) 85 - { 86 - struct kernfs_fs_context *kfc = fc->fs_private; 87 - 88 - return container_of(kfc, struct rdt_fs_context, kfc); 89 - } 90 - 91 - /** 92 - * struct mon_evt - Entry in the event list of a resource 93 - * @evtid: event id 94 - * @name: name of the event 95 - * @configurable: true if the event is configurable 96 - * @list: entry in &rdt_resource->evt_list 97 - */ 98 - struct mon_evt { 99 - enum resctrl_event_id evtid; 100 - char *name; 101 - bool configurable; 102 - struct list_head list; 103 - }; 104 - 105 - /** 106 - * union mon_data_bits - Monitoring details for each event file. 107 - * @priv: Used to store monitoring event data in @u 108 - * as kernfs private data. 109 - * @u.rid: Resource id associated with the event file. 110 - * @u.evtid: Event id associated with the event file. 111 - * @u.sum: Set when event must be summed across multiple 112 - * domains. 113 - * @u.domid: When @u.sum is zero this is the domain to which 114 - * the event file belongs. When @sum is one this 115 - * is the id of the L3 cache that all domains to be 116 - * summed share. 117 - * @u: Name of the bit fields struct. 
118 - */ 119 - union mon_data_bits { 120 - void *priv; 121 - struct { 122 - unsigned int rid : 10; 123 - enum resctrl_event_id evtid : 7; 124 - unsigned int sum : 1; 125 - unsigned int domid : 14; 126 - } u; 127 - }; 128 - 129 - /** 130 - * struct rmid_read - Data passed across smp_call*() to read event count. 131 - * @rgrp: Resource group for which the counter is being read. If it is a parent 132 - * resource group then its event count is summed with the count from all 133 - * its child resource groups. 134 - * @r: Resource describing the properties of the event being read. 135 - * @d: Domain that the counter should be read from. If NULL then sum all 136 - * domains in @r sharing L3 @ci.id 137 - * @evtid: Which monitor event to read. 138 - * @first: Initialize MBM counter when true. 139 - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. 140 - * @err: Error encountered when reading counter. 141 - * @val: Returned value of event counter. If @rgrp is a parent resource group, 142 - * @val includes the sum of event counts from its child resource groups. 143 - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, 144 - * (summed across child resource groups if @rgrp is a parent resource group). 145 - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
146 - */ 147 - struct rmid_read { 148 - struct rdtgroup *rgrp; 149 - struct rdt_resource *r; 150 - struct rdt_mon_domain *d; 151 - enum resctrl_event_id evtid; 152 - bool first; 153 - struct cacheinfo *ci; 154 - int err; 155 - u64 val; 156 - void *arch_mon_ctx; 157 - }; 158 - 159 - extern struct list_head resctrl_schema_all; 160 - extern bool resctrl_mounted; 161 - 162 - enum rdt_group_type { 163 - RDTCTRL_GROUP = 0, 164 - RDTMON_GROUP, 165 - RDT_NUM_GROUP, 166 - }; 167 - 168 - /** 169 - * enum rdtgrp_mode - Mode of a RDT resource group 170 - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations 171 - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed 172 - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking 173 - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations 174 - * allowed AND the allocations are Cache Pseudo-Locked 175 - * @RDT_NUM_MODES: Total number of modes 176 - * 177 - * The mode of a resource group enables control over the allowed overlap 178 - * between allocations associated with different resource groups (classes 179 - * of service). User is able to modify the mode of a resource group by 180 - * writing to the "mode" resctrl file associated with the resource group. 181 - * 182 - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by 183 - * writing the appropriate text to the "mode" file. A resource group enters 184 - * "pseudo-locked" mode after the schemata is written while the resource 185 - * group is in "pseudo-locksetup" mode. 186 - */ 187 - enum rdtgrp_mode { 188 - RDT_MODE_SHAREABLE = 0, 189 - RDT_MODE_EXCLUSIVE, 190 - RDT_MODE_PSEUDO_LOCKSETUP, 191 - RDT_MODE_PSEUDO_LOCKED, 192 - 193 - /* Must be last */ 194 - RDT_NUM_MODES, 195 - }; 196 - 197 - /** 198 - * struct mongroup - store mon group's data in resctrl fs. 
199 - * @mon_data_kn: kernfs node for the mon_data directory 200 - * @parent: parent rdtgrp 201 - * @crdtgrp_list: child rdtgroup node list 202 - * @rmid: rmid for this rdtgroup 203 - */ 204 - struct mongroup { 205 - struct kernfs_node *mon_data_kn; 206 - struct rdtgroup *parent; 207 - struct list_head crdtgrp_list; 208 - u32 rmid; 209 - }; 210 - 211 - /** 212 - * struct rdtgroup - store rdtgroup's data in resctrl file system. 213 - * @kn: kernfs node 214 - * @rdtgroup_list: linked list for all rdtgroups 215 - * @closid: closid for this rdtgroup 216 - * @cpu_mask: CPUs assigned to this rdtgroup 217 - * @flags: status bits 218 - * @waitcount: how many cpus expect to find this 219 - * group when they acquire rdtgroup_mutex 220 - * @type: indicates type of this rdtgroup - either 221 - * monitor only or ctrl_mon group 222 - * @mon: mongroup related data 223 - * @mode: mode of resource group 224 - * @mba_mbps_event: input monitoring event id when mba_sc is enabled 225 - * @plr: pseudo-locked region 226 - */ 227 - struct rdtgroup { 228 - struct kernfs_node *kn; 229 - struct list_head rdtgroup_list; 230 - u32 closid; 231 - struct cpumask cpu_mask; 232 - int flags; 233 - atomic_t waitcount; 234 - enum rdt_group_type type; 235 - struct mongroup mon; 236 - enum rdtgrp_mode mode; 237 - enum resctrl_event_id mba_mbps_event; 238 - struct pseudo_lock_region *plr; 239 - }; 240 - 241 - /* rdtgroup.flags */ 242 - #define RDT_DELETED 1 243 - 244 - /* rftype.flags */ 245 - #define RFTYPE_FLAGS_CPUS_LIST 1 246 - 247 - /* 248 - * Define the file type flags for base and info directories. 
249 - */ 250 - #define RFTYPE_INFO BIT(0) 251 - #define RFTYPE_BASE BIT(1) 252 - #define RFTYPE_CTRL BIT(4) 253 - #define RFTYPE_MON BIT(5) 254 - #define RFTYPE_TOP BIT(6) 255 - #define RFTYPE_RES_CACHE BIT(8) 256 - #define RFTYPE_RES_MB BIT(9) 257 - #define RFTYPE_DEBUG BIT(10) 258 - #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) 259 - #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) 260 - #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) 261 - #define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) 262 - #define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) 263 - 264 - /* List of all resource groups */ 265 - extern struct list_head rdt_all_groups; 266 - 267 - extern int max_name_width; 268 - 269 - /** 270 - * struct rftype - describe each file in the resctrl file system 271 - * @name: File name 272 - * @mode: Access mode 273 - * @kf_ops: File operations 274 - * @flags: File specific RFTYPE_FLAGS_* flags 275 - * @fflags: File specific RFTYPE_* flags 276 - * @seq_show: Show content of the file 277 - * @write: Write to the file 278 - */ 279 - struct rftype { 280 - char *name; 281 - umode_t mode; 282 - const struct kernfs_ops *kf_ops; 283 - unsigned long flags; 284 - unsigned long fflags; 285 - 286 - int (*seq_show)(struct kernfs_open_file *of, 287 - struct seq_file *sf, void *v); 288 - /* 289 - * write() is the generic write callback which maps directly to 290 - * kernfs write operation and overrides all other operations. 291 - * Maximum write size is determined by ->max_write_len. 
292 - */ 293 - ssize_t (*write)(struct kernfs_open_file *of, 294 - char *buf, size_t nbytes, loff_t off); 295 - }; 296 - 297 - /** 298 - * struct mbm_state - status for each MBM counter in each domain 299 - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation 300 - * @prev_bw: The most recent bandwidth in MBps 301 - */ 302 - struct mbm_state { 303 - u64 prev_bw_bytes; 304 - u32 prev_bw; 305 - }; 306 27 307 28 /** 308 29 * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s ··· 122 401 return container_of(r, struct rdt_hw_resource, r_resctrl); 123 402 } 124 403 125 - extern struct mutex rdtgroup_mutex; 126 - 127 - static inline const char *rdt_kn_name(const struct kernfs_node *kn) 128 - { 129 - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); 130 - } 131 - 132 404 extern struct rdt_hw_resource rdt_resources_all[]; 133 - extern struct rdtgroup rdtgroup_default; 134 - extern struct dentry *debugfs_resctrl; 135 - extern enum resctrl_event_id mba_mbps_default_event; 136 - 137 - static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) 138 - { 139 - return rdt_resources_all[l].cdp_enabled; 140 - } 141 - 142 - int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); 143 405 144 406 void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); 145 407 ··· 159 455 unsigned int full; 160 456 }; 161 457 162 - void rdt_last_cmd_clear(void); 163 - void rdt_last_cmd_puts(const char *s); 164 - __printf(1, 2) 165 - void rdt_last_cmd_printf(const char *fmt, ...); 166 - 167 458 void rdt_ctrl_update(void *arg); 168 - struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); 169 - void rdtgroup_kn_unlock(struct kernfs_node *kn); 170 - int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); 171 - int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 172 - umode_t mask); 173 - ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 174 - 
char *buf, size_t nbytes, loff_t off); 175 - int rdtgroup_schemata_show(struct kernfs_open_file *of, 176 - struct seq_file *s, void *v); 177 - ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 178 - char *buf, size_t nbytes, loff_t off); 179 - int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 180 - struct seq_file *s, void *v); 181 - bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 182 - unsigned long cbm, int closid, bool exclusive); 183 - unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, 184 - unsigned long cbm); 185 - enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); 186 - int rdtgroup_tasks_assigned(struct rdtgroup *r); 187 - int closids_supported(void); 188 - void closid_free(int closid); 189 - int alloc_rmid(u32 closid); 190 - void free_rmid(u32 closid, u32 rmid); 459 + 191 460 int rdt_get_mon_l3_config(struct rdt_resource *r); 192 - void resctrl_mon_resource_exit(void); 193 - bool __init rdt_cpu_has(int flag); 194 - void mon_event_count(void *info); 195 - int rdtgroup_mondata_show(struct seq_file *m, void *arg); 196 - void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 197 - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 198 - cpumask_t *cpumask, int evtid, int first); 199 - int __init resctrl_mon_resource_init(void); 200 - void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, 201 - unsigned long delay_ms, 202 - int exclude_cpu); 203 - void mbm_handle_overflow(struct work_struct *work); 461 + 462 + bool rdt_cpu_has(int flag); 463 + 204 464 void __init intel_rdt_mbm_apply_quirk(void); 205 - bool is_mba_sc(struct rdt_resource *r); 206 - void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 207 - int exclude_cpu); 208 - void cqm_handle_limbo(struct work_struct *work); 209 - bool has_busy_rmid(struct rdt_mon_domain *d); 210 - void __check_limbo(struct rdt_mon_domain *d, bool force_free); 465 + 211 466 void 
rdt_domain_reconfigure_cdp(struct rdt_resource *r); 212 - void resctrl_file_fflags_init(const char *config, unsigned long fflags); 213 - void rdt_staged_configs_clear(void); 214 - bool closid_allocated(unsigned int closid); 215 - int resctrl_find_cleanest_closid(void); 216 - 217 - #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 218 - int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); 219 - int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); 220 - bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); 221 - bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); 222 - int rdt_pseudo_lock_init(void); 223 - void rdt_pseudo_lock_release(void); 224 - int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); 225 - void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); 226 - #else 227 - static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 228 - { 229 - return -EOPNOTSUPP; 230 - } 231 - 232 - static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 233 - { 234 - return -EOPNOTSUPP; 235 - } 236 - 237 - static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 238 - { 239 - return false; 240 - } 241 - 242 - static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 243 - { 244 - return false; 245 - } 246 - 247 - static inline int rdt_pseudo_lock_init(void) { return 0; } 248 - static inline void rdt_pseudo_lock_release(void) { } 249 - static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 250 - { 251 - return -EOPNOTSUPP; 252 - } 253 - 254 - static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } 255 - #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 256 467 257 468 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
+7 -911
arch/x86/kernel/cpu/resctrl/monitor.c
··· 18 18 #define pr_fmt(fmt) "resctrl: " fmt 19 19 20 20 #include <linux/cpu.h> 21 - #include <linux/module.h> 22 - #include <linux/sizes.h> 23 - #include <linux/slab.h> 21 + #include <linux/resctrl.h> 24 22 25 23 #include <asm/cpu_device_id.h> 26 24 #include <asm/msr.h> 27 - #include <asm/resctrl.h> 28 25 29 26 #include "internal.h" 30 - #include "trace.h" 31 - 32 - /** 33 - * struct rmid_entry - dirty tracking for all RMID. 34 - * @closid: The CLOSID for this entry. 35 - * @rmid: The RMID for this entry. 36 - * @busy: The number of domains with cached data using this RMID. 37 - * @list: Member of the rmid_free_lru list when busy == 0. 38 - * 39 - * Depending on the architecture the correct monitor is accessed using 40 - * both @closid and @rmid, or @rmid only. 41 - * 42 - * Take the rdtgroup_mutex when accessing. 43 - */ 44 - struct rmid_entry { 45 - u32 closid; 46 - u32 rmid; 47 - int busy; 48 - struct list_head list; 49 - }; 50 - 51 - /* 52 - * @rmid_free_lru - A least recently used list of free RMIDs 53 - * These RMIDs are guaranteed to have an occupancy less than the 54 - * threshold occupancy 55 - */ 56 - static LIST_HEAD(rmid_free_lru); 57 - 58 - /* 59 - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. 60 - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. 61 - * Indexed by CLOSID. Protected by rdtgroup_mutex. 62 - */ 63 - static u32 *closid_num_dirty_rmid; 64 - 65 - /* 66 - * @rmid_limbo_count - count of currently unused but (potentially) 67 - * dirty RMIDs. 68 - * This counts RMIDs that no one is currently using but that 69 - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can 70 - * change the threshold occupancy value. 71 - */ 72 - static unsigned int rmid_limbo_count; 73 - 74 - /* 75 - * @rmid_entry - The entry in the limbo and free lists. 
76 - */ 77 - static struct rmid_entry *rmid_ptrs; 78 27 79 28 /* 80 29 * Global boolean for rdt_monitor which is true if any ··· 36 87 */ 37 88 unsigned int rdt_mon_features; 38 89 39 - /* 40 - * This is the threshold cache occupancy in bytes at which we will consider an 41 - * RMID available for re-allocation. 42 - */ 43 - unsigned int resctrl_rmid_realloc_threshold; 44 - 45 - /* 46 - * This is the maximum value for the reallocation threshold, in bytes. 47 - */ 48 - unsigned int resctrl_rmid_realloc_limit; 49 - 50 90 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) 51 91 52 92 static int snc_nodes_per_l3_cache = 1; 53 93 54 94 /* 55 - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. 95 + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. 56 96 * If rmid > rmid threshold, MBM total and local values should be multiplied 57 97 * by the correction factor. 58 98 * ··· 90 152 }; 91 153 92 154 static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; 155 + 93 156 static u64 mbm_cf __read_mostly; 94 157 95 158 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) ··· 100 161 val = (val * mbm_cf) >> 20; 101 162 102 163 return val; 103 - } 104 - 105 - /* 106 - * x86 and arm64 differ in their handling of monitoring. 107 - * x86's RMID are independent numbers, there is only one source of traffic 108 - * with an RMID value of '1'. 109 - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of 110 - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID 111 - * value is no longer unique. 112 - * To account for this, resctrl uses an index. On x86 this is just the RMID, 113 - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. 114 - * 115 - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code 116 - * must accept an attempt to read every index. 
117 - */ 118 - static inline struct rmid_entry *__rmid_entry(u32 idx) 119 - { 120 - struct rmid_entry *entry; 121 - u32 closid, rmid; 122 - 123 - entry = &rmid_ptrs[idx]; 124 - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); 125 - 126 - WARN_ON_ONCE(entry->closid != closid); 127 - WARN_ON_ONCE(entry->rmid != rmid); 128 - 129 - return entry; 130 164 } 131 165 132 166 /* ··· 173 261 return &hw_dom->arch_mbm_total[rmid]; 174 262 case QOS_L3_MBM_LOCAL_EVENT_ID: 175 263 return &hw_dom->arch_mbm_local[rmid]; 264 + default: 265 + /* Never expect to get here */ 266 + WARN_ON_ONCE(1); 267 + return NULL; 176 268 } 177 - 178 - /* Never expect to get here */ 179 - WARN_ON_ONCE(1); 180 - 181 - return NULL; 182 269 } 183 270 184 271 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, ··· 256 345 *val = chunks * hw_res->mon_scale; 257 346 258 347 return 0; 259 - } 260 - 261 - static void limbo_release_entry(struct rmid_entry *entry) 262 - { 263 - lockdep_assert_held(&rdtgroup_mutex); 264 - 265 - rmid_limbo_count--; 266 - list_add_tail(&entry->list, &rmid_free_lru); 267 - 268 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 269 - closid_num_dirty_rmid[entry->closid]--; 270 - } 271 - 272 - /* 273 - * Check the RMIDs that are marked as busy for this domain. If the 274 - * reported LLC occupancy is below the threshold clear the busy bit and 275 - * decrement the count. 
If the busy count gets to zero on an RMID, we 276 - * free the RMID 277 - */ 278 - void __check_limbo(struct rdt_mon_domain *d, bool force_free) 279 - { 280 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 281 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 282 - struct rmid_entry *entry; 283 - u32 idx, cur_idx = 1; 284 - void *arch_mon_ctx; 285 - bool rmid_dirty; 286 - u64 val = 0; 287 - 288 - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); 289 - if (IS_ERR(arch_mon_ctx)) { 290 - pr_warn_ratelimited("Failed to allocate monitor context: %ld", 291 - PTR_ERR(arch_mon_ctx)); 292 - return; 293 - } 294 - 295 - /* 296 - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that 297 - * are marked as busy for occupancy < threshold. If the occupancy 298 - * is less than the threshold decrement the busy counter of the 299 - * RMID and move it to the free list when the counter reaches 0. 300 - */ 301 - for (;;) { 302 - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); 303 - if (idx >= idx_limit) 304 - break; 305 - 306 - entry = __rmid_entry(idx); 307 - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, 308 - QOS_L3_OCCUP_EVENT_ID, &val, 309 - arch_mon_ctx)) { 310 - rmid_dirty = true; 311 - } else { 312 - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); 313 - 314 - /* 315 - * x86's CLOSID and RMID are independent numbers, so the entry's 316 - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the 317 - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't 318 - * used to select the configuration. It is thus necessary to track both 319 - * CLOSID and RMID because there may be dependencies between them 320 - * on some architectures. 
321 - */ 322 - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); 323 - } 324 - 325 - if (force_free || !rmid_dirty) { 326 - clear_bit(idx, d->rmid_busy_llc); 327 - if (!--entry->busy) 328 - limbo_release_entry(entry); 329 - } 330 - cur_idx = idx + 1; 331 - } 332 - 333 - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); 334 - } 335 - 336 - bool has_busy_rmid(struct rdt_mon_domain *d) 337 - { 338 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 339 - 340 - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; 341 - } 342 - 343 - static struct rmid_entry *resctrl_find_free_rmid(u32 closid) 344 - { 345 - struct rmid_entry *itr; 346 - u32 itr_idx, cmp_idx; 347 - 348 - if (list_empty(&rmid_free_lru)) 349 - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); 350 - 351 - list_for_each_entry(itr, &rmid_free_lru, list) { 352 - /* 353 - * Get the index of this free RMID, and the index it would need 354 - * to be if it were used with this CLOSID. 355 - * If the CLOSID is irrelevant on this architecture, the two 356 - * index values are always the same on every entry and thus the 357 - * very first entry will be returned. 358 - */ 359 - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); 360 - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); 361 - 362 - if (itr_idx == cmp_idx) 363 - return itr; 364 - } 365 - 366 - return ERR_PTR(-ENOSPC); 367 - } 368 - 369 - /** 370 - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated 371 - * RMID are clean, or the CLOSID that has 372 - * the most clean RMID. 373 - * 374 - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID 375 - * may not be able to allocate clean RMID. To avoid this the allocator will 376 - * choose the CLOSID with the most clean RMID. 377 - * 378 - * When the CLOSID and RMID are independent numbers, the first free CLOSID will 379 - * be returned. 
380 - */ 381 - int resctrl_find_cleanest_closid(void) 382 - { 383 - u32 cleanest_closid = ~0; 384 - int i = 0; 385 - 386 - lockdep_assert_held(&rdtgroup_mutex); 387 - 388 - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 389 - return -EIO; 390 - 391 - for (i = 0; i < closids_supported(); i++) { 392 - int num_dirty; 393 - 394 - if (closid_allocated(i)) 395 - continue; 396 - 397 - num_dirty = closid_num_dirty_rmid[i]; 398 - if (num_dirty == 0) 399 - return i; 400 - 401 - if (cleanest_closid == ~0) 402 - cleanest_closid = i; 403 - 404 - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) 405 - cleanest_closid = i; 406 - } 407 - 408 - if (cleanest_closid == ~0) 409 - return -ENOSPC; 410 - 411 - return cleanest_closid; 412 - } 413 - 414 - /* 415 - * For MPAM the RMID value is not unique, and has to be considered with 416 - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which 417 - * allows all domains to be managed by a single free list. 418 - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 419 - */ 420 - int alloc_rmid(u32 closid) 421 - { 422 - struct rmid_entry *entry; 423 - 424 - lockdep_assert_held(&rdtgroup_mutex); 425 - 426 - entry = resctrl_find_free_rmid(closid); 427 - if (IS_ERR(entry)) 428 - return PTR_ERR(entry); 429 - 430 - list_del(&entry->list); 431 - return entry->rmid; 432 - } 433 - 434 - static void add_rmid_to_limbo(struct rmid_entry *entry) 435 - { 436 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 437 - struct rdt_mon_domain *d; 438 - u32 idx; 439 - 440 - lockdep_assert_held(&rdtgroup_mutex); 441 - 442 - /* Walking r->domains, ensure it can't race with cpuhp */ 443 - lockdep_assert_cpus_held(); 444 - 445 - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); 446 - 447 - entry->busy = 0; 448 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 449 - /* 450 - * For the first limbo RMID in the domain, 451 - * setup up the limbo worker. 
452 - */ 453 - if (!has_busy_rmid(d)) 454 - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, 455 - RESCTRL_PICK_ANY_CPU); 456 - set_bit(idx, d->rmid_busy_llc); 457 - entry->busy++; 458 - } 459 - 460 - rmid_limbo_count++; 461 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 462 - closid_num_dirty_rmid[entry->closid]++; 463 - } 464 - 465 - void free_rmid(u32 closid, u32 rmid) 466 - { 467 - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 468 - struct rmid_entry *entry; 469 - 470 - lockdep_assert_held(&rdtgroup_mutex); 471 - 472 - /* 473 - * Do not allow the default rmid to be free'd. Comparing by index 474 - * allows architectures that ignore the closid parameter to avoid an 475 - * unnecessary check. 476 - */ 477 - if (!resctrl_arch_mon_capable() || 478 - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 479 - RESCTRL_RESERVED_RMID)) 480 - return; 481 - 482 - entry = __rmid_entry(idx); 483 - 484 - if (resctrl_arch_is_llc_occupancy_enabled()) 485 - add_rmid_to_limbo(entry); 486 - else 487 - list_add_tail(&entry->list, &rmid_free_lru); 488 - } 489 - 490 - static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, 491 - u32 rmid, enum resctrl_event_id evtid) 492 - { 493 - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 494 - 495 - switch (evtid) { 496 - case QOS_L3_MBM_TOTAL_EVENT_ID: 497 - return &d->mbm_total[idx]; 498 - case QOS_L3_MBM_LOCAL_EVENT_ID: 499 - return &d->mbm_local[idx]; 500 - default: 501 - return NULL; 502 - } 503 - } 504 - 505 - static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) 506 - { 507 - int cpu = smp_processor_id(); 508 - struct rdt_mon_domain *d; 509 - struct mbm_state *m; 510 - int err, ret; 511 - u64 tval = 0; 512 - 513 - if (rr->first) { 514 - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); 515 - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 516 - if (m) 517 - memset(m, 0, sizeof(struct mbm_state)); 518 - return 0; 519 - } 520 - 521 - if (rr->d) { 
522 - /* Reading a single domain, must be on a CPU in that domain. */ 523 - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) 524 - return -EINVAL; 525 - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, 526 - rr->evtid, &tval, rr->arch_mon_ctx); 527 - if (rr->err) 528 - return rr->err; 529 - 530 - rr->val += tval; 531 - 532 - return 0; 533 - } 534 - 535 - /* Summing domains that share a cache, must be on a CPU for that cache. */ 536 - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) 537 - return -EINVAL; 538 - 539 - /* 540 - * Legacy files must report the sum of an event across all 541 - * domains that share the same L3 cache instance. 542 - * Report success if a read from any domain succeeds, -EINVAL 543 - * (translated to "Unavailable" for user space) if reading from 544 - * all domains fail for any reason. 545 - */ 546 - ret = -EINVAL; 547 - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { 548 - if (d->ci->id != rr->ci->id) 549 - continue; 550 - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, 551 - rr->evtid, &tval, rr->arch_mon_ctx); 552 - if (!err) { 553 - rr->val += tval; 554 - ret = 0; 555 - } 556 - } 557 - 558 - if (ret) 559 - rr->err = ret; 560 - 561 - return ret; 562 - } 563 - 564 - /* 565 - * mbm_bw_count() - Update bw count from values previously read by 566 - * __mon_event_count(). 567 - * @closid: The closid used to identify the cached mbm_state. 568 - * @rmid: The rmid used to identify the cached mbm_state. 569 - * @rr: The struct rmid_read populated by __mon_event_count(). 570 - * 571 - * Supporting function to calculate the memory bandwidth 572 - * and delta bandwidth in MBps. The chunks value previously read by 573 - * __mon_event_count() is compared with the chunks value from the previous 574 - * invocation. This must be called once per second to maintain values in MBps. 
575 - */ 576 - static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) 577 - { 578 - u64 cur_bw, bytes, cur_bytes; 579 - struct mbm_state *m; 580 - 581 - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 582 - if (WARN_ON_ONCE(!m)) 583 - return; 584 - 585 - cur_bytes = rr->val; 586 - bytes = cur_bytes - m->prev_bw_bytes; 587 - m->prev_bw_bytes = cur_bytes; 588 - 589 - cur_bw = bytes / SZ_1M; 590 - 591 - m->prev_bw = cur_bw; 592 - } 593 - 594 - /* 595 - * This is scheduled by mon_event_read() to read the CQM/MBM counters 596 - * on a domain. 597 - */ 598 - void mon_event_count(void *info) 599 - { 600 - struct rdtgroup *rdtgrp, *entry; 601 - struct rmid_read *rr = info; 602 - struct list_head *head; 603 - int ret; 604 - 605 - rdtgrp = rr->rgrp; 606 - 607 - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); 608 - 609 - /* 610 - * For Ctrl groups read data from child monitor groups and 611 - * add them together. Count events which are read successfully. 612 - * Discard the rmid_read's reporting errors. 613 - */ 614 - head = &rdtgrp->mon.crdtgrp_list; 615 - 616 - if (rdtgrp->type == RDTCTRL_GROUP) { 617 - list_for_each_entry(entry, head, mon.crdtgrp_list) { 618 - if (__mon_event_count(entry->closid, entry->mon.rmid, 619 - rr) == 0) 620 - ret = 0; 621 - } 622 - } 623 - 624 - /* 625 - * __mon_event_count() calls for newly created monitor groups may 626 - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. 627 - * Discard error if any of the monitor event reads succeeded. 
628 - */ 629 - if (ret == 0) 630 - rr->err = 0; 631 - } 632 - 633 - static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, 634 - struct rdt_resource *r) 635 - { 636 - struct rdt_ctrl_domain *d; 637 - 638 - lockdep_assert_cpus_held(); 639 - 640 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 641 - /* Find the domain that contains this CPU */ 642 - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 643 - return d; 644 - } 645 - 646 - return NULL; 647 - } 648 - 649 - /* 650 - * Feedback loop for MBA software controller (mba_sc) 651 - * 652 - * mba_sc is a feedback loop where we periodically read MBM counters and 653 - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so 654 - * that: 655 - * 656 - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) 657 - * 658 - * This uses the MBM counters to measure the bandwidth and MBA throttle 659 - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the 660 - * fact that resctrl rdtgroups have both monitoring and control. 661 - * 662 - * The frequency of the checks is 1s and we just tag along the MBM overflow 663 - * timer. Having 1s interval makes the calculation of bandwidth simpler. 664 - * 665 - * Although MBA's goal is to restrict the bandwidth to a maximum, there may 666 - * be a need to increase the bandwidth to avoid unnecessarily restricting 667 - * the L2 <-> L3 traffic. 668 - * 669 - * Since MBA controls the L2 external bandwidth where as MBM measures the 670 - * L3 external bandwidth the following sequence could lead to such a 671 - * situation. 672 - * 673 - * Consider an rdtgroup which had high L3 <-> memory traffic in initial 674 - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but 675 - * after some time rdtgroup has mostly L2 <-> L3 traffic. 676 - * 677 - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its 678 - * throttle MSRs already have low percentage values. 
To avoid 679 - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 680 - */ 681 - static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) 682 - { 683 - u32 closid, rmid, cur_msr_val, new_msr_val; 684 - struct mbm_state *pmbm_data, *cmbm_data; 685 - struct rdt_ctrl_domain *dom_mba; 686 - enum resctrl_event_id evt_id; 687 - struct rdt_resource *r_mba; 688 - struct list_head *head; 689 - struct rdtgroup *entry; 690 - u32 cur_bw, user_bw; 691 - 692 - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 693 - evt_id = rgrp->mba_mbps_event; 694 - 695 - closid = rgrp->closid; 696 - rmid = rgrp->mon.rmid; 697 - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); 698 - if (WARN_ON_ONCE(!pmbm_data)) 699 - return; 700 - 701 - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); 702 - if (!dom_mba) { 703 - pr_warn_once("Failure to get domain for MBA update\n"); 704 - return; 705 - } 706 - 707 - cur_bw = pmbm_data->prev_bw; 708 - user_bw = dom_mba->mbps_val[closid]; 709 - 710 - /* MBA resource doesn't support CDP */ 711 - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); 712 - 713 - /* 714 - * For Ctrl groups read data from child monitor groups. 715 - */ 716 - head = &rgrp->mon.crdtgrp_list; 717 - list_for_each_entry(entry, head, mon.crdtgrp_list) { 718 - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); 719 - if (WARN_ON_ONCE(!cmbm_data)) 720 - return; 721 - cur_bw += cmbm_data->prev_bw; 722 - } 723 - 724 - /* 725 - * Scale up/down the bandwidth linearly for the ctrl group. The 726 - * bandwidth step is the bandwidth granularity specified by the 727 - * hardware. 728 - * Always increase throttling if current bandwidth is above the 729 - * target set by user. 730 - * But avoid thrashing up and down on every poll by checking 731 - * whether a decrease in throttling is likely to push the group 732 - * back over target. E.g. 
if currently throttling to 30% of bandwidth 733 - * on a system with 10% granularity steps, check whether moving to 734 - * 40% would go past the limit by multiplying current bandwidth by 735 - * "(30 + 10) / 30". 736 - */ 737 - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { 738 - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; 739 - } else if (cur_msr_val < MAX_MBA_BW && 740 - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { 741 - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; 742 - } else { 743 - return; 744 - } 745 - 746 - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); 747 - } 748 - 749 - static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, 750 - u32 closid, u32 rmid, enum resctrl_event_id evtid) 751 - { 752 - struct rmid_read rr = {0}; 753 - 754 - rr.r = r; 755 - rr.d = d; 756 - rr.evtid = evtid; 757 - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 758 - if (IS_ERR(rr.arch_mon_ctx)) { 759 - pr_warn_ratelimited("Failed to allocate monitor context: %ld", 760 - PTR_ERR(rr.arch_mon_ctx)); 761 - return; 762 - } 763 - 764 - __mon_event_count(closid, rmid, &rr); 765 - 766 - /* 767 - * If the software controller is enabled, compute the 768 - * bandwidth for this event id. 769 - */ 770 - if (is_mba_sc(NULL)) 771 - mbm_bw_count(closid, rmid, &rr); 772 - 773 - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 774 - } 775 - 776 - static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, 777 - u32 closid, u32 rmid) 778 - { 779 - /* 780 - * This is protected from concurrent reads from user as both 781 - * the user and overflow handler hold the global mutex. 
782 - */ 783 - if (resctrl_arch_is_mbm_total_enabled()) 784 - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); 785 - 786 - if (resctrl_arch_is_mbm_local_enabled()) 787 - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); 788 - } 789 - 790 - /* 791 - * Handler to scan the limbo list and move the RMIDs 792 - * to free list whose occupancy < threshold_occupancy. 793 - */ 794 - void cqm_handle_limbo(struct work_struct *work) 795 - { 796 - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); 797 - struct rdt_mon_domain *d; 798 - 799 - cpus_read_lock(); 800 - mutex_lock(&rdtgroup_mutex); 801 - 802 - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); 803 - 804 - __check_limbo(d, false); 805 - 806 - if (has_busy_rmid(d)) { 807 - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 808 - RESCTRL_PICK_ANY_CPU); 809 - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, 810 - delay); 811 - } 812 - 813 - mutex_unlock(&rdtgroup_mutex); 814 - cpus_read_unlock(); 815 - } 816 - 817 - /** 818 - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this 819 - * domain. 820 - * @dom: The domain the limbo handler should run for. 821 - * @delay_ms: How far in the future the handler should run. 822 - * @exclude_cpu: Which CPU the handler should not run on, 823 - * RESCTRL_PICK_ANY_CPU to pick any CPU. 
824 - */ 825 - void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 826 - int exclude_cpu) 827 - { 828 - unsigned long delay = msecs_to_jiffies(delay_ms); 829 - int cpu; 830 - 831 - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 832 - dom->cqm_work_cpu = cpu; 833 - 834 - if (cpu < nr_cpu_ids) 835 - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); 836 - } 837 - 838 - void mbm_handle_overflow(struct work_struct *work) 839 - { 840 - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); 841 - struct rdtgroup *prgrp, *crgrp; 842 - struct rdt_mon_domain *d; 843 - struct list_head *head; 844 - struct rdt_resource *r; 845 - 846 - cpus_read_lock(); 847 - mutex_lock(&rdtgroup_mutex); 848 - 849 - /* 850 - * If the filesystem has been unmounted this work no longer needs to 851 - * run. 852 - */ 853 - if (!resctrl_mounted || !resctrl_arch_mon_capable()) 854 - goto out_unlock; 855 - 856 - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 857 - d = container_of(work, struct rdt_mon_domain, mbm_over.work); 858 - 859 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 860 - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); 861 - 862 - head = &prgrp->mon.crdtgrp_list; 863 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) 864 - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); 865 - 866 - if (is_mba_sc(NULL)) 867 - update_mba_bw(prgrp, d); 868 - } 869 - 870 - /* 871 - * Re-check for housekeeping CPUs. This allows the overflow handler to 872 - * move off a nohz_full CPU quickly. 873 - */ 874 - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 875 - RESCTRL_PICK_ANY_CPU); 876 - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); 877 - 878 - out_unlock: 879 - mutex_unlock(&rdtgroup_mutex); 880 - cpus_read_unlock(); 881 - } 882 - 883 - /** 884 - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this 885 - * domain. 
886 - * @dom: The domain the overflow handler should run for. 887 - * @delay_ms: How far in the future the handler should run. 888 - * @exclude_cpu: Which CPU the handler should not run on, 889 - * RESCTRL_PICK_ANY_CPU to pick any CPU. 890 - */ 891 - void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 892 - int exclude_cpu) 893 - { 894 - unsigned long delay = msecs_to_jiffies(delay_ms); 895 - int cpu; 896 - 897 - /* 898 - * When a domain comes online there is no guarantee the filesystem is 899 - * mounted. If not, there is no need to catch counter overflow. 900 - */ 901 - if (!resctrl_mounted || !resctrl_arch_mon_capable()) 902 - return; 903 - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 904 - dom->mbm_work_cpu = cpu; 905 - 906 - if (cpu < nr_cpu_ids) 907 - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); 908 - } 909 - 910 - static int dom_data_init(struct rdt_resource *r) 911 - { 912 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 913 - u32 num_closid = resctrl_arch_get_num_closid(r); 914 - struct rmid_entry *entry = NULL; 915 - int err = 0, i; 916 - u32 idx; 917 - 918 - mutex_lock(&rdtgroup_mutex); 919 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 920 - u32 *tmp; 921 - 922 - /* 923 - * If the architecture hasn't provided a sanitised value here, 924 - * this may result in larger arrays than necessary. Resctrl will 925 - * use a smaller system wide value based on the resources in 926 - * use. 
927 - */ 928 - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); 929 - if (!tmp) { 930 - err = -ENOMEM; 931 - goto out_unlock; 932 - } 933 - 934 - closid_num_dirty_rmid = tmp; 935 - } 936 - 937 - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); 938 - if (!rmid_ptrs) { 939 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 940 - kfree(closid_num_dirty_rmid); 941 - closid_num_dirty_rmid = NULL; 942 - } 943 - err = -ENOMEM; 944 - goto out_unlock; 945 - } 946 - 947 - for (i = 0; i < idx_limit; i++) { 948 - entry = &rmid_ptrs[i]; 949 - INIT_LIST_HEAD(&entry->list); 950 - 951 - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); 952 - list_add_tail(&entry->list, &rmid_free_lru); 953 - } 954 - 955 - /* 956 - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and 957 - * are always allocated. These are used for the rdtgroup_default 958 - * control group, which will be setup later in resctrl_init(). 959 - */ 960 - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 961 - RESCTRL_RESERVED_RMID); 962 - entry = __rmid_entry(idx); 963 - list_del(&entry->list); 964 - 965 - out_unlock: 966 - mutex_unlock(&rdtgroup_mutex); 967 - 968 - return err; 969 - } 970 - 971 - static void dom_data_exit(struct rdt_resource *r) 972 - { 973 - mutex_lock(&rdtgroup_mutex); 974 - 975 - if (!r->mon_capable) 976 - goto out_unlock; 977 - 978 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 979 - kfree(closid_num_dirty_rmid); 980 - closid_num_dirty_rmid = NULL; 981 - } 982 - 983 - kfree(rmid_ptrs); 984 - rmid_ptrs = NULL; 985 - 986 - out_unlock: 987 - mutex_unlock(&rdtgroup_mutex); 988 - } 989 - 990 - static struct mon_evt llc_occupancy_event = { 991 - .name = "llc_occupancy", 992 - .evtid = QOS_L3_OCCUP_EVENT_ID, 993 - }; 994 - 995 - static struct mon_evt mbm_total_event = { 996 - .name = "mbm_total_bytes", 997 - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, 998 - }; 999 - 1000 - static struct mon_evt mbm_local_event = { 1001 - .name = 
"mbm_local_bytes", 1002 - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, 1003 - }; 1004 - 1005 - /* 1006 - * Initialize the event list for the resource. 1007 - * 1008 - * Note that MBM events are also part of RDT_RESOURCE_L3 resource 1009 - * because as per the SDM the total and local memory bandwidth 1010 - * are enumerated as part of L3 monitoring. 1011 - */ 1012 - static void l3_mon_evt_init(struct rdt_resource *r) 1013 - { 1014 - INIT_LIST_HEAD(&r->evt_list); 1015 - 1016 - if (resctrl_arch_is_llc_occupancy_enabled()) 1017 - list_add_tail(&llc_occupancy_event.list, &r->evt_list); 1018 - if (resctrl_arch_is_mbm_total_enabled()) 1019 - list_add_tail(&mbm_total_event.list, &r->evt_list); 1020 - if (resctrl_arch_is_mbm_local_enabled()) 1021 - list_add_tail(&mbm_local_event.list, &r->evt_list); 1022 348 } 1023 349 1024 350 /* ··· 341 1193 return ret; 342 1194 } 343 1195 344 - /** 345 - * resctrl_mon_resource_init() - Initialise global monitoring structures. 346 - * 347 - * Allocate and initialise global monitor resources that do not belong to a 348 - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 349 - * Called once during boot after the struct rdt_resource's have been configured 350 - * but before the filesystem is mounted. 351 - * Resctrl's cpuhp callbacks may be called before this point to bring a domain 352 - * online. 353 - * 354 - * Returns 0 for success, or -ENOMEM. 
355 - */ 356 - int __init resctrl_mon_resource_init(void) 357 - { 358 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 359 - int ret; 360 - 361 - if (!r->mon_capable) 362 - return 0; 363 - 364 - ret = dom_data_init(r); 365 - if (ret) 366 - return ret; 367 - 368 - l3_mon_evt_init(r); 369 - 370 - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { 371 - mbm_total_event.configurable = true; 372 - resctrl_file_fflags_init("mbm_total_bytes_config", 373 - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 374 - } 375 - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { 376 - mbm_local_event.configurable = true; 377 - resctrl_file_fflags_init("mbm_local_bytes_config", 378 - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 379 - } 380 - 381 - if (resctrl_arch_is_mbm_local_enabled()) 382 - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; 383 - else if (resctrl_arch_is_mbm_total_enabled()) 384 - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; 385 - 386 - return 0; 387 - } 388 - 389 1196 int __init rdt_get_mon_l3_config(struct rdt_resource *r) 390 1197 { 391 1198 unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; ··· 386 1283 r->mon_capable = true; 387 1284 388 1285 return 0; 389 - } 390 - 391 - void resctrl_mon_resource_exit(void) 392 - { 393 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 394 - 395 - dom_data_exit(r); 396 1286 } 397 1287 398 1288 void __init intel_rdt_mbm_apply_quirk(void)
+4 -1088
arch/x86/kernel/cpu/resctrl/pseudo_lock.c
··· 11 11 12 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 13 14 + #include <linux/cacheflush.h> 14 15 #include <linux/cpu.h> 15 - #include <linux/cpumask.h> 16 - #include <linux/debugfs.h> 17 - #include <linux/kthread.h> 18 - #include <linux/mman.h> 19 16 #include <linux/perf_event.h> 20 17 #include <linux/pm_qos.h> 21 - #include <linux/slab.h> 22 - #include <linux/uaccess.h> 18 + #include <linux/resctrl.h> 23 19 24 - #include <asm/cacheflush.h> 25 20 #include <asm/cpu_device_id.h> 26 - #include <asm/resctrl.h> 27 21 #include <asm/perf_event.h> 28 22 #include <asm/msr.h> 29 23 ··· 25 31 #include "internal.h" 26 32 27 33 #define CREATE_TRACE_POINTS 28 - #include "trace.h" 34 + 35 + #include "pseudo_lock_trace.h" 29 36 30 37 /* 31 38 * The bits needed to disable hardware prefetching varies based on the 32 39 * platform. During initialization we will discover which bits to use. 33 40 */ 34 41 static u64 prefetch_disable_bits; 35 - 36 - /* 37 - * Major number assigned to and shared by all devices exposing 38 - * pseudo-locked regions. 
39 - */ 40 - static unsigned int pseudo_lock_major; 41 - static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); 42 - 43 - static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) 44 - { 45 - const struct rdtgroup *rdtgrp; 46 - 47 - rdtgrp = dev_get_drvdata(dev); 48 - if (mode) 49 - *mode = 0600; 50 - guard(mutex)(&rdtgroup_mutex); 51 - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); 52 - } 53 - 54 - static const struct class pseudo_lock_class = { 55 - .name = "pseudo_lock", 56 - .devnode = pseudo_lock_devnode, 57 - }; 58 42 59 43 /** 60 44 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported ··· 92 120 } 93 121 94 122 return prefetch_disable_bits; 95 - } 96 - 97 - /** 98 - * pseudo_lock_minor_get - Obtain available minor number 99 - * @minor: Pointer to where new minor number will be stored 100 - * 101 - * A bitmask is used to track available minor numbers. Here the next free 102 - * minor number is marked as unavailable and returned. 103 - * 104 - * Return: 0 on success, <0 on failure. 
105 - */ 106 - static int pseudo_lock_minor_get(unsigned int *minor) 107 - { 108 - unsigned long first_bit; 109 - 110 - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); 111 - 112 - if (first_bit == MINORBITS) 113 - return -ENOSPC; 114 - 115 - __clear_bit(first_bit, &pseudo_lock_minor_avail); 116 - *minor = first_bit; 117 - 118 - return 0; 119 - } 120 - 121 - /** 122 - * pseudo_lock_minor_release - Return minor number to available 123 - * @minor: The minor number made available 124 - */ 125 - static void pseudo_lock_minor_release(unsigned int minor) 126 - { 127 - __set_bit(minor, &pseudo_lock_minor_avail); 128 - } 129 - 130 - /** 131 - * region_find_by_minor - Locate a pseudo-lock region by inode minor number 132 - * @minor: The minor number of the device representing pseudo-locked region 133 - * 134 - * When the character device is accessed we need to determine which 135 - * pseudo-locked region it belongs to. This is done by matching the minor 136 - * number of the device to the pseudo-locked region it belongs. 137 - * 138 - * Minor numbers are assigned at the time a pseudo-locked region is associated 139 - * with a cache instance. 140 - * 141 - * Return: On success return pointer to resource group owning the pseudo-locked 142 - * region, NULL on failure. 
143 - */ 144 - static struct rdtgroup *region_find_by_minor(unsigned int minor) 145 - { 146 - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; 147 - 148 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 149 - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { 150 - rdtgrp_match = rdtgrp; 151 - break; 152 - } 153 - } 154 - return rdtgrp_match; 155 - } 156 - 157 - /** 158 - * struct pseudo_lock_pm_req - A power management QoS request list entry 159 - * @list: Entry within the @pm_reqs list for a pseudo-locked region 160 - * @req: PM QoS request 161 - */ 162 - struct pseudo_lock_pm_req { 163 - struct list_head list; 164 - struct dev_pm_qos_request req; 165 - }; 166 - 167 - static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) 168 - { 169 - struct pseudo_lock_pm_req *pm_req, *next; 170 - 171 - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { 172 - dev_pm_qos_remove_request(&pm_req->req); 173 - list_del(&pm_req->list); 174 - kfree(pm_req); 175 - } 176 - } 177 - 178 - /** 179 - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 180 - * @plr: Pseudo-locked region 181 - * 182 - * To prevent the cache from being affected by power management entering 183 - * C6 has to be avoided. This is accomplished by requesting a latency 184 - * requirement lower than lowest C6 exit latency of all supported 185 - * platforms as found in the cpuidle state tables in the intel_idle driver. 186 - * At this time it is possible to do so with a single latency requirement 187 - * for all supported platforms. 188 - * 189 - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, 190 - * the ACPI latencies need to be considered while keeping in mind that C2 191 - * may be set to map to deeper sleep states. In this case the latency 192 - * requirement needs to prevent entering C2 also. 
193 - * 194 - * Return: 0 on success, <0 on failure 195 - */ 196 - static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) 197 - { 198 - struct pseudo_lock_pm_req *pm_req; 199 - int cpu; 200 - int ret; 201 - 202 - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { 203 - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); 204 - if (!pm_req) { 205 - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); 206 - ret = -ENOMEM; 207 - goto out_err; 208 - } 209 - ret = dev_pm_qos_add_request(get_cpu_device(cpu), 210 - &pm_req->req, 211 - DEV_PM_QOS_RESUME_LATENCY, 212 - 30); 213 - if (ret < 0) { 214 - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", 215 - cpu); 216 - kfree(pm_req); 217 - ret = -1; 218 - goto out_err; 219 - } 220 - list_add(&pm_req->list, &plr->pm_reqs); 221 - } 222 - 223 - return 0; 224 - 225 - out_err: 226 - pseudo_lock_cstates_relax(plr); 227 - return ret; 228 - } 229 - 230 - /** 231 - * pseudo_lock_region_clear - Reset pseudo-lock region data 232 - * @plr: pseudo-lock region 233 - * 234 - * All content of the pseudo-locked region is reset - any memory allocated 235 - * freed. 236 - * 237 - * Return: void 238 - */ 239 - static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) 240 - { 241 - plr->size = 0; 242 - plr->line_size = 0; 243 - kfree(plr->kmem); 244 - plr->kmem = NULL; 245 - plr->s = NULL; 246 - if (plr->d) 247 - plr->d->plr = NULL; 248 - plr->d = NULL; 249 - plr->cbm = 0; 250 - plr->debugfs_dir = NULL; 251 - } 252 - 253 - /** 254 - * pseudo_lock_region_init - Initialize pseudo-lock region information 255 - * @plr: pseudo-lock region 256 - * 257 - * Called after user provided a schemata to be pseudo-locked. From the 258 - * schemata the &struct pseudo_lock_region is on entry already initialized 259 - * with the resource, domain, and capacity bitmask. Here the information 260 - * required for pseudo-locking is deduced from this data and &struct 261 - * pseudo_lock_region initialized further. 
This information includes: 262 - * - size in bytes of the region to be pseudo-locked 263 - * - cache line size to know the stride with which data needs to be accessed 264 - * to be pseudo-locked 265 - * - a cpu associated with the cache instance on which the pseudo-locking 266 - * flow can be executed 267 - * 268 - * Return: 0 on success, <0 on failure. Descriptive error will be written 269 - * to last_cmd_status buffer. 270 - */ 271 - static int pseudo_lock_region_init(struct pseudo_lock_region *plr) 272 - { 273 - enum resctrl_scope scope = plr->s->res->ctrl_scope; 274 - struct cacheinfo *ci; 275 - int ret; 276 - 277 - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) 278 - return -ENODEV; 279 - 280 - /* Pick the first cpu we find that is associated with the cache. */ 281 - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); 282 - 283 - if (!cpu_online(plr->cpu)) { 284 - rdt_last_cmd_printf("CPU %u associated with cache not online\n", 285 - plr->cpu); 286 - ret = -ENODEV; 287 - goto out_region; 288 - } 289 - 290 - ci = get_cpu_cacheinfo_level(plr->cpu, scope); 291 - if (ci) { 292 - plr->line_size = ci->coherency_line_size; 293 - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); 294 - return 0; 295 - } 296 - 297 - ret = -1; 298 - rdt_last_cmd_puts("Unable to determine cache line size\n"); 299 - out_region: 300 - pseudo_lock_region_clear(plr); 301 - return ret; 302 - } 303 - 304 - /** 305 - * pseudo_lock_init - Initialize a pseudo-lock region 306 - * @rdtgrp: resource group to which new pseudo-locked region will belong 307 - * 308 - * A pseudo-locked region is associated with a resource group. When this 309 - * association is created the pseudo-locked region is initialized. The 310 - * details of the pseudo-locked region are not known at this time so only 311 - * allocation is done and association established. 
312 - * 313 - * Return: 0 on success, <0 on failure 314 - */ 315 - static int pseudo_lock_init(struct rdtgroup *rdtgrp) 316 - { 317 - struct pseudo_lock_region *plr; 318 - 319 - plr = kzalloc(sizeof(*plr), GFP_KERNEL); 320 - if (!plr) 321 - return -ENOMEM; 322 - 323 - init_waitqueue_head(&plr->lock_thread_wq); 324 - INIT_LIST_HEAD(&plr->pm_reqs); 325 - rdtgrp->plr = plr; 326 - return 0; 327 - } 328 - 329 - /** 330 - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked 331 - * @plr: pseudo-lock region 332 - * 333 - * Initialize the details required to set up the pseudo-locked region and 334 - * allocate the contiguous memory that will be pseudo-locked to the cache. 335 - * 336 - * Return: 0 on success, <0 on failure. Descriptive error will be written 337 - * to last_cmd_status buffer. 338 - */ 339 - static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) 340 - { 341 - int ret; 342 - 343 - ret = pseudo_lock_region_init(plr); 344 - if (ret < 0) 345 - return ret; 346 - 347 - /* 348 - * We do not yet support contiguous regions larger than 349 - * KMALLOC_MAX_SIZE. 350 - */ 351 - if (plr->size > KMALLOC_MAX_SIZE) { 352 - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); 353 - ret = -E2BIG; 354 - goto out_region; 355 - } 356 - 357 - plr->kmem = kzalloc(plr->size, GFP_KERNEL); 358 - if (!plr->kmem) { 359 - rdt_last_cmd_puts("Unable to allocate memory\n"); 360 - ret = -ENOMEM; 361 - goto out_region; 362 - } 363 - 364 - ret = 0; 365 - goto out; 366 - out_region: 367 - pseudo_lock_region_clear(plr); 368 - out: 369 - return ret; 370 - } 371 - 372 - /** 373 - * pseudo_lock_free - Free a pseudo-locked region 374 - * @rdtgrp: resource group to which pseudo-locked region belonged 375 - * 376 - * The pseudo-locked region's resources have already been released, or not 377 - * yet created at this point. Now it can be freed and disassociated from the 378 - * resource group. 
379 - * 380 - * Return: void 381 - */ 382 - static void pseudo_lock_free(struct rdtgroup *rdtgrp) 383 - { 384 - pseudo_lock_region_clear(rdtgrp->plr); 385 - kfree(rdtgrp->plr); 386 - rdtgrp->plr = NULL; 387 123 } 388 124 389 125 /** ··· 221 541 plr->thread_done = 1; 222 542 wake_up_interruptible(&plr->lock_thread_wq); 223 543 return 0; 224 - } 225 - 226 - /** 227 - * rdtgroup_monitor_in_progress - Test if monitoring in progress 228 - * @rdtgrp: resource group being queried 229 - * 230 - * Return: 1 if monitor groups have been created for this resource 231 - * group, 0 otherwise. 232 - */ 233 - static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) 234 - { 235 - return !list_empty(&rdtgrp->mon.crdtgrp_list); 236 - } 237 - 238 - /** 239 - * rdtgroup_locksetup_user_restrict - Restrict user access to group 240 - * @rdtgrp: resource group needing access restricted 241 - * 242 - * A resource group used for cache pseudo-locking cannot have cpus or tasks 243 - * assigned to it. This is communicated to the user by restricting access 244 - * to all the files that can be used to make such changes. 245 - * 246 - * Permissions restored with rdtgroup_locksetup_user_restore() 247 - * 248 - * Return: 0 on success, <0 on failure. If a failure occurs during the 249 - * restriction of access an attempt will be made to restore permissions but 250 - * the state of the mode of these files will be uncertain when a failure 251 - * occurs. 
252 - */ 253 - static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) 254 - { 255 - int ret; 256 - 257 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 258 - if (ret) 259 - return ret; 260 - 261 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 262 - if (ret) 263 - goto err_tasks; 264 - 265 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 266 - if (ret) 267 - goto err_cpus; 268 - 269 - if (resctrl_arch_mon_capable()) { 270 - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); 271 - if (ret) 272 - goto err_cpus_list; 273 - } 274 - 275 - ret = 0; 276 - goto out; 277 - 278 - err_cpus_list: 279 - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 280 - err_cpus: 281 - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 282 - err_tasks: 283 - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 284 - out: 285 - return ret; 286 - } 287 - 288 - /** 289 - * rdtgroup_locksetup_user_restore - Restore user access to group 290 - * @rdtgrp: resource group needing access restored 291 - * 292 - * Restore all file access previously removed using 293 - * rdtgroup_locksetup_user_restrict() 294 - * 295 - * Return: 0 on success, <0 on failure. If a failure occurs during the 296 - * restoration of access an attempt will be made to restrict permissions 297 - * again but the state of the mode of these files will be uncertain when 298 - * a failure occurs. 
299 - */ 300 - static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) 301 - { 302 - int ret; 303 - 304 - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 305 - if (ret) 306 - return ret; 307 - 308 - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 309 - if (ret) 310 - goto err_tasks; 311 - 312 - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 313 - if (ret) 314 - goto err_cpus; 315 - 316 - if (resctrl_arch_mon_capable()) { 317 - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); 318 - if (ret) 319 - goto err_cpus_list; 320 - } 321 - 322 - ret = 0; 323 - goto out; 324 - 325 - err_cpus_list: 326 - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 327 - err_cpus: 328 - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 329 - err_tasks: 330 - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 331 - out: 332 - return ret; 333 - } 334 - 335 - /** 336 - * rdtgroup_locksetup_enter - Resource group enters locksetup mode 337 - * @rdtgrp: resource group requested to enter locksetup mode 338 - * 339 - * A resource group enters locksetup mode to reflect that it would be used 340 - * to represent a pseudo-locked region and is in the process of being set 341 - * up to do so. A resource group used for a pseudo-locked region would 342 - * lose the closid associated with it so we cannot allow it to have any 343 - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the 344 - * future. Monitoring of a pseudo-locked region is not allowed either. 345 - * 346 - * The above and more restrictions on a pseudo-locked region are checked 347 - * for and enforced before the resource group enters the locksetup mode. 348 - * 349 - * Returns: 0 if the resource group successfully entered locksetup mode, <0 350 - * on failure. On failure the last_cmd_status buffer is updated with text to 351 - * communicate details of failure to the user. 
352 - */ 353 - int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 354 - { 355 - int ret; 356 - 357 - /* 358 - * The default resource group can neither be removed nor lose the 359 - * default closid associated with it. 360 - */ 361 - if (rdtgrp == &rdtgroup_default) { 362 - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); 363 - return -EINVAL; 364 - } 365 - 366 - /* 367 - * Cache Pseudo-locking not supported when CDP is enabled. 368 - * 369 - * Some things to consider if you would like to enable this 370 - * support (using L3 CDP as example): 371 - * - When CDP is enabled two separate resources are exposed, 372 - * L3DATA and L3CODE, but they are actually on the same cache. 373 - * The implication for pseudo-locking is that if a 374 - * pseudo-locked region is created on a domain of one 375 - * resource (eg. L3CODE), then a pseudo-locked region cannot 376 - * be created on that same domain of the other resource 377 - * (eg. L3DATA). This is because the creation of a 378 - * pseudo-locked region involves a call to wbinvd that will 379 - * affect all cache allocations on particular domain. 380 - * - Considering the previous, it may be possible to only 381 - * expose one of the CDP resources to pseudo-locking and 382 - * hide the other. For example, we could consider to only 383 - * expose L3DATA and since the L3 cache is unified it is 384 - * still possible to place instructions there are execute it. 385 - * - If only one region is exposed to pseudo-locking we should 386 - * still keep in mind that availability of a portion of cache 387 - * for pseudo-locking should take into account both resources. 388 - * Similarly, if a pseudo-locked region is created in one 389 - * resource, the portion of cache used by it should be made 390 - * unavailable to all future allocations from both resources. 
391 - */ 392 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || 393 - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { 394 - rdt_last_cmd_puts("CDP enabled\n"); 395 - return -EINVAL; 396 - } 397 - 398 - /* 399 - * Not knowing the bits to disable prefetching implies that this 400 - * platform does not support Cache Pseudo-Locking. 401 - */ 402 - if (resctrl_arch_get_prefetch_disable_bits() == 0) { 403 - rdt_last_cmd_puts("Pseudo-locking not supported\n"); 404 - return -EINVAL; 405 - } 406 - 407 - if (rdtgroup_monitor_in_progress(rdtgrp)) { 408 - rdt_last_cmd_puts("Monitoring in progress\n"); 409 - return -EINVAL; 410 - } 411 - 412 - if (rdtgroup_tasks_assigned(rdtgrp)) { 413 - rdt_last_cmd_puts("Tasks assigned to resource group\n"); 414 - return -EINVAL; 415 - } 416 - 417 - if (!cpumask_empty(&rdtgrp->cpu_mask)) { 418 - rdt_last_cmd_puts("CPUs assigned to resource group\n"); 419 - return -EINVAL; 420 - } 421 - 422 - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { 423 - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); 424 - return -EIO; 425 - } 426 - 427 - ret = pseudo_lock_init(rdtgrp); 428 - if (ret) { 429 - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); 430 - goto out_release; 431 - } 432 - 433 - /* 434 - * If this system is capable of monitoring a rmid would have been 435 - * allocated when the control group was created. This is not needed 436 - * anymore when this group would be used for pseudo-locking. This 437 - * is safe to call on platforms not capable of monitoring. 438 - */ 439 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 440 - 441 - ret = 0; 442 - goto out; 443 - 444 - out_release: 445 - rdtgroup_locksetup_user_restore(rdtgrp); 446 - out: 447 - return ret; 448 - } 449 - 450 - /** 451 - * rdtgroup_locksetup_exit - resource group exist locksetup mode 452 - * @rdtgrp: resource group 453 - * 454 - * When a resource group exits locksetup mode the earlier restrictions are 455 - * lifted. 
456 - * 457 - * Return: 0 on success, <0 on failure 458 - */ 459 - int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 460 - { 461 - int ret; 462 - 463 - if (resctrl_arch_mon_capable()) { 464 - ret = alloc_rmid(rdtgrp->closid); 465 - if (ret < 0) { 466 - rdt_last_cmd_puts("Out of RMIDs\n"); 467 - return ret; 468 - } 469 - rdtgrp->mon.rmid = ret; 470 - } 471 - 472 - ret = rdtgroup_locksetup_user_restore(rdtgrp); 473 - if (ret) { 474 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 475 - return ret; 476 - } 477 - 478 - pseudo_lock_free(rdtgrp); 479 - return 0; 480 - } 481 - 482 - /** 483 - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked 484 - * @d: RDT domain 485 - * @cbm: CBM to test 486 - * 487 - * @d represents a cache instance and @cbm a capacity bitmask that is 488 - * considered for it. Determine if @cbm overlaps with any existing 489 - * pseudo-locked region on @d. 490 - * 491 - * @cbm is unsigned long, even if only 32 bits are used, to make the 492 - * bitmap functions work correctly. 493 - * 494 - * Return: true if @cbm overlaps with pseudo-locked region on @d, false 495 - * otherwise. 496 - */ 497 - bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 498 - { 499 - unsigned int cbm_len; 500 - unsigned long cbm_b; 501 - 502 - if (d->plr) { 503 - cbm_len = d->plr->s->res->cache.cbm_len; 504 - cbm_b = d->plr->cbm; 505 - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) 506 - return true; 507 - } 508 - return false; 509 - } 510 - 511 - /** 512 - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy 513 - * @d: RDT domain under test 514 - * 515 - * The setup of a pseudo-locked region affects all cache instances within 516 - * the hierarchy of the region. It is thus essential to know if any 517 - * pseudo-locked regions exist within a cache hierarchy to prevent any 518 - * attempts to create new pseudo-locked regions in the same hierarchy. 
519 - * 520 - * Return: true if a pseudo-locked region exists in the hierarchy of @d or 521 - * if it is not possible to test due to memory allocation issue, 522 - * false otherwise. 523 - */ 524 - bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 525 - { 526 - struct rdt_ctrl_domain *d_i; 527 - cpumask_var_t cpu_with_psl; 528 - struct rdt_resource *r; 529 - bool ret = false; 530 - 531 - /* Walking r->domains, ensure it can't race with cpuhp */ 532 - lockdep_assert_cpus_held(); 533 - 534 - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) 535 - return true; 536 - 537 - /* 538 - * First determine which cpus have pseudo-locked regions 539 - * associated with them. 540 - */ 541 - for_each_alloc_capable_rdt_resource(r) { 542 - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { 543 - if (d_i->plr) 544 - cpumask_or(cpu_with_psl, cpu_with_psl, 545 - &d_i->hdr.cpu_mask); 546 - } 547 - } 548 - 549 - /* 550 - * Next test if new pseudo-locked region would intersect with 551 - * existing region. 552 - */ 553 - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) 554 - ret = true; 555 - 556 - free_cpumask_var(cpu_with_psl); 557 - return ret; 558 544 } 559 545 560 546 /** ··· 514 1168 plr->thread_done = 1; 515 1169 wake_up_interruptible(&plr->lock_thread_wq); 516 1170 return 0; 517 - } 518 - 519 - /** 520 - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region 521 - * @rdtgrp: Resource group to which the pseudo-locked region belongs. 522 - * @sel: Selector of which measurement to perform on a pseudo-locked region. 523 - * 524 - * The measurement of latency to access a pseudo-locked region should be 525 - * done from a cpu that is associated with that pseudo-locked region. 526 - * Determine which cpu is associated with this region and start a thread on 527 - * that cpu to perform the measurement, wait for that thread to complete. 
528 - * 529 - * Return: 0 on success, <0 on failure 530 - */ 531 - static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) 532 - { 533 - struct pseudo_lock_region *plr = rdtgrp->plr; 534 - struct task_struct *thread; 535 - unsigned int cpu; 536 - int ret = -1; 537 - 538 - cpus_read_lock(); 539 - mutex_lock(&rdtgroup_mutex); 540 - 541 - if (rdtgrp->flags & RDT_DELETED) { 542 - ret = -ENODEV; 543 - goto out; 544 - } 545 - 546 - if (!plr->d) { 547 - ret = -ENODEV; 548 - goto out; 549 - } 550 - 551 - plr->thread_done = 0; 552 - cpu = cpumask_first(&plr->d->hdr.cpu_mask); 553 - if (!cpu_online(cpu)) { 554 - ret = -ENODEV; 555 - goto out; 556 - } 557 - 558 - plr->cpu = cpu; 559 - 560 - if (sel == 1) 561 - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, 562 - plr, cpu, "pseudo_lock_measure/%u"); 563 - else if (sel == 2) 564 - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, 565 - plr, cpu, "pseudo_lock_measure/%u"); 566 - else if (sel == 3) 567 - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, 568 - plr, cpu, "pseudo_lock_measure/%u"); 569 - else 570 - goto out; 571 - 572 - if (IS_ERR(thread)) { 573 - ret = PTR_ERR(thread); 574 - goto out; 575 - } 576 - 577 - ret = wait_event_interruptible(plr->lock_thread_wq, 578 - plr->thread_done == 1); 579 - if (ret < 0) 580 - goto out; 581 - 582 - ret = 0; 583 - 584 - out: 585 - mutex_unlock(&rdtgroup_mutex); 586 - cpus_read_unlock(); 587 - return ret; 588 - } 589 - 590 - static ssize_t pseudo_lock_measure_trigger(struct file *file, 591 - const char __user *user_buf, 592 - size_t count, loff_t *ppos) 593 - { 594 - struct rdtgroup *rdtgrp = file->private_data; 595 - size_t buf_size; 596 - char buf[32]; 597 - int ret; 598 - int sel; 599 - 600 - buf_size = min(count, (sizeof(buf) - 1)); 601 - if (copy_from_user(buf, user_buf, buf_size)) 602 - return -EFAULT; 603 - 604 - buf[buf_size] = '\0'; 605 - ret = kstrtoint(buf, 10, &sel); 606 - if (ret == 0) { 607 - if (sel != 1 && sel 
!= 2 && sel != 3) 608 - return -EINVAL; 609 - ret = debugfs_file_get(file->f_path.dentry); 610 - if (ret) 611 - return ret; 612 - ret = pseudo_lock_measure_cycles(rdtgrp, sel); 613 - if (ret == 0) 614 - ret = count; 615 - debugfs_file_put(file->f_path.dentry); 616 - } 617 - 618 - return ret; 619 - } 620 - 621 - static const struct file_operations pseudo_measure_fops = { 622 - .write = pseudo_lock_measure_trigger, 623 - .open = simple_open, 624 - .llseek = default_llseek, 625 - }; 626 - 627 - /** 628 - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region 629 - * @rdtgrp: resource group to which pseudo-lock region belongs 630 - * 631 - * Called when a resource group in the pseudo-locksetup mode receives a 632 - * valid schemata that should be pseudo-locked. Since the resource group is 633 - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been 634 - * allocated and initialized with the essential information. If a failure 635 - * occurs the resource group remains in the pseudo-locksetup mode with the 636 - * &struct pseudo_lock_region associated with it, but cleared from all 637 - * information and ready for the user to re-attempt pseudo-locking by 638 - * writing the schemata again. 639 - * 640 - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 641 - * on failure. Descriptive error will be written to last_cmd_status buffer. 
642 - */ 643 - int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 644 - { 645 - struct pseudo_lock_region *plr = rdtgrp->plr; 646 - struct task_struct *thread; 647 - unsigned int new_minor; 648 - struct device *dev; 649 - char *kn_name __free(kfree) = NULL; 650 - int ret; 651 - 652 - ret = pseudo_lock_region_alloc(plr); 653 - if (ret < 0) 654 - return ret; 655 - 656 - ret = pseudo_lock_cstates_constrain(plr); 657 - if (ret < 0) { 658 - ret = -EINVAL; 659 - goto out_region; 660 - } 661 - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); 662 - if (!kn_name) { 663 - ret = -ENOMEM; 664 - goto out_cstates; 665 - } 666 - 667 - plr->thread_done = 0; 668 - 669 - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, 670 - plr->cpu, "pseudo_lock/%u"); 671 - if (IS_ERR(thread)) { 672 - ret = PTR_ERR(thread); 673 - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); 674 - goto out_cstates; 675 - } 676 - 677 - ret = wait_event_interruptible(plr->lock_thread_wq, 678 - plr->thread_done == 1); 679 - if (ret < 0) { 680 - /* 681 - * If the thread does not get on the CPU for whatever 682 - * reason and the process which sets up the region is 683 - * interrupted then this will leave the thread in runnable 684 - * state and once it gets on the CPU it will dereference 685 - * the cleared, but not freed, plr struct resulting in an 686 - * empty pseudo-locking loop. 687 - */ 688 - rdt_last_cmd_puts("Locking thread interrupted\n"); 689 - goto out_cstates; 690 - } 691 - 692 - ret = pseudo_lock_minor_get(&new_minor); 693 - if (ret < 0) { 694 - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); 695 - goto out_cstates; 696 - } 697 - 698 - /* 699 - * Unlock access but do not release the reference. The 700 - * pseudo-locked region will still be here on return. 
701 - * 702 - * The mutex has to be released temporarily to avoid a potential 703 - * deadlock with the mm->mmap_lock which is obtained in the 704 - * device_create() and debugfs_create_dir() callpath below as well as 705 - * before the mmap() callback is called. 706 - */ 707 - mutex_unlock(&rdtgroup_mutex); 708 - 709 - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { 710 - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); 711 - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) 712 - debugfs_create_file("pseudo_lock_measure", 0200, 713 - plr->debugfs_dir, rdtgrp, 714 - &pseudo_measure_fops); 715 - } 716 - 717 - dev = device_create(&pseudo_lock_class, NULL, 718 - MKDEV(pseudo_lock_major, new_minor), 719 - rdtgrp, "%s", kn_name); 720 - 721 - mutex_lock(&rdtgroup_mutex); 722 - 723 - if (IS_ERR(dev)) { 724 - ret = PTR_ERR(dev); 725 - rdt_last_cmd_printf("Failed to create character device: %d\n", 726 - ret); 727 - goto out_debugfs; 728 - } 729 - 730 - /* We released the mutex - check if group was removed while we did so */ 731 - if (rdtgrp->flags & RDT_DELETED) { 732 - ret = -ENODEV; 733 - goto out_device; 734 - } 735 - 736 - plr->minor = new_minor; 737 - 738 - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; 739 - closid_free(rdtgrp->closid); 740 - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); 741 - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); 742 - 743 - ret = 0; 744 - goto out; 745 - 746 - out_device: 747 - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); 748 - out_debugfs: 749 - debugfs_remove_recursive(plr->debugfs_dir); 750 - pseudo_lock_minor_release(new_minor); 751 - out_cstates: 752 - pseudo_lock_cstates_relax(plr); 753 - out_region: 754 - pseudo_lock_region_clear(plr); 755 - out: 756 - return ret; 757 - } 758 - 759 - /** 760 - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region 761 - * @rdtgrp: resource group to which the pseudo-locked region belongs 762 - * 763 - * The removal of a pseudo-locked region can be initiated when the 
resource 764 - * group is removed from user space via a "rmdir" from userspace or the 765 - * unmount of the resctrl filesystem. On removal the resource group does 766 - * not go back to pseudo-locksetup mode before it is removed, instead it is 767 - * removed directly. There is thus asymmetry with the creation where the 768 - * &struct pseudo_lock_region is removed here while it was not created in 769 - * rdtgroup_pseudo_lock_create(). 770 - * 771 - * Return: void 772 - */ 773 - void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) 774 - { 775 - struct pseudo_lock_region *plr = rdtgrp->plr; 776 - 777 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 778 - /* 779 - * Default group cannot be a pseudo-locked region so we can 780 - * free closid here. 781 - */ 782 - closid_free(rdtgrp->closid); 783 - goto free; 784 - } 785 - 786 - pseudo_lock_cstates_relax(plr); 787 - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); 788 - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); 789 - pseudo_lock_minor_release(plr->minor); 790 - 791 - free: 792 - pseudo_lock_free(rdtgrp); 793 - } 794 - 795 - static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) 796 - { 797 - struct rdtgroup *rdtgrp; 798 - 799 - mutex_lock(&rdtgroup_mutex); 800 - 801 - rdtgrp = region_find_by_minor(iminor(inode)); 802 - if (!rdtgrp) { 803 - mutex_unlock(&rdtgroup_mutex); 804 - return -ENODEV; 805 - } 806 - 807 - filp->private_data = rdtgrp; 808 - atomic_inc(&rdtgrp->waitcount); 809 - /* Perform a non-seekable open - llseek is not supported */ 810 - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 811 - 812 - mutex_unlock(&rdtgroup_mutex); 813 - 814 - return 0; 815 - } 816 - 817 - static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) 818 - { 819 - struct rdtgroup *rdtgrp; 820 - 821 - mutex_lock(&rdtgroup_mutex); 822 - rdtgrp = filp->private_data; 823 - WARN_ON(!rdtgrp); 824 - if (!rdtgrp) { 825 - mutex_unlock(&rdtgroup_mutex); 
826 - return -ENODEV; 827 - } 828 - filp->private_data = NULL; 829 - atomic_dec(&rdtgrp->waitcount); 830 - mutex_unlock(&rdtgroup_mutex); 831 - return 0; 832 - } 833 - 834 - static int pseudo_lock_dev_mremap(struct vm_area_struct *area) 835 - { 836 - /* Not supported */ 837 - return -EINVAL; 838 - } 839 - 840 - static const struct vm_operations_struct pseudo_mmap_ops = { 841 - .mremap = pseudo_lock_dev_mremap, 842 - }; 843 - 844 - static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) 845 - { 846 - unsigned long vsize = vma->vm_end - vma->vm_start; 847 - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 848 - struct pseudo_lock_region *plr; 849 - struct rdtgroup *rdtgrp; 850 - unsigned long physical; 851 - unsigned long psize; 852 - 853 - mutex_lock(&rdtgroup_mutex); 854 - 855 - rdtgrp = filp->private_data; 856 - WARN_ON(!rdtgrp); 857 - if (!rdtgrp) { 858 - mutex_unlock(&rdtgroup_mutex); 859 - return -ENODEV; 860 - } 861 - 862 - plr = rdtgrp->plr; 863 - 864 - if (!plr->d) { 865 - mutex_unlock(&rdtgroup_mutex); 866 - return -ENODEV; 867 - } 868 - 869 - /* 870 - * Task is required to run with affinity to the cpus associated 871 - * with the pseudo-locked region. If this is not the case the task 872 - * may be scheduled elsewhere and invalidate entries in the 873 - * pseudo-locked region. 874 - */ 875 - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { 876 - mutex_unlock(&rdtgroup_mutex); 877 - return -EINVAL; 878 - } 879 - 880 - physical = __pa(plr->kmem) >> PAGE_SHIFT; 881 - psize = plr->size - off; 882 - 883 - if (off > plr->size) { 884 - mutex_unlock(&rdtgroup_mutex); 885 - return -ENOSPC; 886 - } 887 - 888 - /* 889 - * Ensure changes are carried directly to the memory being mapped, 890 - * do not allow copy-on-write mapping. 
891 - */ 892 - if (!(vma->vm_flags & VM_SHARED)) { 893 - mutex_unlock(&rdtgroup_mutex); 894 - return -EINVAL; 895 - } 896 - 897 - if (vsize > psize) { 898 - mutex_unlock(&rdtgroup_mutex); 899 - return -ENOSPC; 900 - } 901 - 902 - memset(plr->kmem + off, 0, vsize); 903 - 904 - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, 905 - vsize, vma->vm_page_prot)) { 906 - mutex_unlock(&rdtgroup_mutex); 907 - return -EAGAIN; 908 - } 909 - vma->vm_ops = &pseudo_mmap_ops; 910 - mutex_unlock(&rdtgroup_mutex); 911 - return 0; 912 - } 913 - 914 - static const struct file_operations pseudo_lock_dev_fops = { 915 - .owner = THIS_MODULE, 916 - .read = NULL, 917 - .write = NULL, 918 - .open = pseudo_lock_dev_open, 919 - .release = pseudo_lock_dev_release, 920 - .mmap = pseudo_lock_dev_mmap, 921 - }; 922 - 923 - int rdt_pseudo_lock_init(void) 924 - { 925 - int ret; 926 - 927 - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); 928 - if (ret < 0) 929 - return ret; 930 - 931 - pseudo_lock_major = ret; 932 - 933 - ret = class_register(&pseudo_lock_class); 934 - if (ret) { 935 - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 936 - return ret; 937 - } 938 - 939 - return 0; 940 - } 941 - 942 - void rdt_pseudo_lock_release(void) 943 - { 944 - class_unregister(&pseudo_lock_class); 945 - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 946 - pseudo_lock_major = 0; 947 1171 }
+7 -4157
arch/x86/kernel/cpu/resctrl/rdtgroup.c
··· 18 18 #include <linux/fs_parser.h> 19 19 #include <linux/sysfs.h> 20 20 #include <linux/kernfs.h> 21 + #include <linux/resctrl.h> 21 22 #include <linux/seq_buf.h> 22 23 #include <linux/seq_file.h> 23 24 #include <linux/sched/signal.h> ··· 30 29 #include <uapi/linux/magic.h> 31 30 32 31 #include <asm/msr.h> 33 - #include <asm/resctrl.h> 34 32 #include "internal.h" 35 33 36 34 DEFINE_STATIC_KEY_FALSE(rdt_enable_key); 35 + 37 36 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); 37 + 38 38 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); 39 39 40 - /* Mutex to protect rdtgroup access. */ 41 - DEFINE_MUTEX(rdtgroup_mutex); 42 - 43 - static struct kernfs_root *rdt_root; 44 - struct rdtgroup rdtgroup_default; 45 - LIST_HEAD(rdt_all_groups); 46 - 47 - /* list of entries for the schemata file */ 48 - LIST_HEAD(resctrl_schema_all); 49 - 50 - /* The filesystem can only be mounted once. */ 51 - bool resctrl_mounted; 52 - 53 - /* Kernel fs node for "info" directory under root */ 54 - static struct kernfs_node *kn_info; 55 - 56 - /* Kernel fs node for "mon_groups" directory under root */ 57 - static struct kernfs_node *kn_mongrp; 58 - 59 - /* Kernel fs node for "mon_data" directory under root */ 60 - static struct kernfs_node *kn_mondata; 61 - 62 40 /* 63 - * Used to store the max resource name width to display the schemata names in 64 - * a tabular format. 65 - */ 66 - int max_name_width; 67 - 68 - static struct seq_buf last_cmd_status; 69 - static char last_cmd_status_buf[512]; 70 - 71 - static int rdtgroup_setup_root(struct rdt_fs_context *ctx); 72 - static void rdtgroup_destroy_root(void); 73 - 74 - struct dentry *debugfs_resctrl; 75 - 76 - /* 77 - * Memory bandwidth monitoring event to use for the default CTRL_MON group 78 - * and each new CTRL_MON group created by the user. Only relevant when 79 - * the filesystem is mounted with the "mba_MBps" option so it does not 80 - * matter that it remains uninitialized on systems that do not support 81 - * the "mba_MBps" option. 
82 - */ 83 - enum resctrl_event_id mba_mbps_default_event; 84 - 85 - static bool resctrl_debug; 86 - 87 - void rdt_last_cmd_clear(void) 88 - { 89 - lockdep_assert_held(&rdtgroup_mutex); 90 - seq_buf_clear(&last_cmd_status); 91 - } 92 - 93 - void rdt_last_cmd_puts(const char *s) 94 - { 95 - lockdep_assert_held(&rdtgroup_mutex); 96 - seq_buf_puts(&last_cmd_status, s); 97 - } 98 - 99 - void rdt_last_cmd_printf(const char *fmt, ...) 100 - { 101 - va_list ap; 102 - 103 - va_start(ap, fmt); 104 - lockdep_assert_held(&rdtgroup_mutex); 105 - seq_buf_vprintf(&last_cmd_status, fmt, ap); 106 - va_end(ap); 107 - } 108 - 109 - void rdt_staged_configs_clear(void) 110 - { 111 - struct rdt_ctrl_domain *dom; 112 - struct rdt_resource *r; 113 - 114 - lockdep_assert_held(&rdtgroup_mutex); 115 - 116 - for_each_alloc_capable_rdt_resource(r) { 117 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) 118 - memset(dom->staged_config, 0, sizeof(dom->staged_config)); 119 - } 120 - } 121 - 122 - static bool resctrl_is_mbm_enabled(void) 123 - { 124 - return (resctrl_arch_is_mbm_total_enabled() || 125 - resctrl_arch_is_mbm_local_enabled()); 126 - } 127 - 128 - static bool resctrl_is_mbm_event(int e) 129 - { 130 - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && 131 - e <= QOS_L3_MBM_LOCAL_EVENT_ID); 132 - } 133 - 134 - /* 135 - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, 136 - * we can keep a bitmap of free CLOSIDs in a single integer. 137 - * 138 - * Using a global CLOSID across all resources has some advantages and 139 - * some drawbacks: 140 - * + We can simply set current's closid to assign a task to a resource 141 - * group. 142 - * + Context switch code can avoid extra memory references deciding which 143 - * CLOSID to load into the PQR_ASSOC MSR 144 - * - We give up some options in configuring resource groups across multi-socket 145 - * systems. 
146 - * - Our choices on how to configure each resource become progressively more 147 - * limited as the number of resources grows. 148 - */ 149 - static unsigned long closid_free_map; 150 - static int closid_free_map_len; 151 - 152 - int closids_supported(void) 153 - { 154 - return closid_free_map_len; 155 - } 156 - 157 - static void closid_init(void) 158 - { 159 - struct resctrl_schema *s; 160 - u32 rdt_min_closid = 32; 161 - 162 - /* Compute rdt_min_closid across all resources */ 163 - list_for_each_entry(s, &resctrl_schema_all, list) 164 - rdt_min_closid = min(rdt_min_closid, s->num_closid); 165 - 166 - closid_free_map = BIT_MASK(rdt_min_closid) - 1; 167 - 168 - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ 169 - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); 170 - closid_free_map_len = rdt_min_closid; 171 - } 172 - 173 - static int closid_alloc(void) 174 - { 175 - int cleanest_closid; 176 - u32 closid; 177 - 178 - lockdep_assert_held(&rdtgroup_mutex); 179 - 180 - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && 181 - resctrl_arch_is_llc_occupancy_enabled()) { 182 - cleanest_closid = resctrl_find_cleanest_closid(); 183 - if (cleanest_closid < 0) 184 - return cleanest_closid; 185 - closid = cleanest_closid; 186 - } else { 187 - closid = ffs(closid_free_map); 188 - if (closid == 0) 189 - return -ENOSPC; 190 - closid--; 191 - } 192 - __clear_bit(closid, &closid_free_map); 193 - 194 - return closid; 195 - } 196 - 197 - void closid_free(int closid) 198 - { 199 - lockdep_assert_held(&rdtgroup_mutex); 200 - 201 - __set_bit(closid, &closid_free_map); 202 - } 203 - 204 - /** 205 - * closid_allocated - test if provided closid is in use 206 - * @closid: closid to be tested 207 - * 208 - * Return: true if @closid is currently associated with a resource group, 209 - * false if @closid is free 210 - */ 211 - bool closid_allocated(unsigned int closid) 212 - { 213 - lockdep_assert_held(&rdtgroup_mutex); 214 - 215 - return 
!test_bit(closid, &closid_free_map); 216 - } 217 - 218 - /** 219 - * rdtgroup_mode_by_closid - Return mode of resource group with closid 220 - * @closid: closid if the resource group 221 - * 222 - * Each resource group is associated with a @closid. Here the mode 223 - * of a resource group can be queried by searching for it using its closid. 224 - * 225 - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid 226 - */ 227 - enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) 228 - { 229 - struct rdtgroup *rdtgrp; 230 - 231 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 232 - if (rdtgrp->closid == closid) 233 - return rdtgrp->mode; 234 - } 235 - 236 - return RDT_NUM_MODES; 237 - } 238 - 239 - static const char * const rdt_mode_str[] = { 240 - [RDT_MODE_SHAREABLE] = "shareable", 241 - [RDT_MODE_EXCLUSIVE] = "exclusive", 242 - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", 243 - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", 244 - }; 245 - 246 - /** 247 - * rdtgroup_mode_str - Return the string representation of mode 248 - * @mode: the resource group mode as &enum rdtgroup_mode 249 - * 250 - * Return: string representation of valid mode, "unknown" otherwise 251 - */ 252 - static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) 253 - { 254 - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) 255 - return "unknown"; 256 - 257 - return rdt_mode_str[mode]; 258 - } 259 - 260 - /* set uid and gid of rdtgroup dirs and files to that of the creator */ 261 - static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) 262 - { 263 - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 264 - .ia_uid = current_fsuid(), 265 - .ia_gid = current_fsgid(), }; 266 - 267 - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 268 - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 269 - return 0; 270 - 271 - return kernfs_setattr(kn, &iattr); 272 - } 273 - 274 - static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) 275 - { 276 - struct 
kernfs_node *kn; 277 - int ret; 278 - 279 - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, 280 - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 281 - 0, rft->kf_ops, rft, NULL, NULL); 282 - if (IS_ERR(kn)) 283 - return PTR_ERR(kn); 284 - 285 - ret = rdtgroup_kn_set_ugid(kn); 286 - if (ret) { 287 - kernfs_remove(kn); 288 - return ret; 289 - } 290 - 291 - return 0; 292 - } 293 - 294 - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 295 - { 296 - struct kernfs_open_file *of = m->private; 297 - struct rftype *rft = of->kn->priv; 298 - 299 - if (rft->seq_show) 300 - return rft->seq_show(of, m, arg); 301 - return 0; 302 - } 303 - 304 - static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, 305 - size_t nbytes, loff_t off) 306 - { 307 - struct rftype *rft = of->kn->priv; 308 - 309 - if (rft->write) 310 - return rft->write(of, buf, nbytes, off); 311 - 312 - return -EINVAL; 313 - } 314 - 315 - static const struct kernfs_ops rdtgroup_kf_single_ops = { 316 - .atomic_write_len = PAGE_SIZE, 317 - .write = rdtgroup_file_write, 318 - .seq_show = rdtgroup_seqfile_show, 319 - }; 320 - 321 - static const struct kernfs_ops kf_mondata_ops = { 322 - .atomic_write_len = PAGE_SIZE, 323 - .seq_show = rdtgroup_mondata_show, 324 - }; 325 - 326 - static bool is_cpu_list(struct kernfs_open_file *of) 327 - { 328 - struct rftype *rft = of->kn->priv; 329 - 330 - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; 331 - } 332 - 333 - static int rdtgroup_cpus_show(struct kernfs_open_file *of, 334 - struct seq_file *s, void *v) 335 - { 336 - struct rdtgroup *rdtgrp; 337 - struct cpumask *mask; 338 - int ret = 0; 339 - 340 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 341 - 342 - if (rdtgrp) { 343 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 344 - if (!rdtgrp->plr->d) { 345 - rdt_last_cmd_clear(); 346 - rdt_last_cmd_puts("Cache domain offline\n"); 347 - ret = -ENODEV; 348 - } else { 349 - mask = &rdtgrp->plr->d->hdr.cpu_mask; 350 - seq_printf(s, is_cpu_list(of) ? 
351 - "%*pbl\n" : "%*pb\n", 352 - cpumask_pr_args(mask)); 353 - } 354 - } else { 355 - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", 356 - cpumask_pr_args(&rdtgrp->cpu_mask)); 357 - } 358 - } else { 359 - ret = -ENOENT; 360 - } 361 - rdtgroup_kn_unlock(of->kn); 362 - 363 - return ret; 364 - } 365 - 366 - /* 367 - * This is safe against resctrl_sched_in() called from __switch_to() 41 + * This is safe against resctrl_arch_sched_in() called from __switch_to() 368 42 * because __switch_to() is executed with interrupts disabled. A local call 369 43 * from update_closid_rmid() is protected against __switch_to() because 370 44 * preemption is disabled. ··· 58 382 * executing task might have its own closid selected. Just reuse 59 383 * the context switch code. 60 384 */ 61 - resctrl_sched_in(current); 62 - } 63 - 64 - /* 65 - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 66 - * 67 - * Per task closids/rmids must have been set up before calling this function. 68 - * @r may be NULL. 
69 - */ 70 - static void 71 - update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) 72 - { 73 - struct resctrl_cpu_defaults defaults, *p = NULL; 74 - 75 - if (r) { 76 - defaults.closid = r->closid; 77 - defaults.rmid = r->mon.rmid; 78 - p = &defaults; 79 - } 80 - 81 - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); 82 - } 83 - 84 - static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 85 - cpumask_var_t tmpmask) 86 - { 87 - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; 88 - struct list_head *head; 89 - 90 - /* Check whether cpus belong to parent ctrl group */ 91 - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); 92 - if (!cpumask_empty(tmpmask)) { 93 - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); 94 - return -EINVAL; 95 - } 96 - 97 - /* Check whether cpus are dropped from this group */ 98 - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 99 - if (!cpumask_empty(tmpmask)) { 100 - /* Give any dropped cpus to parent rdtgroup */ 101 - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); 102 - update_closid_rmid(tmpmask, prgrp); 103 - } 104 - 105 - /* 106 - * If we added cpus, remove them from previous group that owned them 107 - * and update per-cpu rmid 108 - */ 109 - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 110 - if (!cpumask_empty(tmpmask)) { 111 - head = &prgrp->mon.crdtgrp_list; 112 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 113 - if (crgrp == rdtgrp) 114 - continue; 115 - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, 116 - tmpmask); 117 - } 118 - update_closid_rmid(tmpmask, rdtgrp); 119 - } 120 - 121 - /* Done pushing/pulling - update this group with new mask */ 122 - cpumask_copy(&rdtgrp->cpu_mask, newmask); 123 - 124 - return 0; 125 - } 126 - 127 - static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) 128 - { 129 - struct rdtgroup *crgrp; 130 - 131 - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); 132 - /* 
update the child mon group masks as well*/ 133 - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) 134 - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); 135 - } 136 - 137 - static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 138 - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) 139 - { 140 - struct rdtgroup *r, *crgrp; 141 - struct list_head *head; 142 - 143 - /* Check whether cpus are dropped from this group */ 144 - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 145 - if (!cpumask_empty(tmpmask)) { 146 - /* Can't drop from default group */ 147 - if (rdtgrp == &rdtgroup_default) { 148 - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); 149 - return -EINVAL; 150 - } 151 - 152 - /* Give any dropped cpus to rdtgroup_default */ 153 - cpumask_or(&rdtgroup_default.cpu_mask, 154 - &rdtgroup_default.cpu_mask, tmpmask); 155 - update_closid_rmid(tmpmask, &rdtgroup_default); 156 - } 157 - 158 - /* 159 - * If we added cpus, remove them from previous group and 160 - * the prev group's child groups that owned them 161 - * and update per-cpu closid/rmid. 162 - */ 163 - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 164 - if (!cpumask_empty(tmpmask)) { 165 - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { 166 - if (r == rdtgrp) 167 - continue; 168 - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); 169 - if (!cpumask_empty(tmpmask1)) 170 - cpumask_rdtgrp_clear(r, tmpmask1); 171 - } 172 - update_closid_rmid(tmpmask, rdtgrp); 173 - } 174 - 175 - /* Done pushing/pulling - update this group with new mask */ 176 - cpumask_copy(&rdtgrp->cpu_mask, newmask); 177 - 178 - /* 179 - * Clear child mon group masks since there is a new parent mask 180 - * now and update the rmid for the cpus the child lost. 
181 - */ 182 - head = &rdtgrp->mon.crdtgrp_list; 183 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 184 - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); 185 - update_closid_rmid(tmpmask, rdtgrp); 186 - cpumask_clear(&crgrp->cpu_mask); 187 - } 188 - 189 - return 0; 190 - } 191 - 192 - static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 193 - char *buf, size_t nbytes, loff_t off) 194 - { 195 - cpumask_var_t tmpmask, newmask, tmpmask1; 196 - struct rdtgroup *rdtgrp; 197 - int ret; 198 - 199 - if (!buf) 200 - return -EINVAL; 201 - 202 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 203 - return -ENOMEM; 204 - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { 205 - free_cpumask_var(tmpmask); 206 - return -ENOMEM; 207 - } 208 - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { 209 - free_cpumask_var(tmpmask); 210 - free_cpumask_var(newmask); 211 - return -ENOMEM; 212 - } 213 - 214 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 215 - if (!rdtgrp) { 216 - ret = -ENOENT; 217 - goto unlock; 218 - } 219 - 220 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 221 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 222 - ret = -EINVAL; 223 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 224 - goto unlock; 225 - } 226 - 227 - if (is_cpu_list(of)) 228 - ret = cpulist_parse(buf, newmask); 229 - else 230 - ret = cpumask_parse(buf, newmask); 231 - 232 - if (ret) { 233 - rdt_last_cmd_puts("Bad CPU list/mask\n"); 234 - goto unlock; 235 - } 236 - 237 - /* check that user didn't specify any offline cpus */ 238 - cpumask_andnot(tmpmask, newmask, cpu_online_mask); 239 - if (!cpumask_empty(tmpmask)) { 240 - ret = -EINVAL; 241 - rdt_last_cmd_puts("Can only assign online CPUs\n"); 242 - goto unlock; 243 - } 244 - 245 - if (rdtgrp->type == RDTCTRL_GROUP) 246 - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); 247 - else if (rdtgrp->type == RDTMON_GROUP) 248 - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); 249 - else 250 - ret = -EINVAL; 251 - 252 - unlock: 
253 - rdtgroup_kn_unlock(of->kn); 254 - free_cpumask_var(tmpmask); 255 - free_cpumask_var(newmask); 256 - free_cpumask_var(tmpmask1); 257 - 258 - return ret ?: nbytes; 259 - } 260 - 261 - /** 262 - * rdtgroup_remove - the helper to remove resource group safely 263 - * @rdtgrp: resource group to remove 264 - * 265 - * On resource group creation via a mkdir, an extra kernfs_node reference is 266 - * taken to ensure that the rdtgroup structure remains accessible for the 267 - * rdtgroup_kn_unlock() calls where it is removed. 268 - * 269 - * Drop the extra reference here, then free the rdtgroup structure. 270 - * 271 - * Return: void 272 - */ 273 - static void rdtgroup_remove(struct rdtgroup *rdtgrp) 274 - { 275 - kernfs_put(rdtgrp->kn); 276 - kfree(rdtgrp); 277 - } 278 - 279 - static void _update_task_closid_rmid(void *task) 280 - { 281 - /* 282 - * If the task is still current on this CPU, update PQR_ASSOC MSR. 283 - * Otherwise, the MSR is updated when the task is scheduled in. 284 - */ 285 - if (task == current) 286 - resctrl_sched_in(task); 287 - } 288 - 289 - static void update_task_closid_rmid(struct task_struct *t) 290 - { 291 - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) 292 - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); 293 - else 294 - _update_task_closid_rmid(t); 295 - } 296 - 297 - static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) 298 - { 299 - u32 closid, rmid = rdtgrp->mon.rmid; 300 - 301 - if (rdtgrp->type == RDTCTRL_GROUP) 302 - closid = rdtgrp->closid; 303 - else if (rdtgrp->type == RDTMON_GROUP) 304 - closid = rdtgrp->mon.parent->closid; 305 - else 306 - return false; 307 - 308 - return resctrl_arch_match_closid(tsk, closid) && 309 - resctrl_arch_match_rmid(tsk, closid, rmid); 310 - } 311 - 312 - static int __rdtgroup_move_task(struct task_struct *tsk, 313 - struct rdtgroup *rdtgrp) 314 - { 315 - /* If the task is already in rdtgrp, no need to move the task. 
*/ 316 - if (task_in_rdtgroup(tsk, rdtgrp)) 317 - return 0; 318 - 319 - /* 320 - * Set the task's closid/rmid before the PQR_ASSOC MSR can be 321 - * updated by them. 322 - * 323 - * For ctrl_mon groups, move both closid and rmid. 324 - * For monitor groups, can move the tasks only from 325 - * their parent CTRL group. 326 - */ 327 - if (rdtgrp->type == RDTMON_GROUP && 328 - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { 329 - rdt_last_cmd_puts("Can't move task to different control group\n"); 330 - return -EINVAL; 331 - } 332 - 333 - if (rdtgrp->type == RDTMON_GROUP) 334 - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, 335 - rdtgrp->mon.rmid); 336 - else 337 - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, 338 - rdtgrp->mon.rmid); 339 - 340 - /* 341 - * Ensure the task's closid and rmid are written before determining if 342 - * the task is current that will decide if it will be interrupted. 343 - * This pairs with the full barrier between the rq->curr update and 344 - * resctrl_sched_in() during context switch. 345 - */ 346 - smp_mb(); 347 - 348 - /* 349 - * By now, the task's closid and rmid are set. If the task is current 350 - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource 351 - * group go into effect. If the task is not current, the MSR will be 352 - * updated when the task is scheduled in. 
353 - */ 354 - update_task_closid_rmid(tsk); 355 - 356 - return 0; 357 - } 358 - 359 - static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) 360 - { 361 - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && 362 - resctrl_arch_match_closid(t, r->closid)); 363 - } 364 - 365 - static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) 366 - { 367 - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && 368 - resctrl_arch_match_rmid(t, r->mon.parent->closid, 369 - r->mon.rmid)); 370 - } 371 - 372 - /** 373 - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group 374 - * @r: Resource group 375 - * 376 - * Return: 1 if tasks have been assigned to @r, 0 otherwise 377 - */ 378 - int rdtgroup_tasks_assigned(struct rdtgroup *r) 379 - { 380 - struct task_struct *p, *t; 381 - int ret = 0; 382 - 383 - lockdep_assert_held(&rdtgroup_mutex); 384 - 385 - rcu_read_lock(); 386 - for_each_process_thread(p, t) { 387 - if (is_closid_match(t, r) || is_rmid_match(t, r)) { 388 - ret = 1; 389 - break; 390 - } 391 - } 392 - rcu_read_unlock(); 393 - 394 - return ret; 395 - } 396 - 397 - static int rdtgroup_task_write_permission(struct task_struct *task, 398 - struct kernfs_open_file *of) 399 - { 400 - const struct cred *tcred = get_task_cred(task); 401 - const struct cred *cred = current_cred(); 402 - int ret = 0; 403 - 404 - /* 405 - * Even if we're attaching all tasks in the thread group, we only 406 - * need to check permissions on one of them. 
407 - */ 408 - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 409 - !uid_eq(cred->euid, tcred->uid) && 410 - !uid_eq(cred->euid, tcred->suid)) { 411 - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); 412 - ret = -EPERM; 413 - } 414 - 415 - put_cred(tcred); 416 - return ret; 417 - } 418 - 419 - static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, 420 - struct kernfs_open_file *of) 421 - { 422 - struct task_struct *tsk; 423 - int ret; 424 - 425 - rcu_read_lock(); 426 - if (pid) { 427 - tsk = find_task_by_vpid(pid); 428 - if (!tsk) { 429 - rcu_read_unlock(); 430 - rdt_last_cmd_printf("No task %d\n", pid); 431 - return -ESRCH; 432 - } 433 - } else { 434 - tsk = current; 435 - } 436 - 437 - get_task_struct(tsk); 438 - rcu_read_unlock(); 439 - 440 - ret = rdtgroup_task_write_permission(tsk, of); 441 - if (!ret) 442 - ret = __rdtgroup_move_task(tsk, rdtgrp); 443 - 444 - put_task_struct(tsk); 445 - return ret; 446 - } 447 - 448 - static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, 449 - char *buf, size_t nbytes, loff_t off) 450 - { 451 - struct rdtgroup *rdtgrp; 452 - char *pid_str; 453 - int ret = 0; 454 - pid_t pid; 455 - 456 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 457 - if (!rdtgrp) { 458 - rdtgroup_kn_unlock(of->kn); 459 - return -ENOENT; 460 - } 461 - rdt_last_cmd_clear(); 462 - 463 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 464 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 465 - ret = -EINVAL; 466 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 467 - goto unlock; 468 - } 469 - 470 - while (buf && buf[0] != '\0' && buf[0] != '\n') { 471 - pid_str = strim(strsep(&buf, ",")); 472 - 473 - if (kstrtoint(pid_str, 0, &pid)) { 474 - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); 475 - ret = -EINVAL; 476 - break; 477 - } 478 - 479 - if (pid < 0) { 480 - rdt_last_cmd_printf("Invalid pid %d\n", pid); 481 - ret = -EINVAL; 482 - break; 483 - } 484 - 485 - ret = rdtgroup_move_task(pid, rdtgrp, of); 486 - if 
(ret) { 487 - rdt_last_cmd_printf("Error while processing task %d\n", pid); 488 - break; 489 - } 490 - } 491 - 492 - unlock: 493 - rdtgroup_kn_unlock(of->kn); 494 - 495 - return ret ?: nbytes; 496 - } 497 - 498 - static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) 499 - { 500 - struct task_struct *p, *t; 501 - pid_t pid; 502 - 503 - rcu_read_lock(); 504 - for_each_process_thread(p, t) { 505 - if (is_closid_match(t, r) || is_rmid_match(t, r)) { 506 - pid = task_pid_vnr(t); 507 - if (pid) 508 - seq_printf(s, "%d\n", pid); 509 - } 510 - } 511 - rcu_read_unlock(); 512 - } 513 - 514 - static int rdtgroup_tasks_show(struct kernfs_open_file *of, 515 - struct seq_file *s, void *v) 516 - { 517 - struct rdtgroup *rdtgrp; 518 - int ret = 0; 519 - 520 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 521 - if (rdtgrp) 522 - show_rdt_tasks(rdtgrp, s); 523 - else 524 - ret = -ENOENT; 525 - rdtgroup_kn_unlock(of->kn); 526 - 527 - return ret; 528 - } 529 - 530 - static int rdtgroup_closid_show(struct kernfs_open_file *of, 531 - struct seq_file *s, void *v) 532 - { 533 - struct rdtgroup *rdtgrp; 534 - int ret = 0; 535 - 536 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 537 - if (rdtgrp) 538 - seq_printf(s, "%u\n", rdtgrp->closid); 539 - else 540 - ret = -ENOENT; 541 - rdtgroup_kn_unlock(of->kn); 542 - 543 - return ret; 544 - } 545 - 546 - static int rdtgroup_rmid_show(struct kernfs_open_file *of, 547 - struct seq_file *s, void *v) 548 - { 549 - struct rdtgroup *rdtgrp; 550 - int ret = 0; 551 - 552 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 553 - if (rdtgrp) 554 - seq_printf(s, "%u\n", rdtgrp->mon.rmid); 555 - else 556 - ret = -ENOENT; 557 - rdtgroup_kn_unlock(of->kn); 558 - 559 - return ret; 560 - } 561 - 562 - #ifdef CONFIG_PROC_CPU_RESCTRL 563 - 564 - /* 565 - * A task can only be part of one resctrl control group and of one monitor 566 - * group which is associated to that control group. 567 - * 568 - * 1) res: 569 - * mon: 570 - * 571 - * resctrl is not available. 
572 - * 573 - * 2) res:/ 574 - * mon: 575 - * 576 - * Task is part of the root resctrl control group, and it is not associated 577 - * to any monitor group. 578 - * 579 - * 3) res:/ 580 - * mon:mon0 581 - * 582 - * Task is part of the root resctrl control group and monitor group mon0. 583 - * 584 - * 4) res:group0 585 - * mon: 586 - * 587 - * Task is part of resctrl control group group0, and it is not associated 588 - * to any monitor group. 589 - * 590 - * 5) res:group0 591 - * mon:mon1 592 - * 593 - * Task is part of resctrl control group group0 and monitor group mon1. 594 - */ 595 - int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, 596 - struct pid *pid, struct task_struct *tsk) 597 - { 598 - struct rdtgroup *rdtg; 599 - int ret = 0; 600 - 601 - mutex_lock(&rdtgroup_mutex); 602 - 603 - /* Return empty if resctrl has not been mounted. */ 604 - if (!resctrl_mounted) { 605 - seq_puts(s, "res:\nmon:\n"); 606 - goto unlock; 607 - } 608 - 609 - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { 610 - struct rdtgroup *crg; 611 - 612 - /* 613 - * Task information is only relevant for shareable 614 - * and exclusive groups. 615 - */ 616 - if (rdtg->mode != RDT_MODE_SHAREABLE && 617 - rdtg->mode != RDT_MODE_EXCLUSIVE) 618 - continue; 619 - 620 - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) 621 - continue; 622 - 623 - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", 624 - rdt_kn_name(rdtg->kn)); 625 - seq_puts(s, "mon:"); 626 - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, 627 - mon.crdtgrp_list) { 628 - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, 629 - crg->mon.rmid)) 630 - continue; 631 - seq_printf(s, "%s", rdt_kn_name(crg->kn)); 632 - break; 633 - } 634 - seq_putc(s, '\n'); 635 - goto unlock; 636 - } 637 - /* 638 - * The above search should succeed. Otherwise return 639 - * with an error. 
640 - */ 641 - ret = -ENOENT; 642 - unlock: 643 - mutex_unlock(&rdtgroup_mutex); 644 - 645 - return ret; 646 - } 647 - #endif 648 - 649 - static int rdt_last_cmd_status_show(struct kernfs_open_file *of, 650 - struct seq_file *seq, void *v) 651 - { 652 - int len; 653 - 654 - mutex_lock(&rdtgroup_mutex); 655 - len = seq_buf_used(&last_cmd_status); 656 - if (len) 657 - seq_printf(seq, "%.*s", len, last_cmd_status_buf); 658 - else 659 - seq_puts(seq, "ok\n"); 660 - mutex_unlock(&rdtgroup_mutex); 661 - return 0; 662 - } 663 - 664 - static void *rdt_kn_parent_priv(struct kernfs_node *kn) 665 - { 666 - /* 667 - * The parent pointer is only valid within RCU section since it can be 668 - * replaced. 669 - */ 670 - guard(rcu)(); 671 - return rcu_dereference(kn->__parent)->priv; 672 - } 673 - 674 - static int rdt_num_closids_show(struct kernfs_open_file *of, 675 - struct seq_file *seq, void *v) 676 - { 677 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 678 - 679 - seq_printf(seq, "%u\n", s->num_closid); 680 - return 0; 681 - } 682 - 683 - static int rdt_default_ctrl_show(struct kernfs_open_file *of, 684 - struct seq_file *seq, void *v) 685 - { 686 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 687 - struct rdt_resource *r = s->res; 688 - 689 - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); 690 - return 0; 691 - } 692 - 693 - static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, 694 - struct seq_file *seq, void *v) 695 - { 696 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 697 - struct rdt_resource *r = s->res; 698 - 699 - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); 700 - return 0; 701 - } 702 - 703 - static int rdt_shareable_bits_show(struct kernfs_open_file *of, 704 - struct seq_file *seq, void *v) 705 - { 706 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 707 - struct rdt_resource *r = s->res; 708 - 709 - seq_printf(seq, "%x\n", r->cache.shareable_bits); 710 - return 0; 711 - } 712 - 713 - /* 714 - * 
rdt_bit_usage_show - Display current usage of resources 715 - * 716 - * A domain is a shared resource that can now be allocated differently. Here 717 - * we display the current regions of the domain as an annotated bitmask. 718 - * For each domain of this resource its allocation bitmask 719 - * is annotated as below to indicate the current usage of the corresponding bit: 720 - * 0 - currently unused 721 - * X - currently available for sharing and used by software and hardware 722 - * H - currently used by hardware only but available for software use 723 - * S - currently used and shareable by software only 724 - * E - currently used exclusively by one resource group 725 - * P - currently pseudo-locked by one resource group 726 - */ 727 - static int rdt_bit_usage_show(struct kernfs_open_file *of, 728 - struct seq_file *seq, void *v) 729 - { 730 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 731 - /* 732 - * Use unsigned long even though only 32 bits are used to ensure 733 - * test_bit() is used safely. 
734 - */ 735 - unsigned long sw_shareable = 0, hw_shareable = 0; 736 - unsigned long exclusive = 0, pseudo_locked = 0; 737 - struct rdt_resource *r = s->res; 738 - struct rdt_ctrl_domain *dom; 739 - int i, hwb, swb, excl, psl; 740 - enum rdtgrp_mode mode; 741 - bool sep = false; 742 - u32 ctrl_val; 743 - 744 - cpus_read_lock(); 745 - mutex_lock(&rdtgroup_mutex); 746 - hw_shareable = r->cache.shareable_bits; 747 - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 748 - if (sep) 749 - seq_putc(seq, ';'); 750 - sw_shareable = 0; 751 - exclusive = 0; 752 - seq_printf(seq, "%d=", dom->hdr.id); 753 - for (i = 0; i < closids_supported(); i++) { 754 - if (!closid_allocated(i)) 755 - continue; 756 - ctrl_val = resctrl_arch_get_config(r, dom, i, 757 - s->conf_type); 758 - mode = rdtgroup_mode_by_closid(i); 759 - switch (mode) { 760 - case RDT_MODE_SHAREABLE: 761 - sw_shareable |= ctrl_val; 762 - break; 763 - case RDT_MODE_EXCLUSIVE: 764 - exclusive |= ctrl_val; 765 - break; 766 - case RDT_MODE_PSEUDO_LOCKSETUP: 767 - /* 768 - * RDT_MODE_PSEUDO_LOCKSETUP is possible 769 - * here but not included since the CBM 770 - * associated with this CLOSID in this mode 771 - * is not initialized and no task or cpu can be 772 - * assigned this CLOSID. 773 - */ 774 - break; 775 - case RDT_MODE_PSEUDO_LOCKED: 776 - case RDT_NUM_MODES: 777 - WARN(1, 778 - "invalid mode for closid %d\n", i); 779 - break; 780 - } 781 - } 782 - for (i = r->cache.cbm_len - 1; i >= 0; i--) { 783 - pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; 784 - hwb = test_bit(i, &hw_shareable); 785 - swb = test_bit(i, &sw_shareable); 786 - excl = test_bit(i, &exclusive); 787 - psl = test_bit(i, &pseudo_locked); 788 - if (hwb && swb) 789 - seq_putc(seq, 'X'); 790 - else if (hwb && !swb) 791 - seq_putc(seq, 'H'); 792 - else if (!hwb && swb) 793 - seq_putc(seq, 'S'); 794 - else if (excl) 795 - seq_putc(seq, 'E'); 796 - else if (psl) 797 - seq_putc(seq, 'P'); 798 - else /* Unused bits remain */ 799 - seq_putc(seq, '0'); 800 - } 801 - sep = true; 802 - } 803 - seq_putc(seq, '\n'); 804 - mutex_unlock(&rdtgroup_mutex); 805 - cpus_read_unlock(); 806 - return 0; 807 - } 808 - 809 - static int rdt_min_bw_show(struct kernfs_open_file *of, 810 - struct seq_file *seq, void *v) 811 - { 812 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 813 - struct rdt_resource *r = s->res; 814 - 815 - seq_printf(seq, "%u\n", r->membw.min_bw); 816 - return 0; 817 - } 818 - 819 - static int rdt_num_rmids_show(struct kernfs_open_file *of, 820 - struct seq_file *seq, void *v) 821 - { 822 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 823 - 824 - seq_printf(seq, "%d\n", r->num_rmid); 825 - 826 - return 0; 827 - } 828 - 829 - static int rdt_mon_features_show(struct kernfs_open_file *of, 830 - struct seq_file *seq, void *v) 831 - { 832 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 833 - struct mon_evt *mevt; 834 - 835 - list_for_each_entry(mevt, &r->evt_list, list) { 836 - seq_printf(seq, "%s\n", mevt->name); 837 - if (mevt->configurable) 838 - seq_printf(seq, "%s_config\n", mevt->name); 839 - } 840 - 841 - return 0; 842 - } 843 - 844 - static int rdt_bw_gran_show(struct kernfs_open_file *of, 845 - struct seq_file *seq, void *v) 846 - { 847 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 848 - struct rdt_resource *r = s->res; 849 - 850 - seq_printf(seq, "%u\n", r->membw.bw_gran); 851 - return 0; 852 - } 853 - 854 - static int rdt_delay_linear_show(struct kernfs_open_file *of, 855 - struct seq_file 
*seq, void *v) 856 - { 857 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 858 - struct rdt_resource *r = s->res; 859 - 860 - seq_printf(seq, "%u\n", r->membw.delay_linear); 861 - return 0; 862 - } 863 - 864 - static int max_threshold_occ_show(struct kernfs_open_file *of, 865 - struct seq_file *seq, void *v) 866 - { 867 - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); 868 - 869 - return 0; 870 - } 871 - 872 - static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, 873 - struct seq_file *seq, void *v) 874 - { 875 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 876 - struct rdt_resource *r = s->res; 877 - 878 - switch (r->membw.throttle_mode) { 879 - case THREAD_THROTTLE_PER_THREAD: 880 - seq_puts(seq, "per-thread\n"); 881 - return 0; 882 - case THREAD_THROTTLE_MAX: 883 - seq_puts(seq, "max\n"); 884 - return 0; 885 - case THREAD_THROTTLE_UNDEFINED: 886 - seq_puts(seq, "undefined\n"); 887 - return 0; 888 - } 889 - 890 - WARN_ON_ONCE(1); 891 - 892 - return 0; 893 - } 894 - 895 - static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, 896 - char *buf, size_t nbytes, loff_t off) 897 - { 898 - unsigned int bytes; 899 - int ret; 900 - 901 - ret = kstrtouint(buf, 0, &bytes); 902 - if (ret) 903 - return ret; 904 - 905 - if (bytes > resctrl_rmid_realloc_limit) 906 - return -EINVAL; 907 - 908 - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); 909 - 910 - return nbytes; 911 - } 912 - 913 - /* 914 - * rdtgroup_mode_show - Display mode of this resource group 915 - */ 916 - static int rdtgroup_mode_show(struct kernfs_open_file *of, 917 - struct seq_file *s, void *v) 918 - { 919 - struct rdtgroup *rdtgrp; 920 - 921 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 922 - if (!rdtgrp) { 923 - rdtgroup_kn_unlock(of->kn); 924 - return -ENOENT; 925 - } 926 - 927 - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); 928 - 929 - rdtgroup_kn_unlock(of->kn); 930 - return 0; 931 - } 932 - 933 - static enum resctrl_conf_type 
resctrl_peer_type(enum resctrl_conf_type my_type) 934 - { 935 - switch (my_type) { 936 - case CDP_CODE: 937 - return CDP_DATA; 938 - case CDP_DATA: 939 - return CDP_CODE; 940 - default: 941 - case CDP_NONE: 942 - return CDP_NONE; 943 - } 944 - } 945 - 946 - static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, 947 - struct seq_file *seq, void *v) 948 - { 949 - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 950 - struct rdt_resource *r = s->res; 951 - 952 - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); 953 - 954 - return 0; 955 - } 956 - 957 - /** 958 - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other 959 - * @r: Resource to which domain instance @d belongs. 960 - * @d: The domain instance for which @closid is being tested. 961 - * @cbm: Capacity bitmask being tested. 962 - * @closid: Intended closid for @cbm. 963 - * @type: CDP type of @r. 964 - * @exclusive: Only check if overlaps with exclusive resource groups 965 - * 966 - * Checks if provided @cbm intended to be used for @closid on domain 967 - * @d overlaps with any other closids or other hardware usage associated 968 - * with this domain. If @exclusive is true then only overlaps with 969 - * resource groups in exclusive mode will be considered. If @exclusive 970 - * is false then overlaps with any resource group or hardware entities 971 - * will be considered. 972 - * 973 - * @cbm is unsigned long, even if only 32 bits are used, to make the 974 - * bitmap functions work correctly. 975 - * 976 - * Return: false if CBM does not overlap, true if it does. 
977 - */ 978 - static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, 979 - unsigned long cbm, int closid, 980 - enum resctrl_conf_type type, bool exclusive) 981 - { 982 - enum rdtgrp_mode mode; 983 - unsigned long ctrl_b; 984 - int i; 985 - 986 - /* Check for any overlap with regions used by hardware directly */ 987 - if (!exclusive) { 988 - ctrl_b = r->cache.shareable_bits; 989 - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) 990 - return true; 991 - } 992 - 993 - /* Check for overlap with other resource groups */ 994 - for (i = 0; i < closids_supported(); i++) { 995 - ctrl_b = resctrl_arch_get_config(r, d, i, type); 996 - mode = rdtgroup_mode_by_closid(i); 997 - if (closid_allocated(i) && i != closid && 998 - mode != RDT_MODE_PSEUDO_LOCKSETUP) { 999 - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { 1000 - if (exclusive) { 1001 - if (mode == RDT_MODE_EXCLUSIVE) 1002 - return true; 1003 - continue; 1004 - } 1005 - return true; 1006 - } 1007 - } 1008 - } 1009 - 1010 - return false; 1011 - } 1012 - 1013 - /** 1014 - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware 1015 - * @s: Schema for the resource to which domain instance @d belongs. 1016 - * @d: The domain instance for which @closid is being tested. 1017 - * @cbm: Capacity bitmask being tested. 1018 - * @closid: Intended closid for @cbm. 1019 - * @exclusive: Only check if overlaps with exclusive resource groups 1020 - * 1021 - * Resources that can be allocated using a CBM can use the CBM to control 1022 - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test 1023 - * for overlap. Overlap test is not limited to the specific resource for 1024 - * which the CBM is intended though - when dealing with CDP resources that 1025 - * share the underlying hardware the overlap check should be performed on 1026 - * the CDP resource sharing the hardware also. 
1027 - * 1028 - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the 1029 - * overlap test. 1030 - * 1031 - * Return: true if CBM overlap detected, false if there is no overlap 1032 - */ 1033 - bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 1034 - unsigned long cbm, int closid, bool exclusive) 1035 - { 1036 - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 1037 - struct rdt_resource *r = s->res; 1038 - 1039 - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, 1040 - exclusive)) 1041 - return true; 1042 - 1043 - if (!resctrl_arch_get_cdp_enabled(r->rid)) 1044 - return false; 1045 - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); 1046 - } 1047 - 1048 - /** 1049 - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive 1050 - * @rdtgrp: Resource group identified through its closid. 1051 - * 1052 - * An exclusive resource group implies that there should be no sharing of 1053 - * its allocated resources. At the time this group is considered to be 1054 - * exclusive this test can determine if its current schemata supports this 1055 - * setting by testing for overlap with all other resource groups. 1056 - * 1057 - * Return: true if resource group can be exclusive, false if there is overlap 1058 - * with allocations of other resource groups and thus this resource group 1059 - * cannot be exclusive. 
1060 - */ 1061 - static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) 1062 - { 1063 - int closid = rdtgrp->closid; 1064 - struct rdt_ctrl_domain *d; 1065 - struct resctrl_schema *s; 1066 - struct rdt_resource *r; 1067 - bool has_cache = false; 1068 - u32 ctrl; 1069 - 1070 - /* Walking r->domains, ensure it can't race with cpuhp */ 1071 - lockdep_assert_cpus_held(); 1072 - 1073 - list_for_each_entry(s, &resctrl_schema_all, list) { 1074 - r = s->res; 1075 - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) 1076 - continue; 1077 - has_cache = true; 1078 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1079 - ctrl = resctrl_arch_get_config(r, d, closid, 1080 - s->conf_type); 1081 - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { 1082 - rdt_last_cmd_puts("Schemata overlaps\n"); 1083 - return false; 1084 - } 1085 - } 1086 - } 1087 - 1088 - if (!has_cache) { 1089 - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); 1090 - return false; 1091 - } 1092 - 1093 - return true; 1094 - } 1095 - 1096 - /* 1097 - * rdtgroup_mode_write - Modify the resource group's mode 1098 - */ 1099 - static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, 1100 - char *buf, size_t nbytes, loff_t off) 1101 - { 1102 - struct rdtgroup *rdtgrp; 1103 - enum rdtgrp_mode mode; 1104 - int ret = 0; 1105 - 1106 - /* Valid input requires a trailing newline */ 1107 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 1108 - return -EINVAL; 1109 - buf[nbytes - 1] = '\0'; 1110 - 1111 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 1112 - if (!rdtgrp) { 1113 - rdtgroup_kn_unlock(of->kn); 1114 - return -ENOENT; 1115 - } 1116 - 1117 - rdt_last_cmd_clear(); 1118 - 1119 - mode = rdtgrp->mode; 1120 - 1121 - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || 1122 - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || 1123 - (!strcmp(buf, "pseudo-locksetup") && 1124 - mode == RDT_MODE_PSEUDO_LOCKSETUP) || 1125 - (!strcmp(buf, "pseudo-locked") && mode == 
RDT_MODE_PSEUDO_LOCKED)) 1126 - goto out; 1127 - 1128 - if (mode == RDT_MODE_PSEUDO_LOCKED) { 1129 - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); 1130 - ret = -EINVAL; 1131 - goto out; 1132 - } 1133 - 1134 - if (!strcmp(buf, "shareable")) { 1135 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1136 - ret = rdtgroup_locksetup_exit(rdtgrp); 1137 - if (ret) 1138 - goto out; 1139 - } 1140 - rdtgrp->mode = RDT_MODE_SHAREABLE; 1141 - } else if (!strcmp(buf, "exclusive")) { 1142 - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { 1143 - ret = -EINVAL; 1144 - goto out; 1145 - } 1146 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1147 - ret = rdtgroup_locksetup_exit(rdtgrp); 1148 - if (ret) 1149 - goto out; 1150 - } 1151 - rdtgrp->mode = RDT_MODE_EXCLUSIVE; 1152 - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && 1153 - !strcmp(buf, "pseudo-locksetup")) { 1154 - ret = rdtgroup_locksetup_enter(rdtgrp); 1155 - if (ret) 1156 - goto out; 1157 - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; 1158 - } else { 1159 - rdt_last_cmd_puts("Unknown or unsupported mode\n"); 1160 - ret = -EINVAL; 1161 - } 1162 - 1163 - out: 1164 - rdtgroup_kn_unlock(of->kn); 1165 - return ret ?: nbytes; 1166 - } 1167 - 1168 - /** 1169 - * rdtgroup_cbm_to_size - Translate CBM to size in bytes 1170 - * @r: RDT resource to which @d belongs. 1171 - * @d: RDT domain instance. 1172 - * @cbm: bitmask for which the size should be computed. 1173 - * 1174 - * The bitmask provided associated with the RDT domain instance @d will be 1175 - * translated into how many bytes it represents. The size in bytes is 1176 - * computed by first dividing the total cache size by the CBM length to 1177 - * determine how many bytes each bit in the bitmask represents. The result 1178 - * is multiplied with the number of bits set in the bitmask. 1179 - * 1180 - * @cbm is unsigned long, even if only 32 bits are used to make the 1181 - * bitmap functions work correctly. 
1182 - */ 1183 - unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, 1184 - struct rdt_ctrl_domain *d, unsigned long cbm) 1185 - { 1186 - unsigned int size = 0; 1187 - struct cacheinfo *ci; 1188 - int num_b; 1189 - 1190 - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) 1191 - return size; 1192 - 1193 - num_b = bitmap_weight(&cbm, r->cache.cbm_len); 1194 - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); 1195 - if (ci) 1196 - size = ci->size / r->cache.cbm_len * num_b; 1197 - 1198 - return size; 1199 - } 1200 - 1201 - /* 1202 - * rdtgroup_size_show - Display size in bytes of allocated regions 1203 - * 1204 - * The "size" file mirrors the layout of the "schemata" file, printing the 1205 - * size in bytes of each region instead of the capacity bitmask. 1206 - */ 1207 - static int rdtgroup_size_show(struct kernfs_open_file *of, 1208 - struct seq_file *s, void *v) 1209 - { 1210 - struct resctrl_schema *schema; 1211 - enum resctrl_conf_type type; 1212 - struct rdt_ctrl_domain *d; 1213 - struct rdtgroup *rdtgrp; 1214 - struct rdt_resource *r; 1215 - unsigned int size; 1216 - int ret = 0; 1217 - u32 closid; 1218 - bool sep; 1219 - u32 ctrl; 1220 - 1221 - rdtgrp = rdtgroup_kn_lock_live(of->kn); 1222 - if (!rdtgrp) { 1223 - rdtgroup_kn_unlock(of->kn); 1224 - return -ENOENT; 1225 - } 1226 - 1227 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1228 - if (!rdtgrp->plr->d) { 1229 - rdt_last_cmd_clear(); 1230 - rdt_last_cmd_puts("Cache domain offline\n"); 1231 - ret = -ENODEV; 1232 - } else { 1233 - seq_printf(s, "%*s:", max_name_width, 1234 - rdtgrp->plr->s->name); 1235 - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, 1236 - rdtgrp->plr->d, 1237 - rdtgrp->plr->cbm); 1238 - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); 1239 - } 1240 - goto out; 1241 - } 1242 - 1243 - closid = rdtgrp->closid; 1244 - 1245 - list_for_each_entry(schema, &resctrl_schema_all, list) { 1246 - r = schema->res; 1247 - 
type = schema->conf_type; 1248 - sep = false; 1249 - seq_printf(s, "%*s:", max_name_width, schema->name); 1250 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1251 - if (sep) 1252 - seq_putc(s, ';'); 1253 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1254 - size = 0; 1255 - } else { 1256 - if (is_mba_sc(r)) 1257 - ctrl = d->mbps_val[closid]; 1258 - else 1259 - ctrl = resctrl_arch_get_config(r, d, 1260 - closid, 1261 - type); 1262 - if (r->rid == RDT_RESOURCE_MBA || 1263 - r->rid == RDT_RESOURCE_SMBA) 1264 - size = ctrl; 1265 - else 1266 - size = rdtgroup_cbm_to_size(r, d, ctrl); 1267 - } 1268 - seq_printf(s, "%d=%u", d->hdr.id, size); 1269 - sep = true; 1270 - } 1271 - seq_putc(s, '\n'); 1272 - } 1273 - 1274 - out: 1275 - rdtgroup_kn_unlock(of->kn); 1276 - 1277 - return ret; 385 + resctrl_arch_sched_in(current); 1278 386 } 1279 387 1280 388 #define INVALID_CONFIG_INDEX UINT_MAX ··· 102 1642 config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; 103 1643 } 104 1644 105 - static void mondata_config_read(struct resctrl_mon_config_info *mon_info) 106 - { 107 - smp_call_function_any(&mon_info->d->hdr.cpu_mask, 108 - resctrl_arch_mon_event_config_read, mon_info, 1); 109 - } 110 - 111 - static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) 112 - { 113 - struct resctrl_mon_config_info mon_info; 114 - struct rdt_mon_domain *dom; 115 - bool sep = false; 116 - 117 - cpus_read_lock(); 118 - mutex_lock(&rdtgroup_mutex); 119 - 120 - list_for_each_entry(dom, &r->mon_domains, hdr.list) { 121 - if (sep) 122 - seq_puts(s, ";"); 123 - 124 - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); 125 - mon_info.r = r; 126 - mon_info.d = dom; 127 - mon_info.evtid = evtid; 128 - mondata_config_read(&mon_info); 129 - 130 - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); 131 - sep = true; 132 - } 133 - seq_puts(s, "\n"); 134 - 135 - mutex_unlock(&rdtgroup_mutex); 136 - cpus_read_unlock(); 137 - 138 - return 0; 139 - } 140 - 141 
- static int mbm_total_bytes_config_show(struct kernfs_open_file *of, 142 - struct seq_file *seq, void *v) 143 - { 144 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 145 - 146 - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); 147 - 148 - return 0; 149 - } 150 - 151 - static int mbm_local_bytes_config_show(struct kernfs_open_file *of, 152 - struct seq_file *seq, void *v) 153 - { 154 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 155 - 156 - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); 157 - 158 - return 0; 159 - } 160 - 161 1645 void resctrl_arch_mon_event_config_write(void *_config_info) 162 1646 { 163 1647 struct resctrl_mon_config_info *config_info = _config_info; ··· 113 1709 return; 114 1710 } 115 1711 wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); 116 - } 117 - 118 - static void mbm_config_write_domain(struct rdt_resource *r, 119 - struct rdt_mon_domain *d, u32 evtid, u32 val) 120 - { 121 - struct resctrl_mon_config_info mon_info = {0}; 122 - 123 - /* 124 - * Read the current config value first. If both are the same then 125 - * no need to write it again. 126 - */ 127 - mon_info.r = r; 128 - mon_info.d = d; 129 - mon_info.evtid = evtid; 130 - mondata_config_read(&mon_info); 131 - if (mon_info.mon_config == val) 132 - return; 133 - 134 - mon_info.mon_config = val; 135 - 136 - /* 137 - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the 138 - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE 139 - * are scoped at the domain level. Writing any of these MSRs 140 - * on one CPU is observed by all the CPUs in the domain. 141 - */ 142 - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, 143 - &mon_info, 1); 144 - 145 - /* 146 - * When an Event Configuration is changed, the bandwidth counters 147 - * for all RMIDs and Events will be cleared by the hardware. 
The 148 - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for 149 - * every RMID on the next read to any event for every RMID. 150 - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) 151 - * cleared while it is tracked by the hardware. Clear the 152 - * mbm_local and mbm_total counts for all the RMIDs. 153 - */ 154 - resctrl_arch_reset_rmid_all(r, d); 155 - } 156 - 157 - static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) 158 - { 159 - char *dom_str = NULL, *id_str; 160 - unsigned long dom_id, val; 161 - struct rdt_mon_domain *d; 162 - 163 - /* Walking r->domains, ensure it can't race with cpuhp */ 164 - lockdep_assert_cpus_held(); 165 - 166 - next: 167 - if (!tok || tok[0] == '\0') 168 - return 0; 169 - 170 - /* Start processing the strings for each domain */ 171 - dom_str = strim(strsep(&tok, ";")); 172 - id_str = strsep(&dom_str, "="); 173 - 174 - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { 175 - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); 176 - return -EINVAL; 177 - } 178 - 179 - if (!dom_str || kstrtoul(dom_str, 16, &val)) { 180 - rdt_last_cmd_puts("Non-numeric event configuration value\n"); 181 - return -EINVAL; 182 - } 183 - 184 - /* Value from user cannot be more than the supported set of events */ 185 - if ((val & r->mbm_cfg_mask) != val) { 186 - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", 187 - r->mbm_cfg_mask); 188 - return -EINVAL; 189 - } 190 - 191 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 192 - if (d->hdr.id == dom_id) { 193 - mbm_config_write_domain(r, d, evtid, val); 194 - goto next; 195 - } 196 - } 197 - 198 - return -EINVAL; 199 - } 200 - 201 - static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, 202 - char *buf, size_t nbytes, 203 - loff_t off) 204 - { 205 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 206 - int ret; 207 - 208 - /* Valid input requires a trailing newline */ 209 - if (nbytes == 0 || 
buf[nbytes - 1] != '\n') 210 - return -EINVAL; 211 - 212 - cpus_read_lock(); 213 - mutex_lock(&rdtgroup_mutex); 214 - 215 - rdt_last_cmd_clear(); 216 - 217 - buf[nbytes - 1] = '\0'; 218 - 219 - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); 220 - 221 - mutex_unlock(&rdtgroup_mutex); 222 - cpus_read_unlock(); 223 - 224 - return ret ?: nbytes; 225 - } 226 - 227 - static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, 228 - char *buf, size_t nbytes, 229 - loff_t off) 230 - { 231 - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 232 - int ret; 233 - 234 - /* Valid input requires a trailing newline */ 235 - if (nbytes == 0 || buf[nbytes - 1] != '\n') 236 - return -EINVAL; 237 - 238 - cpus_read_lock(); 239 - mutex_lock(&rdtgroup_mutex); 240 - 241 - rdt_last_cmd_clear(); 242 - 243 - buf[nbytes - 1] = '\0'; 244 - 245 - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); 246 - 247 - mutex_unlock(&rdtgroup_mutex); 248 - cpus_read_unlock(); 249 - 250 - return ret ?: nbytes; 251 - } 252 - 253 - /* rdtgroup information files for one cache resource. 
*/ 254 - static struct rftype res_common_files[] = { 255 - { 256 - .name = "last_cmd_status", 257 - .mode = 0444, 258 - .kf_ops = &rdtgroup_kf_single_ops, 259 - .seq_show = rdt_last_cmd_status_show, 260 - .fflags = RFTYPE_TOP_INFO, 261 - }, 262 - { 263 - .name = "num_closids", 264 - .mode = 0444, 265 - .kf_ops = &rdtgroup_kf_single_ops, 266 - .seq_show = rdt_num_closids_show, 267 - .fflags = RFTYPE_CTRL_INFO, 268 - }, 269 - { 270 - .name = "mon_features", 271 - .mode = 0444, 272 - .kf_ops = &rdtgroup_kf_single_ops, 273 - .seq_show = rdt_mon_features_show, 274 - .fflags = RFTYPE_MON_INFO, 275 - }, 276 - { 277 - .name = "num_rmids", 278 - .mode = 0444, 279 - .kf_ops = &rdtgroup_kf_single_ops, 280 - .seq_show = rdt_num_rmids_show, 281 - .fflags = RFTYPE_MON_INFO, 282 - }, 283 - { 284 - .name = "cbm_mask", 285 - .mode = 0444, 286 - .kf_ops = &rdtgroup_kf_single_ops, 287 - .seq_show = rdt_default_ctrl_show, 288 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 289 - }, 290 - { 291 - .name = "min_cbm_bits", 292 - .mode = 0444, 293 - .kf_ops = &rdtgroup_kf_single_ops, 294 - .seq_show = rdt_min_cbm_bits_show, 295 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 296 - }, 297 - { 298 - .name = "shareable_bits", 299 - .mode = 0444, 300 - .kf_ops = &rdtgroup_kf_single_ops, 301 - .seq_show = rdt_shareable_bits_show, 302 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 303 - }, 304 - { 305 - .name = "bit_usage", 306 - .mode = 0444, 307 - .kf_ops = &rdtgroup_kf_single_ops, 308 - .seq_show = rdt_bit_usage_show, 309 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 310 - }, 311 - { 312 - .name = "min_bandwidth", 313 - .mode = 0444, 314 - .kf_ops = &rdtgroup_kf_single_ops, 315 - .seq_show = rdt_min_bw_show, 316 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 317 - }, 318 - { 319 - .name = "bandwidth_gran", 320 - .mode = 0444, 321 - .kf_ops = &rdtgroup_kf_single_ops, 322 - .seq_show = rdt_bw_gran_show, 323 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 324 - }, 325 - { 326 - .name = 
"delay_linear", 327 - .mode = 0444, 328 - .kf_ops = &rdtgroup_kf_single_ops, 329 - .seq_show = rdt_delay_linear_show, 330 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 331 - }, 332 - /* 333 - * Platform specific which (if any) capabilities are provided by 334 - * thread_throttle_mode. Defer "fflags" initialization to platform 335 - * discovery. 336 - */ 337 - { 338 - .name = "thread_throttle_mode", 339 - .mode = 0444, 340 - .kf_ops = &rdtgroup_kf_single_ops, 341 - .seq_show = rdt_thread_throttle_mode_show, 342 - }, 343 - { 344 - .name = "max_threshold_occupancy", 345 - .mode = 0644, 346 - .kf_ops = &rdtgroup_kf_single_ops, 347 - .write = max_threshold_occ_write, 348 - .seq_show = max_threshold_occ_show, 349 - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, 350 - }, 351 - { 352 - .name = "mbm_total_bytes_config", 353 - .mode = 0644, 354 - .kf_ops = &rdtgroup_kf_single_ops, 355 - .seq_show = mbm_total_bytes_config_show, 356 - .write = mbm_total_bytes_config_write, 357 - }, 358 - { 359 - .name = "mbm_local_bytes_config", 360 - .mode = 0644, 361 - .kf_ops = &rdtgroup_kf_single_ops, 362 - .seq_show = mbm_local_bytes_config_show, 363 - .write = mbm_local_bytes_config_write, 364 - }, 365 - { 366 - .name = "cpus", 367 - .mode = 0644, 368 - .kf_ops = &rdtgroup_kf_single_ops, 369 - .write = rdtgroup_cpus_write, 370 - .seq_show = rdtgroup_cpus_show, 371 - .fflags = RFTYPE_BASE, 372 - }, 373 - { 374 - .name = "cpus_list", 375 - .mode = 0644, 376 - .kf_ops = &rdtgroup_kf_single_ops, 377 - .write = rdtgroup_cpus_write, 378 - .seq_show = rdtgroup_cpus_show, 379 - .flags = RFTYPE_FLAGS_CPUS_LIST, 380 - .fflags = RFTYPE_BASE, 381 - }, 382 - { 383 - .name = "tasks", 384 - .mode = 0644, 385 - .kf_ops = &rdtgroup_kf_single_ops, 386 - .write = rdtgroup_tasks_write, 387 - .seq_show = rdtgroup_tasks_show, 388 - .fflags = RFTYPE_BASE, 389 - }, 390 - { 391 - .name = "mon_hw_id", 392 - .mode = 0444, 393 - .kf_ops = &rdtgroup_kf_single_ops, 394 - .seq_show = rdtgroup_rmid_show, 395 - .fflags = 
RFTYPE_MON_BASE | RFTYPE_DEBUG, 396 - }, 397 - { 398 - .name = "schemata", 399 - .mode = 0644, 400 - .kf_ops = &rdtgroup_kf_single_ops, 401 - .write = rdtgroup_schemata_write, 402 - .seq_show = rdtgroup_schemata_show, 403 - .fflags = RFTYPE_CTRL_BASE, 404 - }, 405 - { 406 - .name = "mba_MBps_event", 407 - .mode = 0644, 408 - .kf_ops = &rdtgroup_kf_single_ops, 409 - .write = rdtgroup_mba_mbps_event_write, 410 - .seq_show = rdtgroup_mba_mbps_event_show, 411 - }, 412 - { 413 - .name = "mode", 414 - .mode = 0644, 415 - .kf_ops = &rdtgroup_kf_single_ops, 416 - .write = rdtgroup_mode_write, 417 - .seq_show = rdtgroup_mode_show, 418 - .fflags = RFTYPE_CTRL_BASE, 419 - }, 420 - { 421 - .name = "size", 422 - .mode = 0444, 423 - .kf_ops = &rdtgroup_kf_single_ops, 424 - .seq_show = rdtgroup_size_show, 425 - .fflags = RFTYPE_CTRL_BASE, 426 - }, 427 - { 428 - .name = "sparse_masks", 429 - .mode = 0444, 430 - .kf_ops = &rdtgroup_kf_single_ops, 431 - .seq_show = rdt_has_sparse_bitmasks_show, 432 - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 433 - }, 434 - { 435 - .name = "ctrl_hw_id", 436 - .mode = 0444, 437 - .kf_ops = &rdtgroup_kf_single_ops, 438 - .seq_show = rdtgroup_closid_show, 439 - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, 440 - }, 441 - 442 - }; 443 - 444 - static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) 445 - { 446 - struct rftype *rfts, *rft; 447 - int ret, len; 448 - 449 - rfts = res_common_files; 450 - len = ARRAY_SIZE(res_common_files); 451 - 452 - lockdep_assert_held(&rdtgroup_mutex); 453 - 454 - if (resctrl_debug) 455 - fflags |= RFTYPE_DEBUG; 456 - 457 - for (rft = rfts; rft < rfts + len; rft++) { 458 - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { 459 - ret = rdtgroup_add_file(kn, rft); 460 - if (ret) 461 - goto error; 462 - } 463 - } 464 - 465 - return 0; 466 - error: 467 - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); 468 - while (--rft >= rfts) { 469 - if ((fflags & rft->fflags) == rft->fflags) 470 - 
kernfs_remove_by_name(kn, rft->name); 471 - } 472 - return ret; 473 - } 474 - 475 - static struct rftype *rdtgroup_get_rftype_by_name(const char *name) 476 - { 477 - struct rftype *rfts, *rft; 478 - int len; 479 - 480 - rfts = res_common_files; 481 - len = ARRAY_SIZE(res_common_files); 482 - 483 - for (rft = rfts; rft < rfts + len; rft++) { 484 - if (!strcmp(rft->name, name)) 485 - return rft; 486 - } 487 - 488 - return NULL; 489 - } 490 - 491 - static void thread_throttle_mode_init(void) 492 - { 493 - enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; 494 - struct rdt_resource *r_mba, *r_smba; 495 - 496 - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 497 - if (r_mba->alloc_capable && 498 - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 499 - throttle_mode = r_mba->membw.throttle_mode; 500 - 501 - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); 502 - if (r_smba->alloc_capable && 503 - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 504 - throttle_mode = r_smba->membw.throttle_mode; 505 - 506 - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) 507 - return; 508 - 509 - resctrl_file_fflags_init("thread_throttle_mode", 510 - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); 511 - } 512 - 513 - void resctrl_file_fflags_init(const char *config, unsigned long fflags) 514 - { 515 - struct rftype *rft; 516 - 517 - rft = rdtgroup_get_rftype_by_name(config); 518 - if (rft) 519 - rft->fflags = fflags; 520 - } 521 - 522 - /** 523 - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file 524 - * @r: The resource group with which the file is associated. 525 - * @name: Name of the file 526 - * 527 - * The permissions of named resctrl file, directory, or link are modified 528 - * to not allow read, write, or execute by any user. 
529 - * 530 - * WARNING: This function is intended to communicate to the user that the 531 - * resctrl file has been locked down - that it is not relevant to the 532 - * particular state the system finds itself in. It should not be relied 533 - * on to protect from user access because after the file's permissions 534 - * are restricted the user can still change the permissions using chmod 535 - * from the command line. 536 - * 537 - * Return: 0 on success, <0 on failure. 538 - */ 539 - int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) 540 - { 541 - struct iattr iattr = {.ia_valid = ATTR_MODE,}; 542 - struct kernfs_node *kn; 543 - int ret = 0; 544 - 545 - kn = kernfs_find_and_get_ns(r->kn, name, NULL); 546 - if (!kn) 547 - return -ENOENT; 548 - 549 - switch (kernfs_type(kn)) { 550 - case KERNFS_DIR: 551 - iattr.ia_mode = S_IFDIR; 552 - break; 553 - case KERNFS_FILE: 554 - iattr.ia_mode = S_IFREG; 555 - break; 556 - case KERNFS_LINK: 557 - iattr.ia_mode = S_IFLNK; 558 - break; 559 - } 560 - 561 - ret = kernfs_setattr(kn, &iattr); 562 - kernfs_put(kn); 563 - return ret; 564 - } 565 - 566 - /** 567 - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file 568 - * @r: The resource group with which the file is associated. 569 - * @name: Name of the file 570 - * @mask: Mask of permissions that should be restored 571 - * 572 - * Restore the permissions of the named file. If @name is a directory the 573 - * permissions of its parent will be used. 574 - * 575 - * Return: 0 on success, <0 on failure. 
576 - */ 577 - int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 578 - umode_t mask) 579 - { 580 - struct iattr iattr = {.ia_valid = ATTR_MODE,}; 581 - struct kernfs_node *kn, *parent; 582 - struct rftype *rfts, *rft; 583 - int ret, len; 584 - 585 - rfts = res_common_files; 586 - len = ARRAY_SIZE(res_common_files); 587 - 588 - for (rft = rfts; rft < rfts + len; rft++) { 589 - if (!strcmp(rft->name, name)) 590 - iattr.ia_mode = rft->mode & mask; 591 - } 592 - 593 - kn = kernfs_find_and_get_ns(r->kn, name, NULL); 594 - if (!kn) 595 - return -ENOENT; 596 - 597 - switch (kernfs_type(kn)) { 598 - case KERNFS_DIR: 599 - parent = kernfs_get_parent(kn); 600 - if (parent) { 601 - iattr.ia_mode |= parent->mode; 602 - kernfs_put(parent); 603 - } 604 - iattr.ia_mode |= S_IFDIR; 605 - break; 606 - case KERNFS_FILE: 607 - iattr.ia_mode |= S_IFREG; 608 - break; 609 - case KERNFS_LINK: 610 - iattr.ia_mode |= S_IFLNK; 611 - break; 612 - } 613 - 614 - ret = kernfs_setattr(kn, &iattr); 615 - kernfs_put(kn); 616 - return ret; 617 - } 618 - 619 - static int rdtgroup_mkdir_info_resdir(void *priv, char *name, 620 - unsigned long fflags) 621 - { 622 - struct kernfs_node *kn_subdir; 623 - int ret; 624 - 625 - kn_subdir = kernfs_create_dir(kn_info, name, 626 - kn_info->mode, priv); 627 - if (IS_ERR(kn_subdir)) 628 - return PTR_ERR(kn_subdir); 629 - 630 - ret = rdtgroup_kn_set_ugid(kn_subdir); 631 - if (ret) 632 - return ret; 633 - 634 - ret = rdtgroup_add_files(kn_subdir, fflags); 635 - if (!ret) 636 - kernfs_activate(kn_subdir); 637 - 638 - return ret; 639 - } 640 - 641 - static unsigned long fflags_from_resource(struct rdt_resource *r) 642 - { 643 - switch (r->rid) { 644 - case RDT_RESOURCE_L3: 645 - case RDT_RESOURCE_L2: 646 - return RFTYPE_RES_CACHE; 647 - case RDT_RESOURCE_MBA: 648 - case RDT_RESOURCE_SMBA: 649 - return RFTYPE_RES_MB; 650 - } 651 - 652 - return WARN_ON_ONCE(1); 653 - } 654 - 655 - static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 656 - 
{ 657 - struct resctrl_schema *s; 658 - struct rdt_resource *r; 659 - unsigned long fflags; 660 - char name[32]; 661 - int ret; 662 - 663 - /* create the directory */ 664 - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 665 - if (IS_ERR(kn_info)) 666 - return PTR_ERR(kn_info); 667 - 668 - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); 669 - if (ret) 670 - goto out_destroy; 671 - 672 - /* loop over enabled controls, these are all alloc_capable */ 673 - list_for_each_entry(s, &resctrl_schema_all, list) { 674 - r = s->res; 675 - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; 676 - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); 677 - if (ret) 678 - goto out_destroy; 679 - } 680 - 681 - for_each_mon_capable_rdt_resource(r) { 682 - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; 683 - sprintf(name, "%s_MON", r->name); 684 - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); 685 - if (ret) 686 - goto out_destroy; 687 - } 688 - 689 - ret = rdtgroup_kn_set_ugid(kn_info); 690 - if (ret) 691 - goto out_destroy; 692 - 693 - kernfs_activate(kn_info); 694 - 695 - return 0; 696 - 697 - out_destroy: 698 - kernfs_remove(kn_info); 699 - return ret; 700 - } 701 - 702 - static int 703 - mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, 704 - char *name, struct kernfs_node **dest_kn) 705 - { 706 - struct kernfs_node *kn; 707 - int ret; 708 - 709 - /* create the directory */ 710 - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 711 - if (IS_ERR(kn)) 712 - return PTR_ERR(kn); 713 - 714 - if (dest_kn) 715 - *dest_kn = kn; 716 - 717 - ret = rdtgroup_kn_set_ugid(kn); 718 - if (ret) 719 - goto out_destroy; 720 - 721 - kernfs_activate(kn); 722 - 723 - return 0; 724 - 725 - out_destroy: 726 - kernfs_remove(kn); 727 - return ret; 728 1712 } 729 1713 730 1714 static void l3_qos_cfg_update(void *arg) ··· 127 2335 bool *enable = arg; 128 2336 129 2337 wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? 
L2_QOS_CDP_ENABLE : 0ULL); 130 - } 131 - 132 - static inline bool is_mba_linear(void) 133 - { 134 - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; 135 2338 } 136 2339 137 2340 static int set_cache_qos_cfg(int level, bool enable) ··· 184 2397 l3_qos_cfg_update(&hw_res->cdp_enabled); 185 2398 } 186 2399 187 - static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) 188 - { 189 - u32 num_closid = resctrl_arch_get_num_closid(r); 190 - int cpu = cpumask_any(&d->hdr.cpu_mask); 191 - int i; 192 - 193 - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), 194 - GFP_KERNEL, cpu_to_node(cpu)); 195 - if (!d->mbps_val) 196 - return -ENOMEM; 197 - 198 - for (i = 0; i < num_closid; i++) 199 - d->mbps_val[i] = MBA_MAX_MBPS; 200 - 201 - return 0; 202 - } 203 - 204 - static void mba_sc_domain_destroy(struct rdt_resource *r, 205 - struct rdt_ctrl_domain *d) 206 - { 207 - kfree(d->mbps_val); 208 - d->mbps_val = NULL; 209 - } 210 - 211 - /* 212 - * MBA software controller is supported only if 213 - * MBM is supported and MBA is in linear scale, 214 - * and the MBM monitor scope is the same as MBA 215 - * control scope. 216 - */ 217 - static bool supports_mba_mbps(void) 218 - { 219 - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); 220 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 221 - 222 - return (resctrl_is_mbm_enabled() && 223 - r->alloc_capable && is_mba_linear() && 224 - r->ctrl_scope == rmbm->mon_scope); 225 - } 226 - 227 - /* 228 - * Enable or disable the MBA software controller 229 - * which helps user specify bandwidth in MBps. 
230 - */ 231 - static int set_mba_sc(bool mba_sc) 232 - { 233 - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 234 - u32 num_closid = resctrl_arch_get_num_closid(r); 235 - struct rdt_ctrl_domain *d; 236 - unsigned long fflags; 237 - int i; 238 - 239 - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) 240 - return -EINVAL; 241 - 242 - r->membw.mba_sc = mba_sc; 243 - 244 - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; 245 - 246 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 247 - for (i = 0; i < num_closid; i++) 248 - d->mbps_val[i] = MBA_MAX_MBPS; 249 - } 250 - 251 - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; 252 - resctrl_file_fflags_init("mba_MBps_event", fflags); 253 - 254 - return 0; 255 - } 256 - 257 2400 static int cdp_enable(int level) 258 2401 { 259 2402 struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; ··· 224 2507 return 0; 225 2508 } 226 2509 227 - /* 228 - * We don't allow rdtgroup directories to be created anywhere 229 - * except the root directory. Thus when looking for the rdtgroup 230 - * structure for a kernfs node we are either looking at a directory, 231 - * in which case the rdtgroup structure is pointed at by the "priv" 232 - * field, otherwise we have a file, and need only look to the parent 233 - * to find the rdtgroup. 234 - */ 235 - static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) 2510 + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) 236 2511 { 237 - if (kernfs_type(kn) == KERNFS_DIR) { 238 - /* 239 - * All the resource directories use "kn->priv" 240 - * to point to the "struct rdtgroup" for the 241 - * resource. "info" and its subdirectories don't 242 - * have rdtgroup structures, so return NULL here. 
243 - */ 244 - if (kn == kn_info || 245 - rcu_access_pointer(kn->__parent) == kn_info) 246 - return NULL; 247 - else 248 - return kn->priv; 249 - } else { 250 - return rdt_kn_parent_priv(kn); 251 - } 252 - } 253 - 254 - static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 255 - { 256 - atomic_inc(&rdtgrp->waitcount); 257 - kernfs_break_active_protection(kn); 258 - } 259 - 260 - static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 261 - { 262 - if (atomic_dec_and_test(&rdtgrp->waitcount) && 263 - (rdtgrp->flags & RDT_DELETED)) { 264 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 265 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 266 - rdtgroup_pseudo_lock_remove(rdtgrp); 267 - kernfs_unbreak_active_protection(kn); 268 - rdtgroup_remove(rdtgrp); 269 - } else { 270 - kernfs_unbreak_active_protection(kn); 271 - } 272 - } 273 - 274 - struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) 275 - { 276 - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 277 - 278 - if (!rdtgrp) 279 - return NULL; 280 - 281 - rdtgroup_kn_get(rdtgrp, kn); 282 - 283 - cpus_read_lock(); 284 - mutex_lock(&rdtgroup_mutex); 285 - 286 - /* Was this group deleted while we waited? 
*/ 287 - if (rdtgrp->flags & RDT_DELETED) 288 - return NULL; 289 - 290 - return rdtgrp; 291 - } 292 - 293 - void rdtgroup_kn_unlock(struct kernfs_node *kn) 294 - { 295 - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 296 - 297 - if (!rdtgrp) 298 - return; 299 - 300 - mutex_unlock(&rdtgroup_mutex); 301 - cpus_read_unlock(); 302 - 303 - rdtgroup_kn_put(rdtgrp, kn); 304 - } 305 - 306 - static int mkdir_mondata_all(struct kernfs_node *parent_kn, 307 - struct rdtgroup *prgrp, 308 - struct kernfs_node **mon_data_kn); 309 - 310 - static void rdt_disable_ctx(void) 311 - { 312 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 313 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 314 - set_mba_sc(false); 315 - 316 - resctrl_debug = false; 317 - } 318 - 319 - static int rdt_enable_ctx(struct rdt_fs_context *ctx) 320 - { 321 - int ret = 0; 322 - 323 - if (ctx->enable_cdpl2) { 324 - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); 325 - if (ret) 326 - goto out_done; 327 - } 328 - 329 - if (ctx->enable_cdpl3) { 330 - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); 331 - if (ret) 332 - goto out_cdpl2; 333 - } 334 - 335 - if (ctx->enable_mba_mbps) { 336 - ret = set_mba_sc(true); 337 - if (ret) 338 - goto out_cdpl3; 339 - } 340 - 341 - if (ctx->enable_debug) 342 - resctrl_debug = true; 343 - 344 - return 0; 345 - 346 - out_cdpl3: 347 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 348 - out_cdpl2: 349 - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 350 - out_done: 351 - return ret; 352 - } 353 - 354 - static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) 355 - { 356 - struct resctrl_schema *s; 357 - const char *suffix = ""; 358 - int ret, cl; 359 - 360 - s = kzalloc(sizeof(*s), GFP_KERNEL); 361 - if (!s) 362 - return -ENOMEM; 363 - 364 - s->res = r; 365 - s->num_closid = resctrl_arch_get_num_closid(r); 366 - if (resctrl_arch_get_cdp_enabled(r->rid)) 367 - s->num_closid /= 2; 368 - 369 - s->conf_type = 
type; 370 - switch (type) { 371 - case CDP_CODE: 372 - suffix = "CODE"; 373 - break; 374 - case CDP_DATA: 375 - suffix = "DATA"; 376 - break; 377 - case CDP_NONE: 378 - suffix = ""; 379 - break; 380 - } 381 - 382 - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); 383 - if (ret >= sizeof(s->name)) { 384 - kfree(s); 385 - return -EINVAL; 386 - } 387 - 388 - cl = strlen(s->name); 389 - 390 - /* 391 - * If CDP is supported by this resource, but not enabled, 392 - * include the suffix. This ensures the tabular format of the 393 - * schemata file does not change between mounts of the filesystem. 394 - */ 395 - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) 396 - cl += 4; 397 - 398 - if (cl > max_name_width) 399 - max_name_width = cl; 400 - 401 - switch (r->schema_fmt) { 402 - case RESCTRL_SCHEMA_BITMAP: 403 - s->fmt_str = "%d=%x"; 404 - break; 405 - case RESCTRL_SCHEMA_RANGE: 406 - s->fmt_str = "%d=%u"; 407 - break; 408 - } 409 - 410 - if (WARN_ON_ONCE(!s->fmt_str)) { 411 - kfree(s); 412 - return -EINVAL; 413 - } 414 - 415 - INIT_LIST_HEAD(&s->list); 416 - list_add(&s->list, &resctrl_schema_all); 417 - 418 - return 0; 419 - } 420 - 421 - static int schemata_list_create(void) 422 - { 423 - struct rdt_resource *r; 424 - int ret = 0; 425 - 426 - for_each_alloc_capable_rdt_resource(r) { 427 - if (resctrl_arch_get_cdp_enabled(r->rid)) { 428 - ret = schemata_list_add(r, CDP_CODE); 429 - if (ret) 430 - break; 431 - 432 - ret = schemata_list_add(r, CDP_DATA); 433 - } else { 434 - ret = schemata_list_add(r, CDP_NONE); 435 - } 436 - 437 - if (ret) 438 - break; 439 - } 440 - 441 - return ret; 442 - } 443 - 444 - static void schemata_list_destroy(void) 445 - { 446 - struct resctrl_schema *s, *tmp; 447 - 448 - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { 449 - list_del(&s->list); 450 - kfree(s); 451 - } 452 - } 453 - 454 - static int rdt_get_tree(struct fs_context *fc) 455 - { 456 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 457 
- unsigned long flags = RFTYPE_CTRL_BASE; 458 - struct rdt_mon_domain *dom; 459 - struct rdt_resource *r; 460 - int ret; 461 - 462 - cpus_read_lock(); 463 - mutex_lock(&rdtgroup_mutex); 464 - /* 465 - * resctrl file system can only be mounted once. 466 - */ 467 - if (resctrl_mounted) { 468 - ret = -EBUSY; 469 - goto out; 470 - } 471 - 472 - ret = rdtgroup_setup_root(ctx); 473 - if (ret) 474 - goto out; 475 - 476 - ret = rdt_enable_ctx(ctx); 477 - if (ret) 478 - goto out_root; 479 - 480 - ret = schemata_list_create(); 481 - if (ret) { 482 - schemata_list_destroy(); 483 - goto out_ctx; 484 - } 485 - 486 - closid_init(); 487 - 488 - if (resctrl_arch_mon_capable()) 489 - flags |= RFTYPE_MON; 490 - 491 - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); 492 - if (ret) 493 - goto out_schemata_free; 494 - 495 - kernfs_activate(rdtgroup_default.kn); 496 - 497 - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); 498 - if (ret < 0) 499 - goto out_schemata_free; 500 - 501 - if (resctrl_arch_mon_capable()) { 502 - ret = mongroup_create_dir(rdtgroup_default.kn, 503 - &rdtgroup_default, "mon_groups", 504 - &kn_mongrp); 505 - if (ret < 0) 506 - goto out_info; 507 - 508 - ret = mkdir_mondata_all(rdtgroup_default.kn, 509 - &rdtgroup_default, &kn_mondata); 510 - if (ret < 0) 511 - goto out_mongrp; 512 - rdtgroup_default.mon.mon_data_kn = kn_mondata; 513 - } 514 - 515 - ret = rdt_pseudo_lock_init(); 516 - if (ret) 517 - goto out_mondata; 518 - 519 - ret = kernfs_get_tree(fc); 520 - if (ret < 0) 521 - goto out_psl; 522 - 523 - if (resctrl_arch_alloc_capable()) 524 - resctrl_arch_enable_alloc(); 525 - if (resctrl_arch_mon_capable()) 526 - resctrl_arch_enable_mon(); 527 - 528 - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) 529 - resctrl_mounted = true; 530 - 531 - if (resctrl_is_mbm_enabled()) { 532 - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 533 - list_for_each_entry(dom, &r->mon_domains, hdr.list) 534 - mbm_setup_overflow_handler(dom, 
MBM_OVERFLOW_INTERVAL, 535 - RESCTRL_PICK_ANY_CPU); 536 - } 537 - 538 - goto out; 539 - 540 - out_psl: 541 - rdt_pseudo_lock_release(); 542 - out_mondata: 543 - if (resctrl_arch_mon_capable()) 544 - kernfs_remove(kn_mondata); 545 - out_mongrp: 546 - if (resctrl_arch_mon_capable()) 547 - kernfs_remove(kn_mongrp); 548 - out_info: 549 - kernfs_remove(kn_info); 550 - out_schemata_free: 551 - schemata_list_destroy(); 552 - out_ctx: 553 - rdt_disable_ctx(); 554 - out_root: 555 - rdtgroup_destroy_root(); 556 - out: 557 - rdt_last_cmd_clear(); 558 - mutex_unlock(&rdtgroup_mutex); 559 - cpus_read_unlock(); 560 - return ret; 561 - } 562 - 563 - enum rdt_param { 564 - Opt_cdp, 565 - Opt_cdpl2, 566 - Opt_mba_mbps, 567 - Opt_debug, 568 - nr__rdt_params 569 - }; 570 - 571 - static const struct fs_parameter_spec rdt_fs_parameters[] = { 572 - fsparam_flag("cdp", Opt_cdp), 573 - fsparam_flag("cdpl2", Opt_cdpl2), 574 - fsparam_flag("mba_MBps", Opt_mba_mbps), 575 - fsparam_flag("debug", Opt_debug), 576 - {} 577 - }; 578 - 579 - static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) 580 - { 581 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 582 - struct fs_parse_result result; 583 - const char *msg; 584 - int opt; 585 - 586 - opt = fs_parse(fc, rdt_fs_parameters, param, &result); 587 - if (opt < 0) 588 - return opt; 589 - 590 - switch (opt) { 591 - case Opt_cdp: 592 - ctx->enable_cdpl3 = true; 593 - return 0; 594 - case Opt_cdpl2: 595 - ctx->enable_cdpl2 = true; 596 - return 0; 597 - case Opt_mba_mbps: 598 - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; 599 - if (!supports_mba_mbps()) 600 - return invalfc(fc, msg); 601 - ctx->enable_mba_mbps = true; 602 - return 0; 603 - case Opt_debug: 604 - ctx->enable_debug = true; 605 - return 0; 606 - } 607 - 608 - return -EINVAL; 609 - } 610 - 611 - static void rdt_fs_context_free(struct fs_context *fc) 612 - { 613 - struct rdt_fs_context *ctx = rdt_fc2context(fc); 614 - 615 - 
kernfs_free_fs_context(fc); 616 - kfree(ctx); 617 - } 618 - 619 - static const struct fs_context_operations rdt_fs_context_ops = { 620 - .free = rdt_fs_context_free, 621 - .parse_param = rdt_parse_param, 622 - .get_tree = rdt_get_tree, 623 - }; 624 - 625 - static int rdt_init_fs_context(struct fs_context *fc) 626 - { 627 - struct rdt_fs_context *ctx; 628 - 629 - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); 630 - if (!ctx) 631 - return -ENOMEM; 632 - 633 - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; 634 - fc->fs_private = &ctx->kfc; 635 - fc->ops = &rdt_fs_context_ops; 636 - put_user_ns(fc->user_ns); 637 - fc->user_ns = get_user_ns(&init_user_ns); 638 - fc->global = true; 639 - return 0; 2512 + return rdt_resources_all[l].cdp_enabled; 640 2513 } 641 2514 642 2515 void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) ··· 259 2952 } 260 2953 261 2954 return; 262 - } 263 - 264 - /* 265 - * Move tasks from one to the other group. If @from is NULL, then all tasks 266 - * in the systems are moved unconditionally (used for teardown). 267 - * 268 - * If @mask is not NULL the cpus on which moved tasks are running are set 269 - * in that mask so the update smp function call is restricted to affected 270 - * cpus. 271 - */ 272 - static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, 273 - struct cpumask *mask) 274 - { 275 - struct task_struct *p, *t; 276 - 277 - read_lock(&tasklist_lock); 278 - for_each_process_thread(p, t) { 279 - if (!from || is_closid_match(t, from) || 280 - is_rmid_match(t, from)) { 281 - resctrl_arch_set_closid_rmid(t, to->closid, 282 - to->mon.rmid); 283 - 284 - /* 285 - * Order the closid/rmid stores above before the loads 286 - * in task_curr(). This pairs with the full barrier 287 - * between the rq->curr update and resctrl_sched_in() 288 - * during context switch. 289 - */ 290 - smp_mb(); 291 - 292 - /* 293 - * If the task is on a CPU, set the CPU in the mask. 
294 - * The detection is inaccurate as tasks might move or 295 - * schedule before the smp function call takes place. 296 - * In such a case the function call is pointless, but 297 - * there is no other side effect. 298 - */ 299 - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) 300 - cpumask_set_cpu(task_cpu(t), mask); 301 - } 302 - } 303 - read_unlock(&tasklist_lock); 304 - } 305 - 306 - static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) 307 - { 308 - struct rdtgroup *sentry, *stmp; 309 - struct list_head *head; 310 - 311 - head = &rdtgrp->mon.crdtgrp_list; 312 - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { 313 - free_rmid(sentry->closid, sentry->mon.rmid); 314 - list_del(&sentry->mon.crdtgrp_list); 315 - 316 - if (atomic_read(&sentry->waitcount) != 0) 317 - sentry->flags = RDT_DELETED; 318 - else 319 - rdtgroup_remove(sentry); 320 - } 321 - } 322 - 323 - /* 324 - * Forcibly remove all of subdirectories under root. 325 - */ 326 - static void rmdir_all_sub(void) 327 - { 328 - struct rdtgroup *rdtgrp, *tmp; 329 - 330 - /* Move all tasks to the default resource group */ 331 - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 332 - 333 - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 334 - /* Free any child rmids */ 335 - free_all_child_rdtgrp(rdtgrp); 336 - 337 - /* Remove each rdtgroup other than root */ 338 - if (rdtgrp == &rdtgroup_default) 339 - continue; 340 - 341 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 342 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 343 - rdtgroup_pseudo_lock_remove(rdtgrp); 344 - 345 - /* 346 - * Give any CPUs back to the default group. We cannot copy 347 - * cpu_online_mask because a CPU might have executed the 348 - * offline callback already, but is still marked online. 
349 - */ 350 - cpumask_or(&rdtgroup_default.cpu_mask, 351 - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 352 - 353 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 354 - 355 - kernfs_remove(rdtgrp->kn); 356 - list_del(&rdtgrp->rdtgroup_list); 357 - 358 - if (atomic_read(&rdtgrp->waitcount) != 0) 359 - rdtgrp->flags = RDT_DELETED; 360 - else 361 - rdtgroup_remove(rdtgrp); 362 - } 363 - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 364 - update_closid_rmid(cpu_online_mask, &rdtgroup_default); 365 - 366 - kernfs_remove(kn_info); 367 - kernfs_remove(kn_mongrp); 368 - kernfs_remove(kn_mondata); 369 - } 370 - 371 - static void rdt_kill_sb(struct super_block *sb) 372 - { 373 - struct rdt_resource *r; 374 - 375 - cpus_read_lock(); 376 - mutex_lock(&rdtgroup_mutex); 377 - 378 - rdt_disable_ctx(); 379 - 380 - /* Put everything back to default values. */ 381 - for_each_alloc_capable_rdt_resource(r) 382 - resctrl_arch_reset_all_ctrls(r); 383 - 384 - rmdir_all_sub(); 385 - rdt_pseudo_lock_release(); 386 - rdtgroup_default.mode = RDT_MODE_SHAREABLE; 387 - schemata_list_destroy(); 388 - rdtgroup_destroy_root(); 389 - if (resctrl_arch_alloc_capable()) 390 - resctrl_arch_disable_alloc(); 391 - if (resctrl_arch_mon_capable()) 392 - resctrl_arch_disable_mon(); 393 - resctrl_mounted = false; 394 - kernfs_kill_sb(sb); 395 - mutex_unlock(&rdtgroup_mutex); 396 - cpus_read_unlock(); 397 - } 398 - 399 - static struct file_system_type rdt_fs_type = { 400 - .name = "resctrl", 401 - .init_fs_context = rdt_init_fs_context, 402 - .parameters = rdt_fs_parameters, 403 - .kill_sb = rdt_kill_sb, 404 - }; 405 - 406 - static int mon_addfile(struct kernfs_node *parent_kn, const char *name, 407 - void *priv) 408 - { 409 - struct kernfs_node *kn; 410 - int ret = 0; 411 - 412 - kn = __kernfs_create_file(parent_kn, name, 0444, 413 - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, 414 - &kf_mondata_ops, priv, NULL, NULL); 415 - if (IS_ERR(kn)) 416 - return PTR_ERR(kn); 417 - 418 - ret = 
rdtgroup_kn_set_ugid(kn); 419 - if (ret) { 420 - kernfs_remove(kn); 421 - return ret; 422 - } 423 - 424 - return ret; 425 - } 426 - 427 - static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) 428 - { 429 - struct kernfs_node *kn; 430 - 431 - kn = kernfs_find_and_get(pkn, name); 432 - if (!kn) 433 - return; 434 - kernfs_put(kn); 435 - 436 - if (kn->dir.subdirs <= 1) 437 - kernfs_remove(kn); 438 - else 439 - kernfs_remove_by_name(kn, subname); 440 - } 441 - 442 - /* 443 - * Remove all subdirectories of mon_data of ctrl_mon groups 444 - * and monitor groups for the given domain. 445 - * Remove files and directories containing "sum" of domain data 446 - * when last domain being summed is removed. 447 - */ 448 - static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 449 - struct rdt_mon_domain *d) 450 - { 451 - struct rdtgroup *prgrp, *crgrp; 452 - char subname[32]; 453 - bool snc_mode; 454 - char name[32]; 455 - 456 - snc_mode = r->mon_scope == RESCTRL_L3_NODE; 457 - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 458 - if (snc_mode) 459 - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); 460 - 461 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 462 - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); 463 - 464 - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) 465 - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); 466 - } 467 - } 468 - 469 - static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, 470 - struct rdt_resource *r, struct rdtgroup *prgrp, 471 - bool do_sum) 472 - { 473 - struct rmid_read rr = {0}; 474 - union mon_data_bits priv; 475 - struct mon_evt *mevt; 476 - int ret; 477 - 478 - if (WARN_ON(list_empty(&r->evt_list))) 479 - return -EPERM; 480 - 481 - priv.u.rid = r->rid; 482 - priv.u.domid = do_sum ? 
d->ci->id : d->hdr.id; 483 - priv.u.sum = do_sum; 484 - list_for_each_entry(mevt, &r->evt_list, list) { 485 - priv.u.evtid = mevt->evtid; 486 - ret = mon_addfile(kn, mevt->name, priv.priv); 487 - if (ret) 488 - return ret; 489 - 490 - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) 491 - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); 492 - } 493 - 494 - return 0; 495 - } 496 - 497 - static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, 498 - struct rdt_mon_domain *d, 499 - struct rdt_resource *r, struct rdtgroup *prgrp) 500 - { 501 - struct kernfs_node *kn, *ckn; 502 - char name[32]; 503 - bool snc_mode; 504 - int ret = 0; 505 - 506 - lockdep_assert_held(&rdtgroup_mutex); 507 - 508 - snc_mode = r->mon_scope == RESCTRL_L3_NODE; 509 - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 510 - kn = kernfs_find_and_get(parent_kn, name); 511 - if (kn) { 512 - /* 513 - * rdtgroup_mutex will prevent this directory from being 514 - * removed. No need to keep this hold. 
515 - */ 516 - kernfs_put(kn); 517 - } else { 518 - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 519 - if (IS_ERR(kn)) 520 - return PTR_ERR(kn); 521 - 522 - ret = rdtgroup_kn_set_ugid(kn); 523 - if (ret) 524 - goto out_destroy; 525 - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); 526 - if (ret) 527 - goto out_destroy; 528 - } 529 - 530 - if (snc_mode) { 531 - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); 532 - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); 533 - if (IS_ERR(ckn)) { 534 - ret = -EINVAL; 535 - goto out_destroy; 536 - } 537 - 538 - ret = rdtgroup_kn_set_ugid(ckn); 539 - if (ret) 540 - goto out_destroy; 541 - 542 - ret = mon_add_all_files(ckn, d, r, prgrp, false); 543 - if (ret) 544 - goto out_destroy; 545 - } 546 - 547 - kernfs_activate(kn); 548 - return 0; 549 - 550 - out_destroy: 551 - kernfs_remove(kn); 552 - return ret; 553 - } 554 - 555 - /* 556 - * Add all subdirectories of mon_data for "ctrl_mon" groups 557 - * and "monitor" groups with given domain id. 
558 - */ 559 - static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 560 - struct rdt_mon_domain *d) 561 - { 562 - struct kernfs_node *parent_kn; 563 - struct rdtgroup *prgrp, *crgrp; 564 - struct list_head *head; 565 - 566 - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 567 - parent_kn = prgrp->mon.mon_data_kn; 568 - mkdir_mondata_subdir(parent_kn, d, r, prgrp); 569 - 570 - head = &prgrp->mon.crdtgrp_list; 571 - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 572 - parent_kn = crgrp->mon.mon_data_kn; 573 - mkdir_mondata_subdir(parent_kn, d, r, crgrp); 574 - } 575 - } 576 - } 577 - 578 - static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, 579 - struct rdt_resource *r, 580 - struct rdtgroup *prgrp) 581 - { 582 - struct rdt_mon_domain *dom; 583 - int ret; 584 - 585 - /* Walking r->domains, ensure it can't race with cpuhp */ 586 - lockdep_assert_cpus_held(); 587 - 588 - list_for_each_entry(dom, &r->mon_domains, hdr.list) { 589 - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); 590 - if (ret) 591 - return ret; 592 - } 593 - 594 - return 0; 595 - } 596 - 597 - /* 598 - * This creates a directory mon_data which contains the monitored data. 599 - * 600 - * mon_data has one directory for each domain which are named 601 - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data 602 - * with L3 domain looks as below: 603 - * ./mon_data: 604 - * mon_L3_00 605 - * mon_L3_01 606 - * mon_L3_02 607 - * ... 608 - * 609 - * Each domain directory has one file per event: 610 - * ./mon_L3_00/: 611 - * llc_occupancy 612 - * 613 - */ 614 - static int mkdir_mondata_all(struct kernfs_node *parent_kn, 615 - struct rdtgroup *prgrp, 616 - struct kernfs_node **dest_kn) 617 - { 618 - struct rdt_resource *r; 619 - struct kernfs_node *kn; 620 - int ret; 621 - 622 - /* 623 - * Create the mon_data directory first. 
624 - */ 625 - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); 626 - if (ret) 627 - return ret; 628 - 629 - if (dest_kn) 630 - *dest_kn = kn; 631 - 632 - /* 633 - * Create the subdirectories for each domain. Note that all events 634 - * in a domain like L3 are grouped into a resource whose domain is L3 635 - */ 636 - for_each_mon_capable_rdt_resource(r) { 637 - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); 638 - if (ret) 639 - goto out_destroy; 640 - } 641 - 642 - return 0; 643 - 644 - out_destroy: 645 - kernfs_remove(kn); 646 - return ret; 647 - } 648 - 649 - /** 650 - * cbm_ensure_valid - Enforce validity on provided CBM 651 - * @_val: Candidate CBM 652 - * @r: RDT resource to which the CBM belongs 653 - * 654 - * The provided CBM represents all cache portions available for use. This 655 - * may be represented by a bitmap that does not consist of contiguous ones 656 - * and thus be an invalid CBM. 657 - * Here the provided CBM is forced to be a valid CBM by only considering 658 - * the first set of contiguous bits as valid and clearing all bits. 659 - * The intention here is to provide a valid default CBM with which a new 660 - * resource group is initialized. The user can follow this with a 661 - * modification to the CBM if the default does not satisfy the 662 - * requirements. 663 - */ 664 - static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) 665 - { 666 - unsigned int cbm_len = r->cache.cbm_len; 667 - unsigned long first_bit, zero_bit; 668 - unsigned long val = _val; 669 - 670 - if (!val) 671 - return 0; 672 - 673 - first_bit = find_first_bit(&val, cbm_len); 674 - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 675 - 676 - /* Clear any remaining bits to ensure contiguous region */ 677 - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); 678 - return (u32)val; 679 - } 680 - 681 - /* 682 - * Initialize cache resources per RDT domain 683 - * 684 - * Set the RDT domain up to start off with all usable allocations. 
That is, 685 - * all shareable and unused bits. All-zero CBM is invalid. 686 - */ 687 - static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, 688 - u32 closid) 689 - { 690 - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 691 - enum resctrl_conf_type t = s->conf_type; 692 - struct resctrl_staged_config *cfg; 693 - struct rdt_resource *r = s->res; 694 - u32 used_b = 0, unused_b = 0; 695 - unsigned long tmp_cbm; 696 - enum rdtgrp_mode mode; 697 - u32 peer_ctl, ctrl_val; 698 - int i; 699 - 700 - cfg = &d->staged_config[t]; 701 - cfg->have_new_ctrl = false; 702 - cfg->new_ctrl = r->cache.shareable_bits; 703 - used_b = r->cache.shareable_bits; 704 - for (i = 0; i < closids_supported(); i++) { 705 - if (closid_allocated(i) && i != closid) { 706 - mode = rdtgroup_mode_by_closid(i); 707 - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) 708 - /* 709 - * ctrl values for locksetup aren't relevant 710 - * until the schemata is written, and the mode 711 - * becomes RDT_MODE_PSEUDO_LOCKED. 712 - */ 713 - continue; 714 - /* 715 - * If CDP is active include peer domain's 716 - * usage to ensure there is no overlap 717 - * with an exclusive group. 718 - */ 719 - if (resctrl_arch_get_cdp_enabled(r->rid)) 720 - peer_ctl = resctrl_arch_get_config(r, d, i, 721 - peer_type); 722 - else 723 - peer_ctl = 0; 724 - ctrl_val = resctrl_arch_get_config(r, d, i, 725 - s->conf_type); 726 - used_b |= ctrl_val | peer_ctl; 727 - if (mode == RDT_MODE_SHAREABLE) 728 - cfg->new_ctrl |= ctrl_val | peer_ctl; 729 - } 730 - } 731 - if (d->plr && d->plr->cbm > 0) 732 - used_b |= d->plr->cbm; 733 - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); 734 - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; 735 - cfg->new_ctrl |= unused_b; 736 - /* 737 - * Force the initial CBM to be valid, user can 738 - * modify the CBM based on system availability. 
739 - */ 740 - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); 741 - /* 742 - * Assign the u32 CBM to an unsigned long to ensure that 743 - * bitmap_weight() does not access out-of-bound memory. 744 - */ 745 - tmp_cbm = cfg->new_ctrl; 746 - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { 747 - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); 748 - return -ENOSPC; 749 - } 750 - cfg->have_new_ctrl = true; 751 - 752 - return 0; 753 - } 754 - 755 - /* 756 - * Initialize cache resources with default values. 757 - * 758 - * A new RDT group is being created on an allocation capable (CAT) 759 - * supporting system. Set this group up to start off with all usable 760 - * allocations. 761 - * 762 - * If there are no more shareable bits available on any domain then 763 - * the entire allocation will fail. 764 - */ 765 - static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) 766 - { 767 - struct rdt_ctrl_domain *d; 768 - int ret; 769 - 770 - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { 771 - ret = __init_one_rdt_domain(d, s, closid); 772 - if (ret < 0) 773 - return ret; 774 - } 775 - 776 - return 0; 777 - } 778 - 779 - /* Initialize MBA resource with default values. */ 780 - static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) 781 - { 782 - struct resctrl_staged_config *cfg; 783 - struct rdt_ctrl_domain *d; 784 - 785 - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 786 - if (is_mba_sc(r)) { 787 - d->mbps_val[closid] = MBA_MAX_MBPS; 788 - continue; 789 - } 790 - 791 - cfg = &d->staged_config[CDP_NONE]; 792 - cfg->new_ctrl = resctrl_get_default_ctrl(r); 793 - cfg->have_new_ctrl = true; 794 - } 795 - } 796 - 797 - /* Initialize the RDT group's allocations. 
*/ 798 - static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) 799 - { 800 - struct resctrl_schema *s; 801 - struct rdt_resource *r; 802 - int ret = 0; 803 - 804 - rdt_staged_configs_clear(); 805 - 806 - list_for_each_entry(s, &resctrl_schema_all, list) { 807 - r = s->res; 808 - if (r->rid == RDT_RESOURCE_MBA || 809 - r->rid == RDT_RESOURCE_SMBA) { 810 - rdtgroup_init_mba(r, rdtgrp->closid); 811 - if (is_mba_sc(r)) 812 - continue; 813 - } else { 814 - ret = rdtgroup_init_cat(s, rdtgrp->closid); 815 - if (ret < 0) 816 - goto out; 817 - } 818 - 819 - ret = resctrl_arch_update_domains(r, rdtgrp->closid); 820 - if (ret < 0) { 821 - rdt_last_cmd_puts("Failed to initialize allocations\n"); 822 - goto out; 823 - } 824 - 825 - } 826 - 827 - rdtgrp->mode = RDT_MODE_SHAREABLE; 828 - 829 - out: 830 - rdt_staged_configs_clear(); 831 - return ret; 832 - } 833 - 834 - static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) 835 - { 836 - int ret; 837 - 838 - if (!resctrl_arch_mon_capable()) 839 - return 0; 840 - 841 - ret = alloc_rmid(rdtgrp->closid); 842 - if (ret < 0) { 843 - rdt_last_cmd_puts("Out of RMIDs\n"); 844 - return ret; 845 - } 846 - rdtgrp->mon.rmid = ret; 847 - 848 - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); 849 - if (ret) { 850 - rdt_last_cmd_puts("kernfs subdir error\n"); 851 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 852 - return ret; 853 - } 854 - 855 - return 0; 856 - } 857 - 858 - static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) 859 - { 860 - if (resctrl_arch_mon_capable()) 861 - free_rmid(rgrp->closid, rgrp->mon.rmid); 862 - } 863 - 864 - /* 865 - * We allow creating mon groups only with in a directory called "mon_groups" 866 - * which is present in every ctrl_mon group. Check if this is a valid 867 - * "mon_groups" directory. 868 - * 869 - * 1. The directory should be named "mon_groups". 870 - * 2. The mon group itself should "not" be named "mon_groups". 
871 - * This makes sure "mon_groups" directory always has a ctrl_mon group 872 - * as parent. 873 - */ 874 - static bool is_mon_groups(struct kernfs_node *kn, const char *name) 875 - { 876 - return (!strcmp(rdt_kn_name(kn), "mon_groups") && 877 - strcmp(name, "mon_groups")); 878 - } 879 - 880 - static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, 881 - const char *name, umode_t mode, 882 - enum rdt_group_type rtype, struct rdtgroup **r) 883 - { 884 - struct rdtgroup *prdtgrp, *rdtgrp; 885 - unsigned long files = 0; 886 - struct kernfs_node *kn; 887 - int ret; 888 - 889 - prdtgrp = rdtgroup_kn_lock_live(parent_kn); 890 - if (!prdtgrp) { 891 - ret = -ENODEV; 892 - goto out_unlock; 893 - } 894 - 895 - /* 896 - * Check that the parent directory for a monitor group is a "mon_groups" 897 - * directory. 898 - */ 899 - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { 900 - ret = -EPERM; 901 - goto out_unlock; 902 - } 903 - 904 - if (rtype == RDTMON_GROUP && 905 - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 906 - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { 907 - ret = -EINVAL; 908 - rdt_last_cmd_puts("Pseudo-locking in progress\n"); 909 - goto out_unlock; 910 - } 911 - 912 - /* allocate the rdtgroup. */ 913 - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 914 - if (!rdtgrp) { 915 - ret = -ENOSPC; 916 - rdt_last_cmd_puts("Kernel out of memory\n"); 917 - goto out_unlock; 918 - } 919 - *r = rdtgrp; 920 - rdtgrp->mon.parent = prdtgrp; 921 - rdtgrp->type = rtype; 922 - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); 923 - 924 - /* kernfs creates the directory for rdtgrp */ 925 - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); 926 - if (IS_ERR(kn)) { 927 - ret = PTR_ERR(kn); 928 - rdt_last_cmd_puts("kernfs create error\n"); 929 - goto out_free_rgrp; 930 - } 931 - rdtgrp->kn = kn; 932 - 933 - /* 934 - * kernfs_remove() will drop the reference count on "kn" which 935 - * will free it. 
But we still need it to stick around for the 936 - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, 937 - * which will be dropped by kernfs_put() in rdtgroup_remove(). 938 - */ 939 - kernfs_get(kn); 940 - 941 - ret = rdtgroup_kn_set_ugid(kn); 942 - if (ret) { 943 - rdt_last_cmd_puts("kernfs perm error\n"); 944 - goto out_destroy; 945 - } 946 - 947 - if (rtype == RDTCTRL_GROUP) { 948 - files = RFTYPE_BASE | RFTYPE_CTRL; 949 - if (resctrl_arch_mon_capable()) 950 - files |= RFTYPE_MON; 951 - } else { 952 - files = RFTYPE_BASE | RFTYPE_MON; 953 - } 954 - 955 - ret = rdtgroup_add_files(kn, files); 956 - if (ret) { 957 - rdt_last_cmd_puts("kernfs fill error\n"); 958 - goto out_destroy; 959 - } 960 - 961 - /* 962 - * The caller unlocks the parent_kn upon success. 963 - */ 964 - return 0; 965 - 966 - out_destroy: 967 - kernfs_put(rdtgrp->kn); 968 - kernfs_remove(rdtgrp->kn); 969 - out_free_rgrp: 970 - kfree(rdtgrp); 971 - out_unlock: 972 - rdtgroup_kn_unlock(parent_kn); 973 - return ret; 974 - } 975 - 976 - static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) 977 - { 978 - kernfs_remove(rgrp->kn); 979 - rdtgroup_remove(rgrp); 980 - } 981 - 982 - /* 983 - * Create a monitor group under "mon_groups" directory of a control 984 - * and monitor group(ctrl_mon). This is a resource group 985 - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 
986 - */ 987 - static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, 988 - const char *name, umode_t mode) 989 - { 990 - struct rdtgroup *rdtgrp, *prgrp; 991 - int ret; 992 - 993 - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); 994 - if (ret) 995 - return ret; 996 - 997 - prgrp = rdtgrp->mon.parent; 998 - rdtgrp->closid = prgrp->closid; 999 - 1000 - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 1001 - if (ret) { 1002 - mkdir_rdt_prepare_clean(rdtgrp); 1003 - goto out_unlock; 1004 - } 1005 - 1006 - kernfs_activate(rdtgrp->kn); 1007 - 1008 - /* 1009 - * Add the rdtgrp to the list of rdtgrps the parent 1010 - * ctrl_mon group has to track. 1011 - */ 1012 - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); 1013 - 1014 - out_unlock: 1015 - rdtgroup_kn_unlock(parent_kn); 1016 - return ret; 1017 - } 1018 - 1019 - /* 1020 - * These are rdtgroups created under the root directory. Can be used 1021 - * to allocate and monitor resources. 1022 - */ 1023 - static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, 1024 - const char *name, umode_t mode) 1025 - { 1026 - struct rdtgroup *rdtgrp; 1027 - struct kernfs_node *kn; 1028 - u32 closid; 1029 - int ret; 1030 - 1031 - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); 1032 - if (ret) 1033 - return ret; 1034 - 1035 - kn = rdtgrp->kn; 1036 - ret = closid_alloc(); 1037 - if (ret < 0) { 1038 - rdt_last_cmd_puts("Out of CLOSIDs\n"); 1039 - goto out_common_fail; 1040 - } 1041 - closid = ret; 1042 - ret = 0; 1043 - 1044 - rdtgrp->closid = closid; 1045 - 1046 - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 1047 - if (ret) 1048 - goto out_closid_free; 1049 - 1050 - kernfs_activate(rdtgrp->kn); 1051 - 1052 - ret = rdtgroup_init_alloc(rdtgrp); 1053 - if (ret < 0) 1054 - goto out_rmid_free; 1055 - 1056 - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 1057 - 1058 - if (resctrl_arch_mon_capable()) { 1059 - /* 1060 - * Create an empty mon_groups directory to hold 
the subset 1061 - * of tasks and cpus to monitor. 1062 - */ 1063 - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); 1064 - if (ret) { 1065 - rdt_last_cmd_puts("kernfs subdir error\n"); 1066 - goto out_del_list; 1067 - } 1068 - if (is_mba_sc(NULL)) 1069 - rdtgrp->mba_mbps_event = mba_mbps_default_event; 1070 - } 1071 - 1072 - goto out_unlock; 1073 - 1074 - out_del_list: 1075 - list_del(&rdtgrp->rdtgroup_list); 1076 - out_rmid_free: 1077 - mkdir_rdt_prepare_rmid_free(rdtgrp); 1078 - out_closid_free: 1079 - closid_free(closid); 1080 - out_common_fail: 1081 - mkdir_rdt_prepare_clean(rdtgrp); 1082 - out_unlock: 1083 - rdtgroup_kn_unlock(parent_kn); 1084 - return ret; 1085 - } 1086 - 1087 - static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 1088 - umode_t mode) 1089 - { 1090 - /* Do not accept '\n' to avoid unparsable situation. */ 1091 - if (strchr(name, '\n')) 1092 - return -EINVAL; 1093 - 1094 - /* 1095 - * If the parent directory is the root directory and RDT 1096 - * allocation is supported, add a control and monitoring 1097 - * subdirectory 1098 - */ 1099 - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) 1100 - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); 1101 - 1102 - /* Else, attempt to add a monitoring subdirectory. */ 1103 - if (resctrl_arch_mon_capable()) 1104 - return rdtgroup_mkdir_mon(parent_kn, name, mode); 1105 - 1106 - return -EPERM; 1107 - } 1108 - 1109 - static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 1110 - { 1111 - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 1112 - u32 closid, rmid; 1113 - int cpu; 1114 - 1115 - /* Give any tasks back to the parent group */ 1116 - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); 1117 - 1118 - /* 1119 - * Update per cpu closid/rmid of the moved CPUs first. 1120 - * Note: the closid will not change, but the arch code still needs it. 
1121 - */ 1122 - closid = prdtgrp->closid; 1123 - rmid = prdtgrp->mon.rmid; 1124 - for_each_cpu(cpu, &rdtgrp->cpu_mask) 1125 - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 1126 - 1127 - /* 1128 - * Update the MSR on moved CPUs and CPUs which have moved 1129 - * task running on them. 1130 - */ 1131 - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1132 - update_closid_rmid(tmpmask, NULL); 1133 - 1134 - rdtgrp->flags = RDT_DELETED; 1135 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 1136 - 1137 - /* 1138 - * Remove the rdtgrp from the parent ctrl_mon group's list 1139 - */ 1140 - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 1141 - list_del(&rdtgrp->mon.crdtgrp_list); 1142 - 1143 - kernfs_remove(rdtgrp->kn); 1144 - 1145 - return 0; 1146 - } 1147 - 1148 - static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) 1149 - { 1150 - rdtgrp->flags = RDT_DELETED; 1151 - list_del(&rdtgrp->rdtgroup_list); 1152 - 1153 - kernfs_remove(rdtgrp->kn); 1154 - return 0; 1155 - } 1156 - 1157 - static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 1158 - { 1159 - u32 closid, rmid; 1160 - int cpu; 1161 - 1162 - /* Give any tasks back to the default group */ 1163 - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 1164 - 1165 - /* Give any CPUs back to the default group */ 1166 - cpumask_or(&rdtgroup_default.cpu_mask, 1167 - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 1168 - 1169 - /* Update per cpu closid and rmid of the moved CPUs first */ 1170 - closid = rdtgroup_default.closid; 1171 - rmid = rdtgroup_default.mon.rmid; 1172 - for_each_cpu(cpu, &rdtgrp->cpu_mask) 1173 - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 1174 - 1175 - /* 1176 - * Update the MSR on moved CPUs and CPUs which have moved 1177 - * task running on them. 
1178 - */ 1179 - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1180 - update_closid_rmid(tmpmask, NULL); 1181 - 1182 - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 1183 - closid_free(rdtgrp->closid); 1184 - 1185 - rdtgroup_ctrl_remove(rdtgrp); 1186 - 1187 - /* 1188 - * Free all the child monitor group rmids. 1189 - */ 1190 - free_all_child_rdtgrp(rdtgrp); 1191 - 1192 - return 0; 1193 - } 1194 - 1195 - static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) 1196 - { 1197 - /* 1198 - * Valid within the RCU section it was obtained or while rdtgroup_mutex 1199 - * is held. 1200 - */ 1201 - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); 1202 - } 1203 - 1204 - static int rdtgroup_rmdir(struct kernfs_node *kn) 1205 - { 1206 - struct kernfs_node *parent_kn; 1207 - struct rdtgroup *rdtgrp; 1208 - cpumask_var_t tmpmask; 1209 - int ret = 0; 1210 - 1211 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 1212 - return -ENOMEM; 1213 - 1214 - rdtgrp = rdtgroup_kn_lock_live(kn); 1215 - if (!rdtgrp) { 1216 - ret = -EPERM; 1217 - goto out; 1218 - } 1219 - parent_kn = rdt_kn_parent(kn); 1220 - 1221 - /* 1222 - * If the rdtgroup is a ctrl_mon group and parent directory 1223 - * is the root directory, remove the ctrl_mon group. 1224 - * 1225 - * If the rdtgroup is a mon group and parent directory 1226 - * is a valid "mon_groups" directory, remove the mon group. 
1227 - */ 1228 - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && 1229 - rdtgrp != &rdtgroup_default) { 1230 - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 1231 - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1232 - ret = rdtgroup_ctrl_remove(rdtgrp); 1233 - } else { 1234 - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); 1235 - } 1236 - } else if (rdtgrp->type == RDTMON_GROUP && 1237 - is_mon_groups(parent_kn, rdt_kn_name(kn))) { 1238 - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); 1239 - } else { 1240 - ret = -EPERM; 1241 - } 1242 - 1243 - out: 1244 - rdtgroup_kn_unlock(kn); 1245 - free_cpumask_var(tmpmask); 1246 - return ret; 1247 - } 1248 - 1249 - /** 1250 - * mongrp_reparent() - replace parent CTRL_MON group of a MON group 1251 - * @rdtgrp: the MON group whose parent should be replaced 1252 - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp 1253 - * @cpus: cpumask provided by the caller for use during this call 1254 - * 1255 - * Replaces the parent CTRL_MON group for a MON group, resulting in all member 1256 - * tasks' CLOSID immediately changing to that of the new parent group. 1257 - * Monitoring data for the group is unaffected by this operation. 1258 - */ 1259 - static void mongrp_reparent(struct rdtgroup *rdtgrp, 1260 - struct rdtgroup *new_prdtgrp, 1261 - cpumask_var_t cpus) 1262 - { 1263 - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 1264 - 1265 - WARN_ON(rdtgrp->type != RDTMON_GROUP); 1266 - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); 1267 - 1268 - /* Nothing to do when simply renaming a MON group. */ 1269 - if (prdtgrp == new_prdtgrp) 1270 - return; 1271 - 1272 - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 1273 - list_move_tail(&rdtgrp->mon.crdtgrp_list, 1274 - &new_prdtgrp->mon.crdtgrp_list); 1275 - 1276 - rdtgrp->mon.parent = new_prdtgrp; 1277 - rdtgrp->closid = new_prdtgrp->closid; 1278 - 1279 - /* Propagate updated closid to all tasks in this group. 
*/ 1280 - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); 1281 - 1282 - update_closid_rmid(cpus, NULL); 1283 - } 1284 - 1285 - static int rdtgroup_rename(struct kernfs_node *kn, 1286 - struct kernfs_node *new_parent, const char *new_name) 1287 - { 1288 - struct kernfs_node *kn_parent; 1289 - struct rdtgroup *new_prdtgrp; 1290 - struct rdtgroup *rdtgrp; 1291 - cpumask_var_t tmpmask; 1292 - int ret; 1293 - 1294 - rdtgrp = kernfs_to_rdtgroup(kn); 1295 - new_prdtgrp = kernfs_to_rdtgroup(new_parent); 1296 - if (!rdtgrp || !new_prdtgrp) 1297 - return -ENOENT; 1298 - 1299 - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ 1300 - rdtgroup_kn_get(rdtgrp, kn); 1301 - rdtgroup_kn_get(new_prdtgrp, new_parent); 1302 - 1303 - mutex_lock(&rdtgroup_mutex); 1304 - 1305 - rdt_last_cmd_clear(); 1306 - 1307 - /* 1308 - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if 1309 - * either kernfs_node is a file. 1310 - */ 1311 - if (kernfs_type(kn) != KERNFS_DIR || 1312 - kernfs_type(new_parent) != KERNFS_DIR) { 1313 - rdt_last_cmd_puts("Source and destination must be directories"); 1314 - ret = -EPERM; 1315 - goto out; 1316 - } 1317 - 1318 - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { 1319 - ret = -ENOENT; 1320 - goto out; 1321 - } 1322 - 1323 - kn_parent = rdt_kn_parent(kn); 1324 - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || 1325 - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { 1326 - rdt_last_cmd_puts("Source must be a MON group\n"); 1327 - ret = -EPERM; 1328 - goto out; 1329 - } 1330 - 1331 - if (!is_mon_groups(new_parent, new_name)) { 1332 - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); 1333 - ret = -EPERM; 1334 - goto out; 1335 - } 1336 - 1337 - /* 1338 - * If the MON group is monitoring CPUs, the CPUs must be assigned to the 1339 - * current parent CTRL_MON group and therefore cannot be assigned to 1340 - * the new parent, making the move illegal. 
1341 - */ 1342 - if (!cpumask_empty(&rdtgrp->cpu_mask) && 1343 - rdtgrp->mon.parent != new_prdtgrp) { 1344 - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); 1345 - ret = -EPERM; 1346 - goto out; 1347 - } 1348 - 1349 - /* 1350 - * Allocate the cpumask for use in mongrp_reparent() to avoid the 1351 - * possibility of failing to allocate it after kernfs_rename() has 1352 - * succeeded. 1353 - */ 1354 - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { 1355 - ret = -ENOMEM; 1356 - goto out; 1357 - } 1358 - 1359 - /* 1360 - * Perform all input validation and allocations needed to ensure 1361 - * mongrp_reparent() will succeed before calling kernfs_rename(), 1362 - * otherwise it would be necessary to revert this call if 1363 - * mongrp_reparent() failed. 1364 - */ 1365 - ret = kernfs_rename(kn, new_parent, new_name); 1366 - if (!ret) 1367 - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); 1368 - 1369 - free_cpumask_var(tmpmask); 1370 - 1371 - out: 1372 - mutex_unlock(&rdtgroup_mutex); 1373 - rdtgroup_kn_put(rdtgrp, kn); 1374 - rdtgroup_kn_put(new_prdtgrp, new_parent); 1375 - return ret; 1376 - } 1377 - 1378 - static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 1379 - { 1380 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) 1381 - seq_puts(seq, ",cdp"); 1382 - 1383 - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) 1384 - seq_puts(seq, ",cdpl2"); 1385 - 1386 - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) 1387 - seq_puts(seq, ",mba_MBps"); 1388 - 1389 - if (resctrl_debug) 1390 - seq_puts(seq, ",debug"); 1391 - 1392 - return 0; 1393 - } 1394 - 1395 - static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { 1396 - .mkdir = rdtgroup_mkdir, 1397 - .rmdir = rdtgroup_rmdir, 1398 - .rename = rdtgroup_rename, 1399 - .show_options = rdtgroup_show_options, 1400 - }; 1401 - 1402 - static int rdtgroup_setup_root(struct rdt_fs_context *ctx) 1403 - { 1404 - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, 1405 - 
KERNFS_ROOT_CREATE_DEACTIVATED | 1406 - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, 1407 - &rdtgroup_default); 1408 - if (IS_ERR(rdt_root)) 1409 - return PTR_ERR(rdt_root); 1410 - 1411 - ctx->kfc.root = rdt_root; 1412 - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); 1413 - 1414 - return 0; 1415 - } 1416 - 1417 - static void rdtgroup_destroy_root(void) 1418 - { 1419 - kernfs_destroy_root(rdt_root); 1420 - rdtgroup_default.kn = NULL; 1421 - } 1422 - 1423 - static void __init rdtgroup_setup_default(void) 1424 - { 1425 - mutex_lock(&rdtgroup_mutex); 1426 - 1427 - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; 1428 - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; 1429 - rdtgroup_default.type = RDTCTRL_GROUP; 1430 - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); 1431 - 1432 - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 1433 - 1434 - mutex_unlock(&rdtgroup_mutex); 1435 - } 1436 - 1437 - static void domain_destroy_mon_state(struct rdt_mon_domain *d) 1438 - { 1439 - bitmap_free(d->rmid_busy_llc); 1440 - kfree(d->mbm_total); 1441 - kfree(d->mbm_local); 1442 - } 1443 - 1444 - void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 1445 - { 1446 - mutex_lock(&rdtgroup_mutex); 1447 - 1448 - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) 1449 - mba_sc_domain_destroy(r, d); 1450 - 1451 - mutex_unlock(&rdtgroup_mutex); 1452 - } 1453 - 1454 - void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 1455 - { 1456 - mutex_lock(&rdtgroup_mutex); 1457 - 1458 - /* 1459 - * If resctrl is mounted, remove all the 1460 - * per domain monitor data directories. 
1461 - */ 1462 - if (resctrl_mounted && resctrl_arch_mon_capable()) 1463 - rmdir_mondata_subdir_allrdtgrp(r, d); 1464 - 1465 - if (resctrl_is_mbm_enabled()) 1466 - cancel_delayed_work(&d->mbm_over); 1467 - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { 1468 - /* 1469 - * When a package is going down, forcefully 1470 - * decrement rmid->ebusy. There is no way to know 1471 - * that the L3 was flushed and hence may lead to 1472 - * incorrect counts in rare scenarios, but leaving 1473 - * the RMID as busy creates RMID leaks if the 1474 - * package never comes back. 1475 - */ 1476 - __check_limbo(d, true); 1477 - cancel_delayed_work(&d->cqm_limbo); 1478 - } 1479 - 1480 - domain_destroy_mon_state(d); 1481 - 1482 - mutex_unlock(&rdtgroup_mutex); 1483 - } 1484 - 1485 - /** 1486 - * domain_setup_mon_state() - Initialise domain monitoring structures. 1487 - * @r: The resource for the newly online domain. 1488 - * @d: The newly online domain. 1489 - * 1490 - * Allocate monitor resources that belong to this domain. 1491 - * Called when the first CPU of a domain comes online, regardless of whether 1492 - * the filesystem is mounted. 1493 - * During boot this may be called before global allocations have been made by 1494 - * resctrl_mon_resource_init(). 1495 - * 1496 - * Returns 0 for success, or -ENOMEM. 
1497 - */ 1498 - static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) 1499 - { 1500 - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 1501 - size_t tsize; 1502 - 1503 - if (resctrl_arch_is_llc_occupancy_enabled()) { 1504 - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); 1505 - if (!d->rmid_busy_llc) 1506 - return -ENOMEM; 1507 - } 1508 - if (resctrl_arch_is_mbm_total_enabled()) { 1509 - tsize = sizeof(*d->mbm_total); 1510 - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); 1511 - if (!d->mbm_total) { 1512 - bitmap_free(d->rmid_busy_llc); 1513 - return -ENOMEM; 1514 - } 1515 - } 1516 - if (resctrl_arch_is_mbm_local_enabled()) { 1517 - tsize = sizeof(*d->mbm_local); 1518 - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); 1519 - if (!d->mbm_local) { 1520 - bitmap_free(d->rmid_busy_llc); 1521 - kfree(d->mbm_total); 1522 - return -ENOMEM; 1523 - } 1524 - } 1525 - 1526 - return 0; 1527 - } 1528 - 1529 - int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 1530 - { 1531 - int err = 0; 1532 - 1533 - mutex_lock(&rdtgroup_mutex); 1534 - 1535 - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { 1536 - /* RDT_RESOURCE_MBA is never mon_capable */ 1537 - err = mba_sc_domain_allocate(r, d); 1538 - } 1539 - 1540 - mutex_unlock(&rdtgroup_mutex); 1541 - 1542 - return err; 1543 - } 1544 - 1545 - int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 1546 - { 1547 - int err; 1548 - 1549 - mutex_lock(&rdtgroup_mutex); 1550 - 1551 - err = domain_setup_mon_state(r, d); 1552 - if (err) 1553 - goto out_unlock; 1554 - 1555 - if (resctrl_is_mbm_enabled()) { 1556 - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); 1557 - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, 1558 - RESCTRL_PICK_ANY_CPU); 1559 - } 1560 - 1561 - if (resctrl_arch_is_llc_occupancy_enabled()) 1562 - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); 1563 - 1564 - /* 1565 - * If the filesystem is not 
mounted then only the default resource group 1566 - * exists. Creation of its directories is deferred until mount time 1567 - * by rdt_get_tree() calling mkdir_mondata_all(). 1568 - * If resctrl is mounted, add per domain monitor data directories. 1569 - */ 1570 - if (resctrl_mounted && resctrl_arch_mon_capable()) 1571 - mkdir_mondata_subdir_allrdtgrp(r, d); 1572 - 1573 - out_unlock: 1574 - mutex_unlock(&rdtgroup_mutex); 1575 - 1576 - return err; 1577 - } 1578 - 1579 - void resctrl_online_cpu(unsigned int cpu) 1580 - { 1581 - mutex_lock(&rdtgroup_mutex); 1582 - /* The CPU is set in default rdtgroup after online. */ 1583 - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 1584 - mutex_unlock(&rdtgroup_mutex); 1585 - } 1586 - 1587 - static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) 1588 - { 1589 - struct rdtgroup *cr; 1590 - 1591 - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { 1592 - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) 1593 - break; 1594 - } 1595 - } 1596 - 1597 - static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, 1598 - struct rdt_resource *r) 1599 - { 1600 - struct rdt_mon_domain *d; 1601 - 1602 - lockdep_assert_cpus_held(); 1603 - 1604 - list_for_each_entry(d, &r->mon_domains, hdr.list) { 1605 - /* Find the domain that contains this CPU */ 1606 - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 1607 - return d; 1608 - } 1609 - 1610 - return NULL; 1611 - } 1612 - 1613 - void resctrl_offline_cpu(unsigned int cpu) 1614 - { 1615 - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); 1616 - struct rdt_mon_domain *d; 1617 - struct rdtgroup *rdtgrp; 1618 - 1619 - mutex_lock(&rdtgroup_mutex); 1620 - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 1621 - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { 1622 - clear_childcpus(rdtgrp, cpu); 1623 - break; 1624 - } 1625 - } 1626 - 1627 - if (!l3->mon_capable) 1628 - goto out_unlock; 1629 - 1630 - d = get_mon_domain_from_cpu(cpu, 
l3); 1631 - if (d) { 1632 - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { 1633 - cancel_delayed_work(&d->mbm_over); 1634 - mbm_setup_overflow_handler(d, 0, cpu); 1635 - } 1636 - if (resctrl_arch_is_llc_occupancy_enabled() && 1637 - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { 1638 - cancel_delayed_work(&d->cqm_limbo); 1639 - cqm_setup_limbo_handler(d, 0, cpu); 1640 - } 1641 - } 1642 - 1643 - out_unlock: 1644 - mutex_unlock(&rdtgroup_mutex); 1645 - } 1646 - 1647 - /* 1648 - * resctrl_init - resctrl filesystem initialization 1649 - * 1650 - * Setup resctrl file system including set up root, create mount point, 1651 - * register resctrl filesystem, and initialize files under root directory. 1652 - * 1653 - * Return: 0 on success or -errno 1654 - */ 1655 - int __init resctrl_init(void) 1656 - { 1657 - int ret = 0; 1658 - 1659 - seq_buf_init(&last_cmd_status, last_cmd_status_buf, 1660 - sizeof(last_cmd_status_buf)); 1661 - 1662 - rdtgroup_setup_default(); 1663 - 1664 - thread_throttle_mode_init(); 1665 - 1666 - ret = resctrl_mon_resource_init(); 1667 - if (ret) 1668 - return ret; 1669 - 1670 - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); 1671 - if (ret) { 1672 - resctrl_mon_resource_exit(); 1673 - return ret; 1674 - } 1675 - 1676 - ret = register_filesystem(&rdt_fs_type); 1677 - if (ret) 1678 - goto cleanup_mountpoint; 1679 - 1680 - /* 1681 - * Adding the resctrl debugfs directory here may not be ideal since 1682 - * it would let the resctrl debugfs directory appear on the debugfs 1683 - * filesystem before the resctrl filesystem is mounted. 1684 - * It may also be ok since that would enable debugging of RDT before 1685 - * resctrl is mounted. 1686 - * The reason why the debugfs directory is created here and not in 1687 - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and 1688 - * during the debugfs directory creation also &sb->s_type->i_mutex_key 1689 - * (the lockdep class of inode->i_rwsem). 
Other filesystem 1690 - * interactions (eg. SyS_getdents) have the lock ordering: 1691 - * &sb->s_type->i_mutex_key --> &mm->mmap_lock 1692 - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex 1693 - * is taken, thus creating dependency: 1694 - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause 1695 - * issues considering the other two lock dependencies. 1696 - * By creating the debugfs directory here we avoid a dependency 1697 - * that may cause deadlock (even though file operations cannot 1698 - * occur until the filesystem is mounted, but I do not know how to 1699 - * tell lockdep that). 1700 - */ 1701 - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); 1702 - 1703 - return 0; 1704 - 1705 - cleanup_mountpoint: 1706 - sysfs_remove_mount_point(fs_kobj, "resctrl"); 1707 - resctrl_mon_resource_exit(); 1708 - 1709 - return ret; 1710 - } 1711 - 1712 - void __exit resctrl_exit(void) 1713 - { 1714 - debugfs_remove_recursive(debugfs_resctrl); 1715 - unregister_filesystem(&rdt_fs_type); 1716 - sysfs_remove_mount_point(fs_kobj, "resctrl"); 1717 - 1718 - resctrl_mon_resource_exit(); 1719 2955 }
+6 -20
arch/x86/kernel/cpu/resctrl/trace.h arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
··· 2 2 #undef TRACE_SYSTEM 3 3 #define TRACE_SYSTEM resctrl 4 4 5 - #if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) 6 - #define _TRACE_RESCTRL_H 5 + #if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) 6 + #define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H 7 7 8 8 #include <linux/tracepoint.h> 9 9 ··· 35 35 TP_printk("hits=%llu miss=%llu", 36 36 __entry->l3_hits, __entry->l3_miss)); 37 37 38 - TRACE_EVENT(mon_llc_occupancy_limbo, 39 - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), 40 - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), 41 - TP_STRUCT__entry(__field(u32, ctrl_hw_id) 42 - __field(u32, mon_hw_id) 43 - __field(int, domain_id) 44 - __field(u64, llc_occupancy_bytes)), 45 - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; 46 - __entry->mon_hw_id = mon_hw_id; 47 - __entry->domain_id = domain_id; 48 - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), 49 - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", 50 - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, 51 - __entry->llc_occupancy_bytes) 52 - ); 53 - 54 - #endif /* _TRACE_RESCTRL_H */ 38 + #endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ 55 39 56 40 #undef TRACE_INCLUDE_PATH 57 41 #define TRACE_INCLUDE_PATH . 58 - #define TRACE_INCLUDE_FILE trace 42 + 43 + #define TRACE_INCLUDE_FILE pseudo_lock_trace 44 + 59 45 #include <trace/define_trace.h>
+1 -1
arch/x86/kernel/process_32.c
··· 208 208 raw_cpu_write(current_task, next_p); 209 209 210 210 /* Load the Intel cache allocation PQR MSR. */ 211 - resctrl_sched_in(next_p); 211 + resctrl_arch_sched_in(next_p); 212 212 213 213 return prev_p; 214 214 }
+1 -1
arch/x86/kernel/process_64.c
··· 705 705 } 706 706 707 707 /* Load the Intel cache allocation PQR MSR. */ 708 - resctrl_sched_in(next_p); 708 + resctrl_arch_sched_in(next_p); 709 709 710 710 return prev_p; 711 711 }
+1
fs/Kconfig
··· 335 335 source "fs/hpfs/Kconfig" 336 336 source "fs/qnx4/Kconfig" 337 337 source "fs/qnx6/Kconfig" 338 + source "fs/resctrl/Kconfig" 338 339 source "fs/romfs/Kconfig" 339 340 source "fs/pstore/Kconfig" 340 341 source "fs/ufs/Kconfig"
+1
fs/Makefile
··· 128 128 obj-$(CONFIG_VBOXSF_FS) += vboxsf/ 129 129 obj-$(CONFIG_ZONEFS_FS) += zonefs/ 130 130 obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o 131 + obj-$(CONFIG_RESCTRL_FS) += resctrl/
+39
fs/resctrl/Kconfig
··· 1 + config RESCTRL_FS 2 + bool "CPU Resource Control Filesystem (resctrl)" 3 + depends on ARCH_HAS_CPU_RESCTRL 4 + select KERNFS 5 + select PROC_CPU_RESCTRL if PROC_FS 6 + help 7 + Some architectures provide hardware facilities to group tasks and 8 + monitor and control their usage of memory system resources such as 9 + caches and memory bandwidth. Examples of such facilities include 10 + Intel's Resource Director Technology (Intel(R) RDT) and AMD's 11 + Platform Quality of Service (AMD QoS). 12 + 13 + If your system has the necessary support and you want to be able to 14 + assign tasks to groups and manipulate the associated resource 15 + monitors and controls from userspace, say Y here to get a mountable 16 + 'resctrl' filesystem that lets you do just that. 17 + 18 + If nothing mounts or prods the 'resctrl' filesystem, resource 19 + controls and monitors are left in a quiescent, permissive state. 20 + 21 + On architectures where this can be disabled independently, it is 22 + safe to say N. 23 + 24 + See <file:Documentation/filesystems/resctrl.rst> for more information. 25 + 26 + config RESCTRL_FS_PSEUDO_LOCK 27 + bool 28 + depends on RESCTRL_FS 29 + help 30 + Software mechanism to pin data in a cache portion using 31 + micro-architecture specific knowledge. 32 + 33 + config RESCTRL_RMID_DEPENDS_ON_CLOSID 34 + bool 35 + depends on RESCTRL_FS 36 + help 37 + Enabled by the architecture when the RMID values depend on the CLOSID. 38 + This causes the CLOSID allocator to search for CLOSID with clean 39 + RMID.
+6
fs/resctrl/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o 3 + obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o 4 + 5 + # To allow define_trace.h's recursive include: 6 + CFLAGS_monitor.o = -I$(src)
+661
fs/resctrl/ctrlmondata.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Resource Director Technology(RDT) 4 + * - Cache Allocation code. 5 + * 6 + * Copyright (C) 2016 Intel Corporation 7 + * 8 + * Authors: 9 + * Fenghua Yu <fenghua.yu@intel.com> 10 + * Tony Luck <tony.luck@intel.com> 11 + * 12 + * More information about RDT can be found in the Intel (R) x86 Architecture 13 + * Software Developer Manual June 2016, volume 3, section 17.17. 14 + */ 15 + 16 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 + 18 + #include <linux/cpu.h> 19 + #include <linux/kernfs.h> 20 + #include <linux/seq_file.h> 21 + #include <linux/slab.h> 22 + #include <linux/tick.h> 23 + 24 + #include "internal.h" 25 + 26 + struct rdt_parse_data { 27 + struct rdtgroup *rdtgrp; 28 + char *buf; 29 + }; 30 + 31 + typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, 32 + struct resctrl_schema *s, 33 + struct rdt_ctrl_domain *d); 34 + 35 + /* 36 + * Check whether MBA bandwidth percentage value is correct. The value is 37 + * checked against the minimum and max bandwidth values specified by the 38 + * hardware. The allocated bandwidth percentage is rounded to the next 39 + * control step available on the hardware. 40 + */ 41 + static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) 42 + { 43 + int ret; 44 + u32 bw; 45 + 46 + /* 47 + * Only linear delay values are supported for current Intel SKUs. 48 + */ 49 + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { 50 + rdt_last_cmd_puts("No support for non-linear MB domains\n"); 51 + return false; 52 + } 53 + 54 + ret = kstrtou32(buf, 10, &bw); 55 + if (ret) { 56 + rdt_last_cmd_printf("Invalid MB value %s\n", buf); 57 + return false; 58 + } 59 + 60 + /* Nothing else to do if software controller is enabled. 
*/ 61 + if (is_mba_sc(r)) { 62 + *data = bw; 63 + return true; 64 + } 65 + 66 + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { 67 + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", 68 + bw, r->membw.min_bw, r->membw.max_bw); 69 + return false; 70 + } 71 + 72 + *data = roundup(bw, (unsigned long)r->membw.bw_gran); 73 + return true; 74 + } 75 + 76 + static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, 77 + struct rdt_ctrl_domain *d) 78 + { 79 + struct resctrl_staged_config *cfg; 80 + u32 closid = data->rdtgrp->closid; 81 + struct rdt_resource *r = s->res; 82 + u32 bw_val; 83 + 84 + cfg = &d->staged_config[s->conf_type]; 85 + if (cfg->have_new_ctrl) { 86 + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 87 + return -EINVAL; 88 + } 89 + 90 + if (!bw_validate(data->buf, &bw_val, r)) 91 + return -EINVAL; 92 + 93 + if (is_mba_sc(r)) { 94 + d->mbps_val[closid] = bw_val; 95 + return 0; 96 + } 97 + 98 + cfg->new_ctrl = bw_val; 99 + cfg->have_new_ctrl = true; 100 + 101 + return 0; 102 + } 103 + 104 + /* 105 + * Check whether a cache bit mask is valid. 106 + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: 107 + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 108 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 109 + * 110 + * Haswell does not support a non-contiguous 1s value and additionally 111 + * requires at least two bits set. 112 + * AMD allows non-contiguous bitmasks. 
113 + */ 114 + static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) 115 + { 116 + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; 117 + unsigned int cbm_len = r->cache.cbm_len; 118 + unsigned long first_bit, zero_bit, val; 119 + int ret; 120 + 121 + ret = kstrtoul(buf, 16, &val); 122 + if (ret) { 123 + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); 124 + return false; 125 + } 126 + 127 + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { 128 + rdt_last_cmd_puts("Mask out of range\n"); 129 + return false; 130 + } 131 + 132 + first_bit = find_first_bit(&val, cbm_len); 133 + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 134 + 135 + /* Are non-contiguous bitmasks allowed? */ 136 + if (!r->cache.arch_has_sparse_bitmasks && 137 + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { 138 + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); 139 + return false; 140 + } 141 + 142 + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { 143 + rdt_last_cmd_printf("Need at least %d bits in the mask\n", 144 + r->cache.min_cbm_bits); 145 + return false; 146 + } 147 + 148 + *data = val; 149 + return true; 150 + } 151 + 152 + /* 153 + * Read one cache bit mask (hex). Check that it is valid for the current 154 + * resource type. 155 + */ 156 + static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, 157 + struct rdt_ctrl_domain *d) 158 + { 159 + struct rdtgroup *rdtgrp = data->rdtgrp; 160 + struct resctrl_staged_config *cfg; 161 + struct rdt_resource *r = s->res; 162 + u32 cbm_val; 163 + 164 + cfg = &d->staged_config[s->conf_type]; 165 + if (cfg->have_new_ctrl) { 166 + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); 167 + return -EINVAL; 168 + } 169 + 170 + /* 171 + * Cannot set up more than one pseudo-locked region in a cache 172 + * hierarchy. 
173 + */ 174 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 175 + rdtgroup_pseudo_locked_in_hierarchy(d)) { 176 + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); 177 + return -EINVAL; 178 + } 179 + 180 + if (!cbm_validate(data->buf, &cbm_val, r)) 181 + return -EINVAL; 182 + 183 + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || 184 + rdtgrp->mode == RDT_MODE_SHAREABLE) && 185 + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { 186 + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); 187 + return -EINVAL; 188 + } 189 + 190 + /* 191 + * The CBM may not overlap with the CBM of another closid if 192 + * either is exclusive. 193 + */ 194 + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { 195 + rdt_last_cmd_puts("Overlaps with exclusive group\n"); 196 + return -EINVAL; 197 + } 198 + 199 + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { 200 + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || 201 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 202 + rdt_last_cmd_puts("Overlaps with other group\n"); 203 + return -EINVAL; 204 + } 205 + } 206 + 207 + cfg->new_ctrl = cbm_val; 208 + cfg->have_new_ctrl = true; 209 + 210 + return 0; 211 + } 212 + 213 + /* 214 + * For each domain in this resource we expect to find a series of: 215 + * id=mask 216 + * separated by ";". The "id" is in decimal, and must match one of 217 + * the "id"s for this resource. 
218 + */ 219 + static int parse_line(char *line, struct resctrl_schema *s, 220 + struct rdtgroup *rdtgrp) 221 + { 222 + enum resctrl_conf_type t = s->conf_type; 223 + ctrlval_parser_t *parse_ctrlval = NULL; 224 + struct resctrl_staged_config *cfg; 225 + struct rdt_resource *r = s->res; 226 + struct rdt_parse_data data; 227 + struct rdt_ctrl_domain *d; 228 + char *dom = NULL, *id; 229 + unsigned long dom_id; 230 + 231 + /* Walking r->domains, ensure it can't race with cpuhp */ 232 + lockdep_assert_cpus_held(); 233 + 234 + switch (r->schema_fmt) { 235 + case RESCTRL_SCHEMA_BITMAP: 236 + parse_ctrlval = &parse_cbm; 237 + break; 238 + case RESCTRL_SCHEMA_RANGE: 239 + parse_ctrlval = &parse_bw; 240 + break; 241 + } 242 + 243 + if (WARN_ON_ONCE(!parse_ctrlval)) 244 + return -EINVAL; 245 + 246 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && 247 + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { 248 + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); 249 + return -EINVAL; 250 + } 251 + 252 + next: 253 + if (!line || line[0] == '\0') 254 + return 0; 255 + dom = strsep(&line, ";"); 256 + id = strsep(&dom, "="); 257 + if (!dom || kstrtoul(id, 10, &dom_id)) { 258 + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); 259 + return -EINVAL; 260 + } 261 + dom = strim(dom); 262 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 263 + if (d->hdr.id == dom_id) { 264 + data.buf = dom; 265 + data.rdtgrp = rdtgrp; 266 + if (parse_ctrlval(&data, s, d)) 267 + return -EINVAL; 268 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 269 + cfg = &d->staged_config[t]; 270 + /* 271 + * In pseudo-locking setup mode and just 272 + * parsed a valid CBM that should be 273 + * pseudo-locked. Only one locked region per 274 + * resource group and domain so just do 275 + * the required initialization for single 276 + * region and return. 
277 + */ 278 + rdtgrp->plr->s = s; 279 + rdtgrp->plr->d = d; 280 + rdtgrp->plr->cbm = cfg->new_ctrl; 281 + d->plr = rdtgrp->plr; 282 + return 0; 283 + } 284 + goto next; 285 + } 286 + } 287 + return -EINVAL; 288 + } 289 + 290 + static int rdtgroup_parse_resource(char *resname, char *tok, 291 + struct rdtgroup *rdtgrp) 292 + { 293 + struct resctrl_schema *s; 294 + 295 + list_for_each_entry(s, &resctrl_schema_all, list) { 296 + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) 297 + return parse_line(tok, s, rdtgrp); 298 + } 299 + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); 300 + return -EINVAL; 301 + } 302 + 303 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 304 + char *buf, size_t nbytes, loff_t off) 305 + { 306 + struct resctrl_schema *s; 307 + struct rdtgroup *rdtgrp; 308 + struct rdt_resource *r; 309 + char *tok, *resname; 310 + int ret = 0; 311 + 312 + /* Valid input requires a trailing newline */ 313 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 314 + return -EINVAL; 315 + buf[nbytes - 1] = '\0'; 316 + 317 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 318 + if (!rdtgrp) { 319 + rdtgroup_kn_unlock(of->kn); 320 + return -ENOENT; 321 + } 322 + rdt_last_cmd_clear(); 323 + 324 + /* 325 + * No changes to pseudo-locked region allowed. It has to be removed 326 + * and re-created instead. 
327 + */ 328 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 329 + ret = -EINVAL; 330 + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); 331 + goto out; 332 + } 333 + 334 + rdt_staged_configs_clear(); 335 + 336 + while ((tok = strsep(&buf, "\n")) != NULL) { 337 + resname = strim(strsep(&tok, ":")); 338 + if (!tok) { 339 + rdt_last_cmd_puts("Missing ':'\n"); 340 + ret = -EINVAL; 341 + goto out; 342 + } 343 + if (tok[0] == '\0') { 344 + rdt_last_cmd_printf("Missing '%s' value\n", resname); 345 + ret = -EINVAL; 346 + goto out; 347 + } 348 + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); 349 + if (ret) 350 + goto out; 351 + } 352 + 353 + list_for_each_entry(s, &resctrl_schema_all, list) { 354 + r = s->res; 355 + 356 + /* 357 + * Writes to mba_sc resources update the software controller, 358 + * not the control MSR. 359 + */ 360 + if (is_mba_sc(r)) 361 + continue; 362 + 363 + ret = resctrl_arch_update_domains(r, rdtgrp->closid); 364 + if (ret) 365 + goto out; 366 + } 367 + 368 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 369 + /* 370 + * If pseudo-locking fails we keep the resource group in 371 + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service 372 + * active and updated for just the domain the pseudo-locked 373 + * region was requested for. 
374 + */ 375 + ret = rdtgroup_pseudo_lock_create(rdtgrp); 376 + } 377 + 378 + out: 379 + rdt_staged_configs_clear(); 380 + rdtgroup_kn_unlock(of->kn); 381 + return ret ?: nbytes; 382 + } 383 + 384 + static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) 385 + { 386 + struct rdt_resource *r = schema->res; 387 + struct rdt_ctrl_domain *dom; 388 + bool sep = false; 389 + u32 ctrl_val; 390 + 391 + /* Walking r->domains, ensure it can't race with cpuhp */ 392 + lockdep_assert_cpus_held(); 393 + 394 + seq_printf(s, "%*s:", max_name_width, schema->name); 395 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 396 + if (sep) 397 + seq_puts(s, ";"); 398 + 399 + if (is_mba_sc(r)) 400 + ctrl_val = dom->mbps_val[closid]; 401 + else 402 + ctrl_val = resctrl_arch_get_config(r, dom, closid, 403 + schema->conf_type); 404 + 405 + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); 406 + sep = true; 407 + } 408 + seq_puts(s, "\n"); 409 + } 410 + 411 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 412 + struct seq_file *s, void *v) 413 + { 414 + struct resctrl_schema *schema; 415 + struct rdtgroup *rdtgrp; 416 + int ret = 0; 417 + u32 closid; 418 + 419 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 420 + if (rdtgrp) { 421 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 422 + list_for_each_entry(schema, &resctrl_schema_all, list) { 423 + seq_printf(s, "%s:uninitialized\n", schema->name); 424 + } 425 + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 426 + if (!rdtgrp->plr->d) { 427 + rdt_last_cmd_clear(); 428 + rdt_last_cmd_puts("Cache domain offline\n"); 429 + ret = -ENODEV; 430 + } else { 431 + seq_printf(s, "%s:%d=%x\n", 432 + rdtgrp->plr->s->res->name, 433 + rdtgrp->plr->d->hdr.id, 434 + rdtgrp->plr->cbm); 435 + } 436 + } else { 437 + closid = rdtgrp->closid; 438 + list_for_each_entry(schema, &resctrl_schema_all, list) { 439 + if (closid < schema->num_closid) 440 + show_doms(s, schema, closid); 441 + } 442 + } 443 + } else { 444 + 
ret = -ENOENT; 445 + } 446 + rdtgroup_kn_unlock(of->kn); 447 + return ret; 448 + } 449 + 450 + static int smp_mon_event_count(void *arg) 451 + { 452 + mon_event_count(arg); 453 + 454 + return 0; 455 + } 456 + 457 + ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 458 + char *buf, size_t nbytes, loff_t off) 459 + { 460 + struct rdtgroup *rdtgrp; 461 + int ret = 0; 462 + 463 + /* Valid input requires a trailing newline */ 464 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 465 + return -EINVAL; 466 + buf[nbytes - 1] = '\0'; 467 + 468 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 469 + if (!rdtgrp) { 470 + rdtgroup_kn_unlock(of->kn); 471 + return -ENOENT; 472 + } 473 + rdt_last_cmd_clear(); 474 + 475 + if (!strcmp(buf, "mbm_local_bytes")) { 476 + if (resctrl_arch_is_mbm_local_enabled()) 477 + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; 478 + else 479 + ret = -EINVAL; 480 + } else if (!strcmp(buf, "mbm_total_bytes")) { 481 + if (resctrl_arch_is_mbm_total_enabled()) 482 + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; 483 + else 484 + ret = -EINVAL; 485 + } else { 486 + ret = -EINVAL; 487 + } 488 + 489 + if (ret) 490 + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); 491 + 492 + rdtgroup_kn_unlock(of->kn); 493 + 494 + return ret ?: nbytes; 495 + } 496 + 497 + int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 498 + struct seq_file *s, void *v) 499 + { 500 + struct rdtgroup *rdtgrp; 501 + int ret = 0; 502 + 503 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 504 + 505 + if (rdtgrp) { 506 + switch (rdtgrp->mba_mbps_event) { 507 + case QOS_L3_MBM_LOCAL_EVENT_ID: 508 + seq_puts(s, "mbm_local_bytes\n"); 509 + break; 510 + case QOS_L3_MBM_TOTAL_EVENT_ID: 511 + seq_puts(s, "mbm_total_bytes\n"); 512 + break; 513 + default: 514 + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); 515 + ret = -EINVAL; 516 + break; 517 + } 518 + } else { 519 + ret = -ENOENT; 520 + } 521 + 522 + rdtgroup_kn_unlock(of->kn); 523 + 524 + return ret; 525 + } 
526 + 527 + struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, 528 + struct list_head **pos) 529 + { 530 + struct rdt_domain_hdr *d; 531 + struct list_head *l; 532 + 533 + list_for_each(l, h) { 534 + d = list_entry(l, struct rdt_domain_hdr, list); 535 + /* When id is found, return its domain. */ 536 + if (id == d->id) 537 + return d; 538 + /* Stop searching when finding id's position in sorted list. */ 539 + if (id < d->id) 540 + break; 541 + } 542 + 543 + if (pos) 544 + *pos = l; 545 + 546 + return NULL; 547 + } 548 + 549 + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 550 + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 551 + cpumask_t *cpumask, int evtid, int first) 552 + { 553 + int cpu; 554 + 555 + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ 556 + lockdep_assert_cpus_held(); 557 + 558 + /* 559 + * Set up the parameters to pass to mon_event_count() to read the data. 560 + */ 561 + rr->rgrp = rdtgrp; 562 + rr->evtid = evtid; 563 + rr->r = r; 564 + rr->d = d; 565 + rr->first = first; 566 + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); 567 + if (IS_ERR(rr->arch_mon_ctx)) { 568 + rr->err = -EINVAL; 569 + return; 570 + } 571 + 572 + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); 573 + 574 + /* 575 + * cpumask_any_housekeeping() prefers housekeeping CPUs, but 576 + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. 577 + * MPAM's resctrl_arch_rmid_read() is unable to read the 578 + * counters on some platforms if it's called in IRQ context. 
579 + */ 580 + if (tick_nohz_full_cpu(cpu)) 581 + smp_call_function_any(cpumask, mon_event_count, rr, 1); 582 + else 583 + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); 584 + 585 + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); 586 + } 587 + 588 + int rdtgroup_mondata_show(struct seq_file *m, void *arg) 589 + { 590 + struct kernfs_open_file *of = m->private; 591 + enum resctrl_res_level resid; 592 + enum resctrl_event_id evtid; 593 + struct rdt_domain_hdr *hdr; 594 + struct rmid_read rr = {0}; 595 + struct rdt_mon_domain *d; 596 + struct rdtgroup *rdtgrp; 597 + struct rdt_resource *r; 598 + struct mon_data *md; 599 + int domid, ret = 0; 600 + 601 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 602 + if (!rdtgrp) { 603 + ret = -ENOENT; 604 + goto out; 605 + } 606 + 607 + md = of->kn->priv; 608 + if (WARN_ON_ONCE(!md)) { 609 + ret = -EIO; 610 + goto out; 611 + } 612 + 613 + resid = md->rid; 614 + domid = md->domid; 615 + evtid = md->evtid; 616 + r = resctrl_arch_get_resource(resid); 617 + 618 + if (md->sum) { 619 + /* 620 + * This file requires summing across all domains that share 621 + * the L3 cache id that was provided in the "domid" field of the 622 + * struct mon_data. Search all domains in the resource for 623 + * one that matches this cache id. 624 + */ 625 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 626 + if (d->ci->id == domid) { 627 + rr.ci = d->ci; 628 + mon_event_read(&rr, r, NULL, rdtgrp, 629 + &d->ci->shared_cpu_map, evtid, false); 630 + goto checkresult; 631 + } 632 + } 633 + ret = -ENOENT; 634 + goto out; 635 + } else { 636 + /* 637 + * This file provides data from a single domain. Search 638 + * the resource to find the domain with "domid". 
639 + */ 640 + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); 641 + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { 642 + ret = -ENOENT; 643 + goto out; 644 + } 645 + d = container_of(hdr, struct rdt_mon_domain, hdr); 646 + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); 647 + } 648 + 649 + checkresult: 650 + 651 + if (rr.err == -EIO) 652 + seq_puts(m, "Error\n"); 653 + else if (rr.err == -EINVAL) 654 + seq_puts(m, "Unavailable\n"); 655 + else 656 + seq_printf(m, "%llu\n", rr.val); 657 + 658 + out: 659 + rdtgroup_kn_unlock(of->kn); 660 + return ret; 661 + }
+426
fs/resctrl/internal.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _FS_RESCTRL_INTERNAL_H 3 + #define _FS_RESCTRL_INTERNAL_H 4 + 5 + #include <linux/resctrl.h> 6 + #include <linux/kernfs.h> 7 + #include <linux/fs_context.h> 8 + #include <linux/tick.h> 9 + 10 + #define CQM_LIMBOCHECK_INTERVAL 1000 11 + 12 + /** 13 + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that 14 + * aren't marked nohz_full 15 + * @mask: The mask to pick a CPU from. 16 + * @exclude_cpu:The CPU to avoid picking. 17 + * 18 + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping 19 + * CPUs that don't use nohz_full, these are preferred. Pass 20 + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. 21 + * 22 + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 23 + */ 24 + static inline unsigned int 25 + cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) 26 + { 27 + unsigned int cpu; 28 + 29 + /* Try to find a CPU that isn't nohz_full to use in preference */ 30 + if (tick_nohz_full_enabled()) { 31 + cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu); 32 + if (cpu < nr_cpu_ids) 33 + return cpu; 34 + } 35 + 36 + return cpumask_any_but(mask, exclude_cpu); 37 + } 38 + 39 + struct rdt_fs_context { 40 + struct kernfs_fs_context kfc; 41 + bool enable_cdpl2; 42 + bool enable_cdpl3; 43 + bool enable_mba_mbps; 44 + bool enable_debug; 45 + }; 46 + 47 + static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) 48 + { 49 + struct kernfs_fs_context *kfc = fc->fs_private; 50 + 51 + return container_of(kfc, struct rdt_fs_context, kfc); 52 + } 53 + 54 + /** 55 + * struct mon_evt - Entry in the event list of a resource 56 + * @evtid: event id 57 + * @name: name of the event 58 + * @configurable: true if the event is configurable 59 + * @list: entry in &rdt_resource->evt_list 60 + */ 61 + struct mon_evt { 62 + enum resctrl_event_id evtid; 63 + char *name; 64 + bool configurable; 65 + struct list_head 
list; 66 + }; 67 + 68 + /** 69 + * struct mon_data - Monitoring details for each event file. 70 + * @list: Member of the global @mon_data_kn_priv_list list. 71 + * @rid: Resource id associated with the event file. 72 + * @evtid: Event id associated with the event file. 73 + * @sum: Set when event must be summed across multiple 74 + * domains. 75 + * @domid: When @sum is zero this is the domain to which 76 + * the event file belongs. When @sum is one this 77 + * is the id of the L3 cache that all domains to be 78 + * summed share. 79 + * 80 + * Pointed to by the kernfs kn->priv field of monitoring event files. 81 + * Readers and writers must hold rdtgroup_mutex. 82 + */ 83 + struct mon_data { 84 + struct list_head list; 85 + enum resctrl_res_level rid; 86 + enum resctrl_event_id evtid; 87 + int domid; 88 + bool sum; 89 + }; 90 + 91 + /** 92 + * struct rmid_read - Data passed across smp_call*() to read event count. 93 + * @rgrp: Resource group for which the counter is being read. If it is a parent 94 + * resource group then its event count is summed with the count from all 95 + * its child resource groups. 96 + * @r: Resource describing the properties of the event being read. 97 + * @d: Domain that the counter should be read from. If NULL then sum all 98 + * domains in @r sharing L3 @ci.id 99 + * @evtid: Which monitor event to read. 100 + * @first: Initialize MBM counter when true. 101 + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. 102 + * @err: Error encountered when reading counter. 103 + * @val: Returned value of event counter. If @rgrp is a parent resource group, 104 + * @val includes the sum of event counts from its child resource groups. 105 + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, 106 + * (summed across child resource groups if @rgrp is a parent resource group). 107 + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
108 + */ 109 + struct rmid_read { 110 + struct rdtgroup *rgrp; 111 + struct rdt_resource *r; 112 + struct rdt_mon_domain *d; 113 + enum resctrl_event_id evtid; 114 + bool first; 115 + struct cacheinfo *ci; 116 + int err; 117 + u64 val; 118 + void *arch_mon_ctx; 119 + }; 120 + 121 + extern struct list_head resctrl_schema_all; 122 + 123 + extern bool resctrl_mounted; 124 + 125 + enum rdt_group_type { 126 + RDTCTRL_GROUP = 0, 127 + RDTMON_GROUP, 128 + RDT_NUM_GROUP, 129 + }; 130 + 131 + /** 132 + * enum rdtgrp_mode - Mode of a RDT resource group 133 + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations 134 + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed 135 + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking 136 + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations 137 + * allowed AND the allocations are Cache Pseudo-Locked 138 + * @RDT_NUM_MODES: Total number of modes 139 + * 140 + * The mode of a resource group enables control over the allowed overlap 141 + * between allocations associated with different resource groups (classes 142 + * of service). User is able to modify the mode of a resource group by 143 + * writing to the "mode" resctrl file associated with the resource group. 144 + * 145 + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by 146 + * writing the appropriate text to the "mode" file. A resource group enters 147 + * "pseudo-locked" mode after the schemata is written while the resource 148 + * group is in "pseudo-locksetup" mode. 149 + */ 150 + enum rdtgrp_mode { 151 + RDT_MODE_SHAREABLE = 0, 152 + RDT_MODE_EXCLUSIVE, 153 + RDT_MODE_PSEUDO_LOCKSETUP, 154 + RDT_MODE_PSEUDO_LOCKED, 155 + 156 + /* Must be last */ 157 + RDT_NUM_MODES, 158 + }; 159 + 160 + /** 161 + * struct mongroup - store mon group's data in resctrl fs. 
162 + * @mon_data_kn: kernfs node for the mon_data directory 163 + * @parent: parent rdtgrp 164 + * @crdtgrp_list: child rdtgroup node list 165 + * @rmid: rmid for this rdtgroup 166 + */ 167 + struct mongroup { 168 + struct kernfs_node *mon_data_kn; 169 + struct rdtgroup *parent; 170 + struct list_head crdtgrp_list; 171 + u32 rmid; 172 + }; 173 + 174 + /** 175 + * struct rdtgroup - store rdtgroup's data in resctrl file system. 176 + * @kn: kernfs node 177 + * @rdtgroup_list: linked list for all rdtgroups 178 + * @closid: closid for this rdtgroup 179 + * @cpu_mask: CPUs assigned to this rdtgroup 180 + * @flags: status bits 181 + * @waitcount: how many cpus expect to find this 182 + * group when they acquire rdtgroup_mutex 183 + * @type: indicates type of this rdtgroup - either 184 + * monitor only or ctrl_mon group 185 + * @mon: mongroup related data 186 + * @mode: mode of resource group 187 + * @mba_mbps_event: input monitoring event id when mba_sc is enabled 188 + * @plr: pseudo-locked region 189 + */ 190 + struct rdtgroup { 191 + struct kernfs_node *kn; 192 + struct list_head rdtgroup_list; 193 + u32 closid; 194 + struct cpumask cpu_mask; 195 + int flags; 196 + atomic_t waitcount; 197 + enum rdt_group_type type; 198 + struct mongroup mon; 199 + enum rdtgrp_mode mode; 200 + enum resctrl_event_id mba_mbps_event; 201 + struct pseudo_lock_region *plr; 202 + }; 203 + 204 + /* rdtgroup.flags */ 205 + #define RDT_DELETED 1 206 + 207 + /* rftype.flags */ 208 + #define RFTYPE_FLAGS_CPUS_LIST 1 209 + 210 + /* 211 + * Define the file type flags for base and info directories. 
212 + */ 213 + #define RFTYPE_INFO BIT(0) 214 + 215 + #define RFTYPE_BASE BIT(1) 216 + 217 + #define RFTYPE_CTRL BIT(4) 218 + 219 + #define RFTYPE_MON BIT(5) 220 + 221 + #define RFTYPE_TOP BIT(6) 222 + 223 + #define RFTYPE_RES_CACHE BIT(8) 224 + 225 + #define RFTYPE_RES_MB BIT(9) 226 + 227 + #define RFTYPE_DEBUG BIT(10) 228 + 229 + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) 230 + 231 + #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) 232 + 233 + #define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) 234 + 235 + #define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) 236 + 237 + #define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) 238 + 239 + /* List of all resource groups */ 240 + extern struct list_head rdt_all_groups; 241 + 242 + extern int max_name_width; 243 + 244 + /** 245 + * struct rftype - describe each file in the resctrl file system 246 + * @name: File name 247 + * @mode: Access mode 248 + * @kf_ops: File operations 249 + * @flags: File specific RFTYPE_FLAGS_* flags 250 + * @fflags: File specific RFTYPE_* flags 251 + * @seq_show: Show content of the file 252 + * @write: Write to the file 253 + */ 254 + struct rftype { 255 + char *name; 256 + umode_t mode; 257 + const struct kernfs_ops *kf_ops; 258 + unsigned long flags; 259 + unsigned long fflags; 260 + 261 + int (*seq_show)(struct kernfs_open_file *of, 262 + struct seq_file *sf, void *v); 263 + /* 264 + * write() is the generic write callback which maps directly to 265 + * kernfs write operation and overrides all other operations. 266 + * Maximum write size is determined by ->max_write_len. 
267 + */ 268 + ssize_t (*write)(struct kernfs_open_file *of, 269 + char *buf, size_t nbytes, loff_t off); 270 + }; 271 + 272 + /** 273 + * struct mbm_state - status for each MBM counter in each domain 274 + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation 275 + * @prev_bw: The most recent bandwidth in MBps 276 + */ 277 + struct mbm_state { 278 + u64 prev_bw_bytes; 279 + u32 prev_bw; 280 + }; 281 + 282 + extern struct mutex rdtgroup_mutex; 283 + 284 + static inline const char *rdt_kn_name(const struct kernfs_node *kn) 285 + { 286 + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); 287 + } 288 + 289 + extern struct rdtgroup rdtgroup_default; 290 + 291 + extern struct dentry *debugfs_resctrl; 292 + 293 + extern enum resctrl_event_id mba_mbps_default_event; 294 + 295 + void rdt_last_cmd_clear(void); 296 + 297 + void rdt_last_cmd_puts(const char *s); 298 + 299 + __printf(1, 2) 300 + void rdt_last_cmd_printf(const char *fmt, ...); 301 + 302 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); 303 + 304 + void rdtgroup_kn_unlock(struct kernfs_node *kn); 305 + 306 + int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); 307 + 308 + int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 309 + umode_t mask); 310 + 311 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 312 + char *buf, size_t nbytes, loff_t off); 313 + 314 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 315 + struct seq_file *s, void *v); 316 + 317 + ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, 318 + char *buf, size_t nbytes, loff_t off); 319 + 320 + int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, 321 + struct seq_file *s, void *v); 322 + 323 + bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 324 + unsigned long cbm, int closid, bool exclusive); 325 + 326 + unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain 
*d, 327 + unsigned long cbm); 328 + 329 + enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); 330 + 331 + int rdtgroup_tasks_assigned(struct rdtgroup *r); 332 + 333 + int closids_supported(void); 334 + 335 + void closid_free(int closid); 336 + 337 + int alloc_rmid(u32 closid); 338 + 339 + void free_rmid(u32 closid, u32 rmid); 340 + 341 + void resctrl_mon_resource_exit(void); 342 + 343 + void mon_event_count(void *info); 344 + 345 + int rdtgroup_mondata_show(struct seq_file *m, void *arg); 346 + 347 + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, 348 + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, 349 + cpumask_t *cpumask, int evtid, int first); 350 + 351 + int resctrl_mon_resource_init(void); 352 + 353 + void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, 354 + unsigned long delay_ms, 355 + int exclude_cpu); 356 + 357 + void mbm_handle_overflow(struct work_struct *work); 358 + 359 + bool is_mba_sc(struct rdt_resource *r); 360 + 361 + void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 362 + int exclude_cpu); 363 + 364 + void cqm_handle_limbo(struct work_struct *work); 365 + 366 + bool has_busy_rmid(struct rdt_mon_domain *d); 367 + 368 + void __check_limbo(struct rdt_mon_domain *d, bool force_free); 369 + 370 + void resctrl_file_fflags_init(const char *config, unsigned long fflags); 371 + 372 + void rdt_staged_configs_clear(void); 373 + 374 + bool closid_allocated(unsigned int closid); 375 + 376 + int resctrl_find_cleanest_closid(void); 377 + 378 + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 379 + int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); 380 + 381 + int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); 382 + 383 + bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); 384 + 385 + bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); 386 + 387 + int rdt_pseudo_lock_init(void); 388 + 389 + void rdt_pseudo_lock_release(void); 390 + 391 + int 
rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); 392 + 393 + void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); 394 + 395 + #else 396 + static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 397 + { 398 + return -EOPNOTSUPP; 399 + } 400 + 401 + static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 402 + { 403 + return -EOPNOTSUPP; 404 + } 405 + 406 + static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 407 + { 408 + return false; 409 + } 410 + 411 + static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 412 + { 413 + return false; 414 + } 415 + 416 + static inline int rdt_pseudo_lock_init(void) { return 0; } 417 + static inline void rdt_pseudo_lock_release(void) { } 418 + static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 419 + { 420 + return -EOPNOTSUPP; 421 + } 422 + 423 + static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } 424 + #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 425 + 426 + #endif /* _FS_RESCTRL_INTERNAL_H */
+929
fs/resctrl/monitor.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Resource Director Technology(RDT) 4 + * - Monitoring code 5 + * 6 + * Copyright (C) 2017 Intel Corporation 7 + * 8 + * Author: 9 + * Vikas Shivappa <vikas.shivappa@intel.com> 10 + * 11 + * This replaces the cqm.c based on perf but we reuse a lot of 12 + * code and datastructures originally from Peter Zijlstra and Matt Fleming. 13 + * 14 + * More information about RDT be found in the Intel (R) x86 Architecture 15 + * Software Developer Manual June 2016, volume 3, section 17.17. 16 + */ 17 + 18 + #define pr_fmt(fmt) "resctrl: " fmt 19 + 20 + #include <linux/cpu.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/sizes.h> 23 + #include <linux/slab.h> 24 + 25 + #include "internal.h" 26 + 27 + #define CREATE_TRACE_POINTS 28 + 29 + #include "monitor_trace.h" 30 + 31 + /** 32 + * struct rmid_entry - dirty tracking for all RMID. 33 + * @closid: The CLOSID for this entry. 34 + * @rmid: The RMID for this entry. 35 + * @busy: The number of domains with cached data using this RMID. 36 + * @list: Member of the rmid_free_lru list when busy == 0. 37 + * 38 + * Depending on the architecture the correct monitor is accessed using 39 + * both @closid and @rmid, or @rmid only. 40 + * 41 + * Take the rdtgroup_mutex when accessing. 42 + */ 43 + struct rmid_entry { 44 + u32 closid; 45 + u32 rmid; 46 + int busy; 47 + struct list_head list; 48 + }; 49 + 50 + /* 51 + * @rmid_free_lru - A least recently used list of free RMIDs 52 + * These RMIDs are guaranteed to have an occupancy less than the 53 + * threshold occupancy 54 + */ 55 + static LIST_HEAD(rmid_free_lru); 56 + 57 + /* 58 + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. 59 + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. 60 + * Indexed by CLOSID. Protected by rdtgroup_mutex. 
61 + */ 62 + static u32 *closid_num_dirty_rmid; 63 + 64 + /* 65 + * @rmid_limbo_count - count of currently unused but (potentially) 66 + * dirty RMIDs. 67 + * This counts RMIDs that no one is currently using but that 68 + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can 69 + * change the threshold occupancy value. 70 + */ 71 + static unsigned int rmid_limbo_count; 72 + 73 + /* 74 + * @rmid_entry - The entry in the limbo and free lists. 75 + */ 76 + static struct rmid_entry *rmid_ptrs; 77 + 78 + /* 79 + * This is the threshold cache occupancy in bytes at which we will consider an 80 + * RMID available for re-allocation. 81 + */ 82 + unsigned int resctrl_rmid_realloc_threshold; 83 + 84 + /* 85 + * This is the maximum value for the reallocation threshold, in bytes. 86 + */ 87 + unsigned int resctrl_rmid_realloc_limit; 88 + 89 + /* 90 + * x86 and arm64 differ in their handling of monitoring. 91 + * x86's RMID are independent numbers, there is only one source of traffic 92 + * with an RMID value of '1'. 93 + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of 94 + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID 95 + * value is no longer unique. 96 + * To account for this, resctrl uses an index. On x86 this is just the RMID, 97 + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. 98 + * 99 + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code 100 + * must accept an attempt to read every index. 
101 + */ 102 + static inline struct rmid_entry *__rmid_entry(u32 idx) 103 + { 104 + struct rmid_entry *entry; 105 + u32 closid, rmid; 106 + 107 + entry = &rmid_ptrs[idx]; 108 + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); 109 + 110 + WARN_ON_ONCE(entry->closid != closid); 111 + WARN_ON_ONCE(entry->rmid != rmid); 112 + 113 + return entry; 114 + } 115 + 116 + static void limbo_release_entry(struct rmid_entry *entry) 117 + { 118 + lockdep_assert_held(&rdtgroup_mutex); 119 + 120 + rmid_limbo_count--; 121 + list_add_tail(&entry->list, &rmid_free_lru); 122 + 123 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 124 + closid_num_dirty_rmid[entry->closid]--; 125 + } 126 + 127 + /* 128 + * Check the RMIDs that are marked as busy for this domain. If the 129 + * reported LLC occupancy is below the threshold clear the busy bit and 130 + * decrement the count. If the busy count gets to zero on an RMID, we 131 + * free the RMID 132 + */ 133 + void __check_limbo(struct rdt_mon_domain *d, bool force_free) 134 + { 135 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 136 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 137 + struct rmid_entry *entry; 138 + u32 idx, cur_idx = 1; 139 + void *arch_mon_ctx; 140 + bool rmid_dirty; 141 + u64 val = 0; 142 + 143 + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); 144 + if (IS_ERR(arch_mon_ctx)) { 145 + pr_warn_ratelimited("Failed to allocate monitor context: %ld", 146 + PTR_ERR(arch_mon_ctx)); 147 + return; 148 + } 149 + 150 + /* 151 + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that 152 + * are marked as busy for occupancy < threshold. If the occupancy 153 + * is less than the threshold decrement the busy counter of the 154 + * RMID and move it to the free list when the counter reaches 0. 
155 + */ 156 + for (;;) { 157 + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); 158 + if (idx >= idx_limit) 159 + break; 160 + 161 + entry = __rmid_entry(idx); 162 + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, 163 + QOS_L3_OCCUP_EVENT_ID, &val, 164 + arch_mon_ctx)) { 165 + rmid_dirty = true; 166 + } else { 167 + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); 168 + 169 + /* 170 + * x86's CLOSID and RMID are independent numbers, so the entry's 171 + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the 172 + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't 173 + * used to select the configuration. It is thus necessary to track both 174 + * CLOSID and RMID because there may be dependencies between them 175 + * on some architectures. 176 + */ 177 + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); 178 + } 179 + 180 + if (force_free || !rmid_dirty) { 181 + clear_bit(idx, d->rmid_busy_llc); 182 + if (!--entry->busy) 183 + limbo_release_entry(entry); 184 + } 185 + cur_idx = idx + 1; 186 + } 187 + 188 + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); 189 + } 190 + 191 + bool has_busy_rmid(struct rdt_mon_domain *d) 192 + { 193 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 194 + 195 + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; 196 + } 197 + 198 + static struct rmid_entry *resctrl_find_free_rmid(u32 closid) 199 + { 200 + struct rmid_entry *itr; 201 + u32 itr_idx, cmp_idx; 202 + 203 + if (list_empty(&rmid_free_lru)) 204 + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); 205 + 206 + list_for_each_entry(itr, &rmid_free_lru, list) { 207 + /* 208 + * Get the index of this free RMID, and the index it would need 209 + * to be if it were used with this CLOSID. 
210 + * If the CLOSID is irrelevant on this architecture, the two 211 + * index values are always the same on every entry and thus the 212 + * very first entry will be returned. 213 + */ 214 + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); 215 + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); 216 + 217 + if (itr_idx == cmp_idx) 218 + return itr; 219 + } 220 + 221 + return ERR_PTR(-ENOSPC); 222 + } 223 + 224 + /** 225 + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated 226 + * RMID are clean, or the CLOSID that has 227 + * the most clean RMID. 228 + * 229 + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID 230 + * may not be able to allocate clean RMID. To avoid this the allocator will 231 + * choose the CLOSID with the most clean RMID. 232 + * 233 + * When the CLOSID and RMID are independent numbers, the first free CLOSID will 234 + * be returned. 235 + */ 236 + int resctrl_find_cleanest_closid(void) 237 + { 238 + u32 cleanest_closid = ~0; 239 + int i = 0; 240 + 241 + lockdep_assert_held(&rdtgroup_mutex); 242 + 243 + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 244 + return -EIO; 245 + 246 + for (i = 0; i < closids_supported(); i++) { 247 + int num_dirty; 248 + 249 + if (closid_allocated(i)) 250 + continue; 251 + 252 + num_dirty = closid_num_dirty_rmid[i]; 253 + if (num_dirty == 0) 254 + return i; 255 + 256 + if (cleanest_closid == ~0) 257 + cleanest_closid = i; 258 + 259 + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) 260 + cleanest_closid = i; 261 + } 262 + 263 + if (cleanest_closid == ~0) 264 + return -ENOSPC; 265 + 266 + return cleanest_closid; 267 + } 268 + 269 + /* 270 + * For MPAM the RMID value is not unique, and has to be considered with 271 + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which 272 + * allows all domains to be managed by a single free list. 
273 + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 274 + */ 275 + int alloc_rmid(u32 closid) 276 + { 277 + struct rmid_entry *entry; 278 + 279 + lockdep_assert_held(&rdtgroup_mutex); 280 + 281 + entry = resctrl_find_free_rmid(closid); 282 + if (IS_ERR(entry)) 283 + return PTR_ERR(entry); 284 + 285 + list_del(&entry->list); 286 + return entry->rmid; 287 + } 288 + 289 + static void add_rmid_to_limbo(struct rmid_entry *entry) 290 + { 291 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 292 + struct rdt_mon_domain *d; 293 + u32 idx; 294 + 295 + lockdep_assert_held(&rdtgroup_mutex); 296 + 297 + /* Walking r->domains, ensure it can't race with cpuhp */ 298 + lockdep_assert_cpus_held(); 299 + 300 + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); 301 + 302 + entry->busy = 0; 303 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 304 + /* 305 + * For the first limbo RMID in the domain, 306 + * setup up the limbo worker. 307 + */ 308 + if (!has_busy_rmid(d)) 309 + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, 310 + RESCTRL_PICK_ANY_CPU); 311 + set_bit(idx, d->rmid_busy_llc); 312 + entry->busy++; 313 + } 314 + 315 + rmid_limbo_count++; 316 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 317 + closid_num_dirty_rmid[entry->closid]++; 318 + } 319 + 320 + void free_rmid(u32 closid, u32 rmid) 321 + { 322 + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 323 + struct rmid_entry *entry; 324 + 325 + lockdep_assert_held(&rdtgroup_mutex); 326 + 327 + /* 328 + * Do not allow the default rmid to be free'd. Comparing by index 329 + * allows architectures that ignore the closid parameter to avoid an 330 + * unnecessary check. 
331 + */ 332 + if (!resctrl_arch_mon_capable() || 333 + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 334 + RESCTRL_RESERVED_RMID)) 335 + return; 336 + 337 + entry = __rmid_entry(idx); 338 + 339 + if (resctrl_arch_is_llc_occupancy_enabled()) 340 + add_rmid_to_limbo(entry); 341 + else 342 + list_add_tail(&entry->list, &rmid_free_lru); 343 + } 344 + 345 + static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, 346 + u32 rmid, enum resctrl_event_id evtid) 347 + { 348 + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 349 + 350 + switch (evtid) { 351 + case QOS_L3_MBM_TOTAL_EVENT_ID: 352 + return &d->mbm_total[idx]; 353 + case QOS_L3_MBM_LOCAL_EVENT_ID: 354 + return &d->mbm_local[idx]; 355 + default: 356 + return NULL; 357 + } 358 + } 359 + 360 + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) 361 + { 362 + int cpu = smp_processor_id(); 363 + struct rdt_mon_domain *d; 364 + struct mbm_state *m; 365 + int err, ret; 366 + u64 tval = 0; 367 + 368 + if (rr->first) { 369 + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); 370 + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 371 + if (m) 372 + memset(m, 0, sizeof(struct mbm_state)); 373 + return 0; 374 + } 375 + 376 + if (rr->d) { 377 + /* Reading a single domain, must be on a CPU in that domain. */ 378 + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) 379 + return -EINVAL; 380 + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, 381 + rr->evtid, &tval, rr->arch_mon_ctx); 382 + if (rr->err) 383 + return rr->err; 384 + 385 + rr->val += tval; 386 + 387 + return 0; 388 + } 389 + 390 + /* Summing domains that share a cache, must be on a CPU for that cache. */ 391 + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) 392 + return -EINVAL; 393 + 394 + /* 395 + * Legacy files must report the sum of an event across all 396 + * domains that share the same L3 cache instance. 
397 + * Report success if a read from any domain succeeds, -EINVAL 398 + * (translated to "Unavailable" for user space) if reading from 399 + * all domains fail for any reason. 400 + */ 401 + ret = -EINVAL; 402 + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { 403 + if (d->ci->id != rr->ci->id) 404 + continue; 405 + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, 406 + rr->evtid, &tval, rr->arch_mon_ctx); 407 + if (!err) { 408 + rr->val += tval; 409 + ret = 0; 410 + } 411 + } 412 + 413 + if (ret) 414 + rr->err = ret; 415 + 416 + return ret; 417 + } 418 + 419 + /* 420 + * mbm_bw_count() - Update bw count from values previously read by 421 + * __mon_event_count(). 422 + * @closid: The closid used to identify the cached mbm_state. 423 + * @rmid: The rmid used to identify the cached mbm_state. 424 + * @rr: The struct rmid_read populated by __mon_event_count(). 425 + * 426 + * Supporting function to calculate the memory bandwidth 427 + * and delta bandwidth in MBps. The chunks value previously read by 428 + * __mon_event_count() is compared with the chunks value from the previous 429 + * invocation. This must be called once per second to maintain values in MBps. 430 + */ 431 + static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) 432 + { 433 + u64 cur_bw, bytes, cur_bytes; 434 + struct mbm_state *m; 435 + 436 + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 437 + if (WARN_ON_ONCE(!m)) 438 + return; 439 + 440 + cur_bytes = rr->val; 441 + bytes = cur_bytes - m->prev_bw_bytes; 442 + m->prev_bw_bytes = cur_bytes; 443 + 444 + cur_bw = bytes / SZ_1M; 445 + 446 + m->prev_bw = cur_bw; 447 + } 448 + 449 + /* 450 + * This is scheduled by mon_event_read() to read the CQM/MBM counters 451 + * on a domain. 
452 + */ 453 + void mon_event_count(void *info) 454 + { 455 + struct rdtgroup *rdtgrp, *entry; 456 + struct rmid_read *rr = info; 457 + struct list_head *head; 458 + int ret; 459 + 460 + rdtgrp = rr->rgrp; 461 + 462 + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); 463 + 464 + /* 465 + * For Ctrl groups read data from child monitor groups and 466 + * add them together. Count events which are read successfully. 467 + * Discard the rmid_read's reporting errors. 468 + */ 469 + head = &rdtgrp->mon.crdtgrp_list; 470 + 471 + if (rdtgrp->type == RDTCTRL_GROUP) { 472 + list_for_each_entry(entry, head, mon.crdtgrp_list) { 473 + if (__mon_event_count(entry->closid, entry->mon.rmid, 474 + rr) == 0) 475 + ret = 0; 476 + } 477 + } 478 + 479 + /* 480 + * __mon_event_count() calls for newly created monitor groups may 481 + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. 482 + * Discard error if any of the monitor event reads succeeded. 483 + */ 484 + if (ret == 0) 485 + rr->err = 0; 486 + } 487 + 488 + static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, 489 + struct rdt_resource *r) 490 + { 491 + struct rdt_ctrl_domain *d; 492 + 493 + lockdep_assert_cpus_held(); 494 + 495 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 496 + /* Find the domain that contains this CPU */ 497 + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 498 + return d; 499 + } 500 + 501 + return NULL; 502 + } 503 + 504 + /* 505 + * Feedback loop for MBA software controller (mba_sc) 506 + * 507 + * mba_sc is a feedback loop where we periodically read MBM counters and 508 + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so 509 + * that: 510 + * 511 + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) 512 + * 513 + * This uses the MBM counters to measure the bandwidth and MBA throttle 514 + * MSRs to control the bandwidth for a particular rdtgrp. 
It builds on the 515 + * fact that resctrl rdtgroups have both monitoring and control. 516 + * 517 + * The frequency of the checks is 1s and we just tag along the MBM overflow 518 + * timer. Having 1s interval makes the calculation of bandwidth simpler. 519 + * 520 + * Although MBA's goal is to restrict the bandwidth to a maximum, there may 521 + * be a need to increase the bandwidth to avoid unnecessarily restricting 522 + * the L2 <-> L3 traffic. 523 + * 524 + * Since MBA controls the L2 external bandwidth whereas MBM measures the 525 + * L3 external bandwidth the following sequence could lead to such a 526 + * situation. 527 + * 528 + * Consider an rdtgroup which had high L3 <-> memory traffic in initial 529 + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but 530 + * after some time rdtgroup has mostly L2 <-> L3 traffic. 531 + * 532 + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its 533 + * throttle MSRs already have low percentage values. To avoid 534 + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
535 + */ 536 + static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) 537 + { 538 + u32 closid, rmid, cur_msr_val, new_msr_val; 539 + struct mbm_state *pmbm_data, *cmbm_data; 540 + struct rdt_ctrl_domain *dom_mba; 541 + enum resctrl_event_id evt_id; 542 + struct rdt_resource *r_mba; 543 + struct list_head *head; 544 + struct rdtgroup *entry; 545 + u32 cur_bw, user_bw; 546 + 547 + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 548 + evt_id = rgrp->mba_mbps_event; 549 + 550 + closid = rgrp->closid; 551 + rmid = rgrp->mon.rmid; 552 + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); 553 + if (WARN_ON_ONCE(!pmbm_data)) 554 + return; 555 + 556 + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); 557 + if (!dom_mba) { 558 + pr_warn_once("Failure to get domain for MBA update\n"); 559 + return; 560 + } 561 + 562 + cur_bw = pmbm_data->prev_bw; 563 + user_bw = dom_mba->mbps_val[closid]; 564 + 565 + /* MBA resource doesn't support CDP */ 566 + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); 567 + 568 + /* 569 + * For Ctrl groups read data from child monitor groups. 570 + */ 571 + head = &rgrp->mon.crdtgrp_list; 572 + list_for_each_entry(entry, head, mon.crdtgrp_list) { 573 + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); 574 + if (WARN_ON_ONCE(!cmbm_data)) 575 + return; 576 + cur_bw += cmbm_data->prev_bw; 577 + } 578 + 579 + /* 580 + * Scale up/down the bandwidth linearly for the ctrl group. The 581 + * bandwidth step is the bandwidth granularity specified by the 582 + * hardware. 583 + * Always increase throttling if current bandwidth is above the 584 + * target set by user. 585 + * But avoid thrashing up and down on every poll by checking 586 + * whether a decrease in throttling is likely to push the group 587 + * back over target. E.g. 
if currently throttling to 30% of bandwidth 588 + * on a system with 10% granularity steps, check whether moving to 589 + * 40% would go past the limit by multiplying current bandwidth by 590 + * "(30 + 10) / 30". 591 + */ 592 + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { 593 + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; 594 + } else if (cur_msr_val < MAX_MBA_BW && 595 + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { 596 + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; 597 + } else { 598 + return; 599 + } 600 + 601 + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); 602 + } 603 + 604 + static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, 605 + u32 closid, u32 rmid, enum resctrl_event_id evtid) 606 + { 607 + struct rmid_read rr = {0}; 608 + 609 + rr.r = r; 610 + rr.d = d; 611 + rr.evtid = evtid; 612 + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 613 + if (IS_ERR(rr.arch_mon_ctx)) { 614 + pr_warn_ratelimited("Failed to allocate monitor context: %ld", 615 + PTR_ERR(rr.arch_mon_ctx)); 616 + return; 617 + } 618 + 619 + __mon_event_count(closid, rmid, &rr); 620 + 621 + /* 622 + * If the software controller is enabled, compute the 623 + * bandwidth for this event id. 624 + */ 625 + if (is_mba_sc(NULL)) 626 + mbm_bw_count(closid, rmid, &rr); 627 + 628 + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 629 + } 630 + 631 + static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, 632 + u32 closid, u32 rmid) 633 + { 634 + /* 635 + * This is protected from concurrent reads from user as both 636 + * the user and overflow handler hold the global mutex. 
637 + */ 638 + if (resctrl_arch_is_mbm_total_enabled()) 639 + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); 640 + 641 + if (resctrl_arch_is_mbm_local_enabled()) 642 + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); 643 + } 644 + 645 + /* 646 + * Handler to scan the limbo list and move the RMIDs 647 + * to free list whose occupancy < threshold_occupancy. 648 + */ 649 + void cqm_handle_limbo(struct work_struct *work) 650 + { 651 + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); 652 + struct rdt_mon_domain *d; 653 + 654 + cpus_read_lock(); 655 + mutex_lock(&rdtgroup_mutex); 656 + 657 + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); 658 + 659 + __check_limbo(d, false); 660 + 661 + if (has_busy_rmid(d)) { 662 + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 663 + RESCTRL_PICK_ANY_CPU); 664 + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, 665 + delay); 666 + } 667 + 668 + mutex_unlock(&rdtgroup_mutex); 669 + cpus_read_unlock(); 670 + } 671 + 672 + /** 673 + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this 674 + * domain. 675 + * @dom: The domain the limbo handler should run for. 676 + * @delay_ms: How far in the future the handler should run. 677 + * @exclude_cpu: Which CPU the handler should not run on, 678 + * RESCTRL_PICK_ANY_CPU to pick any CPU. 
679 + */ 680 + void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 681 + int exclude_cpu) 682 + { 683 + unsigned long delay = msecs_to_jiffies(delay_ms); 684 + int cpu; 685 + 686 + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 687 + dom->cqm_work_cpu = cpu; 688 + 689 + if (cpu < nr_cpu_ids) 690 + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); 691 + } 692 + 693 + void mbm_handle_overflow(struct work_struct *work) 694 + { 695 + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); 696 + struct rdtgroup *prgrp, *crgrp; 697 + struct rdt_mon_domain *d; 698 + struct list_head *head; 699 + struct rdt_resource *r; 700 + 701 + cpus_read_lock(); 702 + mutex_lock(&rdtgroup_mutex); 703 + 704 + /* 705 + * If the filesystem has been unmounted this work no longer needs to 706 + * run. 707 + */ 708 + if (!resctrl_mounted || !resctrl_arch_mon_capable()) 709 + goto out_unlock; 710 + 711 + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 712 + d = container_of(work, struct rdt_mon_domain, mbm_over.work); 713 + 714 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 715 + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); 716 + 717 + head = &prgrp->mon.crdtgrp_list; 718 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) 719 + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); 720 + 721 + if (is_mba_sc(NULL)) 722 + update_mba_bw(prgrp, d); 723 + } 724 + 725 + /* 726 + * Re-check for housekeeping CPUs. This allows the overflow handler to 727 + * move off a nohz_full CPU quickly. 728 + */ 729 + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 730 + RESCTRL_PICK_ANY_CPU); 731 + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); 732 + 733 + out_unlock: 734 + mutex_unlock(&rdtgroup_mutex); 735 + cpus_read_unlock(); 736 + } 737 + 738 + /** 739 + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this 740 + * domain. 
741 + * @dom: The domain the overflow handler should run for. 742 + * @delay_ms: How far in the future the handler should run. 743 + * @exclude_cpu: Which CPU the handler should not run on, 744 + * RESCTRL_PICK_ANY_CPU to pick any CPU. 745 + */ 746 + void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 747 + int exclude_cpu) 748 + { 749 + unsigned long delay = msecs_to_jiffies(delay_ms); 750 + int cpu; 751 + 752 + /* 753 + * When a domain comes online there is no guarantee the filesystem is 754 + * mounted. If not, there is no need to catch counter overflow. 755 + */ 756 + if (!resctrl_mounted || !resctrl_arch_mon_capable()) 757 + return; 758 + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 759 + dom->mbm_work_cpu = cpu; 760 + 761 + if (cpu < nr_cpu_ids) 762 + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); 763 + } 764 + 765 + static int dom_data_init(struct rdt_resource *r) 766 + { 767 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 768 + u32 num_closid = resctrl_arch_get_num_closid(r); 769 + struct rmid_entry *entry = NULL; 770 + int err = 0, i; 771 + u32 idx; 772 + 773 + mutex_lock(&rdtgroup_mutex); 774 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 775 + u32 *tmp; 776 + 777 + /* 778 + * If the architecture hasn't provided a sanitised value here, 779 + * this may result in larger arrays than necessary. Resctrl will 780 + * use a smaller system wide value based on the resources in 781 + * use. 
782 + */ 783 + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); 784 + if (!tmp) { 785 + err = -ENOMEM; 786 + goto out_unlock; 787 + } 788 + 789 + closid_num_dirty_rmid = tmp; 790 + } 791 + 792 + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); 793 + if (!rmid_ptrs) { 794 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 795 + kfree(closid_num_dirty_rmid); 796 + closid_num_dirty_rmid = NULL; 797 + } 798 + err = -ENOMEM; 799 + goto out_unlock; 800 + } 801 + 802 + for (i = 0; i < idx_limit; i++) { 803 + entry = &rmid_ptrs[i]; 804 + INIT_LIST_HEAD(&entry->list); 805 + 806 + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); 807 + list_add_tail(&entry->list, &rmid_free_lru); 808 + } 809 + 810 + /* 811 + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and 812 + * are always allocated. These are used for the rdtgroup_default 813 + * control group, which will be setup later in resctrl_init(). 814 + */ 815 + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 816 + RESCTRL_RESERVED_RMID); 817 + entry = __rmid_entry(idx); 818 + list_del(&entry->list); 819 + 820 + out_unlock: 821 + mutex_unlock(&rdtgroup_mutex); 822 + 823 + return err; 824 + } 825 + 826 + static void dom_data_exit(struct rdt_resource *r) 827 + { 828 + mutex_lock(&rdtgroup_mutex); 829 + 830 + if (!r->mon_capable) 831 + goto out_unlock; 832 + 833 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 834 + kfree(closid_num_dirty_rmid); 835 + closid_num_dirty_rmid = NULL; 836 + } 837 + 838 + kfree(rmid_ptrs); 839 + rmid_ptrs = NULL; 840 + 841 + out_unlock: 842 + mutex_unlock(&rdtgroup_mutex); 843 + } 844 + 845 + static struct mon_evt llc_occupancy_event = { 846 + .name = "llc_occupancy", 847 + .evtid = QOS_L3_OCCUP_EVENT_ID, 848 + }; 849 + 850 + static struct mon_evt mbm_total_event = { 851 + .name = "mbm_total_bytes", 852 + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, 853 + }; 854 + 855 + static struct mon_evt mbm_local_event = { 856 + .name = 
"mbm_local_bytes", 857 + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, 858 + }; 859 + 860 + /* 861 + * Initialize the event list for the resource. 862 + * 863 + * Note that MBM events are also part of RDT_RESOURCE_L3 resource 864 + * because as per the SDM the total and local memory bandwidth 865 + * are enumerated as part of L3 monitoring. 866 + */ 867 + static void l3_mon_evt_init(struct rdt_resource *r) 868 + { 869 + INIT_LIST_HEAD(&r->evt_list); 870 + 871 + if (resctrl_arch_is_llc_occupancy_enabled()) 872 + list_add_tail(&llc_occupancy_event.list, &r->evt_list); 873 + if (resctrl_arch_is_mbm_total_enabled()) 874 + list_add_tail(&mbm_total_event.list, &r->evt_list); 875 + if (resctrl_arch_is_mbm_local_enabled()) 876 + list_add_tail(&mbm_local_event.list, &r->evt_list); 877 + } 878 + 879 + /** 880 + * resctrl_mon_resource_init() - Initialise global monitoring structures. 881 + * 882 + * Allocate and initialise global monitor resources that do not belong to a 883 + * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 884 + * Called once during boot after the struct rdt_resource's have been configured 885 + * but before the filesystem is mounted. 886 + * Resctrl's cpuhp callbacks may be called before this point to bring a domain 887 + * online. 888 + * 889 + * Returns 0 for success, or -ENOMEM. 
890 + */ 891 + int resctrl_mon_resource_init(void) 892 + { 893 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 894 + int ret; 895 + 896 + if (!r->mon_capable) 897 + return 0; 898 + 899 + ret = dom_data_init(r); 900 + if (ret) 901 + return ret; 902 + 903 + l3_mon_evt_init(r); 904 + 905 + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { 906 + mbm_total_event.configurable = true; 907 + resctrl_file_fflags_init("mbm_total_bytes_config", 908 + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 909 + } 910 + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { 911 + mbm_local_event.configurable = true; 912 + resctrl_file_fflags_init("mbm_local_bytes_config", 913 + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); 914 + } 915 + 916 + if (resctrl_arch_is_mbm_local_enabled()) 917 + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; 918 + else if (resctrl_arch_is_mbm_total_enabled()) 919 + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; 920 + 921 + return 0; 922 + } 923 + 924 + void resctrl_mon_resource_exit(void) 925 + { 926 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 927 + 928 + dom_data_exit(r); 929 + }
+33
fs/resctrl/monitor_trace.h
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Tracepoint definitions for the "resctrl" trace system.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl

/*
 * Unlike a plain include guard, the body is processed again whenever
 * TRACE_HEADER_MULTI_READ is defined.
 */
#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _FS_RESCTRL_MONITOR_TRACE_H

#include <linux/tracepoint.h>

/*
 * mon_llc_occupancy_limbo - records a control hw id, a monitor hw id, a
 * domain id and an LLC occupancy value in bytes, and prints all four.
 * NOTE(review): judging by the name this is emitted while limbo RMIDs are
 * checked - confirm against the call site.
 */
TRACE_EVENT(mon_llc_occupancy_limbo,
	    TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
	    TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
	    TP_STRUCT__entry(__field(u32, ctrl_hw_id)
			     __field(u32, mon_hw_id)
			     __field(int, domain_id)
			     __field(u64, llc_occupancy_bytes)),
	    TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
			   __entry->mon_hw_id = mon_hw_id;
			   __entry->domain_id = domain_id;
			   __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
	    TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
		      __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
		      __entry->llc_occupancy_bytes)
	   );

#endif /* _FS_RESCTRL_MONITOR_TRACE_H */

/*
 * Point define_trace.h at this file: the current directory and the header
 * name without its extension.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .

#define TRACE_INCLUDE_FILE monitor_trace

#include <trace/define_trace.h>
+1105
fs/resctrl/pseudo_lock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Resource Director Technology (RDT) 4 + * 5 + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) 6 + * 7 + * Copyright (C) 2018 Intel Corporation 8 + * 9 + * Author: Reinette Chatre <reinette.chatre@intel.com> 10 + */ 11 + 12 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 + 14 + #include <linux/cacheinfo.h> 15 + #include <linux/cpu.h> 16 + #include <linux/cpumask.h> 17 + #include <linux/debugfs.h> 18 + #include <linux/kthread.h> 19 + #include <linux/mman.h> 20 + #include <linux/pm_qos.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/slab.h> 23 + #include <linux/uaccess.h> 24 + 25 + #include "internal.h" 26 + 27 + /* 28 + * Major number assigned to and shared by all devices exposing 29 + * pseudo-locked regions. 30 + */ 31 + static unsigned int pseudo_lock_major; 32 + 33 + static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); 34 + 35 + static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) 36 + { 37 + const struct rdtgroup *rdtgrp; 38 + 39 + rdtgrp = dev_get_drvdata(dev); 40 + if (mode) 41 + *mode = 0600; 42 + guard(mutex)(&rdtgroup_mutex); 43 + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); 44 + } 45 + 46 + static const struct class pseudo_lock_class = { 47 + .name = "pseudo_lock", 48 + .devnode = pseudo_lock_devnode, 49 + }; 50 + 51 + /** 52 + * pseudo_lock_minor_get - Obtain available minor number 53 + * @minor: Pointer to where new minor number will be stored 54 + * 55 + * A bitmask is used to track available minor numbers. Here the next free 56 + * minor number is marked as unavailable and returned. 57 + * 58 + * Return: 0 on success, <0 on failure. 
59 + */ 60 + static int pseudo_lock_minor_get(unsigned int *minor) 61 + { 62 + unsigned long first_bit; 63 + 64 + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); 65 + 66 + if (first_bit == MINORBITS) 67 + return -ENOSPC; 68 + 69 + __clear_bit(first_bit, &pseudo_lock_minor_avail); 70 + *minor = first_bit; 71 + 72 + return 0; 73 + } 74 + 75 + /** 76 + * pseudo_lock_minor_release - Return minor number to available 77 + * @minor: The minor number made available 78 + */ 79 + static void pseudo_lock_minor_release(unsigned int minor) 80 + { 81 + __set_bit(minor, &pseudo_lock_minor_avail); 82 + } 83 + 84 + /** 85 + * region_find_by_minor - Locate a pseudo-lock region by inode minor number 86 + * @minor: The minor number of the device representing pseudo-locked region 87 + * 88 + * When the character device is accessed we need to determine which 89 + * pseudo-locked region it belongs to. This is done by matching the minor 90 + * number of the device to the pseudo-locked region it belongs. 91 + * 92 + * Minor numbers are assigned at the time a pseudo-locked region is associated 93 + * with a cache instance. 94 + * 95 + * Return: On success return pointer to resource group owning the pseudo-locked 96 + * region, NULL on failure. 
97 + */ 98 + static struct rdtgroup *region_find_by_minor(unsigned int minor) 99 + { 100 + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; 101 + 102 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 103 + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { 104 + rdtgrp_match = rdtgrp; 105 + break; 106 + } 107 + } 108 + return rdtgrp_match; 109 + } 110 + 111 + /** 112 + * struct pseudo_lock_pm_req - A power management QoS request list entry 113 + * @list: Entry within the @pm_reqs list for a pseudo-locked region 114 + * @req: PM QoS request 115 + */ 116 + struct pseudo_lock_pm_req { 117 + struct list_head list; 118 + struct dev_pm_qos_request req; 119 + }; 120 + 121 + static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) 122 + { 123 + struct pseudo_lock_pm_req *pm_req, *next; 124 + 125 + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { 126 + dev_pm_qos_remove_request(&pm_req->req); 127 + list_del(&pm_req->list); 128 + kfree(pm_req); 129 + } 130 + } 131 + 132 + /** 133 + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 134 + * @plr: Pseudo-locked region 135 + * 136 + * To prevent the cache from being affected by power management entering 137 + * C6 has to be avoided. This is accomplished by requesting a latency 138 + * requirement lower than lowest C6 exit latency of all supported 139 + * platforms as found in the cpuidle state tables in the intel_idle driver. 140 + * At this time it is possible to do so with a single latency requirement 141 + * for all supported platforms. 142 + * 143 + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, 144 + * the ACPI latencies need to be considered while keeping in mind that C2 145 + * may be set to map to deeper sleep states. In this case the latency 146 + * requirement needs to prevent entering C2 also. 
147 + * 148 + * Return: 0 on success, <0 on failure 149 + */ 150 + static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) 151 + { 152 + struct pseudo_lock_pm_req *pm_req; 153 + int cpu; 154 + int ret; 155 + 156 + for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { 157 + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); 158 + if (!pm_req) { 159 + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); 160 + ret = -ENOMEM; 161 + goto out_err; 162 + } 163 + ret = dev_pm_qos_add_request(get_cpu_device(cpu), 164 + &pm_req->req, 165 + DEV_PM_QOS_RESUME_LATENCY, 166 + 30); 167 + if (ret < 0) { 168 + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", 169 + cpu); 170 + kfree(pm_req); 171 + ret = -1; 172 + goto out_err; 173 + } 174 + list_add(&pm_req->list, &plr->pm_reqs); 175 + } 176 + 177 + return 0; 178 + 179 + out_err: 180 + pseudo_lock_cstates_relax(plr); 181 + return ret; 182 + } 183 + 184 + /** 185 + * pseudo_lock_region_clear - Reset pseudo-lock region data 186 + * @plr: pseudo-lock region 187 + * 188 + * All content of the pseudo-locked region is reset - any memory allocated 189 + * freed. 190 + * 191 + * Return: void 192 + */ 193 + static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) 194 + { 195 + plr->size = 0; 196 + plr->line_size = 0; 197 + kfree(plr->kmem); 198 + plr->kmem = NULL; 199 + plr->s = NULL; 200 + if (plr->d) 201 + plr->d->plr = NULL; 202 + plr->d = NULL; 203 + plr->cbm = 0; 204 + plr->debugfs_dir = NULL; 205 + } 206 + 207 + /** 208 + * pseudo_lock_region_init - Initialize pseudo-lock region information 209 + * @plr: pseudo-lock region 210 + * 211 + * Called after user provided a schemata to be pseudo-locked. From the 212 + * schemata the &struct pseudo_lock_region is on entry already initialized 213 + * with the resource, domain, and capacity bitmask. Here the information 214 + * required for pseudo-locking is deduced from this data and &struct 215 + * pseudo_lock_region initialized further. 
This information includes: 216 + * - size in bytes of the region to be pseudo-locked 217 + * - cache line size to know the stride with which data needs to be accessed 218 + * to be pseudo-locked 219 + * - a cpu associated with the cache instance on which the pseudo-locking 220 + * flow can be executed 221 + * 222 + * Return: 0 on success, <0 on failure. Descriptive error will be written 223 + * to last_cmd_status buffer. 224 + */ 225 + static int pseudo_lock_region_init(struct pseudo_lock_region *plr) 226 + { 227 + enum resctrl_scope scope = plr->s->res->ctrl_scope; 228 + struct cacheinfo *ci; 229 + int ret; 230 + 231 + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) 232 + return -ENODEV; 233 + 234 + /* Pick the first cpu we find that is associated with the cache. */ 235 + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); 236 + 237 + if (!cpu_online(plr->cpu)) { 238 + rdt_last_cmd_printf("CPU %u associated with cache not online\n", 239 + plr->cpu); 240 + ret = -ENODEV; 241 + goto out_region; 242 + } 243 + 244 + ci = get_cpu_cacheinfo_level(plr->cpu, scope); 245 + if (ci) { 246 + plr->line_size = ci->coherency_line_size; 247 + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); 248 + return 0; 249 + } 250 + 251 + ret = -1; 252 + rdt_last_cmd_puts("Unable to determine cache line size\n"); 253 + out_region: 254 + pseudo_lock_region_clear(plr); 255 + return ret; 256 + } 257 + 258 + /** 259 + * pseudo_lock_init - Initialize a pseudo-lock region 260 + * @rdtgrp: resource group to which new pseudo-locked region will belong 261 + * 262 + * A pseudo-locked region is associated with a resource group. When this 263 + * association is created the pseudo-locked region is initialized. The 264 + * details of the pseudo-locked region are not known at this time so only 265 + * allocation is done and association established. 
266 + * 267 + * Return: 0 on success, <0 on failure 268 + */ 269 + static int pseudo_lock_init(struct rdtgroup *rdtgrp) 270 + { 271 + struct pseudo_lock_region *plr; 272 + 273 + plr = kzalloc(sizeof(*plr), GFP_KERNEL); 274 + if (!plr) 275 + return -ENOMEM; 276 + 277 + init_waitqueue_head(&plr->lock_thread_wq); 278 + INIT_LIST_HEAD(&plr->pm_reqs); 279 + rdtgrp->plr = plr; 280 + return 0; 281 + } 282 + 283 + /** 284 + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked 285 + * @plr: pseudo-lock region 286 + * 287 + * Initialize the details required to set up the pseudo-locked region and 288 + * allocate the contiguous memory that will be pseudo-locked to the cache. 289 + * 290 + * Return: 0 on success, <0 on failure. Descriptive error will be written 291 + * to last_cmd_status buffer. 292 + */ 293 + static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) 294 + { 295 + int ret; 296 + 297 + ret = pseudo_lock_region_init(plr); 298 + if (ret < 0) 299 + return ret; 300 + 301 + /* 302 + * We do not yet support contiguous regions larger than 303 + * KMALLOC_MAX_SIZE. 304 + */ 305 + if (plr->size > KMALLOC_MAX_SIZE) { 306 + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); 307 + ret = -E2BIG; 308 + goto out_region; 309 + } 310 + 311 + plr->kmem = kzalloc(plr->size, GFP_KERNEL); 312 + if (!plr->kmem) { 313 + rdt_last_cmd_puts("Unable to allocate memory\n"); 314 + ret = -ENOMEM; 315 + goto out_region; 316 + } 317 + 318 + ret = 0; 319 + goto out; 320 + out_region: 321 + pseudo_lock_region_clear(plr); 322 + out: 323 + return ret; 324 + } 325 + 326 + /** 327 + * pseudo_lock_free - Free a pseudo-locked region 328 + * @rdtgrp: resource group to which pseudo-locked region belonged 329 + * 330 + * The pseudo-locked region's resources have already been released, or not 331 + * yet created at this point. Now it can be freed and disassociated from the 332 + * resource group. 
333 + * 334 + * Return: void 335 + */ 336 + static void pseudo_lock_free(struct rdtgroup *rdtgrp) 337 + { 338 + pseudo_lock_region_clear(rdtgrp->plr); 339 + kfree(rdtgrp->plr); 340 + rdtgrp->plr = NULL; 341 + } 342 + 343 + /** 344 + * rdtgroup_monitor_in_progress - Test if monitoring in progress 345 + * @rdtgrp: resource group being queried 346 + * 347 + * Return: 1 if monitor groups have been created for this resource 348 + * group, 0 otherwise. 349 + */ 350 + static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) 351 + { 352 + return !list_empty(&rdtgrp->mon.crdtgrp_list); 353 + } 354 + 355 + /** 356 + * rdtgroup_locksetup_user_restrict - Restrict user access to group 357 + * @rdtgrp: resource group needing access restricted 358 + * 359 + * A resource group used for cache pseudo-locking cannot have cpus or tasks 360 + * assigned to it. This is communicated to the user by restricting access 361 + * to all the files that can be used to make such changes. 362 + * 363 + * Permissions restored with rdtgroup_locksetup_user_restore() 364 + * 365 + * Return: 0 on success, <0 on failure. If a failure occurs during the 366 + * restriction of access an attempt will be made to restore permissions but 367 + * the state of the mode of these files will be uncertain when a failure 368 + * occurs. 
369 + */ 370 + static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) 371 + { 372 + int ret; 373 + 374 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 375 + if (ret) 376 + return ret; 377 + 378 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 379 + if (ret) 380 + goto err_tasks; 381 + 382 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 383 + if (ret) 384 + goto err_cpus; 385 + 386 + if (resctrl_arch_mon_capable()) { 387 + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); 388 + if (ret) 389 + goto err_cpus_list; 390 + } 391 + 392 + ret = 0; 393 + goto out; 394 + 395 + err_cpus_list: 396 + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 397 + err_cpus: 398 + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 399 + err_tasks: 400 + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 401 + out: 402 + return ret; 403 + } 404 + 405 + /** 406 + * rdtgroup_locksetup_user_restore - Restore user access to group 407 + * @rdtgrp: resource group needing access restored 408 + * 409 + * Restore all file access previously removed using 410 + * rdtgroup_locksetup_user_restrict() 411 + * 412 + * Return: 0 on success, <0 on failure. If a failure occurs during the 413 + * restoration of access an attempt will be made to restrict permissions 414 + * again but the state of the mode of these files will be uncertain when 415 + * a failure occurs. 
416 + */ 417 + static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) 418 + { 419 + int ret; 420 + 421 + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); 422 + if (ret) 423 + return ret; 424 + 425 + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); 426 + if (ret) 427 + goto err_tasks; 428 + 429 + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); 430 + if (ret) 431 + goto err_cpus; 432 + 433 + if (resctrl_arch_mon_capable()) { 434 + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); 435 + if (ret) 436 + goto err_cpus_list; 437 + } 438 + 439 + ret = 0; 440 + goto out; 441 + 442 + err_cpus_list: 443 + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); 444 + err_cpus: 445 + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); 446 + err_tasks: 447 + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); 448 + out: 449 + return ret; 450 + } 451 + 452 + /** 453 + * rdtgroup_locksetup_enter - Resource group enters locksetup mode 454 + * @rdtgrp: resource group requested to enter locksetup mode 455 + * 456 + * A resource group enters locksetup mode to reflect that it would be used 457 + * to represent a pseudo-locked region and is in the process of being set 458 + * up to do so. A resource group used for a pseudo-locked region would 459 + * lose the closid associated with it so we cannot allow it to have any 460 + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the 461 + * future. Monitoring of a pseudo-locked region is not allowed either. 462 + * 463 + * The above and more restrictions on a pseudo-locked region are checked 464 + * for and enforced before the resource group enters the locksetup mode. 465 + * 466 + * Returns: 0 if the resource group successfully entered locksetup mode, <0 467 + * on failure. On failure the last_cmd_status buffer is updated with text to 468 + * communicate details of failure to the user. 
469 + */ 470 + int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) 471 + { 472 + int ret; 473 + 474 + /* 475 + * The default resource group can neither be removed nor lose the 476 + * default closid associated with it. 477 + */ 478 + if (rdtgrp == &rdtgroup_default) { 479 + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); 480 + return -EINVAL; 481 + } 482 + 483 + /* 484 + * Cache Pseudo-locking not supported when CDP is enabled. 485 + * 486 + * Some things to consider if you would like to enable this 487 + * support (using L3 CDP as example): 488 + * - When CDP is enabled two separate resources are exposed, 489 + * L3DATA and L3CODE, but they are actually on the same cache. 490 + * The implication for pseudo-locking is that if a 491 + * pseudo-locked region is created on a domain of one 492 + * resource (eg. L3CODE), then a pseudo-locked region cannot 493 + * be created on that same domain of the other resource 494 + * (eg. L3DATA). This is because the creation of a 495 + * pseudo-locked region involves a call to wbinvd that will 496 + * affect all cache allocations on particular domain. 497 + * - Considering the previous, it may be possible to only 498 + * expose one of the CDP resources to pseudo-locking and 499 + * hide the other. For example, we could consider to only 500 + * expose L3DATA and since the L3 cache is unified it is 501 + * still possible to place instructions there are execute it. 502 + * - If only one region is exposed to pseudo-locking we should 503 + * still keep in mind that availability of a portion of cache 504 + * for pseudo-locking should take into account both resources. 505 + * Similarly, if a pseudo-locked region is created in one 506 + * resource, the portion of cache used by it should be made 507 + * unavailable to all future allocations from both resources. 
508 + */ 509 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || 510 + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { 511 + rdt_last_cmd_puts("CDP enabled\n"); 512 + return -EINVAL; 513 + } 514 + 515 + /* 516 + * Not knowing the bits to disable prefetching implies that this 517 + * platform does not support Cache Pseudo-Locking. 518 + */ 519 + if (resctrl_arch_get_prefetch_disable_bits() == 0) { 520 + rdt_last_cmd_puts("Pseudo-locking not supported\n"); 521 + return -EINVAL; 522 + } 523 + 524 + if (rdtgroup_monitor_in_progress(rdtgrp)) { 525 + rdt_last_cmd_puts("Monitoring in progress\n"); 526 + return -EINVAL; 527 + } 528 + 529 + if (rdtgroup_tasks_assigned(rdtgrp)) { 530 + rdt_last_cmd_puts("Tasks assigned to resource group\n"); 531 + return -EINVAL; 532 + } 533 + 534 + if (!cpumask_empty(&rdtgrp->cpu_mask)) { 535 + rdt_last_cmd_puts("CPUs assigned to resource group\n"); 536 + return -EINVAL; 537 + } 538 + 539 + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { 540 + rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); 541 + return -EIO; 542 + } 543 + 544 + ret = pseudo_lock_init(rdtgrp); 545 + if (ret) { 546 + rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); 547 + goto out_release; 548 + } 549 + 550 + /* 551 + * If this system is capable of monitoring a rmid would have been 552 + * allocated when the control group was created. This is not needed 553 + * anymore when this group would be used for pseudo-locking. This 554 + * is safe to call on platforms not capable of monitoring. 555 + */ 556 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 557 + 558 + ret = 0; 559 + goto out; 560 + 561 + out_release: 562 + rdtgroup_locksetup_user_restore(rdtgrp); 563 + out: 564 + return ret; 565 + } 566 + 567 + /** 568 + * rdtgroup_locksetup_exit - resource group exist locksetup mode 569 + * @rdtgrp: resource group 570 + * 571 + * When a resource group exits locksetup mode the earlier restrictions are 572 + * lifted. 
573 + * 574 + * Return: 0 on success, <0 on failure 575 + */ 576 + int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) 577 + { 578 + int ret; 579 + 580 + if (resctrl_arch_mon_capable()) { 581 + ret = alloc_rmid(rdtgrp->closid); 582 + if (ret < 0) { 583 + rdt_last_cmd_puts("Out of RMIDs\n"); 584 + return ret; 585 + } 586 + rdtgrp->mon.rmid = ret; 587 + } 588 + 589 + ret = rdtgroup_locksetup_user_restore(rdtgrp); 590 + if (ret) { 591 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 592 + return ret; 593 + } 594 + 595 + pseudo_lock_free(rdtgrp); 596 + return 0; 597 + } 598 + 599 + /** 600 + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked 601 + * @d: RDT domain 602 + * @cbm: CBM to test 603 + * 604 + * @d represents a cache instance and @cbm a capacity bitmask that is 605 + * considered for it. Determine if @cbm overlaps with any existing 606 + * pseudo-locked region on @d. 607 + * 608 + * @cbm is unsigned long, even if only 32 bits are used, to make the 609 + * bitmap functions work correctly. 610 + * 611 + * Return: true if @cbm overlaps with pseudo-locked region on @d, false 612 + * otherwise. 613 + */ 614 + bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) 615 + { 616 + unsigned int cbm_len; 617 + unsigned long cbm_b; 618 + 619 + if (d->plr) { 620 + cbm_len = d->plr->s->res->cache.cbm_len; 621 + cbm_b = d->plr->cbm; 622 + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) 623 + return true; 624 + } 625 + return false; 626 + } 627 + 628 + /** 629 + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy 630 + * @d: RDT domain under test 631 + * 632 + * The setup of a pseudo-locked region affects all cache instances within 633 + * the hierarchy of the region. It is thus essential to know if any 634 + * pseudo-locked regions exist within a cache hierarchy to prevent any 635 + * attempts to create new pseudo-locked regions in the same hierarchy. 
636 + * 637 + * Return: true if a pseudo-locked region exists in the hierarchy of @d or 638 + * if it is not possible to test due to memory allocation issue, 639 + * false otherwise. 640 + */ 641 + bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) 642 + { 643 + struct rdt_ctrl_domain *d_i; 644 + cpumask_var_t cpu_with_psl; 645 + struct rdt_resource *r; 646 + bool ret = false; 647 + 648 + /* Walking r->domains, ensure it can't race with cpuhp */ 649 + lockdep_assert_cpus_held(); 650 + 651 + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) 652 + return true; 653 + 654 + /* 655 + * First determine which cpus have pseudo-locked regions 656 + * associated with them. 657 + */ 658 + for_each_alloc_capable_rdt_resource(r) { 659 + list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { 660 + if (d_i->plr) 661 + cpumask_or(cpu_with_psl, cpu_with_psl, 662 + &d_i->hdr.cpu_mask); 663 + } 664 + } 665 + 666 + /* 667 + * Next test if new pseudo-locked region would intersect with 668 + * existing region. 669 + */ 670 + if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) 671 + ret = true; 672 + 673 + free_cpumask_var(cpu_with_psl); 674 + return ret; 675 + } 676 + 677 + /** 678 + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region 679 + * @rdtgrp: Resource group to which the pseudo-locked region belongs. 680 + * @sel: Selector of which measurement to perform on a pseudo-locked region. 681 + * 682 + * The measurement of latency to access a pseudo-locked region should be 683 + * done from a cpu that is associated with that pseudo-locked region. 684 + * Determine which cpu is associated with this region and start a thread on 685 + * that cpu to perform the measurement, wait for that thread to complete. 
686 + * 687 + * Return: 0 on success, <0 on failure 688 + */ 689 + static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) 690 + { 691 + struct pseudo_lock_region *plr = rdtgrp->plr; 692 + struct task_struct *thread; 693 + unsigned int cpu; 694 + int ret = -1; 695 + 696 + cpus_read_lock(); 697 + mutex_lock(&rdtgroup_mutex); 698 + 699 + if (rdtgrp->flags & RDT_DELETED) { 700 + ret = -ENODEV; 701 + goto out; 702 + } 703 + 704 + if (!plr->d) { 705 + ret = -ENODEV; 706 + goto out; 707 + } 708 + 709 + plr->thread_done = 0; 710 + cpu = cpumask_first(&plr->d->hdr.cpu_mask); 711 + if (!cpu_online(cpu)) { 712 + ret = -ENODEV; 713 + goto out; 714 + } 715 + 716 + plr->cpu = cpu; 717 + 718 + if (sel == 1) 719 + thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, 720 + plr, cpu, "pseudo_lock_measure/%u"); 721 + else if (sel == 2) 722 + thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, 723 + plr, cpu, "pseudo_lock_measure/%u"); 724 + else if (sel == 3) 725 + thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, 726 + plr, cpu, "pseudo_lock_measure/%u"); 727 + else 728 + goto out; 729 + 730 + if (IS_ERR(thread)) { 731 + ret = PTR_ERR(thread); 732 + goto out; 733 + } 734 + 735 + ret = wait_event_interruptible(plr->lock_thread_wq, 736 + plr->thread_done == 1); 737 + if (ret < 0) 738 + goto out; 739 + 740 + ret = 0; 741 + 742 + out: 743 + mutex_unlock(&rdtgroup_mutex); 744 + cpus_read_unlock(); 745 + return ret; 746 + } 747 + 748 + static ssize_t pseudo_lock_measure_trigger(struct file *file, 749 + const char __user *user_buf, 750 + size_t count, loff_t *ppos) 751 + { 752 + struct rdtgroup *rdtgrp = file->private_data; 753 + size_t buf_size; 754 + char buf[32]; 755 + int ret; 756 + int sel; 757 + 758 + buf_size = min(count, (sizeof(buf) - 1)); 759 + if (copy_from_user(buf, user_buf, buf_size)) 760 + return -EFAULT; 761 + 762 + buf[buf_size] = '\0'; 763 + ret = kstrtoint(buf, 10, &sel); 764 + if (ret == 0) { 765 + if (sel != 1 && sel 
!= 2 && sel != 3) 766 + return -EINVAL; 767 + ret = debugfs_file_get(file->f_path.dentry); 768 + if (ret) 769 + return ret; 770 + ret = pseudo_lock_measure_cycles(rdtgrp, sel); 771 + if (ret == 0) 772 + ret = count; 773 + debugfs_file_put(file->f_path.dentry); 774 + } 775 + 776 + return ret; 777 + } 778 + 779 + static const struct file_operations pseudo_measure_fops = { 780 + .write = pseudo_lock_measure_trigger, 781 + .open = simple_open, 782 + .llseek = default_llseek, 783 + }; 784 + 785 + /** 786 + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region 787 + * @rdtgrp: resource group to which pseudo-lock region belongs 788 + * 789 + * Called when a resource group in the pseudo-locksetup mode receives a 790 + * valid schemata that should be pseudo-locked. Since the resource group is 791 + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been 792 + * allocated and initialized with the essential information. If a failure 793 + * occurs the resource group remains in the pseudo-locksetup mode with the 794 + * &struct pseudo_lock_region associated with it, but cleared from all 795 + * information and ready for the user to re-attempt pseudo-locking by 796 + * writing the schemata again. 797 + * 798 + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 799 + * on failure. Descriptive error will be written to last_cmd_status buffer. 
800 + */ 801 + int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) 802 + { 803 + struct pseudo_lock_region *plr = rdtgrp->plr; 804 + struct task_struct *thread; 805 + unsigned int new_minor; 806 + struct device *dev; 807 + char *kn_name __free(kfree) = NULL; 808 + int ret; 809 + 810 + ret = pseudo_lock_region_alloc(plr); 811 + if (ret < 0) 812 + return ret; 813 + 814 + ret = pseudo_lock_cstates_constrain(plr); 815 + if (ret < 0) { 816 + ret = -EINVAL; 817 + goto out_region; 818 + } 819 + kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); 820 + if (!kn_name) { 821 + ret = -ENOMEM; 822 + goto out_cstates; 823 + } 824 + 825 + plr->thread_done = 0; 826 + 827 + thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, 828 + plr->cpu, "pseudo_lock/%u"); 829 + if (IS_ERR(thread)) { 830 + ret = PTR_ERR(thread); 831 + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); 832 + goto out_cstates; 833 + } 834 + 835 + ret = wait_event_interruptible(plr->lock_thread_wq, 836 + plr->thread_done == 1); 837 + if (ret < 0) { 838 + /* 839 + * If the thread does not get on the CPU for whatever 840 + * reason and the process which sets up the region is 841 + * interrupted then this will leave the thread in runnable 842 + * state and once it gets on the CPU it will dereference 843 + * the cleared, but not freed, plr struct resulting in an 844 + * empty pseudo-locking loop. 845 + */ 846 + rdt_last_cmd_puts("Locking thread interrupted\n"); 847 + goto out_cstates; 848 + } 849 + 850 + ret = pseudo_lock_minor_get(&new_minor); 851 + if (ret < 0) { 852 + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); 853 + goto out_cstates; 854 + } 855 + 856 + /* 857 + * Unlock access but do not release the reference. The 858 + * pseudo-locked region will still be here on return. 
859 + * 860 + * The mutex has to be released temporarily to avoid a potential 861 + * deadlock with the mm->mmap_lock which is obtained in the 862 + * device_create() and debugfs_create_dir() callpath below as well as 863 + * before the mmap() callback is called. 864 + */ 865 + mutex_unlock(&rdtgroup_mutex); 866 + 867 + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { 868 + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); 869 + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) 870 + debugfs_create_file("pseudo_lock_measure", 0200, 871 + plr->debugfs_dir, rdtgrp, 872 + &pseudo_measure_fops); 873 + } 874 + 875 + dev = device_create(&pseudo_lock_class, NULL, 876 + MKDEV(pseudo_lock_major, new_minor), 877 + rdtgrp, "%s", kn_name); 878 + 879 + mutex_lock(&rdtgroup_mutex); 880 + 881 + if (IS_ERR(dev)) { 882 + ret = PTR_ERR(dev); 883 + rdt_last_cmd_printf("Failed to create character device: %d\n", 884 + ret); 885 + goto out_debugfs; 886 + } 887 + 888 + /* We released the mutex - check if group was removed while we did so */ 889 + if (rdtgrp->flags & RDT_DELETED) { 890 + ret = -ENODEV; 891 + goto out_device; 892 + } 893 + 894 + plr->minor = new_minor; 895 + 896 + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; 897 + closid_free(rdtgrp->closid); 898 + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); 899 + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); 900 + 901 + ret = 0; 902 + goto out; 903 + 904 + out_device: 905 + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); 906 + out_debugfs: 907 + debugfs_remove_recursive(plr->debugfs_dir); 908 + pseudo_lock_minor_release(new_minor); 909 + out_cstates: 910 + pseudo_lock_cstates_relax(plr); 911 + out_region: 912 + pseudo_lock_region_clear(plr); 913 + out: 914 + return ret; 915 + } 916 + 917 + /** 918 + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region 919 + * @rdtgrp: resource group to which the pseudo-locked region belongs 920 + * 921 + * The removal of a pseudo-locked region can be initiated when the 
resource 922 + * group is removed from user space via a "rmdir" from userspace or the 923 + * unmount of the resctrl filesystem. On removal the resource group does 924 + * not go back to pseudo-locksetup mode before it is removed, instead it is 925 + * removed directly. There is thus asymmetry with the creation where the 926 + * &struct pseudo_lock_region is removed here while it was not created in 927 + * rdtgroup_pseudo_lock_create(). 928 + * 929 + * Return: void 930 + */ 931 + void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) 932 + { 933 + struct pseudo_lock_region *plr = rdtgrp->plr; 934 + 935 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 936 + /* 937 + * Default group cannot be a pseudo-locked region so we can 938 + * free closid here. 939 + */ 940 + closid_free(rdtgrp->closid); 941 + goto free; 942 + } 943 + 944 + pseudo_lock_cstates_relax(plr); 945 + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); 946 + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); 947 + pseudo_lock_minor_release(plr->minor); 948 + 949 + free: 950 + pseudo_lock_free(rdtgrp); 951 + } 952 + 953 + static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) 954 + { 955 + struct rdtgroup *rdtgrp; 956 + 957 + mutex_lock(&rdtgroup_mutex); 958 + 959 + rdtgrp = region_find_by_minor(iminor(inode)); 960 + if (!rdtgrp) { 961 + mutex_unlock(&rdtgroup_mutex); 962 + return -ENODEV; 963 + } 964 + 965 + filp->private_data = rdtgrp; 966 + atomic_inc(&rdtgrp->waitcount); 967 + /* Perform a non-seekable open - llseek is not supported */ 968 + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 969 + 970 + mutex_unlock(&rdtgroup_mutex); 971 + 972 + return 0; 973 + } 974 + 975 + static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) 976 + { 977 + struct rdtgroup *rdtgrp; 978 + 979 + mutex_lock(&rdtgroup_mutex); 980 + rdtgrp = filp->private_data; 981 + WARN_ON(!rdtgrp); 982 + if (!rdtgrp) { 983 + mutex_unlock(&rdtgroup_mutex); 
984 + return -ENODEV; 985 + } 986 + filp->private_data = NULL; 987 + atomic_dec(&rdtgrp->waitcount); 988 + mutex_unlock(&rdtgroup_mutex); 989 + return 0; 990 + } 991 + 992 + static int pseudo_lock_dev_mremap(struct vm_area_struct *area) 993 + { 994 + /* Not supported */ 995 + return -EINVAL; 996 + } 997 + 998 + static const struct vm_operations_struct pseudo_mmap_ops = { 999 + .mremap = pseudo_lock_dev_mremap, 1000 + }; 1001 + 1002 + static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) 1003 + { 1004 + unsigned long vsize = vma->vm_end - vma->vm_start; 1005 + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 1006 + struct pseudo_lock_region *plr; 1007 + struct rdtgroup *rdtgrp; 1008 + unsigned long physical; 1009 + unsigned long psize; 1010 + 1011 + mutex_lock(&rdtgroup_mutex); 1012 + 1013 + rdtgrp = filp->private_data; 1014 + WARN_ON(!rdtgrp); 1015 + if (!rdtgrp) { 1016 + mutex_unlock(&rdtgroup_mutex); 1017 + return -ENODEV; 1018 + } 1019 + 1020 + plr = rdtgrp->plr; 1021 + 1022 + if (!plr->d) { 1023 + mutex_unlock(&rdtgroup_mutex); 1024 + return -ENODEV; 1025 + } 1026 + 1027 + /* 1028 + * Task is required to run with affinity to the cpus associated 1029 + * with the pseudo-locked region. If this is not the case the task 1030 + * may be scheduled elsewhere and invalidate entries in the 1031 + * pseudo-locked region. 1032 + */ 1033 + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { 1034 + mutex_unlock(&rdtgroup_mutex); 1035 + return -EINVAL; 1036 + } 1037 + 1038 + physical = __pa(plr->kmem) >> PAGE_SHIFT; 1039 + psize = plr->size - off; 1040 + 1041 + if (off > plr->size) { 1042 + mutex_unlock(&rdtgroup_mutex); 1043 + return -ENOSPC; 1044 + } 1045 + 1046 + /* 1047 + * Ensure changes are carried directly to the memory being mapped, 1048 + * do not allow copy-on-write mapping. 
1049 + */ 1050 + if (!(vma->vm_flags & VM_SHARED)) { 1051 + mutex_unlock(&rdtgroup_mutex); 1052 + return -EINVAL; 1053 + } 1054 + 1055 + if (vsize > psize) { 1056 + mutex_unlock(&rdtgroup_mutex); 1057 + return -ENOSPC; 1058 + } 1059 + 1060 + memset(plr->kmem + off, 0, vsize); 1061 + 1062 + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, 1063 + vsize, vma->vm_page_prot)) { 1064 + mutex_unlock(&rdtgroup_mutex); 1065 + return -EAGAIN; 1066 + } 1067 + vma->vm_ops = &pseudo_mmap_ops; 1068 + mutex_unlock(&rdtgroup_mutex); 1069 + return 0; 1070 + } 1071 + 1072 + static const struct file_operations pseudo_lock_dev_fops = { 1073 + .owner = THIS_MODULE, 1074 + .read = NULL, 1075 + .write = NULL, 1076 + .open = pseudo_lock_dev_open, 1077 + .release = pseudo_lock_dev_release, 1078 + .mmap = pseudo_lock_dev_mmap, 1079 + }; 1080 + 1081 + int rdt_pseudo_lock_init(void) 1082 + { 1083 + int ret; 1084 + 1085 + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); 1086 + if (ret < 0) 1087 + return ret; 1088 + 1089 + pseudo_lock_major = ret; 1090 + 1091 + ret = class_register(&pseudo_lock_class); 1092 + if (ret) { 1093 + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 1094 + return ret; 1095 + } 1096 + 1097 + return 0; 1098 + } 1099 + 1100 + void rdt_pseudo_lock_release(void) 1101 + { 1102 + class_unregister(&pseudo_lock_class); 1103 + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); 1104 + pseudo_lock_major = 0; 1105 + }
+4353
fs/resctrl/rdtgroup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * User interface for Resource Allocation in Resource Director Technology(RDT) 4 + * 5 + * Copyright (C) 2016 Intel Corporation 6 + * 7 + * Author: Fenghua Yu <fenghua.yu@intel.com> 8 + * 9 + * More information about RDT can be found in the Intel (R) x86 Architecture 10 + * Software Developer Manual. 11 + */ 12 + 13 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 + 15 + #include <linux/cpu.h> 16 + #include <linux/debugfs.h> 17 + #include <linux/fs.h> 18 + #include <linux/fs_parser.h> 19 + #include <linux/sysfs.h> 20 + #include <linux/kernfs.h> 21 + #include <linux/resctrl.h> 22 + #include <linux/seq_buf.h> 23 + #include <linux/seq_file.h> 24 + #include <linux/sched/task.h> 25 + #include <linux/slab.h> 26 + #include <linux/user_namespace.h> 27 + 28 + #include <uapi/linux/magic.h> 29 + 30 + #include "internal.h" 31 + 32 + /* Mutex to protect rdtgroup access. */ 33 + DEFINE_MUTEX(rdtgroup_mutex); 34 + 35 + static struct kernfs_root *rdt_root; 36 + 37 + struct rdtgroup rdtgroup_default; 38 + 39 + LIST_HEAD(rdt_all_groups); 40 + 41 + /* list of entries for the schemata file */ 42 + LIST_HEAD(resctrl_schema_all); 43 + 44 + /* 45 + * List of struct mon_data containing private data of event files for use by 46 + * rdtgroup_mondata_show(). Protected by rdtgroup_mutex. 47 + */ 48 + static LIST_HEAD(mon_data_kn_priv_list); 49 + 50 + /* The filesystem can only be mounted once. */ 51 + bool resctrl_mounted; 52 + 53 + /* Kernel fs node for "info" directory under root */ 54 + static struct kernfs_node *kn_info; 55 + 56 + /* Kernel fs node for "mon_groups" directory under root */ 57 + static struct kernfs_node *kn_mongrp; 58 + 59 + /* Kernel fs node for "mon_data" directory under root */ 60 + static struct kernfs_node *kn_mondata; 61 + 62 + /* 63 + * Used to store the max resource name width to display the schemata names in 64 + * a tabular format. 
65 + */ 66 + int max_name_width; 67 + 68 + static struct seq_buf last_cmd_status; 69 + 70 + static char last_cmd_status_buf[512]; 71 + 72 + static int rdtgroup_setup_root(struct rdt_fs_context *ctx); 73 + 74 + static void rdtgroup_destroy_root(void); 75 + 76 + struct dentry *debugfs_resctrl; 77 + 78 + /* 79 + * Memory bandwidth monitoring event to use for the default CTRL_MON group 80 + * and each new CTRL_MON group created by the user. Only relevant when 81 + * the filesystem is mounted with the "mba_MBps" option so it does not 82 + * matter that it remains uninitialized on systems that do not support 83 + * the "mba_MBps" option. 84 + */ 85 + enum resctrl_event_id mba_mbps_default_event; 86 + 87 + static bool resctrl_debug; 88 + 89 + void rdt_last_cmd_clear(void) 90 + { 91 + lockdep_assert_held(&rdtgroup_mutex); 92 + seq_buf_clear(&last_cmd_status); 93 + } 94 + 95 + void rdt_last_cmd_puts(const char *s) 96 + { 97 + lockdep_assert_held(&rdtgroup_mutex); 98 + seq_buf_puts(&last_cmd_status, s); 99 + } 100 + 101 + void rdt_last_cmd_printf(const char *fmt, ...) 102 + { 103 + va_list ap; 104 + 105 + va_start(ap, fmt); 106 + lockdep_assert_held(&rdtgroup_mutex); 107 + seq_buf_vprintf(&last_cmd_status, fmt, ap); 108 + va_end(ap); 109 + } 110 + 111 + void rdt_staged_configs_clear(void) 112 + { 113 + struct rdt_ctrl_domain *dom; 114 + struct rdt_resource *r; 115 + 116 + lockdep_assert_held(&rdtgroup_mutex); 117 + 118 + for_each_alloc_capable_rdt_resource(r) { 119 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) 120 + memset(dom->staged_config, 0, sizeof(dom->staged_config)); 121 + } 122 + } 123 + 124 + static bool resctrl_is_mbm_enabled(void) 125 + { 126 + return (resctrl_arch_is_mbm_total_enabled() || 127 + resctrl_arch_is_mbm_local_enabled()); 128 + } 129 + 130 + static bool resctrl_is_mbm_event(int e) 131 + { 132 + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && 133 + e <= QOS_L3_MBM_LOCAL_EVENT_ID); 134 + } 135 + 136 + /* 137 + * Trivial allocator for CLOSIDs. 
Use BITMAP APIs to manipulate a bitmap 138 + * of free CLOSIDs. 139 + * 140 + * Using a global CLOSID across all resources has some advantages and 141 + * some drawbacks: 142 + * + We can simply set current's closid to assign a task to a resource 143 + * group. 144 + * + Context switch code can avoid extra memory references deciding which 145 + * CLOSID to load into the PQR_ASSOC MSR 146 + * - We give up some options in configuring resource groups across multi-socket 147 + * systems. 148 + * - Our choices on how to configure each resource become progressively more 149 + * limited as the number of resources grows. 150 + */ 151 + static unsigned long *closid_free_map; 152 + 153 + static int closid_free_map_len; 154 + 155 + int closids_supported(void) 156 + { 157 + return closid_free_map_len; 158 + } 159 + 160 + static int closid_init(void) 161 + { 162 + struct resctrl_schema *s; 163 + u32 rdt_min_closid = ~0; 164 + 165 + /* Monitor only platforms still call closid_init() */ 166 + if (list_empty(&resctrl_schema_all)) 167 + return 0; 168 + 169 + /* Compute rdt_min_closid across all resources */ 170 + list_for_each_entry(s, &resctrl_schema_all, list) 171 + rdt_min_closid = min(rdt_min_closid, s->num_closid); 172 + 173 + closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL); 174 + if (!closid_free_map) 175 + return -ENOMEM; 176 + bitmap_fill(closid_free_map, rdt_min_closid); 177 + 178 + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ 179 + __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map); 180 + closid_free_map_len = rdt_min_closid; 181 + 182 + return 0; 183 + } 184 + 185 + static void closid_exit(void) 186 + { 187 + bitmap_free(closid_free_map); 188 + closid_free_map = NULL; 189 + } 190 + 191 + static int closid_alloc(void) 192 + { 193 + int cleanest_closid; 194 + u32 closid; 195 + 196 + lockdep_assert_held(&rdtgroup_mutex); 197 + 198 + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && 199 + resctrl_arch_is_llc_occupancy_enabled()) 
{ 200 + cleanest_closid = resctrl_find_cleanest_closid(); 201 + if (cleanest_closid < 0) 202 + return cleanest_closid; 203 + closid = cleanest_closid; 204 + } else { 205 + closid = find_first_bit(closid_free_map, closid_free_map_len); 206 + if (closid == closid_free_map_len) 207 + return -ENOSPC; 208 + } 209 + __clear_bit(closid, closid_free_map); 210 + 211 + return closid; 212 + } 213 + 214 + void closid_free(int closid) 215 + { 216 + lockdep_assert_held(&rdtgroup_mutex); 217 + 218 + __set_bit(closid, closid_free_map); 219 + } 220 + 221 + /** 222 + * closid_allocated - test if provided closid is in use 223 + * @closid: closid to be tested 224 + * 225 + * Return: true if @closid is currently associated with a resource group, 226 + * false if @closid is free 227 + */ 228 + bool closid_allocated(unsigned int closid) 229 + { 230 + lockdep_assert_held(&rdtgroup_mutex); 231 + 232 + return !test_bit(closid, closid_free_map); 233 + } 234 + 235 + /** 236 + * rdtgroup_mode_by_closid - Return mode of resource group with closid 237 + * @closid: closid if the resource group 238 + * 239 + * Each resource group is associated with a @closid. Here the mode 240 + * of a resource group can be queried by searching for it using its closid. 
241 + * 242 + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid 243 + */ 244 + enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) 245 + { 246 + struct rdtgroup *rdtgrp; 247 + 248 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 249 + if (rdtgrp->closid == closid) 250 + return rdtgrp->mode; 251 + } 252 + 253 + return RDT_NUM_MODES; 254 + } 255 + 256 + static const char * const rdt_mode_str[] = { 257 + [RDT_MODE_SHAREABLE] = "shareable", 258 + [RDT_MODE_EXCLUSIVE] = "exclusive", 259 + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", 260 + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", 261 + }; 262 + 263 + /** 264 + * rdtgroup_mode_str - Return the string representation of mode 265 + * @mode: the resource group mode as &enum rdtgroup_mode 266 + * 267 + * Return: string representation of valid mode, "unknown" otherwise 268 + */ 269 + static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) 270 + { 271 + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) 272 + return "unknown"; 273 + 274 + return rdt_mode_str[mode]; 275 + } 276 + 277 + /* set uid and gid of rdtgroup dirs and files to that of the creator */ 278 + static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) 279 + { 280 + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 281 + .ia_uid = current_fsuid(), 282 + .ia_gid = current_fsgid(), }; 283 + 284 + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 285 + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 286 + return 0; 287 + 288 + return kernfs_setattr(kn, &iattr); 289 + } 290 + 291 + static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) 292 + { 293 + struct kernfs_node *kn; 294 + int ret; 295 + 296 + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, 297 + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 298 + 0, rft->kf_ops, rft, NULL, NULL); 299 + if (IS_ERR(kn)) 300 + return PTR_ERR(kn); 301 + 302 + ret = rdtgroup_kn_set_ugid(kn); 303 + if (ret) { 304 + kernfs_remove(kn); 305 + return ret; 306 + } 307 
+ 308 + return 0; 309 + } 310 + 311 + static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 312 + { 313 + struct kernfs_open_file *of = m->private; 314 + struct rftype *rft = of->kn->priv; 315 + 316 + if (rft->seq_show) 317 + return rft->seq_show(of, m, arg); 318 + return 0; 319 + } 320 + 321 + static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, 322 + size_t nbytes, loff_t off) 323 + { 324 + struct rftype *rft = of->kn->priv; 325 + 326 + if (rft->write) 327 + return rft->write(of, buf, nbytes, off); 328 + 329 + return -EINVAL; 330 + } 331 + 332 + static const struct kernfs_ops rdtgroup_kf_single_ops = { 333 + .atomic_write_len = PAGE_SIZE, 334 + .write = rdtgroup_file_write, 335 + .seq_show = rdtgroup_seqfile_show, 336 + }; 337 + 338 + static const struct kernfs_ops kf_mondata_ops = { 339 + .atomic_write_len = PAGE_SIZE, 340 + .seq_show = rdtgroup_mondata_show, 341 + }; 342 + 343 + static bool is_cpu_list(struct kernfs_open_file *of) 344 + { 345 + struct rftype *rft = of->kn->priv; 346 + 347 + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; 348 + } 349 + 350 + static int rdtgroup_cpus_show(struct kernfs_open_file *of, 351 + struct seq_file *s, void *v) 352 + { 353 + struct rdtgroup *rdtgrp; 354 + struct cpumask *mask; 355 + int ret = 0; 356 + 357 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 358 + 359 + if (rdtgrp) { 360 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 361 + if (!rdtgrp->plr->d) { 362 + rdt_last_cmd_clear(); 363 + rdt_last_cmd_puts("Cache domain offline\n"); 364 + ret = -ENODEV; 365 + } else { 366 + mask = &rdtgrp->plr->d->hdr.cpu_mask; 367 + seq_printf(s, is_cpu_list(of) ? 368 + "%*pbl\n" : "%*pb\n", 369 + cpumask_pr_args(mask)); 370 + } 371 + } else { 372 + seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", 373 + cpumask_pr_args(&rdtgrp->cpu_mask)); 374 + } 375 + } else { 376 + ret = -ENOENT; 377 + } 378 + rdtgroup_kn_unlock(of->kn); 379 + 380 + return ret; 381 + } 382 + 383 + /* 384 + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 385 + * 386 + * Per task closids/rmids must have been set up before calling this function. 387 + * @r may be NULL. 388 + */ 389 + static void 390 + update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) 391 + { 392 + struct resctrl_cpu_defaults defaults, *p = NULL; 393 + 394 + if (r) { 395 + defaults.closid = r->closid; 396 + defaults.rmid = r->mon.rmid; 397 + p = &defaults; 398 + } 399 + 400 + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); 401 + } 402 + 403 + static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 404 + cpumask_var_t tmpmask) 405 + { 406 + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; 407 + struct list_head *head; 408 + 409 + /* Check whether cpus belong to parent ctrl group */ 410 + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); 411 + if (!cpumask_empty(tmpmask)) { 412 + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); 413 + return -EINVAL; 414 + } 415 + 416 + /* Check whether cpus are dropped from this group */ 417 + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 418 + if (!cpumask_empty(tmpmask)) { 419 + /* Give any dropped cpus to parent rdtgroup */ 420 + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); 421 + update_closid_rmid(tmpmask, prgrp); 422 + } 423 + 424 + /* 425 + * If we added cpus, remove them from previous group that owned them 426 + * and update per-cpu rmid 427 + */ 428 + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 429 + if (!cpumask_empty(tmpmask)) { 430 + head = &prgrp->mon.crdtgrp_list; 431 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 432 + if (crgrp == rdtgrp) 433 + continue; 434 + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, 435 + tmpmask); 
436 + } 437 + update_closid_rmid(tmpmask, rdtgrp); 438 + } 439 + 440 + /* Done pushing/pulling - update this group with new mask */ 441 + cpumask_copy(&rdtgrp->cpu_mask, newmask); 442 + 443 + return 0; 444 + } 445 + 446 + static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) 447 + { 448 + struct rdtgroup *crgrp; 449 + 450 + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); 451 + /* update the child mon group masks as well*/ 452 + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) 453 + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); 454 + } 455 + 456 + static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, 457 + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) 458 + { 459 + struct rdtgroup *r, *crgrp; 460 + struct list_head *head; 461 + 462 + /* Check whether cpus are dropped from this group */ 463 + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 464 + if (!cpumask_empty(tmpmask)) { 465 + /* Can't drop from default group */ 466 + if (rdtgrp == &rdtgroup_default) { 467 + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); 468 + return -EINVAL; 469 + } 470 + 471 + /* Give any dropped cpus to rdtgroup_default */ 472 + cpumask_or(&rdtgroup_default.cpu_mask, 473 + &rdtgroup_default.cpu_mask, tmpmask); 474 + update_closid_rmid(tmpmask, &rdtgroup_default); 475 + } 476 + 477 + /* 478 + * If we added cpus, remove them from previous group and 479 + * the prev group's child groups that owned them 480 + * and update per-cpu closid/rmid. 
481 + */ 482 + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 483 + if (!cpumask_empty(tmpmask)) { 484 + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { 485 + if (r == rdtgrp) 486 + continue; 487 + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); 488 + if (!cpumask_empty(tmpmask1)) 489 + cpumask_rdtgrp_clear(r, tmpmask1); 490 + } 491 + update_closid_rmid(tmpmask, rdtgrp); 492 + } 493 + 494 + /* Done pushing/pulling - update this group with new mask */ 495 + cpumask_copy(&rdtgrp->cpu_mask, newmask); 496 + 497 + /* 498 + * Clear child mon group masks since there is a new parent mask 499 + * now and update the rmid for the cpus the child lost. 500 + */ 501 + head = &rdtgrp->mon.crdtgrp_list; 502 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 503 + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); 504 + update_closid_rmid(tmpmask, rdtgrp); 505 + cpumask_clear(&crgrp->cpu_mask); 506 + } 507 + 508 + return 0; 509 + } 510 + 511 + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 512 + char *buf, size_t nbytes, loff_t off) 513 + { 514 + cpumask_var_t tmpmask, newmask, tmpmask1; 515 + struct rdtgroup *rdtgrp; 516 + int ret; 517 + 518 + if (!buf) 519 + return -EINVAL; 520 + 521 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 522 + return -ENOMEM; 523 + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { 524 + free_cpumask_var(tmpmask); 525 + return -ENOMEM; 526 + } 527 + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { 528 + free_cpumask_var(tmpmask); 529 + free_cpumask_var(newmask); 530 + return -ENOMEM; 531 + } 532 + 533 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 534 + if (!rdtgrp) { 535 + ret = -ENOENT; 536 + goto unlock; 537 + } 538 + 539 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 540 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 541 + ret = -EINVAL; 542 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 543 + goto unlock; 544 + } 545 + 546 + if (is_cpu_list(of)) 547 + ret = cpulist_parse(buf, newmask); 548 + else 549 + 
ret = cpumask_parse(buf, newmask); 550 + 551 + if (ret) { 552 + rdt_last_cmd_puts("Bad CPU list/mask\n"); 553 + goto unlock; 554 + } 555 + 556 + /* check that user didn't specify any offline cpus */ 557 + cpumask_andnot(tmpmask, newmask, cpu_online_mask); 558 + if (!cpumask_empty(tmpmask)) { 559 + ret = -EINVAL; 560 + rdt_last_cmd_puts("Can only assign online CPUs\n"); 561 + goto unlock; 562 + } 563 + 564 + if (rdtgrp->type == RDTCTRL_GROUP) 565 + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); 566 + else if (rdtgrp->type == RDTMON_GROUP) 567 + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); 568 + else 569 + ret = -EINVAL; 570 + 571 + unlock: 572 + rdtgroup_kn_unlock(of->kn); 573 + free_cpumask_var(tmpmask); 574 + free_cpumask_var(newmask); 575 + free_cpumask_var(tmpmask1); 576 + 577 + return ret ?: nbytes; 578 + } 579 + 580 + /** 581 + * rdtgroup_remove - the helper to remove resource group safely 582 + * @rdtgrp: resource group to remove 583 + * 584 + * On resource group creation via a mkdir, an extra kernfs_node reference is 585 + * taken to ensure that the rdtgroup structure remains accessible for the 586 + * rdtgroup_kn_unlock() calls where it is removed. 587 + * 588 + * Drop the extra reference here, then free the rdtgroup structure. 589 + * 590 + * Return: void 591 + */ 592 + static void rdtgroup_remove(struct rdtgroup *rdtgrp) 593 + { 594 + kernfs_put(rdtgrp->kn); 595 + kfree(rdtgrp); 596 + } 597 + 598 + static void _update_task_closid_rmid(void *task) 599 + { 600 + /* 601 + * If the task is still current on this CPU, update PQR_ASSOC MSR. 602 + * Otherwise, the MSR is updated when the task is scheduled in. 
603 + */ 604 + if (task == current) 605 + resctrl_arch_sched_in(task); 606 + } 607 + 608 + static void update_task_closid_rmid(struct task_struct *t) 609 + { 610 + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) 611 + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); 612 + else 613 + _update_task_closid_rmid(t); 614 + } 615 + 616 + static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) 617 + { 618 + u32 closid, rmid = rdtgrp->mon.rmid; 619 + 620 + if (rdtgrp->type == RDTCTRL_GROUP) 621 + closid = rdtgrp->closid; 622 + else if (rdtgrp->type == RDTMON_GROUP) 623 + closid = rdtgrp->mon.parent->closid; 624 + else 625 + return false; 626 + 627 + return resctrl_arch_match_closid(tsk, closid) && 628 + resctrl_arch_match_rmid(tsk, closid, rmid); 629 + } 630 + 631 + static int __rdtgroup_move_task(struct task_struct *tsk, 632 + struct rdtgroup *rdtgrp) 633 + { 634 + /* If the task is already in rdtgrp, no need to move the task. */ 635 + if (task_in_rdtgroup(tsk, rdtgrp)) 636 + return 0; 637 + 638 + /* 639 + * Set the task's closid/rmid before the PQR_ASSOC MSR can be 640 + * updated by them. 641 + * 642 + * For ctrl_mon groups, move both closid and rmid. 643 + * For monitor groups, can move the tasks only from 644 + * their parent CTRL group. 645 + */ 646 + if (rdtgrp->type == RDTMON_GROUP && 647 + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { 648 + rdt_last_cmd_puts("Can't move task to different control group\n"); 649 + return -EINVAL; 650 + } 651 + 652 + if (rdtgrp->type == RDTMON_GROUP) 653 + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, 654 + rdtgrp->mon.rmid); 655 + else 656 + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, 657 + rdtgrp->mon.rmid); 658 + 659 + /* 660 + * Ensure the task's closid and rmid are written before determining if 661 + * the task is current that will decide if it will be interrupted. 
662 + * This pairs with the full barrier between the rq->curr update and 663 + * resctrl_arch_sched_in() during context switch. 664 + */ 665 + smp_mb(); 666 + 667 + /* 668 + * By now, the task's closid and rmid are set. If the task is current 669 + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource 670 + * group go into effect. If the task is not current, the MSR will be 671 + * updated when the task is scheduled in. 672 + */ 673 + update_task_closid_rmid(tsk); 674 + 675 + return 0; 676 + } 677 + 678 + static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) 679 + { 680 + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && 681 + resctrl_arch_match_closid(t, r->closid)); 682 + } 683 + 684 + static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) 685 + { 686 + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && 687 + resctrl_arch_match_rmid(t, r->mon.parent->closid, 688 + r->mon.rmid)); 689 + } 690 + 691 + /** 692 + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group 693 + * @r: Resource group 694 + * 695 + * Return: 1 if tasks have been assigned to @r, 0 otherwise 696 + */ 697 + int rdtgroup_tasks_assigned(struct rdtgroup *r) 698 + { 699 + struct task_struct *p, *t; 700 + int ret = 0; 701 + 702 + lockdep_assert_held(&rdtgroup_mutex); 703 + 704 + rcu_read_lock(); 705 + for_each_process_thread(p, t) { 706 + if (is_closid_match(t, r) || is_rmid_match(t, r)) { 707 + ret = 1; 708 + break; 709 + } 710 + } 711 + rcu_read_unlock(); 712 + 713 + return ret; 714 + } 715 + 716 + static int rdtgroup_task_write_permission(struct task_struct *task, 717 + struct kernfs_open_file *of) 718 + { 719 + const struct cred *tcred = get_task_cred(task); 720 + const struct cred *cred = current_cred(); 721 + int ret = 0; 722 + 723 + /* 724 + * Even if we're attaching all tasks in the thread group, we only 725 + * need to check permissions on one of them. 
726 + */ 727 + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 728 + !uid_eq(cred->euid, tcred->uid) && 729 + !uid_eq(cred->euid, tcred->suid)) { 730 + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); 731 + ret = -EPERM; 732 + } 733 + 734 + put_cred(tcred); 735 + return ret; 736 + } 737 + 738 + static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, 739 + struct kernfs_open_file *of) 740 + { 741 + struct task_struct *tsk; 742 + int ret; 743 + 744 + rcu_read_lock(); 745 + if (pid) { 746 + tsk = find_task_by_vpid(pid); 747 + if (!tsk) { 748 + rcu_read_unlock(); 749 + rdt_last_cmd_printf("No task %d\n", pid); 750 + return -ESRCH; 751 + } 752 + } else { 753 + tsk = current; 754 + } 755 + 756 + get_task_struct(tsk); 757 + rcu_read_unlock(); 758 + 759 + ret = rdtgroup_task_write_permission(tsk, of); 760 + if (!ret) 761 + ret = __rdtgroup_move_task(tsk, rdtgrp); 762 + 763 + put_task_struct(tsk); 764 + return ret; 765 + } 766 + 767 + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, 768 + char *buf, size_t nbytes, loff_t off) 769 + { 770 + struct rdtgroup *rdtgrp; 771 + char *pid_str; 772 + int ret = 0; 773 + pid_t pid; 774 + 775 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 776 + if (!rdtgrp) { 777 + rdtgroup_kn_unlock(of->kn); 778 + return -ENOENT; 779 + } 780 + rdt_last_cmd_clear(); 781 + 782 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || 783 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 784 + ret = -EINVAL; 785 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 786 + goto unlock; 787 + } 788 + 789 + while (buf && buf[0] != '\0' && buf[0] != '\n') { 790 + pid_str = strim(strsep(&buf, ",")); 791 + 792 + if (kstrtoint(pid_str, 0, &pid)) { 793 + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); 794 + ret = -EINVAL; 795 + break; 796 + } 797 + 798 + if (pid < 0) { 799 + rdt_last_cmd_printf("Invalid pid %d\n", pid); 800 + ret = -EINVAL; 801 + break; 802 + } 803 + 804 + ret = rdtgroup_move_task(pid, rdtgrp, of); 805 + if 
(ret) { 806 + rdt_last_cmd_printf("Error while processing task %d\n", pid); 807 + break; 808 + } 809 + } 810 + 811 + unlock: 812 + rdtgroup_kn_unlock(of->kn); 813 + 814 + return ret ?: nbytes; 815 + } 816 + 817 + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) 818 + { 819 + struct task_struct *p, *t; 820 + pid_t pid; 821 + 822 + rcu_read_lock(); 823 + for_each_process_thread(p, t) { 824 + if (is_closid_match(t, r) || is_rmid_match(t, r)) { 825 + pid = task_pid_vnr(t); 826 + if (pid) 827 + seq_printf(s, "%d\n", pid); 828 + } 829 + } 830 + rcu_read_unlock(); 831 + } 832 + 833 + static int rdtgroup_tasks_show(struct kernfs_open_file *of, 834 + struct seq_file *s, void *v) 835 + { 836 + struct rdtgroup *rdtgrp; 837 + int ret = 0; 838 + 839 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 840 + if (rdtgrp) 841 + show_rdt_tasks(rdtgrp, s); 842 + else 843 + ret = -ENOENT; 844 + rdtgroup_kn_unlock(of->kn); 845 + 846 + return ret; 847 + } 848 + 849 + static int rdtgroup_closid_show(struct kernfs_open_file *of, 850 + struct seq_file *s, void *v) 851 + { 852 + struct rdtgroup *rdtgrp; 853 + int ret = 0; 854 + 855 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 856 + if (rdtgrp) 857 + seq_printf(s, "%u\n", rdtgrp->closid); 858 + else 859 + ret = -ENOENT; 860 + rdtgroup_kn_unlock(of->kn); 861 + 862 + return ret; 863 + } 864 + 865 + static int rdtgroup_rmid_show(struct kernfs_open_file *of, 866 + struct seq_file *s, void *v) 867 + { 868 + struct rdtgroup *rdtgrp; 869 + int ret = 0; 870 + 871 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 872 + if (rdtgrp) 873 + seq_printf(s, "%u\n", rdtgrp->mon.rmid); 874 + else 875 + ret = -ENOENT; 876 + rdtgroup_kn_unlock(of->kn); 877 + 878 + return ret; 879 + } 880 + 881 + #ifdef CONFIG_PROC_CPU_RESCTRL 882 + /* 883 + * A task can only be part of one resctrl control group and of one monitor 884 + * group which is associated to that control group. 885 + * 886 + * 1) res: 887 + * mon: 888 + * 889 + * resctrl is not available. 
890 + * 891 + * 2) res:/ 892 + * mon: 893 + * 894 + * Task is part of the root resctrl control group, and it is not associated 895 + * to any monitor group. 896 + * 897 + * 3) res:/ 898 + * mon:mon0 899 + * 900 + * Task is part of the root resctrl control group and monitor group mon0. 901 + * 902 + * 4) res:group0 903 + * mon: 904 + * 905 + * Task is part of resctrl control group group0, and it is not associated 906 + * to any monitor group. 907 + * 908 + * 5) res:group0 909 + * mon:mon1 910 + * 911 + * Task is part of resctrl control group group0 and monitor group mon1. 912 + */ 913 + int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, 914 + struct pid *pid, struct task_struct *tsk) 915 + { 916 + struct rdtgroup *rdtg; 917 + int ret = 0; 918 + 919 + mutex_lock(&rdtgroup_mutex); 920 + 921 + /* Return empty if resctrl has not been mounted. */ 922 + if (!resctrl_mounted) { 923 + seq_puts(s, "res:\nmon:\n"); 924 + goto unlock; 925 + } 926 + 927 + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { 928 + struct rdtgroup *crg; 929 + 930 + /* 931 + * Task information is only relevant for shareable 932 + * and exclusive groups. 933 + */ 934 + if (rdtg->mode != RDT_MODE_SHAREABLE && 935 + rdtg->mode != RDT_MODE_EXCLUSIVE) 936 + continue; 937 + 938 + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) 939 + continue; 940 + 941 + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", 942 + rdt_kn_name(rdtg->kn)); 943 + seq_puts(s, "mon:"); 944 + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, 945 + mon.crdtgrp_list) { 946 + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, 947 + crg->mon.rmid)) 948 + continue; 949 + seq_printf(s, "%s", rdt_kn_name(crg->kn)); 950 + break; 951 + } 952 + seq_putc(s, '\n'); 953 + goto unlock; 954 + } 955 + /* 956 + * The above search should succeed. Otherwise return 957 + * with an error. 
958 + */ 959 + ret = -ENOENT; 960 + unlock: 961 + mutex_unlock(&rdtgroup_mutex); 962 + 963 + return ret; 964 + } 965 + #endif 966 + 967 + static int rdt_last_cmd_status_show(struct kernfs_open_file *of, 968 + struct seq_file *seq, void *v) 969 + { 970 + int len; 971 + 972 + mutex_lock(&rdtgroup_mutex); 973 + len = seq_buf_used(&last_cmd_status); 974 + if (len) 975 + seq_printf(seq, "%.*s", len, last_cmd_status_buf); 976 + else 977 + seq_puts(seq, "ok\n"); 978 + mutex_unlock(&rdtgroup_mutex); 979 + return 0; 980 + } 981 + 982 + static void *rdt_kn_parent_priv(struct kernfs_node *kn) 983 + { 984 + /* 985 + * The parent pointer is only valid within RCU section since it can be 986 + * replaced. 987 + */ 988 + guard(rcu)(); 989 + return rcu_dereference(kn->__parent)->priv; 990 + } 991 + 992 + static int rdt_num_closids_show(struct kernfs_open_file *of, 993 + struct seq_file *seq, void *v) 994 + { 995 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 996 + 997 + seq_printf(seq, "%u\n", s->num_closid); 998 + return 0; 999 + } 1000 + 1001 + static int rdt_default_ctrl_show(struct kernfs_open_file *of, 1002 + struct seq_file *seq, void *v) 1003 + { 1004 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1005 + struct rdt_resource *r = s->res; 1006 + 1007 + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); 1008 + return 0; 1009 + } 1010 + 1011 + static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, 1012 + struct seq_file *seq, void *v) 1013 + { 1014 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1015 + struct rdt_resource *r = s->res; 1016 + 1017 + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); 1018 + return 0; 1019 + } 1020 + 1021 + static int rdt_shareable_bits_show(struct kernfs_open_file *of, 1022 + struct seq_file *seq, void *v) 1023 + { 1024 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1025 + struct rdt_resource *r = s->res; 1026 + 1027 + seq_printf(seq, "%x\n", r->cache.shareable_bits); 1028 + return 0; 1029 + } 1030 + 
1031 + /* 1032 + * rdt_bit_usage_show - Display current usage of resources 1033 + * 1034 + * A domain is a shared resource that can now be allocated differently. Here 1035 + * we display the current regions of the domain as an annotated bitmask. 1036 + * For each domain of this resource its allocation bitmask 1037 + * is annotated as below to indicate the current usage of the corresponding bit: 1038 + * 0 - currently unused 1039 + * X - currently available for sharing and used by software and hardware 1040 + * H - currently used by hardware only but available for software use 1041 + * S - currently used and shareable by software only 1042 + * E - currently used exclusively by one resource group 1043 + * P - currently pseudo-locked by one resource group 1044 + */ 1045 + static int rdt_bit_usage_show(struct kernfs_open_file *of, 1046 + struct seq_file *seq, void *v) 1047 + { 1048 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1049 + /* 1050 + * Use unsigned long even though only 32 bits are used to ensure 1051 + * test_bit() is used safely. 
1052 + */ 1053 + unsigned long sw_shareable = 0, hw_shareable = 0; 1054 + unsigned long exclusive = 0, pseudo_locked = 0; 1055 + struct rdt_resource *r = s->res; 1056 + struct rdt_ctrl_domain *dom; 1057 + int i, hwb, swb, excl, psl; 1058 + enum rdtgrp_mode mode; 1059 + bool sep = false; 1060 + u32 ctrl_val; 1061 + 1062 + cpus_read_lock(); 1063 + mutex_lock(&rdtgroup_mutex); 1064 + hw_shareable = r->cache.shareable_bits; 1065 + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { 1066 + if (sep) 1067 + seq_putc(seq, ';'); 1068 + sw_shareable = 0; 1069 + exclusive = 0; 1070 + seq_printf(seq, "%d=", dom->hdr.id); 1071 + for (i = 0; i < closids_supported(); i++) { 1072 + if (!closid_allocated(i)) 1073 + continue; 1074 + ctrl_val = resctrl_arch_get_config(r, dom, i, 1075 + s->conf_type); 1076 + mode = rdtgroup_mode_by_closid(i); 1077 + switch (mode) { 1078 + case RDT_MODE_SHAREABLE: 1079 + sw_shareable |= ctrl_val; 1080 + break; 1081 + case RDT_MODE_EXCLUSIVE: 1082 + exclusive |= ctrl_val; 1083 + break; 1084 + case RDT_MODE_PSEUDO_LOCKSETUP: 1085 + /* 1086 + * RDT_MODE_PSEUDO_LOCKSETUP is possible 1087 + * here but not included since the CBM 1088 + * associated with this CLOSID in this mode 1089 + * is not initialized and no task or cpu can be 1090 + * assigned this CLOSID. 1091 + */ 1092 + break; 1093 + case RDT_MODE_PSEUDO_LOCKED: 1094 + case RDT_NUM_MODES: 1095 + WARN(1, 1096 + "invalid mode for closid %d\n", i); 1097 + break; 1098 + } 1099 + } 1100 + for (i = r->cache.cbm_len - 1; i >= 0; i--) { 1101 + pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; 1102 + hwb = test_bit(i, &hw_shareable); 1103 + swb = test_bit(i, &sw_shareable); 1104 + excl = test_bit(i, &exclusive); 1105 + psl = test_bit(i, &pseudo_locked); 1106 + if (hwb && swb) 1107 + seq_putc(seq, 'X'); 1108 + else if (hwb && !swb) 1109 + seq_putc(seq, 'H'); 1110 + else if (!hwb && swb) 1111 + seq_putc(seq, 'S'); 1112 + else if (excl) 1113 + seq_putc(seq, 'E'); 1114 + else if (psl) 1115 + seq_putc(seq, 'P'); 1116 + else /* Unused bits remain */ 1117 + seq_putc(seq, '0'); 1118 + } 1119 + sep = true; 1120 + } 1121 + seq_putc(seq, '\n'); 1122 + mutex_unlock(&rdtgroup_mutex); 1123 + cpus_read_unlock(); 1124 + return 0; 1125 + } 1126 + 1127 + static int rdt_min_bw_show(struct kernfs_open_file *of, 1128 + struct seq_file *seq, void *v) 1129 + { 1130 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1131 + struct rdt_resource *r = s->res; 1132 + 1133 + seq_printf(seq, "%u\n", r->membw.min_bw); 1134 + return 0; 1135 + } 1136 + 1137 + static int rdt_num_rmids_show(struct kernfs_open_file *of, 1138 + struct seq_file *seq, void *v) 1139 + { 1140 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1141 + 1142 + seq_printf(seq, "%d\n", r->num_rmid); 1143 + 1144 + return 0; 1145 + } 1146 + 1147 + static int rdt_mon_features_show(struct kernfs_open_file *of, 1148 + struct seq_file *seq, void *v) 1149 + { 1150 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1151 + struct mon_evt *mevt; 1152 + 1153 + list_for_each_entry(mevt, &r->evt_list, list) { 1154 + seq_printf(seq, "%s\n", mevt->name); 1155 + if (mevt->configurable) 1156 + seq_printf(seq, "%s_config\n", mevt->name); 1157 + } 1158 + 1159 + return 0; 1160 + } 1161 + 1162 + static int rdt_bw_gran_show(struct kernfs_open_file *of, 1163 + struct seq_file *seq, void *v) 1164 + { 1165 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1166 + struct rdt_resource *r = s->res; 1167 + 1168 + seq_printf(seq, "%u\n", r->membw.bw_gran); 1169 + return 0; 1170 + } 1171 + 1172 + static int 
rdt_delay_linear_show(struct kernfs_open_file *of, 1173 + struct seq_file *seq, void *v) 1174 + { 1175 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1176 + struct rdt_resource *r = s->res; 1177 + 1178 + seq_printf(seq, "%u\n", r->membw.delay_linear); 1179 + return 0; 1180 + } 1181 + 1182 + static int max_threshold_occ_show(struct kernfs_open_file *of, 1183 + struct seq_file *seq, void *v) 1184 + { 1185 + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); 1186 + 1187 + return 0; 1188 + } 1189 + 1190 + static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, 1191 + struct seq_file *seq, void *v) 1192 + { 1193 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1194 + struct rdt_resource *r = s->res; 1195 + 1196 + switch (r->membw.throttle_mode) { 1197 + case THREAD_THROTTLE_PER_THREAD: 1198 + seq_puts(seq, "per-thread\n"); 1199 + return 0; 1200 + case THREAD_THROTTLE_MAX: 1201 + seq_puts(seq, "max\n"); 1202 + return 0; 1203 + case THREAD_THROTTLE_UNDEFINED: 1204 + seq_puts(seq, "undefined\n"); 1205 + return 0; 1206 + } 1207 + 1208 + WARN_ON_ONCE(1); 1209 + 1210 + return 0; 1211 + } 1212 + 1213 + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, 1214 + char *buf, size_t nbytes, loff_t off) 1215 + { 1216 + unsigned int bytes; 1217 + int ret; 1218 + 1219 + ret = kstrtouint(buf, 0, &bytes); 1220 + if (ret) 1221 + return ret; 1222 + 1223 + if (bytes > resctrl_rmid_realloc_limit) 1224 + return -EINVAL; 1225 + 1226 + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); 1227 + 1228 + return nbytes; 1229 + } 1230 + 1231 + /* 1232 + * rdtgroup_mode_show - Display mode of this resource group 1233 + */ 1234 + static int rdtgroup_mode_show(struct kernfs_open_file *of, 1235 + struct seq_file *s, void *v) 1236 + { 1237 + struct rdtgroup *rdtgrp; 1238 + 1239 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1240 + if (!rdtgrp) { 1241 + rdtgroup_kn_unlock(of->kn); 1242 + return -ENOENT; 1243 + } 1244 + 1245 + seq_printf(s, 
"%s\n", rdtgroup_mode_str(rdtgrp->mode)); 1246 + 1247 + rdtgroup_kn_unlock(of->kn); 1248 + return 0; 1249 + } 1250 + 1251 + static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) 1252 + { 1253 + switch (my_type) { 1254 + case CDP_CODE: 1255 + return CDP_DATA; 1256 + case CDP_DATA: 1257 + return CDP_CODE; 1258 + default: 1259 + case CDP_NONE: 1260 + return CDP_NONE; 1261 + } 1262 + } 1263 + 1264 + static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, 1265 + struct seq_file *seq, void *v) 1266 + { 1267 + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); 1268 + struct rdt_resource *r = s->res; 1269 + 1270 + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); 1271 + 1272 + return 0; 1273 + } 1274 + 1275 + /** 1276 + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other 1277 + * @r: Resource to which domain instance @d belongs. 1278 + * @d: The domain instance for which @closid is being tested. 1279 + * @cbm: Capacity bitmask being tested. 1280 + * @closid: Intended closid for @cbm. 1281 + * @type: CDP type of @r. 1282 + * @exclusive: Only check if overlaps with exclusive resource groups 1283 + * 1284 + * Checks if provided @cbm intended to be used for @closid on domain 1285 + * @d overlaps with any other closids or other hardware usage associated 1286 + * with this domain. If @exclusive is true then only overlaps with 1287 + * resource groups in exclusive mode will be considered. If @exclusive 1288 + * is false then overlaps with any resource group or hardware entities 1289 + * will be considered. 1290 + * 1291 + * @cbm is unsigned long, even if only 32 bits are used, to make the 1292 + * bitmap functions work correctly. 1293 + * 1294 + * Return: false if CBM does not overlap, true if it does. 
1295 + */ 1296 + static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, 1297 + unsigned long cbm, int closid, 1298 + enum resctrl_conf_type type, bool exclusive) 1299 + { 1300 + enum rdtgrp_mode mode; 1301 + unsigned long ctrl_b; 1302 + int i; 1303 + 1304 + /* Check for any overlap with regions used by hardware directly */ 1305 + if (!exclusive) { 1306 + ctrl_b = r->cache.shareable_bits; 1307 + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) 1308 + return true; 1309 + } 1310 + 1311 + /* Check for overlap with other resource groups */ 1312 + for (i = 0; i < closids_supported(); i++) { 1313 + ctrl_b = resctrl_arch_get_config(r, d, i, type); 1314 + mode = rdtgroup_mode_by_closid(i); 1315 + if (closid_allocated(i) && i != closid && 1316 + mode != RDT_MODE_PSEUDO_LOCKSETUP) { 1317 + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { 1318 + if (exclusive) { 1319 + if (mode == RDT_MODE_EXCLUSIVE) 1320 + return true; 1321 + continue; 1322 + } 1323 + return true; 1324 + } 1325 + } 1326 + } 1327 + 1328 + return false; 1329 + } 1330 + 1331 + /** 1332 + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware 1333 + * @s: Schema for the resource to which domain instance @d belongs. 1334 + * @d: The domain instance for which @closid is being tested. 1335 + * @cbm: Capacity bitmask being tested. 1336 + * @closid: Intended closid for @cbm. 1337 + * @exclusive: Only check if overlaps with exclusive resource groups 1338 + * 1339 + * Resources that can be allocated using a CBM can use the CBM to control 1340 + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test 1341 + * for overlap. Overlap test is not limited to the specific resource for 1342 + * which the CBM is intended though - when dealing with CDP resources that 1343 + * share the underlying hardware the overlap check should be performed on 1344 + * the CDP resource sharing the hardware also. 
1345 + * 1346 + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the 1347 + * overlap test. 1348 + * 1349 + * Return: true if CBM overlap detected, false if there is no overlap 1350 + */ 1351 + bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, 1352 + unsigned long cbm, int closid, bool exclusive) 1353 + { 1354 + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 1355 + struct rdt_resource *r = s->res; 1356 + 1357 + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, 1358 + exclusive)) 1359 + return true; 1360 + 1361 + if (!resctrl_arch_get_cdp_enabled(r->rid)) 1362 + return false; 1363 + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); 1364 + } 1365 + 1366 + /** 1367 + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive 1368 + * @rdtgrp: Resource group identified through its closid. 1369 + * 1370 + * An exclusive resource group implies that there should be no sharing of 1371 + * its allocated resources. At the time this group is considered to be 1372 + * exclusive this test can determine if its current schemata supports this 1373 + * setting by testing for overlap with all other resource groups. 1374 + * 1375 + * Return: true if resource group can be exclusive, false if there is overlap 1376 + * with allocations of other resource groups and thus this resource group 1377 + * cannot be exclusive. 
1378 + */ 1379 + static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) 1380 + { 1381 + int closid = rdtgrp->closid; 1382 + struct rdt_ctrl_domain *d; 1383 + struct resctrl_schema *s; 1384 + struct rdt_resource *r; 1385 + bool has_cache = false; 1386 + u32 ctrl; 1387 + 1388 + /* Walking r->domains, ensure it can't race with cpuhp */ 1389 + lockdep_assert_cpus_held(); 1390 + 1391 + list_for_each_entry(s, &resctrl_schema_all, list) { 1392 + r = s->res; 1393 + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) 1394 + continue; 1395 + has_cache = true; 1396 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1397 + ctrl = resctrl_arch_get_config(r, d, closid, 1398 + s->conf_type); 1399 + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { 1400 + rdt_last_cmd_puts("Schemata overlaps\n"); 1401 + return false; 1402 + } 1403 + } 1404 + } 1405 + 1406 + if (!has_cache) { 1407 + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); 1408 + return false; 1409 + } 1410 + 1411 + return true; 1412 + } 1413 + 1414 + /* 1415 + * rdtgroup_mode_write - Modify the resource group's mode 1416 + */ 1417 + static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, 1418 + char *buf, size_t nbytes, loff_t off) 1419 + { 1420 + struct rdtgroup *rdtgrp; 1421 + enum rdtgrp_mode mode; 1422 + int ret = 0; 1423 + 1424 + /* Valid input requires a trailing newline */ 1425 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1426 + return -EINVAL; 1427 + buf[nbytes - 1] = '\0'; 1428 + 1429 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1430 + if (!rdtgrp) { 1431 + rdtgroup_kn_unlock(of->kn); 1432 + return -ENOENT; 1433 + } 1434 + 1435 + rdt_last_cmd_clear(); 1436 + 1437 + mode = rdtgrp->mode; 1438 + 1439 + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || 1440 + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || 1441 + (!strcmp(buf, "pseudo-locksetup") && 1442 + mode == RDT_MODE_PSEUDO_LOCKSETUP) || 1443 + (!strcmp(buf, "pseudo-locked") && mode == 
RDT_MODE_PSEUDO_LOCKED)) 1444 + goto out; 1445 + 1446 + if (mode == RDT_MODE_PSEUDO_LOCKED) { 1447 + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); 1448 + ret = -EINVAL; 1449 + goto out; 1450 + } 1451 + 1452 + if (!strcmp(buf, "shareable")) { 1453 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1454 + ret = rdtgroup_locksetup_exit(rdtgrp); 1455 + if (ret) 1456 + goto out; 1457 + } 1458 + rdtgrp->mode = RDT_MODE_SHAREABLE; 1459 + } else if (!strcmp(buf, "exclusive")) { 1460 + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { 1461 + ret = -EINVAL; 1462 + goto out; 1463 + } 1464 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1465 + ret = rdtgroup_locksetup_exit(rdtgrp); 1466 + if (ret) 1467 + goto out; 1468 + } 1469 + rdtgrp->mode = RDT_MODE_EXCLUSIVE; 1470 + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && 1471 + !strcmp(buf, "pseudo-locksetup")) { 1472 + ret = rdtgroup_locksetup_enter(rdtgrp); 1473 + if (ret) 1474 + goto out; 1475 + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; 1476 + } else { 1477 + rdt_last_cmd_puts("Unknown or unsupported mode\n"); 1478 + ret = -EINVAL; 1479 + } 1480 + 1481 + out: 1482 + rdtgroup_kn_unlock(of->kn); 1483 + return ret ?: nbytes; 1484 + } 1485 + 1486 + /** 1487 + * rdtgroup_cbm_to_size - Translate CBM to size in bytes 1488 + * @r: RDT resource to which @d belongs. 1489 + * @d: RDT domain instance. 1490 + * @cbm: bitmask for which the size should be computed. 1491 + * 1492 + * The bitmask provided associated with the RDT domain instance @d will be 1493 + * translated into how many bytes it represents. The size in bytes is 1494 + * computed by first dividing the total cache size by the CBM length to 1495 + * determine how many bytes each bit in the bitmask represents. The result 1496 + * is multiplied with the number of bits set in the bitmask. 1497 + * 1498 + * @cbm is unsigned long, even if only 32 bits are used to make the 1499 + * bitmap functions work correctly. 
1500 + */ 1501 + unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, 1502 + struct rdt_ctrl_domain *d, unsigned long cbm) 1503 + { 1504 + unsigned int size = 0; 1505 + struct cacheinfo *ci; 1506 + int num_b; 1507 + 1508 + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) 1509 + return size; 1510 + 1511 + num_b = bitmap_weight(&cbm, r->cache.cbm_len); 1512 + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); 1513 + if (ci) 1514 + size = ci->size / r->cache.cbm_len * num_b; 1515 + 1516 + return size; 1517 + } 1518 + 1519 + bool is_mba_sc(struct rdt_resource *r) 1520 + { 1521 + if (!r) 1522 + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 1523 + 1524 + /* 1525 + * The software controller support is only applicable to MBA resource. 1526 + * Make sure to check for resource type. 1527 + */ 1528 + if (r->rid != RDT_RESOURCE_MBA) 1529 + return false; 1530 + 1531 + return r->membw.mba_sc; 1532 + } 1533 + 1534 + /* 1535 + * rdtgroup_size_show - Display size in bytes of allocated regions 1536 + * 1537 + * The "size" file mirrors the layout of the "schemata" file, printing the 1538 + * size in bytes of each region instead of the capacity bitmask. 
1539 + */ 1540 + static int rdtgroup_size_show(struct kernfs_open_file *of, 1541 + struct seq_file *s, void *v) 1542 + { 1543 + struct resctrl_schema *schema; 1544 + enum resctrl_conf_type type; 1545 + struct rdt_ctrl_domain *d; 1546 + struct rdtgroup *rdtgrp; 1547 + struct rdt_resource *r; 1548 + unsigned int size; 1549 + int ret = 0; 1550 + u32 closid; 1551 + bool sep; 1552 + u32 ctrl; 1553 + 1554 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 1555 + if (!rdtgrp) { 1556 + rdtgroup_kn_unlock(of->kn); 1557 + return -ENOENT; 1558 + } 1559 + 1560 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 1561 + if (!rdtgrp->plr->d) { 1562 + rdt_last_cmd_clear(); 1563 + rdt_last_cmd_puts("Cache domain offline\n"); 1564 + ret = -ENODEV; 1565 + } else { 1566 + seq_printf(s, "%*s:", max_name_width, 1567 + rdtgrp->plr->s->name); 1568 + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, 1569 + rdtgrp->plr->d, 1570 + rdtgrp->plr->cbm); 1571 + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); 1572 + } 1573 + goto out; 1574 + } 1575 + 1576 + closid = rdtgrp->closid; 1577 + 1578 + list_for_each_entry(schema, &resctrl_schema_all, list) { 1579 + r = schema->res; 1580 + type = schema->conf_type; 1581 + sep = false; 1582 + seq_printf(s, "%*s:", max_name_width, schema->name); 1583 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 1584 + if (sep) 1585 + seq_putc(s, ';'); 1586 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { 1587 + size = 0; 1588 + } else { 1589 + if (is_mba_sc(r)) 1590 + ctrl = d->mbps_val[closid]; 1591 + else 1592 + ctrl = resctrl_arch_get_config(r, d, 1593 + closid, 1594 + type); 1595 + if (r->rid == RDT_RESOURCE_MBA || 1596 + r->rid == RDT_RESOURCE_SMBA) 1597 + size = ctrl; 1598 + else 1599 + size = rdtgroup_cbm_to_size(r, d, ctrl); 1600 + } 1601 + seq_printf(s, "%d=%u", d->hdr.id, size); 1602 + sep = true; 1603 + } 1604 + seq_putc(s, '\n'); 1605 + } 1606 + 1607 + out: 1608 + rdtgroup_kn_unlock(of->kn); 1609 + 1610 + return ret; 1611 + } 1612 + 1613 + static void 
mondata_config_read(struct resctrl_mon_config_info *mon_info) 1614 + { 1615 + smp_call_function_any(&mon_info->d->hdr.cpu_mask, 1616 + resctrl_arch_mon_event_config_read, mon_info, 1); 1617 + } 1618 + 1619 + static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) 1620 + { 1621 + struct resctrl_mon_config_info mon_info; 1622 + struct rdt_mon_domain *dom; 1623 + bool sep = false; 1624 + 1625 + cpus_read_lock(); 1626 + mutex_lock(&rdtgroup_mutex); 1627 + 1628 + list_for_each_entry(dom, &r->mon_domains, hdr.list) { 1629 + if (sep) 1630 + seq_puts(s, ";"); 1631 + 1632 + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); 1633 + mon_info.r = r; 1634 + mon_info.d = dom; 1635 + mon_info.evtid = evtid; 1636 + mondata_config_read(&mon_info); 1637 + 1638 + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); 1639 + sep = true; 1640 + } 1641 + seq_puts(s, "\n"); 1642 + 1643 + mutex_unlock(&rdtgroup_mutex); 1644 + cpus_read_unlock(); 1645 + 1646 + return 0; 1647 + } 1648 + 1649 + static int mbm_total_bytes_config_show(struct kernfs_open_file *of, 1650 + struct seq_file *seq, void *v) 1651 + { 1652 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1653 + 1654 + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); 1655 + 1656 + return 0; 1657 + } 1658 + 1659 + static int mbm_local_bytes_config_show(struct kernfs_open_file *of, 1660 + struct seq_file *seq, void *v) 1661 + { 1662 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1663 + 1664 + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); 1665 + 1666 + return 0; 1667 + } 1668 + 1669 + static void mbm_config_write_domain(struct rdt_resource *r, 1670 + struct rdt_mon_domain *d, u32 evtid, u32 val) 1671 + { 1672 + struct resctrl_mon_config_info mon_info = {0}; 1673 + 1674 + /* 1675 + * Read the current config value first. If both are the same then 1676 + * no need to write it again. 
1677 + */ 1678 + mon_info.r = r; 1679 + mon_info.d = d; 1680 + mon_info.evtid = evtid; 1681 + mondata_config_read(&mon_info); 1682 + if (mon_info.mon_config == val) 1683 + return; 1684 + 1685 + mon_info.mon_config = val; 1686 + 1687 + /* 1688 + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the 1689 + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE 1690 + * are scoped at the domain level. Writing any of these MSRs 1691 + * on one CPU is observed by all the CPUs in the domain. 1692 + */ 1693 + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, 1694 + &mon_info, 1); 1695 + 1696 + /* 1697 + * When an Event Configuration is changed, the bandwidth counters 1698 + * for all RMIDs and Events will be cleared by the hardware. The 1699 + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for 1700 + * every RMID on the next read to any event for every RMID. 1701 + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) 1702 + * cleared while it is tracked by the hardware. Clear the 1703 + * mbm_local and mbm_total counts for all the RMIDs. 
1704 + */ 1705 + resctrl_arch_reset_rmid_all(r, d); 1706 + } 1707 + 1708 + static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) 1709 + { 1710 + char *dom_str = NULL, *id_str; 1711 + unsigned long dom_id, val; 1712 + struct rdt_mon_domain *d; 1713 + 1714 + /* Walking r->domains, ensure it can't race with cpuhp */ 1715 + lockdep_assert_cpus_held(); 1716 + 1717 + next: 1718 + if (!tok || tok[0] == '\0') 1719 + return 0; 1720 + 1721 + /* Start processing the strings for each domain */ 1722 + dom_str = strim(strsep(&tok, ";")); 1723 + id_str = strsep(&dom_str, "="); 1724 + 1725 + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { 1726 + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); 1727 + return -EINVAL; 1728 + } 1729 + 1730 + if (!dom_str || kstrtoul(dom_str, 16, &val)) { 1731 + rdt_last_cmd_puts("Non-numeric event configuration value\n"); 1732 + return -EINVAL; 1733 + } 1734 + 1735 + /* Value from user cannot be more than the supported set of events */ 1736 + if ((val & r->mbm_cfg_mask) != val) { 1737 + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", 1738 + r->mbm_cfg_mask); 1739 + return -EINVAL; 1740 + } 1741 + 1742 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 1743 + if (d->hdr.id == dom_id) { 1744 + mbm_config_write_domain(r, d, evtid, val); 1745 + goto next; 1746 + } 1747 + } 1748 + 1749 + return -EINVAL; 1750 + } 1751 + 1752 + static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, 1753 + char *buf, size_t nbytes, 1754 + loff_t off) 1755 + { 1756 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1757 + int ret; 1758 + 1759 + /* Valid input requires a trailing newline */ 1760 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1761 + return -EINVAL; 1762 + 1763 + cpus_read_lock(); 1764 + mutex_lock(&rdtgroup_mutex); 1765 + 1766 + rdt_last_cmd_clear(); 1767 + 1768 + buf[nbytes - 1] = '\0'; 1769 + 1770 + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); 1771 + 1772 + 
mutex_unlock(&rdtgroup_mutex); 1773 + cpus_read_unlock(); 1774 + 1775 + return ret ?: nbytes; 1776 + } 1777 + 1778 + static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, 1779 + char *buf, size_t nbytes, 1780 + loff_t off) 1781 + { 1782 + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); 1783 + int ret; 1784 + 1785 + /* Valid input requires a trailing newline */ 1786 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 1787 + return -EINVAL; 1788 + 1789 + cpus_read_lock(); 1790 + mutex_lock(&rdtgroup_mutex); 1791 + 1792 + rdt_last_cmd_clear(); 1793 + 1794 + buf[nbytes - 1] = '\0'; 1795 + 1796 + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); 1797 + 1798 + mutex_unlock(&rdtgroup_mutex); 1799 + cpus_read_unlock(); 1800 + 1801 + return ret ?: nbytes; 1802 + } 1803 + 1804 + /* rdtgroup information files for one cache resource. */ 1805 + static struct rftype res_common_files[] = { 1806 + { 1807 + .name = "last_cmd_status", 1808 + .mode = 0444, 1809 + .kf_ops = &rdtgroup_kf_single_ops, 1810 + .seq_show = rdt_last_cmd_status_show, 1811 + .fflags = RFTYPE_TOP_INFO, 1812 + }, 1813 + { 1814 + .name = "num_closids", 1815 + .mode = 0444, 1816 + .kf_ops = &rdtgroup_kf_single_ops, 1817 + .seq_show = rdt_num_closids_show, 1818 + .fflags = RFTYPE_CTRL_INFO, 1819 + }, 1820 + { 1821 + .name = "mon_features", 1822 + .mode = 0444, 1823 + .kf_ops = &rdtgroup_kf_single_ops, 1824 + .seq_show = rdt_mon_features_show, 1825 + .fflags = RFTYPE_MON_INFO, 1826 + }, 1827 + { 1828 + .name = "num_rmids", 1829 + .mode = 0444, 1830 + .kf_ops = &rdtgroup_kf_single_ops, 1831 + .seq_show = rdt_num_rmids_show, 1832 + .fflags = RFTYPE_MON_INFO, 1833 + }, 1834 + { 1835 + .name = "cbm_mask", 1836 + .mode = 0444, 1837 + .kf_ops = &rdtgroup_kf_single_ops, 1838 + .seq_show = rdt_default_ctrl_show, 1839 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1840 + }, 1841 + { 1842 + .name = "min_cbm_bits", 1843 + .mode = 0444, 1844 + .kf_ops = &rdtgroup_kf_single_ops, 1845 + .seq_show = 
rdt_min_cbm_bits_show, 1846 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1847 + }, 1848 + { 1849 + .name = "shareable_bits", 1850 + .mode = 0444, 1851 + .kf_ops = &rdtgroup_kf_single_ops, 1852 + .seq_show = rdt_shareable_bits_show, 1853 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1854 + }, 1855 + { 1856 + .name = "bit_usage", 1857 + .mode = 0444, 1858 + .kf_ops = &rdtgroup_kf_single_ops, 1859 + .seq_show = rdt_bit_usage_show, 1860 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1861 + }, 1862 + { 1863 + .name = "min_bandwidth", 1864 + .mode = 0444, 1865 + .kf_ops = &rdtgroup_kf_single_ops, 1866 + .seq_show = rdt_min_bw_show, 1867 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1868 + }, 1869 + { 1870 + .name = "bandwidth_gran", 1871 + .mode = 0444, 1872 + .kf_ops = &rdtgroup_kf_single_ops, 1873 + .seq_show = rdt_bw_gran_show, 1874 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1875 + }, 1876 + { 1877 + .name = "delay_linear", 1878 + .mode = 0444, 1879 + .kf_ops = &rdtgroup_kf_single_ops, 1880 + .seq_show = rdt_delay_linear_show, 1881 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, 1882 + }, 1883 + /* 1884 + * Platform specific which (if any) capabilities are provided by 1885 + * thread_throttle_mode. Defer "fflags" initialization to platform 1886 + * discovery. 
1887 + */ 1888 + { 1889 + .name = "thread_throttle_mode", 1890 + .mode = 0444, 1891 + .kf_ops = &rdtgroup_kf_single_ops, 1892 + .seq_show = rdt_thread_throttle_mode_show, 1893 + }, 1894 + { 1895 + .name = "max_threshold_occupancy", 1896 + .mode = 0644, 1897 + .kf_ops = &rdtgroup_kf_single_ops, 1898 + .write = max_threshold_occ_write, 1899 + .seq_show = max_threshold_occ_show, 1900 + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, 1901 + }, 1902 + { 1903 + .name = "mbm_total_bytes_config", 1904 + .mode = 0644, 1905 + .kf_ops = &rdtgroup_kf_single_ops, 1906 + .seq_show = mbm_total_bytes_config_show, 1907 + .write = mbm_total_bytes_config_write, 1908 + }, 1909 + { 1910 + .name = "mbm_local_bytes_config", 1911 + .mode = 0644, 1912 + .kf_ops = &rdtgroup_kf_single_ops, 1913 + .seq_show = mbm_local_bytes_config_show, 1914 + .write = mbm_local_bytes_config_write, 1915 + }, 1916 + { 1917 + .name = "cpus", 1918 + .mode = 0644, 1919 + .kf_ops = &rdtgroup_kf_single_ops, 1920 + .write = rdtgroup_cpus_write, 1921 + .seq_show = rdtgroup_cpus_show, 1922 + .fflags = RFTYPE_BASE, 1923 + }, 1924 + { 1925 + .name = "cpus_list", 1926 + .mode = 0644, 1927 + .kf_ops = &rdtgroup_kf_single_ops, 1928 + .write = rdtgroup_cpus_write, 1929 + .seq_show = rdtgroup_cpus_show, 1930 + .flags = RFTYPE_FLAGS_CPUS_LIST, 1931 + .fflags = RFTYPE_BASE, 1932 + }, 1933 + { 1934 + .name = "tasks", 1935 + .mode = 0644, 1936 + .kf_ops = &rdtgroup_kf_single_ops, 1937 + .write = rdtgroup_tasks_write, 1938 + .seq_show = rdtgroup_tasks_show, 1939 + .fflags = RFTYPE_BASE, 1940 + }, 1941 + { 1942 + .name = "mon_hw_id", 1943 + .mode = 0444, 1944 + .kf_ops = &rdtgroup_kf_single_ops, 1945 + .seq_show = rdtgroup_rmid_show, 1946 + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, 1947 + }, 1948 + { 1949 + .name = "schemata", 1950 + .mode = 0644, 1951 + .kf_ops = &rdtgroup_kf_single_ops, 1952 + .write = rdtgroup_schemata_write, 1953 + .seq_show = rdtgroup_schemata_show, 1954 + .fflags = RFTYPE_CTRL_BASE, 1955 + }, 1956 + { 1957 + 
.name = "mba_MBps_event", 1958 + .mode = 0644, 1959 + .kf_ops = &rdtgroup_kf_single_ops, 1960 + .write = rdtgroup_mba_mbps_event_write, 1961 + .seq_show = rdtgroup_mba_mbps_event_show, 1962 + }, 1963 + { 1964 + .name = "mode", 1965 + .mode = 0644, 1966 + .kf_ops = &rdtgroup_kf_single_ops, 1967 + .write = rdtgroup_mode_write, 1968 + .seq_show = rdtgroup_mode_show, 1969 + .fflags = RFTYPE_CTRL_BASE, 1970 + }, 1971 + { 1972 + .name = "size", 1973 + .mode = 0444, 1974 + .kf_ops = &rdtgroup_kf_single_ops, 1975 + .seq_show = rdtgroup_size_show, 1976 + .fflags = RFTYPE_CTRL_BASE, 1977 + }, 1978 + { 1979 + .name = "sparse_masks", 1980 + .mode = 0444, 1981 + .kf_ops = &rdtgroup_kf_single_ops, 1982 + .seq_show = rdt_has_sparse_bitmasks_show, 1983 + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, 1984 + }, 1985 + { 1986 + .name = "ctrl_hw_id", 1987 + .mode = 0444, 1988 + .kf_ops = &rdtgroup_kf_single_ops, 1989 + .seq_show = rdtgroup_closid_show, 1990 + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, 1991 + }, 1992 + }; 1993 + 1994 + static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) 1995 + { 1996 + struct rftype *rfts, *rft; 1997 + int ret, len; 1998 + 1999 + rfts = res_common_files; 2000 + len = ARRAY_SIZE(res_common_files); 2001 + 2002 + lockdep_assert_held(&rdtgroup_mutex); 2003 + 2004 + if (resctrl_debug) 2005 + fflags |= RFTYPE_DEBUG; 2006 + 2007 + for (rft = rfts; rft < rfts + len; rft++) { 2008 + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { 2009 + ret = rdtgroup_add_file(kn, rft); 2010 + if (ret) 2011 + goto error; 2012 + } 2013 + } 2014 + 2015 + return 0; 2016 + error: 2017 + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); 2018 + while (--rft >= rfts) { 2019 + if ((fflags & rft->fflags) == rft->fflags) 2020 + kernfs_remove_by_name(kn, rft->name); 2021 + } 2022 + return ret; 2023 + } 2024 + 2025 + static struct rftype *rdtgroup_get_rftype_by_name(const char *name) 2026 + { 2027 + struct rftype *rfts, *rft; 2028 + int len; 2029 
+ 2030 + rfts = res_common_files; 2031 + len = ARRAY_SIZE(res_common_files); 2032 + 2033 + for (rft = rfts; rft < rfts + len; rft++) { 2034 + if (!strcmp(rft->name, name)) 2035 + return rft; 2036 + } 2037 + 2038 + return NULL; 2039 + } 2040 + 2041 + static void thread_throttle_mode_init(void) 2042 + { 2043 + enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; 2044 + struct rdt_resource *r_mba, *r_smba; 2045 + 2046 + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2047 + if (r_mba->alloc_capable && 2048 + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 2049 + throttle_mode = r_mba->membw.throttle_mode; 2050 + 2051 + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); 2052 + if (r_smba->alloc_capable && 2053 + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) 2054 + throttle_mode = r_smba->membw.throttle_mode; 2055 + 2056 + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) 2057 + return; 2058 + 2059 + resctrl_file_fflags_init("thread_throttle_mode", 2060 + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); 2061 + } 2062 + 2063 + void resctrl_file_fflags_init(const char *config, unsigned long fflags) 2064 + { 2065 + struct rftype *rft; 2066 + 2067 + rft = rdtgroup_get_rftype_by_name(config); 2068 + if (rft) 2069 + rft->fflags = fflags; 2070 + } 2071 + 2072 + /** 2073 + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file 2074 + * @r: The resource group with which the file is associated. 2075 + * @name: Name of the file 2076 + * 2077 + * The permissions of named resctrl file, directory, or link are modified 2078 + * to not allow read, write, or execute by any user. 2079 + * 2080 + * WARNING: This function is intended to communicate to the user that the 2081 + * resctrl file has been locked down - that it is not relevant to the 2082 + * particular state the system finds itself in. 
It should not be relied 2083 + * on to protect from user access because after the file's permissions 2084 + * are restricted the user can still change the permissions using chmod 2085 + * from the command line. 2086 + * 2087 + * Return: 0 on success, <0 on failure. 2088 + */ 2089 + int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) 2090 + { 2091 + struct iattr iattr = {.ia_valid = ATTR_MODE,}; 2092 + struct kernfs_node *kn; 2093 + int ret = 0; 2094 + 2095 + kn = kernfs_find_and_get_ns(r->kn, name, NULL); 2096 + if (!kn) 2097 + return -ENOENT; 2098 + 2099 + switch (kernfs_type(kn)) { 2100 + case KERNFS_DIR: 2101 + iattr.ia_mode = S_IFDIR; 2102 + break; 2103 + case KERNFS_FILE: 2104 + iattr.ia_mode = S_IFREG; 2105 + break; 2106 + case KERNFS_LINK: 2107 + iattr.ia_mode = S_IFLNK; 2108 + break; 2109 + } 2110 + 2111 + ret = kernfs_setattr(kn, &iattr); 2112 + kernfs_put(kn); 2113 + return ret; 2114 + } 2115 + 2116 + /** 2117 + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file 2118 + * @r: The resource group with which the file is associated. 2119 + * @name: Name of the file 2120 + * @mask: Mask of permissions that should be restored 2121 + * 2122 + * Restore the permissions of the named file. If @name is a directory the 2123 + * permissions of its parent will be used. 2124 + * 2125 + * Return: 0 on success, <0 on failure. 
2126 + */ 2127 + int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, 2128 + umode_t mask) 2129 + { 2130 + struct iattr iattr = {.ia_valid = ATTR_MODE,}; 2131 + struct kernfs_node *kn, *parent; 2132 + struct rftype *rfts, *rft; 2133 + int ret, len; 2134 + 2135 + rfts = res_common_files; 2136 + len = ARRAY_SIZE(res_common_files); 2137 + 2138 + for (rft = rfts; rft < rfts + len; rft++) { 2139 + if (!strcmp(rft->name, name)) 2140 + iattr.ia_mode = rft->mode & mask; 2141 + } 2142 + 2143 + kn = kernfs_find_and_get_ns(r->kn, name, NULL); 2144 + if (!kn) 2145 + return -ENOENT; 2146 + 2147 + switch (kernfs_type(kn)) { 2148 + case KERNFS_DIR: 2149 + parent = kernfs_get_parent(kn); 2150 + if (parent) { 2151 + iattr.ia_mode |= parent->mode; 2152 + kernfs_put(parent); 2153 + } 2154 + iattr.ia_mode |= S_IFDIR; 2155 + break; 2156 + case KERNFS_FILE: 2157 + iattr.ia_mode |= S_IFREG; 2158 + break; 2159 + case KERNFS_LINK: 2160 + iattr.ia_mode |= S_IFLNK; 2161 + break; 2162 + } 2163 + 2164 + ret = kernfs_setattr(kn, &iattr); 2165 + kernfs_put(kn); 2166 + return ret; 2167 + } 2168 + 2169 + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, 2170 + unsigned long fflags) 2171 + { 2172 + struct kernfs_node *kn_subdir; 2173 + int ret; 2174 + 2175 + kn_subdir = kernfs_create_dir(kn_info, name, 2176 + kn_info->mode, priv); 2177 + if (IS_ERR(kn_subdir)) 2178 + return PTR_ERR(kn_subdir); 2179 + 2180 + ret = rdtgroup_kn_set_ugid(kn_subdir); 2181 + if (ret) 2182 + return ret; 2183 + 2184 + ret = rdtgroup_add_files(kn_subdir, fflags); 2185 + if (!ret) 2186 + kernfs_activate(kn_subdir); 2187 + 2188 + return ret; 2189 + } 2190 + 2191 + static unsigned long fflags_from_resource(struct rdt_resource *r) 2192 + { 2193 + switch (r->rid) { 2194 + case RDT_RESOURCE_L3: 2195 + case RDT_RESOURCE_L2: 2196 + return RFTYPE_RES_CACHE; 2197 + case RDT_RESOURCE_MBA: 2198 + case RDT_RESOURCE_SMBA: 2199 + return RFTYPE_RES_MB; 2200 + } 2201 + 2202 + return WARN_ON_ONCE(1); 2203 + } 2204 + 
2205 + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 2206 + { 2207 + struct resctrl_schema *s; 2208 + struct rdt_resource *r; 2209 + unsigned long fflags; 2210 + char name[32]; 2211 + int ret; 2212 + 2213 + /* create the directory */ 2214 + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 2215 + if (IS_ERR(kn_info)) 2216 + return PTR_ERR(kn_info); 2217 + 2218 + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); 2219 + if (ret) 2220 + goto out_destroy; 2221 + 2222 + /* loop over enabled controls, these are all alloc_capable */ 2223 + list_for_each_entry(s, &resctrl_schema_all, list) { 2224 + r = s->res; 2225 + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; 2226 + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); 2227 + if (ret) 2228 + goto out_destroy; 2229 + } 2230 + 2231 + for_each_mon_capable_rdt_resource(r) { 2232 + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; 2233 + sprintf(name, "%s_MON", r->name); 2234 + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); 2235 + if (ret) 2236 + goto out_destroy; 2237 + } 2238 + 2239 + ret = rdtgroup_kn_set_ugid(kn_info); 2240 + if (ret) 2241 + goto out_destroy; 2242 + 2243 + kernfs_activate(kn_info); 2244 + 2245 + return 0; 2246 + 2247 + out_destroy: 2248 + kernfs_remove(kn_info); 2249 + return ret; 2250 + } 2251 + 2252 + static int 2253 + mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, 2254 + char *name, struct kernfs_node **dest_kn) 2255 + { 2256 + struct kernfs_node *kn; 2257 + int ret; 2258 + 2259 + /* create the directory */ 2260 + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 2261 + if (IS_ERR(kn)) 2262 + return PTR_ERR(kn); 2263 + 2264 + if (dest_kn) 2265 + *dest_kn = kn; 2266 + 2267 + ret = rdtgroup_kn_set_ugid(kn); 2268 + if (ret) 2269 + goto out_destroy; 2270 + 2271 + kernfs_activate(kn); 2272 + 2273 + return 0; 2274 + 2275 + out_destroy: 2276 + kernfs_remove(kn); 2277 + return ret; 2278 + } 2279 + 2280 + static 
inline bool is_mba_linear(void) 2281 + { 2282 + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; 2283 + } 2284 + 2285 + static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) 2286 + { 2287 + u32 num_closid = resctrl_arch_get_num_closid(r); 2288 + int cpu = cpumask_any(&d->hdr.cpu_mask); 2289 + int i; 2290 + 2291 + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), 2292 + GFP_KERNEL, cpu_to_node(cpu)); 2293 + if (!d->mbps_val) 2294 + return -ENOMEM; 2295 + 2296 + for (i = 0; i < num_closid; i++) 2297 + d->mbps_val[i] = MBA_MAX_MBPS; 2298 + 2299 + return 0; 2300 + } 2301 + 2302 + static void mba_sc_domain_destroy(struct rdt_resource *r, 2303 + struct rdt_ctrl_domain *d) 2304 + { 2305 + kfree(d->mbps_val); 2306 + d->mbps_val = NULL; 2307 + } 2308 + 2309 + /* 2310 + * MBA software controller is supported only if 2311 + * MBM is supported and MBA is in linear scale, 2312 + * and the MBM monitor scope is the same as MBA 2313 + * control scope. 2314 + */ 2315 + static bool supports_mba_mbps(void) 2316 + { 2317 + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); 2318 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2319 + 2320 + return (resctrl_is_mbm_enabled() && 2321 + r->alloc_capable && is_mba_linear() && 2322 + r->ctrl_scope == rmbm->mon_scope); 2323 + } 2324 + 2325 + /* 2326 + * Enable or disable the MBA software controller 2327 + * which helps user specify bandwidth in MBps. 
2328 + */ 2329 + static int set_mba_sc(bool mba_sc) 2330 + { 2331 + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); 2332 + u32 num_closid = resctrl_arch_get_num_closid(r); 2333 + struct rdt_ctrl_domain *d; 2334 + unsigned long fflags; 2335 + int i; 2336 + 2337 + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) 2338 + return -EINVAL; 2339 + 2340 + r->membw.mba_sc = mba_sc; 2341 + 2342 + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; 2343 + 2344 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 2345 + for (i = 0; i < num_closid; i++) 2346 + d->mbps_val[i] = MBA_MAX_MBPS; 2347 + } 2348 + 2349 + fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; 2350 + resctrl_file_fflags_init("mba_MBps_event", fflags); 2351 + 2352 + return 0; 2353 + } 2354 + 2355 + /* 2356 + * We don't allow rdtgroup directories to be created anywhere 2357 + * except the root directory. Thus when looking for the rdtgroup 2358 + * structure for a kernfs node we are either looking at a directory, 2359 + * in which case the rdtgroup structure is pointed at by the "priv" 2360 + * field, otherwise we have a file, and need only look to the parent 2361 + * to find the rdtgroup. 2362 + */ 2363 + static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) 2364 + { 2365 + if (kernfs_type(kn) == KERNFS_DIR) { 2366 + /* 2367 + * All the resource directories use "kn->priv" 2368 + * to point to the "struct rdtgroup" for the 2369 + * resource. "info" and its subdirectories don't 2370 + * have rdtgroup structures, so return NULL here. 
2371 + */ 2372 + if (kn == kn_info || 2373 + rcu_access_pointer(kn->__parent) == kn_info) 2374 + return NULL; 2375 + else 2376 + return kn->priv; 2377 + } else { 2378 + return rdt_kn_parent_priv(kn); 2379 + } 2380 + } 2381 + 2382 + static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 2383 + { 2384 + atomic_inc(&rdtgrp->waitcount); 2385 + kernfs_break_active_protection(kn); 2386 + } 2387 + 2388 + static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) 2389 + { 2390 + if (atomic_dec_and_test(&rdtgrp->waitcount) && 2391 + (rdtgrp->flags & RDT_DELETED)) { 2392 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 2393 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 2394 + rdtgroup_pseudo_lock_remove(rdtgrp); 2395 + kernfs_unbreak_active_protection(kn); 2396 + rdtgroup_remove(rdtgrp); 2397 + } else { 2398 + kernfs_unbreak_active_protection(kn); 2399 + } 2400 + } 2401 + 2402 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) 2403 + { 2404 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 2405 + 2406 + if (!rdtgrp) 2407 + return NULL; 2408 + 2409 + rdtgroup_kn_get(rdtgrp, kn); 2410 + 2411 + cpus_read_lock(); 2412 + mutex_lock(&rdtgroup_mutex); 2413 + 2414 + /* Was this group deleted while we waited? 
*/ 2415 + if (rdtgrp->flags & RDT_DELETED) 2416 + return NULL; 2417 + 2418 + return rdtgrp; 2419 + } 2420 + 2421 + void rdtgroup_kn_unlock(struct kernfs_node *kn) 2422 + { 2423 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 2424 + 2425 + if (!rdtgrp) 2426 + return; 2427 + 2428 + mutex_unlock(&rdtgroup_mutex); 2429 + cpus_read_unlock(); 2430 + 2431 + rdtgroup_kn_put(rdtgrp, kn); 2432 + } 2433 + 2434 + static int mkdir_mondata_all(struct kernfs_node *parent_kn, 2435 + struct rdtgroup *prgrp, 2436 + struct kernfs_node **mon_data_kn); 2437 + 2438 + static void rdt_disable_ctx(void) 2439 + { 2440 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 2441 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 2442 + set_mba_sc(false); 2443 + 2444 + resctrl_debug = false; 2445 + } 2446 + 2447 + static int rdt_enable_ctx(struct rdt_fs_context *ctx) 2448 + { 2449 + int ret = 0; 2450 + 2451 + if (ctx->enable_cdpl2) { 2452 + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); 2453 + if (ret) 2454 + goto out_done; 2455 + } 2456 + 2457 + if (ctx->enable_cdpl3) { 2458 + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); 2459 + if (ret) 2460 + goto out_cdpl2; 2461 + } 2462 + 2463 + if (ctx->enable_mba_mbps) { 2464 + ret = set_mba_sc(true); 2465 + if (ret) 2466 + goto out_cdpl3; 2467 + } 2468 + 2469 + if (ctx->enable_debug) 2470 + resctrl_debug = true; 2471 + 2472 + return 0; 2473 + 2474 + out_cdpl3: 2475 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); 2476 + out_cdpl2: 2477 + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); 2478 + out_done: 2479 + return ret; 2480 + } 2481 + 2482 + static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) 2483 + { 2484 + struct resctrl_schema *s; 2485 + const char *suffix = ""; 2486 + int ret, cl; 2487 + 2488 + s = kzalloc(sizeof(*s), GFP_KERNEL); 2489 + if (!s) 2490 + return -ENOMEM; 2491 + 2492 + s->res = r; 2493 + s->num_closid = resctrl_arch_get_num_closid(r); 2494 + if 
(resctrl_arch_get_cdp_enabled(r->rid)) 2495 + s->num_closid /= 2; 2496 + 2497 + s->conf_type = type; 2498 + switch (type) { 2499 + case CDP_CODE: 2500 + suffix = "CODE"; 2501 + break; 2502 + case CDP_DATA: 2503 + suffix = "DATA"; 2504 + break; 2505 + case CDP_NONE: 2506 + suffix = ""; 2507 + break; 2508 + } 2509 + 2510 + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); 2511 + if (ret >= sizeof(s->name)) { 2512 + kfree(s); 2513 + return -EINVAL; 2514 + } 2515 + 2516 + cl = strlen(s->name); 2517 + 2518 + /* 2519 + * If CDP is supported by this resource, but not enabled, 2520 + * include the suffix. This ensures the tabular format of the 2521 + * schemata file does not change between mounts of the filesystem. 2522 + */ 2523 + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) 2524 + cl += 4; 2525 + 2526 + if (cl > max_name_width) 2527 + max_name_width = cl; 2528 + 2529 + switch (r->schema_fmt) { 2530 + case RESCTRL_SCHEMA_BITMAP: 2531 + s->fmt_str = "%d=%x"; 2532 + break; 2533 + case RESCTRL_SCHEMA_RANGE: 2534 + s->fmt_str = "%d=%u"; 2535 + break; 2536 + } 2537 + 2538 + if (WARN_ON_ONCE(!s->fmt_str)) { 2539 + kfree(s); 2540 + return -EINVAL; 2541 + } 2542 + 2543 + INIT_LIST_HEAD(&s->list); 2544 + list_add(&s->list, &resctrl_schema_all); 2545 + 2546 + return 0; 2547 + } 2548 + 2549 + static int schemata_list_create(void) 2550 + { 2551 + struct rdt_resource *r; 2552 + int ret = 0; 2553 + 2554 + for_each_alloc_capable_rdt_resource(r) { 2555 + if (resctrl_arch_get_cdp_enabled(r->rid)) { 2556 + ret = schemata_list_add(r, CDP_CODE); 2557 + if (ret) 2558 + break; 2559 + 2560 + ret = schemata_list_add(r, CDP_DATA); 2561 + } else { 2562 + ret = schemata_list_add(r, CDP_NONE); 2563 + } 2564 + 2565 + if (ret) 2566 + break; 2567 + } 2568 + 2569 + return ret; 2570 + } 2571 + 2572 + static void schemata_list_destroy(void) 2573 + { 2574 + struct resctrl_schema *s, *tmp; 2575 + 2576 + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { 2577 + 
list_del(&s->list); 2578 + kfree(s); 2579 + } 2580 + } 2581 + 2582 + static int rdt_get_tree(struct fs_context *fc) 2583 + { 2584 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2585 + unsigned long flags = RFTYPE_CTRL_BASE; 2586 + struct rdt_mon_domain *dom; 2587 + struct rdt_resource *r; 2588 + int ret; 2589 + 2590 + cpus_read_lock(); 2591 + mutex_lock(&rdtgroup_mutex); 2592 + /* 2593 + * resctrl file system can only be mounted once. 2594 + */ 2595 + if (resctrl_mounted) { 2596 + ret = -EBUSY; 2597 + goto out; 2598 + } 2599 + 2600 + ret = rdtgroup_setup_root(ctx); 2601 + if (ret) 2602 + goto out; 2603 + 2604 + ret = rdt_enable_ctx(ctx); 2605 + if (ret) 2606 + goto out_root; 2607 + 2608 + ret = schemata_list_create(); 2609 + if (ret) { 2610 + schemata_list_destroy(); 2611 + goto out_ctx; 2612 + } 2613 + 2614 + ret = closid_init(); 2615 + if (ret) 2616 + goto out_schemata_free; 2617 + 2618 + if (resctrl_arch_mon_capable()) 2619 + flags |= RFTYPE_MON; 2620 + 2621 + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); 2622 + if (ret) 2623 + goto out_closid_exit; 2624 + 2625 + kernfs_activate(rdtgroup_default.kn); 2626 + 2627 + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); 2628 + if (ret < 0) 2629 + goto out_closid_exit; 2630 + 2631 + if (resctrl_arch_mon_capable()) { 2632 + ret = mongroup_create_dir(rdtgroup_default.kn, 2633 + &rdtgroup_default, "mon_groups", 2634 + &kn_mongrp); 2635 + if (ret < 0) 2636 + goto out_info; 2637 + 2638 + ret = mkdir_mondata_all(rdtgroup_default.kn, 2639 + &rdtgroup_default, &kn_mondata); 2640 + if (ret < 0) 2641 + goto out_mongrp; 2642 + rdtgroup_default.mon.mon_data_kn = kn_mondata; 2643 + } 2644 + 2645 + ret = rdt_pseudo_lock_init(); 2646 + if (ret) 2647 + goto out_mondata; 2648 + 2649 + ret = kernfs_get_tree(fc); 2650 + if (ret < 0) 2651 + goto out_psl; 2652 + 2653 + if (resctrl_arch_alloc_capable()) 2654 + resctrl_arch_enable_alloc(); 2655 + if (resctrl_arch_mon_capable()) 2656 + resctrl_arch_enable_mon(); 2657 + 2658 + if 
(resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) 2659 + resctrl_mounted = true; 2660 + 2661 + if (resctrl_is_mbm_enabled()) { 2662 + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); 2663 + list_for_each_entry(dom, &r->mon_domains, hdr.list) 2664 + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, 2665 + RESCTRL_PICK_ANY_CPU); 2666 + } 2667 + 2668 + goto out; 2669 + 2670 + out_psl: 2671 + rdt_pseudo_lock_release(); 2672 + out_mondata: 2673 + if (resctrl_arch_mon_capable()) 2674 + kernfs_remove(kn_mondata); 2675 + out_mongrp: 2676 + if (resctrl_arch_mon_capable()) 2677 + kernfs_remove(kn_mongrp); 2678 + out_info: 2679 + kernfs_remove(kn_info); 2680 + out_closid_exit: 2681 + closid_exit(); 2682 + out_schemata_free: 2683 + schemata_list_destroy(); 2684 + out_ctx: 2685 + rdt_disable_ctx(); 2686 + out_root: 2687 + rdtgroup_destroy_root(); 2688 + out: 2689 + rdt_last_cmd_clear(); 2690 + mutex_unlock(&rdtgroup_mutex); 2691 + cpus_read_unlock(); 2692 + return ret; 2693 + } 2694 + 2695 + enum rdt_param { 2696 + Opt_cdp, 2697 + Opt_cdpl2, 2698 + Opt_mba_mbps, 2699 + Opt_debug, 2700 + nr__rdt_params 2701 + }; 2702 + 2703 + static const struct fs_parameter_spec rdt_fs_parameters[] = { 2704 + fsparam_flag("cdp", Opt_cdp), 2705 + fsparam_flag("cdpl2", Opt_cdpl2), 2706 + fsparam_flag("mba_MBps", Opt_mba_mbps), 2707 + fsparam_flag("debug", Opt_debug), 2708 + {} 2709 + }; 2710 + 2711 + static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) 2712 + { 2713 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2714 + struct fs_parse_result result; 2715 + const char *msg; 2716 + int opt; 2717 + 2718 + opt = fs_parse(fc, rdt_fs_parameters, param, &result); 2719 + if (opt < 0) 2720 + return opt; 2721 + 2722 + switch (opt) { 2723 + case Opt_cdp: 2724 + ctx->enable_cdpl3 = true; 2725 + return 0; 2726 + case Opt_cdpl2: 2727 + ctx->enable_cdpl2 = true; 2728 + return 0; 2729 + case Opt_mba_mbps: 2730 + msg = "mba_MBps requires MBM and linear scale MBA at L3 
scope"; 2731 + if (!supports_mba_mbps()) 2732 + return invalfc(fc, msg); 2733 + ctx->enable_mba_mbps = true; 2734 + return 0; 2735 + case Opt_debug: 2736 + ctx->enable_debug = true; 2737 + return 0; 2738 + } 2739 + 2740 + return -EINVAL; 2741 + } 2742 + 2743 + static void rdt_fs_context_free(struct fs_context *fc) 2744 + { 2745 + struct rdt_fs_context *ctx = rdt_fc2context(fc); 2746 + 2747 + kernfs_free_fs_context(fc); 2748 + kfree(ctx); 2749 + } 2750 + 2751 + static const struct fs_context_operations rdt_fs_context_ops = { 2752 + .free = rdt_fs_context_free, 2753 + .parse_param = rdt_parse_param, 2754 + .get_tree = rdt_get_tree, 2755 + }; 2756 + 2757 + static int rdt_init_fs_context(struct fs_context *fc) 2758 + { 2759 + struct rdt_fs_context *ctx; 2760 + 2761 + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 2762 + if (!ctx) 2763 + return -ENOMEM; 2764 + 2765 + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; 2766 + fc->fs_private = &ctx->kfc; 2767 + fc->ops = &rdt_fs_context_ops; 2768 + put_user_ns(fc->user_ns); 2769 + fc->user_ns = get_user_ns(&init_user_ns); 2770 + fc->global = true; 2771 + return 0; 2772 + } 2773 + 2774 + /* 2775 + * Move tasks from one to the other group. If @from is NULL, then all tasks 2776 + * in the systems are moved unconditionally (used for teardown). 2777 + * 2778 + * If @mask is not NULL the cpus on which moved tasks are running are set 2779 + * in that mask so the update smp function call is restricted to affected 2780 + * cpus. 2781 + */ 2782 + static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, 2783 + struct cpumask *mask) 2784 + { 2785 + struct task_struct *p, *t; 2786 + 2787 + read_lock(&tasklist_lock); 2788 + for_each_process_thread(p, t) { 2789 + if (!from || is_closid_match(t, from) || 2790 + is_rmid_match(t, from)) { 2791 + resctrl_arch_set_closid_rmid(t, to->closid, 2792 + to->mon.rmid); 2793 + 2794 + /* 2795 + * Order the closid/rmid stores above before the loads 2796 + * in task_curr(). 
This pairs with the full barrier 2797 + * between the rq->curr update and 2798 + * resctrl_arch_sched_in() during context switch. 2799 + */ 2800 + smp_mb(); 2801 + 2802 + /* 2803 + * If the task is on a CPU, set the CPU in the mask. 2804 + * The detection is inaccurate as tasks might move or 2805 + * schedule before the smp function call takes place. 2806 + * In such a case the function call is pointless, but 2807 + * there is no other side effect. 2808 + */ 2809 + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) 2810 + cpumask_set_cpu(task_cpu(t), mask); 2811 + } 2812 + } 2813 + read_unlock(&tasklist_lock); 2814 + } 2815 + 2816 + static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) 2817 + { 2818 + struct rdtgroup *sentry, *stmp; 2819 + struct list_head *head; 2820 + 2821 + head = &rdtgrp->mon.crdtgrp_list; 2822 + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { 2823 + free_rmid(sentry->closid, sentry->mon.rmid); 2824 + list_del(&sentry->mon.crdtgrp_list); 2825 + 2826 + if (atomic_read(&sentry->waitcount) != 0) 2827 + sentry->flags = RDT_DELETED; 2828 + else 2829 + rdtgroup_remove(sentry); 2830 + } 2831 + } 2832 + 2833 + /* 2834 + * Forcibly remove all of subdirectories under root. 2835 + */ 2836 + static void rmdir_all_sub(void) 2837 + { 2838 + struct rdtgroup *rdtgrp, *tmp; 2839 + 2840 + /* Move all tasks to the default resource group */ 2841 + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 2842 + 2843 + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 2844 + /* Free any child rmids */ 2845 + free_all_child_rdtgrp(rdtgrp); 2846 + 2847 + /* Remove each rdtgroup other than root */ 2848 + if (rdtgrp == &rdtgroup_default) 2849 + continue; 2850 + 2851 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 2852 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) 2853 + rdtgroup_pseudo_lock_remove(rdtgrp); 2854 + 2855 + /* 2856 + * Give any CPUs back to the default group. 
We cannot copy 2857 + * cpu_online_mask because a CPU might have executed the 2858 + * offline callback already, but is still marked online. 2859 + */ 2860 + cpumask_or(&rdtgroup_default.cpu_mask, 2861 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 2862 + 2863 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 2864 + 2865 + kernfs_remove(rdtgrp->kn); 2866 + list_del(&rdtgrp->rdtgroup_list); 2867 + 2868 + if (atomic_read(&rdtgrp->waitcount) != 0) 2869 + rdtgrp->flags = RDT_DELETED; 2870 + else 2871 + rdtgroup_remove(rdtgrp); 2872 + } 2873 + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 2874 + update_closid_rmid(cpu_online_mask, &rdtgroup_default); 2875 + 2876 + kernfs_remove(kn_info); 2877 + kernfs_remove(kn_mongrp); 2878 + kernfs_remove(kn_mondata); 2879 + } 2880 + 2881 + /** 2882 + * mon_get_kn_priv() - Get the mon_data priv data for this event. 2883 + * 2884 + * The same values are used across the mon_data directories of all control and 2885 + * monitor groups for the same event in the same domain. Keep a list of 2886 + * allocated structures and re-use an existing one with the same values for 2887 + * @rid, @domid, etc. 2888 + * 2889 + * @rid: The resource id for the event file being created. 2890 + * @domid: The domain id for the event file being created. 2891 + * @mevt: The type of event file being created. 2892 + * @do_sum: Whether SNC summing monitors are being created. 
2893 + */ 2894 + static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, 2895 + struct mon_evt *mevt, 2896 + bool do_sum) 2897 + { 2898 + struct mon_data *priv; 2899 + 2900 + lockdep_assert_held(&rdtgroup_mutex); 2901 + 2902 + list_for_each_entry(priv, &mon_data_kn_priv_list, list) { 2903 + if (priv->rid == rid && priv->domid == domid && 2904 + priv->sum == do_sum && priv->evtid == mevt->evtid) 2905 + return priv; 2906 + } 2907 + 2908 + priv = kzalloc(sizeof(*priv), GFP_KERNEL); 2909 + if (!priv) 2910 + return NULL; 2911 + 2912 + priv->rid = rid; 2913 + priv->domid = domid; 2914 + priv->sum = do_sum; 2915 + priv->evtid = mevt->evtid; 2916 + list_add_tail(&priv->list, &mon_data_kn_priv_list); 2917 + 2918 + return priv; 2919 + } 2920 + 2921 + /** 2922 + * mon_put_kn_priv() - Free all allocated mon_data structures. 2923 + * 2924 + * Called when resctrl file system is unmounted. 2925 + */ 2926 + static void mon_put_kn_priv(void) 2927 + { 2928 + struct mon_data *priv, *tmp; 2929 + 2930 + lockdep_assert_held(&rdtgroup_mutex); 2931 + 2932 + list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) { 2933 + list_del(&priv->list); 2934 + kfree(priv); 2935 + } 2936 + } 2937 + 2938 + static void resctrl_fs_teardown(void) 2939 + { 2940 + lockdep_assert_held(&rdtgroup_mutex); 2941 + 2942 + /* Cleared by rdtgroup_destroy_root() */ 2943 + if (!rdtgroup_default.kn) 2944 + return; 2945 + 2946 + rmdir_all_sub(); 2947 + mon_put_kn_priv(); 2948 + rdt_pseudo_lock_release(); 2949 + rdtgroup_default.mode = RDT_MODE_SHAREABLE; 2950 + closid_exit(); 2951 + schemata_list_destroy(); 2952 + rdtgroup_destroy_root(); 2953 + } 2954 + 2955 + static void rdt_kill_sb(struct super_block *sb) 2956 + { 2957 + struct rdt_resource *r; 2958 + 2959 + cpus_read_lock(); 2960 + mutex_lock(&rdtgroup_mutex); 2961 + 2962 + rdt_disable_ctx(); 2963 + 2964 + /* Put everything back to default values. 
*/ 2965 + for_each_alloc_capable_rdt_resource(r) 2966 + resctrl_arch_reset_all_ctrls(r); 2967 + 2968 + resctrl_fs_teardown(); 2969 + if (resctrl_arch_alloc_capable()) 2970 + resctrl_arch_disable_alloc(); 2971 + if (resctrl_arch_mon_capable()) 2972 + resctrl_arch_disable_mon(); 2973 + resctrl_mounted = false; 2974 + kernfs_kill_sb(sb); 2975 + mutex_unlock(&rdtgroup_mutex); 2976 + cpus_read_unlock(); 2977 + } 2978 + 2979 + static struct file_system_type rdt_fs_type = { 2980 + .name = "resctrl", 2981 + .init_fs_context = rdt_init_fs_context, 2982 + .parameters = rdt_fs_parameters, 2983 + .kill_sb = rdt_kill_sb, 2984 + }; 2985 + 2986 + static int mon_addfile(struct kernfs_node *parent_kn, const char *name, 2987 + void *priv) 2988 + { 2989 + struct kernfs_node *kn; 2990 + int ret = 0; 2991 + 2992 + kn = __kernfs_create_file(parent_kn, name, 0444, 2993 + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, 2994 + &kf_mondata_ops, priv, NULL, NULL); 2995 + if (IS_ERR(kn)) 2996 + return PTR_ERR(kn); 2997 + 2998 + ret = rdtgroup_kn_set_ugid(kn); 2999 + if (ret) { 3000 + kernfs_remove(kn); 3001 + return ret; 3002 + } 3003 + 3004 + return ret; 3005 + } 3006 + 3007 + static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) 3008 + { 3009 + struct kernfs_node *kn; 3010 + 3011 + kn = kernfs_find_and_get(pkn, name); 3012 + if (!kn) 3013 + return; 3014 + kernfs_put(kn); 3015 + 3016 + if (kn->dir.subdirs <= 1) 3017 + kernfs_remove(kn); 3018 + else 3019 + kernfs_remove_by_name(kn, subname); 3020 + } 3021 + 3022 + /* 3023 + * Remove all subdirectories of mon_data of ctrl_mon groups 3024 + * and monitor groups for the given domain. 3025 + * Remove files and directories containing "sum" of domain data 3026 + * when last domain being summed is removed. 
3027 + */ 3028 + static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 3029 + struct rdt_mon_domain *d) 3030 + { 3031 + struct rdtgroup *prgrp, *crgrp; 3032 + char subname[32]; 3033 + bool snc_mode; 3034 + char name[32]; 3035 + 3036 + snc_mode = r->mon_scope == RESCTRL_L3_NODE; 3037 + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 3038 + if (snc_mode) 3039 + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); 3040 + 3041 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 3042 + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); 3043 + 3044 + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) 3045 + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); 3046 + } 3047 + } 3048 + 3049 + static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, 3050 + struct rdt_resource *r, struct rdtgroup *prgrp, 3051 + bool do_sum) 3052 + { 3053 + struct rmid_read rr = {0}; 3054 + struct mon_data *priv; 3055 + struct mon_evt *mevt; 3056 + int ret, domid; 3057 + 3058 + if (WARN_ON(list_empty(&r->evt_list))) 3059 + return -EPERM; 3060 + 3061 + list_for_each_entry(mevt, &r->evt_list, list) { 3062 + domid = do_sum ? 
d->ci->id : d->hdr.id; 3063 + priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); 3064 + if (WARN_ON_ONCE(!priv)) 3065 + return -EINVAL; 3066 + 3067 + ret = mon_addfile(kn, mevt->name, priv); 3068 + if (ret) 3069 + return ret; 3070 + 3071 + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) 3072 + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); 3073 + } 3074 + 3075 + return 0; 3076 + } 3077 + 3078 + static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, 3079 + struct rdt_mon_domain *d, 3080 + struct rdt_resource *r, struct rdtgroup *prgrp) 3081 + { 3082 + struct kernfs_node *kn, *ckn; 3083 + char name[32]; 3084 + bool snc_mode; 3085 + int ret = 0; 3086 + 3087 + lockdep_assert_held(&rdtgroup_mutex); 3088 + 3089 + snc_mode = r->mon_scope == RESCTRL_L3_NODE; 3090 + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); 3091 + kn = kernfs_find_and_get(parent_kn, name); 3092 + if (kn) { 3093 + /* 3094 + * rdtgroup_mutex will prevent this directory from being 3095 + * removed. No need to keep this hold. 
3096 + */ 3097 + kernfs_put(kn); 3098 + } else { 3099 + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); 3100 + if (IS_ERR(kn)) 3101 + return PTR_ERR(kn); 3102 + 3103 + ret = rdtgroup_kn_set_ugid(kn); 3104 + if (ret) 3105 + goto out_destroy; 3106 + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); 3107 + if (ret) 3108 + goto out_destroy; 3109 + } 3110 + 3111 + if (snc_mode) { 3112 + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); 3113 + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); 3114 + if (IS_ERR(ckn)) { 3115 + ret = -EINVAL; 3116 + goto out_destroy; 3117 + } 3118 + 3119 + ret = rdtgroup_kn_set_ugid(ckn); 3120 + if (ret) 3121 + goto out_destroy; 3122 + 3123 + ret = mon_add_all_files(ckn, d, r, prgrp, false); 3124 + if (ret) 3125 + goto out_destroy; 3126 + } 3127 + 3128 + kernfs_activate(kn); 3129 + return 0; 3130 + 3131 + out_destroy: 3132 + kernfs_remove(kn); 3133 + return ret; 3134 + } 3135 + 3136 + /* 3137 + * Add all subdirectories of mon_data for "ctrl_mon" groups 3138 + * and "monitor" groups with given domain id. 
3139 + */ 3140 + static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, 3141 + struct rdt_mon_domain *d) 3142 + { 3143 + struct kernfs_node *parent_kn; 3144 + struct rdtgroup *prgrp, *crgrp; 3145 + struct list_head *head; 3146 + 3147 + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 3148 + parent_kn = prgrp->mon.mon_data_kn; 3149 + mkdir_mondata_subdir(parent_kn, d, r, prgrp); 3150 + 3151 + head = &prgrp->mon.crdtgrp_list; 3152 + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { 3153 + parent_kn = crgrp->mon.mon_data_kn; 3154 + mkdir_mondata_subdir(parent_kn, d, r, crgrp); 3155 + } 3156 + } 3157 + } 3158 + 3159 + static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, 3160 + struct rdt_resource *r, 3161 + struct rdtgroup *prgrp) 3162 + { 3163 + struct rdt_mon_domain *dom; 3164 + int ret; 3165 + 3166 + /* Walking r->domains, ensure it can't race with cpuhp */ 3167 + lockdep_assert_cpus_held(); 3168 + 3169 + list_for_each_entry(dom, &r->mon_domains, hdr.list) { 3170 + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); 3171 + if (ret) 3172 + return ret; 3173 + } 3174 + 3175 + return 0; 3176 + } 3177 + 3178 + /* 3179 + * This creates a directory mon_data which contains the monitored data. 3180 + * 3181 + * mon_data has one directory for each domain which are named 3182 + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data 3183 + * with L3 domain looks as below: 3184 + * ./mon_data: 3185 + * mon_L3_00 3186 + * mon_L3_01 3187 + * mon_L3_02 3188 + * ... 3189 + * 3190 + * Each domain directory has one file per event: 3191 + * ./mon_L3_00/: 3192 + * llc_occupancy 3193 + * 3194 + */ 3195 + static int mkdir_mondata_all(struct kernfs_node *parent_kn, 3196 + struct rdtgroup *prgrp, 3197 + struct kernfs_node **dest_kn) 3198 + { 3199 + struct rdt_resource *r; 3200 + struct kernfs_node *kn; 3201 + int ret; 3202 + 3203 + /* 3204 + * Create the mon_data directory first. 
3205 + */ 3206 + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); 3207 + if (ret) 3208 + return ret; 3209 + 3210 + if (dest_kn) 3211 + *dest_kn = kn; 3212 + 3213 + /* 3214 + * Create the subdirectories for each domain. Note that all events 3215 + * in a domain like L3 are grouped into a resource whose domain is L3 3216 + */ 3217 + for_each_mon_capable_rdt_resource(r) { 3218 + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); 3219 + if (ret) 3220 + goto out_destroy; 3221 + } 3222 + 3223 + return 0; 3224 + 3225 + out_destroy: 3226 + kernfs_remove(kn); 3227 + return ret; 3228 + } 3229 + 3230 + /** 3231 + * cbm_ensure_valid - Enforce validity on provided CBM 3232 + * @_val: Candidate CBM 3233 + * @r: RDT resource to which the CBM belongs 3234 + * 3235 + * The provided CBM represents all cache portions available for use. This 3236 + * may be represented by a bitmap that does not consist of contiguous ones 3237 + * and thus be an invalid CBM. 3238 + * Here the provided CBM is forced to be a valid CBM by only considering 3239 + * the first set of contiguous bits as valid and clearing all bits. 3240 + * The intention here is to provide a valid default CBM with which a new 3241 + * resource group is initialized. The user can follow this with a 3242 + * modification to the CBM if the default does not satisfy the 3243 + * requirements. 
3244 + */ 3245 + static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) 3246 + { 3247 + unsigned int cbm_len = r->cache.cbm_len; 3248 + unsigned long first_bit, zero_bit; 3249 + unsigned long val = _val; 3250 + 3251 + if (!val) 3252 + return 0; 3253 + 3254 + first_bit = find_first_bit(&val, cbm_len); 3255 + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); 3256 + 3257 + /* Clear any remaining bits to ensure contiguous region */ 3258 + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); 3259 + return (u32)val; 3260 + } 3261 + 3262 + /* 3263 + * Initialize cache resources per RDT domain 3264 + * 3265 + * Set the RDT domain up to start off with all usable allocations. That is, 3266 + * all shareable and unused bits. All-zero CBM is invalid. 3267 + */ 3268 + static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, 3269 + u32 closid) 3270 + { 3271 + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); 3272 + enum resctrl_conf_type t = s->conf_type; 3273 + struct resctrl_staged_config *cfg; 3274 + struct rdt_resource *r = s->res; 3275 + u32 used_b = 0, unused_b = 0; 3276 + unsigned long tmp_cbm; 3277 + enum rdtgrp_mode mode; 3278 + u32 peer_ctl, ctrl_val; 3279 + int i; 3280 + 3281 + cfg = &d->staged_config[t]; 3282 + cfg->have_new_ctrl = false; 3283 + cfg->new_ctrl = r->cache.shareable_bits; 3284 + used_b = r->cache.shareable_bits; 3285 + for (i = 0; i < closids_supported(); i++) { 3286 + if (closid_allocated(i) && i != closid) { 3287 + mode = rdtgroup_mode_by_closid(i); 3288 + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) 3289 + /* 3290 + * ctrl values for locksetup aren't relevant 3291 + * until the schemata is written, and the mode 3292 + * becomes RDT_MODE_PSEUDO_LOCKED. 3293 + */ 3294 + continue; 3295 + /* 3296 + * If CDP is active include peer domain's 3297 + * usage to ensure there is no overlap 3298 + * with an exclusive group. 
3299 + */ 3300 + if (resctrl_arch_get_cdp_enabled(r->rid)) 3301 + peer_ctl = resctrl_arch_get_config(r, d, i, 3302 + peer_type); 3303 + else 3304 + peer_ctl = 0; 3305 + ctrl_val = resctrl_arch_get_config(r, d, i, 3306 + s->conf_type); 3307 + used_b |= ctrl_val | peer_ctl; 3308 + if (mode == RDT_MODE_SHAREABLE) 3309 + cfg->new_ctrl |= ctrl_val | peer_ctl; 3310 + } 3311 + } 3312 + if (d->plr && d->plr->cbm > 0) 3313 + used_b |= d->plr->cbm; 3314 + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); 3315 + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; 3316 + cfg->new_ctrl |= unused_b; 3317 + /* 3318 + * Force the initial CBM to be valid, user can 3319 + * modify the CBM based on system availability. 3320 + */ 3321 + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); 3322 + /* 3323 + * Assign the u32 CBM to an unsigned long to ensure that 3324 + * bitmap_weight() does not access out-of-bound memory. 3325 + */ 3326 + tmp_cbm = cfg->new_ctrl; 3327 + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { 3328 + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); 3329 + return -ENOSPC; 3330 + } 3331 + cfg->have_new_ctrl = true; 3332 + 3333 + return 0; 3334 + } 3335 + 3336 + /* 3337 + * Initialize cache resources with default values. 3338 + * 3339 + * A new RDT group is being created on an allocation capable (CAT) 3340 + * supporting system. Set this group up to start off with all usable 3341 + * allocations. 3342 + * 3343 + * If there are no more shareable bits available on any domain then 3344 + * the entire allocation will fail. 3345 + */ 3346 + static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) 3347 + { 3348 + struct rdt_ctrl_domain *d; 3349 + int ret; 3350 + 3351 + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { 3352 + ret = __init_one_rdt_domain(d, s, closid); 3353 + if (ret < 0) 3354 + return ret; 3355 + } 3356 + 3357 + return 0; 3358 + } 3359 + 3360 + /* Initialize MBA resource with default values. 
*/ 3361 + static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) 3362 + { 3363 + struct resctrl_staged_config *cfg; 3364 + struct rdt_ctrl_domain *d; 3365 + 3366 + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { 3367 + if (is_mba_sc(r)) { 3368 + d->mbps_val[closid] = MBA_MAX_MBPS; 3369 + continue; 3370 + } 3371 + 3372 + cfg = &d->staged_config[CDP_NONE]; 3373 + cfg->new_ctrl = resctrl_get_default_ctrl(r); 3374 + cfg->have_new_ctrl = true; 3375 + } 3376 + } 3377 + 3378 + /* Initialize the RDT group's allocations. */ 3379 + static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) 3380 + { 3381 + struct resctrl_schema *s; 3382 + struct rdt_resource *r; 3383 + int ret = 0; 3384 + 3385 + rdt_staged_configs_clear(); 3386 + 3387 + list_for_each_entry(s, &resctrl_schema_all, list) { 3388 + r = s->res; 3389 + if (r->rid == RDT_RESOURCE_MBA || 3390 + r->rid == RDT_RESOURCE_SMBA) { 3391 + rdtgroup_init_mba(r, rdtgrp->closid); 3392 + if (is_mba_sc(r)) 3393 + continue; 3394 + } else { 3395 + ret = rdtgroup_init_cat(s, rdtgrp->closid); 3396 + if (ret < 0) 3397 + goto out; 3398 + } 3399 + 3400 + ret = resctrl_arch_update_domains(r, rdtgrp->closid); 3401 + if (ret < 0) { 3402 + rdt_last_cmd_puts("Failed to initialize allocations\n"); 3403 + goto out; 3404 + } 3405 + } 3406 + 3407 + rdtgrp->mode = RDT_MODE_SHAREABLE; 3408 + 3409 + out: 3410 + rdt_staged_configs_clear(); 3411 + return ret; 3412 + } 3413 + 3414 + static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) 3415 + { 3416 + int ret; 3417 + 3418 + if (!resctrl_arch_mon_capable()) 3419 + return 0; 3420 + 3421 + ret = alloc_rmid(rdtgrp->closid); 3422 + if (ret < 0) { 3423 + rdt_last_cmd_puts("Out of RMIDs\n"); 3424 + return ret; 3425 + } 3426 + rdtgrp->mon.rmid = ret; 3427 + 3428 + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); 3429 + if (ret) { 3430 + rdt_last_cmd_puts("kernfs subdir error\n"); 3431 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3432 + return ret; 3433 + } 3434 
+ 3435 + return 0; 3436 + } 3437 + 3438 + static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) 3439 + { 3440 + if (resctrl_arch_mon_capable()) 3441 + free_rmid(rgrp->closid, rgrp->mon.rmid); 3442 + } 3443 + 3444 + /* 3445 + * We allow creating mon groups only with in a directory called "mon_groups" 3446 + * which is present in every ctrl_mon group. Check if this is a valid 3447 + * "mon_groups" directory. 3448 + * 3449 + * 1. The directory should be named "mon_groups". 3450 + * 2. The mon group itself should "not" be named "mon_groups". 3451 + * This makes sure "mon_groups" directory always has a ctrl_mon group 3452 + * as parent. 3453 + */ 3454 + static bool is_mon_groups(struct kernfs_node *kn, const char *name) 3455 + { 3456 + return (!strcmp(rdt_kn_name(kn), "mon_groups") && 3457 + strcmp(name, "mon_groups")); 3458 + } 3459 + 3460 + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, 3461 + const char *name, umode_t mode, 3462 + enum rdt_group_type rtype, struct rdtgroup **r) 3463 + { 3464 + struct rdtgroup *prdtgrp, *rdtgrp; 3465 + unsigned long files = 0; 3466 + struct kernfs_node *kn; 3467 + int ret; 3468 + 3469 + prdtgrp = rdtgroup_kn_lock_live(parent_kn); 3470 + if (!prdtgrp) { 3471 + ret = -ENODEV; 3472 + goto out_unlock; 3473 + } 3474 + 3475 + /* 3476 + * Check that the parent directory for a monitor group is a "mon_groups" 3477 + * directory. 3478 + */ 3479 + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { 3480 + ret = -EPERM; 3481 + goto out_unlock; 3482 + } 3483 + 3484 + if (rtype == RDTMON_GROUP && 3485 + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 3486 + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { 3487 + ret = -EINVAL; 3488 + rdt_last_cmd_puts("Pseudo-locking in progress\n"); 3489 + goto out_unlock; 3490 + } 3491 + 3492 + /* allocate the rdtgroup. 
*/ 3493 + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 3494 + if (!rdtgrp) { 3495 + ret = -ENOSPC; 3496 + rdt_last_cmd_puts("Kernel out of memory\n"); 3497 + goto out_unlock; 3498 + } 3499 + *r = rdtgrp; 3500 + rdtgrp->mon.parent = prdtgrp; 3501 + rdtgrp->type = rtype; 3502 + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); 3503 + 3504 + /* kernfs creates the directory for rdtgrp */ 3505 + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); 3506 + if (IS_ERR(kn)) { 3507 + ret = PTR_ERR(kn); 3508 + rdt_last_cmd_puts("kernfs create error\n"); 3509 + goto out_free_rgrp; 3510 + } 3511 + rdtgrp->kn = kn; 3512 + 3513 + /* 3514 + * kernfs_remove() will drop the reference count on "kn" which 3515 + * will free it. But we still need it to stick around for the 3516 + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, 3517 + * which will be dropped by kernfs_put() in rdtgroup_remove(). 3518 + */ 3519 + kernfs_get(kn); 3520 + 3521 + ret = rdtgroup_kn_set_ugid(kn); 3522 + if (ret) { 3523 + rdt_last_cmd_puts("kernfs perm error\n"); 3524 + goto out_destroy; 3525 + } 3526 + 3527 + if (rtype == RDTCTRL_GROUP) { 3528 + files = RFTYPE_BASE | RFTYPE_CTRL; 3529 + if (resctrl_arch_mon_capable()) 3530 + files |= RFTYPE_MON; 3531 + } else { 3532 + files = RFTYPE_BASE | RFTYPE_MON; 3533 + } 3534 + 3535 + ret = rdtgroup_add_files(kn, files); 3536 + if (ret) { 3537 + rdt_last_cmd_puts("kernfs fill error\n"); 3538 + goto out_destroy; 3539 + } 3540 + 3541 + /* 3542 + * The caller unlocks the parent_kn upon success. 
3543 + */ 3544 + return 0; 3545 + 3546 + out_destroy: 3547 + kernfs_put(rdtgrp->kn); 3548 + kernfs_remove(rdtgrp->kn); 3549 + out_free_rgrp: 3550 + kfree(rdtgrp); 3551 + out_unlock: 3552 + rdtgroup_kn_unlock(parent_kn); 3553 + return ret; 3554 + } 3555 + 3556 + static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) 3557 + { 3558 + kernfs_remove(rgrp->kn); 3559 + rdtgroup_remove(rgrp); 3560 + } 3561 + 3562 + /* 3563 + * Create a monitor group under "mon_groups" directory of a control 3564 + * and monitor group(ctrl_mon). This is a resource group 3565 + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 3566 + */ 3567 + static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, 3568 + const char *name, umode_t mode) 3569 + { 3570 + struct rdtgroup *rdtgrp, *prgrp; 3571 + int ret; 3572 + 3573 + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); 3574 + if (ret) 3575 + return ret; 3576 + 3577 + prgrp = rdtgrp->mon.parent; 3578 + rdtgrp->closid = prgrp->closid; 3579 + 3580 + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 3581 + if (ret) { 3582 + mkdir_rdt_prepare_clean(rdtgrp); 3583 + goto out_unlock; 3584 + } 3585 + 3586 + kernfs_activate(rdtgrp->kn); 3587 + 3588 + /* 3589 + * Add the rdtgrp to the list of rdtgrps the parent 3590 + * ctrl_mon group has to track. 3591 + */ 3592 + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); 3593 + 3594 + out_unlock: 3595 + rdtgroup_kn_unlock(parent_kn); 3596 + return ret; 3597 + } 3598 + 3599 + /* 3600 + * These are rdtgroups created under the root directory. Can be used 3601 + * to allocate and monitor resources. 
3602 + */ 3603 + static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, 3604 + const char *name, umode_t mode) 3605 + { 3606 + struct rdtgroup *rdtgrp; 3607 + struct kernfs_node *kn; 3608 + u32 closid; 3609 + int ret; 3610 + 3611 + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); 3612 + if (ret) 3613 + return ret; 3614 + 3615 + kn = rdtgrp->kn; 3616 + ret = closid_alloc(); 3617 + if (ret < 0) { 3618 + rdt_last_cmd_puts("Out of CLOSIDs\n"); 3619 + goto out_common_fail; 3620 + } 3621 + closid = ret; 3622 + ret = 0; 3623 + 3624 + rdtgrp->closid = closid; 3625 + 3626 + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); 3627 + if (ret) 3628 + goto out_closid_free; 3629 + 3630 + kernfs_activate(rdtgrp->kn); 3631 + 3632 + ret = rdtgroup_init_alloc(rdtgrp); 3633 + if (ret < 0) 3634 + goto out_rmid_free; 3635 + 3636 + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 3637 + 3638 + if (resctrl_arch_mon_capable()) { 3639 + /* 3640 + * Create an empty mon_groups directory to hold the subset 3641 + * of tasks and cpus to monitor. 3642 + */ 3643 + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); 3644 + if (ret) { 3645 + rdt_last_cmd_puts("kernfs subdir error\n"); 3646 + goto out_del_list; 3647 + } 3648 + if (is_mba_sc(NULL)) 3649 + rdtgrp->mba_mbps_event = mba_mbps_default_event; 3650 + } 3651 + 3652 + goto out_unlock; 3653 + 3654 + out_del_list: 3655 + list_del(&rdtgrp->rdtgroup_list); 3656 + out_rmid_free: 3657 + mkdir_rdt_prepare_rmid_free(rdtgrp); 3658 + out_closid_free: 3659 + closid_free(closid); 3660 + out_common_fail: 3661 + mkdir_rdt_prepare_clean(rdtgrp); 3662 + out_unlock: 3663 + rdtgroup_kn_unlock(parent_kn); 3664 + return ret; 3665 + } 3666 + 3667 + static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 3668 + umode_t mode) 3669 + { 3670 + /* Do not accept '\n' to avoid unparsable situation. 
*/ 3671 + if (strchr(name, '\n')) 3672 + return -EINVAL; 3673 + 3674 + /* 3675 + * If the parent directory is the root directory and RDT 3676 + * allocation is supported, add a control and monitoring 3677 + * subdirectory 3678 + */ 3679 + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) 3680 + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); 3681 + 3682 + /* Else, attempt to add a monitoring subdirectory. */ 3683 + if (resctrl_arch_mon_capable()) 3684 + return rdtgroup_mkdir_mon(parent_kn, name, mode); 3685 + 3686 + return -EPERM; 3687 + } 3688 + 3689 + static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 3690 + { 3691 + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 3692 + u32 closid, rmid; 3693 + int cpu; 3694 + 3695 + /* Give any tasks back to the parent group */ 3696 + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); 3697 + 3698 + /* 3699 + * Update per cpu closid/rmid of the moved CPUs first. 3700 + * Note: the closid will not change, but the arch code still needs it. 3701 + */ 3702 + closid = prdtgrp->closid; 3703 + rmid = prdtgrp->mon.rmid; 3704 + for_each_cpu(cpu, &rdtgrp->cpu_mask) 3705 + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 3706 + 3707 + /* 3708 + * Update the MSR on moved CPUs and CPUs which have moved 3709 + * task running on them. 
3710 + */ 3711 + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 3712 + update_closid_rmid(tmpmask, NULL); 3713 + 3714 + rdtgrp->flags = RDT_DELETED; 3715 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3716 + 3717 + /* 3718 + * Remove the rdtgrp from the parent ctrl_mon group's list 3719 + */ 3720 + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 3721 + list_del(&rdtgrp->mon.crdtgrp_list); 3722 + 3723 + kernfs_remove(rdtgrp->kn); 3724 + 3725 + return 0; 3726 + } 3727 + 3728 + static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) 3729 + { 3730 + rdtgrp->flags = RDT_DELETED; 3731 + list_del(&rdtgrp->rdtgroup_list); 3732 + 3733 + kernfs_remove(rdtgrp->kn); 3734 + return 0; 3735 + } 3736 + 3737 + static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) 3738 + { 3739 + u32 closid, rmid; 3740 + int cpu; 3741 + 3742 + /* Give any tasks back to the default group */ 3743 + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 3744 + 3745 + /* Give any CPUs back to the default group */ 3746 + cpumask_or(&rdtgroup_default.cpu_mask, 3747 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 3748 + 3749 + /* Update per cpu closid and rmid of the moved CPUs first */ 3750 + closid = rdtgroup_default.closid; 3751 + rmid = rdtgroup_default.mon.rmid; 3752 + for_each_cpu(cpu, &rdtgrp->cpu_mask) 3753 + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); 3754 + 3755 + /* 3756 + * Update the MSR on moved CPUs and CPUs which have moved 3757 + * task running on them. 3758 + */ 3759 + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 3760 + update_closid_rmid(tmpmask, NULL); 3761 + 3762 + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); 3763 + closid_free(rdtgrp->closid); 3764 + 3765 + rdtgroup_ctrl_remove(rdtgrp); 3766 + 3767 + /* 3768 + * Free all the child monitor group rmids. 
3769 + */ 3770 + free_all_child_rdtgrp(rdtgrp); 3771 + 3772 + return 0; 3773 + } 3774 + 3775 + static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) 3776 + { 3777 + /* 3778 + * Valid within the RCU section it was obtained or while rdtgroup_mutex 3779 + * is held. 3780 + */ 3781 + return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); 3782 + } 3783 + 3784 + static int rdtgroup_rmdir(struct kernfs_node *kn) 3785 + { 3786 + struct kernfs_node *parent_kn; 3787 + struct rdtgroup *rdtgrp; 3788 + cpumask_var_t tmpmask; 3789 + int ret = 0; 3790 + 3791 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 3792 + return -ENOMEM; 3793 + 3794 + rdtgrp = rdtgroup_kn_lock_live(kn); 3795 + if (!rdtgrp) { 3796 + ret = -EPERM; 3797 + goto out; 3798 + } 3799 + parent_kn = rdt_kn_parent(kn); 3800 + 3801 + /* 3802 + * If the rdtgroup is a ctrl_mon group and parent directory 3803 + * is the root directory, remove the ctrl_mon group. 3804 + * 3805 + * If the rdtgroup is a mon group and parent directory 3806 + * is a valid "mon_groups" directory, remove the mon group. 
3807 + */ 3808 + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && 3809 + rdtgrp != &rdtgroup_default) { 3810 + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || 3811 + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { 3812 + ret = rdtgroup_ctrl_remove(rdtgrp); 3813 + } else { 3814 + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); 3815 + } 3816 + } else if (rdtgrp->type == RDTMON_GROUP && 3817 + is_mon_groups(parent_kn, rdt_kn_name(kn))) { 3818 + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); 3819 + } else { 3820 + ret = -EPERM; 3821 + } 3822 + 3823 + out: 3824 + rdtgroup_kn_unlock(kn); 3825 + free_cpumask_var(tmpmask); 3826 + return ret; 3827 + } 3828 + 3829 + /** 3830 + * mongrp_reparent() - replace parent CTRL_MON group of a MON group 3831 + * @rdtgrp: the MON group whose parent should be replaced 3832 + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp 3833 + * @cpus: cpumask provided by the caller for use during this call 3834 + * 3835 + * Replaces the parent CTRL_MON group for a MON group, resulting in all member 3836 + * tasks' CLOSID immediately changing to that of the new parent group. 3837 + * Monitoring data for the group is unaffected by this operation. 3838 + */ 3839 + static void mongrp_reparent(struct rdtgroup *rdtgrp, 3840 + struct rdtgroup *new_prdtgrp, 3841 + cpumask_var_t cpus) 3842 + { 3843 + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; 3844 + 3845 + WARN_ON(rdtgrp->type != RDTMON_GROUP); 3846 + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); 3847 + 3848 + /* Nothing to do when simply renaming a MON group. */ 3849 + if (prdtgrp == new_prdtgrp) 3850 + return; 3851 + 3852 + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); 3853 + list_move_tail(&rdtgrp->mon.crdtgrp_list, 3854 + &new_prdtgrp->mon.crdtgrp_list); 3855 + 3856 + rdtgrp->mon.parent = new_prdtgrp; 3857 + rdtgrp->closid = new_prdtgrp->closid; 3858 + 3859 + /* Propagate updated closid to all tasks in this group. 
*/ 3860 + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); 3861 + 3862 + update_closid_rmid(cpus, NULL); 3863 + } 3864 + 3865 + static int rdtgroup_rename(struct kernfs_node *kn, 3866 + struct kernfs_node *new_parent, const char *new_name) 3867 + { 3868 + struct kernfs_node *kn_parent; 3869 + struct rdtgroup *new_prdtgrp; 3870 + struct rdtgroup *rdtgrp; 3871 + cpumask_var_t tmpmask; 3872 + int ret; 3873 + 3874 + rdtgrp = kernfs_to_rdtgroup(kn); 3875 + new_prdtgrp = kernfs_to_rdtgroup(new_parent); 3876 + if (!rdtgrp || !new_prdtgrp) 3877 + return -ENOENT; 3878 + 3879 + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ 3880 + rdtgroup_kn_get(rdtgrp, kn); 3881 + rdtgroup_kn_get(new_prdtgrp, new_parent); 3882 + 3883 + mutex_lock(&rdtgroup_mutex); 3884 + 3885 + rdt_last_cmd_clear(); 3886 + 3887 + /* 3888 + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if 3889 + * either kernfs_node is a file. 3890 + */ 3891 + if (kernfs_type(kn) != KERNFS_DIR || 3892 + kernfs_type(new_parent) != KERNFS_DIR) { 3893 + rdt_last_cmd_puts("Source and destination must be directories"); 3894 + ret = -EPERM; 3895 + goto out; 3896 + } 3897 + 3898 + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { 3899 + ret = -ENOENT; 3900 + goto out; 3901 + } 3902 + 3903 + kn_parent = rdt_kn_parent(kn); 3904 + if (rdtgrp->type != RDTMON_GROUP || !kn_parent || 3905 + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { 3906 + rdt_last_cmd_puts("Source must be a MON group\n"); 3907 + ret = -EPERM; 3908 + goto out; 3909 + } 3910 + 3911 + if (!is_mon_groups(new_parent, new_name)) { 3912 + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); 3913 + ret = -EPERM; 3914 + goto out; 3915 + } 3916 + 3917 + /* 3918 + * If the MON group is monitoring CPUs, the CPUs must be assigned to the 3919 + * current parent CTRL_MON group and therefore cannot be assigned to 3920 + * the new parent, making the move illegal. 
3921 + */ 3922 + if (!cpumask_empty(&rdtgrp->cpu_mask) && 3923 + rdtgrp->mon.parent != new_prdtgrp) { 3924 + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); 3925 + ret = -EPERM; 3926 + goto out; 3927 + } 3928 + 3929 + /* 3930 + * Allocate the cpumask for use in mongrp_reparent() to avoid the 3931 + * possibility of failing to allocate it after kernfs_rename() has 3932 + * succeeded. 3933 + */ 3934 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { 3935 + ret = -ENOMEM; 3936 + goto out; 3937 + } 3938 + 3939 + /* 3940 + * Perform all input validation and allocations needed to ensure 3941 + * mongrp_reparent() will succeed before calling kernfs_rename(), 3942 + * otherwise it would be necessary to revert this call if 3943 + * mongrp_reparent() failed. 3944 + */ 3945 + ret = kernfs_rename(kn, new_parent, new_name); 3946 + if (!ret) 3947 + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); 3948 + 3949 + free_cpumask_var(tmpmask); 3950 + 3951 + out: 3952 + mutex_unlock(&rdtgroup_mutex); 3953 + rdtgroup_kn_put(rdtgrp, kn); 3954 + rdtgroup_kn_put(new_prdtgrp, new_parent); 3955 + return ret; 3956 + } 3957 + 3958 + static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 3959 + { 3960 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) 3961 + seq_puts(seq, ",cdp"); 3962 + 3963 + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) 3964 + seq_puts(seq, ",cdpl2"); 3965 + 3966 + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) 3967 + seq_puts(seq, ",mba_MBps"); 3968 + 3969 + if (resctrl_debug) 3970 + seq_puts(seq, ",debug"); 3971 + 3972 + return 0; 3973 + } 3974 + 3975 + static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { 3976 + .mkdir = rdtgroup_mkdir, 3977 + .rmdir = rdtgroup_rmdir, 3978 + .rename = rdtgroup_rename, 3979 + .show_options = rdtgroup_show_options, 3980 + }; 3981 + 3982 + static int rdtgroup_setup_root(struct rdt_fs_context *ctx) 3983 + { 3984 + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, 3985 + 
KERNFS_ROOT_CREATE_DEACTIVATED | 3986 + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, 3987 + &rdtgroup_default); 3988 + if (IS_ERR(rdt_root)) 3989 + return PTR_ERR(rdt_root); 3990 + 3991 + ctx->kfc.root = rdt_root; 3992 + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); 3993 + 3994 + return 0; 3995 + } 3996 + 3997 + static void rdtgroup_destroy_root(void) 3998 + { 3999 + lockdep_assert_held(&rdtgroup_mutex); 4000 + 4001 + kernfs_destroy_root(rdt_root); 4002 + rdtgroup_default.kn = NULL; 4003 + } 4004 + 4005 + static void rdtgroup_setup_default(void) 4006 + { 4007 + mutex_lock(&rdtgroup_mutex); 4008 + 4009 + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; 4010 + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; 4011 + rdtgroup_default.type = RDTCTRL_GROUP; 4012 + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); 4013 + 4014 + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 4015 + 4016 + mutex_unlock(&rdtgroup_mutex); 4017 + } 4018 + 4019 + static void domain_destroy_mon_state(struct rdt_mon_domain *d) 4020 + { 4021 + bitmap_free(d->rmid_busy_llc); 4022 + kfree(d->mbm_total); 4023 + kfree(d->mbm_local); 4024 + } 4025 + 4026 + void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 4027 + { 4028 + mutex_lock(&rdtgroup_mutex); 4029 + 4030 + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) 4031 + mba_sc_domain_destroy(r, d); 4032 + 4033 + mutex_unlock(&rdtgroup_mutex); 4034 + } 4035 + 4036 + void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 4037 + { 4038 + mutex_lock(&rdtgroup_mutex); 4039 + 4040 + /* 4041 + * If resctrl is mounted, remove all the 4042 + * per domain monitor data directories. 
4043 + */ 4044 + if (resctrl_mounted && resctrl_arch_mon_capable()) 4045 + rmdir_mondata_subdir_allrdtgrp(r, d); 4046 + 4047 + if (resctrl_is_mbm_enabled()) 4048 + cancel_delayed_work(&d->mbm_over); 4049 + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { 4050 + /* 4051 + * When a package is going down, forcefully 4052 + * decrement rmid->ebusy. There is no way to know 4053 + * that the L3 was flushed and hence may lead to 4054 + * incorrect counts in rare scenarios, but leaving 4055 + * the RMID as busy creates RMID leaks if the 4056 + * package never comes back. 4057 + */ 4058 + __check_limbo(d, true); 4059 + cancel_delayed_work(&d->cqm_limbo); 4060 + } 4061 + 4062 + domain_destroy_mon_state(d); 4063 + 4064 + mutex_unlock(&rdtgroup_mutex); 4065 + } 4066 + 4067 + /** 4068 + * domain_setup_mon_state() - Initialise domain monitoring structures. 4069 + * @r: The resource for the newly online domain. 4070 + * @d: The newly online domain. 4071 + * 4072 + * Allocate monitor resources that belong to this domain. 4073 + * Called when the first CPU of a domain comes online, regardless of whether 4074 + * the filesystem is mounted. 4075 + * During boot this may be called before global allocations have been made by 4076 + * resctrl_mon_resource_init(). 4077 + * 4078 + * Returns 0 for success, or -ENOMEM. 
4079 + */ 4080 + static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) 4081 + { 4082 + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 4083 + size_t tsize; 4084 + 4085 + if (resctrl_arch_is_llc_occupancy_enabled()) { 4086 + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); 4087 + if (!d->rmid_busy_llc) 4088 + return -ENOMEM; 4089 + } 4090 + if (resctrl_arch_is_mbm_total_enabled()) { 4091 + tsize = sizeof(*d->mbm_total); 4092 + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); 4093 + if (!d->mbm_total) { 4094 + bitmap_free(d->rmid_busy_llc); 4095 + return -ENOMEM; 4096 + } 4097 + } 4098 + if (resctrl_arch_is_mbm_local_enabled()) { 4099 + tsize = sizeof(*d->mbm_local); 4100 + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); 4101 + if (!d->mbm_local) { 4102 + bitmap_free(d->rmid_busy_llc); 4103 + kfree(d->mbm_total); 4104 + return -ENOMEM; 4105 + } 4106 + } 4107 + 4108 + return 0; 4109 + } 4110 + 4111 + int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) 4112 + { 4113 + int err = 0; 4114 + 4115 + mutex_lock(&rdtgroup_mutex); 4116 + 4117 + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { 4118 + /* RDT_RESOURCE_MBA is never mon_capable */ 4119 + err = mba_sc_domain_allocate(r, d); 4120 + } 4121 + 4122 + mutex_unlock(&rdtgroup_mutex); 4123 + 4124 + return err; 4125 + } 4126 + 4127 + int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) 4128 + { 4129 + int err; 4130 + 4131 + mutex_lock(&rdtgroup_mutex); 4132 + 4133 + err = domain_setup_mon_state(r, d); 4134 + if (err) 4135 + goto out_unlock; 4136 + 4137 + if (resctrl_is_mbm_enabled()) { 4138 + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); 4139 + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, 4140 + RESCTRL_PICK_ANY_CPU); 4141 + } 4142 + 4143 + if (resctrl_arch_is_llc_occupancy_enabled()) 4144 + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); 4145 + 4146 + /* 4147 + * If the filesystem is not 
mounted then only the default resource group 4148 + * exists. Creation of its directories is deferred until mount time 4149 + * by rdt_get_tree() calling mkdir_mondata_all(). 4150 + * If resctrl is mounted, add per domain monitor data directories. 4151 + */ 4152 + if (resctrl_mounted && resctrl_arch_mon_capable()) 4153 + mkdir_mondata_subdir_allrdtgrp(r, d); 4154 + 4155 + out_unlock: 4156 + mutex_unlock(&rdtgroup_mutex); 4157 + 4158 + return err; 4159 + } 4160 + 4161 + void resctrl_online_cpu(unsigned int cpu) 4162 + { 4163 + mutex_lock(&rdtgroup_mutex); 4164 + /* The CPU is set in default rdtgroup after online. */ 4165 + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 4166 + mutex_unlock(&rdtgroup_mutex); 4167 + } 4168 + 4169 + static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) 4170 + { 4171 + struct rdtgroup *cr; 4172 + 4173 + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { 4174 + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) 4175 + break; 4176 + } 4177 + } 4178 + 4179 + static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, 4180 + struct rdt_resource *r) 4181 + { 4182 + struct rdt_mon_domain *d; 4183 + 4184 + lockdep_assert_cpus_held(); 4185 + 4186 + list_for_each_entry(d, &r->mon_domains, hdr.list) { 4187 + /* Find the domain that contains this CPU */ 4188 + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) 4189 + return d; 4190 + } 4191 + 4192 + return NULL; 4193 + } 4194 + 4195 + void resctrl_offline_cpu(unsigned int cpu) 4196 + { 4197 + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); 4198 + struct rdt_mon_domain *d; 4199 + struct rdtgroup *rdtgrp; 4200 + 4201 + mutex_lock(&rdtgroup_mutex); 4202 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 4203 + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { 4204 + clear_childcpus(rdtgrp, cpu); 4205 + break; 4206 + } 4207 + } 4208 + 4209 + if (!l3->mon_capable) 4210 + goto out_unlock; 4211 + 4212 + d = get_mon_domain_from_cpu(cpu, 
l3); 4213 + if (d) { 4214 + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { 4215 + cancel_delayed_work(&d->mbm_over); 4216 + mbm_setup_overflow_handler(d, 0, cpu); 4217 + } 4218 + if (resctrl_arch_is_llc_occupancy_enabled() && 4219 + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { 4220 + cancel_delayed_work(&d->cqm_limbo); 4221 + cqm_setup_limbo_handler(d, 0, cpu); 4222 + } 4223 + } 4224 + 4225 + out_unlock: 4226 + mutex_unlock(&rdtgroup_mutex); 4227 + } 4228 + 4229 + /* 4230 + * resctrl_init - resctrl filesystem initialization 4231 + * 4232 + * Setup resctrl file system including set up root, create mount point, 4233 + * register resctrl filesystem, and initialize files under root directory. 4234 + * 4235 + * Return: 0 on success or -errno 4236 + */ 4237 + int resctrl_init(void) 4238 + { 4239 + int ret = 0; 4240 + 4241 + seq_buf_init(&last_cmd_status, last_cmd_status_buf, 4242 + sizeof(last_cmd_status_buf)); 4243 + 4244 + rdtgroup_setup_default(); 4245 + 4246 + thread_throttle_mode_init(); 4247 + 4248 + ret = resctrl_mon_resource_init(); 4249 + if (ret) 4250 + return ret; 4251 + 4252 + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); 4253 + if (ret) { 4254 + resctrl_mon_resource_exit(); 4255 + return ret; 4256 + } 4257 + 4258 + ret = register_filesystem(&rdt_fs_type); 4259 + if (ret) 4260 + goto cleanup_mountpoint; 4261 + 4262 + /* 4263 + * Adding the resctrl debugfs directory here may not be ideal since 4264 + * it would let the resctrl debugfs directory appear on the debugfs 4265 + * filesystem before the resctrl filesystem is mounted. 4266 + * It may also be ok since that would enable debugging of RDT before 4267 + * resctrl is mounted. 4268 + * The reason why the debugfs directory is created here and not in 4269 + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and 4270 + * during the debugfs directory creation also &sb->s_type->i_mutex_key 4271 + * (the lockdep class of inode->i_rwsem). Other filesystem 4272 + * interactions (eg. 
SyS_getdents) have the lock ordering: 4273 + * &sb->s_type->i_mutex_key --> &mm->mmap_lock 4274 + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex 4275 + * is taken, thus creating dependency: 4276 + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause 4277 + * issues considering the other two lock dependencies. 4278 + * By creating the debugfs directory here we avoid a dependency 4279 + * that may cause deadlock (even though file operations cannot 4280 + * occur until the filesystem is mounted, but I do not know how to 4281 + * tell lockdep that). 4282 + */ 4283 + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); 4284 + 4285 + return 0; 4286 + 4287 + cleanup_mountpoint: 4288 + sysfs_remove_mount_point(fs_kobj, "resctrl"); 4289 + resctrl_mon_resource_exit(); 4290 + 4291 + return ret; 4292 + } 4293 + 4294 + static bool resctrl_online_domains_exist(void) 4295 + { 4296 + struct rdt_resource *r; 4297 + 4298 + /* 4299 + * Only walk capable resources to allow resctrl_arch_get_resource() 4300 + * to return dummy 'not capable' resources. 4301 + */ 4302 + for_each_alloc_capable_rdt_resource(r) { 4303 + if (!list_empty(&r->ctrl_domains)) 4304 + return true; 4305 + } 4306 + 4307 + for_each_mon_capable_rdt_resource(r) { 4308 + if (!list_empty(&r->mon_domains)) 4309 + return true; 4310 + } 4311 + 4312 + return false; 4313 + } 4314 + 4315 + /** 4316 + * resctrl_exit() - Remove the resctrl filesystem and free resources. 4317 + * 4318 + * Called by the architecture code in response to a fatal error. 4319 + * Removes resctrl files and structures from kernfs to prevent further 4320 + * configuration. 4321 + * 4322 + * When called by the architecture code, all CPUs and resctrl domains must be 4323 + * offline. This ensures the limbo and overflow handlers are not scheduled to 4324 + * run, meaning the data structures they access can be freed by 4325 + * resctrl_mon_resource_exit(). 
4326 + * 4327 + * After resctrl_exit() returns, the architecture code should return an 4328 + * error from all resctrl_arch_ functions that can do this. 4329 + * resctrl_arch_get_resource() must continue to return struct rdt_resources 4330 + * with the correct rid field to ensure the filesystem can be unmounted. 4331 + */ 4332 + void resctrl_exit(void) 4333 + { 4334 + cpus_read_lock(); 4335 + WARN_ON_ONCE(resctrl_online_domains_exist()); 4336 + 4337 + mutex_lock(&rdtgroup_mutex); 4338 + resctrl_fs_teardown(); 4339 + mutex_unlock(&rdtgroup_mutex); 4340 + 4341 + cpus_read_unlock(); 4342 + 4343 + debugfs_remove_recursive(debugfs_resctrl); 4344 + debugfs_resctrl = NULL; 4345 + unregister_filesystem(&rdt_fs_type); 4346 + 4347 + /* 4348 + * Do not remove the sysfs mount point added by resctrl_init() so that 4349 + * it can be used to umount resctrl. 4350 + */ 4351 + 4352 + resctrl_mon_resource_exit(); 4353 + }
+71 -4
include/linux/cpumask.h
··· 179 179 } 180 180 181 181 /** 182 + * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 183 + * @srcp1: the first input 184 + * @srcp2: the second input 185 + * 186 + * Return: >= nr_cpu_ids if no such cpu found. 187 + */ 188 + static __always_inline 189 + unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) 190 + { 191 + return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); 192 + } 193 + 194 + /** 182 195 * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 183 196 * @srcp1: the first input 184 197 * @srcp2: the second input ··· 294 281 if (n != -1) 295 282 cpumask_check(n); 296 283 return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), 284 + small_cpumask_bits, n + 1); 285 + } 286 + 287 + /** 288 + * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p 289 + * @n: the cpu prior to the place to search (i.e. return will be > @n) 290 + * @src1p: the first cpumask pointer 291 + * @src2p: the second cpumask pointer 292 + * 293 + * Return: >= nr_cpu_ids if no further cpus set in *src1p & ~*src2p. 294 + */ 295 + static __always_inline 296 + unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, 297 + const struct cpumask *src2p) 298 + { 299 + /* -1 is a legal arg here. */ 300 + if (n != -1) 301 + cpumask_check(n); 302 + return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), 297 303 small_cpumask_bits, n + 1); 298 304 } 299 305 ··· 445 413 * @cpu: the cpu to ignore. 446 414 * 447 415 * Often used to find any cpu but smp_processor_id() in a mask. 416 + * If @cpu == -1, the function is equivalent to cpumask_any(). 448 417 * Return: >= nr_cpu_ids if no cpus set. 
449 418 */ 450 419 static __always_inline 451 - unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) 420 + unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) 452 421 { 453 422 unsigned int i; 454 423 455 - cpumask_check(cpu); 424 + /* -1 is a legal arg here. */ 425 + if (cpu != -1) 426 + cpumask_check(cpu); 427 + 456 428 for_each_cpu(i, mask) 457 429 if (i != cpu) 458 430 break; ··· 469 433 * @mask2: the second input cpumask 470 434 * @cpu: the cpu to ignore 471 435 * 436 + * If @cpu == -1, the function is equivalent to cpumask_any_and(). 472 437 * Returns >= nr_cpu_ids if no cpus set. 473 438 */ 474 439 static __always_inline 475 440 unsigned int cpumask_any_and_but(const struct cpumask *mask1, 476 441 const struct cpumask *mask2, 477 - unsigned int cpu) 442 + int cpu) 478 443 { 479 444 unsigned int i; 480 445 481 - cpumask_check(cpu); 446 + /* -1 is a legal arg here. */ 447 + if (cpu != -1) 448 + cpumask_check(cpu); 449 + 482 450 i = cpumask_first_and(mask1, mask2); 483 451 if (i != cpu) 484 452 return i; 485 453 486 454 return cpumask_next_and(cpu, mask1, mask2); 455 + } 456 + 457 + /** 458 + * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. 459 + * @mask1: the first input cpumask 460 + * @mask2: the second input cpumask 461 + * @cpu: the cpu to ignore 462 + * 463 + * If @cpu == -1, the function returns the first matching cpu. 464 + * Returns >= nr_cpu_ids if no cpus set. 465 + */ 466 + static __always_inline 467 + unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, 468 + const struct cpumask *mask2, 469 + int cpu) 470 + { 471 + unsigned int i; 472 + 473 + /* -1 is a legal arg here. */ 474 + if (cpu != -1) 475 + cpumask_check(cpu); 476 + 477 + i = cpumask_first_andnot(mask1, mask2); 478 + if (i != cpu) 479 + return i; 480 + 481 + return cpumask_next_andnot(cpu, mask1, mask2); 487 482 } 488 483 489 484 /**
+25
include/linux/find.h
··· 29 29 unsigned long n); 30 30 extern unsigned long _find_first_and_bit(const unsigned long *addr1, 31 31 const unsigned long *addr2, unsigned long size); 32 + unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, 33 + unsigned long size); 32 34 unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, 33 35 const unsigned long *addr3, unsigned long size); 34 36 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); ··· 348 346 return _find_first_and_bit(addr1, addr2, size); 349 347 } 350 348 #endif 349 + 350 + /** 351 + * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd 352 + * @addr1: The first address to base the search on 353 + * @addr2: The second address to base the search on 354 + * @size: The bitmap size in bits 355 + * 356 + * Returns the bit number of the first bit set in @addr1 and unset in @addr2. 357 + * If no such bit is found, returns >= @size. 358 + */ 359 + static __always_inline 360 + unsigned long find_first_andnot_bit(const unsigned long *addr1, 361 + const unsigned long *addr2, 362 + unsigned long size) 363 + { 364 + if (small_const_nbits(size)) { 365 + unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); 366 + 367 + return val ? __ffs(val) : size; 368 + } 369 + 370 + return _find_first_andnot_bit(addr1, addr2, size); 371 + } 351 372 352 373 /** 353 374 * find_first_and_and_bit - find the first set bit in 3 memory regions
+33 -3
include/linux/resctrl.h
··· 8 8 #include <linux/pid.h> 9 9 #include <linux/resctrl_types.h> 10 10 11 + #ifdef CONFIG_ARCH_HAS_CPU_RESCTRL 12 + #include <asm/resctrl.h> 13 + #endif 14 + 11 15 /* CLOSID, RMID value used by the default control group */ 12 16 #define RESCTRL_RESERVED_CLOSID 0 13 17 #define RESCTRL_RESERVED_RMID 0 ··· 47 43 #define for_each_mon_capable_rdt_resource(r) \ 48 44 for_each_rdt_resource((r)) \ 49 45 if ((r)->mon_capable) 46 + 47 + enum resctrl_res_level { 48 + RDT_RESOURCE_L3, 49 + RDT_RESOURCE_L2, 50 + RDT_RESOURCE_MBA, 51 + RDT_RESOURCE_SMBA, 52 + 53 + /* Must be the last */ 54 + RDT_NUM_RESOURCES, 55 + }; 50 56 51 57 /** 52 58 * enum resctrl_conf_type - The type of configuration. ··· 372 358 u32 resctrl_arch_system_num_rmid_idx(void); 373 359 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); 374 360 375 - __init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); 361 + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); 376 362 377 363 /** 378 364 * resctrl_arch_mon_event_config_write() - Write the config for an event. ··· 412 398 return closid * 2; 413 399 } 414 400 } 401 + 402 + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); 403 + int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); 415 404 416 405 /* 417 406 * Update the ctrl_val and apply this config right now. 
··· 531 514 extern unsigned int resctrl_rmid_realloc_threshold; 532 515 extern unsigned int resctrl_rmid_realloc_limit; 533 516 534 - int __init resctrl_init(void); 535 - void __exit resctrl_exit(void); 517 + int resctrl_init(void); 518 + void resctrl_exit(void); 536 519 520 + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK 521 + u64 resctrl_arch_get_prefetch_disable_bits(void); 522 + int resctrl_arch_pseudo_lock_fn(void *_plr); 523 + int resctrl_arch_measure_cycles_lat_fn(void *_plr); 524 + int resctrl_arch_measure_l2_residency(void *_plr); 525 + int resctrl_arch_measure_l3_residency(void *_plr); 526 + #else 527 + static inline u64 resctrl_arch_get_prefetch_disable_bits(void) { return 0; } 528 + static inline int resctrl_arch_pseudo_lock_fn(void *_plr) { return 0; } 529 + static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } 530 + static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } 531 + static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } 532 + #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ 537 533 #endif /* _RESCTRL_H */
+6 -10
include/linux/resctrl_types.h
··· 7 7 #ifndef __LINUX_RESCTRL_TYPES_H 8 8 #define __LINUX_RESCTRL_TYPES_H 9 9 10 + #define MAX_MBA_BW 100u 11 + #define MBM_OVERFLOW_INTERVAL 1000 12 + 10 13 /* Reads to Local DRAM Memory */ 11 14 #define READS_TO_LOCAL_MEM BIT(0) 12 15 ··· 34 31 /* Max event bits supported */ 35 32 #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) 36 33 37 - enum resctrl_res_level { 38 - RDT_RESOURCE_L3, 39 - RDT_RESOURCE_L2, 40 - RDT_RESOURCE_MBA, 41 - RDT_RESOURCE_SMBA, 42 - 43 - /* Must be the last */ 44 - RDT_NUM_RESOURCES, 45 - }; 46 - 47 34 /* 48 35 * Event IDs, the values match those used to program IA32_QM_EVTSEL before 49 36 * reading IA32_QM_CTR on RDT systems. ··· 42 49 QOS_L3_OCCUP_EVENT_ID = 0x01, 43 50 QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, 44 51 QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, 52 + 53 + /* Must be the last */ 54 + QOS_NUM_EVENTS, 45 55 }; 46 56 47 57 #endif /* __LINUX_RESCTRL_TYPES_H */
+11
lib/find_bit.c
··· 117 117 #endif 118 118 119 119 /* 120 + * Find the first bit set in 1st memory region and unset in 2nd. 121 + */ 122 + unsigned long _find_first_andnot_bit(const unsigned long *addr1, 123 + const unsigned long *addr2, 124 + unsigned long size) 125 + { 126 + return FIND_FIRST_BIT(addr1[idx] & ~addr2[idx], /* nop */, size); 127 + } 128 + EXPORT_SYMBOL(_find_first_andnot_bit); 129 + 130 + /* 120 131 * Find the first set bit in three memory regions. 121 132 */ 122 133 unsigned long _find_first_and_and_bit(const unsigned long *addr1,