Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

s390/sclp: Add support for dynamic (de)configuration of memory

Provide a new interface for dynamic configuration and deconfiguration of
hotplug memory, with or without memmap_on_memory support. It is a
follow-up to the discussion with David when introducing memmap_on_memory
support for s390, and adds support for dynamic (de)configuration of memory:
https://lore.kernel.org/all/ee492da8-74b4-4a97-8b24-73e07257f01d@redhat.com/
https://lore.kernel.org/all/20241202082732.3959803-1-sumanthk@linux.ibm.com/

The original motivation for introducing memmap_on_memory on s390 was to
avoid using online memory to store struct pages metadata, particularly
for standby memory blocks. This became critical in cases where there was
an imbalance between standby and online memory, potentially leading to
boot failures due to insufficient memory for metadata allocation.

To address this, memmap_on_memory was utilized on s390. However, in its
current form, it adds struct pages metadata at the start of each memory
block at the time of addition and this configuration is static. It
cannot be changed at runtime (e.g. when the user needs contiguous
physical memory).

In order to provide more flexibility to the user and overcome the above
limitation, add an option to dynamically configure and deconfigure
hotpluggable memory blocks with/without memmap_on_memory.

With the new interface, s390 will not add all possible hotplug memory in
advance, like before, to make it visible in sysfs for online/offline
actions. Instead, before a memory block can be set online, it has to be
configured via a new interface in /sys/firmware/memory/memoryX/config,
which makes s390 similar to others, i.e. adding of hotpluggable memory is
controlled by the user instead of adding it at boot time.

The s390 kernel sysfs interface to configure and deconfigure memory is
as follows (considering the upcoming lsmem changes):

* Initial memory layout:
lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x7fffffff 2G online 0-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

* Configure memory
sys="/sys"
echo 1 > $sys/firmware/memory/memory16/config
lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x7fffffff 2G online 0-15 yes no
0x80000000-0x87ffffff 128M offline 16 yes yes
0x88000000-0xffffffff 1.9G offline 17-31 no yes

* Deconfigure memory
echo 0 > $sys/firmware/memory/memory16/config
lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x7fffffff 2G online 0-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

* Enable memmap_on_memory and online it.
echo 0 > $sys/devices/system/memory/memory5/online
echo 0 > $sys/firmware/memory/memory5/config

lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x27ffffff 640M online 0-4 yes no
0x28000000-0x2fffffff 128M offline 5 no no
0x30000000-0x7fffffff 1.3G online 6-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

echo 1 > $sys/firmware/memory/memory5/memmap_on_memory
echo 1 > $sys/firmware/memory/memory5/config
echo 1 > $sys/devices/system/memory/memory5/online

lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x27ffffff 640M online 0-4 yes no
0x28000000-0x2fffffff 128M online 5 yes yes
0x30000000-0x7fffffff 1.3G online 6-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

* Disable memmap_on_memory and online it.
echo 0 > $sys/devices/system/memory/memory5/online
echo 0 > $sys/firmware/memory/memory5/config

lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x27ffffff 640M online 0-4 yes no
0x28000000-0x2fffffff 128M offline 5 no yes
0x30000000-0x7fffffff 1.3G online 6-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

echo 0 > $sys/firmware/memory/memory5/memmap_on_memory
echo 1 > $sys/firmware/memory/memory5/config
echo 1 > $sys/devices/system/memory/memory5/online

lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED,MEMMAP_ON_MEMORY
RANGE SIZE STATE BLOCK CONFIGURED MEMMAP_ON_MEMORY
0x00000000-0x7fffffff 2G online 0-15 yes no
0x80000000-0xffffffff 2G offline 16-31 no yes

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>

authored by

Sumanth Korikkar and committed by
Heiko Carstens
ff18dcb1 d5e88d32

+204 -83
+204 -83
drivers/s390/char/sclp_mem.c
··· 9 9 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 10 10 11 11 #include <linux/cpufeature.h> 12 + #include <linux/container_of.h> 12 13 #include <linux/err.h> 13 14 #include <linux/errno.h> 14 15 #include <linux/init.h> 16 + #include <linux/kobject.h> 17 + #include <linux/kstrtox.h> 15 18 #include <linux/memory.h> 16 19 #include <linux/memory_hotplug.h> 17 20 #include <linux/mm.h> ··· 30 27 #define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 31 28 #define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 32 29 33 - static DEFINE_MUTEX(sclp_mem_mutex); 34 30 static LIST_HEAD(sclp_mem_list); 35 31 static u8 sclp_max_storage_id; 36 32 static DECLARE_BITMAP(sclp_storage_ids, 256); ··· 38 36 struct list_head list; 39 37 u16 rn; 40 38 int standby; 39 + }; 40 + 41 + struct sclp_mem { 42 + struct kobject kobj; 43 + unsigned int id; 44 + unsigned int memmap_on_memory; 45 + unsigned int config; 46 + }; 47 + 48 + struct sclp_mem_arg { 49 + struct sclp_mem *sclp_mems; 50 + struct kset *kset; 41 51 }; 42 52 43 53 struct assign_storage_sccb { ··· 177 163 return rc ? 
-EIO : 0; 178 164 } 179 165 180 - static bool contains_standby_increment(unsigned long start, unsigned long end) 166 + static ssize_t sclp_config_mem_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 181 167 { 182 - struct memory_increment *incr; 183 - unsigned long istart; 168 + struct sclp_mem *sclp_mem = container_of(kobj, struct sclp_mem, kobj); 184 169 185 - list_for_each_entry(incr, &sclp_mem_list, list) { 186 - istart = rn2addr(incr->rn); 187 - if (end - 1 < istart) 188 - continue; 189 - if (start > istart + sclp.rzm - 1) 190 - continue; 191 - if (incr->standby) 192 - return true; 193 - } 194 - return false; 170 + return sysfs_emit(buf, "%u\n", READ_ONCE(sclp_mem->config)); 195 171 } 196 172 197 - static int sclp_mem_notifier(struct notifier_block *nb, 198 - unsigned long action, void *data) 173 + static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute *attr, 174 + const char *buf, size_t count) 199 175 { 200 - unsigned long start, size; 201 - struct memory_notify *arg; 176 + unsigned long addr, block_size; 177 + struct sclp_mem *sclp_mem; 178 + struct memory_block *mem; 202 179 unsigned char id; 203 - int rc = 0; 180 + bool value; 181 + int rc; 204 182 205 - arg = data; 206 - start = arg->start_pfn << PAGE_SHIFT; 207 - size = arg->nr_pages << PAGE_SHIFT; 208 - mutex_lock(&sclp_mem_mutex); 183 + rc = kstrtobool(buf, &value); 184 + if (rc) 185 + return rc; 186 + sclp_mem = container_of(kobj, struct sclp_mem, kobj); 187 + block_size = memory_block_size_bytes(); 188 + addr = sclp_mem->id * block_size; 189 + /* 190 + * Hold device_hotplug_lock when adding/removing memory blocks. 191 + * Additionally, also protect calls to find_memory_block() and 192 + * sclp_attach_storage(). 
193 + */ 194 + rc = lock_device_hotplug_sysfs(); 195 + if (rc) 196 + goto out; 209 197 for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) 210 198 sclp_attach_storage(id); 211 - switch (action) { 212 - case MEM_GOING_OFFLINE: 199 + if (value) { 200 + if (sclp_mem->config) 201 + goto out_unlock; 202 + rc = sclp_mem_change_state(addr, block_size, 1); 203 + if (rc) 204 + goto out_unlock; 213 205 /* 214 - * Do not allow to set memory blocks offline that contain 215 - * standby memory. This is done to simplify the "memory online" 216 - * case. 206 + * Set entire memory block CMMA state to nodat. Later, when 207 + * page tables pages are allocated via __add_memory(), those 208 + * regions are marked __arch_set_page_dat(). 217 209 */ 218 - if (contains_standby_increment(start, start + size)) 219 - rc = -EPERM; 220 - break; 221 - case MEM_PREPARE_ONLINE: 222 - /* 223 - * Access the altmap_start_pfn and altmap_nr_pages fields 224 - * within the struct memory_notify specifically when dealing 225 - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. 226 - * 227 - * When altmap is in use, take the specified memory range 228 - * online, which includes the altmap. 229 - */ 230 - if (arg->altmap_nr_pages) { 231 - start = PFN_PHYS(arg->altmap_start_pfn); 232 - size += PFN_PHYS(arg->altmap_nr_pages); 210 + __arch_set_page_nodat((void *)__va(addr), block_size >> PAGE_SHIFT); 211 + rc = __add_memory(0, addr, block_size, 212 + sclp_mem->memmap_on_memory ? 213 + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); 214 + if (rc) { 215 + sclp_mem_change_state(addr, block_size, 0); 216 + goto out_unlock; 233 217 } 234 - rc = sclp_mem_change_state(start, size, 1); 235 - if (rc || !arg->altmap_nr_pages) 236 - break; 237 - /* 238 - * Set CMMA state to nodat here, since the struct page memory 239 - * at the beginning of the memory block will not go through the 240 - * buddy allocator later. 
241 - */ 242 - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); 243 - break; 244 - case MEM_FINISH_OFFLINE: 245 - /* 246 - * When altmap is in use, take the specified memory range 247 - * offline, which includes the altmap. 248 - */ 249 - if (arg->altmap_nr_pages) { 250 - start = PFN_PHYS(arg->altmap_start_pfn); 251 - size += PFN_PHYS(arg->altmap_nr_pages); 218 + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); 219 + put_device(&mem->dev); 220 + WRITE_ONCE(sclp_mem->config, 1); 221 + } else { 222 + if (!sclp_mem->config) 223 + goto out_unlock; 224 + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr))); 225 + if (mem->state != MEM_OFFLINE) { 226 + put_device(&mem->dev); 227 + rc = -EBUSY; 228 + goto out_unlock; 252 229 } 253 - sclp_mem_change_state(start, size, 0); 254 - break; 255 - default: 256 - break; 230 + /* drop the ref just got via find_memory_block() */ 231 + put_device(&mem->dev); 232 + sclp_mem_change_state(addr, block_size, 0); 233 + __remove_memory(addr, block_size); 234 + WRITE_ONCE(sclp_mem->config, 0); 257 235 } 258 - mutex_unlock(&sclp_mem_mutex); 259 - return rc ? NOTIFY_BAD : NOTIFY_OK; 236 + out_unlock: 237 + unlock_device_hotplug(); 238 + out: 239 + return rc ? 
rc : count; 260 240 } 261 241 262 - static struct notifier_block sclp_mem_nb = { 263 - .notifier_call = sclp_mem_notifier, 242 + static struct kobj_attribute sclp_config_mem_attr = 243 + __ATTR(config, 0644, sclp_config_mem_show, sclp_config_mem_store); 244 + 245 + static ssize_t sclp_memmap_on_memory_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 246 + { 247 + struct sclp_mem *sclp_mem = container_of(kobj, struct sclp_mem, kobj); 248 + 249 + return sysfs_emit(buf, "%u\n", READ_ONCE(sclp_mem->memmap_on_memory)); 250 + } 251 + 252 + static ssize_t sclp_memmap_on_memory_store(struct kobject *kobj, struct kobj_attribute *attr, 253 + const char *buf, size_t count) 254 + { 255 + struct sclp_mem *sclp_mem; 256 + unsigned long block_size; 257 + struct memory_block *mem; 258 + bool value; 259 + int rc; 260 + 261 + rc = kstrtobool(buf, &value); 262 + if (rc) 263 + return rc; 264 + rc = lock_device_hotplug_sysfs(); 265 + if (rc) 266 + return rc; 267 + block_size = memory_block_size_bytes(); 268 + sclp_mem = container_of(kobj, struct sclp_mem, kobj); 269 + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(sclp_mem->id * block_size))); 270 + if (!mem) { 271 + WRITE_ONCE(sclp_mem->memmap_on_memory, value); 272 + } else { 273 + put_device(&mem->dev); 274 + rc = -EBUSY; 275 + } 276 + unlock_device_hotplug(); 277 + return rc ? 
rc : count; 278 + } 279 + 280 + static const struct kobj_type ktype = { 281 + .sysfs_ops = &kobj_sysfs_ops, 264 282 }; 283 + 284 + static struct kobj_attribute sclp_memmap_attr = 285 + __ATTR(memmap_on_memory, 0644, sclp_memmap_on_memory_show, sclp_memmap_on_memory_store); 286 + 287 + static struct attribute *sclp_mem_attrs[] = { 288 + &sclp_config_mem_attr.attr, 289 + &sclp_memmap_attr.attr, 290 + NULL, 291 + }; 292 + 293 + static struct attribute_group sclp_mem_attr_group = { 294 + .attrs = sclp_mem_attrs, 295 + }; 296 + 297 + static int sclp_create_mem(struct sclp_mem *sclp_mem, struct kset *kset, 298 + unsigned int id, bool config, bool memmap_on_memory) 299 + { 300 + int rc; 301 + 302 + sclp_mem->memmap_on_memory = memmap_on_memory; 303 + sclp_mem->config = config; 304 + sclp_mem->id = id; 305 + kobject_init(&sclp_mem->kobj, &ktype); 306 + rc = kobject_add(&sclp_mem->kobj, &kset->kobj, "memory%d", id); 307 + if (rc) 308 + return rc; 309 + return sysfs_create_group(&sclp_mem->kobj, &sclp_mem_attr_group); 310 + } 311 + 312 + static int sclp_create_configured_mem(struct memory_block *mem, void *argument) 313 + { 314 + struct sclp_mem *sclp_mems; 315 + struct sclp_mem_arg *arg; 316 + struct kset *kset; 317 + unsigned int id; 318 + 319 + id = mem->dev.id; 320 + arg = (struct sclp_mem_arg *)argument; 321 + sclp_mems = arg->sclp_mems; 322 + kset = arg->kset; 323 + return sclp_create_mem(&sclp_mems[id], kset, id, true, false); 324 + } 265 325 266 326 static void __init align_to_block_size(unsigned long *start, 267 327 unsigned long *size, ··· 352 264 *size = size_align; 353 265 } 354 266 355 - static void __init add_memory_merged(u16 rn) 267 + static int __init sclp_create_standby_mems_merged(struct sclp_mem *sclp_mems, 268 + struct kset *kset, u16 rn) 356 269 { 357 270 unsigned long start, size, addr, block_size; 358 271 static u16 first_rn, num; 272 + unsigned int id; 273 + int rc = 0; 359 274 360 275 if (rn && first_rn && (first_rn + num == rn)) { 361 276 num++; 
362 - return; 277 + return rc; 363 278 } 364 279 if (!first_rn) 365 280 goto skip_add; ··· 377 286 if (!size) 378 287 goto skip_add; 379 288 for (addr = start; addr < start + size; addr += block_size) { 380 - add_memory(0, addr, block_size, 381 - cpu_has_edat1() ? 382 - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); 289 + id = addr / block_size; 290 + rc = sclp_create_mem(&sclp_mems[id], kset, id, false, 291 + mhp_supports_memmap_on_memory()); 292 + if (rc) 293 + break; 383 294 } 384 295 skip_add: 385 296 first_rn = rn; 386 297 num = 1; 298 + return rc; 387 299 } 388 300 389 - static void __init sclp_add_standby_memory(void) 301 + static int __init sclp_create_standby_mems(struct sclp_mem *sclp_mems, struct kset *kset) 390 302 { 391 303 struct memory_increment *incr; 304 + int rc = 0; 392 305 393 306 list_for_each_entry(incr, &sclp_mem_list, list) { 394 307 if (incr->standby) 395 - add_memory_merged(incr->rn); 308 + rc = sclp_create_standby_mems_merged(sclp_mems, kset, incr->rn); 309 + if (rc) 310 + return rc; 396 311 } 397 - add_memory_merged(0); 312 + return sclp_create_standby_mems_merged(sclp_mems, kset, 0); 313 + } 314 + 315 + static int __init sclp_init_mem(void) 316 + { 317 + const unsigned long block_size = memory_block_size_bytes(); 318 + unsigned int max_sclp_mems; 319 + struct sclp_mem *sclp_mems; 320 + struct sclp_mem_arg arg; 321 + struct kset *kset; 322 + int rc; 323 + 324 + max_sclp_mems = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size; 325 + /* Allocate memory for all blocks ahead of time. */ 326 + sclp_mems = kcalloc(max_sclp_mems, sizeof(struct sclp_mem), GFP_KERNEL); 327 + if (!sclp_mems) 328 + return -ENOMEM; 329 + kset = kset_create_and_add("memory", NULL, firmware_kobj); 330 + if (!kset) 331 + return -ENOMEM; 332 + /* Initial memory is in the "configured" state already. 
*/ 333 + arg.sclp_mems = sclp_mems; 334 + arg.kset = kset; 335 + rc = for_each_memory_block(&arg, sclp_create_configured_mem); 336 + if (rc) 337 + return rc; 338 + /* Standby memory is "deconfigured". */ 339 + return sclp_create_standby_mems(sclp_mems, kset); 398 340 } 399 341 400 342 static void __init insert_increment(u16 rn, int standby, int assigned) ··· 460 336 list_add(&new_incr->list, prev); 461 337 } 462 338 463 - static int __init sclp_detect_standby_memory(void) 339 + static int __init sclp_setup_memory(void) 464 340 { 465 341 struct read_storage_sccb *sccb; 466 342 int i, id, assigned, rc; ··· 512 388 goto out; 513 389 for (i = 1; i <= sclp.rnmax - assigned; i++) 514 390 insert_increment(0, 1, 0); 515 - rc = register_memory_notifier(&sclp_mem_nb); 516 - if (rc) 517 - goto out; 518 - sclp_add_standby_memory(); 391 + rc = sclp_init_mem(); 519 392 out: 520 393 free_page((unsigned long)sccb); 521 394 return rc; 522 395 } 523 - __initcall(sclp_detect_standby_memory); 396 + __initcall(sclp_setup_memory);