Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'libnvdimm-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull dax updates from Ira Weiny:
"The series adds DAX support required for the upcoming fuse/famfs file
system.[1] The support here is required because famfs is backed by
devdax rather than pmem. This all lays the groundwork for using shared
memory as a file system."

Link: https://lore.kernel.org/all/0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com/ [1]

* tag 'libnvdimm-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
dax/fsdev: fix uninitialized kaddr in fsdev_dax_zero_page_range()
dax: export dax_dev_get()
dax: Add fs_dax_get() func to prepare dax for fs-dax usage
dax: Add dax_set_ops() for setting dax_operations at bind time
dax: Add dax_operations for use by fs-dax on fsdev dax
dax: Save the kva from memremap
dax: add fsdev.c driver for fs-dax on character dax
dax: Factor out dax_folio_reset_order() helper
dax: move dax_pgoff_to_phys from [drivers/dax/] device.c to bus.c

+566 -50
+8
MAINTAINERS
··· 7303 7303 S: Supported 7304 7304 F: drivers/dax/ 7305 7305 7306 + DEVICE DIRECT ACCESS (DAX) [fsdev_dax] 7307 + M: John Groves <jgroves@micron.com> 7308 + M: John Groves <John@Groves.net> 7309 + L: nvdimm@lists.linux.dev 7310 + L: linux-cxl@vger.kernel.org 7311 + S: Supported 7312 + F: drivers/dax/fsdev.c 7313 + 7306 7314 DEVICE FREQUENCY (DEVFREQ) 7307 7315 M: MyungJoo Ham <myungjoo.ham@samsung.com> 7308 7316 M: Kyungmin Park <kyungmin.park@samsung.com>
+5
drivers/dax/Kconfig
··· 65 65 depends on DEV_DAX_HMEM && DAX 66 66 def_bool y 67 67 68 + config DEV_DAX_FSDEV 69 + tristate 70 + depends on DEV_DAX && FS_DAX 71 + default DEV_DAX 72 + 68 73 config DEV_DAX_KMEM 69 74 tristate "KMEM DAX: map dax-devices as System-RAM" 70 75 default DEV_DAX
+2
drivers/dax/Makefile
··· 5 5 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o 6 6 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 7 7 obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o 8 + obj-$(CONFIG_DEV_DAX_FSDEV) += fsdev_dax.o 8 9 9 10 dax-y := super.o 10 11 dax-y += bus.o 11 12 device_dax-y := device.o 12 13 dax_pmem-y := pmem.o 13 14 dax_cxl-y := cxl.o 15 + fsdev_dax-y := fsdev.o
+20 -2
drivers/dax/bus.c
··· 40 40 return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); 41 41 } 42 42 43 - #define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) 44 - 45 43 static struct dax_id *__dax_match_id(const struct dax_device_driver *dax_drv, 46 44 const char *dev_name) 47 45 { ··· 1428 1430 .release = dev_dax_release, 1429 1431 .groups = dax_attribute_groups, 1430 1432 }; 1433 + 1434 + /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ 1435 + __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, 1436 + unsigned long size) 1437 + { 1438 + for (int i = 0; i < dev_dax->nr_range; i++) { 1439 + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; 1440 + struct range *range = &dax_range->range; 1441 + phys_addr_t phys; 1442 + 1443 + if (!in_range(pgoff, dax_range->pgoff, PHYS_PFN(range_len(range)))) 1444 + continue; 1445 + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; 1446 + if (phys + size - 1 <= range->end) 1447 + return phys; 1448 + break; 1449 + } 1450 + return -1; 1451 + } 1452 + EXPORT_SYMBOL_GPL(dax_pgoff_to_phys); 1431 1453 1432 1454 static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) 1433 1455 {
+3
drivers/dax/bus.h
··· 33 33 enum dax_driver_type { 34 34 DAXDRV_KMEM_TYPE, 35 35 DAXDRV_DEVICE_TYPE, 36 + DAXDRV_FSDEV_TYPE, 36 37 }; 37 38 38 39 struct dax_device_driver { ··· 43 42 int (*probe)(struct dev_dax *dev); 44 43 void (*remove)(struct dev_dax *dev); 45 44 }; 45 + 46 + #define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) 46 47 47 48 int __dax_driver_register(struct dax_device_driver *dax_drv, 48 49 struct module *module, const char *mod_name);
+4
drivers/dax/dax-private.h
··· 69 69 * data while the device is activated in the driver. 70 70 * @region: parent region 71 71 * @dax_dev: core dax functionality 72 + * @virt_addr: kva from memremap; used by fsdev_dax 73 + * @cached_size: size of daxdev cached by fsdev_dax 72 74 * @align: alignment of this instance 73 75 * @target_node: effective numa node if dev_dax memory range is onlined 74 76 * @dyn_id: is this a dynamic or statically created instance ··· 85 83 struct dev_dax { 86 84 struct dax_region *region; 87 85 struct dax_device *dax_dev; 86 + void *virt_addr; 87 + u64 cached_size; 88 88 unsigned int align; 89 89 int target_node; 90 90 bool dyn_id;
-23
drivers/dax/device.c
··· 57 57 vma->vm_file, func); 58 58 } 59 59 60 - /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ 61 - __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, 62 - unsigned long size) 63 - { 64 - int i; 65 - 66 - for (i = 0; i < dev_dax->nr_range; i++) { 67 - struct dev_dax_range *dax_range = &dev_dax->ranges[i]; 68 - struct range *range = &dax_range->range; 69 - unsigned long long pgoff_end; 70 - phys_addr_t phys; 71 - 72 - pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; 73 - if (pgoff < dax_range->pgoff || pgoff > pgoff_end) 74 - continue; 75 - phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; 76 - if (phys + size - 1 <= range->end) 77 - return phys; 78 - break; 79 - } 80 - return -1; 81 - } 82 - 83 60 static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn, 84 61 unsigned long fault_size) 85 62 {
+349
drivers/dax/fsdev.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright(c) 2026 Micron Technology, Inc. */ 3 + #include <linux/memremap.h> 4 + #include <linux/pagemap.h> 5 + #include <linux/module.h> 6 + #include <linux/device.h> 7 + #include <linux/cdev.h> 8 + #include <linux/slab.h> 9 + #include <linux/dax.h> 10 + #include <linux/uio.h> 11 + #include <linux/fs.h> 12 + #include <linux/mm.h> 13 + #include "dax-private.h" 14 + #include "bus.h" 15 + 16 + /* 17 + * FS-DAX compatible devdax driver 18 + * 19 + * Unlike drivers/dax/device.c which pre-initializes compound folios based 20 + * on device alignment (via vmemmap_shift), this driver leaves folios 21 + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22 + * to work without needing special handling for pre-initialized folios. 23 + * 24 + * Key differences from device.c: 25 + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26 + * - vmemmap_shift is NOT set (folios remain order-0) 27 + * - fs-dax can dynamically create compound folios as needed 28 + * - No mmap support - all access is through fs-dax/iomap 29 + */ 30 + 31 + static void fsdev_write_dax(void *addr, struct page *page, 32 + unsigned int off, unsigned int len) 33 + { 34 + while (len) { 35 + void *mem = kmap_local_page(page); 36 + unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); 37 + 38 + memcpy_flushcache(addr, mem + off, chunk); 39 + kunmap_local(mem); 40 + len -= chunk; 41 + off = 0; 42 + page++; 43 + addr += chunk; 44 + } 45 + } 46 + 47 + static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 48 + long nr_pages, enum dax_access_mode mode, void **kaddr, 49 + unsigned long *pfn) 50 + { 51 + struct dev_dax *dev_dax = dax_get_private(dax_dev); 52 + size_t size = nr_pages << PAGE_SHIFT; 53 + size_t offset = pgoff << PAGE_SHIFT; 54 + void *virt_addr = dev_dax->virt_addr + offset; 55 + phys_addr_t phys; 56 + unsigned long local_pfn; 57 + 58 + phys = dax_pgoff_to_phys(dev_dax, pgoff, 
size); 59 + if (phys == -1) { 60 + dev_dbg(&dev_dax->dev, 61 + "pgoff (%#lx) out of range\n", pgoff); 62 + return -EFAULT; 63 + } 64 + 65 + if (kaddr) 66 + *kaddr = virt_addr; 67 + 68 + local_pfn = PHYS_PFN(phys); 69 + if (pfn) 70 + *pfn = local_pfn; 71 + 72 + /* 73 + * Use cached_size which was computed at probe time. The size cannot 74 + * change while the driver is bound (resize returns -EBUSY). 75 + */ 76 + return PHYS_PFN(min(size, dev_dax->cached_size - offset)); 77 + } 78 + 79 + static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, 80 + pgoff_t pgoff, size_t nr_pages) 81 + { 82 + void *kaddr; 83 + long rc; 84 + 85 + WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); 86 + rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); 87 + if (rc < 0) 88 + return rc; 89 + fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); 90 + return 0; 91 + } 92 + 93 + static long fsdev_dax_direct_access(struct dax_device *dax_dev, 94 + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, 95 + void **kaddr, unsigned long *pfn) 96 + { 97 + return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, 98 + kaddr, pfn); 99 + } 100 + 101 + static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, 102 + void *addr, size_t bytes, struct iov_iter *i) 103 + { 104 + return _copy_from_iter_flushcache(addr, bytes, i); 105 + } 106 + 107 + static const struct dax_operations dev_dax_ops = { 108 + .direct_access = fsdev_dax_direct_access, 109 + .zero_page_range = fsdev_dax_zero_page_range, 110 + .recovery_write = fsdev_dax_recovery_write, 111 + }; 112 + 113 + static void fsdev_cdev_del(void *cdev) 114 + { 115 + cdev_del(cdev); 116 + } 117 + 118 + static void fsdev_kill(void *dev_dax) 119 + { 120 + kill_dev_dax(dev_dax); 121 + } 122 + 123 + static void fsdev_clear_ops(void *data) 124 + { 125 + struct dev_dax *dev_dax = data; 126 + 127 + dax_set_ops(dev_dax->dax_dev, NULL); 128 + } 129 + 130 + /* 131 + * Page map operations for 
FS-DAX mode 132 + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c 133 + * 134 + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. 135 + * The core mm code in free_zone_device_folio() handles the wake_up_var() 136 + * directly for this memory type. 137 + */ 138 + static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, 139 + unsigned long pfn, unsigned long nr_pages, int mf_flags) 140 + { 141 + struct dev_dax *dev_dax = pgmap->owner; 142 + u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; 143 + u64 len = nr_pages << PAGE_SHIFT; 144 + 145 + return dax_holder_notify_failure(dev_dax->dax_dev, offset, 146 + len, mf_flags); 147 + } 148 + 149 + static const struct dev_pagemap_ops fsdev_pagemap_ops = { 150 + .memory_failure = fsdev_pagemap_memory_failure, 151 + }; 152 + 153 + /* 154 + * Clear any stale folio state from pages in the given range. 155 + * This is necessary because device_dax pre-initializes compound folios 156 + * based on vmemmap_shift, and that state may persist after driver unbind. 157 + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax 158 + * expects to find clean order-0 folios that it can build into compound 159 + * folios on demand. 160 + * 161 + * At probe time, no filesystem should be mounted yet, so all mappings 162 + * are stale and must be cleared along with compound state. 
163 + */ 164 + static void fsdev_clear_folio_state(struct dev_dax *dev_dax) 165 + { 166 + for (int i = 0; i < dev_dax->nr_range; i++) { 167 + struct range *range = &dev_dax->ranges[i].range; 168 + unsigned long pfn = PHYS_PFN(range->start); 169 + unsigned long end_pfn = PHYS_PFN(range->end) + 1; 170 + 171 + while (pfn < end_pfn) { 172 + struct folio *folio = pfn_folio(pfn); 173 + int order = dax_folio_reset_order(folio); 174 + 175 + pfn += 1UL << order; 176 + } 177 + } 178 + } 179 + 180 + static void fsdev_clear_folio_state_action(void *data) 181 + { 182 + fsdev_clear_folio_state(data); 183 + } 184 + 185 + static int fsdev_open(struct inode *inode, struct file *filp) 186 + { 187 + struct dax_device *dax_dev = inode_dax(inode); 188 + struct dev_dax *dev_dax = dax_get_private(dax_dev); 189 + 190 + filp->private_data = dev_dax; 191 + 192 + return 0; 193 + } 194 + 195 + static int fsdev_release(struct inode *inode, struct file *filp) 196 + { 197 + return 0; 198 + } 199 + 200 + static const struct file_operations fsdev_fops = { 201 + .llseek = noop_llseek, 202 + .owner = THIS_MODULE, 203 + .open = fsdev_open, 204 + .release = fsdev_release, 205 + }; 206 + 207 + static int fsdev_dax_probe(struct dev_dax *dev_dax) 208 + { 209 + struct dax_device *dax_dev = dev_dax->dax_dev; 210 + struct device *dev = &dev_dax->dev; 211 + struct dev_pagemap *pgmap; 212 + struct inode *inode; 213 + u64 data_offset = 0; 214 + struct cdev *cdev; 215 + void *addr; 216 + int rc, i; 217 + 218 + if (static_dev_dax(dev_dax)) { 219 + if (dev_dax->nr_range > 1) { 220 + dev_warn(dev, "static pgmap / multi-range device conflict\n"); 221 + return -EINVAL; 222 + } 223 + 224 + pgmap = dev_dax->pgmap; 225 + } else { 226 + size_t pgmap_size; 227 + 228 + if (dev_dax->pgmap) { 229 + dev_warn(dev, "dynamic-dax with pre-populated page map\n"); 230 + return -EINVAL; 231 + } 232 + 233 + pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); 234 + pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); 235 + 
if (!pgmap) 236 + return -ENOMEM; 237 + 238 + pgmap->nr_range = dev_dax->nr_range; 239 + dev_dax->pgmap = pgmap; 240 + 241 + for (i = 0; i < dev_dax->nr_range; i++) { 242 + struct range *range = &dev_dax->ranges[i].range; 243 + 244 + pgmap->ranges[i] = *range; 245 + } 246 + } 247 + 248 + for (i = 0; i < dev_dax->nr_range; i++) { 249 + struct range *range = &dev_dax->ranges[i].range; 250 + 251 + if (!devm_request_mem_region(dev, range->start, 252 + range_len(range), dev_name(dev))) { 253 + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 254 + i, range->start, range->end); 255 + return -EBUSY; 256 + } 257 + } 258 + 259 + /* Cache size now; it cannot change while driver is bound */ 260 + dev_dax->cached_size = 0; 261 + for (i = 0; i < dev_dax->nr_range; i++) 262 + dev_dax->cached_size += range_len(&dev_dax->ranges[i].range); 263 + 264 + /* 265 + * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving 266 + * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this 267 + * lets fs-dax dynamically build compound folios as needed, similar 268 + * to pmem behavior. 269 + */ 270 + pgmap->type = MEMORY_DEVICE_FS_DAX; 271 + pgmap->ops = &fsdev_pagemap_ops; 272 + pgmap->owner = dev_dax; 273 + 274 + addr = devm_memremap_pages(dev, pgmap); 275 + if (IS_ERR(addr)) 276 + return PTR_ERR(addr); 277 + 278 + /* 279 + * Clear any stale compound folio state left over from a previous 280 + * driver (e.g., device_dax with vmemmap_shift). Also register this 281 + * as a devm action so folio state is cleared on unbind, ensuring 282 + * clean pages for subsequent drivers (e.g., kmem for system-ram). 
283 + */ 284 + fsdev_clear_folio_state(dev_dax); 285 + rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, 286 + dev_dax); 287 + if (rc) 288 + return rc; 289 + 290 + /* Detect whether the data is at a non-zero offset into the memory */ 291 + if (pgmap->range.start != dev_dax->ranges[0].range.start) { 292 + u64 phys = dev_dax->ranges[0].range.start; 293 + u64 pgmap_phys = dev_dax->pgmap[0].range.start; 294 + 295 + if (!WARN_ON(pgmap_phys > phys)) 296 + data_offset = phys - pgmap_phys; 297 + 298 + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", 299 + __func__, phys, pgmap_phys, data_offset); 300 + } 301 + dev_dax->virt_addr = addr + data_offset; 302 + 303 + inode = dax_inode(dax_dev); 304 + cdev = inode->i_cdev; 305 + cdev_init(cdev, &fsdev_fops); 306 + cdev->owner = dev->driver->owner; 307 + cdev_set_parent(cdev, &dev->kobj); 308 + rc = cdev_add(cdev, dev->devt, 1); 309 + if (rc) 310 + return rc; 311 + 312 + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); 313 + if (rc) 314 + return rc; 315 + 316 + /* Set the dax operations for fs-dax access path */ 317 + rc = dax_set_ops(dax_dev, &dev_dax_ops); 318 + if (rc) 319 + return rc; 320 + 321 + rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax); 322 + if (rc) 323 + return rc; 324 + 325 + run_dax(dax_dev); 326 + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); 327 + } 328 + 329 + static struct dax_device_driver fsdev_dax_driver = { 330 + .probe = fsdev_dax_probe, 331 + .type = DAXDRV_FSDEV_TYPE, 332 + }; 333 + 334 + static int __init dax_init(void) 335 + { 336 + return dax_driver_register(&fsdev_dax_driver); 337 + } 338 + 339 + static void __exit dax_exit(void) 340 + { 341 + dax_driver_unregister(&fsdev_dax_driver); 342 + } 343 + 344 + MODULE_AUTHOR("John Groves"); 345 + MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); 346 + MODULE_LICENSE("GPL"); 347 + module_init(dax_init); 348 + module_exit(dax_exit); 349 + MODULE_ALIAS_DAX_DEVICE(0);
+104 -3
drivers/dax/super.c
··· 14 14 #include <linux/fs.h> 15 15 #include <linux/cacheinfo.h> 16 16 #include "dax-private.h" 17 + #include "bus.h" 17 18 18 19 /** 19 20 * struct dax_device - anchor object for dax services ··· 112 111 } 113 112 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 114 113 114 + #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ 115 + 116 + #if IS_ENABLED(CONFIG_FS_DAX) 117 + 115 118 void fs_put_dax(struct dax_device *dax_dev, void *holder) 116 119 { 117 120 if (dax_dev && holder && ··· 124 119 put_dax(dax_dev); 125 120 } 126 121 EXPORT_SYMBOL_GPL(fs_put_dax); 127 - #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ 122 + 123 + /** 124 + * fs_dax_get() - get ownership of a devdax via holder/holder_ops 125 + * 126 + * fs-dax file systems call this function to prepare to use a devdax device for 127 + * fsdax. This is like fs_dax_get_by_bdev(), but the caller already has struct 128 + * dev_dax (and there is no bdev). The holder makes this exclusive. 129 + * 130 + * @dax_dev: dev to be prepared for fs-dax usage 131 + * @holder: filesystem or mapped device inside the dax_device 132 + * @hops: operations for the inner holder 133 + * 134 + * Returns: 0 on success, <0 on failure 135 + */ 136 + int fs_dax_get(struct dax_device *dax_dev, void *holder, 137 + const struct dax_holder_operations *hops) 138 + { 139 + struct dev_dax *dev_dax; 140 + struct dax_device_driver *dax_drv; 141 + int id; 142 + 143 + id = dax_read_lock(); 144 + if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) { 145 + dax_read_unlock(id); 146 + return -ENODEV; 147 + } 148 + dax_read_unlock(id); 149 + 150 + /* Verify the device is bound to fsdev_dax driver */ 151 + dev_dax = dax_get_private(dax_dev); 152 + if (!dev_dax) { 153 + iput(&dax_dev->inode); 154 + return -ENODEV; 155 + } 156 + 157 + device_lock(&dev_dax->dev); 158 + if (!dev_dax->dev.driver) { 159 + device_unlock(&dev_dax->dev); 160 + iput(&dax_dev->inode); 161 + return -ENODEV; 162 + } 163 + dax_drv = to_dax_drv(dev_dax->dev.driver); 164 + if (dax_drv->type 
!= DAXDRV_FSDEV_TYPE) { 165 + device_unlock(&dev_dax->dev); 166 + iput(&dax_dev->inode); 167 + return -EOPNOTSUPP; 168 + } 169 + device_unlock(&dev_dax->dev); 170 + 171 + if (cmpxchg(&dax_dev->holder_data, NULL, holder)) { 172 + iput(&dax_dev->inode); 173 + return -EBUSY; 174 + } 175 + 176 + dax_dev->holder_ops = hops; 177 + 178 + return 0; 179 + } 180 + EXPORT_SYMBOL_GPL(fs_dax_get); 181 + #endif /* CONFIG_FS_DAX */ 128 182 129 183 enum dax_device_flags { 130 184 /* !alive + rcu grace period == no new operations / mappings */ ··· 220 156 221 157 if (!dax_alive(dax_dev)) 222 158 return -ENXIO; 159 + 160 + if (!dax_dev->ops) 161 + return -EOPNOTSUPP; 223 162 224 163 if (nr_pages < 0) 225 164 return -EINVAL; ··· 274 207 275 208 if (!dax_alive(dax_dev)) 276 209 return -ENXIO; 210 + 211 + if (!dax_dev->ops) 212 + return -EOPNOTSUPP; 213 + 277 214 /* 278 215 * There are no callers that want to zero more than one page as of now. 279 216 * Once users are there, this check can be removed after the ··· 294 223 size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, 295 224 void *addr, size_t bytes, struct iov_iter *iter) 296 225 { 297 - if (!dax_dev->ops->recovery_write) 226 + if (!dax_dev->ops || !dax_dev->ops->recovery_write) 298 227 return 0; 299 228 return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); 300 229 } ··· 377 306 set_bit(DAXDEV_NOMC, &dax_dev->flags); 378 307 } 379 308 EXPORT_SYMBOL_GPL(set_dax_nomc); 309 + 310 + /** 311 + * dax_set_ops - set the dax_operations for a dax_device 312 + * @dax_dev: the dax_device to configure 313 + * @ops: the operations to set (may be NULL to clear) 314 + * 315 + * This allows drivers to set the dax_operations after the dax_device 316 + * has been allocated. This is needed when the device is created before 317 + * the driver that needs specific ops is bound (e.g., fsdev_dax binding 318 + * to a dev_dax created by hmem). 
319 + * 320 + * When setting non-NULL ops, fails if ops are already set (returns -EBUSY). 321 + * When clearing ops (NULL), always succeeds. 322 + * 323 + * Return: 0 on success, -EBUSY if ops already set 324 + */ 325 + int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops) 326 + { 327 + if (ops) { 328 + /* Setting ops: fail if already set */ 329 + if (cmpxchg(&dax_dev->ops, NULL, ops) != NULL) 330 + return -EBUSY; 331 + } else { 332 + /* Clearing ops: always allowed */ 333 + dax_dev->ops = NULL; 334 + } 335 + return 0; 336 + } 337 + EXPORT_SYMBOL_GPL(dax_set_ops); 380 338 381 339 bool dax_alive(struct dax_device *dax_dev) 382 340 { ··· 521 421 return 0; 522 422 } 523 423 524 - static struct dax_device *dax_dev_get(dev_t devt) 424 + struct dax_device *dax_dev_get(dev_t devt) 525 425 { 526 426 struct dax_device *dax_dev; 527 427 struct inode *inode; ··· 544 444 545 445 return dax_dev; 546 446 } 447 + EXPORT_SYMBOL_GPL(dax_dev_get); 547 448 548 449 struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) 549 450 {
+56 -18
fs/dax.c
··· 377 377 folio->share = 1; 378 378 } 379 379 380 + /** 381 + * dax_folio_reset_order - Reset a compound DAX folio to order-0 pages 382 + * @folio: The folio to reset 383 + * 384 + * Splits a compound folio back into individual order-0 pages, 385 + * clearing compound state and restoring pgmap pointers. 386 + * 387 + * Returns: the original folio order (0 if already order-0) 388 + */ 389 + int dax_folio_reset_order(struct folio *folio) 390 + { 391 + struct dev_pagemap *pgmap = page_pgmap(&folio->page); 392 + int order = folio_order(folio); 393 + 394 + /* 395 + * DAX maintains the invariant that folio->share != 0 only when 396 + * folio->mapping == NULL (enforced by dax_folio_make_shared()). 397 + * Equivalently: folio->mapping != NULL implies folio->share == 0. 398 + * Callers ensure share has been decremented to zero before 399 + * calling here, so unconditionally clearing both fields is 400 + * correct. 401 + */ 402 + folio->mapping = NULL; 403 + folio->share = 0; 404 + 405 + if (!order) { 406 + /* 407 + * Restore pgmap explicitly even for order-0 folios. For the 408 + * dax_folio_put() caller this is a no-op (same value), but 409 + * fsdev_clear_folio_state() may call this on folios that 410 + * were previously compound and need pgmap re-established. 
411 + */ 412 + folio->pgmap = pgmap; 413 + return 0; 414 + } 415 + 416 + folio_reset_order(folio); 417 + 418 + for (int i = 0; i < (1UL << order); i++) { 419 + struct page *page = folio_page(folio, i); 420 + struct folio *f = (struct folio *)page; 421 + 422 + ClearPageHead(page); 423 + clear_compound_head(page); 424 + f->mapping = NULL; 425 + f->share = 0; 426 + f->pgmap = pgmap; 427 + } 428 + 429 + return order; 430 + } 431 + EXPORT_SYMBOL_GPL(dax_folio_reset_order); 432 + 380 433 static inline unsigned long dax_folio_put(struct folio *folio) 381 434 { 382 435 unsigned long ref; ··· 443 390 if (ref) 444 391 return ref; 445 392 446 - folio->mapping = NULL; 447 - order = folio_order(folio); 448 - if (!order) 449 - return 0; 450 - folio_reset_order(folio); 393 + order = dax_folio_reset_order(folio); 451 394 395 + /* Debug check: verify refcounts are zero for all sub-folios */ 452 396 for (i = 0; i < (1UL << order); i++) { 453 - struct dev_pagemap *pgmap = page_pgmap(&folio->page); 454 397 struct page *page = folio_page(folio, i); 455 - struct folio *new_folio = (struct folio *)page; 456 398 457 - ClearPageHead(page); 458 - clear_compound_head(page); 459 - 460 - new_folio->mapping = NULL; 461 - /* 462 - * Reset pgmap which was over-written by 463 - * prep_compound_page(). 464 - */ 465 - new_folio->pgmap = pgmap; 466 - new_folio->share = 0; 467 - WARN_ON_ONCE(folio_ref_count(new_folio)); 399 + WARN_ON_ONCE(folio_ref_count((struct folio *)page)); 468 400 } 469 401 470 402 return ref;
+15 -4
include/linux/dax.h
··· 54 54 void *dax_holder(struct dax_device *dax_dev); 55 55 void put_dax(struct dax_device *dax_dev); 56 56 void kill_dax(struct dax_device *dax_dev); 57 + struct dax_device *dax_dev_get(dev_t devt); 57 58 void dax_write_cache(struct dax_device *dax_dev, bool wc); 58 59 bool dax_write_cache_enabled(struct dax_device *dax_dev); 59 60 bool dax_synchronous(struct dax_device *dax_dev); ··· 131 130 void dax_remove_host(struct gendisk *disk); 132 131 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, 133 132 void *holder, const struct dax_holder_operations *ops); 134 - void fs_put_dax(struct dax_device *dax_dev, void *holder); 135 133 #else 136 134 static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) 137 135 { ··· 145 145 { 146 146 return NULL; 147 147 } 148 - static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) 149 - { 150 - } 151 148 #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ 152 149 153 150 #if IS_ENABLED(CONFIG_FS_DAX) 151 + void fs_put_dax(struct dax_device *dax_dev, void *holder); 152 + int fs_dax_get(struct dax_device *dax_dev, void *holder, 153 + const struct dax_holder_operations *hops); 154 154 int dax_writeback_mapping_range(struct address_space *mapping, 155 155 struct dax_device *dax_dev, struct writeback_control *wbc); 156 + int dax_folio_reset_order(struct folio *folio); 156 157 157 158 struct page *dax_layout_busy_page(struct address_space *mapping); 158 159 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); ··· 164 163 void dax_unlock_mapping_entry(struct address_space *mapping, 165 164 unsigned long index, dax_entry_t cookie); 166 165 #else 166 + static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) 167 + { 168 + } 169 + 170 + static inline int fs_dax_get(struct dax_device *dax_dev, void *holder, 171 + const struct dax_holder_operations *hops) 172 + { 173 + return -EOPNOTSUPP; 174 + } 167 175 static inline 
struct page *dax_layout_busy_page(struct address_space *mapping) 168 176 { 169 177 return NULL; ··· 252 242 253 243 bool dax_alive(struct dax_device *dax_dev); 254 244 void *dax_get_private(struct dax_device *dax_dev); 245 + int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops); 255 246 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 256 247 enum dax_access_mode mode, void **kaddr, unsigned long *pfn); 257 248 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,