Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'iomap-4.19-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull fs iomap refactoring from Darrick Wong:
"This is the first part of the XFS changes for 4.19.

Christoph and Andreas coordinated some refactoring work on the iomap
code in preparation for removing buffer heads from XFS and porting
gfs2 to iomap. I'm sending this small pull request ahead of the main
XFS merge to avoid holding up gfs2 unnecessarily"

* 'iomap-4.19-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
iomap: add inline data support to iomap_readpage_actor
iomap: support direct I/O to inline data
iomap: refactor iomap_dio_actor
iomap: add initial support for writes without buffer heads
iomap: add an iomap-based readpage and readpages implementation
iomap: add private pointer to struct iomap
iomap: add a page_done callback
iomap: generic inline data handling
iomap: complete partial direct I/O writes synchronously
iomap: mark newly allocated buffer heads as new
fs: factor out a __generic_write_end helper

+536 -96
+40 -36
fs/buffer.c
··· 1900 1900 break; 1901 1901 case IOMAP_UNWRITTEN: 1902 1902 /* 1903 - * For unwritten regions, we always need to ensure that 1904 - * sub-block writes cause the regions in the block we are not 1905 - * writing to are zeroed. Set the buffer as new to ensure this. 1903 + * For unwritten regions, we always need to ensure that regions 1904 + * in the block we are not writing to are zeroed. Mark the 1905 + * buffer as new to ensure this. 1906 1906 */ 1907 1907 set_buffer_new(bh); 1908 1908 set_buffer_unwritten(bh); 1909 1909 /* FALLTHRU */ 1910 1910 case IOMAP_MAPPED: 1911 - if (offset >= i_size_read(inode)) 1911 + if ((iomap->flags & IOMAP_F_NEW) || 1912 + offset >= i_size_read(inode)) 1912 1913 set_buffer_new(bh); 1913 1914 bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> 1914 1915 inode->i_blkbits; ··· 2077 2076 } 2078 2077 EXPORT_SYMBOL(block_write_begin); 2079 2078 2079 + int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, 2080 + struct page *page) 2081 + { 2082 + loff_t old_size = inode->i_size; 2083 + bool i_size_changed = false; 2084 + 2085 + /* 2086 + * No need to use i_size_read() here, the i_size cannot change under us 2087 + * because we hold i_rwsem. 2088 + * 2089 + * But it's important to update i_size while still holding page lock: 2090 + * page writeout could otherwise come in and zero beyond i_size. 2091 + */ 2092 + if (pos + copied > inode->i_size) { 2093 + i_size_write(inode, pos + copied); 2094 + i_size_changed = true; 2095 + } 2096 + 2097 + unlock_page(page); 2098 + put_page(page); 2099 + 2100 + if (old_size < pos) 2101 + pagecache_isize_extended(inode, old_size, pos); 2102 + /* 2103 + * Don't mark the inode dirty under page lock. First, it unnecessarily 2104 + * makes the holding time of page lock longer. Second, it forces lock 2105 + * ordering of page lock and transaction start for journaling 2106 + * filesystems. 2107 + */ 2108 + if (i_size_changed) 2109 + mark_inode_dirty(inode); 2110 + return copied; 2111 + } 2112 + 2080 2113 int block_write_end(struct file *file, struct address_space *mapping, 2081 2114 loff_t pos, unsigned len, unsigned copied, 2082 2115 struct page *page, void *fsdata) ··· 2151 2116 loff_t pos, unsigned len, unsigned copied, 2152 2117 struct page *page, void *fsdata) 2153 2118 { 2154 - struct inode *inode = mapping->host; 2155 - loff_t old_size = inode->i_size; 2156 - int i_size_changed = 0; 2157 - 2158 2119 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2159 - 2160 - /* 2161 - * No need to use i_size_read() here, the i_size 2162 - * cannot change under us because we hold i_mutex. 2163 - * 2164 - * But it's important to update i_size while still holding page lock: 2165 - * page writeout could otherwise come in and zero beyond i_size. 2166 - */ 2167 - if (pos+copied > inode->i_size) { 2168 - i_size_write(inode, pos+copied); 2169 - i_size_changed = 1; 2170 - } 2171 - 2172 - unlock_page(page); 2173 - put_page(page); 2174 - 2175 - if (old_size < pos) 2176 - pagecache_isize_extended(inode, old_size, pos); 2177 - /* 2178 - * Don't mark the inode dirty under page lock. First, it unnecessarily 2179 - * makes the holding time of page lock longer. Second, it forces lock 2180 - * ordering of page lock and transaction start for journaling 2181 - * filesystems. 2182 - */ 2183 - if (i_size_changed) 2184 - mark_inode_dirty(inode); 2185 - 2186 - return copied; 2120 + return __generic_write_end(mapping->host, pos, copied, page); 2187 2121 } 2188 2122 EXPORT_SYMBOL(generic_write_end); 2189 2123
+2
fs/internal.h
··· 43 43 extern void guard_bio_eod(int rw, struct bio *bio); 44 44 extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, 45 45 get_block_t *get_block, struct iomap *iomap); 46 + int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, 47 + struct page *page); 46 48 47 49 /* 48 50 * char_dev.c
+474 -58
fs/iomap.c
··· 1 1 /* 2 2 * Copyright (C) 2010 Red Hat, Inc. 3 - * Copyright (c) 2016 Christoph Hellwig. 3 + * Copyright (c) 2016-2018 Christoph Hellwig. 4 4 * 5 5 * This program is free software; you can redistribute it and/or modify it 6 6 * under the terms and conditions of the GNU General Public License, ··· 18 18 #include <linux/uaccess.h> 19 19 #include <linux/gfp.h> 20 20 #include <linux/mm.h> 21 + #include <linux/mm_inline.h> 21 22 #include <linux/swap.h> 22 23 #include <linux/pagemap.h> 23 24 #include <linux/pagevec.h> ··· 105 104 } 106 105 107 106 static void 107 + iomap_read_inline_data(struct inode *inode, struct page *page, 108 + struct iomap *iomap) 109 + { 110 + size_t size = i_size_read(inode); 111 + void *addr; 112 + 113 + if (PageUptodate(page)) 114 + return; 115 + 116 + BUG_ON(page->index); 117 + BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); 118 + 119 + addr = kmap_atomic(page); 120 + memcpy(addr, iomap->inline_data, size); 121 + memset(addr + size, 0, PAGE_SIZE - size); 122 + kunmap_atomic(addr); 123 + SetPageUptodate(page); 124 + } 125 + 126 + static void 127 + iomap_read_end_io(struct bio *bio) 128 + { 129 + int error = blk_status_to_errno(bio->bi_status); 130 + struct bio_vec *bvec; 131 + int i; 132 + 133 + bio_for_each_segment_all(bvec, bio, i) 134 + page_endio(bvec->bv_page, false, error); 135 + bio_put(bio); 136 + } 137 + 138 + struct iomap_readpage_ctx { 139 + struct page *cur_page; 140 + bool cur_page_in_bio; 141 + bool is_readahead; 142 + struct bio *bio; 143 + struct list_head *pages; 144 + }; 145 + 146 + static loff_t 147 + iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 148 + struct iomap *iomap) 149 + { 150 + struct iomap_readpage_ctx *ctx = data; 151 + struct page *page = ctx->cur_page; 152 + unsigned poff = pos & (PAGE_SIZE - 1); 153 + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); 154 + bool is_contig = false; 155 + sector_t sector; 156 + 157 + if (iomap->type == IOMAP_INLINE) { 158 + WARN_ON_ONCE(poff); 159 + iomap_read_inline_data(inode, page, iomap); 160 + return PAGE_SIZE; 161 + } 162 + 163 + /* we don't support blocksize < PAGE_SIZE quite yet. */ 164 + WARN_ON_ONCE(pos != page_offset(page)); 165 + WARN_ON_ONCE(plen != PAGE_SIZE); 166 + 167 + if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { 168 + zero_user(page, poff, plen); 169 + SetPageUptodate(page); 170 + goto done; 171 + } 172 + 173 + ctx->cur_page_in_bio = true; 174 + 175 + /* 176 + * Try to merge into a previous segment if we can. 177 + */ 178 + sector = iomap_sector(iomap, pos); 179 + if (ctx->bio && bio_end_sector(ctx->bio) == sector) { 180 + if (__bio_try_merge_page(ctx->bio, page, plen, poff)) 181 + goto done; 182 + is_contig = true; 183 + } 184 + 185 + if (!ctx->bio || !is_contig || bio_full(ctx->bio)) { 186 + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); 187 + int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; 188 + 189 + if (ctx->bio) 190 + submit_bio(ctx->bio); 191 + 192 + if (ctx->is_readahead) /* same as readahead_gfp_mask */ 193 + gfp |= __GFP_NORETRY | __GFP_NOWARN; 194 + ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); 195 + ctx->bio->bi_opf = REQ_OP_READ; 196 + if (ctx->is_readahead) 197 + ctx->bio->bi_opf |= REQ_RAHEAD; 198 + ctx->bio->bi_iter.bi_sector = sector; 199 + bio_set_dev(ctx->bio, iomap->bdev); 200 + ctx->bio->bi_end_io = iomap_read_end_io; 201 + } 202 + 203 + __bio_add_page(ctx->bio, page, plen, poff); 204 + done: 205 + return plen; 206 + } 207 + 208 + int 209 + iomap_readpage(struct page *page, const struct iomap_ops *ops) 210 + { 211 + struct iomap_readpage_ctx ctx = { .cur_page = page }; 212 + struct inode *inode = page->mapping->host; 213 + unsigned poff; 214 + loff_t ret; 215 + 216 + WARN_ON_ONCE(page_has_buffers(page)); 217 + 218 + for (poff = 0; poff < PAGE_SIZE; poff += ret) { 219 + ret = iomap_apply(inode, page_offset(page) + poff, 220 + PAGE_SIZE - poff, 0, ops, &ctx, 221 + iomap_readpage_actor); 222 + if (ret <= 0) { 223 + WARN_ON_ONCE(ret == 0); 224 + SetPageError(page); 225 + break; 226 + } 227 + } 228 + 229 + if (ctx.bio) { 230 + submit_bio(ctx.bio); 231 + WARN_ON_ONCE(!ctx.cur_page_in_bio); 232 + } else { 233 + WARN_ON_ONCE(ctx.cur_page_in_bio); 234 + unlock_page(page); 235 + } 236 + 237 + /* 238 + * Just like mpage_readpages and block_read_full_page we always 239 + * return 0 and just mark the page as PageError on errors. This 240 + * should be cleaned up all through the stack eventually. 241 + */ 242 + return 0; 243 + } 244 + EXPORT_SYMBOL_GPL(iomap_readpage); 245 + 246 + static struct page * 247 + iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, 248 + loff_t length, loff_t *done) 249 + { 250 + while (!list_empty(pages)) { 251 + struct page *page = lru_to_page(pages); 252 + 253 + if (page_offset(page) >= (u64)pos + length) 254 + break; 255 + 256 + list_del(&page->lru); 257 + if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, 258 + GFP_NOFS)) 259 + return page; 260 + 261 + /* 262 + * If we already have a page in the page cache at index we are 263 + * done. Upper layers don't care if it is uptodate after the 264 + * readpages call itself as every page gets checked again once 265 + * actually needed. 266 + */ 267 + *done += PAGE_SIZE; 268 + put_page(page); 269 + } 270 + 271 + return NULL; 272 + } 273 + 274 + static loff_t 275 + iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, 276 + void *data, struct iomap *iomap) 277 + { 278 + struct iomap_readpage_ctx *ctx = data; 279 + loff_t done, ret; 280 + 281 + for (done = 0; done < length; done += ret) { 282 + if (ctx->cur_page && ((pos + done) & (PAGE_SIZE - 1)) == 0) { 283 + if (!ctx->cur_page_in_bio) 284 + unlock_page(ctx->cur_page); 285 + put_page(ctx->cur_page); 286 + ctx->cur_page = NULL; 287 + } 288 + if (!ctx->cur_page) { 289 + ctx->cur_page = iomap_next_page(inode, ctx->pages, 290 + pos, length, &done); 291 + if (!ctx->cur_page) 292 + break; 293 + ctx->cur_page_in_bio = false; 294 + } 295 + ret = iomap_readpage_actor(inode, pos + done, length - done, 296 + ctx, iomap); 297 + } 298 + 299 + return done; 300 + } 301 + 302 + int 303 + iomap_readpages(struct address_space *mapping, struct list_head *pages, 304 + unsigned nr_pages, const struct iomap_ops *ops) 305 + { 306 + struct iomap_readpage_ctx ctx = { 307 + .pages = pages, 308 + .is_readahead = true, 309 + }; 310 + loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); 311 + loff_t last = page_offset(list_entry(pages->next, struct page, lru)); 312 + loff_t length = last - pos + PAGE_SIZE, ret = 0; 313 + 314 + while (length > 0) { 315 + ret = iomap_apply(mapping->host, pos, length, 0, ops, 316 + &ctx, iomap_readpages_actor); 317 + if (ret <= 0) { 318 + WARN_ON_ONCE(ret == 0); 319 + goto done; 320 + } 321 + pos += ret; 322 + length -= ret; 323 + } 324 + ret = 0; 325 + done: 326 + if (ctx.bio) 327 + submit_bio(ctx.bio); 328 + if (ctx.cur_page) { 329 + if (!ctx.cur_page_in_bio) 330 + unlock_page(ctx.cur_page); 331 + put_page(ctx.cur_page); 332 + } 333 + 334 + /* 335 + * Check that we didn't lose a page due to the arcance calling 336 + * conventions.. 337 + */ 338 + WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); 339 + return ret; 340 + } 341 + EXPORT_SYMBOL_GPL(iomap_readpages); 342 + 343 + static void 108 344 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) 109 345 { 110 346 loff_t i_size = i_size_read(inode); ··· 352 114 */ 353 115 if (pos + len > i_size) 354 116 truncate_pagecache_range(inode, max(pos, i_size), pos + len); 117 + } 118 + 119 + static int 120 + iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, 121 + unsigned poff, unsigned plen, unsigned from, unsigned to, 122 + struct iomap *iomap) 123 + { 124 + struct bio_vec bvec; 125 + struct bio bio; 126 + 127 + if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { 128 + zero_user_segments(page, poff, from, to, poff + plen); 129 + return 0; 130 + } 131 + 132 + bio_init(&bio, &bvec, 1); 133 + bio.bi_opf = REQ_OP_READ; 134 + bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); 135 + bio_set_dev(&bio, iomap->bdev); 136 + __bio_add_page(&bio, page, plen, poff); 137 + return submit_bio_wait(&bio); 138 + } 139 + 140 + static int 141 + __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, 142 + struct page *page, struct iomap *iomap) 143 + { 144 + loff_t block_size = i_blocksize(inode); 145 + loff_t block_start = pos & ~(block_size - 1); 146 + loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); 147 + unsigned poff = block_start & (PAGE_SIZE - 1); 148 + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start); 149 + unsigned from = pos & (PAGE_SIZE - 1), to = from + len; 150 + 151 + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE); 152 + 153 + if (PageUptodate(page)) 154 + return 0; 155 + if (from <= poff && to >= poff + plen) 156 + return 0; 157 + return iomap_read_page_sync(inode, block_start, page, 158 + poff, plen, from, to, iomap); 355 159 } 356 160 357 161 static int ··· 413 133 if (!page) 414 134 return -ENOMEM; 415 135 416 - status = __block_write_begin_int(page, pos, len, NULL, iomap); 136 + if (iomap->type == IOMAP_INLINE) 137 + iomap_read_inline_data(inode, page, iomap); 138 + else if (iomap->flags & IOMAP_F_BUFFER_HEAD) 139 + status = __block_write_begin_int(page, pos, len, NULL, iomap); 140 + else 141 + status = __iomap_write_begin(inode, pos, len, page, iomap); 417 142 if (unlikely(status)) { 418 143 unlock_page(page); 419 144 put_page(page); ··· 431 146 return status; 432 147 } 433 148 149 + int 150 + iomap_set_page_dirty(struct page *page) 151 + { 152 + struct address_space *mapping = page_mapping(page); 153 + int newly_dirty; 154 + 155 + if (unlikely(!mapping)) 156 + return !TestSetPageDirty(page); 157 + 158 + /* 159 + * Lock out page->mem_cgroup migration to keep PageDirty 160 + * synchronized with per-memcg dirty page counters. 161 + */ 162 + lock_page_memcg(page); 163 + newly_dirty = !TestSetPageDirty(page); 164 + if (newly_dirty) 165 + __set_page_dirty(page, mapping, 0); 166 + unlock_page_memcg(page); 167 + 168 + if (newly_dirty) 169 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 170 + return newly_dirty; 171 + } 172 + EXPORT_SYMBOL_GPL(iomap_set_page_dirty); 173 + 174 + static int 175 + __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 176 + unsigned copied, struct page *page, struct iomap *iomap) 177 + { 178 + flush_dcache_page(page); 179 + 180 + /* 181 + * The blocks that were entirely written will now be uptodate, so we 182 + * don't have to worry about a readpage reading them and overwriting a 183 + * partial write. However if we have encountered a short write and only 184 + * partially written into a block, it will not be marked uptodate, so a 185 + * readpage might come in and destroy our partial write. 186 + * 187 + * Do the simplest thing, and just treat any short write to a non 188 + * uptodate page as a zero-length write, and force the caller to redo 189 + * the whole thing. 190 + */ 191 + if (unlikely(copied < len && !PageUptodate(page))) { 192 + copied = 0; 193 + } else { 194 + SetPageUptodate(page); 195 + iomap_set_page_dirty(page); 196 + } 197 + return __generic_write_end(inode, pos, copied, page); 198 + } 199 + 200 + static int 201 + iomap_write_end_inline(struct inode *inode, struct page *page, 202 + struct iomap *iomap, loff_t pos, unsigned copied) 203 + { 204 + void *addr; 205 + 206 + WARN_ON_ONCE(!PageUptodate(page)); 207 + BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); 208 + 209 + addr = kmap_atomic(page); 210 + memcpy(iomap->inline_data + pos, addr + pos, copied); 211 + kunmap_atomic(addr); 212 + 213 + mark_inode_dirty(inode); 214 + __generic_write_end(inode, pos, copied, page); 215 + return copied; 216 + } 217 + 434 218 static int 435 219 iomap_write_end(struct inode *inode, loff_t pos, unsigned len, 436 - unsigned copied, struct page *page) 220 + unsigned copied, struct page *page, struct iomap *iomap) 437 221 { 438 222 int ret; 439 223 440 - ret = generic_write_end(NULL, inode->i_mapping, pos, len, 441 - copied, page, NULL); 224 + if (iomap->type == IOMAP_INLINE) { 225 + ret = iomap_write_end_inline(inode, page, iomap, pos, copied); 226 + } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 227 + ret = generic_write_end(NULL, inode->i_mapping, pos, len, 228 + copied, page, NULL); 229 + } else { 230 + ret = __iomap_write_end(inode, pos, len, copied, page, iomap); 231 + } 232 + 233 + if (iomap->page_done) 234 + iomap->page_done(inode, pos, copied, page, iomap); 235 + 442 236 if (ret < len) 443 237 iomap_write_failed(inode, pos, len); 444 238 return ret; ··· 572 208 573 209 flush_dcache_page(page); 574 210 575 - status = iomap_write_end(inode, pos, bytes, copied, page); 211 + status = iomap_write_end(inode, pos, bytes, copied, page, 212 + iomap); 576 213 if (unlikely(status < 0)) 577 214 break; 578 215 copied = status; ··· 667 302 668 303 WARN_ON_ONCE(!PageUptodate(page)); 669 304 670 - status = iomap_write_end(inode, pos, bytes, bytes, page); 305 + status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); 671 306 if (unlikely(status <= 0)) { 672 307 if (WARN_ON_ONCE(status == 0)) 673 308 return -EIO; ··· 719 354 zero_user(page, offset, bytes); 720 355 mark_page_accessed(page); 721 356 722 - return iomap_write_end(inode, pos, bytes, bytes, page); 357 + return iomap_write_end(inode, pos, bytes, bytes, page, iomap); 723 358 } 724 359 725 360 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, ··· 805 440 struct page *page = data; 806 441 int ret; 807 442 808 - ret = __block_write_begin_int(page, pos, length, NULL, iomap); 809 - if (ret) 810 - return ret; 443 + if (iomap->flags & IOMAP_F_BUFFER_HEAD) { 444 + ret = __block_write_begin_int(page, pos, length, NULL, iomap); 445 + if (ret) 446 + return ret; 447 + block_commit_write(page, 0, length); 448 + } else { 449 + WARN_ON_ONCE(!PageUptodate(page)); 450 + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE); 451 + } 811 452 812 - block_commit_write(page, 0, length); 813 453 return length; 814 454 } 815 455 ··· 1181 811 atomic_t ref; 1182 812 unsigned flags; 1183 813 int error; 814 + bool wait_for_completion; 1184 815 1185 816 union { 1186 817 /* used during submission and for synchronous completion: */ ··· 1285 914 iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); 1286 915 1287 916 if (atomic_dec_and_test(&dio->ref)) { 1288 - if (is_sync_kiocb(dio->iocb)) { 917 + if (dio->wait_for_completion) { 1289 918 struct task_struct *waiter = dio->submit.waiter; 1290 - 1291 919 WRITE_ONCE(dio->submit.waiter, NULL); 1292 920 wake_up_process(waiter); 1293 921 } else if (dio->flags & IOMAP_DIO_WRITE) { ··· 1333 963 } 1334 964 1335 965 static loff_t 1336 - iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, 1337 - void *data, struct iomap *iomap) 966 + iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, 967 + struct iomap_dio *dio, struct iomap *iomap) 1338 968 { 1339 - struct iomap_dio *dio = data; 1340 969 unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); 1341 970 unsigned int fs_block_size = i_blocksize(inode), pad; 1342 971 unsigned int align = iov_iter_alignment(dio->submit.iter); ··· 1349 980 if ((pos | length | align) & ((1 << blkbits) - 1)) 1350 981 return -EINVAL; 1351 982 1352 - switch (iomap->type) { 1353 - case IOMAP_HOLE: 1354 - if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) 1355 - return -EIO; 1356 - /*FALLTHRU*/ 1357 - case IOMAP_UNWRITTEN: 1358 - if (!(dio->flags & IOMAP_DIO_WRITE)) { 1359 - length = iov_iter_zero(length, dio->submit.iter); 1360 - dio->size += length; 1361 - return length; 1362 - } 983 + if (iomap->type == IOMAP_UNWRITTEN) { 1363 984 dio->flags |= IOMAP_DIO_UNWRITTEN; 1364 985 need_zeroout = true; 1365 - break; 1366 - case IOMAP_MAPPED: 1367 - if (iomap->flags & IOMAP_F_SHARED) 1368 - dio->flags |= IOMAP_DIO_COW; 1369 - if (iomap->flags & IOMAP_F_NEW) { 1370 - need_zeroout = true; 1371 - } else { 1372 - /* 1373 - * Use a FUA write if we need datasync semantics, this 1374 - * is a pure data IO that doesn't require any metadata 1375 - * updates and the underlying device supports FUA. This 1376 - * allows us to avoid cache flushes on IO completion. 1377 - */ 1378 - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 1379 - (dio->flags & IOMAP_DIO_WRITE_FUA) && 1380 - blk_queue_fua(bdev_get_queue(iomap->bdev))) 1381 - use_fua = true; 1382 - } 1383 - break; 1384 - default: 1385 - WARN_ON_ONCE(1); 1386 - return -EIO; 986 + } 987 + 988 + if (iomap->flags & IOMAP_F_SHARED) 989 + dio->flags |= IOMAP_DIO_COW; 990 + 991 + if (iomap->flags & IOMAP_F_NEW) { 992 + need_zeroout = true; 993 + } else { 994 + /* 995 + * Use a FUA write if we need datasync semantics, this 996 + * is a pure data IO that doesn't require any metadata 997 + * updates and the underlying device supports FUA. This 998 + * allows us to avoid cache flushes on IO completion. 999 + */ 1000 + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 1001 + (dio->flags & IOMAP_DIO_WRITE_FUA) && 1002 + blk_queue_fua(bdev_get_queue(iomap->bdev))) 1003 + use_fua = true; 1387 1004 } 1388 1005 1389 1006 /* ··· 1448 1093 return copied; 1449 1094 } 1450 1095 1096 + static loff_t 1097 + iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) 1098 + { 1099 + length = iov_iter_zero(length, dio->submit.iter); 1100 + dio->size += length; 1101 + return length; 1102 + } 1103 + 1104 + static loff_t 1105 + iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, 1106 + struct iomap_dio *dio, struct iomap *iomap) 1107 + { 1108 + struct iov_iter *iter = dio->submit.iter; 1109 + size_t copied; 1110 + 1111 + BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); 1112 + 1113 + if (dio->flags & IOMAP_DIO_WRITE) { 1114 + loff_t size = inode->i_size; 1115 + 1116 + if (pos > size) 1117 + memset(iomap->inline_data + size, 0, pos - size); 1118 + copied = copy_from_iter(iomap->inline_data + pos, length, iter); 1119 + if (copied) { 1120 + if (pos + copied > size) 1121 + i_size_write(inode, pos + copied); 1122 + mark_inode_dirty(inode); 1123 + } 1124 + } else { 1125 + copied = copy_to_iter(iomap->inline_data + pos, length, iter); 1126 + } 1127 + dio->size += copied; 1128 + return copied; 1129 + } 1130 + 1131 + static loff_t 1132 + iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, 1133 + void *data, struct iomap *iomap) 1134 + { 1135 + struct iomap_dio *dio = data; 1136 + 1137 + switch (iomap->type) { 1138 + case IOMAP_HOLE: 1139 + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) 1140 + return -EIO; 1141 + return iomap_dio_hole_actor(length, dio); 1142 + case IOMAP_UNWRITTEN: 1143 + if (!(dio->flags & IOMAP_DIO_WRITE)) 1144 + return iomap_dio_hole_actor(length, dio); 1145 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 1146 + case IOMAP_MAPPED: 1147 + return iomap_dio_bio_actor(inode, pos, length, dio, iomap); 1148 + case IOMAP_INLINE: 1149 + return iomap_dio_inline_actor(inode, pos, length, dio, iomap); 1150 + default: 1151 + WARN_ON_ONCE(1); 1152 + return -EIO; 1153 + } 1154 + } 1155 + 1451 1156 /* 1452 1157 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO 1453 1158 * is being issued as AIO or not. This allows us to optimise pure data writes ··· 1546 1131 dio->end_io = end_io; 1547 1132 dio->error = 0; 1548 1133 dio->flags = 0; 1134 + dio->wait_for_completion = is_sync_kiocb(iocb); 1549 1135 1550 1136 dio->submit.iter = iter; 1551 - if (is_sync_kiocb(iocb)) { 1552 - dio->submit.waiter = current; 1553 - dio->submit.cookie = BLK_QC_T_NONE; 1554 - dio->submit.last_queue = NULL; 1555 - } 1137 + dio->submit.waiter = current; 1138 + dio->submit.cookie = BLK_QC_T_NONE; 1139 + dio->submit.last_queue = NULL; 1556 1140 1557 1141 if (iov_iter_rw(iter) == READ) { 1558 1142 if (pos >= dio->i_size) ··· 1601 1187 dio_warn_stale_pagecache(iocb->ki_filp); 1602 1188 ret = 0; 1603 1189 1604 - if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && 1190 + if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && 1605 1191 !inode->i_sb->s_dio_done_wq) { 1606 1192 ret = sb_init_dio_done_wq(inode->i_sb); 1607 1193 if (ret < 0) ··· 1616 1202 iomap_dio_actor); 1617 1203 if (ret <= 0) { 1618 1204 /* magic error code to fall back to buffered I/O */ 1619 - if (ret == -ENOTBLK) 1205 + if (ret == -ENOTBLK) { 1206 + dio->wait_for_completion = true; 1620 1207 ret = 0; 1208 + } 1621 1209 break; 1622 1210 } 1623 1211 pos += ret; ··· 1640 1224 dio->flags &= ~IOMAP_DIO_NEED_SYNC; 1641 1225 1642 1226 if (!atomic_dec_and_test(&dio->ref)) { 1643 - if (!is_sync_kiocb(iocb)) 1227 + if (!dio->wait_for_completion) 1644 1228 return -EIOCBQUEUED; 1645 1229 1646 1230 for (;;) {
+4 -2
fs/xfs/xfs_iomap.c
··· 626 626 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch 627 627 * them out if the write happens to fail. 628 628 */ 629 - iomap->flags = IOMAP_F_NEW; 629 + iomap->flags |= IOMAP_F_NEW; 630 630 trace_xfs_iomap_alloc(ip, offset, count, 0, &got); 631 631 done: 632 632 if (isnullstartblock(got.br_startblock)) ··· 1032 1032 if (XFS_FORCED_SHUTDOWN(mp)) 1033 1033 return -EIO; 1034 1034 1035 + iomap->flags |= IOMAP_F_BUFFER_HEAD; 1036 + 1035 1037 if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && 1036 1038 !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { 1037 1039 /* Reserve delalloc blocks for regular writeback. */ ··· 1134 1132 if (error) 1135 1133 return error; 1136 1134 1137 - iomap->flags = IOMAP_F_NEW; 1135 + iomap->flags |= IOMAP_F_NEW; 1138 1136 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1139 1137 1140 1138 out_finish:
+16
include/linux/iomap.h
··· 9 9 struct inode; 10 10 struct iov_iter; 11 11 struct kiocb; 12 + struct page; 12 13 struct vm_area_struct; 13 14 struct vm_fault; 14 15 ··· 30 29 */ 31 30 #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ 32 31 #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ 32 + #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ 33 33 34 34 /* 35 35 * Flags that only need to be reported for IOMAP_REPORT requests: ··· 57 55 u16 flags; /* flags for mapping */ 58 56 struct block_device *bdev; /* block device for I/O */ 59 57 struct dax_device *dax_dev; /* dax_dev for dax operations */ 58 + void *inline_data; 59 + void *private; /* filesystem private */ 60 + 61 + /* 62 + * Called when finished processing a page in the mapping returned in 63 + * this iomap. At least for now this is only supported in the buffered 64 + * write path. 65 + */ 66 + void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, 67 + struct page *page, struct iomap *iomap); 60 68 }; 61 69 62 70 /* ··· 100 88 101 89 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, 102 90 const struct iomap_ops *ops); 91 + int iomap_readpage(struct page *page, const struct iomap_ops *ops); 92 + int iomap_readpages(struct address_space *mapping, struct list_head *pages, 93 + unsigned nr_pages, const struct iomap_ops *ops); 94 + int iomap_set_page_dirty(struct page *page); 103 95 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, 104 96 const struct iomap_ops *ops); 105 97 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,