Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ntfs: update mft operations

Refactor MFT record handling to use folio APIs with consistency
validation, and improve the allocation extension and writeback paths
for $MFT and $MFTMirr.

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>

+1284 -1264
+1249 -1234
fs/ntfs/mft.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 /* 3 - * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 3 + * NTFS kernel mft record operations. 4 + * Part of this file is based on code from the NTFS-3G. 4 5 * 5 6 * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 6 7 * Copyright (c) 2002 Richard Russon 8 + * Copyright (c) 2025 LG Electronics Co., Ltd. 7 9 */ 8 10 9 - #include <linux/buffer_head.h> 10 - #include <linux/slab.h> 11 - #include <linux/swap.h> 11 + #include <linux/writeback.h> 12 12 #include <linux/bio.h> 13 + #include <linux/iomap.h> 13 14 14 - #include "attrib.h" 15 - #include "aops.h" 16 15 #include "bitmap.h" 17 - #include "debug.h" 18 - #include "dir.h" 19 16 #include "lcnalloc.h" 20 - #include "malloc.h" 21 17 #include "mft.h" 22 18 #include "ntfs.h" 23 19 24 - #define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) 20 + /* 21 + * ntfs_mft_record_check - Check the consistency of an MFT record 22 + * 23 + * Make sure its general fields are safe, then examine all its 24 + * attributes and apply generic checks to them. 25 + * 26 + * Returns 0 if the checks are successful. If not, return -EIO. 
27 + */ 28 + int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m, 29 + unsigned long mft_no) 30 + { 31 + struct attr_record *a; 32 + struct super_block *sb = vol->sb; 25 33 26 - /** 27 - * map_mft_record_page - map the page in which a specific mft record resides 34 + if (!ntfs_is_file_record(m->magic)) { 35 + ntfs_error(sb, "Record %llu has no FILE magic (0x%x)\n", 36 + (unsigned long long)mft_no, le32_to_cpu(*(__le32 *)m)); 37 + goto err_out; 38 + } 39 + 40 + if (le16_to_cpu(m->usa_ofs) & 0x1 || 41 + (vol->mft_record_size >> NTFS_BLOCK_SIZE_BITS) + 1 != le16_to_cpu(m->usa_count) || 42 + le16_to_cpu(m->usa_ofs) + le16_to_cpu(m->usa_count) * 2 > vol->mft_record_size) { 43 + ntfs_error(sb, "Record %llu has corrupt fix-up values fields\n", 44 + (unsigned long long)mft_no); 45 + goto err_out; 46 + } 47 + 48 + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { 49 + ntfs_error(sb, "Record %llu has corrupt allocation size (%u <> %u)\n", 50 + (unsigned long long)mft_no, 51 + vol->mft_record_size, 52 + le32_to_cpu(m->bytes_allocated)); 53 + goto err_out; 54 + } 55 + 56 + if (le32_to_cpu(m->bytes_in_use) > vol->mft_record_size) { 57 + ntfs_error(sb, "Record %llu has corrupt in-use size (%u > %u)\n", 58 + (unsigned long long)mft_no, 59 + le32_to_cpu(m->bytes_in_use), 60 + vol->mft_record_size); 61 + goto err_out; 62 + } 63 + 64 + if (le16_to_cpu(m->attrs_offset) & 7) { 65 + ntfs_error(sb, "Attributes badly aligned in record %llu\n", 66 + (unsigned long long)mft_no); 67 + goto err_out; 68 + } 69 + 70 + a = (struct attr_record *)((char *)m + le16_to_cpu(m->attrs_offset)); 71 + if ((char *)a < (char *)m || (char *)a > (char *)m + vol->mft_record_size) { 72 + ntfs_error(sb, "Record %llu is corrupt\n", 73 + (unsigned long long)mft_no); 74 + goto err_out; 75 + } 76 + 77 + return 0; 78 + 79 + err_out: 80 + return -EIO; 81 + } 82 + 83 + /* 84 + * map_mft_record_folio - map the folio in which a specific mft record resides 28 85 * @ni: ntfs inode 
whose mft record page to map 29 86 * 30 - * This maps the page in which the mft record of the ntfs inode @ni is situated 31 - * and returns a pointer to the mft record within the mapped page. 87 + * This maps the folio in which the mft record of the ntfs inode @ni is 88 + * situated. 32 89 * 33 - * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() 34 - * contains the negative error code returned. 90 + * This allocates a new buffer (@ni->mrec), copies the MFT record data from 91 + * the mapped folio into this buffer, and applies the MST (Multi Sector 92 + * Transfer) fixups on the copy. 93 + * 94 + * The folio is pinned (referenced) in @ni->folio to ensure the data remains 95 + * valid in the page cache, but the returned pointer is the allocated copy. 96 + * 97 + * Return: A pointer to the allocated and fixed-up mft record (@ni->mrec). 98 + * The return value needs to be checked with IS_ERR(). If it is true, 99 + * PTR_ERR() contains the negative error code. 35 100 */ 36 - static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) 101 + static inline struct mft_record *map_mft_record_folio(struct ntfs_inode *ni) 37 102 { 38 103 loff_t i_size; 39 - ntfs_volume *vol = ni->vol; 104 + struct ntfs_volume *vol = ni->vol; 40 105 struct inode *mft_vi = vol->mft_ino; 41 - struct page *page; 106 + struct folio *folio; 42 107 unsigned long index, end_index; 43 - unsigned ofs; 108 + unsigned int ofs; 44 109 45 - BUG_ON(ni->page); 110 + WARN_ON(ni->folio); 46 111 /* 47 112 * The index into the page cache and the offset within the page cache 48 - * page of the wanted mft record. FIXME: We need to check for 49 - * overflowing the unsigned long, but I don't think we would ever get 50 - * here if the volume was that big... 113 + * page of the wanted mft record. 
51 114 */ 52 - index = (u64)ni->mft_no << vol->mft_record_size_bits >> 53 - PAGE_SHIFT; 54 - ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; 115 + index = NTFS_MFT_NR_TO_PIDX(vol, ni->mft_no); 116 + ofs = NTFS_MFT_NR_TO_POFS(vol, ni->mft_no); 55 117 56 118 i_size = i_size_read(mft_vi); 57 119 /* The maximum valid index into the page cache for $MFT's data. */ ··· 123 61 if (unlikely(index >= end_index)) { 124 62 if (index > end_index || (i_size & ~PAGE_MASK) < ofs + 125 63 vol->mft_record_size) { 126 - page = ERR_PTR(-ENOENT); 127 - ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " 128 - "which is beyond the end of the mft. " 129 - "This is probably a bug in the ntfs " 130 - "driver.", ni->mft_no); 64 + folio = ERR_PTR(-ENOENT); 65 + ntfs_error(vol->sb, 66 + "Attempt to read mft record 0x%lx, which is beyond the end of the mft. This is probably a bug in the ntfs driver.", 67 + ni->mft_no); 131 68 goto err_out; 132 69 } 133 70 } 134 - /* Read, map, and pin the page. */ 135 - page = ntfs_map_page(mft_vi->i_mapping, index); 136 - if (!IS_ERR(page)) { 137 - /* Catch multi sector transfer fixup errors. */ 138 - if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + 139 - ofs)))) { 140 - ni->page = page; 141 - ni->page_ofs = ofs; 142 - return page_address(page) + ofs; 71 + 72 + /* Read, map, and pin the folio. */ 73 + folio = read_mapping_folio(mft_vi->i_mapping, index, NULL); 74 + if (!IS_ERR(folio)) { 75 + u8 *addr; 76 + 77 + ni->mrec = kmalloc(vol->mft_record_size, GFP_NOFS); 78 + if (!ni->mrec) { 79 + folio_put(folio); 80 + folio = ERR_PTR(-ENOMEM); 81 + goto err_out; 143 82 } 144 - ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. 
" 145 - "Run chkdsk.", ni->mft_no); 146 - ntfs_unmap_page(page); 147 - page = ERR_PTR(-EIO); 83 + 84 + addr = kmap_local_folio(folio, 0); 85 + memcpy(ni->mrec, addr + ofs, vol->mft_record_size); 86 + post_read_mst_fixup((struct ntfs_record *)ni->mrec, vol->mft_record_size); 87 + 88 + /* Catch multi sector transfer fixup errors. */ 89 + if (!ntfs_mft_record_check(vol, (struct mft_record *)ni->mrec, ni->mft_no)) { 90 + kunmap_local(addr); 91 + ni->folio = folio; 92 + ni->folio_ofs = ofs; 93 + return ni->mrec; 94 + } 95 + kunmap_local(addr); 96 + folio_put(folio); 97 + kfree(ni->mrec); 98 + ni->mrec = NULL; 99 + folio = ERR_PTR(-EIO); 148 100 NVolSetErrors(vol); 149 101 } 150 102 err_out: 151 - ni->page = NULL; 152 - ni->page_ofs = 0; 153 - return (void*)page; 103 + ni->folio = NULL; 104 + ni->folio_ofs = 0; 105 + return (struct mft_record *)folio; 154 106 } 155 107 156 - /** 157 - * map_mft_record - map, pin and lock an mft record 108 + /* 109 + * map_mft_record - map and pin an mft record 158 110 * @ni: ntfs inode whose MFT record to map 159 111 * 160 - * First, take the mrec_lock mutex. We might now be sleeping, while waiting 161 - * for the mutex if it was already locked by someone else. 112 + * This function ensures the MFT record for the given inode is mapped and 113 + * accessible. 162 114 * 163 - * The page of the record is mapped using map_mft_record_page() before being 164 - * returned to the caller. 115 + * It increments the reference count of the ntfs inode. If the record is 116 + * already mapped (@ni->folio is set), it returns the cached record 117 + * immediately. 165 118 * 166 - * This in turn uses ntfs_map_page() to get the page containing the wanted mft 167 - * record (it in turn calls read_cache_page() which reads it in from disk if 168 - * necessary, increments the use count on the page so that it cannot disappear 169 - * under us and returns a reference to the page cache page). 
119 + * Otherwise, it calls map_mft_record_folio() to read the folio from disk 120 + * (if necessary via read_mapping_folio), allocate a buffer, and copy the 121 + * record data. 170 122 * 171 - * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it 172 - * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed 173 - * and the post-read mst fixups on each mft record in the page have been 174 - * performed, the page gets PG_uptodate set and PG_locked cleared (this is done 175 - * in our asynchronous I/O completion handler end_buffer_read_mft_async()). 176 - * ntfs_map_page() waits for PG_locked to become clear and checks if 177 - * PG_uptodate is set and returns an error code if not. This provides 178 - * sufficient protection against races when reading/using the page. 179 - * 180 - * However there is the write mapping to think about. Doing the above described 181 - * checking here will be fine, because when initiating the write we will set 182 - * PG_locked and clear PG_uptodate making sure nobody is touching the page 183 - * contents. Doing the locking this way means that the commit to disk code in 184 - * the page cache code paths is automatically sufficiently locked with us as 185 - * we will not touch a page that has been locked or is not uptodate. The only 186 - * locking problem then is them locking the page while we are accessing it. 187 - * 188 - * So that code will end up having to own the mrec_lock of all mft 189 - * records/inodes present in the page before I/O can proceed. In that case we 190 - * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be 191 - * accessing anything without owning the mrec_lock mutex. But we do need to 192 - * use them because of the read_cache_page() invocation and the code becomes so 193 - * much simpler this way that it is well worth it. 194 - * 195 - * The mft record is now ours and we return a pointer to it. 
You need to check 196 - * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return 197 - * the error code. 198 - * 199 - * NOTE: Caller is responsible for setting the mft record dirty before calling 200 - * unmap_mft_record(). This is obviously only necessary if the caller really 201 - * modified the mft record... 202 - * Q: Do we want to recycle one of the VFS inode state bits instead? 203 - * A: No, the inode ones mean we want to change the mft record, not we want to 204 - * write it out. 123 + * Return: A pointer to the mft record. You need to check the returned 124 + * pointer with IS_ERR(). 205 125 */ 206 - MFT_RECORD *map_mft_record(ntfs_inode *ni) 126 + struct mft_record *map_mft_record(struct ntfs_inode *ni) 207 127 { 208 - MFT_RECORD *m; 128 + struct mft_record *m; 129 + 130 + if (!ni) 131 + return ERR_PTR(-EINVAL); 209 132 210 133 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); 211 134 212 135 /* Make sure the ntfs inode doesn't go away. */ 213 136 atomic_inc(&ni->count); 214 137 215 - /* Serialize access to this mft record. */ 216 - mutex_lock(&ni->mrec_lock); 138 + if (ni->folio) 139 + return (struct mft_record *)ni->mrec; 217 140 218 - m = map_mft_record_page(ni); 141 + m = map_mft_record_folio(ni); 219 142 if (!IS_ERR(m)) 220 143 return m; 221 144 222 - mutex_unlock(&ni->mrec_lock); 223 145 atomic_dec(&ni->count); 224 146 ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); 225 147 return m; 226 148 } 227 149 228 - /** 229 - * unmap_mft_record_page - unmap the page in which a specific mft record resides 230 - * @ni: ntfs inode whose mft record page to unmap 231 - * 232 - * This unmaps the page in which the mft record of the ntfs inode @ni is 233 - * situated and returns. This is a NOOP if highmem is not configured. 234 - * 235 - * The unmap happens via ntfs_unmap_page() which in turn decrements the use 236 - * count on the page thus releasing it from the pinned state. 
237 - * 238 - * We do not actually unmap the page from memory of course, as that will be 239 - * done by the page cache code itself when memory pressure increases or 240 - * whatever. 241 - */ 242 - static inline void unmap_mft_record_page(ntfs_inode *ni) 243 - { 244 - BUG_ON(!ni->page); 245 - 246 - // TODO: If dirty, blah... 247 - ntfs_unmap_page(ni->page); 248 - ni->page = NULL; 249 - ni->page_ofs = 0; 250 - return; 251 - } 252 - 253 - /** 254 - * unmap_mft_record - release a mapped mft record 150 + /* 151 + * unmap_mft_record - release a reference to a mapped mft record 255 152 * @ni: ntfs inode whose MFT record to unmap 256 153 * 257 - * We release the page mapping and the mrec_lock mutex which unmaps the mft 258 - * record and releases it for others to get hold of. We also release the ntfs 259 - * inode by decrementing the ntfs inode reference count. 154 + * This decrements the reference count of the ntfs inode. 155 + * 156 + * It releases the caller's hold on the inode. If the reference count indicates 157 + * that there are still other users (count > 1), the function returns 158 + * immediately, keeping the resources (folio and mrec buffer) pinned for 159 + * those users. 260 160 * 261 161 * NOTE: If caller has modified the mft record, it is imperative to set the mft 262 162 * record dirty BEFORE calling unmap_mft_record(). 263 163 */ 264 - void unmap_mft_record(ntfs_inode *ni) 164 + void unmap_mft_record(struct ntfs_inode *ni) 265 165 { 266 - struct page *page = ni->page; 166 + struct folio *folio; 267 167 268 - BUG_ON(!page); 168 + if (!ni) 169 + return; 269 170 270 171 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); 271 172 272 - unmap_mft_record_page(ni); 273 - mutex_unlock(&ni->mrec_lock); 274 - atomic_dec(&ni->count); 275 - /* 276 - * If pure ntfs_inode, i.e. 
no vfs inode attached, we leave it to 277 - * ntfs_clear_extent_inode() in the extent inode case, and to the 278 - * caller in the non-extent, yet pure ntfs inode case, to do the actual 279 - * tear down of all structures and freeing of all allocated memory. 280 - */ 281 - return; 173 + folio = ni->folio; 174 + if (atomic_dec_return(&ni->count) > 1) 175 + return; 176 + WARN_ON(!folio); 282 177 } 283 178 284 - /** 179 + /* 285 180 * map_extent_mft_record - load an extent inode and attach it to its base 286 181 * @base_ni: base ntfs inode 287 182 * @mref: mft reference of the extent inode to load 288 - * @ntfs_ino: on successful return, pointer to the ntfs_inode structure 183 + * @ntfs_ino: on successful return, pointer to the struct ntfs_inode structure 289 184 * 290 185 * Load the extent mft record @mref and attach it to its base inode @base_ni. 291 186 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise ··· 251 232 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode 252 233 * structure of the mapped extent inode. 253 234 */ 254 - MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, 255 - ntfs_inode **ntfs_ino) 235 + struct mft_record *map_extent_mft_record(struct ntfs_inode *base_ni, u64 mref, 236 + struct ntfs_inode **ntfs_ino) 256 237 { 257 - MFT_RECORD *m; 258 - ntfs_inode *ni = NULL; 259 - ntfs_inode **extent_nis = NULL; 238 + struct mft_record *m; 239 + struct ntfs_inode *ni = NULL; 240 + struct ntfs_inode **extent_nis = NULL; 260 241 int i; 261 242 unsigned long mft_no = MREF(mref); 262 243 u16 seq_no = MSEQNO(mref); ··· 271 252 * in which case just return it. If not found, add it to the base 272 253 * inode before returning it. 
273 254 */ 255 + retry: 274 256 mutex_lock(&base_ni->extent_lock); 275 257 if (base_ni->nr_extents > 0) { 276 258 extent_nis = base_ni->ext.extent_ntfs_inos; ··· 299 279 return m; 300 280 } 301 281 unmap_mft_record(ni); 302 - ntfs_error(base_ni->vol->sb, "Found stale extent mft " 303 - "reference! Corrupt filesystem. " 304 - "Run chkdsk."); 282 + ntfs_error(base_ni->vol->sb, 283 + "Found stale extent mft reference! Corrupt filesystem. Run chkdsk."); 305 284 return ERR_PTR(-EIO); 306 285 } 307 286 map_err_out: 308 - ntfs_error(base_ni->vol->sb, "Failed to map extent " 309 - "mft record, error code %ld.", -PTR_ERR(m)); 287 + ntfs_error(base_ni->vol->sb, 288 + "Failed to map extent mft record, error code %ld.", 289 + -PTR_ERR(m)); 310 290 return m; 311 291 } 292 + mutex_unlock(&base_ni->extent_lock); 293 + 312 294 /* Record wasn't there. Get a new ntfs inode and initialize it. */ 313 295 ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); 314 296 if (unlikely(!ni)) { 315 - mutex_unlock(&base_ni->extent_lock); 316 297 atomic_dec(&base_ni->count); 317 298 return ERR_PTR(-ENOMEM); 318 299 } ··· 324 303 /* Now map the record. */ 325 304 m = map_mft_record(ni); 326 305 if (IS_ERR(m)) { 327 - mutex_unlock(&base_ni->extent_lock); 328 306 atomic_dec(&base_ni->count); 329 307 ntfs_clear_extent_inode(ni); 330 308 goto map_err_out; 331 309 } 332 310 /* Verify the sequence number if it is present. */ 333 311 if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { 334 - ntfs_error(base_ni->vol->sb, "Found stale extent mft " 335 - "reference! Corrupt filesystem. Run chkdsk."); 312 + ntfs_error(base_ni->vol->sb, 313 + "Found stale extent mft reference! Corrupt filesystem. 
Run chkdsk."); 336 314 destroy_ni = true; 337 315 m = ERR_PTR(-EIO); 338 - goto unm_err_out; 316 + goto unm_nolock_err_out; 317 + } 318 + 319 + mutex_lock(&base_ni->extent_lock); 320 + for (i = 0; i < base_ni->nr_extents; i++) { 321 + if (mft_no == extent_nis[i]->mft_no) { 322 + mutex_unlock(&base_ni->extent_lock); 323 + ntfs_clear_extent_inode(ni); 324 + goto retry; 325 + } 339 326 } 340 327 /* Attach extent inode to base inode, reallocating memory if needed. */ 341 328 if (!(base_ni->nr_extents & 3)) { 342 - ntfs_inode **tmp; 343 - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); 329 + struct ntfs_inode **tmp; 330 + int new_size = (base_ni->nr_extents + 4) * sizeof(struct ntfs_inode *); 344 331 345 - tmp = kmalloc(new_size, GFP_NOFS); 332 + tmp = kvzalloc(new_size, GFP_NOFS); 346 333 if (unlikely(!tmp)) { 347 - ntfs_error(base_ni->vol->sb, "Failed to allocate " 348 - "internal buffer."); 334 + ntfs_error(base_ni->vol->sb, "Failed to allocate internal buffer."); 349 335 destroy_ni = true; 350 336 m = ERR_PTR(-ENOMEM); 351 337 goto unm_err_out; 352 338 } 353 339 if (base_ni->nr_extents) { 354 - BUG_ON(!base_ni->ext.extent_ntfs_inos); 340 + WARN_ON(!base_ni->ext.extent_ntfs_inos); 355 341 memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - 356 - 4 * sizeof(ntfs_inode *)); 357 - kfree(base_ni->ext.extent_ntfs_inos); 342 + 4 * sizeof(struct ntfs_inode *)); 343 + kvfree(base_ni->ext.extent_ntfs_inos); 358 344 } 359 345 base_ni->ext.extent_ntfs_inos = tmp; 360 346 } ··· 372 344 *ntfs_ino = ni; 373 345 return m; 374 346 unm_err_out: 375 - unmap_mft_record(ni); 376 347 mutex_unlock(&base_ni->extent_lock); 348 + unm_nolock_err_out: 349 + unmap_mft_record(ni); 377 350 atomic_dec(&base_ni->count); 378 351 /* 379 352 * If the extent inode was not attached to the base inode we need to ··· 385 356 return m; 386 357 } 387 358 388 - #ifdef NTFS_RW 389 - 390 - /** 391 - * __mark_mft_record_dirty - set the mft record and the page containing it dirty 359 + /* 360 + 
* __mark_mft_record_dirty - mark the base vfs inode dirty 392 361 * @ni: ntfs inode describing the mapped mft record 393 362 * 394 363 * Internal function. Users should call mark_mft_record_dirty() instead. 395 364 * 396 - * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, 397 - * as well as the page containing the mft record, dirty. Also, mark the base 398 - * vfs inode dirty. This ensures that any changes to the mft record are 399 - * written out to disk. 365 + * This function determines the base ntfs inode (in case @ni is an extent 366 + * inode) and marks the corresponding VFS inode dirty. 400 367 * 401 368 * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) 402 369 * on the base vfs inode, because even though file data may have been modified, ··· 406 381 * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, 407 382 * which is not what I_DIRTY_SYNC on its own would suggest. 408 383 */ 409 - void __mark_mft_record_dirty(ntfs_inode *ni) 384 + void __mark_mft_record_dirty(struct ntfs_inode *ni) 410 385 { 411 - ntfs_inode *base_ni; 386 + struct ntfs_inode *base_ni; 412 387 413 388 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); 414 - BUG_ON(NInoAttr(ni)); 415 - mark_ntfs_record_dirty(ni->page, ni->page_ofs); 389 + WARN_ON(NInoAttr(ni)); 416 390 /* Determine the base vfs inode and mark it dirty, too. */ 417 - mutex_lock(&ni->extent_lock); 418 391 if (likely(ni->nr_extents >= 0)) 419 392 base_ni = ni; 420 393 else 421 394 base_ni = ni->ext.base_ntfs_ino; 422 - mutex_unlock(&ni->extent_lock); 423 395 __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); 424 396 } 425 397 426 - static const char *ntfs_please_email = "Please email " 427 - "linux-ntfs-dev@lists.sourceforge.net and say that you saw " 428 - "this message. 
Thank you."; 429 - 430 - /** 431 - * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror 432 - * @vol: ntfs volume on which the mft record to synchronize resides 433 - * @mft_no: mft record number of mft record to synchronize 434 - * @m: mapped, mst protected (extent) mft record to synchronize 398 + /* 399 + * ntfs_bio_end_io - bio completion callback for MFT record writes 435 400 * 436 - * Write the mapped, mst protected (extent) mft record @m with mft record 437 - * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, 438 - * bypassing the page cache and the $MFTMirr inode itself. 439 - * 440 - * This function is only for use at umount time when the mft mirror inode has 441 - * already been disposed off. We BUG() if we are called while the mft mirror 442 - * inode is still attached to the volume. 443 - * 444 - * On success return 0. On error return -errno. 445 - * 446 - * NOTE: This function is not implemented yet as I am not convinced it can 447 - * actually be triggered considering the sequence of commits we do in super.c:: 448 - * ntfs_put_super(). But just in case we provide this place holder as the 449 - * alternative would be either to BUG() or to get a NULL pointer dereference 450 - * and Oops. 401 + * Decrements the folio reference count that was incremented before 402 + * submit_bio(). This prevents a race condition where umount could 403 + * evict the inode and release the folio while I/O is still in flight, 404 + * potentially causing data corruption or use-after-free. 451 405 */ 452 - static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, 453 - const unsigned long mft_no, MFT_RECORD *m) 406 + static void ntfs_bio_end_io(struct bio *bio) 454 407 { 455 - BUG_ON(vol->mftmirr_ino); 456 - ntfs_error(vol->sb, "Umount time mft mirror syncing is not " 457 - "implemented yet. 
%s", ntfs_please_email); 458 - return -EOPNOTSUPP; 408 + if (bio->bi_private) 409 + folio_put((struct folio *)bio->bi_private); 410 + bio_put(bio); 459 411 } 460 412 461 - /** 413 + /* 462 414 * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror 463 415 * @vol: ntfs volume on which the mft record to synchronize resides 464 416 * @mft_no: mft record number of mft record to synchronize 465 417 * @m: mapped, mst protected (extent) mft record to synchronize 466 - * @sync: if true, wait for i/o completion 467 418 * 468 419 * Write the mapped, mst protected (extent) mft record @m with mft record 469 420 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. ··· 447 446 * On success return 0. On error return -errno and set the volume errors flag 448 447 * in the ntfs volume @vol. 449 448 * 450 - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. 451 - * 452 - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just 453 - * schedule i/o via ->writepage or do it via kntfsd or whatever. 449 + * NOTE: We always perform synchronous i/o. 
454 450 */ 455 - int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, 456 - MFT_RECORD *m, int sync) 451 + int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const unsigned long mft_no, 452 + struct mft_record *m) 457 453 { 458 - struct page *page; 459 - unsigned int blocksize = vol->sb->s_blocksize; 460 - int max_bhs = vol->mft_record_size / blocksize; 461 - struct buffer_head *bhs[MAX_BHS]; 462 - struct buffer_head *bh, *head; 463 - u8 *kmirr; 464 - runlist_element *rl; 465 - unsigned int block_start, block_end, m_start, m_end, page_ofs; 466 - int i_bhs, nr_bhs, err = 0; 467 - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; 454 + u8 *kmirr = NULL; 455 + struct folio *folio; 456 + unsigned int folio_ofs, lcn_folio_off = 0; 457 + int err = 0; 458 + struct bio *bio; 468 459 469 460 ntfs_debug("Entering for inode 0x%lx.", mft_no); 470 - BUG_ON(!max_bhs); 471 - if (WARN_ON(max_bhs > MAX_BHS)) 472 - return -EINVAL; 461 + 473 462 if (unlikely(!vol->mftmirr_ino)) { 474 463 /* This could happen during umount... */ 475 - err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); 476 - if (likely(!err)) 477 - return err; 464 + err = -EIO; 478 465 goto err_out; 479 466 } 480 467 /* Get the page containing the mirror copy of the mft record @m. */ 481 - page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> 482 - (PAGE_SHIFT - vol->mft_record_size_bits)); 483 - if (IS_ERR(page)) { 468 + folio = read_mapping_folio(vol->mftmirr_ino->i_mapping, 469 + NTFS_MFT_NR_TO_PIDX(vol, mft_no), NULL); 470 + if (IS_ERR(folio)) { 484 471 ntfs_error(vol->sb, "Failed to map mft mirror page."); 485 - err = PTR_ERR(page); 472 + err = PTR_ERR(folio); 486 473 goto err_out; 487 474 } 488 - lock_page(page); 489 - BUG_ON(!PageUptodate(page)); 490 - ClearPageUptodate(page); 475 + 476 + folio_lock(folio); 477 + folio_clear_uptodate(folio); 491 478 /* Offset of the mft mirror record inside the page. 
*/ 492 - page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; 479 + folio_ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no); 493 480 /* The address in the page of the mirror copy of the mft record @m. */ 494 - kmirr = page_address(page) + page_ofs; 481 + kmirr = kmap_local_folio(folio, 0) + folio_ofs; 495 482 /* Copy the mst protected mft record to the mirror. */ 496 483 memcpy(kmirr, m, vol->mft_record_size); 497 - /* Create uptodate buffers if not present. */ 498 - if (unlikely(!page_has_buffers(page))) { 499 - struct buffer_head *tail; 500 484 501 - bh = head = alloc_page_buffers(page, blocksize, true); 502 - do { 503 - set_buffer_uptodate(bh); 504 - tail = bh; 505 - bh = bh->b_this_page; 506 - } while (bh); 507 - tail->b_this_page = head; 508 - attach_page_private(page, head); 485 + if (vol->cluster_size_bits > PAGE_SHIFT) { 486 + lcn_folio_off = folio->index << PAGE_SHIFT; 487 + lcn_folio_off &= vol->cluster_size_mask; 509 488 } 510 - bh = head = page_buffers(page); 511 - BUG_ON(!bh); 512 - rl = NULL; 513 - nr_bhs = 0; 514 - block_start = 0; 515 - m_start = kmirr - (u8*)page_address(page); 516 - m_end = m_start + vol->mft_record_size; 517 - do { 518 - block_end = block_start + blocksize; 519 - /* If the buffer is outside the mft record, skip it. */ 520 - if (block_end <= m_start) 521 - continue; 522 - if (unlikely(block_start >= m_end)) 523 - break; 524 - /* Need to map the buffer if it is not mapped already. */ 525 - if (unlikely(!buffer_mapped(bh))) { 526 - VCN vcn; 527 - LCN lcn; 528 - unsigned int vcn_ofs; 529 489 530 - bh->b_bdev = vol->sb->s_bdev; 531 - /* Obtain the vcn and offset of the current block. 
*/ 532 - vcn = ((VCN)mft_no << vol->mft_record_size_bits) + 533 - (block_start - m_start); 534 - vcn_ofs = vcn & vol->cluster_size_mask; 535 - vcn >>= vol->cluster_size_bits; 536 - if (!rl) { 537 - down_read(&NTFS_I(vol->mftmirr_ino)-> 538 - runlist.lock); 539 - rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; 540 - /* 541 - * $MFTMirr always has the whole of its runlist 542 - * in memory. 543 - */ 544 - BUG_ON(!rl); 545 - } 546 - /* Seek to element containing target vcn. */ 547 - while (rl->length && rl[1].vcn <= vcn) 548 - rl++; 549 - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); 550 - /* For $MFTMirr, only lcn >= 0 is a successful remap. */ 551 - if (likely(lcn >= 0)) { 552 - /* Setup buffer head to correct block. */ 553 - bh->b_blocknr = ((lcn << 554 - vol->cluster_size_bits) + 555 - vcn_ofs) >> blocksize_bits; 556 - set_buffer_mapped(bh); 557 - } else { 558 - bh->b_blocknr = -1; 559 - ntfs_error(vol->sb, "Cannot write mft mirror " 560 - "record 0x%lx because its " 561 - "location on disk could not " 562 - "be determined (error code " 563 - "%lli).", mft_no, 564 - (long long)lcn); 565 - err = -EIO; 566 - } 567 - } 568 - BUG_ON(!buffer_uptodate(bh)); 569 - BUG_ON(!nr_bhs && (m_start != block_start)); 570 - BUG_ON(nr_bhs >= max_bhs); 571 - bhs[nr_bhs++] = bh; 572 - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); 573 - } while (block_start = block_end, (bh = bh->b_this_page) != head); 574 - if (unlikely(rl)) 575 - up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); 576 - if (likely(!err)) { 577 - /* Lock buffers and start synchronous write i/o on them. 
*/ 578 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { 579 - struct buffer_head *tbh = bhs[i_bhs]; 490 + bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO); 491 + bio->bi_iter.bi_sector = 492 + NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) + 493 + lcn_folio_off + folio_ofs); 580 494 581 - if (!trylock_buffer(tbh)) 582 - BUG(); 583 - BUG_ON(!buffer_uptodate(tbh)); 584 - clear_buffer_dirty(tbh); 585 - get_bh(tbh); 586 - tbh->b_end_io = end_buffer_write_sync; 587 - submit_bh(REQ_OP_WRITE, tbh); 588 - } 589 - /* Wait on i/o completion of buffers. */ 590 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { 591 - struct buffer_head *tbh = bhs[i_bhs]; 592 - 593 - wait_on_buffer(tbh); 594 - if (unlikely(!buffer_uptodate(tbh))) { 595 - err = -EIO; 596 - /* 597 - * Set the buffer uptodate so the page and 598 - * buffer states do not become out of sync. 599 - */ 600 - set_buffer_uptodate(tbh); 601 - } 602 - } 603 - } else /* if (unlikely(err)) */ { 604 - /* Clean the buffers. */ 605 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) 606 - clear_buffer_dirty(bhs[i_bhs]); 495 + if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) { 496 + err = -EIO; 497 + bio_put(bio); 498 + goto unlock_folio; 607 499 } 500 + 501 + bio->bi_end_io = ntfs_bio_end_io; 502 + submit_bio(bio); 608 503 /* Current state: all buffers are clean, unlocked, and uptodate. */ 609 - /* Remove the mst protection fixups again. 
*/ 610 - post_write_mst_fixup((NTFS_RECORD*)kmirr); 611 - flush_dcache_page(page); 612 - SetPageUptodate(page); 613 - unlock_page(page); 614 - ntfs_unmap_page(page); 504 + folio_mark_uptodate(folio); 505 + 506 + unlock_folio: 507 + folio_unlock(folio); 508 + kunmap_local(kmirr); 509 + folio_put(folio); 615 510 if (likely(!err)) { 616 511 ntfs_debug("Done."); 617 512 } else { 618 - ntfs_error(vol->sb, "I/O error while writing mft mirror " 619 - "record 0x%lx!", mft_no); 513 + ntfs_error(vol->sb, "I/O error while writing mft mirror record 0x%lx!", mft_no); 620 514 err_out: 621 - ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " 622 - "code %i). Volume will be left marked dirty " 623 - "on umount. Run ntfsfix on the partition " 624 - "after umounting to correct this.", -err); 515 + ntfs_error(vol->sb, 516 + "Failed to synchronize $MFTMirr (error code %i). Volume will be left marked dirty on umount. Run chkdsk on the partition after umounting to correct this.", 517 + err); 625 518 NVolSetErrors(vol); 626 519 } 627 520 return err; 628 521 } 629 522 630 - /** 523 + /* 631 524 * write_mft_record_nolock - write out a mapped (extent) mft record 632 525 * @ni: ntfs inode describing the mapped (extent) mft record 633 526 * @m: mapped (extent) mft record to write ··· 531 636 * ntfs inode @ni to backing store. If the mft record @m has a counterpart in 532 637 * the mft mirror, that is also updated. 533 638 * 534 - * We only write the mft record if the ntfs inode @ni is dirty and the first 535 - * buffer belonging to its mft record is dirty, too. We ignore the dirty state 536 - * of subsequent buffers because we could have raced with 537 - * fs/ntfs/aops.c::mark_ntfs_record_dirty(). 639 + * We only write the mft record if the ntfs inode @ni is dirty. 538 640 * 539 - * On success, clean the mft record and return 0. On error, leave the mft 540 - * record dirty and return -errno. 541 - * 542 - * NOTE: We always perform synchronous i/o and ignore the @sync parameter. 
543 - * However, if the mft record has a counterpart in the mft mirror and @sync is 544 - * true, we write the mft record, wait for i/o completion, and only then write 545 - * the mft mirror copy. This ensures that if the system crashes either the mft 546 - * or the mft mirror will contain a self-consistent mft record @m. If @sync is 547 - * false on the other hand, we start i/o on both and then wait for completion 548 - * on them. This provides a speedup but no longer guarantees that you will end 549 - * up with a self-consistent mft record in the case of a crash but if you asked 550 - * for asynchronous writing you probably do not care about that anyway. 551 - * 552 - * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just 553 - * schedule i/o via ->writepage or do it via kntfsd or whatever. 641 + * On success, clean the mft record and return 0. 642 + * On error (specifically ENOMEM), we redirty the record so it can be retried. 643 + * For other errors, we mark the volume with errors. 
554 644 */ 555 - int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) 645 + int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int sync) 556 646 { 557 - ntfs_volume *vol = ni->vol; 558 - struct page *page = ni->page; 559 - unsigned int blocksize = vol->sb->s_blocksize; 560 - unsigned char blocksize_bits = vol->sb->s_blocksize_bits; 561 - int max_bhs = vol->mft_record_size / blocksize; 562 - struct buffer_head *bhs[MAX_BHS]; 563 - struct buffer_head *bh, *head; 564 - runlist_element *rl; 565 - unsigned int block_start, block_end, m_start, m_end; 566 - int i_bhs, nr_bhs, err = 0; 647 + struct ntfs_volume *vol = ni->vol; 648 + struct folio *folio = ni->folio; 649 + int err = 0, i = 0; 650 + u8 *kaddr; 651 + struct mft_record *fixup_m; 652 + struct bio *bio; 653 + unsigned int offset = 0, folio_size; 567 654 568 655 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); 569 - BUG_ON(NInoAttr(ni)); 570 - BUG_ON(!max_bhs); 571 - BUG_ON(!PageLocked(page)); 572 - if (WARN_ON(max_bhs > MAX_BHS)) { 573 - err = -EINVAL; 574 - goto err_out; 575 - } 656 + 657 + WARN_ON(NInoAttr(ni)); 658 + WARN_ON(!folio_test_locked(folio)); 659 + 576 660 /* 577 - * If the ntfs_inode is clean no need to do anything. If it is dirty, 661 + * If the struct ntfs_inode is clean no need to do anything. If it is dirty, 578 662 * mark it as clean now so that it can be redirtied later on if needed. 579 663 * There is no danger of races since the caller is holding the locks 580 664 * for the mft record @m and the page it is in. 581 665 */ 582 666 if (!NInoTestClearDirty(ni)) 583 667 goto done; 584 - bh = head = page_buffers(page); 585 - BUG_ON(!bh); 586 - rl = NULL; 587 - nr_bhs = 0; 588 - block_start = 0; 589 - m_start = ni->page_ofs; 590 - m_end = m_start + vol->mft_record_size; 591 - do { 592 - block_end = block_start + blocksize; 593 - /* If the buffer is outside the mft record, skip it. 
*/ 594 - if (block_end <= m_start) 595 - continue; 596 - if (unlikely(block_start >= m_end)) 597 - break; 598 - /* 599 - * If this block is not the first one in the record, we ignore 600 - * the buffer's dirty state because we could have raced with a 601 - * parallel mark_ntfs_record_dirty(). 602 - */ 603 - if (block_start == m_start) { 604 - /* This block is the first one in the record. */ 605 - if (!buffer_dirty(bh)) { 606 - BUG_ON(nr_bhs); 607 - /* Clean records are not written out. */ 608 - break; 609 - } 610 - } 611 - /* Need to map the buffer if it is not mapped already. */ 612 - if (unlikely(!buffer_mapped(bh))) { 613 - VCN vcn; 614 - LCN lcn; 615 - unsigned int vcn_ofs; 616 668 617 - bh->b_bdev = vol->sb->s_bdev; 618 - /* Obtain the vcn and offset of the current block. */ 619 - vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + 620 - (block_start - m_start); 621 - vcn_ofs = vcn & vol->cluster_size_mask; 622 - vcn >>= vol->cluster_size_bits; 623 - if (!rl) { 624 - down_read(&NTFS_I(vol->mft_ino)->runlist.lock); 625 - rl = NTFS_I(vol->mft_ino)->runlist.rl; 626 - BUG_ON(!rl); 627 - } 628 - /* Seek to element containing target vcn. */ 629 - while (rl->length && rl[1].vcn <= vcn) 630 - rl++; 631 - lcn = ntfs_rl_vcn_to_lcn(rl, vcn); 632 - /* For $MFT, only lcn >= 0 is a successful remap. */ 633 - if (likely(lcn >= 0)) { 634 - /* Setup buffer head to correct block. 
*/ 635 - bh->b_blocknr = ((lcn << 636 - vol->cluster_size_bits) + 637 - vcn_ofs) >> blocksize_bits; 638 - set_buffer_mapped(bh); 639 - } else { 640 - bh->b_blocknr = -1; 641 - ntfs_error(vol->sb, "Cannot write mft record " 642 - "0x%lx because its location " 643 - "on disk could not be " 644 - "determined (error code %lli).", 645 - ni->mft_no, (long long)lcn); 646 - err = -EIO; 647 - } 648 - } 649 - BUG_ON(!buffer_uptodate(bh)); 650 - BUG_ON(!nr_bhs && (m_start != block_start)); 651 - BUG_ON(nr_bhs >= max_bhs); 652 - bhs[nr_bhs++] = bh; 653 - BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); 654 - } while (block_start = block_end, (bh = bh->b_this_page) != head); 655 - if (unlikely(rl)) 656 - up_read(&NTFS_I(vol->mft_ino)->runlist.lock); 657 - if (!nr_bhs) 658 - goto done; 659 - if (unlikely(err)) 660 - goto cleanup_out; 669 + kaddr = kmap_local_folio(folio, 0); 670 + fixup_m = (struct mft_record *)(kaddr + ni->folio_ofs); 671 + memcpy(fixup_m, m, vol->mft_record_size); 672 + 661 673 /* Apply the mst protection fixups. */ 662 - err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); 674 + err = pre_write_mst_fixup((struct ntfs_record *)fixup_m, vol->mft_record_size); 663 675 if (err) { 664 676 ntfs_error(vol->sb, "Failed to apply mst fixups!"); 665 - goto cleanup_out; 677 + goto err_out; 666 678 } 667 - flush_dcache_mft_record_page(ni); 668 - /* Lock buffers and start synchronous write i/o on them. */ 669 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { 670 - struct buffer_head *tbh = bhs[i_bhs]; 671 679 672 - if (!trylock_buffer(tbh)) 673 - BUG(); 674 - BUG_ON(!buffer_uptodate(tbh)); 675 - clear_buffer_dirty(tbh); 676 - get_bh(tbh); 677 - tbh->b_end_io = end_buffer_write_sync; 678 - submit_bh(REQ_OP_WRITE, tbh); 679 - } 680 - /* Synchronize the mft mirror now if not @sync. */ 681 - if (!sync && ni->mft_no < vol->mftmirr_size) 682 - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); 683 - /* Wait on i/o completion of buffers. 
*/ 684 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { 685 - struct buffer_head *tbh = bhs[i_bhs]; 680 + folio_size = vol->mft_record_size / ni->mft_lcn_count; 681 + while (i < ni->mft_lcn_count) { 682 + unsigned int clu_off; 686 683 687 - wait_on_buffer(tbh); 688 - if (unlikely(!buffer_uptodate(tbh))) { 684 + clu_off = (unsigned int)((s64)ni->mft_no * vol->mft_record_size + offset) & 685 + vol->cluster_size_mask; 686 + 687 + bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO); 688 + bio->bi_iter.bi_sector = 689 + NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, ni->mft_lcn[i]) + 690 + clu_off); 691 + 692 + if (!bio_add_folio(bio, folio, folio_size, 693 + ni->folio_ofs + offset)) { 689 694 err = -EIO; 690 - /* 691 - * Set the buffer uptodate so the page and buffer 692 - * states do not become out of sync. 693 - */ 694 - if (PageUptodate(page)) 695 - set_buffer_uptodate(tbh); 695 + goto put_bio_out; 696 696 } 697 + 698 + /* Synchronize the mft mirror now if not @sync. */ 699 + if (!sync && ni->mft_no < vol->mftmirr_size) 700 + ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); 701 + 702 + folio_get(folio); 703 + bio->bi_private = folio; 704 + bio->bi_end_io = ntfs_bio_end_io; 705 + submit_bio(bio); 706 + offset += vol->cluster_size; 707 + i++; 697 708 } 709 + 698 710 /* If @sync, now synchronize the mft mirror. */ 699 711 if (sync && ni->mft_no < vol->mftmirr_size) 700 - ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); 701 - /* Remove the mst protection fixups again. */ 702 - post_write_mst_fixup((NTFS_RECORD*)m); 703 - flush_dcache_mft_record_page(ni); 712 + ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m); 713 + kunmap_local(kaddr); 704 714 if (unlikely(err)) { 705 715 /* I/O error during writing. This is really bad! */ 706 - ntfs_error(vol->sb, "I/O error while writing mft record " 707 - "0x%lx! Marking base inode as bad. You " 708 - "should unmount the volume and run chkdsk.", 709 - ni->mft_no); 716 + ntfs_error(vol->sb, 717 + "I/O error while writing mft record 0x%lx! 
Marking base inode as bad. You should unmount the volume and run chkdsk.", 718 + ni->mft_no); 710 719 goto err_out; 711 720 } 712 721 done: 713 722 ntfs_debug("Done."); 714 723 return 0; 715 - cleanup_out: 716 - /* Clean the buffers. */ 717 - for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) 718 - clear_buffer_dirty(bhs[i_bhs]); 724 + put_bio_out: 725 + bio_put(bio); 719 726 err_out: 720 727 /* 721 728 * Current state: all buffers are clean, unlocked, and uptodate. 722 729 * The caller should mark the base inode as bad so that no more i/o 723 - * happens. ->clear_inode() will still be invoked so all extent inodes 730 + * happens. ->drop_inode() will still be invoked so all extent inodes 724 731 * and other allocated memory will be freed. 725 732 */ 726 733 if (err == -ENOMEM) { 727 - ntfs_error(vol->sb, "Not enough memory to write mft record. " 728 - "Redirtying so the write is retried later."); 734 + ntfs_error(vol->sb, 735 + "Not enough memory to write mft record. Redirtying so the write is retried later."); 729 736 mark_mft_record_dirty(ni); 730 737 err = 0; 731 738 } else ··· 635 838 return err; 636 839 } 637 840 638 - /** 841 + static int ntfs_test_inode_wb(struct inode *vi, unsigned long ino, void *data) 842 + { 843 + struct ntfs_attr *na = data; 844 + 845 + if (!ntfs_test_inode(vi, na)) 846 + return 0; 847 + 848 + /* 849 + * Without this, ntfs_write_mst_block() could call iput_final() 850 + * , and ntfs_evict_big_inode() could try to unlink this inode 851 + * and the contex could be blocked infinitly in map_mft_record(). 
852 + */ 853 + if (NInoBeingDeleted(NTFS_I(vi))) { 854 + na->state = NI_BeingDeleted; 855 + return -1; 856 + } 857 + 858 + /* 859 + * This condition can prevent ntfs_write_mst_block() 860 + * from applying/undo fixups while ntfs_create() being 861 + * called 862 + */ 863 + spin_lock(&vi->i_lock); 864 + if (inode_state_read_once(vi) & I_CREATING) { 865 + spin_unlock(&vi->i_lock); 866 + na->state = NI_BeingCreated; 867 + return -1; 868 + } 869 + spin_unlock(&vi->i_lock); 870 + 871 + return igrab(vi) ? 1 : -1; 872 + } 873 + 874 + /* 639 875 * ntfs_may_write_mft_record - check if an mft record may be written out 640 876 * @vol: [IN] ntfs volume on which the mft record to check resides 641 877 * @mft_no: [IN] mft record number of the mft record to check 642 878 * @m: [IN] mapped mft record to check 643 879 * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned 880 + * @ref_vi: [OUT] caller has to drop this vfs inode if one is returned 644 881 * 645 882 * Check if the mapped (base or extent) mft record @m with mft record number 646 883 * @mft_no belonging to the ntfs volume @vol may be written out. If necessary ··· 683 852 * caller is responsible for unlocking the ntfs inode and unpinning the base 684 853 * vfs inode. 685 854 * 855 + * To avoid deadlock when the caller holds a folio lock, if the function 856 + * returns @ref_vi it defers dropping the vfs inode reference by returning 857 + * it in @ref_vi instead of calling iput() directly. The caller must call 858 + * iput() on @ref_vi after releasing the folio lock. 859 + * 686 860 * Return 'true' if the mft record may be written out and 'false' if not. 
687 861 * 688 862 * The caller has locked the page and cleared the uptodate flag on it which 689 863 * means that we can safely write out any dirty mft records that do not have 690 - * their inodes in icache as determined by ilookup5() as anyone 691 - * opening/creating such an inode would block when attempting to map the mft 692 - * record in read_cache_page() until we are finished with the write out. 864 + * their inodes in icache as determined by find_inode_nowait(). 693 865 * 694 866 * Here is a description of the tests we perform: 695 867 * 696 868 * If the inode is found in icache we know the mft record must be a base mft 697 869 * record. If it is dirty, we do not write it and return 'false' as the vfs 698 870 * inode write paths will result in the access times being updated which would 699 - * cause the base mft record to be redirtied and written out again. (We know 700 - * the access time update will modify the base mft record because Windows 701 - * chkdsk complains if the standard information attribute is not in the base 702 - * mft record.) 871 + * cause the base mft record to be redirtied and written out again. 703 872 * 704 873 * If the inode is in icache and not dirty, we attempt to lock the mft record 705 874 * and if we find the lock was already taken, it is not safe to write the mft ··· 710 879 * @locked_ni to the locked ntfs inode and return 'true'. 711 880 * 712 881 * Note we cannot just lock the mft record and sleep while waiting for the lock 713 - * because this would deadlock due to lock reversal (normally the mft record is 714 - * locked before the page is locked but we already have the page locked here 715 - * when we try to lock the mft record). 882 + * because this would deadlock due to lock reversal. 716 883 * 717 884 * If the inode is not in icache we need to perform further checks. 718 885 * ··· 718 889 * safely write it and return 'true'. 719 890 * 720 891 * We now know the mft record is an extent mft record. 
We check if the inode 721 - * corresponding to its base mft record is in icache and obtain a reference to 722 - * it if it is. If it is not, we can safely write it and return 'true'. 892 + * corresponding to its base mft record is in icache. If it is not, we cannot 893 + * safely determine the state of the extent inode, so we return 'false'. 723 894 * 724 895 * We now have the base inode for the extent mft record. We check if it has an 725 - * ntfs inode for the extent mft record attached and if not it is safe to write 896 + * ntfs inode for the extent mft record attached. If not, it is safe to write 726 897 * the extent mft record and we return 'true'. 727 898 * 728 - * The ntfs inode for the extent mft record is attached to the base inode so we 729 - * attempt to lock the extent mft record and if we find the lock was already 730 - * taken, it is not safe to write the extent mft record and we return 'false'. 899 + * If the extent inode is attached, we check if it is dirty. If so, we return 900 + * 'false' (letting the standard write_inode path handle it). 901 + * 902 + * If it is not dirty, we attempt to lock the extent mft record. If the lock 903 + * was already taken, it is not safe to write and we return 'false'. 731 904 * 732 905 * If we manage to obtain the lock we have exclusive access to the extent mft 733 - * record, which also allows us safe writeout of the extent mft record. We 734 - * set the ntfs inode of the extent mft record clean and then set @locked_ni to 735 - * the now locked ntfs inode and return 'true'. 736 - * 737 - * Note, the reason for actually writing dirty mft records here and not just 738 - * relying on the vfs inode dirty code paths is that we can have mft records 739 - * modified without them ever having actual inodes in memory. Also we can have 740 - * dirty mft records with clean ntfs inodes in memory. 
None of the described 741 - * cases would result in the dirty mft records being written out if we only 742 - * relied on the vfs inode dirty code paths. And these cases can really occur 743 - * during allocation of new mft records and in particular when the 744 - * initialized_size of the $MFT/$DATA attribute is extended and the new space 745 - * is initialized using ntfs_mft_record_format(). The clean inode can then 746 - * appear if the mft record is reused for a new inode before it got written 747 - * out. 906 + * record. We set @locked_ni to the now locked ntfs inode and return 'true'. 748 907 */ 749 - bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, 750 - const MFT_RECORD *m, ntfs_inode **locked_ni) 908 + bool ntfs_may_write_mft_record(struct ntfs_volume *vol, const unsigned long mft_no, 909 + const struct mft_record *m, struct ntfs_inode **locked_ni, 910 + struct inode **ref_vi) 751 911 { 752 912 struct super_block *sb = vol->sb; 753 913 struct inode *mft_vi = vol->mft_ino; 754 914 struct inode *vi; 755 - ntfs_inode *ni, *eni, **extent_nis; 915 + struct ntfs_inode *ni, *eni, **extent_nis; 756 916 int i; 757 - ntfs_attr na; 917 + struct ntfs_attr na = {0}; 758 918 759 919 ntfs_debug("Entering for inode 0x%lx.", mft_no); 760 920 /* 761 921 * Normally we do not return a locked inode so set @locked_ni to NULL. 762 922 */ 763 - BUG_ON(!locked_ni); 764 923 *locked_ni = NULL; 924 + *ref_vi = NULL; 925 + 765 926 /* 766 927 * Check if the inode corresponding to this mft record is in the VFS 767 928 * inode cache and obtain a reference to it if it is. 768 929 */ 769 930 ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); 770 931 na.mft_no = mft_no; 771 - na.name = NULL; 772 - na.name_len = 0; 773 932 na.type = AT_UNUSED; 774 933 /* 775 934 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and ··· 766 949 if (!mft_no) { 767 950 /* Balance the below iput(). 
*/ 768 951 vi = igrab(mft_vi); 769 - BUG_ON(vi != mft_vi); 952 + WARN_ON(vi != mft_vi); 770 953 } else { 771 954 /* 772 - * Have to use ilookup5_nowait() since ilookup5() waits for the 773 - * inode lock which causes ntfs to deadlock when a concurrent 774 - * inode write via the inode dirty code paths and the page 775 - * dirty code path of the inode dirty code path when writing 776 - * $MFT occurs. 955 + * Have to use find_inode_nowait() since ilookup5_nowait() 956 + * waits for inode with I_FREEING, which causes ntfs to deadlock 957 + * when inodes are unlinked concurrently 777 958 */ 778 - vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); 959 + vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na); 960 + if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated) 961 + return false; 779 962 } 780 963 if (vi) { 781 964 ntfs_debug("Base inode 0x%lx is in icache.", mft_no); ··· 788 971 ntfs_debug("Inode 0x%lx is dirty, do not write it.", 789 972 mft_no); 790 973 atomic_dec(&ni->count); 791 - iput(vi); 974 + *ref_vi = vi; 792 975 return false; 793 976 } 794 977 ntfs_debug("Inode 0x%lx is not dirty.", mft_no); 795 978 /* The inode is not dirty, try to take the mft record lock. */ 796 979 if (unlikely(!mutex_trylock(&ni->mrec_lock))) { 797 - ntfs_debug("Mft record 0x%lx is already locked, do " 798 - "not write it.", mft_no); 980 + ntfs_debug("Mft record 0x%lx is already locked, do not write it.", mft_no); 799 981 atomic_dec(&ni->count); 800 - iput(vi); 982 + *ref_vi = vi; 801 983 return false; 802 984 } 803 985 ntfs_debug("Managed to lock mft record 0x%lx, write it.", ··· 828 1012 * is. 829 1013 */ 830 1014 na.mft_no = MREF_LE(m->base_mft_record); 831 - ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " 832 - "inode 0x%lx in icache.", mft_no, na.mft_no); 1015 + na.state = 0; 1016 + ntfs_debug("Mft record 0x%lx is an extent record. 
Looking for base inode 0x%lx in icache.", 1017 + mft_no, na.mft_no); 833 1018 if (!na.mft_no) { 834 1019 /* Balance the below iput(). */ 835 1020 vi = igrab(mft_vi); 836 - BUG_ON(vi != mft_vi); 837 - } else 838 - vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, 839 - &na); 840 - if (!vi) { 841 - /* 842 - * The base inode is not in icache, write this extent mft 843 - * record. 844 - */ 845 - ntfs_debug("Base inode 0x%lx is not in icache, write the " 846 - "extent record.", na.mft_no); 847 - return true; 1021 + WARN_ON(vi != mft_vi); 1022 + } else { 1023 + vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na); 1024 + if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated) 1025 + return false; 848 1026 } 1027 + 1028 + if (!vi) 1029 + return false; 849 1030 ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); 850 1031 /* 851 1032 * The base inode is in icache. Check if it has the extent inode ··· 856 1043 * extent mft record. 857 1044 */ 858 1045 mutex_unlock(&ni->extent_lock); 859 - iput(vi); 860 - ntfs_debug("Base inode 0x%lx has no attached extent inodes, " 861 - "write the extent record.", na.mft_no); 1046 + *ref_vi = vi; 1047 + ntfs_debug("Base inode 0x%lx has no attached extent inodes, write the extent record.", 1048 + na.mft_no); 862 1049 return true; 863 1050 } 864 1051 /* Iterate over the attached extent inodes. */ ··· 879 1066 */ 880 1067 if (!eni) { 881 1068 mutex_unlock(&ni->extent_lock); 882 - iput(vi); 883 - ntfs_debug("Extent inode 0x%lx is not attached to its base " 884 - "inode 0x%lx, write the extent record.", 1069 + *ref_vi = vi; 1070 + ntfs_debug("Extent inode 0x%lx is not attached to its base inode 0x%lx, write the extent record.", 885 1071 mft_no, na.mft_no); 886 1072 return true; 887 1073 } ··· 889 1077 /* Take a reference to the extent ntfs inode. 
*/ 890 1078 atomic_inc(&eni->count); 891 1079 mutex_unlock(&ni->extent_lock); 1080 + 1081 + /* if extent inode is dirty, write_inode will write it */ 1082 + if (NInoDirty(eni)) { 1083 + atomic_dec(&eni->count); 1084 + *ref_vi = vi; 1085 + return false; 1086 + } 1087 + 892 1088 /* 893 1089 * Found the extent inode coresponding to this extent mft record. 894 1090 * Try to take the mft record lock. 895 1091 */ 896 1092 if (unlikely(!mutex_trylock(&eni->mrec_lock))) { 897 1093 atomic_dec(&eni->count); 898 - iput(vi); 899 - ntfs_debug("Extent mft record 0x%lx is already locked, do " 900 - "not write it.", mft_no); 1094 + *ref_vi = vi; 1095 + ntfs_debug("Extent mft record 0x%lx is already locked, do not write it.", 1096 + mft_no); 901 1097 return false; 902 1098 } 903 1099 ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", 904 1100 mft_no); 905 - if (NInoTestClearDirty(eni)) 906 - ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", 907 - mft_no); 908 1101 /* 909 1102 * The write has to occur while we hold the mft record lock so return 910 1103 * the locked extent ntfs inode. ··· 918 1101 return true; 919 1102 } 920 1103 921 - static const char *es = " Leaving inconsistent metadata. Unmount and run " 922 - "chkdsk."; 1104 + static const char *es = " Leaving inconsistent metadata. Unmount and run chkdsk."; 923 1105 924 - /** 1106 + #define RESERVED_MFT_RECORDS 64 1107 + 1108 + /* 925 1109 * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name 926 1110 * @vol: volume on which to search for a free mft record 927 1111 * @base_ni: open base inode if allocating an extent mft record or NULL ··· 941 1123 * 942 1124 * Locking: Caller must hold vol->mftbmp_lock for writing. 
943 1125 */ 944 - static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, 945 - ntfs_inode *base_ni) 1126 + static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(struct ntfs_volume *vol, 1127 + struct ntfs_inode *base_ni) 946 1128 { 947 1129 s64 pass_end, ll, data_pos, pass_start, ofs, bit; 948 1130 unsigned long flags; 949 1131 struct address_space *mftbmp_mapping; 950 - u8 *buf, *byte; 951 - struct page *page; 952 - unsigned int page_ofs, size; 1132 + u8 *buf = NULL, *byte; 1133 + struct folio *folio; 1134 + unsigned int folio_ofs, size; 953 1135 u8 pass, b; 954 1136 955 - ntfs_debug("Searching for free mft record in the currently " 956 - "initialized mft bitmap."); 1137 + ntfs_debug("Searching for free mft record in the currently initialized mft bitmap."); 957 1138 mftbmp_mapping = vol->mftbmp_ino->i_mapping; 958 1139 /* 959 1140 * Set the end of the pass making sure we do not overflow the mft ··· 972 1155 data_pos = vol->mft_data_pos; 973 1156 else 974 1157 data_pos = base_ni->mft_no + 1; 975 - if (data_pos < 24) 976 - data_pos = 24; 1158 + if (data_pos < RESERVED_MFT_RECORDS) 1159 + data_pos = RESERVED_MFT_RECORDS; 977 1160 if (data_pos >= pass_end) { 978 - data_pos = 24; 1161 + data_pos = RESERVED_MFT_RECORDS; 979 1162 pass = 2; 980 1163 /* This happens on a freshly formatted volume. */ 981 1164 if (data_pos >= pass_end) 982 1165 return -ENOSPC; 983 1166 } 1167 + 1168 + if (base_ni && base_ni->mft_no == FILE_MFT) { 1169 + data_pos = 0; 1170 + pass = 2; 1171 + } 1172 + 984 1173 pass_start = data_pos; 985 - ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " 986 - "pass_end 0x%llx, data_pos 0x%llx.", pass, 987 - (long long)pass_start, (long long)pass_end, 988 - (long long)data_pos); 1174 + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, pass_end 0x%llx, data_pos 0x%llx.", 1175 + pass, pass_start, pass_end, data_pos); 989 1176 /* Loop until a free mft record is found. 
*/ 990 1177 for (; pass <= 2;) { 991 1178 /* Cap size to pass_end. */ 992 1179 ofs = data_pos >> 3; 993 - page_ofs = ofs & ~PAGE_MASK; 994 - size = PAGE_SIZE - page_ofs; 1180 + folio_ofs = ofs & ~PAGE_MASK; 1181 + size = PAGE_SIZE - folio_ofs; 995 1182 ll = ((pass_end + 7) >> 3) - ofs; 996 1183 if (size > ll) 997 1184 size = ll; ··· 1005 1184 * for a zero bit. 1006 1185 */ 1007 1186 if (size) { 1008 - page = ntfs_map_page(mftbmp_mapping, 1009 - ofs >> PAGE_SHIFT); 1010 - if (IS_ERR(page)) { 1011 - ntfs_error(vol->sb, "Failed to read mft " 1012 - "bitmap, aborting."); 1013 - return PTR_ERR(page); 1187 + folio = read_mapping_folio(mftbmp_mapping, 1188 + ofs >> PAGE_SHIFT, NULL); 1189 + if (IS_ERR(folio)) { 1190 + ntfs_error(vol->sb, "Failed to read mft bitmap, aborting."); 1191 + return PTR_ERR(folio); 1014 1192 } 1015 - buf = (u8*)page_address(page) + page_ofs; 1193 + folio_lock(folio); 1194 + buf = (u8 *)kmap_local_folio(folio, 0) + folio_ofs; 1016 1195 bit = data_pos & 7; 1017 1196 data_pos &= ~7ull; 1018 - ntfs_debug("Before inner for loop: size 0x%x, " 1019 - "data_pos 0x%llx, bit 0x%llx", size, 1020 - (long long)data_pos, (long long)bit); 1197 + ntfs_debug("Before inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx", 1198 + size, data_pos, bit); 1021 1199 for (; bit < size && data_pos + bit < pass_end; 1022 1200 bit &= ~7ull, bit += 8) { 1201 + /* 1202 + * If we're extending $MFT and running out of the first 1203 + * mft record (base record) then give up searching since 1204 + * no guarantee that the found record will be accessible. 
1205 + */ 1206 + if (base_ni && base_ni->mft_no == FILE_MFT && bit > 400) { 1207 + folio_unlock(folio); 1208 + kunmap_local(buf); 1209 + folio_put(folio); 1210 + return -ENOSPC; 1211 + } 1212 + 1023 1213 byte = buf + (bit >> 3); 1024 1214 if (*byte == 0xff) 1025 1215 continue; ··· 1038 1206 if (b < 8 && b >= (bit & 7)) { 1039 1207 ll = data_pos + (bit & ~7ull) + b; 1040 1208 if (unlikely(ll > (1ll << 32))) { 1041 - ntfs_unmap_page(page); 1209 + folio_unlock(folio); 1210 + kunmap_local(buf); 1211 + folio_put(folio); 1042 1212 return -ENOSPC; 1043 1213 } 1044 1214 *byte |= 1 << b; 1045 - flush_dcache_page(page); 1046 - set_page_dirty(page); 1047 - ntfs_unmap_page(page); 1048 - ntfs_debug("Done. (Found and " 1049 - "allocated mft record " 1050 - "0x%llx.)", 1051 - (long long)ll); 1215 + folio_mark_dirty(folio); 1216 + folio_unlock(folio); 1217 + kunmap_local(buf); 1218 + folio_put(folio); 1219 + ntfs_debug("Done. (Found and allocated mft record 0x%llx.)", 1220 + ll); 1052 1221 return ll; 1053 1222 } 1054 1223 } 1055 - ntfs_debug("After inner for loop: size 0x%x, " 1056 - "data_pos 0x%llx, bit 0x%llx", size, 1057 - (long long)data_pos, (long long)bit); 1224 + ntfs_debug("After inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx", 1225 + size, data_pos, bit); 1058 1226 data_pos += size; 1059 - ntfs_unmap_page(page); 1227 + folio_unlock(folio); 1228 + kunmap_local(buf); 1229 + folio_put(folio); 1060 1230 /* 1061 1231 * If the end of the pass has not been reached yet, 1062 1232 * continue searching the mft bitmap for a zero bit. ··· 1073 1239 * part of the zone which we omitted earlier. 
1074 1240 */ 1075 1241 pass_end = pass_start; 1076 - data_pos = pass_start = 24; 1077 - ntfs_debug("pass %i, pass_start 0x%llx, pass_end " 1078 - "0x%llx.", pass, (long long)pass_start, 1079 - (long long)pass_end); 1242 + data_pos = pass_start = RESERVED_MFT_RECORDS; 1243 + ntfs_debug("pass %i, pass_start 0x%llx, pass_end 0x%llx.", 1244 + pass, pass_start, pass_end); 1080 1245 if (data_pos >= pass_end) 1081 1246 break; 1082 1247 } 1083 1248 } 1084 1249 /* No free mft records in currently initialized mft bitmap. */ 1085 - ntfs_debug("Done. (No free mft records left in currently initialized " 1086 - "mft bitmap.)"); 1250 + ntfs_debug("Done. (No free mft records left in currently initialized mft bitmap.)"); 1087 1251 return -ENOSPC; 1088 1252 } 1089 1253 1090 - /** 1254 + static int ntfs_mft_attr_extend(struct ntfs_inode *ni) 1255 + { 1256 + int ret = 0; 1257 + struct ntfs_inode *base_ni; 1258 + 1259 + if (NInoAttr(ni)) 1260 + base_ni = ni->ext.base_ntfs_ino; 1261 + else 1262 + base_ni = ni; 1263 + 1264 + if (!NInoAttrList(base_ni)) { 1265 + ret = ntfs_inode_add_attrlist(base_ni); 1266 + if (ret) { 1267 + pr_err("Can not add attrlist\n"); 1268 + goto out; 1269 + } else { 1270 + ret = -EAGAIN; 1271 + goto out; 1272 + } 1273 + } 1274 + 1275 + ret = ntfs_attr_update_mapping_pairs(ni, 0); 1276 + if (ret) 1277 + pr_err("MP update failed\n"); 1278 + 1279 + out: 1280 + return ret; 1281 + } 1282 + 1283 + /* 1091 1284 * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster 1092 1285 * @vol: volume on which to extend the mft bitmap attribute 1093 1286 * ··· 1131 1270 * - This function takes vol->lcnbmp_lock for writing and releases it 1132 1271 * before returning. 
1133 1272 */ 1134 - static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) 1273 + static int ntfs_mft_bitmap_extend_allocation_nolock(struct ntfs_volume *vol) 1135 1274 { 1136 - LCN lcn; 1275 + s64 lcn; 1137 1276 s64 ll; 1138 1277 unsigned long flags; 1139 - struct page *page; 1140 - ntfs_inode *mft_ni, *mftbmp_ni; 1141 - runlist_element *rl, *rl2 = NULL; 1142 - ntfs_attr_search_ctx *ctx = NULL; 1143 - MFT_RECORD *mrec; 1144 - ATTR_RECORD *a = NULL; 1278 + struct folio *folio; 1279 + struct ntfs_inode *mft_ni, *mftbmp_ni; 1280 + struct runlist_element *rl, *rl2 = NULL; 1281 + struct ntfs_attr_search_ctx *ctx = NULL; 1282 + struct mft_record *mrec; 1283 + struct attr_record *a = NULL; 1145 1284 int ret, mp_size; 1146 1285 u32 old_alen = 0; 1147 1286 u8 *b, tb; ··· 1149 1288 u8 added_cluster:1; 1150 1289 u8 added_run:1; 1151 1290 u8 mp_rebuilt:1; 1152 - } status = { 0, 0, 0 }; 1291 + u8 mp_extended:1; 1292 + } status = { 0, 0, 0, 0 }; 1293 + size_t new_rl_count; 1153 1294 1154 1295 ntfs_debug("Extending mft bitmap allocation."); 1155 1296 mft_ni = NTFS_I(vol->mft_ino); ··· 1165 1302 ll = mftbmp_ni->allocated_size; 1166 1303 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1167 1304 rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, 1168 - (ll - 1) >> vol->cluster_size_bits, NULL); 1305 + NTFS_B_TO_CLU(vol, ll - 1), NULL); 1169 1306 if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { 1170 1307 up_write(&mftbmp_ni->runlist.lock); 1171 - ntfs_error(vol->sb, "Failed to determine last allocated " 1172 - "cluster of mft bitmap attribute."); 1308 + ntfs_error(vol->sb, 1309 + "Failed to determine last allocated cluster of mft bitmap attribute."); 1173 1310 if (!IS_ERR(rl)) 1174 1311 ret = -EIO; 1175 1312 else ··· 1185 1322 * to us. 
1186 1323 */ 1187 1324 ll = lcn >> 3; 1188 - page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, 1189 - ll >> PAGE_SHIFT); 1190 - if (IS_ERR(page)) { 1325 + folio = read_mapping_folio(vol->lcnbmp_ino->i_mapping, 1326 + ll >> PAGE_SHIFT, NULL); 1327 + if (IS_ERR(folio)) { 1191 1328 up_write(&mftbmp_ni->runlist.lock); 1192 1329 ntfs_error(vol->sb, "Failed to read from lcn bitmap."); 1193 - return PTR_ERR(page); 1330 + return PTR_ERR(folio); 1194 1331 } 1195 - b = (u8*)page_address(page) + (ll & ~PAGE_MASK); 1196 - tb = 1 << (lcn & 7ull); 1332 + 1197 1333 down_write(&vol->lcnbmp_lock); 1334 + folio_lock(folio); 1335 + b = (u8 *)kmap_local_folio(folio, 0) + (ll & ~PAGE_MASK); 1336 + tb = 1 << (lcn & 7ull); 1198 1337 if (*b != 0xff && !(*b & tb)) { 1199 1338 /* Next cluster is free, allocate it. */ 1200 1339 *b |= tb; 1201 - flush_dcache_page(page); 1202 - set_page_dirty(page); 1340 + folio_mark_dirty(folio); 1341 + folio_unlock(folio); 1342 + kunmap_local(b); 1343 + folio_put(folio); 1203 1344 up_write(&vol->lcnbmp_lock); 1204 - ntfs_unmap_page(page); 1205 1345 /* Update the mft bitmap runlist. */ 1206 1346 rl->length++; 1207 1347 rl[1].vcn++; 1208 1348 status.added_cluster = 1; 1209 1349 ntfs_debug("Appending one cluster to mft bitmap."); 1210 1350 } else { 1351 + folio_unlock(folio); 1352 + kunmap_local(b); 1353 + folio_put(folio); 1211 1354 up_write(&vol->lcnbmp_lock); 1212 - ntfs_unmap_page(page); 1213 1355 /* Allocate a cluster from the DATA_ZONE. 
*/ 1214 1356 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, 1215 - true); 1357 + true, false, false); 1216 1358 if (IS_ERR(rl2)) { 1217 1359 up_write(&mftbmp_ni->runlist.lock); 1218 - ntfs_error(vol->sb, "Failed to allocate a cluster for " 1219 - "the mft bitmap."); 1360 + ntfs_error(vol->sb, 1361 + "Failed to allocate a cluster for the mft bitmap."); 1220 1362 return PTR_ERR(rl2); 1221 1363 } 1222 - rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); 1364 + rl = ntfs_runlists_merge(&mftbmp_ni->runlist, rl2, 0, &new_rl_count); 1223 1365 if (IS_ERR(rl)) { 1224 1366 up_write(&mftbmp_ni->runlist.lock); 1225 - ntfs_error(vol->sb, "Failed to merge runlists for mft " 1226 - "bitmap."); 1367 + ntfs_error(vol->sb, "Failed to merge runlists for mft bitmap."); 1227 1368 if (ntfs_cluster_free_from_rl(vol, rl2)) { 1228 - ntfs_error(vol->sb, "Failed to deallocate " 1229 - "allocated cluster.%s", es); 1369 + ntfs_error(vol->sb, "Failed to deallocate allocated cluster.%s", 1370 + es); 1230 1371 NVolSetErrors(vol); 1231 1372 } 1232 - ntfs_free(rl2); 1373 + kvfree(rl2); 1233 1374 return PTR_ERR(rl); 1234 1375 } 1235 1376 mftbmp_ni->runlist.rl = rl; 1377 + mftbmp_ni->runlist.count = new_rl_count; 1236 1378 status.added_run = 1; 1237 1379 ntfs_debug("Adding one run to mft bitmap."); 1238 1380 /* Find the last run in the new runlist. */ ··· 1264 1396 mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, 1265 1397 0, ctx); 1266 1398 if (unlikely(ret)) { 1267 - ntfs_error(vol->sb, "Failed to find last attribute extent of " 1268 - "mft bitmap attribute."); 1399 + ntfs_error(vol->sb, 1400 + "Failed to find last attribute extent of mft bitmap attribute."); 1269 1401 if (ret == -ENOENT) 1270 1402 ret = -EIO; 1271 1403 goto undo_alloc; 1272 1404 } 1273 1405 a = ctx->attr; 1274 - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1406 + ll = le64_to_cpu(a->data.non_resident.lowest_vcn); 1275 1407 /* Search back for the previous last allocated cluster of mft bitmap. 
*/ 1276 1408 for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { 1277 1409 if (ll >= rl2->vcn) 1278 1410 break; 1279 1411 } 1280 - BUG_ON(ll < rl2->vcn); 1281 - BUG_ON(ll >= rl2->vcn + rl2->length); 1412 + WARN_ON(ll < rl2->vcn); 1413 + WARN_ON(ll >= rl2->vcn + rl2->length); 1282 1414 /* Get the size for the new mapping pairs array for this extent. */ 1283 - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); 1415 + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1); 1284 1416 if (unlikely(mp_size <= 0)) { 1285 - ntfs_error(vol->sb, "Get size for mapping pairs failed for " 1286 - "mft bitmap attribute extent."); 1417 + ntfs_error(vol->sb, 1418 + "Get size for mapping pairs failed for mft bitmap attribute extent."); 1287 1419 ret = mp_size; 1288 1420 if (!ret) 1289 1421 ret = -EIO; ··· 1294 1426 ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + 1295 1427 le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); 1296 1428 if (unlikely(ret)) { 1297 - if (ret != -ENOSPC) { 1298 - ntfs_error(vol->sb, "Failed to resize attribute " 1299 - "record for mft bitmap attribute."); 1300 - goto undo_alloc; 1301 - } 1302 - // TODO: Deal with this by moving this extent to a new mft 1303 - // record or by starting a new extent in a new mft record or by 1304 - // moving other attributes out of this mft record. 1305 - // Note: It will need to be a special mft record and if none of 1306 - // those are available it gets rather complicated... 1307 - ntfs_error(vol->sb, "Not enough space in this mft record to " 1308 - "accommodate extended mft bitmap attribute " 1309 - "extent. Cannot handle this yet."); 1310 - ret = -EOPNOTSUPP; 1429 + ret = ntfs_mft_attr_extend(mftbmp_ni); 1430 + if (!ret) 1431 + goto extended_ok; 1432 + if (ret != -EAGAIN) 1433 + status.mp_extended = 1; 1311 1434 goto undo_alloc; 1312 1435 } 1313 1436 status.mp_rebuilt = 1; 1314 1437 /* Generate the mapping pairs array directly into the attr record. 
*/ 1315 - ret = ntfs_mapping_pairs_build(vol, (u8*)a + 1438 + ret = ntfs_mapping_pairs_build(vol, (u8 *)a + 1316 1439 le16_to_cpu(a->data.non_resident.mapping_pairs_offset), 1317 - mp_size, rl2, ll, -1, NULL); 1440 + mp_size, rl2, ll, -1, NULL, NULL, NULL); 1318 1441 if (unlikely(ret)) { 1319 - ntfs_error(vol->sb, "Failed to build mapping pairs array for " 1320 - "mft bitmap attribute."); 1442 + ntfs_error(vol->sb, 1443 + "Failed to build mapping pairs array for mft bitmap attribute."); 1321 1444 goto undo_alloc; 1322 1445 } 1323 1446 /* Update the highest_vcn. */ 1324 - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); 1447 + a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1); 1325 1448 /* 1326 1449 * We now have extended the mft bitmap allocated_size by one cluster. 1327 - * Reflect this in the ntfs_inode structure and the attribute record. 1450 + * Reflect this in the struct ntfs_inode structure and the attribute record. 1328 1451 */ 1329 1452 if (a->data.non_resident.lowest_vcn) { 1330 1453 /* 1331 1454 * We are not in the first attribute extent, switch to it, but 1332 1455 * first ensure the changes will make it to disk later. 
1333 1456 */ 1334 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1335 1457 mark_mft_record_dirty(ctx->ntfs_ino); 1458 + extended_ok: 1336 1459 ntfs_attr_reinit_search_ctx(ctx); 1337 1460 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, 1338 1461 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 1339 1462 0, ctx); 1340 1463 if (unlikely(ret)) { 1341 - ntfs_error(vol->sb, "Failed to find first attribute " 1342 - "extent of mft bitmap attribute."); 1464 + ntfs_error(vol->sb, 1465 + "Failed to find first attribute extent of mft bitmap attribute."); 1343 1466 goto restore_undo_alloc; 1344 1467 } 1345 1468 a = ctx->attr; 1346 1469 } 1470 + 1347 1471 write_lock_irqsave(&mftbmp_ni->size_lock, flags); 1348 1472 mftbmp_ni->allocated_size += vol->cluster_size; 1349 1473 a->data.non_resident.allocated_size = 1350 - cpu_to_sle64(mftbmp_ni->allocated_size); 1474 + cpu_to_le64(mftbmp_ni->allocated_size); 1351 1475 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1352 1476 /* Ensure the changes make it to disk. 
*/ 1353 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1354 1477 mark_mft_record_dirty(ctx->ntfs_ino); 1355 1478 ntfs_attr_put_search_ctx(ctx); 1356 1479 unmap_mft_record(mft_ni); 1357 1480 up_write(&mftbmp_ni->runlist.lock); 1358 1481 ntfs_debug("Done."); 1359 1482 return 0; 1483 + 1360 1484 restore_undo_alloc: 1361 1485 ntfs_attr_reinit_search_ctx(ctx); 1362 1486 if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, 1363 1487 mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, 1364 1488 0, ctx)) { 1365 - ntfs_error(vol->sb, "Failed to find last attribute extent of " 1366 - "mft bitmap attribute.%s", es); 1489 + ntfs_error(vol->sb, 1490 + "Failed to find last attribute extent of mft bitmap attribute.%s", es); 1367 1491 write_lock_irqsave(&mftbmp_ni->size_lock, flags); 1368 1492 mftbmp_ni->allocated_size += vol->cluster_size; 1369 1493 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); ··· 1370 1510 return ret; 1371 1511 } 1372 1512 a = ctx->attr; 1373 - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); 1513 + a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 2); 1374 1514 undo_alloc: 1375 1515 if (status.added_cluster) { 1376 1516 /* Truncate the last run in the runlist by one cluster. */ ··· 1381 1521 /* Remove the last run from the runlist. */ 1382 1522 rl->lcn = rl[1].lcn; 1383 1523 rl->length = 0; 1524 + mftbmp_ni->runlist.count--; 1384 1525 } 1385 1526 /* Deallocate the cluster. 
*/ 1386 1527 down_write(&vol->lcnbmp_lock); 1387 1528 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1388 1529 ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); 1389 1530 NVolSetErrors(vol); 1390 - } 1531 + } else 1532 + ntfs_inc_free_clusters(vol, 1); 1391 1533 up_write(&vol->lcnbmp_lock); 1392 1534 if (status.mp_rebuilt) { 1393 - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1535 + if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( 1394 1536 a->data.non_resident.mapping_pairs_offset), 1395 1537 old_alen - le16_to_cpu( 1396 1538 a->data.non_resident.mapping_pairs_offset), 1397 - rl2, ll, -1, NULL)) { 1398 - ntfs_error(vol->sb, "Failed to restore mapping pairs " 1399 - "array.%s", es); 1539 + rl2, ll, -1, NULL, NULL, NULL)) { 1540 + ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es); 1400 1541 NVolSetErrors(vol); 1401 1542 } 1402 1543 if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { 1403 - ntfs_error(vol->sb, "Failed to restore attribute " 1404 - "record.%s", es); 1544 + ntfs_error(vol->sb, "Failed to restore attribute record.%s", es); 1405 1545 NVolSetErrors(vol); 1406 1546 } 1407 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1408 1547 mark_mft_record_dirty(ctx->ntfs_ino); 1548 + } else if (status.mp_extended && ntfs_attr_update_mapping_pairs(mftbmp_ni, 0)) { 1549 + ntfs_error(vol->sb, "Failed to restore mapping pairs.%s", es); 1550 + NVolSetErrors(vol); 1409 1551 } 1410 1552 if (ctx) 1411 1553 ntfs_attr_put_search_ctx(ctx); ··· 1417 1555 return ret; 1418 1556 } 1419 1557 1420 - /** 1558 + /* 1421 1559 * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data 1422 1560 * @vol: volume on which to extend the mft bitmap attribute 1423 1561 * ··· 1431 1569 * 1432 1570 * Locking: Caller must hold vol->mftbmp_lock for writing. 
1433 1571 */ 1434 - static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) 1572 + static int ntfs_mft_bitmap_extend_initialized_nolock(struct ntfs_volume *vol) 1435 1573 { 1436 1574 s64 old_data_size, old_initialized_size; 1437 1575 unsigned long flags; 1438 1576 struct inode *mftbmp_vi; 1439 - ntfs_inode *mft_ni, *mftbmp_ni; 1440 - ntfs_attr_search_ctx *ctx; 1441 - MFT_RECORD *mrec; 1442 - ATTR_RECORD *a; 1577 + struct ntfs_inode *mft_ni, *mftbmp_ni; 1578 + struct ntfs_attr_search_ctx *ctx; 1579 + struct mft_record *mrec; 1580 + struct attr_record *a; 1443 1581 int ret; 1444 1582 1445 1583 ntfs_debug("Extending mft bitmap initiailized (and data) size."); ··· 1461 1599 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, 1462 1600 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); 1463 1601 if (unlikely(ret)) { 1464 - ntfs_error(vol->sb, "Failed to find first attribute extent of " 1465 - "mft bitmap attribute."); 1602 + ntfs_error(vol->sb, 1603 + "Failed to find first attribute extent of mft bitmap attribute."); 1466 1604 if (ret == -ENOENT) 1467 1605 ret = -EIO; 1468 1606 goto put_err_out; ··· 1478 1616 */ 1479 1617 mftbmp_ni->initialized_size += 8; 1480 1618 a->data.non_resident.initialized_size = 1481 - cpu_to_sle64(mftbmp_ni->initialized_size); 1619 + cpu_to_le64(mftbmp_ni->initialized_size); 1482 1620 if (mftbmp_ni->initialized_size > old_data_size) { 1483 1621 i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); 1484 1622 a->data.non_resident.data_size = 1485 - cpu_to_sle64(mftbmp_ni->initialized_size); 1623 + cpu_to_le64(mftbmp_ni->initialized_size); 1486 1624 } 1487 1625 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1488 1626 /* Ensure the changes make it to disk. */ 1489 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1490 1627 mark_mft_record_dirty(ctx->ntfs_ino); 1491 1628 ntfs_attr_put_search_ctx(ctx); 1492 1629 unmap_mft_record(mft_ni); 1493 1630 /* Initialize the mft bitmap attribute value with zeroes. 
*/ 1494 1631 ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); 1495 1632 if (likely(!ret)) { 1496 - ntfs_debug("Done. (Wrote eight initialized bytes to mft " 1497 - "bitmap."); 1633 + ntfs_debug("Done. (Wrote eight initialized bytes to mft bitmap."); 1634 + ntfs_inc_free_mft_records(vol, 8 * 8); 1498 1635 return 0; 1499 1636 } 1500 1637 ntfs_error(vol->sb, "Failed to write to mft bitmap."); ··· 1512 1651 } 1513 1652 if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, 1514 1653 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { 1515 - ntfs_error(vol->sb, "Failed to find first attribute extent of " 1516 - "mft bitmap attribute.%s", es); 1654 + ntfs_error(vol->sb, 1655 + "Failed to find first attribute extent of mft bitmap attribute.%s", es); 1517 1656 NVolSetErrors(vol); 1518 1657 put_err_out: 1519 1658 ntfs_attr_put_search_ctx(ctx); ··· 1525 1664 write_lock_irqsave(&mftbmp_ni->size_lock, flags); 1526 1665 mftbmp_ni->initialized_size = old_initialized_size; 1527 1666 a->data.non_resident.initialized_size = 1528 - cpu_to_sle64(old_initialized_size); 1667 + cpu_to_le64(old_initialized_size); 1529 1668 if (i_size_read(mftbmp_vi) != old_data_size) { 1530 1669 i_size_write(mftbmp_vi, old_data_size); 1531 - a->data.non_resident.data_size = cpu_to_sle64(old_data_size); 1670 + a->data.non_resident.data_size = cpu_to_le64(old_data_size); 1532 1671 } 1533 1672 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1534 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1535 1673 mark_mft_record_dirty(ctx->ntfs_ino); 1536 1674 ntfs_attr_put_search_ctx(ctx); 1537 1675 unmap_mft_record(mft_ni); 1538 1676 #ifdef DEBUG 1539 1677 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 1540 - ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " 1541 - "data_size 0x%llx, initialized_size 0x%llx.", 1542 - (long long)mftbmp_ni->allocated_size, 1543 - (long long)i_size_read(mftbmp_vi), 1544 - (long long)mftbmp_ni->initialized_size); 1678 + ntfs_debug("Restored status 
of mftbmp: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 1679 + mftbmp_ni->allocated_size, i_size_read(mftbmp_vi), 1680 + mftbmp_ni->initialized_size); 1545 1681 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1546 1682 #endif /* DEBUG */ 1547 1683 err_out: 1548 1684 return ret; 1549 1685 } 1550 1686 1551 - /** 1687 + /* 1552 1688 * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute 1553 1689 * @vol: volume on which to extend the mft data attribute 1554 1690 * ··· 1564 1706 * - This function calls functions which take vol->lcnbmp_lock for 1565 1707 * writing and release it before returning. 1566 1708 */ 1567 - static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) 1709 + static int ntfs_mft_data_extend_allocation_nolock(struct ntfs_volume *vol) 1568 1710 { 1569 - LCN lcn; 1570 - VCN old_last_vcn; 1711 + s64 lcn; 1712 + s64 old_last_vcn; 1571 1713 s64 min_nr, nr, ll; 1572 1714 unsigned long flags; 1573 - ntfs_inode *mft_ni; 1574 - runlist_element *rl, *rl2; 1575 - ntfs_attr_search_ctx *ctx = NULL; 1576 - MFT_RECORD *mrec; 1577 - ATTR_RECORD *a = NULL; 1715 + struct ntfs_inode *mft_ni; 1716 + struct runlist_element *rl, *rl2; 1717 + struct ntfs_attr_search_ctx *ctx = NULL; 1718 + struct mft_record *mrec; 1719 + struct attr_record *a = NULL; 1578 1720 int ret, mp_size; 1579 1721 u32 old_alen = 0; 1580 - bool mp_rebuilt = false; 1722 + bool mp_rebuilt = false, mp_extended = false; 1723 + size_t new_rl_count; 1581 1724 1582 1725 ntfs_debug("Extending mft data allocation."); 1583 1726 mft_ni = NTFS_I(vol->mft_ino); ··· 1592 1733 ll = mft_ni->allocated_size; 1593 1734 read_unlock_irqrestore(&mft_ni->size_lock, flags); 1594 1735 rl = ntfs_attr_find_vcn_nolock(mft_ni, 1595 - (ll - 1) >> vol->cluster_size_bits, NULL); 1736 + NTFS_B_TO_CLU(vol, ll - 1), NULL); 1596 1737 if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { 1597 1738 up_write(&mft_ni->runlist.lock); 1598 - ntfs_error(vol->sb, "Failed to determine 
last allocated " 1599 - "cluster of mft data attribute."); 1739 + ntfs_error(vol->sb, 1740 + "Failed to determine last allocated cluster of mft data attribute."); 1600 1741 if (!IS_ERR(rl)) 1601 1742 ret = -EIO; 1602 1743 else ··· 1604 1745 return ret; 1605 1746 } 1606 1747 lcn = rl->lcn + rl->length; 1607 - ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); 1748 + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", lcn); 1608 1749 /* Minimum allocation is one mft record worth of clusters. */ 1609 - min_nr = vol->mft_record_size >> vol->cluster_size_bits; 1750 + min_nr = NTFS_B_TO_CLU(vol, vol->mft_record_size); 1610 1751 if (!min_nr) 1611 1752 min_nr = 1; 1612 1753 /* Want to allocate 16 mft records worth of clusters. */ ··· 1617 1758 read_lock_irqsave(&mft_ni->size_lock, flags); 1618 1759 ll = mft_ni->allocated_size; 1619 1760 read_unlock_irqrestore(&mft_ni->size_lock, flags); 1620 - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> 1761 + if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >> 1621 1762 vol->mft_record_size_bits >= (1ll << 32))) { 1622 1763 nr = min_nr; 1623 - if (unlikely((ll + (nr << vol->cluster_size_bits)) >> 1764 + if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >> 1624 1765 vol->mft_record_size_bits >= (1ll << 32))) { 1625 - ntfs_warning(vol->sb, "Cannot allocate mft record " 1626 - "because the maximum number of inodes " 1627 - "(2^32) has already been reached."); 1766 + ntfs_warning(vol->sb, 1767 + "Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached."); 1628 1768 up_write(&mft_ni->runlist.lock); 1629 1769 return -ENOSPC; 1630 1770 } ··· 1631 1773 ntfs_debug("Trying mft data allocation with %s cluster count %lli.", 1632 1774 nr > min_nr ? 
"default" : "minimal", (long long)nr); 1633 1775 old_last_vcn = rl[1].vcn; 1776 + /* 1777 + * We can release the mft_ni runlist lock, Because this function is 1778 + * the only one that expends $MFT data attribute and is called with 1779 + * mft_ni->mrec_lock. 1780 + * This is required for the lock order, vol->lcnbmp_lock => 1781 + * mft_ni->runlist.lock. 1782 + */ 1783 + up_write(&mft_ni->runlist.lock); 1784 + 1634 1785 do { 1635 1786 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, 1636 - true); 1787 + true, false, false); 1637 1788 if (!IS_ERR(rl2)) 1638 1789 break; 1639 1790 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { 1640 - ntfs_error(vol->sb, "Failed to allocate the minimal " 1641 - "number of clusters (%lli) for the " 1642 - "mft data attribute.", (long long)nr); 1643 - up_write(&mft_ni->runlist.lock); 1791 + ntfs_error(vol->sb, 1792 + "Failed to allocate the minimal number of clusters (%lli) for the mft data attribute.", 1793 + nr); 1644 1794 return PTR_ERR(rl2); 1645 1795 } 1646 1796 /* ··· 1657 1791 * before failing. 
1658 1792 */ 1659 1793 nr = min_nr; 1660 - ntfs_debug("Retrying mft data allocation with minimal cluster " 1661 - "count %lli.", (long long)nr); 1794 + ntfs_debug("Retrying mft data allocation with minimal cluster count %lli.", nr); 1662 1795 } while (1); 1663 - rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); 1796 + 1797 + down_write(&mft_ni->runlist.lock); 1798 + rl = ntfs_runlists_merge(&mft_ni->runlist, rl2, 0, &new_rl_count); 1664 1799 if (IS_ERR(rl)) { 1665 1800 up_write(&mft_ni->runlist.lock); 1666 - ntfs_error(vol->sb, "Failed to merge runlists for mft data " 1667 - "attribute."); 1801 + ntfs_error(vol->sb, "Failed to merge runlists for mft data attribute."); 1668 1802 if (ntfs_cluster_free_from_rl(vol, rl2)) { 1669 - ntfs_error(vol->sb, "Failed to deallocate clusters " 1670 - "from the mft data attribute.%s", es); 1803 + ntfs_error(vol->sb, 1804 + "Failed to deallocate clusters from the mft data attribute.%s", es); 1671 1805 NVolSetErrors(vol); 1672 1806 } 1673 - ntfs_free(rl2); 1807 + kvfree(rl2); 1674 1808 return PTR_ERR(rl); 1675 1809 } 1676 1810 mft_ni->runlist.rl = rl; 1811 + mft_ni->runlist.count = new_rl_count; 1677 1812 ntfs_debug("Allocated %lli clusters.", (long long)nr); 1678 1813 /* Find the last run in the new runlist. */ 1679 1814 for (; rl[1].length; rl++) 1680 1815 ; 1816 + up_write(&mft_ni->runlist.lock); 1817 + 1681 1818 /* Update the attribute record as well. 
*/ 1682 1819 mrec = map_mft_record(mft_ni); 1683 1820 if (IS_ERR(mrec)) { 1684 1821 ntfs_error(vol->sb, "Failed to map mft record."); 1685 1822 ret = PTR_ERR(mrec); 1823 + down_write(&mft_ni->runlist.lock); 1686 1824 goto undo_alloc; 1687 1825 } 1688 1826 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); ··· 1698 1828 ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, 1699 1829 CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); 1700 1830 if (unlikely(ret)) { 1701 - ntfs_error(vol->sb, "Failed to find last attribute extent of " 1702 - "mft data attribute."); 1831 + ntfs_error(vol->sb, "Failed to find last attribute extent of mft data attribute."); 1703 1832 if (ret == -ENOENT) 1704 1833 ret = -EIO; 1705 1834 goto undo_alloc; 1706 1835 } 1707 1836 a = ctx->attr; 1708 - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1837 + ll = le64_to_cpu(a->data.non_resident.lowest_vcn); 1838 + 1839 + down_write(&mft_ni->runlist.lock); 1709 1840 /* Search back for the previous last allocated cluster of mft bitmap. */ 1710 1841 for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { 1711 1842 if (ll >= rl2->vcn) 1712 1843 break; 1713 1844 } 1714 - BUG_ON(ll < rl2->vcn); 1715 - BUG_ON(ll >= rl2->vcn + rl2->length); 1845 + WARN_ON(ll < rl2->vcn); 1846 + WARN_ON(ll >= rl2->vcn + rl2->length); 1716 1847 /* Get the size for the new mapping pairs array for this extent. 
*/ 1717 - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); 1848 + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1); 1718 1849 if (unlikely(mp_size <= 0)) { 1719 - ntfs_error(vol->sb, "Get size for mapping pairs failed for " 1720 - "mft data attribute extent."); 1850 + ntfs_error(vol->sb, 1851 + "Get size for mapping pairs failed for mft data attribute extent."); 1721 1852 ret = mp_size; 1722 1853 if (!ret) 1723 1854 ret = -EIO; 1855 + up_write(&mft_ni->runlist.lock); 1724 1856 goto undo_alloc; 1725 1857 } 1858 + up_write(&mft_ni->runlist.lock); 1859 + 1726 1860 /* Expand the attribute record if necessary. */ 1727 1861 old_alen = le32_to_cpu(a->length); 1728 1862 ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + 1729 1863 le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); 1730 1864 if (unlikely(ret)) { 1731 - if (ret != -ENOSPC) { 1732 - ntfs_error(vol->sb, "Failed to resize attribute " 1733 - "record for mft data attribute."); 1734 - goto undo_alloc; 1735 - } 1736 - // TODO: Deal with this by moving this extent to a new mft 1737 - // record or by starting a new extent in a new mft record or by 1738 - // moving other attributes out of this mft record. 1739 - // Note: Use the special reserved mft records and ensure that 1740 - // this extent is not required to find the mft record in 1741 - // question. If no free special records left we would need to 1742 - // move an existing record away, insert ours in its place, and 1743 - // then place the moved record into the newly allocated space 1744 - // and we would then need to update all references to this mft 1745 - // record appropriately. This is rather complicated... 1746 - ntfs_error(vol->sb, "Not enough space in this mft record to " 1747 - "accommodate extended mft data attribute " 1748 - "extent. 
Cannot handle this yet."); 1749 - ret = -EOPNOTSUPP; 1865 + ret = ntfs_mft_attr_extend(mft_ni); 1866 + if (!ret) 1867 + goto extended_ok; 1868 + if (ret != -EAGAIN) 1869 + mp_extended = true; 1750 1870 goto undo_alloc; 1751 1871 } 1752 1872 mp_rebuilt = true; 1753 1873 /* Generate the mapping pairs array directly into the attr record. */ 1754 - ret = ntfs_mapping_pairs_build(vol, (u8*)a + 1874 + ret = ntfs_mapping_pairs_build(vol, (u8 *)a + 1755 1875 le16_to_cpu(a->data.non_resident.mapping_pairs_offset), 1756 - mp_size, rl2, ll, -1, NULL); 1876 + mp_size, rl2, ll, -1, NULL, NULL, NULL); 1757 1877 if (unlikely(ret)) { 1758 - ntfs_error(vol->sb, "Failed to build mapping pairs array of " 1759 - "mft data attribute."); 1878 + ntfs_error(vol->sb, "Failed to build mapping pairs array of mft data attribute."); 1760 1879 goto undo_alloc; 1761 1880 } 1762 1881 /* Update the highest_vcn. */ 1763 - a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); 1882 + a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1); 1764 1883 /* 1765 1884 * We now have extended the mft data allocated_size by nr clusters. 1766 - * Reflect this in the ntfs_inode structure and the attribute record. 1885 + * Reflect this in the struct ntfs_inode structure and the attribute record. 1767 1886 * @rl is the last (non-terminator) runlist element of mft data 1768 1887 * attribute. 1769 1888 */ ··· 1761 1902 * We are not in the first attribute extent, switch to it, but 1762 1903 * first ensure the changes will make it to disk later. 
1763 1904 */ 1764 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1765 1905 mark_mft_record_dirty(ctx->ntfs_ino); 1906 + extended_ok: 1766 1907 ntfs_attr_reinit_search_ctx(ctx); 1767 1908 ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, 1768 1909 mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, 1769 1910 ctx); 1770 1911 if (unlikely(ret)) { 1771 - ntfs_error(vol->sb, "Failed to find first attribute " 1772 - "extent of mft data attribute."); 1912 + ntfs_error(vol->sb, 1913 + "Failed to find first attribute extent of mft data attribute."); 1773 1914 goto restore_undo_alloc; 1774 1915 } 1775 1916 a = ctx->attr; 1776 1917 } 1918 + 1777 1919 write_lock_irqsave(&mft_ni->size_lock, flags); 1778 - mft_ni->allocated_size += nr << vol->cluster_size_bits; 1920 + mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr); 1779 1921 a->data.non_resident.allocated_size = 1780 - cpu_to_sle64(mft_ni->allocated_size); 1922 + cpu_to_le64(mft_ni->allocated_size); 1781 1923 write_unlock_irqrestore(&mft_ni->size_lock, flags); 1782 1924 /* Ensure the changes make it to disk. 
*/ 1783 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1784 1925 mark_mft_record_dirty(ctx->ntfs_ino); 1785 1926 ntfs_attr_put_search_ctx(ctx); 1786 1927 unmap_mft_record(mft_ni); 1787 - up_write(&mft_ni->runlist.lock); 1788 1928 ntfs_debug("Done."); 1789 1929 return 0; 1790 1930 restore_undo_alloc: 1791 1931 ntfs_attr_reinit_search_ctx(ctx); 1792 1932 if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, 1793 1933 CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { 1794 - ntfs_error(vol->sb, "Failed to find last attribute extent of " 1795 - "mft data attribute.%s", es); 1934 + ntfs_error(vol->sb, 1935 + "Failed to find last attribute extent of mft data attribute.%s", es); 1796 1936 write_lock_irqsave(&mft_ni->size_lock, flags); 1797 - mft_ni->allocated_size += nr << vol->cluster_size_bits; 1937 + mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr); 1798 1938 write_unlock_irqrestore(&mft_ni->size_lock, flags); 1799 1939 ntfs_attr_put_search_ctx(ctx); 1800 1940 unmap_mft_record(mft_ni); ··· 1806 1948 return ret; 1807 1949 } 1808 1950 ctx->attr->data.non_resident.highest_vcn = 1809 - cpu_to_sle64(old_last_vcn - 1); 1951 + cpu_to_le64(old_last_vcn - 1); 1810 1952 undo_alloc: 1811 1953 if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { 1812 - ntfs_error(vol->sb, "Failed to free clusters from mft data " 1813 - "attribute.%s", es); 1954 + ntfs_error(vol->sb, "Failed to free clusters from mft data attribute.%s", es); 1814 1955 NVolSetErrors(vol); 1815 1956 } 1816 1957 1817 1958 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { 1818 - ntfs_error(vol->sb, "Failed to truncate mft data attribute " 1819 - "runlist.%s", es); 1959 + ntfs_error(vol->sb, "Failed to truncate mft data attribute runlist.%s", es); 1960 + NVolSetErrors(vol); 1961 + } 1962 + if (mp_extended && ntfs_attr_update_mapping_pairs(mft_ni, 0)) { 1963 + ntfs_error(vol->sb, "Failed to restore mapping pairs.%s", 1964 + es); 1820 1965 NVolSetErrors(vol); 1821 1966 } 1822 1967 if (ctx) { 
··· 1829 1968 a->data.non_resident.mapping_pairs_offset), 1830 1969 old_alen - le16_to_cpu( 1831 1970 a->data.non_resident.mapping_pairs_offset), 1832 - rl2, ll, -1, NULL)) { 1833 - ntfs_error(vol->sb, "Failed to restore mapping pairs " 1834 - "array.%s", es); 1971 + rl2, ll, -1, NULL, NULL, NULL)) { 1972 + ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es); 1835 1973 NVolSetErrors(vol); 1836 1974 } 1837 1975 if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { 1838 - ntfs_error(vol->sb, "Failed to restore attribute " 1839 - "record.%s", es); 1976 + ntfs_error(vol->sb, "Failed to restore attribute record.%s", es); 1840 1977 NVolSetErrors(vol); 1841 1978 } 1842 - flush_dcache_mft_record_page(ctx->ntfs_ino); 1843 1979 mark_mft_record_dirty(ctx->ntfs_ino); 1844 1980 } else if (IS_ERR(ctx->mrec)) { 1845 - ntfs_error(vol->sb, "Failed to restore attribute search " 1846 - "context.%s", es); 1981 + ntfs_error(vol->sb, "Failed to restore attribute search context.%s", es); 1847 1982 NVolSetErrors(vol); 1848 1983 } 1849 1984 ntfs_attr_put_search_ctx(ctx); 1850 1985 } 1851 1986 if (!IS_ERR(mrec)) 1852 1987 unmap_mft_record(mft_ni); 1853 - up_write(&mft_ni->runlist.lock); 1854 1988 return ret; 1855 1989 } 1856 1990 1857 - /** 1991 + /* 1858 1992 * ntfs_mft_record_layout - layout an mft record into a memory buffer 1859 1993 * @vol: volume to which the mft record will belong 1860 1994 * @mft_no: mft reference specifying the mft record number ··· 1862 2006 * 1863 2007 * Return 0 on success and -errno on error. 
1864 2008 */ 1865 - static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, 1866 - MFT_RECORD *m) 2009 + static int ntfs_mft_record_layout(const struct ntfs_volume *vol, const s64 mft_no, 2010 + struct mft_record *m) 1867 2011 { 1868 - ATTR_RECORD *a; 2012 + struct attr_record *a; 1869 2013 1870 2014 ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); 1871 2015 if (mft_no >= (1ll << 32)) { 1872 - ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " 1873 - "maximum of 2^32.", (long long)mft_no); 2016 + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds maximum of 2^32.", 2017 + (long long)mft_no); 1874 2018 return -ERANGE; 1875 2019 } 1876 2020 /* Start by clearing the whole mft record to gives us a clean slate. */ 1877 2021 memset(m, 0, vol->mft_record_size); 1878 2022 /* Aligned to 2-byte boundary. */ 1879 2023 if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) 1880 - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); 2024 + m->usa_ofs = cpu_to_le16((sizeof(struct mft_record_old) + 1) & ~1); 1881 2025 else { 1882 - m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); 2026 + m->usa_ofs = cpu_to_le16((sizeof(struct mft_record) + 1) & ~1); 1883 2027 /* 1884 2028 * Set the NTFS 3.1+ specific fields while we know that the 1885 2029 * volume version is 3.1+. ··· 1893 2037 NTFS_BLOCK_SIZE + 1); 1894 2038 else { 1895 2039 m->usa_count = cpu_to_le16(1); 1896 - ntfs_warning(vol->sb, "Sector size is bigger than mft record " 1897 - "size. Setting usa_count to 1. If chkdsk " 1898 - "reports this as corruption, please email " 1899 - "linux-ntfs-dev@lists.sourceforge.net stating " 1900 - "that you saw this message and that the " 1901 - "modified filesystem created was corrupt. " 1902 - "Thank you."); 2040 + ntfs_warning(vol->sb, 2041 + "Sector size is bigger than mft record size. Setting usa_count to 1. If chkdsk reports this as corruption"); 1903 2042 } 1904 2043 /* Set the update sequence number to 1. 
*/ 1905 - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); 2044 + *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); 1906 2045 m->lsn = 0; 1907 2046 m->sequence_number = cpu_to_le16(1); 1908 2047 m->link_count = 0; ··· 1918 2067 m->base_mft_record = 0; 1919 2068 m->next_attr_instance = 0; 1920 2069 /* Add the termination attribute. */ 1921 - a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); 2070 + a = (struct attr_record *)((u8 *)m + le16_to_cpu(m->attrs_offset)); 1922 2071 a->type = AT_END; 1923 2072 a->length = 0; 1924 2073 ntfs_debug("Done."); 1925 2074 return 0; 1926 2075 } 1927 2076 1928 - /** 2077 + /* 1929 2078 * ntfs_mft_record_format - format an mft record on an ntfs volume 1930 2079 * @vol: volume on which to format the mft record 1931 2080 * @mft_no: mft record number to format ··· 1936 2085 * 1937 2086 * Return 0 on success and -errno on error. 1938 2087 */ 1939 - static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) 2088 + static int ntfs_mft_record_format(const struct ntfs_volume *vol, const s64 mft_no) 1940 2089 { 1941 2090 loff_t i_size; 1942 2091 struct inode *mft_vi = vol->mft_ino; 1943 - struct page *page; 1944 - MFT_RECORD *m; 2092 + struct folio *folio; 2093 + struct mft_record *m; 1945 2094 pgoff_t index, end_index; 1946 2095 unsigned int ofs; 1947 2096 int err; ··· 1951 2100 * The index into the page cache and the offset within the page cache 1952 2101 * page of the wanted mft record. 1953 2102 */ 1954 - index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; 1955 - ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; 2103 + index = NTFS_MFT_NR_TO_PIDX(vol, mft_no); 2104 + ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no); 1956 2105 /* The maximum valid index into the page cache for $MFT's data. 
*/ 1957 2106 i_size = i_size_read(mft_vi); 1958 2107 end_index = i_size >> PAGE_SHIFT; 1959 2108 if (unlikely(index >= end_index)) { 1960 - if (unlikely(index > end_index || ofs + vol->mft_record_size >= 1961 - (i_size & ~PAGE_MASK))) { 1962 - ntfs_error(vol->sb, "Tried to format non-existing mft " 1963 - "record 0x%llx.", (long long)mft_no); 2109 + if (unlikely(index > end_index || 2110 + ofs + vol->mft_record_size > (i_size & ~PAGE_MASK))) { 2111 + ntfs_error(vol->sb, "Tried to format non-existing mft record 0x%llx.", 2112 + (long long)mft_no); 1964 2113 return -ENOENT; 1965 2114 } 1966 2115 } 1967 - /* Read, map, and pin the page containing the mft record. */ 1968 - page = ntfs_map_page(mft_vi->i_mapping, index); 1969 - if (IS_ERR(page)) { 1970 - ntfs_error(vol->sb, "Failed to map page containing mft record " 1971 - "to format 0x%llx.", (long long)mft_no); 1972 - return PTR_ERR(page); 2116 + 2117 + /* Read, map, and pin the folio containing the mft record. */ 2118 + folio = read_mapping_folio(mft_vi->i_mapping, index, NULL); 2119 + if (IS_ERR(folio)) { 2120 + ntfs_error(vol->sb, "Failed to map page containing mft record to format 0x%llx.", 2121 + (long long)mft_no); 2122 + return PTR_ERR(folio); 1973 2123 } 1974 - lock_page(page); 1975 - BUG_ON(!PageUptodate(page)); 1976 - ClearPageUptodate(page); 1977 - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); 2124 + folio_lock(folio); 2125 + folio_clear_uptodate(folio); 2126 + m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs); 1978 2127 err = ntfs_mft_record_layout(vol, mft_no, m); 1979 2128 if (unlikely(err)) { 1980 2129 ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", 1981 2130 (long long)mft_no); 1982 - SetPageUptodate(page); 1983 - unlock_page(page); 1984 - ntfs_unmap_page(page); 2131 + folio_mark_uptodate(folio); 2132 + folio_unlock(folio); 2133 + kunmap_local(m); 2134 + folio_put(folio); 1985 2135 return err; 1986 2136 } 1987 - flush_dcache_page(page); 1988 - SetPageUptodate(page); 1989 
- unlock_page(page); 2137 + pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size); 2138 + folio_mark_uptodate(folio); 1990 2139 /* 1991 2140 * Make sure the mft record is written out to disk. We could use 1992 2141 * ilookup5() to check if an inode is in icache and so on but this is 1993 2142 * unnecessary as ntfs_writepage() will write the dirty record anyway. 1994 2143 */ 1995 - mark_ntfs_record_dirty(page, ofs); 1996 - ntfs_unmap_page(page); 2144 + ntfs_mft_mark_dirty(folio); 2145 + folio_unlock(folio); 2146 + kunmap_local(m); 2147 + folio_put(folio); 1997 2148 ntfs_debug("Done."); 1998 2149 return 0; 1999 2150 } 2000 2151 2001 - /** 2152 + /* 2002 2153 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume 2003 2154 * @vol: [IN] volume on which to allocate the mft record 2004 2155 * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 2156 + * @ni: [OUT] on success, set to the allocated ntfs inode 2005 2157 * @base_ni: [IN] open base inode if allocating an extent mft record or NULL 2006 - * @mrec: [OUT] on successful return this is the mapped mft record 2158 + * @ni_mrec: [OUT] on successful return this is the mapped mft record 2007 2159 * 2008 2160 * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. 2009 2161 * ··· 2034 2180 * optimize this we start scanning at the place specified by @base_ni or if 2035 2181 * @base_ni is NULL we start where we last stopped and we perform wrap around 2036 2182 * when we reach the end. Note, we do not try to allocate mft records below 2037 - * number 24 because numbers 0 to 15 are the defined system files anyway and 16 2038 - * to 24 are special in that they are used for storing extension mft records 2183 + * number 64 because numbers 0 to 15 are the defined system files anyway and 16 2184 + * to 64 are special in that they are used for storing extension mft records 2039 2185 * for the $DATA attribute of $MFT. 
This is required to avoid the possibility 2040 2186 * of creating a runlist with a circular dependency which once written to disk 2041 2187 * can never be read in again. Windows will only use records 16 to 24 for ··· 2045 2191 * doing this at some later time, it does not matter much for now. 2046 2192 * 2047 2193 * When scanning the mft bitmap, we only search up to the last allocated mft 2048 - * record. If there are no free records left in the range 24 to number of 2194 + * record. If there are no free records left in the range 64 to number of 2049 2195 * allocated mft records, then we extend the $MFT/$DATA attribute in order to 2050 2196 * create free mft records. We extend the allocated size of $MFT/$DATA by 16 2051 2197 * records at a time or one cluster, if cluster size is above 16kiB. If there ··· 2054 2200 * 2055 2201 * No matter how many mft records we allocate, we initialize only the first 2056 2202 * allocated mft record, incrementing mft data size and initialized size 2057 - * accordingly, open an ntfs_inode for it and return it to the caller, unless 2058 - * there are less than 24 mft records, in which case we allocate and initialize 2059 - * mft records until we reach record 24 which we consider as the first free mft 2203 + * accordingly, open a struct ntfs_inode for it and return it to the caller, unless 2204 + * there are fewer than 64 mft records, in which case we allocate and initialize 2205 + * mft records until we reach record 64 which we consider as the first free mft 2060 2206 * record for use by normal files. 2061 2207 * 2062 2208 * If during any stage we overflow the initialized data in the mft bitmap, we 2063 2209 * extend the initialized size (and data size) by 8 bytes, allocating another 2064 2210 * cluster if required. The bitmap data size has to be at least equal to the 2065 2211 * number of mft records in the mft, but it can be bigger, in which case the 2066 - * superflous bits are padded with zeroes.
2212 + * superfluous bits are padded with zeroes. 2067 2213 * 2068 2214 * Thus, when we return successfully (IS_ERR() is false), we will have: 2069 2215 * - initialized / extended the mft bitmap if necessary, 2070 2216 * - initialized / extended the mft data if necessary, 2071 2217 * - set the bit corresponding to the mft record being allocated in the 2072 2218 * mft bitmap, 2073 - * - opened an ntfs_inode for the allocated mft record, and we will have 2074 - * - returned the ntfs_inode as well as the allocated mapped, pinned, and 2219 + * - opened a struct ntfs_inode for the allocated mft record, and we will have 2220 + * - returned the struct ntfs_inode as well as the allocated mapped, pinned, and 2075 2221 * locked mft record. 2076 2222 * 2077 2223 * On error, the volume will be left in a consistent state and no record will ··· 2091 2237 * easier because otherwise there might be circular invocations of functions 2092 2238 * when reading the bitmap. 2093 2239 */ 2094 - ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, 2095 - ntfs_inode *base_ni, MFT_RECORD **mrec) 2240 + int ntfs_mft_record_alloc(struct ntfs_volume *vol, const int mode, 2241 + struct ntfs_inode **ni, struct ntfs_inode *base_ni, 2242 + struct mft_record **ni_mrec) 2096 2243 { 2097 2244 s64 ll, bit, old_data_initialized, old_data_size; 2098 2245 unsigned long flags; 2099 - struct inode *vi; 2100 - struct page *page; 2101 - ntfs_inode *mft_ni, *mftbmp_ni, *ni; 2102 - ntfs_attr_search_ctx *ctx; 2103 - MFT_RECORD *m; 2104 - ATTR_RECORD *a; 2246 + struct folio *folio; 2247 + struct ntfs_inode *mft_ni, *mftbmp_ni; 2248 + struct ntfs_attr_search_ctx *ctx; 2249 + struct mft_record *m = NULL; 2250 + struct attr_record *a; 2105 2251 pgoff_t index; 2106 2252 unsigned int ofs; 2107 2253 int err; 2108 - le16 seq_no, usn; 2254 + __le16 seq_no, usn; 2109 2255 bool record_formatted = false; 2256 + unsigned int memalloc_flags; 2110 2257 2111 - if (base_ni) { 2112 - ntfs_debug("Entering
(allocating an extent mft record for " 2113 - "base mft record 0x%llx).", 2258 + if (base_ni && *ni) 2259 + return -EINVAL; 2260 + 2261 + /* @mode and @base_ni are mutually exclusive. */ 2262 + if (mode && base_ni) 2263 + return -EINVAL; 2264 + 2265 + if (base_ni) 2266 + ntfs_debug("Entering (allocating an extent mft record for base mft record 0x%llx).", 2114 2267 (long long)base_ni->mft_no); 2115 - /* @mode and @base_ni are mutually exclusive. */ 2116 - BUG_ON(mode); 2117 - } else 2268 + else 2118 2269 ntfs_debug("Entering (allocating a base mft record)."); 2119 - if (mode) { 2120 - /* @mode and @base_ni are mutually exclusive. */ 2121 - BUG_ON(base_ni); 2122 - /* We only support creation of normal files and directories. */ 2123 - if (!S_ISREG(mode) && !S_ISDIR(mode)) 2124 - return ERR_PTR(-EOPNOTSUPP); 2125 - } 2126 - BUG_ON(!mrec); 2270 + 2271 + memalloc_flags = memalloc_nofs_save(); 2272 + 2127 2273 mft_ni = NTFS_I(vol->mft_ino); 2274 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2275 + mutex_lock(&mft_ni->mrec_lock); 2128 2276 mftbmp_ni = NTFS_I(vol->mftbmp_ino); 2129 - down_write(&vol->mftbmp_lock); 2277 + search_free_rec: 2278 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2279 + down_write(&vol->mftbmp_lock); 2130 2280 bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); 2131 2281 if (bit >= 0) { 2132 2282 ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", ··· 2138 2280 goto have_alloc_rec; 2139 2281 } 2140 2282 if (bit != -ENOSPC) { 2141 - up_write(&vol->mftbmp_lock); 2142 - return ERR_PTR(bit); 2283 + if (!base_ni || base_ni->mft_no != FILE_MFT) { 2284 + up_write(&vol->mftbmp_lock); 2285 + mutex_unlock(&mft_ni->mrec_lock); 2286 + } 2287 + memalloc_nofs_restore(memalloc_flags); 2288 + return bit; 2143 2289 } 2290 + 2291 + if (base_ni && base_ni->mft_no == FILE_MFT) { 2292 + memalloc_nofs_restore(memalloc_flags); 2293 + return bit; 2294 + } 2295 + 2144 2296 /* 2145 2297 * No free mft records left. 
If the mft bitmap already covers more 2146 2298 * than the currently used mft records, the next records are all free, ··· 2165 2297 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2166 2298 old_data_initialized = mftbmp_ni->initialized_size; 2167 2299 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 2168 - if (old_data_initialized << 3 > ll && old_data_initialized > 3) { 2300 + if (old_data_initialized << 3 > ll && 2301 + old_data_initialized > RESERVED_MFT_RECORDS / 8) { 2169 2302 bit = ll; 2170 - if (bit < 24) 2171 - bit = 24; 2303 + if (bit < RESERVED_MFT_RECORDS) 2304 + bit = RESERVED_MFT_RECORDS; 2172 2305 if (unlikely(bit >= (1ll << 32))) 2173 2306 goto max_err_out; 2174 2307 ntfs_debug("Found free record (#2), bit 0x%llx.", ··· 2186 2317 goto max_err_out; 2187 2318 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2188 2319 old_data_size = mftbmp_ni->allocated_size; 2189 - ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " 2190 - "data_size 0x%llx, initialized_size 0x%llx.", 2191 - (long long)old_data_size, 2192 - (long long)i_size_read(vol->mftbmp_ino), 2193 - (long long)old_data_initialized); 2320 + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2321 + old_data_size, i_size_read(vol->mftbmp_ino), 2322 + old_data_initialized); 2194 2323 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 2195 2324 if (old_data_initialized + 8 > old_data_size) { 2196 2325 /* Need to extend bitmap by one more cluster. 
*/ 2197 2326 ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); 2198 2327 err = ntfs_mft_bitmap_extend_allocation_nolock(vol); 2328 + if (err == -EAGAIN) 2329 + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); 2330 + 2199 2331 if (unlikely(err)) { 2200 - up_write(&vol->mftbmp_lock); 2332 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2333 + up_write(&vol->mftbmp_lock); 2201 2334 goto err_out; 2202 2335 } 2203 2336 #ifdef DEBUG 2204 2337 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2205 - ntfs_debug("Status of mftbmp after allocation extension: " 2206 - "allocated_size 0x%llx, data_size 0x%llx, " 2207 - "initialized_size 0x%llx.", 2208 - (long long)mftbmp_ni->allocated_size, 2209 - (long long)i_size_read(vol->mftbmp_ino), 2210 - (long long)mftbmp_ni->initialized_size); 2338 + ntfs_debug("Status of mftbmp after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2339 + mftbmp_ni->allocated_size, 2340 + i_size_read(vol->mftbmp_ino), 2341 + mftbmp_ni->initialized_size); 2211 2342 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 2212 2343 #endif /* DEBUG */ 2213 2344 } ··· 2218 2349 */ 2219 2350 err = ntfs_mft_bitmap_extend_initialized_nolock(vol); 2220 2351 if (unlikely(err)) { 2221 - up_write(&vol->mftbmp_lock); 2352 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2353 + up_write(&vol->mftbmp_lock); 2222 2354 goto err_out; 2223 2355 } 2224 2356 #ifdef DEBUG 2225 2357 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2226 - ntfs_debug("Status of mftbmp after initialized extension: " 2227 - "allocated_size 0x%llx, data_size 0x%llx, " 2228 - "initialized_size 0x%llx.", 2229 - (long long)mftbmp_ni->allocated_size, 2230 - (long long)i_size_read(vol->mftbmp_ino), 2231 - (long long)mftbmp_ni->initialized_size); 2358 + ntfs_debug("Status of mftbmp after initialized extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2359 + mftbmp_ni->allocated_size, 2360 + i_size_read(vol->mftbmp_ino), 2361 
+ mftbmp_ni->initialized_size); 2232 2362 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 2233 2363 #endif /* DEBUG */ 2234 2364 ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); ··· 2237 2369 err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); 2238 2370 if (unlikely(err)) { 2239 2371 ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); 2240 - up_write(&vol->mftbmp_lock); 2372 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2373 + up_write(&vol->mftbmp_lock); 2241 2374 goto err_out; 2242 2375 } 2243 2376 ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); ··· 2266 2397 * actually traversed more than once when a freshly formatted volume is 2267 2398 * first written to so it optimizes away nicely in the common case. 2268 2399 */ 2269 - read_lock_irqsave(&mft_ni->size_lock, flags); 2270 - ntfs_debug("Status of mft data before extension: " 2271 - "allocated_size 0x%llx, data_size 0x%llx, " 2272 - "initialized_size 0x%llx.", 2273 - (long long)mft_ni->allocated_size, 2274 - (long long)i_size_read(vol->mft_ino), 2275 - (long long)mft_ni->initialized_size); 2276 - while (ll > mft_ni->allocated_size) { 2277 - read_unlock_irqrestore(&mft_ni->size_lock, flags); 2278 - err = ntfs_mft_data_extend_allocation_nolock(vol); 2279 - if (unlikely(err)) { 2280 - ntfs_error(vol->sb, "Failed to extend mft data " 2281 - "allocation."); 2282 - goto undo_mftbmp_alloc_nolock; 2283 - } 2400 + if (!base_ni || base_ni->mft_no != FILE_MFT) { 2284 2401 read_lock_irqsave(&mft_ni->size_lock, flags); 2285 - ntfs_debug("Status of mft data after allocation extension: " 2286 - "allocated_size 0x%llx, data_size 0x%llx, " 2287 - "initialized_size 0x%llx.", 2288 - (long long)mft_ni->allocated_size, 2289 - (long long)i_size_read(vol->mft_ino), 2290 - (long long)mft_ni->initialized_size); 2402 + ntfs_debug("Status of mft data before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2403 + mft_ni->allocated_size, i_size_read(vol->mft_ino), 
2404 + mft_ni->initialized_size); 2405 + while (ll > mft_ni->allocated_size) { 2406 + read_unlock_irqrestore(&mft_ni->size_lock, flags); 2407 + err = ntfs_mft_data_extend_allocation_nolock(vol); 2408 + if (err == -EAGAIN) 2409 + err = ntfs_mft_data_extend_allocation_nolock(vol); 2410 + 2411 + if (unlikely(err)) { 2412 + ntfs_error(vol->sb, "Failed to extend mft data allocation."); 2413 + goto undo_mftbmp_alloc_nolock; 2414 + } 2415 + read_lock_irqsave(&mft_ni->size_lock, flags); 2416 + ntfs_debug("Status of mft data after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2417 + mft_ni->allocated_size, i_size_read(vol->mft_ino), 2418 + mft_ni->initialized_size); 2419 + } 2420 + read_unlock_irqrestore(&mft_ni->size_lock, flags); 2421 + } else if (ll > mft_ni->allocated_size) { 2422 + err = -ENOSPC; 2423 + goto undo_mftbmp_alloc_nolock; 2291 2424 } 2292 - read_unlock_irqrestore(&mft_ni->size_lock, flags); 2293 2425 /* 2294 2426 * Extend mft data initialized size (and data size of course) to reach 2295 2427 * the allocated mft record, formatting the mft records along the way. 2296 - * Note: We only modify the ntfs_inode structure as that is all that is 2428 + * Note: We only modify the struct ntfs_inode structure as that is all that is 2297 2429 * needed by ntfs_mft_record_format(). We will update the attribute 2298 2430 * record itself in one fell swoop later on.
2299 2431 */ ··· 2303 2433 old_data_size = vol->mft_ino->i_size; 2304 2434 while (ll > mft_ni->initialized_size) { 2305 2435 s64 new_initialized_size, mft_no; 2306 - 2436 + 2307 2437 new_initialized_size = mft_ni->initialized_size + 2308 2438 vol->mft_record_size; 2309 2439 mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; ··· 2339 2469 err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, 2340 2470 CASE_SENSITIVE, 0, NULL, 0, ctx); 2341 2471 if (unlikely(err)) { 2342 - ntfs_error(vol->sb, "Failed to find first attribute extent of " 2343 - "mft data attribute."); 2472 + ntfs_error(vol->sb, "Failed to find first attribute extent of mft data attribute."); 2344 2473 ntfs_attr_put_search_ctx(ctx); 2345 2474 unmap_mft_record(mft_ni); 2346 2475 goto undo_data_init; ··· 2347 2478 a = ctx->attr; 2348 2479 read_lock_irqsave(&mft_ni->size_lock, flags); 2349 2480 a->data.non_resident.initialized_size = 2350 - cpu_to_sle64(mft_ni->initialized_size); 2481 + cpu_to_le64(mft_ni->initialized_size); 2351 2482 a->data.non_resident.data_size = 2352 - cpu_to_sle64(i_size_read(vol->mft_ino)); 2483 + cpu_to_le64(i_size_read(vol->mft_ino)); 2353 2484 read_unlock_irqrestore(&mft_ni->size_lock, flags); 2354 2485 /* Ensure the changes make it to disk. 
*/ 2355 - flush_dcache_mft_record_page(ctx->ntfs_ino); 2356 2486 mark_mft_record_dirty(ctx->ntfs_ino); 2357 2487 ntfs_attr_put_search_ctx(ctx); 2358 2488 unmap_mft_record(mft_ni); 2359 2489 read_lock_irqsave(&mft_ni->size_lock, flags); 2360 - ntfs_debug("Status of mft data after mft record initialization: " 2361 - "allocated_size 0x%llx, data_size 0x%llx, " 2362 - "initialized_size 0x%llx.", 2363 - (long long)mft_ni->allocated_size, 2364 - (long long)i_size_read(vol->mft_ino), 2365 - (long long)mft_ni->initialized_size); 2366 - BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); 2367 - BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); 2490 + ntfs_debug("Status of mft data after mft record initialization: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.", 2491 + mft_ni->allocated_size, i_size_read(vol->mft_ino), 2492 + mft_ni->initialized_size); 2493 + WARN_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); 2494 + WARN_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); 2368 2495 read_unlock_irqrestore(&mft_ni->size_lock, flags); 2369 2496 mft_rec_already_initialized: 2370 2497 /* ··· 2372 2507 * that it is allocated in the mft bitmap means that no-one will try to 2373 2508 * allocate it either. 2374 2509 */ 2375 - up_write(&vol->mftbmp_lock); 2510 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2511 + up_write(&vol->mftbmp_lock); 2376 2512 /* 2377 2513 * We now have allocated and initialized the mft record. Calculate the 2378 2514 * index of and the offset within the page cache page the record is in. 2379 2515 */ 2380 - index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; 2381 - ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; 2382 - /* Read, map, and pin the page containing the mft record. 
*/ 2383 - page = ntfs_map_page(vol->mft_ino->i_mapping, index); 2384 - if (IS_ERR(page)) { 2385 - ntfs_error(vol->sb, "Failed to map page containing allocated " 2386 - "mft record 0x%llx.", (long long)bit); 2387 - err = PTR_ERR(page); 2516 + index = NTFS_MFT_NR_TO_PIDX(vol, bit); 2517 + ofs = NTFS_MFT_NR_TO_POFS(vol, bit); 2518 + /* Read, map, and pin the folio containing the mft record. */ 2519 + folio = read_mapping_folio(vol->mft_ino->i_mapping, index, NULL); 2520 + if (IS_ERR(folio)) { 2521 + ntfs_error(vol->sb, "Failed to map page containing allocated mft record 0x%llx.", 2522 + bit); 2523 + err = PTR_ERR(folio); 2388 2524 goto undo_mftbmp_alloc; 2389 2525 } 2390 - lock_page(page); 2391 - BUG_ON(!PageUptodate(page)); 2392 - ClearPageUptodate(page); 2393 - m = (MFT_RECORD*)((u8*)page_address(page) + ofs); 2526 + folio_lock(folio); 2527 + folio_clear_uptodate(folio); 2528 + m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs); 2394 2529 /* If we just formatted the mft record no need to do it again. */ 2395 2530 if (!record_formatted) { 2396 2531 /* Sanity check that the mft record is really not in use. */ 2397 2532 if (ntfs_is_file_record(m->magic) && 2398 2533 (m->flags & MFT_RECORD_IN_USE)) { 2399 - ntfs_error(vol->sb, "Mft record 0x%llx was marked " 2400 - "free in mft bitmap but is marked " 2401 - "used itself. Corrupt filesystem. " 2402 - "Unmount and run chkdsk.", 2403 - (long long)bit); 2404 - err = -EIO; 2405 - SetPageUptodate(page); 2406 - unlock_page(page); 2407 - ntfs_unmap_page(page); 2534 + ntfs_warning(vol->sb, 2535 + "Mft record 0x%llx was marked free in mft bitmap but is marked used itself. 
Unmount and run chkdsk.", 2536 + bit); 2537 + folio_mark_uptodate(folio); 2538 + folio_unlock(folio); 2539 + kunmap_local(m); 2540 + folio_put(folio); 2408 2541 NVolSetErrors(vol); 2409 - goto undo_mftbmp_alloc; 2542 + goto search_free_rec; 2410 2543 } 2411 2544 /* 2412 2545 * We need to (re-)format the mft record, preserving the ··· 2414 2551 * wrong with the previous mft record. 2415 2552 */ 2416 2553 seq_no = m->sequence_number; 2417 - usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); 2554 + usn = *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)); 2418 2555 err = ntfs_mft_record_layout(vol, bit, m); 2419 2556 if (unlikely(err)) { 2420 - ntfs_error(vol->sb, "Failed to layout allocated mft " 2421 - "record 0x%llx.", (long long)bit); 2422 - SetPageUptodate(page); 2423 - unlock_page(page); 2424 - ntfs_unmap_page(page); 2557 + ntfs_error(vol->sb, "Failed to layout allocated mft record 0x%llx.", 2558 + bit); 2559 + folio_mark_uptodate(folio); 2560 + folio_unlock(folio); 2561 + kunmap_local(m); 2562 + folio_put(folio); 2425 2563 goto undo_mftbmp_alloc; 2426 2564 } 2427 2565 if (seq_no) 2428 2566 m->sequence_number = seq_no; 2429 2567 if (usn && le16_to_cpu(usn) != 0xffff) 2430 - *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; 2568 + *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = usn; 2569 + pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size); 2431 2570 } 2432 2571 /* Set the mft record itself in use. */ 2433 2572 m->flags |= MFT_RECORD_IN_USE; 2434 2573 if (S_ISDIR(mode)) 2435 2574 m->flags |= MFT_RECORD_IS_DIRECTORY; 2436 - flush_dcache_page(page); 2437 - SetPageUptodate(page); 2575 + folio_mark_uptodate(folio); 2438 2576 if (base_ni) { 2439 - MFT_RECORD *m_tmp; 2577 + struct mft_record *m_tmp; 2440 2578 2441 2579 /* 2442 2580 * Setup the base mft record in the extent mft record. This ··· 2451 2587 * attach it to the base inode @base_ni and map, pin, and lock 2452 2588 * its, i.e. the allocated, mft record. 
2453 2589 */ 2454 - m_tmp = map_extent_mft_record(base_ni, bit, &ni); 2590 + m_tmp = map_extent_mft_record(base_ni, 2591 + MK_MREF(bit, le16_to_cpu(m->sequence_number)), 2592 + ni); 2455 2593 if (IS_ERR(m_tmp)) { 2456 - ntfs_error(vol->sb, "Failed to map allocated extent " 2457 - "mft record 0x%llx.", (long long)bit); 2594 + ntfs_error(vol->sb, "Failed to map allocated extent mft record 0x%llx.", 2595 + bit); 2458 2596 err = PTR_ERR(m_tmp); 2459 2597 /* Set the mft record itself not in use. */ 2460 2598 m->flags &= cpu_to_le16( 2461 2599 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2462 - flush_dcache_page(page); 2463 2600 /* Make sure the mft record is written out to disk. */ 2464 - mark_ntfs_record_dirty(page, ofs); 2465 - unlock_page(page); 2466 - ntfs_unmap_page(page); 2601 + ntfs_mft_mark_dirty(folio); 2602 + folio_unlock(folio); 2603 + kunmap_local(m); 2604 + folio_put(folio); 2467 2605 goto undo_mftbmp_alloc; 2468 2606 } 2469 - BUG_ON(m != m_tmp); 2607 + 2470 2608 /* 2471 2609 * Make sure the allocated mft record is written out to disk. 2472 2610 * No need to set the inode dirty because the caller is going ··· 2476 2610 * record (e.g. at a minimum a new attribute will be added to 2477 2611 * the mft record. 2478 2612 */ 2479 - mark_ntfs_record_dirty(page, ofs); 2480 - unlock_page(page); 2613 + ntfs_mft_mark_dirty(folio); 2614 + folio_unlock(folio); 2481 2615 /* 2482 2616 * Need to unmap the page since map_extent_mft_record() mapped 2483 2617 * it as well so we have it mapped twice at the moment. 2484 2618 */ 2485 - ntfs_unmap_page(page); 2619 + kunmap_local(m); 2620 + folio_put(folio); 2486 2621 } else { 2487 - /* 2488 - * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink 2489 - * is set to 1 but the mft record->link_count is 0. The caller 2490 - * needs to bear this in mind. 2491 - */ 2492 - vi = new_inode(vol->sb); 2493 - if (unlikely(!vi)) { 2494 - err = -ENOMEM; 2495 - /* Set the mft record itself not in use. 
*/ 2496 - m->flags &= cpu_to_le16( 2497 - ~le16_to_cpu(MFT_RECORD_IN_USE)); 2498 - flush_dcache_page(page); 2499 - /* Make sure the mft record is written out to disk. */ 2500 - mark_ntfs_record_dirty(page, ofs); 2501 - unlock_page(page); 2502 - ntfs_unmap_page(page); 2503 - goto undo_mftbmp_alloc; 2504 - } 2505 - vi->i_ino = bit; 2506 - 2507 - /* The owner and group come from the ntfs volume. */ 2508 - vi->i_uid = vol->uid; 2509 - vi->i_gid = vol->gid; 2510 - 2511 - /* Initialize the ntfs specific part of @vi. */ 2512 - ntfs_init_big_inode(vi); 2513 - ni = NTFS_I(vi); 2514 - /* 2515 - * Set the appropriate mode, attribute type, and name. For 2516 - * directories, also setup the index values to the defaults. 2517 - */ 2518 - if (S_ISDIR(mode)) { 2519 - vi->i_mode = S_IFDIR | S_IRWXUGO; 2520 - vi->i_mode &= ~vol->dmask; 2521 - 2522 - NInoSetMstProtected(ni); 2523 - ni->type = AT_INDEX_ALLOCATION; 2524 - ni->name = I30; 2525 - ni->name_len = 4; 2526 - 2527 - ni->itype.index.block_size = 4096; 2528 - ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; 2529 - ni->itype.index.collation_rule = COLLATION_FILE_NAME; 2530 - if (vol->cluster_size <= ni->itype.index.block_size) { 2531 - ni->itype.index.vcn_size = vol->cluster_size; 2532 - ni->itype.index.vcn_size_bits = 2533 - vol->cluster_size_bits; 2534 - } else { 2535 - ni->itype.index.vcn_size = vol->sector_size; 2536 - ni->itype.index.vcn_size_bits = 2537 - vol->sector_size_bits; 2538 - } 2539 - } else { 2540 - vi->i_mode = S_IFREG | S_IRWXUGO; 2541 - vi->i_mode &= ~vol->fmask; 2542 - 2543 - ni->type = AT_DATA; 2544 - ni->name = NULL; 2545 - ni->name_len = 0; 2546 - } 2547 - if (IS_RDONLY(vi)) 2548 - vi->i_mode &= ~S_IWUGO; 2549 - 2550 - /* Set the inode times to the current time. */ 2551 - simple_inode_init_ts(vi); 2552 - /* 2553 - * Set the file size to 0, the ntfs inode sizes are set to 0 by 2554 - * the call to ntfs_init_big_inode() below. 
2555 - */ 2556 - vi->i_size = 0; 2557 - vi->i_blocks = 0; 2558 - 2559 - /* Set the sequence number. */ 2560 - vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); 2561 2622 /* 2562 2623 * Manually map, pin, and lock the mft record as we already 2563 2624 * have its page mapped and it is very easy to do. 2564 2625 */ 2565 - atomic_inc(&ni->count); 2566 - mutex_lock(&ni->mrec_lock); 2567 - ni->page = page; 2568 - ni->page_ofs = ofs; 2626 + (*ni)->seq_no = le16_to_cpu(m->sequence_number); 2569 2627 /* 2570 2628 * Make sure the allocated mft record is written out to disk. 2571 2629 * NOTE: We do not set the ntfs inode dirty because this would ··· 2500 2710 * a minimum some new attributes will be added to the mft 2501 2711 * record. 2502 2712 */ 2503 - mark_ntfs_record_dirty(page, ofs); 2504 - unlock_page(page); 2505 2713 2506 - /* Add the inode to the inode hash for the superblock. */ 2507 - insert_inode_hash(vi); 2714 + (*ni)->mrec = kmalloc(vol->mft_record_size, GFP_NOFS); 2715 + if (!(*ni)->mrec) { 2716 + folio_unlock(folio); 2717 + kunmap_local(m); 2718 + folio_put(folio); 2719 + goto undo_mftbmp_alloc; 2720 + } 2508 2721 2722 + memcpy((*ni)->mrec, m, vol->mft_record_size); 2723 + post_read_mst_fixup((struct ntfs_record *)(*ni)->mrec, vol->mft_record_size); 2724 + ntfs_mft_mark_dirty(folio); 2725 + folio_unlock(folio); 2726 + (*ni)->folio = folio; 2727 + (*ni)->folio_ofs = ofs; 2728 + atomic_inc(&(*ni)->count); 2509 2729 /* Update the default mft allocation position. */ 2510 2730 vol->mft_data_pos = bit + 1; 2511 2731 } 2732 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2733 + mutex_unlock(&mft_ni->mrec_lock); 2734 + memalloc_nofs_restore(memalloc_flags); 2735 + 2512 2736 /* 2513 2737 * Return the opened, allocated inode of the allocated mft record as 2514 2738 * well as the mapped, pinned, and locked mft record. 2515 2739 */ 2516 2740 ntfs_debug("Returning opened, allocated %sinode 0x%llx.", 2517 - base_ni ? 
"extent " : "", (long long)bit); 2518 - *mrec = m; 2519 - return ni; 2741 + base_ni ? "extent " : "", bit); 2742 + (*ni)->mft_no = bit; 2743 + if (ni_mrec) 2744 + *ni_mrec = (*ni)->mrec; 2745 + ntfs_dec_free_mft_records(vol, 1); 2746 + return 0; 2520 2747 undo_data_init: 2521 2748 write_lock_irqsave(&mft_ni->size_lock, flags); 2522 2749 mft_ni->initialized_size = old_data_initialized; ··· 2541 2734 write_unlock_irqrestore(&mft_ni->size_lock, flags); 2542 2735 goto undo_mftbmp_alloc_nolock; 2543 2736 undo_mftbmp_alloc: 2544 - down_write(&vol->mftbmp_lock); 2737 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2738 + down_write(&vol->mftbmp_lock); 2545 2739 undo_mftbmp_alloc_nolock: 2546 2740 if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { 2547 2741 ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); 2548 2742 NVolSetErrors(vol); 2549 2743 } 2550 - up_write(&vol->mftbmp_lock); 2744 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2745 + up_write(&vol->mftbmp_lock); 2551 2746 err_out: 2552 - return ERR_PTR(err); 2747 + if (!base_ni || base_ni->mft_no != FILE_MFT) 2748 + mutex_unlock(&mft_ni->mrec_lock); 2749 + memalloc_nofs_restore(memalloc_flags); 2750 + return err; 2553 2751 max_err_out: 2554 - ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " 2555 - "number of inodes (2^32) has already been reached."); 2556 - up_write(&vol->mftbmp_lock); 2557 - return ERR_PTR(-ENOSPC); 2752 + ntfs_warning(vol->sb, 2753 + "Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached."); 2754 + if (!base_ni || base_ni->mft_no != FILE_MFT) { 2755 + up_write(&vol->mftbmp_lock); 2756 + mutex_unlock(&mft_ni->mrec_lock); 2757 + } 2758 + memalloc_nofs_restore(memalloc_flags); 2759 + return -ENOSPC; 2558 2760 } 2559 2761 2560 - /** 2561 - * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume 2562 - * @ni: ntfs inode of the mapped extent mft record to free 2563 - * @m: mapped extent mft record of the 
ntfs inode @ni 2762 + /* 2763 + * ntfs_mft_record_free - free an mft record on an ntfs volume 2764 + * @vol: volume on which to free the mft record 2765 + * @ni: open ntfs inode of the mft record to free 2564 2766 * 2565 - * Free the mapped extent mft record @m of the extent ntfs inode @ni. 2767 + * Free the mft record of the open inode @ni on the mounted ntfs volume @vol. 2768 + * Note that this function calls ntfs_inode_close() internally and hence you 2769 + * cannot use the pointer @ni any more after this function returns success. 2566 2770 * 2567 - * Note that this function unmaps the mft record and closes and destroys @ni 2568 - * internally and hence you cannot use either @ni nor @m any more after this 2569 - * function returns success. 2570 - * 2571 - * On success return 0 and on error return -errno. @ni and @m are still valid 2572 - * in this case and have not been freed. 2573 - * 2574 - * For some errors an error message is displayed and the success code 0 is 2575 - * returned and the volume is then left dirty on umount. This makes sense in 2576 - * case we could not rollback the changes that were already done since the 2577 - * caller no longer wants to reference this mft record so it does not matter to 2578 - * the caller if something is wrong with it as long as it is properly detached 2579 - * from the base inode. 2771 + * On success return 0 and on error return -errno.
2580 2772 */ 2581 - int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) 2773 + int ntfs_mft_record_free(struct ntfs_volume *vol, struct ntfs_inode *ni) 2582 2774 { 2583 - unsigned long mft_no = ni->mft_no; 2584 - ntfs_volume *vol = ni->vol; 2585 - ntfs_inode *base_ni; 2586 - ntfs_inode **extent_nis; 2587 - int i, err; 2588 - le16 old_seq_no; 2775 + u64 mft_no; 2776 + int err; 2589 2777 u16 seq_no; 2590 - 2591 - BUG_ON(NInoAttr(ni)); 2592 - BUG_ON(ni->nr_extents != -1); 2778 + __le16 old_seq_no; 2779 + struct mft_record *ni_mrec; 2780 + unsigned int memalloc_flags; 2781 + struct ntfs_inode *base_ni; 2593 2782 2594 - mutex_lock(&ni->extent_lock); 2595 - base_ni = ni->ext.base_ntfs_ino; 2596 - mutex_unlock(&ni->extent_lock); 2783 + if (!vol || !ni) 2784 + return -EINVAL; 2597 2785 2598 - BUG_ON(base_ni->nr_extents <= 0); 2786 + ntfs_debug("Entering for inode 0x%llx.\n", (long long)ni->mft_no); 2599 2787 2600 - ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", 2601 - mft_no, base_ni->mft_no); 2788 + ni_mrec = map_mft_record(ni); 2789 + if (IS_ERR(ni_mrec)) 2790 + return -EIO; 2602 2791 2603 - mutex_lock(&base_ni->extent_lock); 2604 - 2605 - /* Make sure we are holding the only reference to the extent inode. */ 2606 - if (atomic_read(&ni->count) > 2) { 2607 - ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " 2608 - "not freeing.", base_ni->mft_no); 2609 - mutex_unlock(&base_ni->extent_lock); 2610 - return -EBUSY; 2611 - } 2612 - 2613 - /* Dissociate the ntfs inode from the base inode. 
*/ 2614 - extent_nis = base_ni->ext.extent_ntfs_inos; 2615 - err = -ENOENT; 2616 - for (i = 0; i < base_ni->nr_extents; i++) { 2617 - if (ni != extent_nis[i]) 2618 - continue; 2619 - extent_nis += i; 2620 - base_ni->nr_extents--; 2621 - memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * 2622 - sizeof(ntfs_inode*)); 2623 - err = 0; 2624 - break; 2625 - } 2626 - 2627 - mutex_unlock(&base_ni->extent_lock); 2628 - 2629 - if (unlikely(err)) { 2630 - ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " 2631 - "its base inode 0x%lx.", mft_no, 2632 - base_ni->mft_no); 2633 - BUG(); 2634 - } 2635 - 2636 - /* 2637 - * The extent inode is no longer attached to the base inode so no one 2638 - * can get a reference to it any more. 2639 - */ 2792 + /* Cache the mft reference for later. */ 2793 + mft_no = ni->mft_no; 2640 2794 2641 2795 /* Mark the mft record as not in use. */ 2642 - m->flags &= ~MFT_RECORD_IN_USE; 2796 + ni_mrec->flags &= ~MFT_RECORD_IN_USE; 2643 2797 2644 2798 /* Increment the sequence number, skipping zero, if it is not zero. */ 2645 - old_seq_no = m->sequence_number; 2799 + old_seq_no = ni_mrec->sequence_number; 2646 2800 seq_no = le16_to_cpu(old_seq_no); 2647 2801 if (seq_no == 0xffff) 2648 2802 seq_no = 1; 2649 2803 else if (seq_no) 2650 2804 seq_no++; 2651 - m->sequence_number = cpu_to_le16(seq_no); 2805 + ni_mrec->sequence_number = cpu_to_le16(seq_no); 2806 + 2807 + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); 2808 + err = ntfs_get_block_mft_record(NTFS_I(vol->mft_ino), ni); 2809 + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); 2810 + if (err) { 2811 + unmap_mft_record(ni); 2812 + return err; 2813 + } 2652 2814 2653 2815 /* 2654 2816 * Set the ntfs inode dirty and write it out. We do not need to worry ··· 2625 2849 * record to be freed is guaranteed to do it already. 
2626 2850 */ 2627 2851 NInoSetDirty(ni); 2628 - err = write_mft_record(ni, m, 0); 2629 - if (unlikely(err)) { 2630 - ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " 2631 - "freeing.", mft_no); 2632 - goto rollback; 2633 - } 2634 - rollback_error: 2635 - /* Unmap and throw away the now freed extent inode. */ 2636 - unmap_extent_mft_record(ni); 2637 - ntfs_clear_extent_inode(ni); 2852 + err = write_mft_record(ni, ni_mrec, 0); 2853 + if (err) 2854 + goto sync_rollback; 2855 + 2856 + if (likely(ni->nr_extents >= 0)) 2857 + base_ni = ni; 2858 + else 2859 + base_ni = ni->ext.base_ntfs_ino; 2638 2860 2639 2861 /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ 2640 - down_write(&vol->mftbmp_lock); 2862 + memalloc_flags = memalloc_nofs_save(); 2863 + if (base_ni->mft_no != FILE_MFT) 2864 + down_write(&vol->mftbmp_lock); 2641 2865 err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); 2642 - up_write(&vol->mftbmp_lock); 2643 - if (unlikely(err)) { 2644 - /* 2645 - * The extent inode is gone but we failed to deallocate it in 2646 - * the mft bitmap. Just emit a warning and leave the volume 2647 - * dirty on umount. 2648 - */ 2649 - ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); 2650 - NVolSetErrors(vol); 2651 - } 2652 - return 0; 2653 - rollback: 2654 - /* Rollback what we did... 
*/ 2655 - mutex_lock(&base_ni->extent_lock); 2656 - extent_nis = base_ni->ext.extent_ntfs_inos; 2657 - if (!(base_ni->nr_extents & 3)) { 2658 - int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); 2866 + if (base_ni->mft_no != FILE_MFT) 2867 + up_write(&vol->mftbmp_lock); 2868 + memalloc_nofs_restore(memalloc_flags); 2869 + if (err) 2870 + goto bitmap_rollback; 2659 2871 2660 - extent_nis = kmalloc(new_size, GFP_NOFS); 2661 - if (unlikely(!extent_nis)) { 2662 - ntfs_error(vol->sb, "Failed to allocate internal " 2663 - "buffer during rollback.%s", es); 2664 - mutex_unlock(&base_ni->extent_lock); 2665 - NVolSetErrors(vol); 2666 - goto rollback_error; 2667 - } 2668 - if (base_ni->nr_extents) { 2669 - BUG_ON(!base_ni->ext.extent_ntfs_inos); 2670 - memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, 2671 - new_size - 4 * sizeof(ntfs_inode*)); 2672 - kfree(base_ni->ext.extent_ntfs_inos); 2673 - } 2674 - base_ni->ext.extent_ntfs_inos = extent_nis; 2675 - } 2676 - m->flags |= MFT_RECORD_IN_USE; 2677 - m->sequence_number = old_seq_no; 2678 - extent_nis[base_ni->nr_extents++] = ni; 2679 - mutex_unlock(&base_ni->extent_lock); 2680 - mark_mft_record_dirty(ni); 2872 + unmap_mft_record(ni); 2873 + ntfs_inc_free_mft_records(vol, 1); 2874 + return 0; 2875 + 2876 + /* Rollback what we did... */ 2877 + bitmap_rollback: 2878 + memalloc_flags = memalloc_nofs_save(); 2879 + if (base_ni->mft_no != FILE_MFT) 2880 + down_write(&vol->mftbmp_lock); 2881 + if (ntfs_bitmap_set_bit(vol->mftbmp_ino, mft_no)) 2882 + ntfs_error(vol->sb, "ntfs_bitmap_set_bit failed in bitmap_rollback\n"); 2883 + if (base_ni->mft_no != FILE_MFT) 2884 + up_write(&vol->mftbmp_lock); 2885 + memalloc_nofs_restore(memalloc_flags); 2886 + sync_rollback: 2887 + ntfs_error(vol->sb, 2888 + "Eeek! Rollback failed in %s. 
Leaving inconsistent metadata!\n", __func__); 2889 + ni_mrec->flags |= MFT_RECORD_IN_USE; 2890 + ni_mrec->sequence_number = old_seq_no; 2891 + NInoSetDirty(ni); 2892 + write_mft_record(ni, ni_mrec, 0); 2893 + unmap_mft_record(ni); 2681 2894 return err; 2682 2895 } 2683 - #endif /* NTFS_RW */ 2896 + 2897 + static s64 lcn_from_index(struct ntfs_volume *vol, struct ntfs_inode *ni, 2898 + unsigned long index) 2899 + { 2900 + s64 vcn; 2901 + s64 lcn; 2902 + 2903 + vcn = ntfs_pidx_to_cluster(vol, index); 2904 + 2905 + down_read(&ni->runlist.lock); 2906 + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, false); 2907 + up_read(&ni->runlist.lock); 2908 + 2909 + return lcn; 2910 + } 2911 + 2912 + /* 2913 + * ntfs_write_mft_block - Write back a folio containing MFT records 2914 + * @folio: The folio to write back (contains one or more MFT records) 2915 + * @wbc: Writeback control structure 2916 + * 2917 + * This function is called as part of the address_space_operations 2918 + * .writepages implementation for the $MFT inode (or $MFTMirr). 2919 + * It handles writing one folio (normally 4KiB page) worth of MFT records 2920 + * to the underlying block device. 2921 + * 2922 + * Return: 0 on success, or -errno on error. 
2923 + */ 2924 + static int ntfs_write_mft_block(struct folio *folio, struct writeback_control *wbc) 2925 + { 2926 + struct address_space *mapping = folio->mapping; 2927 + struct inode *vi = mapping->host; 2928 + struct ntfs_inode *ni = NTFS_I(vi); 2929 + struct ntfs_volume *vol = ni->vol; 2930 + u8 *kaddr; 2931 + struct ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; 2932 + int nr_locked_nis = 0, err = 0, mft_ofs, prev_mft_ofs; 2933 + struct inode *ref_inos[PAGE_SIZE / NTFS_BLOCK_SIZE]; 2934 + int nr_ref_inos = 0; 2935 + struct bio *bio = NULL; 2936 + unsigned long mft_no; 2937 + struct ntfs_inode *tni; 2938 + s64 lcn; 2939 + s64 vcn = ntfs_pidx_to_cluster(vol, folio->index); 2940 + s64 end_vcn = ntfs_bytes_to_cluster(vol, ni->allocated_size); 2941 + unsigned int folio_sz; 2942 + struct runlist_element *rl; 2943 + loff_t i_size = i_size_read(vi); 2944 + 2945 + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, folio index 0x%lx.", 2946 + vi->i_ino, ni->type, folio->index); 2947 + 2948 + /* We have to zero every time due to mmap-at-end-of-file. */ 2949 + if (folio->index >= (i_size >> folio_shift(folio))) 2950 + /* The page straddles i_size. */ 2951 + folio_zero_segment(folio, 2952 + offset_in_folio(folio, i_size), 2953 + folio_size(folio)); 2954 + 2955 + lcn = lcn_from_index(vol, ni, folio->index); 2956 + if (lcn <= LCN_HOLE) { 2957 + folio_start_writeback(folio); 2958 + folio_unlock(folio); 2959 + folio_end_writeback(folio); 2960 + return -EIO; 2961 + } 2962 + 2963 + /* Map folio so we can access its contents. */ 2964 + kaddr = kmap_local_folio(folio, 0); 2965 + /* Clear the page uptodate flag whilst the mst fixups are applied. */ 2966 + folio_clear_uptodate(folio); 2967 + 2968 + for (mft_ofs = 0; mft_ofs < PAGE_SIZE && vcn < end_vcn; 2969 + mft_ofs += vol->mft_record_size) { 2970 + /* Get the mft record number. 
*/ 2971 + mft_no = (((s64)folio->index << PAGE_SHIFT) + mft_ofs) >> 2972 + vol->mft_record_size_bits; 2973 + vcn = ntfs_mft_no_to_cluster(vol, mft_no); 2974 + /* Check whether to write this mft record. */ 2975 + tni = NULL; 2976 + if (ntfs_may_write_mft_record(vol, mft_no, 2977 + (struct mft_record *)(kaddr + mft_ofs), 2978 + &tni, &ref_inos[nr_ref_inos])) { 2979 + unsigned int mft_record_off = 0; 2980 + s64 vcn_off = vcn; 2981 + 2982 + /* 2983 + * Skip $MFT extent mft records and let them being written 2984 + * by writeback to avioid deadlocks. the $MFT runlist 2985 + * lock must be taken before $MFT extent mrec_lock is taken. 2986 + */ 2987 + if (tni && tni->nr_extents < 0 && 2988 + tni->ext.base_ntfs_ino == NTFS_I(vol->mft_ino)) { 2989 + mutex_unlock(&tni->mrec_lock); 2990 + atomic_dec(&tni->count); 2991 + iput(vol->mft_ino); 2992 + continue; 2993 + } 2994 + 2995 + /* 2996 + * The record should be written. If a locked ntfs 2997 + * inode was returned, add it to the array of locked 2998 + * ntfs inodes. 
2999 + */ 3000 + if (tni) 3001 + locked_nis[nr_locked_nis++] = tni; 3002 + else if (ref_inos[nr_ref_inos]) 3003 + nr_ref_inos++; 3004 + 3005 + if (bio && (mft_ofs != prev_mft_ofs + vol->mft_record_size)) { 3006 + flush_bio: 3007 + bio->bi_end_io = ntfs_bio_end_io; 3008 + submit_bio(bio); 3009 + bio = NULL; 3010 + } 3011 + 3012 + if (vol->cluster_size < folio_size(folio)) { 3013 + down_write(&ni->runlist.lock); 3014 + rl = ntfs_attr_vcn_to_rl(ni, vcn_off, &lcn); 3015 + up_write(&ni->runlist.lock); 3016 + if (IS_ERR(rl) || lcn < 0) { 3017 + err = -EIO; 3018 + goto unm_done; 3019 + } 3020 + 3021 + if (bio && 3022 + (bio_end_sector(bio) >> (vol->cluster_size_bits - 9)) != 3023 + lcn) { 3024 + bio->bi_end_io = ntfs_bio_end_io; 3025 + submit_bio(bio); 3026 + bio = NULL; 3027 + } 3028 + } 3029 + 3030 + if (!bio) { 3031 + unsigned int off; 3032 + 3033 + off = ((mft_no << vol->mft_record_size_bits) + 3034 + mft_record_off) & vol->cluster_size_mask; 3035 + 3036 + bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, 3037 + GFP_NOIO); 3038 + bio->bi_iter.bi_sector = 3039 + ntfs_bytes_to_sector(vol, 3040 + ntfs_cluster_to_bytes(vol, lcn) + off); 3041 + } 3042 + 3043 + if (vol->cluster_size == NTFS_BLOCK_SIZE && 3044 + (mft_record_off || 3045 + rl->length - (vcn_off - rl->vcn) == 1 || 3046 + mft_ofs + NTFS_BLOCK_SIZE >= PAGE_SIZE)) 3047 + folio_sz = NTFS_BLOCK_SIZE; 3048 + else 3049 + folio_sz = vol->mft_record_size; 3050 + if (!bio_add_folio(bio, folio, folio_sz, 3051 + mft_ofs + mft_record_off)) { 3052 + err = -EIO; 3053 + bio_put(bio); 3054 + goto unm_done; 3055 + } 3056 + mft_record_off += folio_sz; 3057 + 3058 + if (mft_record_off != vol->mft_record_size) { 3059 + vcn_off++; 3060 + goto flush_bio; 3061 + } 3062 + prev_mft_ofs = mft_ofs; 3063 + 3064 + if (mft_no < vol->mftmirr_size) 3065 + ntfs_sync_mft_mirror(vol, mft_no, 3066 + (struct mft_record *)(kaddr + mft_ofs)); 3067 + } else if (ref_inos[nr_ref_inos]) 3068 + nr_ref_inos++; 3069 + } 3070 + 3071 + if (bio) { 3072 + 
bio->bi_end_io = ntfs_bio_end_io; 3073 + submit_bio(bio); 3074 + } 3075 + unm_done: 3076 + folio_mark_uptodate(folio); 3077 + kunmap_local(kaddr); 3078 + 3079 + folio_start_writeback(folio); 3080 + folio_unlock(folio); 3081 + folio_end_writeback(folio); 3082 + 3083 + /* Unlock any locked inodes. */ 3084 + while (nr_locked_nis-- > 0) { 3085 + struct ntfs_inode *base_tni; 3086 + 3087 + tni = locked_nis[nr_locked_nis]; 3088 + mutex_unlock(&tni->mrec_lock); 3089 + 3090 + /* Get the base inode. */ 3091 + mutex_lock(&tni->extent_lock); 3092 + if (tni->nr_extents >= 0) 3093 + base_tni = tni; 3094 + else 3095 + base_tni = tni->ext.base_ntfs_ino; 3096 + mutex_unlock(&tni->extent_lock); 3097 + ntfs_debug("Unlocking %s inode 0x%lx.", 3098 + tni == base_tni ? "base" : "extent", 3099 + tni->mft_no); 3100 + atomic_dec(&tni->count); 3101 + iput(VFS_I(base_tni)); 3102 + } 3103 + 3104 + /* Dropping deferred references */ 3105 + while (nr_ref_inos-- > 0) { 3106 + if (ref_inos[nr_ref_inos]) 3107 + iput(ref_inos[nr_ref_inos]); 3108 + } 3109 + 3110 + if (unlikely(err && err != -ENOMEM)) 3111 + NVolSetErrors(vol); 3112 + if (likely(!err)) 3113 + ntfs_debug("Done."); 3114 + return err; 3115 + } 3116 + 3117 + /* 3118 + * ntfs_mft_writepages - Write back dirty folios for the $MFT inode 3119 + * @mapping: address space of the $MFT inode 3120 + * @wbc: writeback control 3121 + * 3122 + * Writeback iterator for MFT records. Iterates over dirty folios and 3123 + * delegates actual writing to ntfs_write_mft_block() for each folio. 3124 + * Called from the address_space_operations .writepages vector of the 3125 + * $MFT inode. 3126 + * 3127 + * Returns 0 on success, or the first error encountered. 
3128 + */ 3129 + int ntfs_mft_writepages(struct address_space *mapping, 3130 + struct writeback_control *wbc) 3131 + { 3132 + struct folio *folio = NULL; 3133 + int error; 3134 + 3135 + if (NVolShutdown(NTFS_I(mapping->host)->vol)) 3136 + return -EIO; 3137 + 3138 + while ((folio = writeback_iter(mapping, wbc, folio, &error))) 3139 + error = ntfs_write_mft_block(folio, wbc); 3140 + return error; 3141 + } 3142 + 3143 + void ntfs_mft_mark_dirty(struct folio *folio) 3144 + { 3145 + iomap_dirty_folio(folio->mapping, folio); 3146 + }
+35 -30
fs/ntfs/mst.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 /* 3 - * mst.c - NTFS multi sector transfer protection handling code. Part of the 4 - * Linux-NTFS project. 3 + * NTFS multi sector transfer protection handling code. 5 4 * 6 5 * Copyright (c) 2001-2004 Anton Altaparmakov 7 6 */ 8 7 8 + #include <linux/ratelimit.h> 9 + 9 10 #include "ntfs.h" 10 11 11 - /** 12 + /* 12 13 * post_read_mst_fixup - deprotect multi sector transfer protected data 13 14 * @b: pointer to the data to deprotect 14 15 * @size: size in bytes of @b ··· 26 25 * be fixed up. Thus, we return success and not failure in this case. This is 27 26 * in contrast to pre_write_mst_fixup(), see below. 28 27 */ 29 - int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) 28 + int post_read_mst_fixup(struct ntfs_record *b, const u32 size) 30 29 { 31 30 u16 usa_ofs, usa_count, usn; 32 31 u16 *usa_pos, *data_pos; ··· 36 35 /* Decrement usa_count to get number of fixups. */ 37 36 usa_count = le16_to_cpu(b->usa_count) - 1; 38 37 /* Size and alignment checks. */ 39 - if ( size & (NTFS_BLOCK_SIZE - 1) || 40 - usa_ofs & 1 || 41 - usa_ofs + (usa_count * 2) > size || 42 - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) 38 + if (size & (NTFS_BLOCK_SIZE - 1) || usa_ofs & 1 || 39 + usa_ofs + (usa_count * 2) > size || 40 + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) 43 41 return 0; 44 42 /* Position of usn in update sequence array. */ 45 - usa_pos = (u16*)b + usa_ofs/sizeof(u16); 43 + usa_pos = (u16 *)b + usa_ofs/sizeof(u16); 46 44 /* 47 45 * The update sequence number which has to be equal to each of the 48 46 * u16 values before they are fixed up. Note no need to care for ··· 53 53 /* 54 54 * Position in protected data of first u16 that needs fixing up. 55 55 */ 56 - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; 56 + data_pos = (u16 *)b + NTFS_BLOCK_SIZE / sizeof(u16) - 1; 57 57 /* 58 58 * Check for incomplete multi sector transfer(s). 
59 59 */ 60 60 while (usa_count--) { 61 61 if (*data_pos != usn) { 62 + struct mft_record *m = (struct mft_record *)b; 63 + 64 + pr_err_ratelimited("ntfs: Incomplete multi sector transfer detected! (Record magic : 0x%x, mft number : 0x%x, base mft number : 0x%lx, mft in use : %d, data : 0x%x, usn 0x%x)\n", 65 + le32_to_cpu(m->magic), le32_to_cpu(m->mft_record_number), 66 + MREF_LE(m->base_mft_record), m->flags & MFT_RECORD_IN_USE, 67 + *data_pos, usn); 62 68 /* 63 69 * Incomplete multi sector transfer detected! )-: 64 70 * Set the magic to "BAAD" and return failure. ··· 73 67 b->magic = magic_BAAD; 74 68 return -EINVAL; 75 69 } 76 - data_pos += NTFS_BLOCK_SIZE/sizeof(u16); 70 + data_pos += NTFS_BLOCK_SIZE / sizeof(u16); 77 71 } 78 72 /* Re-setup the variables. */ 79 73 usa_count = le16_to_cpu(b->usa_count) - 1; 80 - data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; 74 + data_pos = (u16 *)b + NTFS_BLOCK_SIZE / sizeof(u16) - 1; 81 75 /* Fixup all sectors. */ 82 76 while (usa_count--) { 83 77 /* ··· 91 85 return 0; 92 86 } 93 87 94 - /** 88 + /* 95 89 * pre_write_mst_fixup - apply multi sector transfer protection 96 90 * @b: pointer to the data to protect 97 91 * @size: size in bytes of @b ··· 112 106 * otherwise a random word will be used (whatever was in the record at that 113 107 * position at that time). 114 108 */ 115 - int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) 109 + int pre_write_mst_fixup(struct ntfs_record *b, const u32 size) 116 110 { 117 - le16 *usa_pos, *data_pos; 111 + __le16 *usa_pos, *data_pos; 118 112 u16 usa_ofs, usa_count, usn; 119 - le16 le_usn; 113 + __le16 le_usn; 120 114 121 115 /* Sanity check + only fixup if it makes sense. */ 122 116 if (!b || ntfs_is_baad_record(b->magic) || 123 - ntfs_is_hole_record(b->magic)) 117 + ntfs_is_hole_record(b->magic)) 124 118 return -EINVAL; 125 119 /* Setup the variables. */ 126 120 usa_ofs = le16_to_cpu(b->usa_ofs); 127 121 /* Decrement usa_count to get number of fixups. 
*/ 128 122 usa_count = le16_to_cpu(b->usa_count) - 1; 129 123 /* Size and alignment checks. */ 130 - if ( size & (NTFS_BLOCK_SIZE - 1) || 131 - usa_ofs & 1 || 132 - usa_ofs + (usa_count * 2) > size || 133 - (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) 124 + if (size & (NTFS_BLOCK_SIZE - 1) || usa_ofs & 1 || 125 + usa_ofs + (usa_count * 2) > size || 126 + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) 134 127 return -EINVAL; 135 128 /* Position of usn in update sequence array. */ 136 - usa_pos = (le16*)((u8*)b + usa_ofs); 129 + usa_pos = (__le16 *)((u8 *)b + usa_ofs); 137 130 /* 138 131 * Cyclically increment the update sequence number 139 132 * (skipping 0 and -1, i.e. 0xffff). ··· 143 138 le_usn = cpu_to_le16(usn); 144 139 *usa_pos = le_usn; 145 140 /* Position in data of first u16 that needs fixing up. */ 146 - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; 141 + data_pos = (__le16 *)b + NTFS_BLOCK_SIZE/sizeof(__le16) - 1; 147 142 /* Fixup all sectors. */ 148 143 while (usa_count--) { 149 144 /* ··· 154 149 /* Apply fixup to data. */ 155 150 *data_pos = le_usn; 156 151 /* Increment position in data as well. */ 157 - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); 152 + data_pos += NTFS_BLOCK_SIZE / sizeof(__le16); 158 153 } 159 154 return 0; 160 155 } 161 156 162 - /** 157 + /* 163 158 * post_write_mst_fixup - fast deprotect multi sector transfer protected data 164 159 * @b: pointer to the data to deprotect 165 160 * ··· 167 162 * for any errors, because we assume we have just used pre_write_mst_fixup(), 168 163 * thus the data will be fine or we would never have gotten here. 169 164 */ 170 - void post_write_mst_fixup(NTFS_RECORD *b) 165 + void post_write_mst_fixup(struct ntfs_record *b) 171 166 { 172 - le16 *usa_pos, *data_pos; 167 + __le16 *usa_pos, *data_pos; 173 168 174 169 u16 usa_ofs = le16_to_cpu(b->usa_ofs); 175 170 u16 usa_count = le16_to_cpu(b->usa_count) - 1; 176 171 177 172 /* Position of usn in update sequence array. 
*/ 178 - usa_pos = (le16*)b + usa_ofs/sizeof(le16); 173 + usa_pos = (__le16 *)b + usa_ofs/sizeof(__le16); 179 174 180 175 /* Position in protected data of first u16 that needs fixing up. */ 181 - data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; 176 + data_pos = (__le16 *)b + NTFS_BLOCK_SIZE/sizeof(__le16) - 1; 182 177 183 178 /* Fixup all sectors. */ 184 179 while (usa_count--) { ··· 189 184 *data_pos = *(++usa_pos); 190 185 191 186 /* Increment position in data as well. */ 192 - data_pos += NTFS_BLOCK_SIZE/sizeof(le16); 187 + data_pos += NTFS_BLOCK_SIZE/sizeof(__le16); 193 188 } 194 189 }