Merge branch 'akpm' (patches from Andrew)

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:

- more ocfs2 work

- various leftovers

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
memory_hotplug: cond_resched in __remove_pages
bfs: add sanity check at bfs_fill_super()
kernel/sysctl.c: remove duplicated include
kernel/kexec_file.c: remove some duplicated includes
mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask
ocfs2: fix clusters leak in ocfs2_defrag_extent()
ocfs2: dlmglue: clean up timestamp handling
ocfs2: don't put and assigning null to bh allocated outside
ocfs2: fix a misuse of brelse after failing ocfs2_check_dir_entry
ocfs2: don't use iocb when EIOCBQUEUED returns
ocfs2: without quota support, avoid calling quota recovery
ocfs2: remove ocfs2_is_o2cb_active()
mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings
include/linux/notifier.h: SRCU: fix ctags
mm: handle no memcg case in memcg_kmem_charge() properly

Linus Torvalds 7 years ago cddfa11a 5f215853

+172 -124

19 changed files

expand all collapse all

bfs

inode.c

ocfs2

buffer_head_io.c

dir.c

dlmglue.c

file.c

journal.c

move_extents.c

stackglue.c

stackglue.h

include

linux

gfp.h

mempolicy.h

notifier.h

kernel

kexec_file.c

sysctl.c

huge_memory.c

memcontrol.c

memory_hotplug.c

mempolicy.c

shmem.c

+6 -3

fs/bfs/inode.c

reviewed

··· 350 350 351 351 s->s_magic = BFS_MAGIC; 352 352 353 353 - if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { 353 353 + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || 354 354 + le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { 354 355 printf("Superblock is corrupted\n"); 355 356 goto out1; 356 357 } ··· 360 359 sizeof(struct bfs_inode) 361 360 + BFS_ROOT_INO - 1; 362 361 imap_len = (info->si_lasti / 8) + 1; 363 363 - info->si_imap = kzalloc(imap_len, GFP_KERNEL); 364 364 - if (!info->si_imap) 362 362 + info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); 363 363 + if (!info->si_imap) { 364 364 + printf("Cannot allocate %u bytes\n", imap_len); 365 365 goto out1; 366 366 + } 366 367 for (i = 0; i < BFS_ROOT_INO; i++) 367 368 set_bit(i, info->si_imap); 368 369

+59 -18

fs/ocfs2/buffer_head_io.c

reviewed

··· 99 99 return ret; 100 100 } 101 101 102 102 + /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it 103 103 + * will be easier to handle read failure. 104 104 + */ 102 105 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 103 106 unsigned int nr, struct buffer_head *bhs[]) 104 107 { 105 108 int status = 0; 106 109 unsigned int i; 107 110 struct buffer_head *bh; 111 111 + int new_bh = 0; 108 112 109 113 trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); 110 114 111 115 if (!nr) 112 116 goto bail; 117 117 + 118 118 + /* Don't put buffer head and re-assign it to NULL if it is allocated 119 119 + * outside since the caller can't be aware of this alternation! 120 120 + */ 121 121 + new_bh = (bhs[0] == NULL); 113 122 114 123 for (i = 0 ; i < nr ; i++) { 115 124 if (bhs[i] == NULL) { ··· 126 117 if (bhs[i] == NULL) { 127 118 status = -ENOMEM; 128 119 mlog_errno(status); 129 129 - goto bail; 120 120 + break; 130 121 } 131 122 } 132 123 bh = bhs[i]; ··· 167 158 submit_bh(REQ_OP_READ, 0, bh); 168 159 } 169 160 161 161 + read_failure: 170 162 for (i = nr; i > 0; i--) { 171 163 bh = bhs[i - 1]; 164 164 + 165 165 + if (unlikely(status)) { 166 166 + if (new_bh && bh) { 167 167 + /* If middle bh fails, let previous bh 168 168 + * finish its read and then put it to 169 169 + * aovoid bh leak 170 170 + */ 171 171 + if (!buffer_jbd(bh)) 172 172 + wait_on_buffer(bh); 173 173 + put_bh(bh); 174 174 + bhs[i - 1] = NULL; 175 175 + } else if (bh && buffer_uptodate(bh)) { 176 176 + clear_buffer_uptodate(bh); 177 177 + } 178 178 + continue; 179 179 + } 172 180 173 181 /* No need to wait on the buffer if it's managed by JBD. */ 174 182 if (!buffer_jbd(bh)) ··· 196 170 * so we can safely record this and loop back 197 171 * to cleanup the other buffers. 
*/ 198 172 status = -EIO; 199 199 - put_bh(bh); 200 200 - bhs[i - 1] = NULL; 173 173 + goto read_failure; 201 174 } 202 175 } 203 176 ··· 204 179 return status; 205 180 } 206 181 182 182 + /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it 183 183 + * will be easier to handle read failure. 184 184 + */ 207 185 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, 208 186 struct buffer_head *bhs[], int flags, 209 187 int (*validate)(struct super_block *sb, ··· 216 188 int i, ignore_cache = 0; 217 189 struct buffer_head *bh; 218 190 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 191 191 + int new_bh = 0; 219 192 220 193 trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); 221 194 ··· 242 213 goto bail; 243 214 } 244 215 216 216 + /* Don't put buffer head and re-assign it to NULL if it is allocated 217 217 + * outside since the caller can't be aware of this alternation! 218 218 + */ 219 219 + new_bh = (bhs[0] == NULL); 220 220 + 245 221 ocfs2_metadata_cache_io_lock(ci); 246 222 for (i = 0 ; i < nr ; i++) { 247 223 if (bhs[i] == NULL) { ··· 255 221 ocfs2_metadata_cache_io_unlock(ci); 256 222 status = -ENOMEM; 257 223 mlog_errno(status); 258 258 - goto bail; 224 224 + /* Don't forget to put previous bh! 
*/ 225 225 + break; 259 226 } 260 227 } 261 228 bh = bhs[i]; ··· 351 316 } 352 317 } 353 318 354 354 - status = 0; 355 355 - 319 319 + read_failure: 356 320 for (i = (nr - 1); i >= 0; i--) { 357 321 bh = bhs[i]; 358 322 359 323 if (!(flags & OCFS2_BH_READAHEAD)) { 360 360 - if (status) { 361 361 - /* Clear the rest of the buffers on error */ 362 362 - put_bh(bh); 363 363 - bhs[i] = NULL; 324 324 + if (unlikely(status)) { 325 325 + /* Clear the buffers on error including those 326 326 + * ever succeeded in reading 327 327 + */ 328 328 + if (new_bh && bh) { 329 329 + /* If middle bh fails, let previous bh 330 330 + * finish its read and then put it to 331 331 + * aovoid bh leak 332 332 + */ 333 333 + if (!buffer_jbd(bh)) 334 334 + wait_on_buffer(bh); 335 335 + put_bh(bh); 336 336 + bhs[i] = NULL; 337 337 + } else if (bh && buffer_uptodate(bh)) { 338 338 + clear_buffer_uptodate(bh); 339 339 + } 364 340 continue; 365 341 } 366 342 /* We know this can't have changed as we hold the ··· 389 343 * uptodate. */ 390 344 status = -EIO; 391 345 clear_buffer_needs_validate(bh); 392 392 - put_bh(bh); 393 393 - bhs[i] = NULL; 394 394 - continue; 346 346 + goto read_failure; 395 347 } 396 348 397 349 if (buffer_needs_validate(bh)) { ··· 399 355 BUG_ON(buffer_jbd(bh)); 400 356 clear_buffer_needs_validate(bh); 401 357 status = validate(sb, bh); 402 402 - if (status) { 403 403 - put_bh(bh); 404 404 - bhs[i] = NULL; 405 405 - continue; 406 406 - } 358 358 + if (status) 359 359 + goto read_failure; 407 360 } 408 361 } 409 362

+1 -2

fs/ocfs2/dir.c

reviewed

··· 1897 1897 /* On error, skip the f_pos to the 1898 1898 next block. */ 1899 1899 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; 1900 1900 - brelse(bh); 1901 1901 - continue; 1900 1900 + break; 1902 1901 } 1903 1902 if (le64_to_cpu(de->inode)) { 1904 1903 unsigned char d_type = DT_UNKNOWN;

+10 -18

fs/ocfs2/dlmglue.c

reviewed

··· 2123 2123 2124 2124 /* LVB only has room for 64 bits of time here so we pack it for 2125 2125 * now. */ 2126 2126 - static u64 ocfs2_pack_timespec(struct timespec *spec) 2126 2126 + static u64 ocfs2_pack_timespec(struct timespec64 *spec) 2127 2127 { 2128 2128 u64 res; 2129 2129 - u64 sec = spec->tv_sec; 2129 2129 + u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull); 2130 2130 u32 nsec = spec->tv_nsec; 2131 2131 2132 2132 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); ··· 2142 2142 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2143 2143 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2144 2144 struct ocfs2_meta_lvb *lvb; 2145 2145 - struct timespec ts; 2146 2145 2147 2146 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2148 2147 ··· 2162 2163 lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); 2163 2164 lvb->lvb_imode = cpu_to_be16(inode->i_mode); 2164 2165 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); 2165 2165 - ts = timespec64_to_timespec(inode->i_atime); 2166 2166 lvb->lvb_iatime_packed = 2167 2167 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2168 2168 - ts = timespec64_to_timespec(inode->i_ctime); 2167 2167 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); 2169 2168 lvb->lvb_ictime_packed = 2170 2170 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2171 2171 - ts = timespec64_to_timespec(inode->i_mtime); 2169 2169 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); 2172 2170 lvb->lvb_imtime_packed = 2173 2173 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2171 2171 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); 2174 2172 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); 2175 2173 lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); 2176 2174 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); ··· 2176 2180 mlog_meta_lvb(0, lockres); 2177 2181 } 2178 2182 2179 2179 - static void ocfs2_unpack_timespec(struct timespec *spec, 2183 2183 + static void ocfs2_unpack_timespec(struct timespec64 *spec, 2180 2184 u64 packed_time) 2181 2185 { 2182 2186 
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; ··· 2185 2189 2186 2190 static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 2187 2191 { 2188 2188 - struct timespec ts; 2189 2192 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2190 2193 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2191 2194 struct ocfs2_meta_lvb *lvb; ··· 2212 2217 i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); 2213 2218 inode->i_mode = be16_to_cpu(lvb->lvb_imode); 2214 2219 set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); 2215 2215 - ocfs2_unpack_timespec(&ts, 2220 2220 + ocfs2_unpack_timespec(&inode->i_atime, 2216 2221 be64_to_cpu(lvb->lvb_iatime_packed)); 2217 2217 - inode->i_atime = timespec_to_timespec64(ts); 2218 2218 - ocfs2_unpack_timespec(&ts, 2222 2222 + ocfs2_unpack_timespec(&inode->i_mtime, 2219 2223 be64_to_cpu(lvb->lvb_imtime_packed)); 2220 2220 - inode->i_mtime = timespec_to_timespec64(ts); 2221 2221 - ocfs2_unpack_timespec(&ts, 2224 2224 + ocfs2_unpack_timespec(&inode->i_ctime, 2222 2225 be64_to_cpu(lvb->lvb_ictime_packed)); 2223 2223 - inode->i_ctime = timespec_to_timespec64(ts); 2224 2226 spin_unlock(&oi->ip_lock); 2225 2227 } 2226 2228 ··· 3595 3603 * we can recover correctly from node failure. Otherwise, we may get 3596 3604 * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. 3597 3605 */ 3598 3598 - if (!ocfs2_is_o2cb_active() && 3606 3606 + if (ocfs2_userspace_stack(osb) && 3599 3607 lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 3600 3608 lvb = 1; 3601 3609

+2 -2

fs/ocfs2/file.c

reviewed

··· 2343 2343 2344 2344 written = __generic_file_write_iter(iocb, from); 2345 2345 /* buffered aio wouldn't have proper lock coverage today */ 2346 2346 - BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2346 2346 + BUG_ON(written == -EIOCBQUEUED && !direct_io); 2347 2347 2348 2348 /* 2349 2349 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io ··· 2463 2463 trace_generic_file_read_iter_ret(ret); 2464 2464 2465 2465 /* buffered aio wouldn't have proper lock coverage today */ 2466 2466 - BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2466 2466 + BUG_ON(ret == -EIOCBQUEUED && !direct_io); 2467 2467 2468 2468 /* see ocfs2_file_write_iter */ 2469 2469 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {

+34 -17

fs/ocfs2/journal.c

reviewed

··· 1378 1378 int rm_quota_used = 0, i; 1379 1379 struct ocfs2_quota_recovery *qrec; 1380 1380 1381 1381 + /* Whether the quota supported. */ 1382 1382 + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, 1383 1383 + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) 1384 1384 + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, 1385 1385 + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); 1386 1386 + 1381 1387 status = ocfs2_wait_on_mount(osb); 1382 1388 if (status < 0) { 1383 1389 goto bail; 1384 1390 } 1385 1391 1386 1386 - rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); 1387 1387 - if (!rm_quota) { 1388 1388 - status = -ENOMEM; 1389 1389 - goto bail; 1392 1392 + if (quota_enabled) { 1393 1393 + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); 1394 1394 + if (!rm_quota) { 1395 1395 + status = -ENOMEM; 1396 1396 + goto bail; 1397 1397 + } 1390 1398 } 1391 1399 restart: 1392 1400 status = ocfs2_super_lock(osb, 1); ··· 1430 1422 * then quota usage would be out of sync until some node takes 1431 1423 * the slot. So we remember which nodes need quota recovery 1432 1424 * and when everything else is done, we recover quotas. */ 1433 1433 - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); 1434 1434 - if (i == rm_quota_used) 1435 1435 - rm_quota[rm_quota_used++] = slot_num; 1425 1425 + if (quota_enabled) { 1426 1426 + for (i = 0; i < rm_quota_used 1427 1427 + && rm_quota[i] != slot_num; i++) 1428 1428 + ; 1429 1429 + 1430 1430 + if (i == rm_quota_used) 1431 1431 + rm_quota[rm_quota_used++] = slot_num; 1432 1432 + } 1436 1433 1437 1434 status = ocfs2_recover_node(osb, node_num, slot_num); 1438 1435 skip_recovery: ··· 1465 1452 /* Now it is right time to recover quotas... 
We have to do this under 1466 1453 * superblock lock so that no one can start using the slot (and crash) 1467 1454 * before we recover it */ 1468 1468 - for (i = 0; i < rm_quota_used; i++) { 1469 1469 - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1470 1470 - if (IS_ERR(qrec)) { 1471 1471 - status = PTR_ERR(qrec); 1472 1472 - mlog_errno(status); 1473 1473 - continue; 1455 1455 + if (quota_enabled) { 1456 1456 + for (i = 0; i < rm_quota_used; i++) { 1457 1457 + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1458 1458 + if (IS_ERR(qrec)) { 1459 1459 + status = PTR_ERR(qrec); 1460 1460 + mlog_errno(status); 1461 1461 + continue; 1462 1462 + } 1463 1463 + ocfs2_queue_recovery_completion(osb->journal, 1464 1464 + rm_quota[i], 1465 1465 + NULL, NULL, qrec, 1466 1466 + ORPHAN_NEED_TRUNCATE); 1474 1467 } 1475 1475 - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], 1476 1476 - NULL, NULL, qrec, 1477 1477 - ORPHAN_NEED_TRUNCATE); 1478 1468 } 1479 1469 1480 1470 ocfs2_super_unlock(osb, 1); ··· 1499 1483 1500 1484 mutex_unlock(&osb->recovery_lock); 1501 1485 1502 1502 - kfree(rm_quota); 1486 1486 + if (quota_enabled) 1487 1487 + kfree(rm_quota); 1503 1488 1504 1489 /* no one is callint kthread_stop() for us so the kthread() api 1505 1490 * requires that we call do_exit(). And it isn't exported, but

+17

fs/ocfs2/move_extents.c

reviewed

··· 25 25 #include "ocfs2_ioctl.h" 26 26 27 27 #include "alloc.h" 28 28 + #include "localalloc.h" 28 29 #include "aops.h" 29 30 #include "dlmglue.h" 30 31 #include "extent_map.h" ··· 234 233 struct ocfs2_refcount_tree *ref_tree = NULL; 235 234 u32 new_phys_cpos, new_len; 236 235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 236 236 + int need_free = 0; 237 237 238 238 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 239 239 BUG_ON(!ocfs2_is_refcount_inode(inode)); ··· 310 308 if (!partial) { 311 309 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 312 310 ret = -ENOSPC; 311 311 + need_free = 1; 313 312 goto out_commit; 314 313 } 315 314 } ··· 335 332 mlog_errno(ret); 336 333 337 334 out_commit: 335 335 + if (need_free && context->data_ac) { 336 336 + struct ocfs2_alloc_context *data_ac = context->data_ac; 337 337 + 338 338 + if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) 339 339 + ocfs2_free_local_alloc_bits(osb, handle, data_ac, 340 340 + new_phys_cpos, new_len); 341 341 + else 342 342 + ocfs2_free_clusters(handle, 343 343 + data_ac->ac_inode, 344 344 + data_ac->ac_bh, 345 345 + ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), 346 346 + new_len); 347 347 + } 348 348 + 338 349 ocfs2_commit_trans(osb, handle); 339 350 340 351 out_unlock_mutex:

-6

fs/ocfs2/stackglue.c

reviewed

··· 48 48 */ 49 49 static struct ocfs2_stack_plugin *active_stack; 50 50 51 51 - inline int ocfs2_is_o2cb_active(void) 52 52 - { 53 53 - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); 54 54 - } 55 55 - EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); 56 56 - 57 51 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) 58 52 { 59 53 struct ocfs2_stack_plugin *p;

-3

fs/ocfs2/stackglue.h

reviewed

··· 298 298 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); 299 299 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); 300 300 301 301 - /* In ocfs2_downconvert_lock(), we need to know which stack we are using */ 302 302 - int ocfs2_is_o2cb_active(void); 303 303 - 304 301 extern struct kset *ocfs2_kset; 305 302 306 303 #endif /* STACKGLUE_H */

+4 -8

include/linux/gfp.h

reviewed

··· 510 510 } 511 511 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, 512 512 struct vm_area_struct *vma, unsigned long addr, 513 513 - int node, bool hugepage); 514 514 - #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 515 515 - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) 513 513 + int node); 516 514 #else 517 515 #define alloc_pages(gfp_mask, order) \ 518 516 alloc_pages_node(numa_node_id(), gfp_mask, order) 519 519 - #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ 520 520 - alloc_pages(gfp_mask, order) 521 521 - #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 517 517 + #define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ 522 518 alloc_pages(gfp_mask, order) 523 519 #endif 524 520 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 525 521 #define alloc_page_vma(gfp_mask, vma, addr) \ 526 526 - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) 522 522 + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) 527 523 #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 528 528 - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) 524 524 + alloc_pages_vma(gfp_mask, 0, vma, addr, node) 529 525 530 526 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 531 527 extern unsigned long get_zeroed_page(gfp_t gfp_mask);

include/linux/mempolicy.h

reviewed

··· 139 139 struct mempolicy *get_task_policy(struct task_struct *p); 140 140 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 141 141 unsigned long addr); 142 142 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 143 143 + unsigned long addr); 142 144 bool vma_policy_mof(struct vm_area_struct *vma); 143 145 144 146 extern void numa_default_policy(void);

+1 -2

include/linux/notifier.h

reviewed

··· 122 122 123 123 #ifdef CONFIG_TREE_SRCU 124 124 #define _SRCU_NOTIFIER_HEAD(name, mod) \ 125 125 - static DEFINE_PER_CPU(struct srcu_data, \ 126 126 - name##_head_srcu_data); \ 125 125 + static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ 127 126 mod struct srcu_notifier_head name = \ 128 127 SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) 129 128

-2

kernel/kexec_file.c

reviewed

··· 25 25 #include <linux/elf.h> 26 26 #include <linux/elfcore.h> 27 27 #include <linux/kernel.h> 28 28 - #include <linux/kexec.h> 29 29 - #include <linux/slab.h> 30 28 #include <linux/syscalls.h> 31 29 #include <linux/vmalloc.h> 32 30 #include "kexec_internal.h"

-1

kernel/sysctl.c

reviewed

··· 66 66 #include <linux/kexec.h> 67 67 #include <linux/bpf.h> 68 68 #include <linux/mount.h> 69 69 - #include <linux/pipe_fs_i.h> 70 69 71 70 #include <linux/uaccess.h> 72 71 #include <asm/processor.h>

+29 -9

mm/huge_memory.c

reviewed

··· 629 629 * available 630 630 * never: never stall for any thp allocation 631 631 */ 632 632 - static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) 632 632 + static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) 633 633 { 634 634 const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); 635 635 + gfp_t this_node = 0; 636 636 + 637 637 + #ifdef CONFIG_NUMA 638 638 + struct mempolicy *pol; 639 639 + /* 640 640 + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not 641 641 + * specified, to express a general desire to stay on the current 642 642 + * node for optimistic allocation attempts. If the defrag mode 643 643 + * and/or madvise hint requires the direct reclaim then we prefer 644 644 + * to fallback to other node rather than node reclaim because that 645 645 + * can lead to excessive reclaim even though there is free memory 646 646 + * on other nodes. We expect that NUMA preferences are specified 647 647 + * by memory policies. 648 648 + */ 649 649 + pol = get_vma_policy(vma, addr); 650 650 + if (pol->mode != MPOL_BIND) 651 651 + this_node = __GFP_THISNODE; 652 652 + mpol_cond_put(pol); 653 653 + #endif 635 654 636 655 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 637 656 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 638 657 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 639 639 - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 658 658 + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; 640 659 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 641 660 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 642 642 - __GFP_KSWAPD_RECLAIM); 661 661 + __GFP_KSWAPD_RECLAIM | this_node); 643 662 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 644 663 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 645 645 - 0); 646 646 - return GFP_TRANSHUGE_LIGHT; 664 664 + this_node); 665 665 + return GFP_TRANSHUGE_LIGHT | this_node; 647 666 } 648 667 649 668 /* Caller must hold page table lock. */ ··· 734 715 pte_free(vma->vm_mm, pgtable); 735 716 return ret; 736 717 } 737 737 - gfp = alloc_hugepage_direct_gfpmask(vma); 738 738 - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 718 718 + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 719 719 + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); 739 720 if (unlikely(!page)) { 740 721 count_vm_event(THP_FAULT_FALLBACK); 741 722 return VM_FAULT_FALLBACK; ··· 1305 1286 alloc: 1306 1287 if (transparent_hugepage_enabled(vma) && 1307 1288 !transparent_hugepage_debug_cow()) { 1308 1308 - huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1309 1309 - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1289 1289 + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 1290 1290 + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, 1291 1291 + haddr, numa_node_id()); 1310 1292 } else 1311 1293 new_page = NULL; 1312 1294

+1 -1

mm/memcontrol.c

reviewed

··· 2593 2593 struct mem_cgroup *memcg; 2594 2594 int ret = 0; 2595 2595 2596 2596 - if (memcg_kmem_bypass()) 2596 2596 + if (mem_cgroup_disabled() || memcg_kmem_bypass()) 2597 2597 return 0; 2598 2598 2599 2599 memcg = get_mem_cgroup_from_current();

mm/memory_hotplug.c

reviewed

··· 586 586 for (i = 0; i < sections_to_remove; i++) { 587 587 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 588 588 589 589 + cond_resched(); 589 590 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, 590 591 altmap); 591 592 map_offset = 0;

+4 -31

mm/mempolicy.c

reviewed

··· 1116 1116 } else if (PageTransHuge(page)) { 1117 1117 struct page *thp; 1118 1118 1119 1119 - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, 1120 1120 - HPAGE_PMD_ORDER); 1119 1119 + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, 1120 1120 + address, numa_node_id()); 1121 1121 if (!thp) 1122 1122 return NULL; 1123 1123 prep_transhuge_page(thp); ··· 1662 1662 * freeing by another task. It is the caller's responsibility to free the 1663 1663 * extra reference for shared policies. 1664 1664 */ 1665 1665 - static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1665 1665 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1666 1666 unsigned long addr) 1667 1667 { 1668 1668 struct mempolicy *pol = __get_vma_policy(vma, addr); ··· 2011 2011 * @vma: Pointer to VMA or NULL if not available. 2012 2012 * @addr: Virtual Address of the allocation. Must be inside the VMA. 2013 2013 * @node: Which node to prefer for allocation (modulo policy). 2014 2014 - * @hugepage: for hugepages try only the preferred node if possible 2015 2014 * 2016 2015 * This function allocates a page from the kernel page pool and applies 2017 2016 * a NUMA policy associated with the VMA or the current process. 
··· 2021 2022 */ 2022 2023 struct page * 2023 2024 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 2024 2024 - unsigned long addr, int node, bool hugepage) 2025 2025 + unsigned long addr, int node) 2025 2026 { 2026 2027 struct mempolicy *pol; 2027 2028 struct page *page; ··· 2037 2038 mpol_cond_put(pol); 2038 2039 page = alloc_page_interleave(gfp, order, nid); 2039 2040 goto out; 2040 2040 - } 2041 2041 - 2042 2042 - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { 2043 2043 - int hpage_node = node; 2044 2044 - 2045 2045 - /* 2046 2046 - * For hugepage allocation and non-interleave policy which 2047 2047 - * allows the current node (or other explicitly preferred 2048 2048 - * node) we only try to allocate from the current/preferred 2049 2049 - * node and don't fall back to other nodes, as the cost of 2050 2050 - * remote accesses would likely offset THP benefits. 2051 2051 - * 2052 2052 - * If the policy is interleave, or does not allow the current 2053 2053 - * node in its nodemask, we allocate the standard way. 2054 2054 - */ 2055 2055 - if (pol->mode == MPOL_PREFERRED && 2056 2056 - !(pol->flags & MPOL_F_LOCAL)) 2057 2057 - hpage_node = pol->v.preferred_node; 2058 2058 - 2059 2059 - nmask = policy_nodemask(gfp, pol); 2060 2060 - if (!nmask || node_isset(hpage_node, *nmask)) { 2061 2061 - mpol_cond_put(pol); 2062 2062 - page = __alloc_pages_node(hpage_node, 2063 2063 - gfp | __GFP_THISNODE, order); 2064 2064 - goto out; 2065 2065 - } 2066 2041 } 2067 2042 2068 2043 nmask = policy_nodemask(gfp, pol);

+1 -1

mm/shmem.c

reviewed

··· 1435 1435 1436 1436 shmem_pseudo_vma_init(&pvma, info, hindex); 1437 1437 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1438 1438 - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); 1438 1438 + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); 1439 1439 shmem_pseudo_vma_destroy(&pvma); 1440 1440 if (page) 1441 1441 prep_transhuge_page(page);