Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:

- more ocfs2 work

- various leftovers

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
memory_hotplug: cond_resched in __remove_pages
bfs: add sanity check at bfs_fill_super()
kernel/sysctl.c: remove duplicated include
kernel/kexec_file.c: remove some duplicated includes
mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask
ocfs2: fix clusters leak in ocfs2_defrag_extent()
ocfs2: dlmglue: clean up timestamp handling
ocfs2: don't put and assigning null to bh allocated outside
ocfs2: fix a misuse of brelse after failing ocfs2_check_dir_entry
ocfs2: don't use iocb when EIOCBQUEUED returns
ocfs2: without quota support, avoid calling quota recovery
ocfs2: remove ocfs2_is_o2cb_active()
mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings
include/linux/notifier.h: SRCU: fix ctags
mm: handle no memcg case in memcg_kmem_charge() properly

+172 -124
+6 -3
fs/bfs/inode.c
··· 350 350 351 351 s->s_magic = BFS_MAGIC; 352 352 353 - if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { 353 + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || 354 + le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { 354 355 printf("Superblock is corrupted\n"); 355 356 goto out1; 356 357 } ··· 360 359 sizeof(struct bfs_inode) 361 360 + BFS_ROOT_INO - 1; 362 361 imap_len = (info->si_lasti / 8) + 1; 363 - info->si_imap = kzalloc(imap_len, GFP_KERNEL); 364 - if (!info->si_imap) 362 + info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); 363 + if (!info->si_imap) { 364 + printf("Cannot allocate %u bytes\n", imap_len); 365 365 goto out1; 366 + } 366 367 for (i = 0; i < BFS_ROOT_INO; i++) 367 368 set_bit(i, info->si_imap); 368 369
+59 -18
fs/ocfs2/buffer_head_io.c
··· 99 99 return ret; 100 100 } 101 101 102 + /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it 103 + * will be easier to handle read failure. 104 + */ 102 105 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 103 106 unsigned int nr, struct buffer_head *bhs[]) 104 107 { 105 108 int status = 0; 106 109 unsigned int i; 107 110 struct buffer_head *bh; 111 + int new_bh = 0; 108 112 109 113 trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); 110 114 111 115 if (!nr) 112 116 goto bail; 117 + 118 + /* Don't put buffer head and re-assign it to NULL if it is allocated 119 + * outside since the caller can't be aware of this alternation! 120 + */ 121 + new_bh = (bhs[0] == NULL); 113 122 114 123 for (i = 0 ; i < nr ; i++) { 115 124 if (bhs[i] == NULL) { ··· 126 117 if (bhs[i] == NULL) { 127 118 status = -ENOMEM; 128 119 mlog_errno(status); 129 - goto bail; 120 + break; 130 121 } 131 122 } 132 123 bh = bhs[i]; ··· 167 158 submit_bh(REQ_OP_READ, 0, bh); 168 159 } 169 160 161 + read_failure: 170 162 for (i = nr; i > 0; i--) { 171 163 bh = bhs[i - 1]; 164 + 165 + if (unlikely(status)) { 166 + if (new_bh && bh) { 167 + /* If middle bh fails, let previous bh 168 + * finish its read and then put it to 169 + * aovoid bh leak 170 + */ 171 + if (!buffer_jbd(bh)) 172 + wait_on_buffer(bh); 173 + put_bh(bh); 174 + bhs[i - 1] = NULL; 175 + } else if (bh && buffer_uptodate(bh)) { 176 + clear_buffer_uptodate(bh); 177 + } 178 + continue; 179 + } 172 180 173 181 /* No need to wait on the buffer if it's managed by JBD. */ 174 182 if (!buffer_jbd(bh)) ··· 196 170 * so we can safely record this and loop back 197 171 * to cleanup the other buffers. */ 198 172 status = -EIO; 199 - put_bh(bh); 200 - bhs[i - 1] = NULL; 173 + goto read_failure; 201 174 } 202 175 } 203 176 ··· 204 179 return status; 205 180 } 206 181 182 + /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it 183 + * will be easier to handle read failure. 
184 + */ 207 185 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, 208 186 struct buffer_head *bhs[], int flags, 209 187 int (*validate)(struct super_block *sb, ··· 216 188 int i, ignore_cache = 0; 217 189 struct buffer_head *bh; 218 190 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 191 + int new_bh = 0; 219 192 220 193 trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); 221 194 ··· 242 213 goto bail; 243 214 } 244 215 216 + /* Don't put buffer head and re-assign it to NULL if it is allocated 217 + * outside since the caller can't be aware of this alternation! 218 + */ 219 + new_bh = (bhs[0] == NULL); 220 + 245 221 ocfs2_metadata_cache_io_lock(ci); 246 222 for (i = 0 ; i < nr ; i++) { 247 223 if (bhs[i] == NULL) { ··· 255 221 ocfs2_metadata_cache_io_unlock(ci); 256 222 status = -ENOMEM; 257 223 mlog_errno(status); 258 - goto bail; 224 + /* Don't forget to put previous bh! */ 225 + break; 259 226 } 260 227 } 261 228 bh = bhs[i]; ··· 351 316 } 352 317 } 353 318 354 - status = 0; 355 - 319 + read_failure: 356 320 for (i = (nr - 1); i >= 0; i--) { 357 321 bh = bhs[i]; 358 322 359 323 if (!(flags & OCFS2_BH_READAHEAD)) { 360 - if (status) { 361 - /* Clear the rest of the buffers on error */ 362 - put_bh(bh); 363 - bhs[i] = NULL; 324 + if (unlikely(status)) { 325 + /* Clear the buffers on error including those 326 + * ever succeeded in reading 327 + */ 328 + if (new_bh && bh) { 329 + /* If middle bh fails, let previous bh 330 + * finish its read and then put it to 331 + * aovoid bh leak 332 + */ 333 + if (!buffer_jbd(bh)) 334 + wait_on_buffer(bh); 335 + put_bh(bh); 336 + bhs[i] = NULL; 337 + } else if (bh && buffer_uptodate(bh)) { 338 + clear_buffer_uptodate(bh); 339 + } 364 340 continue; 365 341 } 366 342 /* We know this can't have changed as we hold the ··· 389 343 * uptodate. 
*/ 390 344 status = -EIO; 391 345 clear_buffer_needs_validate(bh); 392 - put_bh(bh); 393 - bhs[i] = NULL; 394 - continue; 346 + goto read_failure; 395 347 } 396 348 397 349 if (buffer_needs_validate(bh)) { ··· 399 355 BUG_ON(buffer_jbd(bh)); 400 356 clear_buffer_needs_validate(bh); 401 357 status = validate(sb, bh); 402 - if (status) { 403 - put_bh(bh); 404 - bhs[i] = NULL; 405 - continue; 406 - } 358 + if (status) 359 + goto read_failure; 407 360 } 408 361 } 409 362
+1 -2
fs/ocfs2/dir.c
··· 1897 1897 /* On error, skip the f_pos to the 1898 1898 next block. */ 1899 1899 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; 1900 - brelse(bh); 1901 - continue; 1900 + break; 1902 1901 } 1903 1902 if (le64_to_cpu(de->inode)) { 1904 1903 unsigned char d_type = DT_UNKNOWN;
+10 -18
fs/ocfs2/dlmglue.c
··· 2123 2123 2124 2124 /* LVB only has room for 64 bits of time here so we pack it for 2125 2125 * now. */ 2126 - static u64 ocfs2_pack_timespec(struct timespec *spec) 2126 + static u64 ocfs2_pack_timespec(struct timespec64 *spec) 2127 2127 { 2128 2128 u64 res; 2129 - u64 sec = spec->tv_sec; 2129 + u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull); 2130 2130 u32 nsec = spec->tv_nsec; 2131 2131 2132 2132 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); ··· 2142 2142 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2143 2143 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2144 2144 struct ocfs2_meta_lvb *lvb; 2145 - struct timespec ts; 2146 2145 2147 2146 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2148 2147 ··· 2162 2163 lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); 2163 2164 lvb->lvb_imode = cpu_to_be16(inode->i_mode); 2164 2165 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); 2165 - ts = timespec64_to_timespec(inode->i_atime); 2166 2166 lvb->lvb_iatime_packed = 2167 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2168 - ts = timespec64_to_timespec(inode->i_ctime); 2167 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); 2169 2168 lvb->lvb_ictime_packed = 2170 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2171 - ts = timespec64_to_timespec(inode->i_mtime); 2169 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); 2172 2170 lvb->lvb_imtime_packed = 2173 - cpu_to_be64(ocfs2_pack_timespec(&ts)); 2171 + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); 2174 2172 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); 2175 2173 lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); 2176 2174 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); ··· 2176 2180 mlog_meta_lvb(0, lockres); 2177 2181 } 2178 2182 2179 - static void ocfs2_unpack_timespec(struct timespec *spec, 2183 + static void ocfs2_unpack_timespec(struct timespec64 *spec, 2180 2184 u64 packed_time) 2181 2185 { 2182 2186 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; ··· 2185 2189 2186 2190 static void 
ocfs2_refresh_inode_from_lvb(struct inode *inode) 2187 2191 { 2188 - struct timespec ts; 2189 2192 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2190 2193 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2191 2194 struct ocfs2_meta_lvb *lvb; ··· 2212 2217 i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); 2213 2218 inode->i_mode = be16_to_cpu(lvb->lvb_imode); 2214 2219 set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); 2215 - ocfs2_unpack_timespec(&ts, 2220 + ocfs2_unpack_timespec(&inode->i_atime, 2216 2221 be64_to_cpu(lvb->lvb_iatime_packed)); 2217 - inode->i_atime = timespec_to_timespec64(ts); 2218 - ocfs2_unpack_timespec(&ts, 2222 + ocfs2_unpack_timespec(&inode->i_mtime, 2219 2223 be64_to_cpu(lvb->lvb_imtime_packed)); 2220 - inode->i_mtime = timespec_to_timespec64(ts); 2221 - ocfs2_unpack_timespec(&ts, 2224 + ocfs2_unpack_timespec(&inode->i_ctime, 2222 2225 be64_to_cpu(lvb->lvb_ictime_packed)); 2223 - inode->i_ctime = timespec_to_timespec64(ts); 2224 2226 spin_unlock(&oi->ip_lock); 2225 2227 } 2226 2228 ··· 3595 3603 * we can recover correctly from node failure. Otherwise, we may get 3596 3604 * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. 3597 3605 */ 3598 - if (!ocfs2_is_o2cb_active() && 3606 + if (ocfs2_userspace_stack(osb) && 3599 3607 lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 3600 3608 lvb = 1; 3601 3609
+2 -2
fs/ocfs2/file.c
··· 2343 2343 2344 2344 written = __generic_file_write_iter(iocb, from); 2345 2345 /* buffered aio wouldn't have proper lock coverage today */ 2346 - BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2346 + BUG_ON(written == -EIOCBQUEUED && !direct_io); 2347 2347 2348 2348 /* 2349 2349 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io ··· 2463 2463 trace_generic_file_read_iter_ret(ret); 2464 2464 2465 2465 /* buffered aio wouldn't have proper lock coverage today */ 2466 - BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2466 + BUG_ON(ret == -EIOCBQUEUED && !direct_io); 2467 2467 2468 2468 /* see ocfs2_file_write_iter */ 2469 2469 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+34 -17
fs/ocfs2/journal.c
··· 1378 1378 int rm_quota_used = 0, i; 1379 1379 struct ocfs2_quota_recovery *qrec; 1380 1380 1381 + /* Whether the quota supported. */ 1382 + int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, 1383 + OCFS2_FEATURE_RO_COMPAT_USRQUOTA) 1384 + || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, 1385 + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA); 1386 + 1381 1387 status = ocfs2_wait_on_mount(osb); 1382 1388 if (status < 0) { 1383 1389 goto bail; 1384 1390 } 1385 1391 1386 - rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); 1387 - if (!rm_quota) { 1388 - status = -ENOMEM; 1389 - goto bail; 1392 + if (quota_enabled) { 1393 + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); 1394 + if (!rm_quota) { 1395 + status = -ENOMEM; 1396 + goto bail; 1397 + } 1390 1398 } 1391 1399 restart: 1392 1400 status = ocfs2_super_lock(osb, 1); ··· 1430 1422 * then quota usage would be out of sync until some node takes 1431 1423 * the slot. So we remember which nodes need quota recovery 1432 1424 * and when everything else is done, we recover quotas. */ 1433 - for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); 1434 - if (i == rm_quota_used) 1435 - rm_quota[rm_quota_used++] = slot_num; 1425 + if (quota_enabled) { 1426 + for (i = 0; i < rm_quota_used 1427 + && rm_quota[i] != slot_num; i++) 1428 + ; 1429 + 1430 + if (i == rm_quota_used) 1431 + rm_quota[rm_quota_used++] = slot_num; 1432 + } 1436 1433 1437 1434 status = ocfs2_recover_node(osb, node_num, slot_num); 1438 1435 skip_recovery: ··· 1465 1452 /* Now it is right time to recover quotas... 
We have to do this under 1466 1453 * superblock lock so that no one can start using the slot (and crash) 1467 1454 * before we recover it */ 1468 - for (i = 0; i < rm_quota_used; i++) { 1469 - qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1470 - if (IS_ERR(qrec)) { 1471 - status = PTR_ERR(qrec); 1472 - mlog_errno(status); 1473 - continue; 1455 + if (quota_enabled) { 1456 + for (i = 0; i < rm_quota_used; i++) { 1457 + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1458 + if (IS_ERR(qrec)) { 1459 + status = PTR_ERR(qrec); 1460 + mlog_errno(status); 1461 + continue; 1462 + } 1463 + ocfs2_queue_recovery_completion(osb->journal, 1464 + rm_quota[i], 1465 + NULL, NULL, qrec, 1466 + ORPHAN_NEED_TRUNCATE); 1474 1467 } 1475 - ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], 1476 - NULL, NULL, qrec, 1477 - ORPHAN_NEED_TRUNCATE); 1478 1468 } 1479 1469 1480 1470 ocfs2_super_unlock(osb, 1); ··· 1499 1483 1500 1484 mutex_unlock(&osb->recovery_lock); 1501 1485 1502 - kfree(rm_quota); 1486 + if (quota_enabled) 1487 + kfree(rm_quota); 1503 1488 1504 1489 /* no one is callint kthread_stop() for us so the kthread() api 1505 1490 * requires that we call do_exit(). And it isn't exported, but
+17
fs/ocfs2/move_extents.c
··· 25 25 #include "ocfs2_ioctl.h" 26 26 27 27 #include "alloc.h" 28 + #include "localalloc.h" 28 29 #include "aops.h" 29 30 #include "dlmglue.h" 30 31 #include "extent_map.h" ··· 234 233 struct ocfs2_refcount_tree *ref_tree = NULL; 235 234 u32 new_phys_cpos, new_len; 236 235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 236 + int need_free = 0; 237 237 238 238 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 239 239 BUG_ON(!ocfs2_is_refcount_inode(inode)); ··· 310 308 if (!partial) { 311 309 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 312 310 ret = -ENOSPC; 311 + need_free = 1; 313 312 goto out_commit; 314 313 } 315 314 } ··· 335 332 mlog_errno(ret); 336 333 337 334 out_commit: 335 + if (need_free && context->data_ac) { 336 + struct ocfs2_alloc_context *data_ac = context->data_ac; 337 + 338 + if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL) 339 + ocfs2_free_local_alloc_bits(osb, handle, data_ac, 340 + new_phys_cpos, new_len); 341 + else 342 + ocfs2_free_clusters(handle, 343 + data_ac->ac_inode, 344 + data_ac->ac_bh, 345 + ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos), 346 + new_len); 347 + } 348 + 338 349 ocfs2_commit_trans(osb, handle); 339 350 340 351 out_unlock_mutex:
-6
fs/ocfs2/stackglue.c
··· 48 48 */ 49 49 static struct ocfs2_stack_plugin *active_stack; 50 50 51 - inline int ocfs2_is_o2cb_active(void) 52 - { 53 - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); 54 - } 55 - EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); 56 - 57 51 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) 58 52 { 59 53 struct ocfs2_stack_plugin *p;
-3
fs/ocfs2/stackglue.h
··· 298 298 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); 299 299 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); 300 300 301 - /* In ocfs2_downconvert_lock(), we need to know which stack we are using */ 302 - int ocfs2_is_o2cb_active(void); 303 - 304 301 extern struct kset *ocfs2_kset; 305 302 306 303 #endif /* STACKGLUE_H */
+4 -8
include/linux/gfp.h
··· 510 510 } 511 511 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, 512 512 struct vm_area_struct *vma, unsigned long addr, 513 - int node, bool hugepage); 514 - #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 515 - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) 513 + int node); 516 514 #else 517 515 #define alloc_pages(gfp_mask, order) \ 518 516 alloc_pages_node(numa_node_id(), gfp_mask, order) 519 - #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ 520 - alloc_pages(gfp_mask, order) 521 - #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 517 + #define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ 522 518 alloc_pages(gfp_mask, order) 523 519 #endif 524 520 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 525 521 #define alloc_page_vma(gfp_mask, vma, addr) \ 526 - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) 522 + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) 527 523 #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 528 - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) 524 + alloc_pages_vma(gfp_mask, 0, vma, addr, node) 529 525 530 526 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 531 527 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
+2
include/linux/mempolicy.h
··· 139 139 struct mempolicy *get_task_policy(struct task_struct *p); 140 140 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 141 141 unsigned long addr); 142 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 143 + unsigned long addr); 142 144 bool vma_policy_mof(struct vm_area_struct *vma); 143 145 144 146 extern void numa_default_policy(void);
+1 -2
include/linux/notifier.h
··· 122 122 123 123 #ifdef CONFIG_TREE_SRCU 124 124 #define _SRCU_NOTIFIER_HEAD(name, mod) \ 125 - static DEFINE_PER_CPU(struct srcu_data, \ 126 - name##_head_srcu_data); \ 125 + static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ 127 126 mod struct srcu_notifier_head name = \ 128 127 SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) 129 128
-2
kernel/kexec_file.c
··· 25 25 #include <linux/elf.h> 26 26 #include <linux/elfcore.h> 27 27 #include <linux/kernel.h> 28 - #include <linux/kexec.h> 29 - #include <linux/slab.h> 30 28 #include <linux/syscalls.h> 31 29 #include <linux/vmalloc.h> 32 30 #include "kexec_internal.h"
-1
kernel/sysctl.c
··· 66 66 #include <linux/kexec.h> 67 67 #include <linux/bpf.h> 68 68 #include <linux/mount.h> 69 - #include <linux/pipe_fs_i.h> 70 69 71 70 #include <linux/uaccess.h> 72 71 #include <asm/processor.h>
+29 -9
mm/huge_memory.c
··· 629 629 * available 630 630 * never: never stall for any thp allocation 631 631 */ 632 - static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) 632 + static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) 633 633 { 634 634 const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); 635 + gfp_t this_node = 0; 636 + 637 + #ifdef CONFIG_NUMA 638 + struct mempolicy *pol; 639 + /* 640 + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not 641 + * specified, to express a general desire to stay on the current 642 + * node for optimistic allocation attempts. If the defrag mode 643 + * and/or madvise hint requires the direct reclaim then we prefer 644 + * to fallback to other node rather than node reclaim because that 645 + * can lead to excessive reclaim even though there is free memory 646 + * on other nodes. We expect that NUMA preferences are specified 647 + * by memory policies. 648 + */ 649 + pol = get_vma_policy(vma, addr); 650 + if (pol->mode != MPOL_BIND) 651 + this_node = __GFP_THISNODE; 652 + mpol_cond_put(pol); 653 + #endif 635 654 636 655 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 637 656 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 638 657 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 639 - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 658 + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; 640 659 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 641 660 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 642 - __GFP_KSWAPD_RECLAIM); 661 + __GFP_KSWAPD_RECLAIM | this_node); 643 662 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 644 663 return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 645 - 0); 646 - return GFP_TRANSHUGE_LIGHT; 664 + this_node); 665 + return GFP_TRANSHUGE_LIGHT | this_node; 647 666 } 648 667 649 668 /* Caller must hold page table lock. */ ··· 734 715 pte_free(vma->vm_mm, pgtable); 735 716 return ret; 736 717 } 737 - gfp = alloc_hugepage_direct_gfpmask(vma); 738 - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 718 + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 719 + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); 739 720 if (unlikely(!page)) { 740 721 count_vm_event(THP_FAULT_FALLBACK); 741 722 return VM_FAULT_FALLBACK; ··· 1305 1286 alloc: 1306 1287 if (transparent_hugepage_enabled(vma) && 1307 1288 !transparent_hugepage_debug_cow()) { 1308 - huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1309 - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1289 + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 1290 + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, 1291 + haddr, numa_node_id()); 1310 1292 } else 1311 1293 new_page = NULL; 1312 1294
+1 -1
mm/memcontrol.c
··· 2593 2593 struct mem_cgroup *memcg; 2594 2594 int ret = 0; 2595 2595 2596 - if (memcg_kmem_bypass()) 2596 + if (mem_cgroup_disabled() || memcg_kmem_bypass()) 2597 2597 return 0; 2598 2598 2599 2599 memcg = get_mem_cgroup_from_current();
+1
mm/memory_hotplug.c
··· 586 586 for (i = 0; i < sections_to_remove; i++) { 587 587 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 588 588 589 + cond_resched(); 589 590 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, 590 591 altmap); 591 592 map_offset = 0;
+4 -31
mm/mempolicy.c
··· 1116 1116 } else if (PageTransHuge(page)) { 1117 1117 struct page *thp; 1118 1118 1119 - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, 1120 - HPAGE_PMD_ORDER); 1119 + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, 1120 + address, numa_node_id()); 1121 1121 if (!thp) 1122 1122 return NULL; 1123 1123 prep_transhuge_page(thp); ··· 1662 1662 * freeing by another task. It is the caller's responsibility to free the 1663 1663 * extra reference for shared policies. 1664 1664 */ 1665 - static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1665 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1666 1666 unsigned long addr) 1667 1667 { 1668 1668 struct mempolicy *pol = __get_vma_policy(vma, addr); ··· 2011 2011 * @vma: Pointer to VMA or NULL if not available. 2012 2012 * @addr: Virtual Address of the allocation. Must be inside the VMA. 2013 2013 * @node: Which node to prefer for allocation (modulo policy). 2014 - * @hugepage: for hugepages try only the preferred node if possible 2015 2014 * 2016 2015 * This function allocates a page from the kernel page pool and applies 2017 2016 * a NUMA policy associated with the VMA or the current process. 
··· 2021 2022 */ 2022 2023 struct page * 2023 2024 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 2024 - unsigned long addr, int node, bool hugepage) 2025 + unsigned long addr, int node) 2025 2026 { 2026 2027 struct mempolicy *pol; 2027 2028 struct page *page; ··· 2037 2038 mpol_cond_put(pol); 2038 2039 page = alloc_page_interleave(gfp, order, nid); 2039 2040 goto out; 2040 - } 2041 - 2042 - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { 2043 - int hpage_node = node; 2044 - 2045 - /* 2046 - * For hugepage allocation and non-interleave policy which 2047 - * allows the current node (or other explicitly preferred 2048 - * node) we only try to allocate from the current/preferred 2049 - * node and don't fall back to other nodes, as the cost of 2050 - * remote accesses would likely offset THP benefits. 2051 - * 2052 - * If the policy is interleave, or does not allow the current 2053 - * node in its nodemask, we allocate the standard way. 2054 - */ 2055 - if (pol->mode == MPOL_PREFERRED && 2056 - !(pol->flags & MPOL_F_LOCAL)) 2057 - hpage_node = pol->v.preferred_node; 2058 - 2059 - nmask = policy_nodemask(gfp, pol); 2060 - if (!nmask || node_isset(hpage_node, *nmask)) { 2061 - mpol_cond_put(pol); 2062 - page = __alloc_pages_node(hpage_node, 2063 - gfp | __GFP_THISNODE, order); 2064 - goto out; 2065 - } 2066 2041 } 2067 2042 2068 2043 nmask = policy_nodemask(gfp, pol);
+1 -1
mm/shmem.c
··· 1435 1435 1436 1436 shmem_pseudo_vma_init(&pvma, info, hindex); 1437 1437 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1438 - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); 1438 + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); 1439 1439 shmem_pseudo_vma_destroy(&pvma); 1440 1440 if (page) 1441 1441 prep_transhuge_page(page);