From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimisitic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truely atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and was depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 6 +++--- arch/arm/xen/mm.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/x86/kernel/pci-dma.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ad4eb2d26e16..e62400e5fb99 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) + else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, caller, want_vaddr); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) + else if (!gfpflags_allow_blocking(gfp)) addr = __alloc_from_pool(size, &page); else addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, @@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7c34f7126b04..c5f9a9e3d1f3 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,7 +25,7 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { struct memblock_region *reg; - gfp_t flags = __GFP_NOWARN; + gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6320361d8d4c..bb4bf6a06ad6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) { struct page *page; void *addr; @@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size, size = PAGE_ALIGN(size); - if (!coherent && !(flags & __GFP_WAIT)) { + if (!coherent && !gfpflags_allow_blocking(flags)) { struct page *page = NULL; void *addr = __alloc_from_pool(size, &page, flags); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); -- cgit v1.2.3 From 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:54 -0800 Subject: mm: make compound_head() robust Hugh has pointed that compound_head() call can be unsafe in some context. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true The race is pure theoretical. I don't it's possible to trigger it in practice. But who knows. We can fix the race by changing how encode PageTail() and compound_head() within struct page to be able to update them in one shot. The patch introduces page->compound_head into third double word block in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the rest bits are pointer to head page if bit zero is set. The patch moves page->pmd_huge_pte out of word, just in case if an architecture defines pgtable_t into something what can have the bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store pointer struct hugetlb_cgroup. The patch switch it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in first tail page to store struct hugetlb_cgroup pointer. But that's out of scope of the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long list to be absolutely sure, but looks like nobody uses bit 0 of the word. page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future call_rcu_lazy() is not allowed as it makes use of the bit and we can get false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Cc: Vlastimil Babka Acked-by: Paul E. McKenney Cc: Aneesh Kumar K.V Cc: Andi Kleen Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/xtensa/configs/iss_defconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index f3dfe0d921c2..44c6764d9146 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -169,7 +169,6 @@ CONFIG_FLATMEM_MANUAL=y # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y -CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 -- cgit v1.2.3 From 06d8f8178c5709ff21cfbe16b6c078d2669b6e80 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:05 -0800 Subject: arch/sh/kernel/traps_64.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/cpu/sh5/unwind.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sh/kernel/cpu/sh5/unwind.c b/arch/sh/kernel/cpu/sh5/unwind.c index 10aed41757fc..3a4fed406fc6 100644 --- a/arch/sh/kernel/cpu/sh5/unwind.c +++ b/arch/sh/kernel/cpu/sh5/unwind.c @@ -159,7 +159,7 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, /* Sign extend */ regcache[dest] = - ((((s64)(u64)op >> 10) & 0xffff) << 54) >> 54; + sign_extend64((((u64)op >> 10) & 0xffff), 9); break; case (0xd0 >> 2): /* addi */ case (0xd4 >> 2): /* addi.l */ diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 112ea11c030d..d208c27ccc67 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -101,7 +101,7 @@ static int generate_and_check_address(struct pt_regs *regs, if (displacement_not_indexed) { __s64 displacement; displacement = (opcode >> 10) & 0x3ff; - displacement = ((displacement << 54) >> 54); /* sign extend */ + displacement = sign_extend64(displacement, 9); addr = (__u64)((__s64)base_address + (displacement << width_shift)); } else { __u64 offset; -- cgit v1.2.3 From 78e3c7951021b4e1a554b3d619506b55b0619073 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:08 -0800 Subject: arch/x86/kernel/cpu/perf_event_msr.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_msr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13934f2..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } -- cgit v1.2.3