From bea41197ead3e03308bdd10c11db3ce91ae5c8ab Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 25 Jun 2015 14:59:39 -0700 Subject: s390/mm: make hugepages_supported a boot time decision There is a potential bug with KVM and hugetlbfs if the hardware does not support hugepages (EDAT1). We fix this by making EDAT1 a hard requirement for hugepages and therefore removing and simplifying code. As s390, with the sw-emulated hugepages, was the only user of arch_prepare/release_hugepage I also removed theses calls from common and other architecture code. This patch (of 5): By dropping support for hugepages on machines which do not have the hardware feature EDAT1, we fix a potential s390 KVM bug. The bug would happen if a guest is backed by hugetlbfs (not supported currently), but does not get pagetables with PGSTE. This would lead to random memory overwrites. Signed-off-by: Dominik Dingel Acked-by: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/page.h | 8 ++++---- arch/s390/kernel/setup.c | 2 ++ arch/s390/mm/pgtable.c | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 53eacbd4f09b..0844b780c6a4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -17,7 +17,10 @@ #define PAGE_DEFAULT_ACC 0 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) -#define HPAGE_SHIFT 20 +#include +#ifndef __ASSEMBLY__ + +extern unsigned int HPAGE_SHIFT; #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) @@ -27,9 +30,6 @@ #define ARCH_HAS_PREPARE_HUGEPAGE #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH -#include -#ifndef __ASSEMBLY__ - static inline void storage_key_init_range(unsigned long start, unsigned long end) { #if PAGE_DEFAULT_KEY diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index af4f41d52cde..73941bf42350 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -880,6 +880,8 @@ void __init setup_arch(char **cmdline_p) */ setup_hwcaps(); + HPAGE_SHIFT = MACHINE_HAS_HPAGE ? 20 : 0; + /* * Create kernel page tables and switch to virtual addressing. */ diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b33f66110ca9..16154720bdb6 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -31,6 +31,8 @@ #define ALLOC_ORDER 2 #define FRAG_MASK 0x03 +unsigned int HPAGE_SHIFT; + unsigned long *crst_table_alloc(struct mm_struct *mm) { struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); -- cgit v1.2.3 From ce415712cf920e08d622f07eaa9f5d1eb7e93919 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 25 Jun 2015 14:59:47 -0700 Subject: s390/hugetlb: remove dead code for sw emulated huge pages We now support only hugepages on hardware with EDAT1 support. So we remove the prepare/release_hugepage hooks and simplify set_huge_pte_at and huge_ptep_get. Signed-off-by: Dominik Dingel Acked-by: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/hugetlb.h | 3 --- arch/s390/mm/hugetlbpage.c | 60 +++-------------------------------------- 2 files changed, 3 insertions(+), 60 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index dfb542ade6b1..0130d0379edd 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -37,9 +37,6 @@ static inline int prepare_hugepage_range(struct file *file, #define arch_clear_hugepage_flags(page) do { } while (0) -int arch_prepare_hugepage(struct page *page); -void arch_release_hugepage(struct page *page); - static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c3f8e3df92ff..3bbfd4f43ed7 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -86,31 +86,16 @@ static inline pte_t __pmd_to_pte(pmd_t pmd) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - pmd_t pmd; + pmd_t pmd = __pte_to_pmd(pte); - pmd = __pte_to_pmd(pte); - if (!MACHINE_HAS_HPAGE) { - /* Emulated huge ptes loose the dirty and young bit */ - pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN; - pmd_val(pmd) |= pte_page(pte)[1].index; - } else - pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; + pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; *(pmd_t *) ptep = pmd; } pte_t huge_ptep_get(pte_t *ptep) { - unsigned long origin; - pmd_t pmd; + pmd_t pmd = *(pmd_t *) ptep; - pmd = *(pmd_t *) ptep; - if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) { - origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN; - pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN; - pmd_val(pmd) |= *(unsigned long *) origin; - /* Emulated huge ptes are young and dirty by definition */ - pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG | _SEGMENT_ENTRY_DIRTY; - } return __pmd_to_pte(pmd); } @@ -125,45 +110,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -int arch_prepare_hugepage(struct page *page) -{ - unsigned long addr = page_to_phys(page); - pte_t pte; - pte_t *ptep; - int i; - - if (MACHINE_HAS_HPAGE) - return 0; - - ptep = (pte_t *) pte_alloc_one(&init_mm, addr); - if (!ptep) - return -ENOMEM; - - pte_val(pte) = addr; - for (i = 0; i < PTRS_PER_PTE; i++) { - set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte); - pte_val(pte) += PAGE_SIZE; - } - page[1].index = (unsigned long) ptep; - return 0; -} - -void arch_release_hugepage(struct page *page) -{ - pte_t *ptep; - - if (MACHINE_HAS_HPAGE) - return; - - ptep = (pte_t *) page[1].index; - if (!ptep) - return; - clear_table((unsigned long *) ptep, _PAGE_INVALID, - PTRS_PER_PTE * sizeof(pte_t)); - page_table_free(&init_mm, (unsigned long *) ptep); - page[1].index = 0; -} - pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { -- cgit v1.2.3 From cbd7d9c2b70f5e2fc78e0c90b3034b94dca6c82b Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 25 Jun 2015 14:59:49 -0700 Subject: s390/mm: forward check for huge pmds to pmd_large() We already do the check in pmd_large, so we can just forward the call. Signed-off-by: Dominik Dingel Acked-by: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/hugetlbpage.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3bbfd4f43ed7..fb4bf2c4379e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -141,10 +141,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) int pmd_huge(pmd_t pmd) { - if (!MACHINE_HAS_HPAGE) - return 0; - - return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE); + return pmd_large(pmd); } int pud_huge(pud_t pud) -- cgit v1.2.3 From cf54e2fce51c7ad2479fe8cf213a2ed618a8189b Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 25 Jun 2015 14:59:52 -0700 Subject: s390/mm: change HPAGE_SHIFT type to int With making HPAGE_SHIFT an unsigned integer we also accidentally changed pageblock_order. In order to avoid compiler warnings we make HPAGE_SHFIT an int again. Signed-off-by: Dominik Dingel Suggested-by: Andrew Morton Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/page.h | 2 +- arch/s390/mm/pgtable.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 0844b780c6a4..dd345238d9a7 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -20,7 +20,7 @@ #include #ifndef __ASSEMBLY__ -extern unsigned int HPAGE_SHIFT; +extern int HPAGE_SHIFT; #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 16154720bdb6..33082d0d101b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -31,7 +31,7 @@ #define ALLOC_ORDER 2 #define FRAG_MASK 0x03 -unsigned int HPAGE_SHIFT; +int HPAGE_SHIFT; unsigned long *crst_table_alloc(struct mm_struct *mm) { -- cgit v1.2.3 From 3033f14ab78c326871a4902591c2518410add24a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 25 Jun 2015 15:01:19 -0700 Subject: clone: support passing tls argument via C rather than pt_regs magic clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. This patch (of 2): clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thiago Macieira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index d7fa2f0f1425..f8498dde67b1 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -202,7 +202,7 @@ COMPAT_SYSCALL_WRAP1(epoll_create1, int, flags); COMPAT_SYSCALL_WRAP2(tkill, int, pid, int, sig); COMPAT_SYSCALL_WRAP3(tgkill, int, tgid, int, pid, int, sig); COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags); -COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val); +COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls); COMPAT_SYSCALL_WRAP2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags); COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim); COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag); -- cgit v1.2.3