diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-26 18:08:18 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-26 18:08:18 -0800 |
commit | e57d9f638af9673f38d9f09de66fa0a28303127d (patch) | |
tree | 1948825bba57b4563ef0a5e7dd7e90634441b66e /arch/x86 | |
parent | d6e867a6ae13bc02cd01c535764e5b051d26cf28 (diff) | |
parent | 6848ac7ca39a226ede5df7af0efcc4ef0611e99c (diff) |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main changes in this cycle were:
- Update and clean up x86 fault handling, by Andy Lutomirski.
- Drop usage of __flush_tlb_all() in kernel_physical_mapping_init()
and related fallout, by Dan Williams.
- CPA cleanups and reorganization by Peter Zijlstra: simplify the
flow and remove a few warts.
- Other misc cleanups"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
x86/mm/dump_pagetables: Use DEFINE_SHOW_ATTRIBUTE()
x86/mm/cpa: Rename @addrinarray to @numpages
x86/mm/cpa: Better use CLFLUSHOPT
x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function
x86/mm/cpa: Make cpa_data::numpages invariant
x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation
x86/mm/cpa: Simplify the code after making cpa->vaddr invariant
x86/mm/cpa: Make cpa_data::vaddr invariant
x86/mm/cpa: Add __cpa_addr() helper
x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests
x86/mm: Drop usage of __flush_tlb_all() in kernel_physical_mapping_init()
x86/mm: Validate kernel_physical_mapping_init() PTE population
generic/pgtable: Introduce set_pte_safe()
generic/pgtable: Introduce {p4d,pgd}_same()
generic/pgtable: Make {pmd, pud}_same() unconditionally available
x86/fault: Clean up the page fault oops decoder a bit
x86/fault: Decode page fault OOPSes better
x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
x86/oops: Show the correct CS value in show_regs()
x86/fault: Don't try to recover from an implicit supervisor access
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/entry/vsyscall/vsyscall_64.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/disabled-features.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/pgalloc.h | 27 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 5 | ||||
-rw-r--r-- | arch/x86/mm/debug_pagetables.c | 58 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 244 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 30 | ||||
-rw-r--r-- | arch/x86/mm/mm_internal.h | 2 | ||||
-rw-r--r-- | arch/x86/mm/pageattr-test.c | 31 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 271 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 4 |
11 files changed, 345 insertions, 337 deletions
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 85fd85d52ffd..d78bcc03e60e 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; - thread->error_code = 6; /* user fault, no page, write */ + thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33833d1909af..a5ea841cc6d2 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_SMAP +# define DISABLE_SMAP 0 +#else +# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) +#endif + #ifdef CONFIG_X86_INTEL_UMIP # define DISABLE_UMIP 0 #else @@ -68,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_MPX) +#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index ec7f43327033..1ea41aaef68b 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -80,6 +80,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } +static inline void pmd_populate_kernel_safe(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { @@ -132,6 +139,12 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } + +static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd))); +} #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -141,6 +154,12 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud))); +} + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; @@ -173,6 +192,14 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } +static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + if (!pgtable_l5_enabled()) + return; + paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); + set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); +} + static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 721d02bd2d0d..6a62f4af9fcf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; - unsigned int ds, cs, es; + unsigned int ds, es; show_iret_regs(regs); @@ -100,7 +100,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) } asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); @@ -116,7 +115,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, es, cr0); printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 225fe2f0bfec..cd84f067e41d 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -10,20 +10,9 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .owner = THIS_MODULE, - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_show_curknl(struct seq_file *m, void *v) +static int ptdump_curknl_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -33,23 +22,12 @@ static int ptdump_show_curknl(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curknl(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curknl, NULL); -} - -static const struct file_operations ptdump_curknl_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curknl, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); #ifdef CONFIG_PAGE_TABLE_ISOLATION static struct dentry *pe_curusr; -static int ptdump_show_curusr(struct seq_file *m, void *v) +static int ptdump_curusr_show(struct seq_file *m, void *v) { if (current->mm->pgd) { down_read(¤t->mm->mmap_sem); @@ -59,42 +37,20 @@ static int ptdump_show_curusr(struct seq_file *m, void *v) return 0; } -static int ptdump_open_curusr(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_curusr, NULL); -} - -static const struct file_operations ptdump_curusr_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_curusr, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); #endif #if defined(CONFIG_EFI) && defined(CONFIG_X86_64) static struct dentry *pe_efi; -static int ptdump_show_efi(struct seq_file *m, void *v) +static int ptdump_efi_show(struct seq_file *m, void *v) { if (efi_mm.pgd) ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); return 0; } -static int ptdump_open_efi(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show_efi, NULL); -} - -static const struct file_operations ptdump_efi_fops = { - .owner = THIS_MODULE, - .open = ptdump_open_efi, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump_efi); #endif static struct dentry *dir, *pe_knl, *pe_curknl; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 71d4b9d4d43f..2ff25ad33233 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ +#include <asm/desc.h> /* store_idt(), ... */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -571,10 +572,55 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) +{ + u32 offset = (index >> 3) * sizeof(struct desc_struct); + unsigned long addr; + struct ldttss_desc desc; + + if (index == 0) { + pr_alert("%s: NULL\n", name); + return; + } + + if (offset + sizeof(struct ldttss_desc) >= gdt->size) { + pr_alert("%s: 0x%hx -- out of bounds\n", name, index); + return; + } + + if (probe_kernel_read(&desc, (void *)(gdt->address + offset), + sizeof(struct ldttss_desc))) { + pr_alert("%s: 0x%hx -- GDT entry is not readable\n", + name, index); + return; + } + + addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24); +#ifdef CONFIG_X86_64 + addr |= ((u64)desc.base3 << 32); +#endif + pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", + name, index, addr, (desc.limit0 | (desc.limit1 << 16))); +} + +/* + * This helper function transforms the #PF error_code bits into + * "[PROT] [USER]" type of descriptive, almost human-readable error strings: + */ +static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) +{ + if (error_code & mask) { + if (buf[0]) + strcat(buf, " "); + strcat(buf, txt); + } +} + static void -show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + char err_txt[64]; + if (!oops_may_print()) return; @@ -602,6 +648,52 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", (void *)address); + err_txt[0] = 0; + + /* + * Note: length of these appended strings including the separation space and the + * zero delimiter must fit into err_txt[]. + */ + err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); + err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); + err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); + err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); + err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); + err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); + + pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + + if (!(error_code & X86_PF_USER) && user_mode(regs)) { + struct desc_ptr idt, gdt; + u16 ldtr, tr; + + pr_alert("This was a system access from user code\n"); + + /* + * This can happen for quite a few reasons. The more obvious + * ones are faults accessing the GDT, or LDT. Perhaps + * surprisingly, if the CPU tries to deliver a benign or + * contributory exception from user code and gets a page fault + * during delivery, the page fault can be delivered as though + * it originated directly from user code. This could happen + * due to wrong permissions on the IDT, GDT, LDT, TSS, or + * kernel or IST stack. + */ + store_idt(&idt); + + /* Usable even on Xen PV -- it's just slow. */ + native_store_gdt(&gdt); + + pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", + idt.address, idt.size, gdt.address, gdt.size); + + store_ldt(ldtr); + show_ldttss(&gdt, "LDTR", ldtr); + + store_tr(tr); + show_ldttss(&gdt, "TR", tr); + } + dump_pagetable(address); } @@ -621,16 +713,30 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; + + /* + * To avoid leaking information about the kernel page + * table layout, pretend that user-mode accesses to + * kernel addresses are always protection faults. + */ + if (address >= TASK_SIZE_MAX) + error_code |= X86_PF_PROT; + + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; +} + static noinline void no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) @@ -639,6 +745,15 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long flags; int sig; + if (user_mode(regs)) { + /* + * This is an implicit supervisor-mode access from user + * mode. Bypass all the kernel-mode recovery code and just + * OOPS. + */ + goto oops; + } + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* @@ -656,9 +771,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | X86_PF_USER; - tsk->thread.cr2 = address; + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ force_sig_fault(signal, si_code, (void __user *)address, @@ -726,6 +839,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_recover_from_page_fault(address); +oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: @@ -737,10 +851,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -794,7 +904,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & X86_PF_USER) { + if (user_mode(regs) && (error_code & X86_PF_USER)) { /* * It's possible to have interrupts off here: */ @@ -821,9 +931,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); @@ -937,9 +1045,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { @@ -1148,23 +1254,6 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } -static inline bool smap_violation(int error_code, struct pt_regs *regs) -{ - if (!IS_ENABLED(CONFIG_X86_SMAP)) - return false; - - if (!static_cpu_has(X86_FEATURE_SMAP)) - return false; - - if (error_code & X86_PF_USER) - return false; - - if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) - return false; - - return true; -} - /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that @@ -1230,7 +1319,6 @@ void do_user_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - unsigned long sw_error_code; struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; @@ -1252,10 +1340,16 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* - * Check for invalid kernel (supervisor) access to user - * pages in the user address space. + * If SMAP is on, check for invalid kernel (supervisor) access to user + * pages in the user address space. The odd case here is WRUSS, + * which, according to the preliminary documentation, does not respect + * SMAP and will have the USER bit set so, in all cases, SMAP + * enforcement appears to be consistent with the USER bit. */ - if (unlikely(smap_violation(hw_error_code, regs))) { + if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && + !(hw_error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) + { bad_area_nosemaphore(regs, hw_error_code, address); return; } @@ -1270,13 +1364,6 @@ void do_user_addr_fault(struct pt_regs *regs, } /* - * hw_error_code is literally the "page fault error code" passed to - * the kernel directly from the hardware. But, we will shortly be - * modifying it in software, so give it a new name. - */ - sw_error_code = hw_error_code; - - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * @@ -1285,26 +1372,6 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (user_mode(regs)) { local_irq_enable(); - /* - * Up to this point, X86_PF_USER set in hw_error_code - * indicated a user-mode access. But, after this, - * X86_PF_USER in sw_error_code will indicate either - * that, *or* an implicit kernel(supervisor)-mode access - * which originated from user mode. - */ - if (!(hw_error_code & X86_PF_USER)) { - /* - * The CPU was in user mode, but the CPU says - * the fault was not a user-mode access. - * Must be an implicit kernel-mode access, - * which we do not expect to happen in the - * user address space. - */ - pr_warn_once("kernel-mode error from user-mode: %lx\n", - hw_error_code); - - sw_error_code |= X86_PF_USER; - } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1313,9 +1380,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (sw_error_code & X86_PF_WRITE) + if (hw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (sw_error_code & X86_PF_INSTR) + if (hw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1328,7 +1395,7 @@ void do_user_addr_fault(struct pt_regs *regs, * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. */ - if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { if (emulate_vsyscall(regs, address)) return; } @@ -1344,18 +1411,15 @@ void do_user_addr_fault(struct pt_regs *regs, * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we * 1. Failed to acquire mmap_sem, and - * 2. The access did not originate in userspace. Note: either the - * hardware or earlier page fault code may set X86_PF_USER - * in sw_error_code. + * 2. The access did not originate in userspace. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(sw_error_code & X86_PF_USER) && - !search_exception_tables(regs->ip)) { + if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, sw_error_code, address); + bad_area_nosemaphore(regs, hw_error_code, address); return; } retry: @@ -1371,29 +1435,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } - if (sw_error_code & X86_PF_USER) { - /* - * Accessing the stack below %sp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535, $31" pushes - * 32 pointers and then decrements %sp by 65535.) - */ - if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, sw_error_code, address); - return; - } - } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } @@ -1402,8 +1454,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(sw_error_code, vma))) { - bad_area_access_error(regs, sw_error_code, address, vma); + if (unlikely(access_error(hw_error_code, vma))) { + bad_area_access_error(regs, hw_error_code, address, vma); return; } @@ -1442,13 +1494,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, sw_error_code, address, fault); + mm_fault_error(regs, hw_error_code, address, fault); return; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fab264948c2..484c1b92f078 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -432,7 +432,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pte(pte, __pte(0)); + set_pte_safe(pte, __pte(0)); continue; } @@ -452,7 +452,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; } @@ -487,7 +487,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pmd(pmd, __pmd(0)); + set_pmd_safe(pmd, __pmd(0)); continue; } @@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte((pte_t *)pmd, + set_pte_safe((pte_t *)pmd, pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); @@ -536,7 +536,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, pte); + pmd_populate_kernel_safe(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -573,7 +573,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_pud(pud, __pud(0)); + set_pud_safe(pud, __pud(0)); continue; } @@ -584,7 +584,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, paddr_end, page_size_mask, prot); - __flush_tlb_all(); continue; } /* @@ -611,7 +610,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); - set_pte((pte_t *)pud, + set_pte_safe((pte_t *)pud, pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); @@ -624,10 +623,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, page_size_mask, prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, pmd); + pud_populate_safe(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -659,7 +657,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RESERVED_KERN)) - set_p4d(p4d, __p4d(0)); + set_p4d_safe(p4d, __p4d(0)); continue; } @@ -668,7 +666,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, paddr_last = phys_pud_init(pud, paddr, paddr_end, page_size_mask); - __flush_tlb_all(); continue; } @@ -677,10 +674,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, page_size_mask); spin_lock(&init_mm.page_table_lock); - p4d_populate(&init_mm, p4d, pud); + p4d_populate_safe(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); } - __flush_tlb_all(); return paddr_last; } @@ -723,9 +719,9 @@ kernel_physical_mapping_init(unsigned long paddr_start, spin_lock(&init_mm.page_table_lock); if (pgtable_l5_enabled()) - pgd_populate(&init_mm, pgd, p4d); + pgd_populate_safe(&init_mm, pgd, p4d); else - p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -733,8 +729,6 @@ kernel_physical_mapping_init(unsigned long paddr_start, if (pgd_changed) sync_global_pgds(vaddr_start, vaddr_end - 1); - __flush_tlb_all(); - return paddr_last; } diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 4e1f6e1b8159..319bde386d5f 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -19,4 +19,6 @@ extern int after_bootmem; void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); +extern unsigned long tlb_single_page_flush_ceiling; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 08f8f76a4852..facce271e8b9 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -23,7 +23,8 @@ static __read_mostly int print = 1; enum { - NTEST = 400, + NTEST = 3 * 100, + NPAGES = 100, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -110,6 +111,9 @@ static int print_split(struct split_state *s) static unsigned long addr[NTEST]; static unsigned int len[NTEST]; +static struct page *pages[NPAGES]; +static unsigned long addrs[NPAGES]; + /* Change the global bit on random pages in the direct mapping */ static int pageattr_test(void) { @@ -120,7 +124,6 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; - unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -137,7 +140,7 @@ static int pageattr_test(void) unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32() % 100; + len[i] = prandom_u32() % NPAGES; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) @@ -167,14 +170,29 @@ static int pageattr_test(void) break; } __set_bit(pfn + k, bm); + addrs[k] = addr[i] + k*PAGE_SIZE; + pages[k] = pfn_to_page(pfn + k); } if (!addr[i] || !pte || !k) { addr[i] = 0; continue; } - test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + switch (i % 3) { + case 0: + err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0); + break; + + case 1: + err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + break; + + case 2: + err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST); + break; + } + + if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -206,8 +224,7 @@ static int pageattr_test(void) failed++; continue; } - test_addr = addr[i]; - err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e44fe1a63f72..4f8972311a77 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,6 +26,8 @@ #include <asm/pat.h> #include <asm/set_memory.h> +#include "mm_internal.h" + /* * The current flushing context - we pass it instead of 5 arguments: */ @@ -35,11 +37,11 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; - int flags; + unsigned long curpage; unsigned long pfn; - unsigned force_split : 1, + unsigned int flags; + unsigned int force_split : 1, force_static_prot : 1; - int curpage; struct page **pages; }; @@ -228,19 +230,28 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif +static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) +{ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[idx]; + + if (unlikely(PageHighMem(page))) + return 0; + + return (unsigned long)page_address(page); + } + + if (cpa->flags & CPA_ARRAY) + return cpa->vaddr[idx]; + + return *cpa->vaddr + idx * PAGE_SIZE; +} + /* * Flushing functions */ -/** - * clflush_cache_range - flush a cache range with clflush - * @vaddr: virtual start address - * @size: number of bytes to flush - * - * clflushopt is an unordered instruction which needs fencing with mfence or - * sfence to avoid ordering issues. - */ -void clflush_cache_range(void *vaddr, unsigned int size) +static void clflush_cache_range_opt(void *vaddr, unsigned int size |