From 80552f0f7aebdd8deda8ea455292cbfbf462d655 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 16 Apr 2019 10:22:57 -0400 Subject: mm/slab: Remove store_stackinfo() store_stackinfo() does not seem to be used in actual SLAB debugging. Potentially, it could be added to check_poison_obj() to provide more information but this seems like overkill due to the declining popularity of SLAB, so just remove it instead. Signed-off-by: Qian Cai Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Acked-by: Vlastimil Babka Cc: Andrew Morton Cc: Andy Lutomirski Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Josh Poimboeuf Cc: linux-mm Cc: Pekka Enberg Cc: rientjes@google.com Cc: sean.j.christopherson@intel.com Link: https://lkml.kernel.org/r/20190416142258.18694-1-cai@lca.pw --- mm/slab.c | 48 ++++++------------------------------------------ 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 47a380a486ee..e79ef28396e2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) } #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, - unsigned long caller) -{ - int size = cachep->object_size; - - addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; - - if (size < 5 * sizeof(unsigned long)) - return; - - *addr++ = 0x12345678; - *addr++ = caller; - *addr++ = smp_processor_id(); - size -= 3 * sizeof(unsigned long); - { - unsigned long *sptr = &caller; - unsigned long svalue; - - while (!kstack_end(sptr)) { - svalue = *sptr++; - if (kernel_text_address(svalue)) { - *addr++ = svalue; - size -= sizeof(unsigned long); - if (size <= sizeof(unsigned long)) - break; - } - } - - } - *addr++ = 0x87654321; -} - -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) { if
(!is_debug_pagealloc_cache(cachep)) return; - if (caller) - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) {} + int map) {} #endif @@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2434,7 +2398,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, 0); + slab_kernel_map(cachep, objp, 0); } } #endif @@ -2813,7 +2777,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, caller); + slab_kernel_map(cachep, objp, 0); } return objp; } @@ -3077,7 +3041,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) -- cgit v1.2.3 From 7dbcf2b0b770eeb803a416ee8dcbef78e6389d40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:38 +0200 Subject: x86/irq/64: Limit IST stack overflow check to #DB stack Commit 37fe6a42b343 ("x86: Check stack overflow in detail") added a broad check for the full exception stack area, i.e. it considers the full exception stack area as valid. 
That's wrong in two aspects: 1) It does not check the individual areas one by one 2) #DF, NMI and #MCE are not enabling interrupts which means that a regular device interrupt cannot happen in their context. In fact if a device interrupt hits one of those IST stacks that's a bug because some code path enabled interrupts while handling the exception. Limit the check to the #DB stack and consider all other IST stacks as 'overflow' or invalid. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Mitsuo Hayasaka Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.682135110@linutronix.de --- arch/x86/kernel/irq_64.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..b50ac9c7397b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,9 +26,18 @@ int sysctl_panic_on_stackoverflow; /* * Probabilistic stack overflow check: * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. + * Regular device interrupts can enter on the following stacks: + * + * - User stack + * + * - Kernel task stack + * + * - Interrupt stack if a device driver reenables interrupts + * which should only happen in really old drivers. + * + * - Debug IST stack + * + * All other contexts are invalid. 
*/ static inline void stack_overflow_check(struct pt_regs *regs) { @@ -53,8 +62,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + estack_bottom = (u64)oist->ist[DEBUG_STACK]; + estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; -- cgit v1.2.3 From fa33215422fd415a07ec2a00e9f1acdaf0fa8e94 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 17:59:39 +0200 Subject: x86/dumpstack: Fix off-by-one errors in stack identification The get_stack_info() function is off-by-one when checking whether an address is on an IRQ stack or an IST stack. This prevents an overflowed IRQ or IST stack from being dumped properly. [ tglx: Do the same for 32-bit ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.785651055@linutronix.de --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..d305440ebe9c 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -41,7 +41,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -66,7 +66,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..90f0fa88cbb3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -65,7 +65,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack <= begin || stack >= end) + if (stack < begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -88,7 +88,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; -- cgit v1.2.3 From 4f44b8f0b33b7111216f0fad353315f796b81617 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 17:59:40 +0200 Subject: x86/irq/64: Remove a hardcoded irq_stack_union access stack_overflow_check() is using both irq_stack_ptr and irq_stack_union to find the IRQ stack. That's going to break when vmapped irq stacks are introduced. Change it to just use irq_stack_ptr. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.872549191@linutronix.de --- arch/x86/kernel/irq_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b50ac9c7397b..f6dcc8fea5c0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -55,9 +55,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_top = irq_stack_bottom - IRQ_STACK_SIZE + STACK_TOP_MARGIN; if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; -- cgit v1.2.3 From df835e7083bee33e98635aca26b39b63ebc6cca7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:41 +0200 Subject: x86/irq/64: Sanitize the top/bottom confusion On x86, stacks go top to bottom, but the stack overflow check uses it the other way round, which is just confusing. Clean it up and sanitize the warning string a bit. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.961241397@linutronix.de --- arch/x86/kernel/irq_64.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f6dcc8fea5c0..cf200466d5c8 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -42,7 +42,7 @@ int sysctl_panic_on_stackoverflow; static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 +#define STACK_MARGIN 128 struct orig_ist *oist; u64 irq_stack_top, irq_stack_bottom; u64 estack_top, estack_bottom; @@ -51,25 +51,25 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - irq_stack_top = irq_stack_bottom - IRQ_STACK_SIZE + STACK_TOP_MARGIN; - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + irq_stack_top = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_bottom = irq_stack_top - IRQ_STACK_SIZE + STACK_MARGIN; + if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; oist = this_cpu_ptr(&orig_ist); - estack_bottom = (u64)oist->ist[DEBUG_STACK]; - estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) + estack_top = (u64)oist->ist[DEBUG_STACK]; + estack_bottom = estack_top - DEBUG_STKSZ + STACK_MARGIN; + if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the 
kernel stack (cur:%Lx,sp:%lx, irq stack:%Lx-%Lx, exception stack: %Lx-%Lx, ip:%pF)\n", current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + irq_stack_bottom, irq_stack_top, + estack_bottom, estack_top, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); -- cgit v1.2.3 From 99d334511b337884cadbdfae28da912a4edb1001 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:42 +0200 Subject: x86/idt: Remove unused macro SISTG Commit d8ba61ba58c8 ("x86/entry/64: Don't use IST entry for #BP stack") removed the last user but left the macro around. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Dou Liyang Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.050689789@linutronix.de --- arch/x86/kernel/idt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..2877606e97de 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -45,10 +45,6 @@ struct idt_data { #define ISTG(_vector, _addr, _ist) \ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) - /* Task gate */ #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) -- cgit v1.2.3 From 6f36bd8d2e8c221eaaf4ce5b0ebbb11c00b0ac98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:43 +0200 Subject: x86/64: Remove stale CURRENT_MASK Nothing uses that and before people get the wrong ideas, get rid of it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: "Kirill A. 
Shutemov" Cc: Andy Lutomirski Cc: Baoquan He Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.139284839@linutronix.de --- arch/x86/include/asm/page_64_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8f657286d599..bcd8c0518604 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,7 +14,6 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -- cgit v1.2.3 From 30842211506e376b76394a9cb4e6d0c9d258b8d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:44 +0200 Subject: x86/exceptions: Remove unused stack defines on 32bit Nothing requires those for 32bit builds. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Michal Hocko Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.227822695@linutronix.de --- arch/x86/include/asm/page_32_types.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0d5c739eebd7..57b5dc15ca7e 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -22,11 +22,7 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 0 -#define DEBUG_STACK 0 -#define MCE_STACK 0 -#define N_EXCEPTION_STACKS 1 +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* -- cgit v1.2.3 From 8f34c5b5afce91d171bb0802631197484cb69b8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:45 +0200 Subject: x86/exceptions: Make IST index zero based The defines for the exception stack (IST) array in the TSS are using the SDM convention IST1 - IST7. That causes all sorts of code to subtract 1 for array indices related to IST. That's confusing at best and does not provide any value. Make the indices zero based and fixup the usage sites. The only code which needs to adjust the 0 based index is the interrupt descriptor setup which needs to add 1 now. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: "Kirill A. 
Shutemov" Cc: Konrad Rzeszutek Wilk Cc: linux-doc@vger.kernel.org Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.331772825@linutronix.de --- Documentation/x86/kernel-stacks | 8 ++++---- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/page_64_types.h | 13 ++++++++----- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 14 +++++++------- arch/x86/kernel/idt.c | 15 +++++++++------ arch/x86/kernel/irq_64.c | 2 +- arch/x86/mm/fault.c | 2 +- 8 files changed, 34 insertions(+), 28 deletions(-) diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 9a0aa4d3a866..1b04596caea9 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt. The currently assigned IST stacks are :- -* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 8 - Double Fault Exception (#DF). @@ -68,7 +68,7 @@ The currently assigned IST stacks are :- Using a separate stack allows the kernel to recover from it well enough in many cases to still output an oops. -* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE). Used for non-maskable interrupts (NMI). @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* DEBUG_STACK. DEBUG_STKSZ +* ESTACK_DB. DEBUG_STKSZ Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,7 +86,7 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. -* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC). 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..fd0a50452cb3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -841,7 +841,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) /** * idtentry - Generate an IDT entry stub @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=ESTACK_DB idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index bcd8c0518604..6ab2c54c1bf9 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -24,11 +24,14 @@ #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 2 -#define DEBUG_STACK 3 -#define MCE_STACK 4 -#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ +/* + * The index for the tss.ist[] array. The hardware limit is 7 entries. + */ +#define ESTACK_DF 0 +#define ESTACK_NMI 1 +#define ESTACK_DB 2 +#define ESTACK_MCE 3 +#define N_EXCEPTION_STACKS 4 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..0e4cb718fc4a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -516,7 +516,7 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ + [ESTACK_DB] = DEBUG_STKSZ }; #endif @@ -1760,7 +1760,7 @@ void cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - if (v == DEBUG_STACK-1) + if (v == ESTACK_DB) per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 90f0fa88cbb3..455b47ef9250 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -18,16 +18,16 @@ #include -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", +static const char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { +static const unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ + [ESTACK_DB] = DEBUG_STKSZ }; const char *stack_type_name(enum stack_type type) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2877606e97de..2188f734ec61 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,9 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. 
+ */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -180,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, ESTACK_DB), + ISTG(X86_TRAP_NMI, nmi, ESTACK_NMI), + ISTG(X86_TRAP_DF, double_fault, ESTACK_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, ESTACK_MCE), #endif }; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index cf200466d5c8..182e8b245e06 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -61,7 +61,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[DEBUG_STACK]; + estack_top = (u64)oist->ist[ESTACK_DB]; estack_bottom = estack_top - DEBUG_STKSZ + STACK_MARGIN; if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..0524e1d74f24 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -793,7 +793,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + unsigned long stack = this_cpu_read(orig_ist.ist[ESTACK_DF]) - sizeof(void *); /* * We're likely to be running with very little stack space * left. 
It's plausible that we'd hit this condition but -- cgit v1.2.3 From 881a463cf21dbf83aab2cf6c9a359f34f88c2491 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:46 +0200 Subject: x86/cpu_entry_area: Cleanup setup functions No point in retrieving the entry area pointer over and over. Do it once and use unsigned int for 'cpu' everywhere. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.419653165@linutronix.de --- arch/x86/mm/cpu_entry_area.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 19c6abf9ea31..c2a54f75d335 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void __init percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(unsigned int cpu) { #ifdef CONFIG_CPU_SUP_INTEL - int npages; + unsigned int npages; void *cea; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) @@ -79,8 +79,9 @@ static void __init percpu_setup_debug_store(int cpu) } /* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(unsigned int cpu) { + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ pgprot_t gdt_prot = PAGE_KERNEL_RO; @@ -101,10 +102,9 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), - gdt_prot); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + cea_map_percpu_pages(&cea->entry_stack_page, per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); @@ -128,19 +128,18 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, - &per_cpu(cpu_tss_rw, cpu), + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + per_cpu(cpu_entry_area, cpu) = cea; #endif #ifdef CONFIG_X86_64 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + cea_map_percpu_pages(&cea->exception_stacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); #endif -- cgit v1.2.3 From 019b17b3ffe48100e52f609ca1c6ed6e5a40cba1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:47 +0200 Subject: x86/exceptions: Add structs for exception stacks At the moment everything assumes a full linear mapping of the various exception stacks. Adding guard pages to the cpu entry area mapping of the exception stacks will break that assumption. As a preparatory step convert both the real storage and the effective mapping in the cpu entry area from character arrays to structures. 
To ensure that both arrays have the same ordering and the same size of the individual stacks fill the members with a macro. The guard size is the only difference between the two resulting structures. For now both have guard size 0 until the preparation of all usage sites is done. Provide a couple of helper macros which are used in the following conversions. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.506807893@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/mm/cpu_entry_area.c | 8 ++---- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c706415443..af8c312673de 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,51 @@ #include #include +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize) \ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[DEBUG_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0) +}; + +/* + * The effective cpu entry area mapping with guard pages. Guard size is + * zero until the code which makes assumptions about linear mappings is + * cleaned up. 
+ */ +struct cea_exception_stacks { + ESTACKS_MEMBERS(0) +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +77,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0e4cb718fc4a..24b801ea7522 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1754,7 +1754,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; + char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index c2a54f75d335..6a09b84c13fe 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 
DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -138,9 +137,8 @@ static void __init setup_cpu_entry_area(unsigned int cpu) #ifdef CONFIG_X86_64 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&cea->exception_stacks, - &per_cpu(exception_stacks, cpu), + sizeof(((struct cpu_entry_area *)0)->estacks)); + cea_map_percpu_pages(&cea->estacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); #endif percpu_setup_debug_store(cpu); -- cgit v1.2.3 From a4af767ae59cc579569bbfe49513a0037d5989ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:48 +0200 Subject: x86/cpu_entry_area: Prepare for IST guard pages To allow guard pages between the IST stacks each stack needs to be mapped individually. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.592691557@linutronix.de --- arch/x86/mm/cpu_entry_area.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 6a09b84c13fe..2b1407662a6d 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -77,6 +77,34 @@ static void __init percpu_setup_debug_store(unsigned int cpu) #endif } +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. 
+ */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB); + cea_map_stack(MCE); +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +#endif + /* Setup the fixmap mappings only once per-processor */ static void __init setup_cpu_entry_area(unsigned int cpu) { @@ -134,13 +162,8 @@ static void __init setup_cpu_entry_area(unsigned int cpu) per_cpu(cpu_entry_area, cpu) = cea; #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->estacks)); - cea_map_percpu_pages(&cea->estacks, &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); -#endif + percpu_setup_exception_stacks(cpu); + percpu_setup_debug_store(cpu); } -- cgit v1.2.3 From 7623f37e411156e6e09b95cf5c76e509c5fda640 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:49 +0200 Subject: x86/cpu_entry_area: Provide exception stack accessor Store a pointer to the per cpu entry area exception stack mappings to allow fast retrieval. Required for converting various places from using the shadow IST array to directly doing address calculations on the actual mapping address. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.680960459@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 4 ++++ arch/x86/mm/cpu_entry_area.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index af8c312673de..9b406f067ecf 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -99,6 +99,7 @@ struct cpu_entry_area { #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); @@ -118,4 +119,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } +#define __this_cpu_ist_top_va(name) \ + CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 2b1407662a6d..a00d0d059c8a 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -14,6 +14,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -92,6 +93,9 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) unsigned int npages; BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + /* * The exceptions stack mappings in the per cpu area are protected * by guard pages so each stack must be mapped 
separately. -- cgit v1.2.3 From d876b67343a648f3613506c7dbfed088fa0c875b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:50 +0200 Subject: x86/traps: Use cpu_entry_area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no longer any point in doing so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.784487230@linutronix.de --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0524e1d74f24..06c089513d39 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,6 +28,7 @@ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ +#include <asm/cpu_entry_area.h> /* exception stack */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -793,7 +794,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[ESTACK_DF]) - sizeof(void *); + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space * left.
It's plausible that we'd hit this condition but -- cgit v1.2.3 From bf5882abab773afd1277415e2f826b21de28f30d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:51 +0200 Subject: x86/irq/64: Use cpu entry area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no longer any point in doing so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.885741626@linutronix.de --- arch/x86/kernel/irq_64.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 182e8b245e06..7eb6f8d11bfd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,6 +18,8 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> + +#include <asm/cpu_entry_area.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -43,10 +45,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW #define STACK_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; + u64 irq_stack_top, irq_stack_bottom, estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); + struct cea_exception_stacks *estacks; if (user_mode(regs)) return; @@ -60,9 +61,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[ESTACK_DB]; - estack_bottom = estack_top - 
DEBUG_STKSZ + STACK_MARGIN; + estacks = __this_cpu_read(cea_exception_stacks); + estack_top = CEA_ESTACK_TOP(estacks, DB); + estack_bottom = CEA_ESTACK_BOT(estacks, DB) + STACK_MARGIN; if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; -- cgit v1.2.3 From afcd21dad88b68d646e91ed36948117d58b4c197 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:52 +0200 Subject: x86/dumpstack/64: Use cpu_entry_area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no point anymore to do so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Cc: Josh Poimboeuf Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.974900463@linutronix.de --- arch/x86/kernel/dumpstack_64.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 455b47ef9250..f6fbd0438f9e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,6 +16,7 @@ #include #include +#include #include static const char *exception_stack_names[N_EXCEPTION_STACKS] = { @@ -25,11 +26,6 @@ static const char *exception_stack_names[N_EXCEPTION_STACKS] = { [ ESTACK_MCE ] = "#MC", }; -static const unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [ESTACK_DB] = DEBUG_STKSZ -}; - const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); @@ -52,25 +48,44 @@ const char *stack_type_name(enum stack_type type) return NULL; } +struct estack_layout { + unsigned int begin; + unsigned int end; +}; + +#define ESTACK_ENTRY(x) { \ + .begin = offsetof(struct cea_exception_stacks, x## _stack), \ + .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \ + } + +static const struct estack_layout layout[N_EXCEPTION_STACKS] = { + [ ESTACK_DF ] = ESTACK_ENTRY(DF), + [ ESTACK_NMI ] = ESTACK_ENTRY(NMI), + [ ESTACK_DB ] = ESTACK_ENTRY(DB), + [ ESTACK_MCE ] = ESTACK_ENTRY(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long estacks, begin, end, stk = (unsigned long)stack; struct pt_regs *regs; - unsigned k; + unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + estacks = (unsigned long)__this_cpu_read(cea_exception_stacks); + for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); + begin = estacks + layout[k].begin; + end = estacks + layout[k].end; regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stk < begin || stk >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; -- cgit v1.2.3 From f6ef73224a0f0400c3979c8bc68b383f9d2eb9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:53 +0200 Subject: x86/cpu: Prepare TSS.IST setup for guard pages Convert the TSS.IST setup code to use the cpu entry area information directly instead of assuming a linear mapping of the IST stacks. 
The store to orig_ist[] is no longer required as there are no users anymore. This is the last preparatory step towards IST guard pages. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.061686012@linutronix.de --- arch/x86/kernel/cpu/common.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 24b801ea7522..4b01b71415f5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [ESTACK_DB] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1690,17 +1677,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. 
- * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1699,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1736,12 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == ESTACK_DB) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; -- cgit v1.2.3 From 4d68c3d0ecd5fcba8876e8a58ac41ffb360de43e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:54 +0200 Subject: x86/cpu: Remove orig_ist array All users gone. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Pingfan Liu Cc: Pu Wen Cc: Sean Christopherson Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.151435667@linutronix.de --- arch/x86/include/asm/processor.h | 9 --------- arch/x86/kernel/cpu/common.c | 6 ------ 2 files changed, 15 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12..8fcfcd1a8375 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,16 +374,7 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - union irq_stack_union { char irq_stack[IRQ_STACK_SIZE]; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b01b71415f5..8243f198fb7f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1549,12 +1549,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); -- cgit v1.2.3 From 3207426925d2b4da390be8068df1d1c2b36e5918 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:55 +0200 Subject: x86/exceptions: Disconnect IST index and stack order The entry order of the TSS.IST array and the order of the stack storage/mapping are not required to be the same. 
With the upcoming split of the debug stack this is going to fall apart as the number of TSS.IST array entries stays the same while the actual stacks are increasing. Make them separate so that code like dumpstack can just utilize the mapping order. The IST index is solely required for the actual TSS.IST array initialization. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Josh Poimboeuf Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.241588113@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/cpu_entry_area.h | 11 +++++++++++ arch/x86/include/asm/page_64_types.h | 9 ++++----- arch/x86/include/asm/stacktrace.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 +++++----- arch/x86/kernel/idt.c | 8 ++++---- 6 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fd0a50452cb3..5c0348504a4b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=ESTACK_DB +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 9b406f067ecf..310eeb62d418 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -35,6 +35,17 @@ struct cea_exception_stacks { ESTACKS_MEMBERS(0) }; +/* + * The exception stack ordering in [cea_]exception_stacks + 
*/ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + #define CEA_ESTACK_SIZE(st) \ sizeof(((struct cea_exception_stacks *)0)->st## _stack) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6ab2c54c1bf9..056de887b220 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -27,11 +27,10 @@ /* * The index for the tss.ist[] array. The hardware limit is 7 entries. */ -#define ESTACK_DF 0 -#define ESTACK_NMI 1 -#define ESTACK_DB 2 -#define ESTACK_MCE 3 -#define N_EXCEPTION_STACKS 4 +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f335aad404a4..d6d758a187b6 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include #include + +#include #include enum stack_type { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8243f198fb7f..143aceaf9a9a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1731,11 +1731,11 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); - per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB]; } t->x86_tss.io_bitmap_base = 
IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2188f734ec61..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -183,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, ESTACK_DB), - ISTG(X86_TRAP_NMI, nmi, ESTACK_NMI), - ISTG(X86_TRAP_DF, double_fault, ESTACK_DF), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, ESTACK_MCE), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; -- cgit v1.2.3 From 1bdb67e5aa2d5d43c48cb7d93393fcba276c9e71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:56 +0200 Subject: x86/exceptions: Enable IST guard pages All usage sites which expected that the exception stacks in the CPU entry area are mapped linearly are fixed up. Enable guard pages between the IST stacks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Sean Christopherson Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.349862042@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 310eeb62d418..9c96406e6d2b 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -26,13 +26,9 @@ struct exception_stacks { ESTACKS_MEMBERS(0) }; -/* - * The effective cpu entry area mapping with guard pages. Guard size is - * zero until the code which makes assumptions about linear mappings is - * cleaned up. - */ +/* The effective cpu entry area mapping with guard pages. 
*/ struct cea_exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(PAGE_SIZE) }; /* -- cgit v1.2.3 From 2a594d4ccf3f10f80b77d71bd3dad10813ac0137 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:57 +0200 Subject: x86/exceptions: Split debug IST stack The debug IST stack is actually two separate debug stacks to handle #DB recursion. This is required because the CPU starts always at top of stack on exception entry, which means on #DB recursion the second #DB would overwrite the stack of the first. The low level entry code therefore adjusts the top of stack on entry so a secondary #DB starts from a different stack page. But the stack pages are adjacent without a guard page between them. Split the debug stack into 3 stacks which are separated by guard pages. The 3rd stack is never mapped into the cpu_entry_area and is only there to catch triple #DB nesting: --- top of DB_stack <- Initial stack --- end of DB_stack guard page --- top of DB1_stack <- Top of stack after entering first #DB --- end of DB1_stack guard page --- top of DB2_stack <- Top of stack after entering second #DB --- end of DB2_stack guard page If DB2 would not act as the final guard hole, a second #DB would point the top of #DB stack to the stack below #DB1 which would be valid and not catch the not so desired triple nesting. The backing store does not allocate any memory for DB2 and its guard page as it is not g