Commit 5c4a1c09 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "These are the fixes left over from the v5.4 cycle:

   - Various low level 32-bit entry code fixes and improvements by Andy
     Lutomirski, Peter Zijlstra and Thomas Gleixner.

   - Fix 32-bit Xen PV breakage, by Jan Beulich"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/32: Fix FIXUP_ESPFIX_STACK with user CR3
  x86/pti/32: Calculate the various PTI cpu_entry_area sizes correctly, make the CPU_ENTRY_AREA_PAGES assert precise
  selftests/x86/sigreturn/32: Invalidate DS and ES when abusing the kernel
  selftests/x86/mov_ss_trap: Fix the SYSENTER test
  x86/entry/32: Fix NMI vs ESPFIX
  x86/entry/32: Unwind the ESPFIX stack earlier on exception entry
  x86/entry/32: Move FIXUP_FRAME after pushing %fs in SAVE_ALL
  x86/entry/32: Use %ss segment where required
  x86/entry/32: Fix IRET exception
  x86/cpu_entry_area: Add guard page for entry stack on 32bit
  x86/pti/32: Size initial_page_table correctly
  x86/doublefault/32: Fix stack canaries in the double fault handler
  x86/xen/32: Simplify ring check in xen_iret_crit_fixup()
  x86/xen/32: Make xen_iret_crit_fixup() independent of frame layout
  x86/stackframe/32: Repair 32-bit Xen PV
parents 53a07a14 4a13b0e3
Loading
Loading
Loading
Loading
+133 −78
Original line number Diff line number Diff line
@@ -172,7 +172,7 @@
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	.if \no_user_check == 0
	/* coming from usermode? */
	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
	testl	$USER_SEGMENT_RPL_MASK, PT_CS(%esp)
	jz	.Lend_\@
	.endif
	/* On user-cr3? */
@@ -205,64 +205,76 @@
#define CS_FROM_ENTRY_STACK	(1 << 31)
#define CS_FROM_USER_CR3	(1 << 30)
#define CS_FROM_KERNEL		(1 << 29)
#define CS_FROM_ESPFIX		(1 << 28)

.macro FIXUP_FRAME
	/*
	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
	 * Clear them in case hardware didn't do this for us.
	 */
	andl	$0x0000ffff, 3*4(%esp)
	andl	$0x0000ffff, 4*4(%esp)

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, 4*4(%esp)
	testl	$X86_EFLAGS_VM, 5*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@
#endif
	testl	$SEGMENT_RPL_MASK, 3*4(%esp)
	testl	$USER_SEGMENT_RPL_MASK, 4*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@

	orl	$CS_FROM_KERNEL, 3*4(%esp)
	orl	$CS_FROM_KERNEL, 4*4(%esp)

	/*
	 * When we're here from kernel mode; the (exception) stack looks like:
	 *
	 *  5*4(%esp) - <previous context>
	 *  4*4(%esp) - flags
	 *  3*4(%esp) - cs
	 *  2*4(%esp) - ip
	 *  1*4(%esp) - orig_eax
	 *  0*4(%esp) - gs / function
	 *  6*4(%esp) - <previous context>
	 *  5*4(%esp) - flags
	 *  4*4(%esp) - cs
	 *  3*4(%esp) - ip
	 *  2*4(%esp) - orig_eax
	 *  1*4(%esp) - gs / function
	 *  0*4(%esp) - fs
	 *
	 * Lets build a 5 entry IRET frame after that, such that struct pt_regs
	 * is complete and in particular regs->sp is correct. This gives us
	 * the original 5 enties as gap:
	 * the original 6 enties as gap:
	 *
	 * 12*4(%esp) - <previous context>
	 * 11*4(%esp) - gap / flags
	 * 10*4(%esp) - gap / cs
	 *  9*4(%esp) - gap / ip
	 *  8*4(%esp) - gap / orig_eax
	 *  7*4(%esp) - gap / gs / function
	 *  6*4(%esp) - ss
	 *  5*4(%esp) - sp
	 *  4*4(%esp) - flags
	 *  3*4(%esp) - cs
	 *  2*4(%esp) - ip
	 *  1*4(%esp) - orig_eax
	 *  0*4(%esp) - gs / function
	 * 14*4(%esp) - <previous context>
	 * 13*4(%esp) - gap / flags
	 * 12*4(%esp) - gap / cs
	 * 11*4(%esp) - gap / ip
	 * 10*4(%esp) - gap / orig_eax
	 *  9*4(%esp) - gap / gs / function
	 *  8*4(%esp) - gap / fs
	 *  7*4(%esp) - ss
	 *  6*4(%esp) - sp
	 *  5*4(%esp) - flags
	 *  4*4(%esp) - cs
	 *  3*4(%esp) - ip
	 *  2*4(%esp) - orig_eax
	 *  1*4(%esp) - gs / function
	 *  0*4(%esp) - fs
	 */

	pushl	%ss		# ss
	pushl	%esp		# sp (points at ss)
	addl	$6*4, (%esp)	# point sp back at the previous context
	pushl	6*4(%esp)	# flags
	pushl	6*4(%esp)	# cs
	pushl	6*4(%esp)	# ip
	pushl	6*4(%esp)	# orig_eax
	pushl	6*4(%esp)	# gs / function
	addl	$7*4, (%esp)	# point sp back at the previous context
	pushl	7*4(%esp)	# flags
	pushl	7*4(%esp)	# cs
	pushl	7*4(%esp)	# ip
	pushl	7*4(%esp)	# orig_eax
	pushl	7*4(%esp)	# gs / function
	pushl	7*4(%esp)	# fs
.Lfrom_usermode_no_fixup_\@:
.endm

.macro IRET_FRAME
	/*
	 * We're called with %ds, %es, %fs, and %gs from the interrupted
	 * frame, so we shouldn't use them.  Also, we may be in ESPFIX
	 * mode and therefore have a nonzero SS base and an offset ESP,
	 * so any attempt to access the stack needs to use SS.  (except for
	 * accesses through %esp, which automatically use SS.)
	 */
	testl $CS_FROM_KERNEL, 1*4(%esp)
	jz .Lfinished_frame_\@

@@ -276,31 +288,40 @@
	movl	5*4(%esp), %eax		# (modified) regs->sp

	movl	4*4(%esp), %ecx		# flags
	movl	%ecx, -4(%eax)
	movl	%ecx, %ss:-1*4(%eax)

	movl	3*4(%esp), %ecx		# cs
	andl	$0x0000ffff, %ecx
	movl	%ecx, -8(%eax)
	movl	%ecx, %ss:-2*4(%eax)

	movl	2*4(%esp), %ecx		# ip
	movl	%ecx, -12(%eax)
	movl	%ecx, %ss:-3*4(%eax)

	movl	1*4(%esp), %ecx		# eax
	movl	%ecx, -16(%eax)
	movl	%ecx, %ss:-4*4(%eax)

	popl	%ecx
	lea	-16(%eax), %esp
	lea	-4*4(%eax), %esp
	popl	%eax
.Lfinished_frame_\@:
.endm

.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
	cld
.if \skip_gs == 0
	PUSH_GS
.endif
	FIXUP_FRAME
	pushl	%fs

	pushl	%eax
	movl	$(__KERNEL_PERCPU), %eax
	movl	%eax, %fs
.if \unwind_espfix > 0
	UNWIND_ESPFIX_STACK
.endif
	popl	%eax

	FIXUP_FRAME
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
@@ -313,8 +334,6 @@
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	movl	$(__KERNEL_PERCPU), %edx
	movl	%edx, %fs
.if \skip_gs == 0
	SET_KERNEL_GS %edx
.endif
@@ -324,8 +343,8 @@
.endif
.endm

.macro SAVE_ALL_NMI cr3_reg:req
	SAVE_ALL
.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
	SAVE_ALL unwind_espfix=\unwind_espfix

	BUG_IF_WRONG_CR3

@@ -357,6 +376,7 @@
2:	popl	%es
3:	popl	%fs
	POP_GS \pop
	IRET_FRAME
.pushsection .fixup, "ax"
4:	movl	$0, (%esp)
	jmp	1b
@@ -395,7 +415,8 @@

.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET

	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX

@@ -1075,7 +1096,6 @@ restore_all:
	/* Restore user state */
	RESTORE_REGS pop=4			# skip orig_eax/error_code
.Lirq_return:
	IRET_FRAME
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
	 * when returning from IPI handler and when returning from
@@ -1128,30 +1148,43 @@ ENDPROC(entry_INT80_32)
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and swiches to the
 * normal stack and adjusts ESP with the matching offset.
 *
 * We might be on user CR3 here, so percpu data is not mapped and we can't
 * access the GDT through the percpu segment.  Instead, use SGDT to find
 * the cpu_entry_area alias of the GDT.
 */
#ifdef CONFIG_X86_ESPFIX32
	/* fixup the stack */
	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
	pushl	%ecx
	subl	$2*4, %esp
	sgdt	(%esp)
	movl	2(%esp), %ecx				/* GDT address */
	/*
	 * Careful: ECX is a linear pointer, so we need to force base
	 * zero.  %cs is the only known-linear segment we have right now.
	 */
	mov	%cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al	/* bits 16..23 */
	mov	%cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah	/* bits 24..31 */
	shl	$16, %eax
	addl	$2*4, %esp
	popl	%ecx
	addl	%esp, %eax			/* the adjusted stack pointer */
	pushl	$__KERNEL_DS
	pushl	%eax
	lss	(%esp), %esp			/* switch to the normal stack segment */
#endif
.endm

.macro UNWIND_ESPFIX_STACK
	/* It's safe to clobber %eax, all other regs need to be preserved */
#ifdef CONFIG_X86_ESPFIX32
	movl	%ss, %eax
	/* see if on espfix stack */
	cmpw	$__ESPFIX_SS, %ax
	jne	27f
	movl	$__KERNEL_DS, %eax
	movl	%eax, %ds
	movl	%eax, %es
	jne	.Lno_fixup_\@
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
27:
.Lno_fixup_\@:
#endif
.endm

@@ -1341,11 +1374,6 @@ END(spurious_interrupt_bug)

#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF

	/*
	 * Check to see if we got the event in the critical
	 * region in xen_iret_direct, after we've reenabled
@@ -1353,16 +1381,17 @@ ENTRY(xen_hypervisor_callback)
	 * iret instruction's behaviour where it delivers a
	 * pending interrupt when enabling interrupts:
	 */
	movl	PT_EIP(%esp), %eax
	cmpl	$xen_iret_start_crit, %eax
	cmpl	$xen_iret_start_crit, (%esp)
	jb	1f
	cmpl	$xen_iret_end_crit, %eax
	cmpl	$xen_iret_end_crit, (%esp)
	jae	1f

	jmp	xen_iret_crit_fixup

ENTRY(xen_do_upcall)
1:	mov	%esp, %eax
	call	xen_iret_crit_fixup
1:
	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF
	mov	%esp, %eax
	call	xen_evtchn_do_upcall
#ifndef CONFIG_PREEMPTION
	call	xen_maybe_preempt_hcall
@@ -1449,10 +1478,9 @@ END(page_fault)

common_exception_read_cr2:
	/* the function address is in %gs's slot on the stack */
	SAVE_ALL switch_stacks=1 skip_gs=1
	SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1

	ENCODE_FRAME_POINTER
	UNWIND_ESPFIX_STACK

	/* fixup %gs */
	GS_TO_REG %ecx
@@ -1474,9 +1502,8 @@ END(common_exception_read_cr2)

common_exception:
	/* the function address is in %gs's slot on the stack */
	SAVE_ALL switch_stacks=1 skip_gs=1
	SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
	ENCODE_FRAME_POINTER
	UNWIND_ESPFIX_STACK

	/* fixup %gs */
	GS_TO_REG %ecx
@@ -1515,6 +1542,10 @@ ENTRY(nmi)
	ASM_CLAC

#ifdef CONFIG_X86_ESPFIX32
	/*
	 * ESPFIX_SS is only ever set on the return to user path
	 * after we've switched to the entry stack.
	 */
	pushl	%eax
	movl	%ss, %eax
	cmpw	$__ESPFIX_SS, %ax
@@ -1550,6 +1581,11 @@ ENTRY(nmi)
	movl	%ebx, %esp

.Lnmi_return:
#ifdef CONFIG_X86_ESPFIX32
	testl	$CS_FROM_ESPFIX, PT_CS(%esp)
	jnz	.Lnmi_from_espfix
#endif

	CHECK_AND_APPLY_ESPFIX
	RESTORE_ALL_NMI cr3_reg=%edi pop=4
	jmp	.Lirq_return
@@ -1557,23 +1593,42 @@ ENTRY(nmi)
#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
	/*
	 * create the pointer to lss back
	 * Create the pointer to LSS back
	 */
	pushl	%ss
	pushl	%esp
	addl	$4, (%esp)
	/* copy the iret frame of 12 bytes */
	.rept 3
	pushl	16(%esp)
	.endr
	pushl	%eax
	SAVE_ALL_NMI cr3_reg=%edi

	/* Copy the (short) IRET frame */
	pushl	4*4(%esp)	# flags
	pushl	4*4(%esp)	# cs
	pushl	4*4(%esp)	# ip

	pushl	%eax		# orig_ax

	SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
	ENCODE_FRAME_POINTER
	FIXUP_ESPFIX_STACK			# %eax == %esp

	/* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
	xorl	$(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)

	xorl	%edx, %edx			# zero error code
	call	do_nmi
	movl	%esp, %eax			# pt_regs pointer
	jmp	.Lnmi_from_sysenter_stack

.Lnmi_from_espfix:
	RESTORE_ALL_NMI cr3_reg=%edi
	lss	12+4(%esp), %esp		# back to espfix stack
	/*
	 * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
	 * fix up the gap and long frame:
	 *
	 *  3 - original frame	(exception)
	 *  2 - ESPFIX block	(above)
	 *  6 - gap		(FIXUP_FRAME)
	 *  5 - long frame	(FIXUP_FRAME)
	 *  1 - orig_ax
	 */
	lss	(1+5+6)*4(%esp), %esp			# back to espfix stack
	jmp	.Lirq_return
#endif
END(nmi)
+12 −6
Original line number Diff line number Diff line
@@ -78,8 +78,12 @@ struct cpu_entry_area {

	/*
	 * The GDT is just below entry_stack and thus serves (on x86_64) as
	 * a a read-only guard page.
	 * a read-only guard page. On 32-bit the GDT must be writeable, so
	 * it needs an extra guard page.
	 */
#ifdef CONFIG_X86_32
	char guard_entry_stack[PAGE_SIZE];
#endif
	struct entry_stack_page entry_stack_page;

	/*
@@ -94,7 +98,6 @@ struct cpu_entry_area {
	 */
	struct cea_exception_stacks estacks;
#endif
#ifdef CONFIG_CPU_SUP_INTEL
	/*
	 * Per CPU debug store for Intel performance monitoring. Wastes a
	 * full page at the moment.
@@ -105,11 +108,13 @@ struct cpu_entry_area {
	 * Reserve enough fixmap PTEs.
	 */
	struct debug_store_buffers cpu_debug_buffers;
#endif
};

#define CPU_ENTRY_AREA_SIZE		(sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)
#define CPU_ENTRY_AREA_ARRAY_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)

/* Total size includes the readonly IDT mapping page as well: */
#define CPU_ENTRY_AREA_TOTAL_SIZE	(CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)

DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
@@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);

/* Single page reserved for the readonly IDT mapping: */
#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE
#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)

#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT)

#define CPU_ENTRY_AREA_MAP_SIZE			\
	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)

extern struct cpu_entry_area *get_cpu_entry_area(int cpu);

+4 −4
Original line number Diff line number Diff line
@@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
 * to avoid include recursion hell
 */
#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40)
#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 39)

/* The +1 is for the readonly IDT page: */
#define CPU_ENTRY_AREA_BASE	\
	((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1))   \
	 & PMD_MASK)
	((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)

#define LDT_BASE_ADDR		\
	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
+12 −0
Original line number Diff line number Diff line
@@ -31,6 +31,18 @@
 */
#define SEGMENT_RPL_MASK	0x3

/*
 * When running on Xen PV, the actual privilege level of the kernel is 1,
 * not 0. Testing the Requested Privilege Level in a segment selector to
 * determine whether the context is user mode or kernel mode with
 * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level
 * matches the 0x3 mask.
 *
 * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV
 * kernels because privilege level 2 is never used.
 */
#define USER_SEGMENT_RPL_MASK	0x2

/* User mode is privilege level 3: */
#define USER_RPL		0x3

+3 −0
Original line number Diff line number Diff line
@@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
	.ss		= __KERNEL_DS,
	.ds		= __USER_DS,
	.fs		= __KERNEL_PERCPU,
#ifndef CONFIG_X86_32_LAZY_GS
	.gs		= __KERNEL_STACK_CANARY,
#endif

	.__cr3		= __pa_nodebug(swapper_pg_dir),
};
Loading