Commit fffbb5dc authored by Denys Vlasenko's avatar Denys Vlasenko Committed by Ingo Molnar
Browse files

x86/asm/entry/64: Move opportunistic sysret code to syscall code path



This change does two things:

Copy-pastes "retint_swapgs:" code into syscall handling code,
the copy is under "syscall_return:" label. The code is unchanged
apart from some label renames.

Removes "opportunistic sysret" code from "retint_swapgs:" code
block, since now it won't be reached by syscall return. This in
fact removes most of the code in question.

   text	   data	    bss	    dec	    hex	filename
  12530	      0	      0	  12530	   30f2	entry_64.o.before
  12562	      0	      0	  12562	   3112	entry_64.o

Run-tested.

Acked-and-Tested-by: default avatarBorislav Petkov <bp@suse.de>
Signed-off-by: default avatarDenys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com


Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 4bcc7827
Loading
Loading
Loading
Loading
+86 −72
Original line number Diff line number Diff line
@@ -355,7 +355,7 @@ GLOBAL(int_with_check)
	andl %edi,%edx
	jnz   int_careful
	andl	$~TS_COMPAT,TI_status(%rcx)
	jmp   retint_swapgs
	jmp	syscall_return

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

syscall_return:
	/* The IRETQ could re-enable interrupts: */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq RCX(%rsp),%rcx
	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
	jne opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.  It's not worth
	 * testing for canonicalness exactly -- this check detects any
	 * of the 17 high bits set, which is true for non-canonical
	 * or kernel addresses.  (This will pessimize vsyscall=native.
	 * Big deal.)
	 *
	 * If virtual addresses ever become wider, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif
	shr $__VIRTUAL_MASK_SHIFT, %rcx
	jnz opportunistic_sysret_failed

	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
	jne opportunistic_sysret_failed

	movq R11(%rsp),%r11
	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.  This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions.  For example, single-stepping this user code:
	 *
	 *           movq $stuck_here,%rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
	jne opportunistic_sysret_failed

	/*
	 * We win!  This label is here just for ease of understanding
	 * perf profiles.  Nothing jumps here.
	 */
syscall_return_via_sysret:
	CFI_REMEMBER_STATE
	/* r11 is already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_R11
	movq RSP(%rsp),%rsp
	USERGS_SYSRET64
	CFI_RESTORE_STATE

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
	CFI_ENDPROC
END(system_call)


	.macro FORK_LIKE func
ENTRY(stub_\func)
	CFI_STARTPROC
@@ -673,76 +750,8 @@ retint_swapgs: /* return to user-space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq RCX(%rsp),%rcx
	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
	jne opportunistic_sysret_failed

	/*
	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.  It's not worth
	 * testing for canonicalness exactly -- this check detects any
	 * of the 17 high bits set, which is true for non-canonical
	 * or kernel addresses.  (This will pessimize vsyscall=native.
	 * Big deal.)
	 *
	 * If virtual addresses ever become wider, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- sysret checks need update"
	.endif
	shr $__VIRTUAL_MASK_SHIFT, %rcx
	jnz opportunistic_sysret_failed

	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
	jne opportunistic_sysret_failed

	movq R11(%rsp),%r11
	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.  This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions.  For example, single-stepping this user code:
	 *
	 *           movq $stuck_here,%rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
	jne opportunistic_sysret_failed

	/*
	 * We win!  This label is here just for ease of understanding
	 * perf profiles.  Nothing jumps here.
	 */
irq_return_via_sysret:
	CFI_REMEMBER_STATE
	/* r11 is already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_R11
	movq RSP(%rsp),%rsp
	USERGS_SYSRET64
	CFI_RESTORE_STATE

opportunistic_sysret_failed:
	SWAPGS
	jmp restore_args
	jmp	restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exception and from syscalls, merge.
 */
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8