Commit da9803df authored by Linus Torvalds

Merge tag 'x86_seves_for_v5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 SEV-ES support from Borislav Petkov:
 "SEV-ES enhances the current guest memory encryption support called SEV
  by also encrypting the guest register state, making the registers
  inaccessible to the hypervisor by en-/decrypting them on world
  switches. Thus, it adds additional protection to Linux guests against
  exfiltration, control flow and rollback attacks.

  With SEV-ES, the guest is in full control of what registers the
  hypervisor can access. This is provided by a guest-host exchange
  mechanism based on a new exception vector called VMM Communication
  Exception (#VC), a new instruction called VMGEXIT, and a shared
  Guest-Host Communication Block (GHCB), which is a decrypted page
  shared between the guest and the hypervisor.
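
  To make the flow concrete, here is a minimal user-space C model of one
  such round trip. The struct, field names and the CPUID handling are
  simplified stand-ins for the real GHCB protocol (the actual layout
  lives in the SVM headers), and vmgexit() below only mocks the real
  VMGEXIT world switch:

      #include <stdint.h>
      #include <stdio.h>

      struct ghcb {                   /* shared, decrypted page */
              uint64_t sw_exit_code;  /* which intercept raised #VC */
              uint64_t sw_exit_info_1;
              uint64_t sw_exit_info_2;
              uint64_t rax;           /* only state the guest exposes */
      };

      #define SVM_EXIT_CPUID 0x72

      static void vmgexit(struct ghcb *ghcb)  /* mock world switch */
      {
              if (ghcb->sw_exit_code == SVM_EXIT_CPUID)
                      ghcb->rax = 0x10;       /* made-up CPUID result */
      }

      int main(void)
      {
              struct ghcb ghcb = { 0 };

              /* Guest #VC handler: expose only what this exit needs. */
              ghcb.sw_exit_code = SVM_EXIT_CPUID;
              ghcb.rax = 0;                    /* CPUID leaf */
              vmgexit(&ghcb);
              printf("eax = 0x%llx\n", (unsigned long long)ghcb.rax);
              return 0;
      }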

  Intercepts to the hypervisor become #VC exceptions in an SEV-ES guest,
  so for that exception mechanism to work, the early x86 init code had
  to be made capable of handling exceptions. That, in itself, brings a
  bunch of very nice cleanups and improvements to the early boot code,
  like an early page fault handler which allows on-demand building of
  the identity mapping. With that, !KASLR configurations no longer use
  the EFI page table but switch to a kernel-controlled one.
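
  The idea behind the on-demand mapping can be modeled in a few lines of
  C: on a page fault, round the faulting address down to a 2M boundary
  and identity-map that region. map_range() here is a hypothetical
  stand-in for kernel_ident_mapping_init(); the real handler is
  do_boot_page_fault() in the diff below:

      #include <stdint.h>
      #include <stdio.h>

      #define PMD_SIZE (2UL << 20)            /* 2M mapping granularity */
      #define PMD_MASK (~(PMD_SIZE - 1))

      static void map_range(uint64_t start, uint64_t end)
      {
              printf("identity-mapping 0x%llx-0x%llx\n",
                     (unsigned long long)start, (unsigned long long)end);
      }

      /* What the early #PF handler does for a sane fault address. */
      static void boot_page_fault(uint64_t cr2)
      {
              uint64_t start = cr2 & PMD_MASK;

              map_range(start, start + PMD_SIZE);
      }

      int main(void)
      {
              boot_page_fault(0x123456789UL); /* maps 0x123400000-0x123600000 */
              return 0;
      }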

  The main part of this series adds the support for that new exchange
  mechanism. The goal has been to keep this as separate as possible from
  the core x86 code by concentrating the machinery in two
  SEV-ES-specific files:

    arch/x86/kernel/sev-es-shared.c
    arch/x86/kernel/sev-es.c

  Other interaction with core x86 code has been kept to a minimum and
  behind static keys to minimize the performance impact on !SEV-ES
  setups.
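
  The static-key pattern referred to here looks roughly like the sketch
  below. This is kernel-only API; sev_es_nmi_complete() is one of the
  hooks this series adds, but the surrounding code is a simplified
  illustration, not the actual implementation:

      #include <linux/init.h>
      #include <linux/jump_label.h>

      DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);

      void sev_es_nmi_complete(void);         /* provided by sev-es.c */

      /* Flipped once at boot when running as an SEV-ES guest. */
      void __init sev_es_setup(void)
      {
              static_branch_enable(&sev_es_enable_key);
      }

      static void nmi_exit_hook(void)
      {
              /*
               * On !SEV-ES systems this compiles down to a NOP that is
               * patched into a jump only when the key is enabled, so
               * the hot path pays (almost) nothing.
               */
              if (static_branch_unlikely(&sev_es_enable_key))
                      sev_es_nmi_complete();
      }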

  Work by Joerg Roedel and Thomas Lendacky and others"

* tag 'x86_seves_for_v5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (73 commits)
  x86/sev-es: Use GHCB accessor for setting the MMIO scratch buffer
  x86/sev-es: Check required CPU features for SEV-ES
  x86/efi: Add GHCB mappings when SEV-ES is active
  x86/sev-es: Handle NMI State
  x86/sev-es: Support CPU offline/online
  x86/head/64: Don't call verify_cpu() on starting APs
  x86/smpboot: Load TSS and getcpu GDT entry before loading IDT
  x86/realmode: Setup AP jump table
  x86/realmode: Add SEV-ES specific trampoline entry point
  x86/vmware: Add VMware-specific handling for VMMCALL under SEV-ES
  x86/kvm: Add KVM-specific VMMCALL handling under SEV-ES
  x86/paravirt: Allow hypervisor-specific VMMCALL handling under SEV-ES
  x86/sev-es: Handle #DB Events
  x86/sev-es: Handle #AC Events
  x86/sev-es: Handle VMMCALL Events
  x86/sev-es: Handle MWAIT/MWAITX Events
  x86/sev-es: Handle MONITOR/MONITORX Events
  x86/sev-es: Handle INVD Events
  x86/sev-es: Handle RDPMC Events
  x86/sev-es: Handle RDTSC(P) Events
  ...
parents 6873139e 0ddfb1cf
arch/x86/Kconfig  +1 −0
@@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT
	select DYNAMIC_PHYSICAL_MASK
	select ARCH_USE_MEMREMAP_PROT
	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
	select INSTRUCTION_DECODER
	help
	  Say yes to enable support for the encryption of system memory.
	  This requires an AMD processor that supports Secure Memory
arch/x86/boot/compressed/Makefile  +9 −2
@@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
-cflags-$(CONFIG_X86_64) := -mcmodel=small
cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding
@@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h

# sev-es.c indirectly includes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/

KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
UBSAN_SANITIZE :=n
@@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
-	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
	vmlinux-objs-y += $(obj)/ident_map_64.o
	vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
	vmlinux-objs-y += $(obj)/mem_encrypt.o
	vmlinux-objs-y += $(obj)/pgtable_64.o
	vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o
endif

vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
arch/x86/boot/compressed/cpuflags.c  +0 −4
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_RANDOMIZE_BASE

#include "../cpuflags.c"

bool has_cpuflag(int flag)
@@ -9,5 +7,3 @@ bool has_cpuflag(int flag)

	return test_bit(flag, cpu.flags);
}

-#endif
arch/x86/boot/compressed/head_64.S  +32 −1
@@ -33,6 +33,7 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
#include "pgtable.h"

/*
@@ -415,6 +416,10 @@ SYM_CODE_START(startup_64)

.Lon_kernel_cs:

	pushq	%rsi
	call	load_stage1_idt
	popq	%rsi

	/*
	 * paging_prepare() sets up the trampoline and checks if we need to
	 * enable 5-level paging.
@@ -527,6 +532,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
	shrq	$3, %rcx
	rep	stosq

/*
 * If running as an SEV guest, the encryption mask is required in the
 * page-table setup code below. When the guest also has SEV-ES enabled
 * set_sev_encryption_mask() will cause #VC exceptions, but the stage2
 * handler can't map its GHCB because the page-table is not set up yet.
 * So set up the encryption mask here while still on the stage1 #VC
 * handler. Then load stage2 IDT and switch to the kernel's own
 * page-table.
 */
	pushq	%rsi
	call	set_sev_encryption_mask
	call	load_stage2_idt
	call	initialize_identity_maps
	popq	%rsi

/*
 * Do the extraction, and jump to the new kernel..
 */
@@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt)
	.quad   0x0000000000000000	/* TS continued */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)

SYM_DATA_START(boot_idt_desc)
	.word	boot_idt_end - boot_idt - 1
	.quad	0
SYM_DATA_END(boot_idt_desc)
	.balign 8
SYM_DATA_START(boot_idt)
	.rept	BOOT_IDT_ENTRIES
	.quad	0
	.quad	0
	.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)

#ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0)
#endif

#ifdef CONFIG_EFI_MIXED
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1)
arch/x86/boot/compressed/kaslr_64.c → arch/x86/boot/compressed/ident_map_64.c  +349 −0
@@ -19,16 +19,29 @@
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION

#include "error.h"
#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"

#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift = 39;
unsigned int ptrs_per_p4d = 1;
#endif

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Used to track our page table allocation area. */
struct alloc_pgt_data {
	unsigned char *pgt_buf;
@@ -74,12 +87,28 @@ phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
 */
static struct x86_mapping_info mapping_info;

/*
 * Adds the specified range to the identity mappings.
 */
static void add_identity_map(unsigned long start, unsigned long end)
{
	int ret;

	/* Align boundary to 2M. */
	start = round_down(start, PMD_SIZE);
	end = round_up(end, PMD_SIZE);
	if (start >= end)
		return;

	/* Build the mapping. */
	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
	if (ret)
		error("Error: kernel_ident_mapping_init() failed\n");
}

/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void)
{
	/* If running as an SEV guest, the encryption mask is required. */
	set_sev_encryption_mask();

	/* Exclude the encryption mask from __PHYSICAL_MASK */
	physical_mask &= ~sme_me_mask;

@@ -109,37 +138,22 @@ void initialize_identity_maps(void)
	 */
	top_level_pgt = read_cr3_pa();
	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
		debug_putstr("booted via startup_32()\n");
		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
	} else {
		debug_putstr("booted via startup_64()\n");
		pgt_data.pgt_buf = _pgtable;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
	}
-}
-
-/*
- * Adds the specified range to what will become the new identity mappings.
- * Once all ranges have been added, the new mapping is activated by calling
- * finalize_identity_maps() below.
- */
-void add_identity_map(unsigned long start, unsigned long size)
-{
-	unsigned long end = start + size;
-
-	/* Align boundary to 2M. */
-	start = round_down(start, PMD_SIZE);
-	end = round_up(end, PMD_SIZE);
-	if (start >= end)
-		return;
-
-	/* Build the mapping. */
-	kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
-				  start, end);
-}

	/*
	 * New page-table is set up - map the kernel image and load it
	 * into cr3.
	 */
	add_identity_map((unsigned long)_head, (unsigned long)_end);
	write_cr3(top_level_pgt);
}

/*
@@ -151,3 +165,185 @@ void finalize_identity_maps(void)
{
	write_cr3(top_level_pgt);
}

static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	unsigned long page_flags;
	unsigned long address;
	pte_t *pte;
	pmd_t pmd;
	int i;

	pte = (pte_t *)info->alloc_pgt_page(info->context);
	if (!pte)
		return NULL;

	address     = __address & PMD_MASK;
	/* No large page - clear PSE flag */
	page_flags  = info->page_flag & ~_PAGE_PSE;

	/* Populate the PTEs */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		set_pte(&pte[i], __pte(address | page_flags));
		address += PAGE_SIZE;
	}

	/*
	 * Ideally we need to clear the large PMD first and do a TLB
	 * flush before we write the new PMD. But the 2M range of the
	 * PMD might contain the code we execute and/or the stack
	 * we are on, so we can't do that. But that should be safe here
	 * because we are going from large to small mappings and we are
	 * also the only user of the page-table, so there is no chance
	 * of a TLB multihit.
	 */
	pmd = __pmd((unsigned long)pte | info->kernpg_flag);
	set_pmd(pmdp, pmd);
	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return pte + pte_index(__address);
}

static void clflush_page(unsigned long address)
{
	unsigned int flush_size;
	char *cl, *start, *end;

	/*
	 * Hardcode cl-size to 64 - CPUID can't be used here because that might
	 * cause another #VC exception and the GHCB is not ready to use yet.
	 */
	flush_size = 64;
	start      = (char *)(address & PAGE_MASK);
	end        = start + PAGE_SIZE;

	/*
	 * First make sure there are no pending writes on the cache-lines to
	 * flush.
	 */
	asm volatile("mfence" : : : "memory");

	for (cl = start; cl != end; cl += flush_size)
		clflush(cl);
}

static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *pgdp = (pgd_t *)top_level_pgt;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep, pte;

	/*
	 * First make sure there is a PMD mapping for 'address'.
	 * It should already exist, but keep things generic.
	 *
	 * To map the page just read from it and fault it in if there is no
	 * mapping yet. add_identity_map() can't be called here because that
	 * would unconditionally map the address on PMD level, destroying any
	 * PTE-level mappings that might already exist. Use assembly here so
	 * the access won't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is mapped at least with PMD size - so skip checks and walk
	 * directly to the PMD.
	 */
	p4dp = p4d_offset(pgdp, address);
	pudp = pud_offset(p4dp, address);
	pmdp = pmd_offset(pudp, address);

	if (pmd_large(*pmdp))
		ptep = split_large_pmd(info, pmdp, address);
	else
		ptep = pte_offset_kernel(pmdp, address);

	if (!ptep)
		return -ENOMEM;

	/*
	 * Changing encryption attributes of a page requires flushing it from
	 * the caches.
	 */
	if ((set | clr) & _PAGE_ENC)
		clflush_page(address);

	/* Update PTE */
	pte = *ptep;
	pte = pte_set_flags(pte, set);
	pte = pte_clear_flags(pte, clr);
	set_pte(ptep, pte);

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}

int set_page_decrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}

int set_page_encrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}

int set_page_non_present(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}

static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);
	error_putstr("\nCR2: 0x");
	error_puthex(address);
	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip - (unsigned long)_head);
	error_putstr("\n");

	error("Stopping.\n");
}

void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = native_read_cr2();
	unsigned long end;
	bool ghcb_fault;

	ghcb_fault = sev_es_check_ghcb_fault(address);

	address   &= PMD_MASK;
	end        = address + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	add_identity_map(address, end);
}