Commit 8c5bd25b authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull kvm fixes from Paolo Bonzini:
 "Fix unwinding of KVM_CREATE_VM failure, VT-d posted interrupts,
  DAX/ZONE_DEVICE, and module unload/reload"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved
  KVM: VMX: Introduce pi_is_pir_empty() helper
  KVM: VMX: Do not change PID.NDST when loading a blocked vCPU
  KVM: VMX: Consider PID.PIR to determine if vCPU has pending interrupts
  KVM: VMX: Fix comment to specify PID.ON instead of PIR.ON
  KVM: X86: Fix initialization of MSR lists
  KVM: fix placement of refcount initialization
  KVM: Fix NULL-ptr deref after kvm_create_vm fails
parents eb094f06 a78986aa
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -3393,7 +3393,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
	 * here.
	 */
	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
	    level == PT_PAGE_TABLE_LEVEL &&
	    !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
	    PageTransCompoundMap(pfn_to_page(pfn)) &&
	    !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
		unsigned long mask;
@@ -6009,8 +6009,8 @@ restart:
		 * the guest, and the guest page table is using 4K page size
		 * mapping if the indirect sp has level = 1.
		 */
		if (sp->role.direct &&
			!kvm_is_reserved_pfn(pfn) &&
		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
		    !kvm_is_zone_device_pfn(pfn) &&
		    PageTransCompoundMap(pfn_to_page(pfn))) {
			pte_list_remove(rmap_head, sptep);

+20 −3
Original line number Diff line number Diff line
@@ -1268,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
		return;

	/*
	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
	 * correctly.
	 */
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
		pi_clear_sn(pi_desc);
		goto after_clear_sn;
	}

	/* The full case.  */
	do {
		old.control = new.control = pi_desc->control;
@@ -1283,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

after_clear_sn:

	/*
	 * Clear SN before reading the bitmap.  The VT-d firmware
	 * writes the bitmap and reads SN atomically (5.2.3 in the
@@ -1291,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
	 */
	smp_mb__after_atomic();

	if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
	if (!pi_is_pir_empty(pi_desc))
		pi_set_on(pi_desc);
}

@@ -6137,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
	if (pi_test_on(&vmx->pi_desc)) {
		pi_clear_on(&vmx->pi_desc);
		/*
		 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
@@ -6167,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)

static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	return pi_test_on(vcpu_to_pi_desc(vcpu));
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	return pi_test_on(pi_desc) ||
		(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}

static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+11 −0
Original line number Diff line number Diff line
@@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
	return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	set_bit(POSTED_INTR_SN,
@@ -373,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
		(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_SN,
		(unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
+26 −30
Original line number Diff line number Diff line
@@ -1133,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
 * extract the supported MSRs from the related const lists.
 * msrs_to_save is selected from the msrs_to_save_all to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 * may depend on host virtualization features rather than host cpu features.
 */

static u32 msrs_to_save[] = {
static const u32 msrs_to_save_all[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1180,9 +1182,10 @@ static u32 msrs_to_save[] = {
	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
};

static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
static const u32 emulated_msrs_all[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1221,7 +1224,7 @@ static u32 emulated_msrs[] = {
	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
	 * We always support the "true" VMX control MSRs, even if the host
	 * processor does not, so I am putting these registers here rather
	 * than in msrs_to_save.
	 * than in msrs_to_save_all.
	 */
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1240,13 +1243,14 @@ static u32 emulated_msrs[] = {
	MSR_KVM_POLL_CONTROL,
};

static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;

/*
 * List of msr numbers which are used to expose MSR-based features that
 * can be used by a hypervisor to validate requested CPU features.
 */
static u32 msr_based_features[] = {
static const u32 msr_based_features_all[] = {
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_PINBASED_CTLS,
@@ -1271,6 +1275,7 @@ static u32 msr_based_features[] = {
	MSR_IA32_ARCH_CAPABILITIES,
};

static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;

static u64 kvm_get_arch_capabilities(void)
@@ -5118,22 +5123,22 @@ static void kvm_init_msr_list(void)
{
	struct x86_pmu_capability x86_pmu;
	u32 dummy[2];
	unsigned i, j;
	unsigned i;

	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
			 "Please update the fixed PMCs in msrs_to_save[]");
			 "Please update the fixed PMCs in msrs_to_saved_all[]");

	perf_get_x86_pmu_capability(&x86_pmu);

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
			continue;

		/*
		 * Even MSRs that are valid in the host may not be exposed
		 * to the guests in some cases.
		 */
		switch (msrs_to_save[i]) {
		switch (msrs_to_save_all[i]) {
		case MSR_IA32_BNDCFGS:
			if (!kvm_mpx_supported())
				continue;
@@ -5161,17 +5166,17 @@ static void kvm_init_msr_list(void)
			break;
		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
			if (!kvm_x86_ops->pt_supported() ||
				msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
				continue;
			break;
		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
			if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
				continue;
			break;
		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
			if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
				continue;
		}
@@ -5179,34 +5184,25 @@ static void kvm_init_msr_list(void)
			break;
		}

		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
	}
	num_msrs_to_save = j;

	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
			continue;

		if (j < i)
			emulated_msrs[j] = emulated_msrs[i];
		j++;
		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
	}
	num_emulated_msrs = j;

	for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
		struct kvm_msr_entry msr;

		msr.index = msr_based_features[i];
		msr.index = msr_based_features_all[i];
		if (kvm_get_msr_feature(&msr))
			continue;

		if (j < i)
			msr_based_features[j] = msr_based_features[i];
		j++;
		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
	}
	num_msr_based_features = j;
}

static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+1 −0
Original line number Diff line number Diff line
@@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);

struct kvm_irq_ack_notifier {
	struct hlist_node link;
Loading