Commit 4d2af08e authored by Linus Torvalds
Pull iommu fixes from Joerg Roedel:
 "A couple of fixes for the AMD IOMMU driver have piled up:

   - Some fixes for the reworked IO page-table, which caused memory leaks
     or did not allow downgrading mappings under some conditions.

   - Locking fixes to close a couple of possible races around accessing
     'struct protection_domain'. The races were introduced when the
     dma-ops path became lock-less in the fast-path"

* tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/amd: Lock code paths traversing protection_domain->dev_list
  iommu/amd: Lock dev_data in attach/detach code paths
  iommu/amd: Check for busy devices earlier in attach_device()
  iommu/amd: Take domain->lock for complete attach/detach path
  iommu/amd: Remove amd_iommu_devtable_lock
  iommu/amd: Remove domain->updated
  iommu/amd: Wait for completion of IOTLB flush in attach_device
  iommu/amd: Unmap all L7 PTEs when downgrading page-sizes
  iommu/amd: Introduce first_pte_l7() helper
  iommu/amd: Fix downgrading default page-sizes in alloc_pte()
  iommu/amd: Fix pages leak in free_pagetable()
parents 02dc96ef 2a78f996
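
The locking changes follow one consistent ordering: the per-domain lock is
taken first, the per-device lock is nested inside it, and both are released
in reverse order. A minimal skeleton of the nesting that attach_device() and
detach_device() now share; illustrative only, not the literal kernel code:

	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);	/* outer: domain state and dev_list */
	spin_lock(&dev_data->lock);			/* inner: per-device attach state   */

	/* ... attach or detach the device ... */

	spin_unlock(&dev_data->lock);
	spin_unlock_irqrestore(&domain->lock, flags);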
drivers/iommu/amd_iommu.c: +136 −93
@@ -70,7 +70,6 @@
 */
#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))

-static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
static DEFINE_SPINLOCK(pd_bitmap_lock);

/* List of all available dev_data structures */
@@ -202,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid)
	if (!dev_data)
		return NULL;

+	spin_lock_init(&dev_data->lock);
	dev_data->devid = devid;
	ratelimit_default_init(&dev_data->rs);

@@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev)
	 */
}

+/*
+ * Helper function to get the first pte of a large mapping
+ */
+static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
+			 unsigned long *count)
+{
+	unsigned long pte_mask, pg_size, cnt;
+	u64 *fpte;
+
+	pg_size  = PTE_PAGE_SIZE(*pte);
+	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
+	pte_mask = ~((cnt << 3) - 1);
+	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);
+
+	if (page_size)
+		*page_size = pg_size;
+
+	if (count)
+		*count = cnt;
+
+	return fpte;
+}
+
/****************************************************************************
 *
 * Interrupt handling functions
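
The pointer math in the new helper can be checked in isolation: a large "L7"
mapping that spans cnt PTE slots occupies cnt << 3 bytes of page table, so
masking a PTE address with ~((cnt << 3) - 1) rounds it down to the first
replica. A small stand-alone sketch with hypothetical values, not kernel code:

	#include <stdio.h>

	int main(void)
	{
		unsigned long cnt      = 8;                    /* e.g. 32KiB = 8 replicated PTEs */
		unsigned long pte_mask = ~((cnt << 3) - 1);    /* 8 slots * 8 bytes -> ~63       */
		unsigned long pte      = 0xffff8880001234d8UL; /* some PTE inside the run        */

		/* Rounds down to the 64-byte aligned first replica (...34c0) */
		printf("first pte at %#lx\n", pte & pte_mask);
		return 0;
	}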
@@ -1311,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain,
		dma_addr_t iova, size_t size)
{
	if (unlikely(amd_iommu_np_cache)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&domain->lock, flags);
		domain_flush_pages(domain, iova, size);
		domain_flush_complete(domain);
+		spin_unlock_irqrestore(&domain->lock, flags);
	}
}

@@ -1425,7 +1452,7 @@ static void free_pagetable(struct protection_domain *domain)
	BUG_ON(domain->mode < PAGE_MODE_NONE ||
	       domain->mode > PAGE_MODE_6_LEVEL);

-	free_sub_pt(root, domain->mode, freelist);
+	freelist = free_sub_pt(root, domain->mode, freelist);

	free_page_list(freelist);
}
@@ -1435,10 +1462,11 @@ static void free_pagetable(struct protection_domain *domain)
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
-static void increase_address_space(struct protection_domain *domain,
+static bool increase_address_space(struct protection_domain *domain,
				   gfp_t gfp)
{
	unsigned long flags;
+	bool ret = false;
	u64 *pte;

	spin_lock_irqsave(&domain->lock, flags);
@@ -1455,19 +1483,21 @@ static void increase_address_space(struct protection_domain *domain,
					iommu_virt_to_phys(domain->pt_root));
	domain->pt_root  = pte;
	domain->mode    += 1;
-	domain->updated  = true;

+	ret = true;
+
out:
	spin_unlock_irqrestore(&domain->lock, flags);

-	return;
+	return ret;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
-		      gfp_t gfp)
+		      gfp_t gfp,
+		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;
@@ -1475,7 +1505,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->mode))
-		increase_address_space(domain, gfp);
+		*updated = increase_address_space(domain, gfp) || *updated;

	level   = domain->mode - 1;
	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
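
One subtlety in the loop above: increase_address_space() is deliberately the
left operand of ||. With the operands swapped, short-circuit evaluation would
stop calling it as soon as *updated became true, and the address space would
never grow past the current level. The hypothetical buggy variant, for
contrast only:

	/* WRONG (illustrative): once *updated is true, || short-circuits and
	 * increase_address_space() is never called again, so the while
	 * condition can never become false. */
	while (address > PM_LEVEL_SIZE(domain->mode))
		*updated = *updated || increase_address_space(domain, gfp);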
@@ -1489,9 +1519,32 @@ static u64 *alloc_pte(struct protection_domain *domain,
		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

-		if (!IOMMU_PTE_PRESENT(__pte) ||
+		/*
+		 * If we replace a series of large PTEs, we need
+		 * to tear down all of them.
+		 */
+		if (IOMMU_PTE_PRESENT(__pte) &&
+		    pte_level == PAGE_MODE_7_LEVEL) {
+			unsigned long count, i;
+			u64 *lpte;
+
+			lpte = first_pte_l7(pte, NULL, &count);
+
+			/*
+			 * Unmap the replicated PTEs that still match the
+			 * original large mapping
+			 */
+			for (i = 0; i < count; ++i)
+				cmpxchg64(&lpte[i], __pte, 0ULL);
+
+			*updated = true;
+			continue;
+		}
+
+		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = (u64 *)get_zeroed_page(gfp);

			if (!page)
				return NULL;

@@ -1500,8 +1553,8 @@ static u64 *alloc_pte(struct protection_domain *domain,
			/* pte could have been changed somewhere. */
			if (cmpxchg64(pte, __pte, __npte) != __pte)
				free_page((unsigned long)page);
-			else if (pte_level == PAGE_MODE_7_LEVEL)
-				domain->updated = true;
+			else if (IOMMU_PTE_PRESENT(__pte))
+				*updated = true;

			continue;
		}
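
The teardown loop above clears each replicated PTE with cmpxchg64() rather
than a plain store, so an entry that a concurrent mapper has already replaced
is left alone. A self-contained user-space analogue using C11 atomics; the
function name is illustrative, not a kernel API:

	#include <stdatomic.h>
	#include <stdint.h>

	static void clear_l7_replicas(_Atomic uint64_t *lpte, uint64_t old,
				      unsigned long count)
	{
		for (unsigned long i = 0; i < count; ++i) {
			uint64_t expected = old;

			/* Clear the entry only if it still holds the original
			 * large PTE; a racing update wins and is preserved. */
			atomic_compare_exchange_strong(&lpte[i], &expected, 0);
		}
	}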
@@ -1566,17 +1619,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

-	if (PM_PTE_LEVEL(*pte) == 0x07) {
-		unsigned long pte_mask;
-
-		/*
-		 * If we have a series of large PTEs, make
-		 * sure to return a pointer to the first one.
-		 */
-		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
-		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
-		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
-	}
+	/*
+	 * If we have a series of large PTEs, make
+	 * sure to return a pointer to the first one.
+	 */
+	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
+		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}
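
Both callers rely on the helper's optional output parameters: fetch_pte()
passes page_size and ignores the count, while the alloc_pte() teardown path
wants the count and ignores the page size. A hypothetical call site, assuming
pte already points somewhere into an L7 run:

	unsigned long pg_size, count;
	u64 *first;

	first = first_pte_l7(pte, &pg_size, &count);	/* either out-pointer may be NULL */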
@@ -1615,26 +1663,29 @@ static int iommu_map_page(struct protection_domain *dom,
			  gfp_t gfp)
{
	struct page *freelist = NULL;
+	bool updated = false;
	u64 __pte, *pte;
-	int i, count;
+	int ret, i, count;

	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
	BUG_ON(!IS_ALIGNED(phys_addr, page_size));

+	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
+		goto out;

	count = PAGE_SIZE_PTE_COUNT(page_size);
-	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
+	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);

+	ret = -ENOMEM;
	if (!pte)
-		return -ENOMEM;
+		goto out;

	for (i = 0; i < count; ++i)
		freelist = free_clear_pte(&pte[i], pte[i], freelist);

	if (freelist != NULL)
-		dom->updated = true;
+		updated = true;

	if (count > 1) {
		__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@@ -1650,12 +1701,21 @@ static int iommu_map_page(struct protection_domain *dom,
	for (i = 0; i < count; ++i)
		pte[i] = __pte;

+	ret = 0;
+
+out:
+	if (updated) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dom->lock, flags);
+		update_domain(dom);
+		spin_unlock_irqrestore(&dom->lock, flags);
+	}
+
	/* Everything flushed out, free pages now */
	free_page_list(freelist);

-	return 0;
+	return ret;
}

static unsigned long iommu_unmap_page(struct protection_domain *dom,
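
This collect-then-flush pattern is what lets domain->updated disappear: the
flag is no longer shared domain state that concurrent mappers could race on,
but a local variable owned by the one call that changed the page-table, and
that call performs the flush itself under domain->lock before returning. A
sketch of the contrast, illustrative pseudocode rather than the patch itself:

	/* Before: shared flag; another CPU's update_domain() could test
	 * and clear it while this CPU was still mid-update. */
	domain->updated = true;

	/* After: private flag; whoever made a change flushes it. */
	bool updated = false;
	/* ... map pages, setting 'updated' on any destructive change ... */
	if (updated) {
		spin_lock_irqsave(&dom->lock, flags);
		update_domain(dom);	/* now always flushes */
		spin_unlock_irqrestore(&dom->lock, flags);
	}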
@@ -1806,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain)

static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dom->domain.lock, flags);
	domain_flush_tlb(&dom->domain);
	domain_flush_complete(&dom->domain);
+	spin_unlock_irqrestore(&dom->domain.lock, flags);
}

static void iova_domain_flush_tlb(struct iova_domain *iovad)
@@ -2022,36 +2086,6 @@ static void do_detach(struct iommu_dev_data *dev_data)
	domain->dev_cnt                 -= 1;
}

-/*
- * If a device is not yet associated with a domain, this function makes the
- * device visible in the domain
- */
-static int __attach_device(struct iommu_dev_data *dev_data,
-			   struct protection_domain *domain)
-{
-	int ret;
-
-	/* lock domain */
-	spin_lock(&domain->lock);
-
-	ret = -EBUSY;
-	if (dev_data->domain != NULL)
-		goto out_unlock;
-
-	/* Attach alias group root */
-	do_attach(dev_data, domain);
-
-	ret = 0;
-
-out_unlock:
-
-	/* ready */
-	spin_unlock(&domain->lock);
-
-	return ret;
-}


static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
	pci_disable_ats(pdev);
@@ -2133,19 +2167,28 @@ static int attach_device(struct device *dev,
	unsigned long flags;
	int ret;

+	spin_lock_irqsave(&domain->lock, flags);
+
	dev_data = get_dev_data(dev);

+	spin_lock(&dev_data->lock);
+
+	ret = -EBUSY;
+	if (dev_data->domain != NULL)
+		goto out;
+
	if (!dev_is_pci(dev))
		goto skip_ats_check;

	pdev = to_pci_dev(dev);
	if (domain->flags & PD_IOMMUV2_MASK) {
+		ret = -EINVAL;
		if (!dev_data->passthrough)
-			return -EINVAL;
+			goto out;

		if (dev_data->iommu_v2) {
			if (pdev_iommuv2_enable(pdev) != 0)
-				return -EINVAL;
+				goto out;

			dev_data->ats.enabled = true;
			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
@@ -2158,9 +2201,9 @@ static int attach_device(struct device *dev,
	}

skip_ats_check:
-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	ret = __attach_device(dev_data, domain);
-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	ret = 0;
+
+	do_attach(dev_data, domain);

	/*
	 * We might boot into a crash-kernel here. The crashed kernel
@@ -2169,23 +2212,14 @@ skip_ats_check:
	 */
	domain_flush_tlb_pde(domain);

-	return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct iommu_dev_data *dev_data)
-{
-	struct protection_domain *domain;
-
-	domain = dev_data->domain;
-
-	spin_lock(&domain->lock);
-
-	do_detach(dev_data);
-
-	spin_unlock(&domain->lock);
+	domain_flush_complete(domain);
+
+out:
+	spin_unlock(&dev_data->lock);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return ret;
}

/*
@@ -2200,6 +2234,10 @@ static void detach_device(struct device *dev)
	dev_data = get_dev_data(dev);
	domain   = dev_data->domain;

+	spin_lock_irqsave(&domain->lock, flags);
+
+	spin_lock(&dev_data->lock);
+
	/*
	 * First check if the device is still attached. It might already
	 * be detached from its domain because the generic
@@ -2207,15 +2245,12 @@ static void detach_device(struct device *dev)
	 * our alias handling.
	 */
	if (WARN_ON(!dev_data->domain))
-		return;
+		goto out;

-	/* lock device table */
-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	__detach_device(dev_data);
-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	do_detach(dev_data);

	if (!dev_is_pci(dev))
-		return;
+		goto out;

	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
		pdev_iommuv2_disable(to_pci_dev(dev));
@@ -2223,6 +2258,11 @@ static void detach_device(struct device *dev)
		pci_disable_ats(to_pci_dev(dev));

	dev_data->ats.enabled = false;
+
+out:
+	spin_unlock(&dev_data->lock);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
}

static int amd_iommu_add_device(struct device *dev)
@@ -2354,15 +2394,10 @@ static void update_device_table(struct protection_domain *domain)

static void update_domain(struct protection_domain *domain)
{
-	if (!domain->updated)
-		return;
-
	update_device_table(domain);

	domain_flush_devices(domain);
	domain_flush_tlb_pde(domain);
-
-	domain->updated = false;
}

static int dir2prot(enum dma_data_direction direction)
@@ -2392,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev,
{
	dma_addr_t offset = paddr & ~PAGE_MASK;
	dma_addr_t address, start, ret;
+	unsigned long flags;
	unsigned int pages;
	int prot = 0;
	int i;
@@ -2429,8 +2465,10 @@ out_unmap:
		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
	}

+	spin_lock_irqsave(&dma_dom->domain.lock, flags);
	domain_flush_tlb(&dma_dom->domain);
	domain_flush_complete(&dma_dom->domain);
+	spin_unlock_irqrestore(&dma_dom->domain.lock, flags);

	dma_ops_free_iova(dma_dom, address, pages);

@@ -2459,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
	}

	if (amd_iommu_unmap_flush) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dma_dom->domain.lock, flags);
		domain_flush_tlb(&dma_dom->domain);
		domain_flush_complete(&dma_dom->domain);
+		spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
		dma_ops_free_iova(dma_dom, dma_addr, pages);
	} else {
		pages = __roundup_pow_of_two(pages);
@@ -2866,16 +2908,16 @@ static void cleanup_domain(struct protection_domain *domain)
	struct iommu_dev_data *entry;
	unsigned long flags;

-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	spin_lock_irqsave(&domain->lock, flags);

	while (!list_empty(&domain->dev_list)) {
		entry = list_first_entry(&domain->dev_list,
					 struct iommu_dev_data, list);
		BUG_ON(!entry->domain);
-		__detach_device(entry);
+		do_detach(entry);
	}

-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	spin_unlock_irqrestore(&domain->lock, flags);
}

static void protection_domain_free(struct protection_domain *domain)
@@ -3226,9 +3268,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
	struct protection_domain *dom = to_pdomain(domain);
+	unsigned long flags;

+	spin_lock_irqsave(&dom->lock, flags);
	domain_flush_tlb_pde(dom);
	domain_flush_complete(dom);
+	spin_unlock_irqrestore(&dom->lock, flags);
}

static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
@@ -3290,7 +3335,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom)

	/* Update data structure */
	domain->mode    = PAGE_MODE_NONE;
-	domain->updated = true;

	/* Make changes visible to IOMMUs */
	update_domain(domain);
@@ -3336,7 +3380,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)

	domain->glx      = levels;
	domain->flags   |= PD_IOMMUV2_MASK;
-	domain->updated  = true;

	update_domain(domain);

drivers/iommu/amd_iommu_types.h: +3 −1
@@ -475,7 +475,6 @@ struct protection_domain {
	int glx;		/* Number of levels for GCR3 table */
	u64 *gcr3_tbl;		/* Guest CR3 table */
	unsigned long flags;	/* flags to find out type of domain */
-	bool updated;		/* complete domain flush required */
	unsigned dev_cnt;	/* devices assigned to this domain */
	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
@@ -634,6 +633,9 @@ struct devid_map {
 * This struct contains device specific data for the IOMMU
 */
struct iommu_dev_data {
+	/* Protect against attach/detach races */
+	spinlock_t lock;
+
	struct list_head list;		  /* For domain->dev_list */
	struct llist_node dev_data_list;  /* For global dev_data_list */
	struct protection_domain *domain; /* Domain the device is bound to */