iommu/arm-smmu-v3: Reduce contention during command-queue insertion (587e6c10) · Commits · 戴 / test

drivers/iommu/arm-smmu-v3.c

+533 −144

Original line number	Diff line number	Diff line
		@@ -183,7 +183,7 @@

		#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
		#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift))
		#define Q_OVERFLOW_FLAG (1 << 31)
		#define Q_OVERFLOW_FLAG (1U << 31)
		#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG)
		#define Q_ENT(q, p) ((q)->base + \
		Q_IDX(&((q)->llq), p) * \
		@@ -307,6 +307,8 @@
		#define CMDQ_ERR_CERROR_ABT_IDX 2
		#define CMDQ_ERR_CERROR_ATC_INV_IDX 3

		/*
		* Flag kept in the shadow cmdq prod value, sharing its bit with
		* Q_OVERFLOW_FLAG. It is set when queue space is allocated and cleared by
		* the owner before the hardware prod pointer is advanced.
		*/
		#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG

		#define CMDQ_0_OP GENMASK_ULL(7, 0)
		#define CMDQ_0_SSV (1UL << 11)

		@@ -369,9 +371,8 @@
		#define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12)

		/* High-level queue structures */
		#define ARM_SMMU_POLL_TIMEOUT_US 100
		#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */
		#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10
		#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */
		#define ARM_SMMU_POLL_SPIN_COUNT 10

		#define MSI_IOVA_BASE 0x8000000
		#define MSI_IOVA_LENGTH 0x100000
		@@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent {

		#define CMDQ_OP_CMD_SYNC 0x46
		struct {
		u32 msidata;
		u64 msiaddr;
		} sync;
		};
		};

		/*
		* Low-level queue state: the shadow prod/cons indices and the queue size.
		* The union overlays three views of the same two 32-bit indices: a single
		* 64-bit word (val), plain u32 fields (prod/cons) and atomic_t fields
		* (atomic.prod/atomic.cons).
		*/
		struct arm_smmu_ll_queue {
		union {
		/* prod and cons viewed together as one 64-bit value */
		u64 val;
		struct {
		u32 prod;
		u32 cons;
		};
		struct {
		atomic_t prod;
		atomic_t cons;
		} atomic;
		/* Pads the union out to SMP_CACHE_BYTES */
		u8 __pad[SMP_CACHE_BYTES];
		} ____cacheline_aligned_in_smp;
		/* log2 of the number of entries in the queue */
		u32 max_n_shift;
		};

		@@ -499,9 +509,18 @@ struct arm_smmu_queue {
		u32 __iomem *cons_reg;
		};

		/*
		* Polling state initialised by queue_poll_init() and advanced by
		* queue_poll() on every call.
		*/
		struct arm_smmu_queue_poll {
		/* Absolute deadline; queue_poll() returns -ETIMEDOUT once it has passed */
		ktime_t timeout;
		/* Current udelay() duration; starts at 1 and doubles after each delay */
		unsigned int delay;
		/* cpu_relax() spins since the last delay; compared against ARM_SMMU_POLL_SPIN_COUNT */
		unsigned int spin_cnt;
		/* When true, queue_poll() waits with wfe() instead of spinning/delaying */
		bool wfe;
		};

		struct arm_smmu_cmdq {
		struct arm_smmu_queue q;
		spinlock_t lock;
		atomic_long_t *valid_map;
		atomic_t owner_prod;
		atomic_t lock;
		};

		struct arm_smmu_evtq {
		@@ -581,8 +600,6 @@ struct arm_smmu_device {

		int gerr_irq;
		int combined_irq;
		u32 sync_nr;
		u8 prev_cmd_opcode;

		unsigned long ias; /* IPA */
		unsigned long oas; /* PA */
		@@ -601,12 +618,6 @@ struct arm_smmu_device {

		struct arm_smmu_strtab_cfg strtab_cfg;

		/* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
		union {
		u32 sync_count;
		u64 padding;
		};

		/* IOMMU core code handle */
		struct iommu_device iommu;
		};
		@@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
		}

		/* Low-level queue manipulation functions */
		/*
		 * Report whether at least @n free slots remain in @q, judged from its
		 * shadow prod and cons values.
		 */
		static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
		{
			u32 prod_idx = Q_IDX(q, q->prod);
			u32 cons_idx = Q_IDX(q, q->cons);
			u32 free_slots;

			/* Differing wrap bits: prod is one lap ahead of cons */
			if (Q_WRP(q, q->prod) != Q_WRP(q, q->cons))
				free_slots = cons_idx - prod_idx;
			else
				free_slots = (1 << q->max_n_shift) - (prod_idx - cons_idx);

			return free_slots >= n;
		}

		static bool queue_full(struct arm_smmu_ll_queue *q)
		{
		return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
		@@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
		Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
		}

		static void queue_sync_cons_in(struct arm_smmu_queue *q)
		static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
		{
		q->llq.cons = readl_relaxed(q->cons_reg);
		return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
		(Q_IDX(q, q->cons) > Q_IDX(q, prod))) \|\|
		((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
		(Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
		}

		static void queue_sync_cons_out(struct arm_smmu_queue *q)
		@@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
		return ret;
		}

		static void queue_sync_prod_out(struct arm_smmu_queue *q)
		static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
		{
		writel(q->llq.prod, q->prod_reg);
		u32 prod = (Q_WRP(q, q->prod) \| Q_IDX(q, q->prod)) + n;
		return Q_OVF(q->prod) \| Q_WRP(q, prod) \| Q_IDX(q, prod);
		}

		static void queue_inc_prod(struct arm_smmu_ll_queue *q)
		static void queue_poll_init(struct arm_smmu_device *smmu,
		struct arm_smmu_queue_poll *qp)
		{
		u32 prod = (Q_WRP(q, q->prod) \| Q_IDX(q, q->prod)) + 1;
		q->prod = Q_OVF(q->prod) \| Q_WRP(q, prod) \| Q_IDX(q, prod);
		qp->delay = 1;
		qp->spin_cnt = 0;
		qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
		qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
		}

		/*
		* Wait for the SMMU to consume items. If sync is true, wait until the queue
		* is empty. Otherwise, wait until there is at least one free slot.
		*/
		static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
		static int queue_poll(struct arm_smmu_queue_poll *qp)
		{
		ktime_t timeout;
		unsigned int delay = 1, spin_cnt = 0;

		/* Wait longer if it's a CMD_SYNC */
		timeout = ktime_add_us(ktime_get(), sync ?
		ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
		ARM_SMMU_POLL_TIMEOUT_US);

		while (queue_sync_cons_in(q),
		(sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
		if (ktime_compare(ktime_get(), timeout) > 0)
		if (ktime_compare(ktime_get(), qp->timeout) > 0)
		return -ETIMEDOUT;

		if (wfe) {
		if (qp->wfe) {
		wfe();
		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
		} else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
		cpu_relax();
		continue;
		} else {
		udelay(delay);
		delay *= 2;
		spin_cnt = 0;
		}
		udelay(qp->delay);
		qp->delay *= 2;
		qp->spin_cnt = 0;
		}

		return 0;
		@@ -788,17 +805,6 @@ static void queue_write(__le64 dst, u64 src, size_t n_dwords)
		dst++ = cpu_to_le64(src++);
		}

		/*
		 * Append one entry to @q and publish the new producer index.
		 *
		 * Copies q->ent_dwords 64-bit words from @ent into the slot at the current
		 * shadow prod, advances the shadow prod and then calls
		 * queue_sync_prod_out().
		 *
		 * Both parameters are pointers: the body dereferences @q and hands @ent to
		 * queue_write().
		 *
		 * Returns 0 on success, or -ENOSPC when queue_full() reports no room.
		 */
		static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
		{
			if (queue_full(&q->llq))
				return -ENOSPC;

			queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
			queue_inc_prod(&q->llq);
			queue_sync_prod_out(q);
			return 0;
		}

		static void queue_read(__le64 dst, u64 src, size_t n_dwords)
		{
		int i;
		@@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 cmd, struct arm_smmu_cmdq_ent ent)
		cmd[1] \|= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
		break;
		case CMDQ_OP_CMD_SYNC:
		if (ent->sync.msiaddr)
		if (ent->sync.msiaddr) {
		cmd[0] \|= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
		else
		cmd[1] \|= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
		} else {
		cmd[0] \|= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
		}
		cmd[0] \|= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
		cmd[0] \|= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
		/*
		* Commands are written little-endian, but we want the SMMU to
		* receive MSIData, and thus write it back to memory, in CPU
		* byte order, so big-endian needs an extra byteswap here.
		*/
		cmd[0] \|= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
		cpu_to_le32(ent->sync.msidata));
		cmd[1] \|= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
		break;
		default:
		return -ENOENT;
		@@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 cmd, struct arm_smmu_cmdq_ent ent)
		return 0;
		}

		/*
		 * Build a CMD_SYNC command into @cmd for the command-queue slot selected by
		 * @prod.
		 *
		 * When the SMMU advertises both ARM_SMMU_FEAT_MSI and
		 * ARM_SMMU_FEAT_COHERENCY, the MSI address is set to the DMA address of that
		 * slot inside the command queue itself; otherwise msiaddr stays zero.
		 *
		 * @cmd:  output buffer for the encoded command (passed on to
		 *        arm_smmu_cmdq_build_cmd())
		 * @smmu: device whose command queue the CMD_SYNC will be written to
		 * @prod: queue prod value identifying the slot of the CMD_SYNC
		 */
		static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
							 u32 prod)
		{
			struct arm_smmu_queue *q = &smmu->cmdq.q;
			struct arm_smmu_cmdq_ent ent = {
				.opcode = CMDQ_OP_CMD_SYNC,
			};

			/*
			 * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
			 * payload, so the write will zero the entire command on that platform.
			 */
			if (smmu->features & ARM_SMMU_FEAT_MSI &&
			    smmu->features & ARM_SMMU_FEAT_COHERENCY) {
				/* Each entry is q->ent_dwords 64-bit words, i.e. ent_dwords * 8 bytes */
				ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
						   q->ent_dwords * 8;
			}

			arm_smmu_cmdq_build_cmd(cmd, &ent);
		}

		static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
		{
		static const char *cerror_str[] = {
		@@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
		queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
		}

		static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device smmu, u64 cmd)
		/*
		* Command queue locking.
		* This is a form of bastardised rwlock with the following major changes:
		*
		* - The only LOCK routines are exclusive_trylock() and shared_lock().
		* Neither has barrier semantics, and instead provide only a control
		* dependency.
		*
		* - The UNLOCK routines are supplemented with shared_tryunlock(), which
		* fails if the caller appears to be the last lock holder (yes, this is
		* racy). All successful UNLOCK routines have RELEASE semantics.
		*/
		static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
		{
		struct arm_smmu_queue *q = &smmu->cmdq.q;
		bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
		int val;

		/*
		* We can try to avoid the cmpxchg() loop by simply incrementing the
		* lock counter. When held in exclusive state, the lock counter is set
		* to INT_MIN so these increments won't hurt as the value will remain
		* negative.
		*/
		if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
		return;

		smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
		do {
		val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
		} while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
		}

		while (queue_insert_raw(q, cmd) == -ENOSPC) {
		if (queue_poll_cons(q, false, wfe))
		dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
		/*
		* Drop one shared hold on the cmdq lock by decrementing the lock counter
		* with RELEASE ordering. The decremented value is deliberately discarded.
		*/
		static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
		{
		(void)atomic_dec_return_release(&cmdq->lock);
		}

		/*
		 * Drop a shared hold on the cmdq lock unless the counter reads exactly 1,
		 * i.e. the caller appears to be the only holder left.
		 *
		 * Returns true if the lock was released, false if it was left held.
		 */
		static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
		{
			bool sole_holder = atomic_read(&cmdq->lock) == 1;

			if (!sole_holder)
				arm_smmu_cmdq_shared_unlock(cmdq);

			return !sole_holder;
		}

		static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
		struct arm_smmu_cmdq_ent *ent)
		/*
		 * Try to take the cmdq lock exclusively with interrupts disabled.
		 *
		 * Saves and disables local interrupts into @flags, then attempts to swing the
		 * lock counter from 0 to INT_MIN with a relaxed cmpxchg. On failure the
		 * interrupt state is restored. Evaluates to true if the lock was taken.
		 *
		 * @cmdq is parenthesised so that any pointer expression can be passed.
		 */
		#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)		\
		({									\
			bool __ret;							\
			local_irq_save(flags);						\
			__ret = !atomic_cmpxchg_relaxed(&(cmdq)->lock, 0, INT_MIN);	\
			if (!__ret)							\
				local_irq_restore(flags);				\
			__ret;								\
		})

		/*
		 * Release the exclusively-held cmdq lock and restore the interrupt state
		 * saved in @flags. The lock counter is reset to 0 with RELEASE ordering.
		 *
		 * @cmdq is parenthesised so that any pointer expression can be passed.
		 */
		#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)		\
		({									\
			atomic_set_release(&(cmdq)->lock, 0);				\
			local_irq_restore(flags);					\
		})


		/*
		* Command queue insertion.
		* This is made fiddly by our attempts to achieve some sort of scalability
		* since there is one queue shared amongst all of the CPUs in the system. If
		* you like mixed-size concurrency, dependency ordering and relaxed atomics,
		* then you'll love this monstrosity.
		*
		* The basic idea is to split the queue up into ranges of commands that are
		* owned by a given CPU; the owner may not have written all of the commands
		* itself, but is responsible for advancing the hardware prod pointer when
		* the time comes. The algorithm is roughly:
		*
		* 1. Allocate some space in the queue. At this point we also discover
		* whether the head of the queue is currently owned by another CPU,
		* or whether we are the owner.
		*
		* 2. Write our commands into our allocated slots in the queue.
		*
		* 3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
		*
		* 4. If we are an owner:
		* a. Wait for the previous owner to finish.
		* b. Mark the queue head as unowned, which tells us the range
		* that we are responsible for publishing.
		* c. Wait for all commands in our owned range to become valid.
		* d. Advance the hardware prod pointer.
		* e. Tell the next owner we've finished.
		*
		* 5. If we are inserting a CMD_SYNC (we may or may not have been an
		* owner), then we need to stick around until it has completed:
		* a. If we have MSIs, the SMMU can write back into the CMD_SYNC
		* to clear the first 4 bytes.
		* b. Otherwise, we spin waiting for the hardware cons pointer to
		* advance past our command.
		*
		* The devil is in the details, particularly the use of locking for handling
		* SYNC completion and freeing up space in the queue before we think that it is
		* full.
		*/
		/*
		* Walk the valid_map bits covering the queue range [sprod, eprod), one
		* bitmap word at a time, and either toggle them (set == true) or wait for
		* them to reach the expected "valid" value (set == false).
		*/
		static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
		u32 sprod, u32 eprod, bool set)
		{
		u32 swidx, sbidx, ewidx, ebidx;
		/* Local cursor over the range; only max_n_shift and prod are used */
		struct arm_smmu_ll_queue llq = {
		.max_n_shift = cmdq->q.llq.max_n_shift,
		.prod = sprod,
		};

		/* Word index and bit-within-word of the (exclusive) end of the range */
		ewidx = BIT_WORD(Q_IDX(&llq, eprod));
		ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;

		while (llq.prod != eprod) {
		unsigned long mask;
		atomic_long_t *ptr;
		u32 limit = BITS_PER_LONG;

		/* Word index and bit-within-word of the current position */
		swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
		sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;

		ptr = &cmdq->valid_map[swidx];

		/* Stop at the end bit if the range finishes later in this same word */
		if ((swidx == ewidx) && (sbidx < ebidx))
		limit = ebidx;

		/* Bits sbidx..limit-1 of this word belong to the range */
		mask = GENMASK(limit - 1, sbidx);

		/*
		* The valid bit is the inverse of the wrap bit. This means
		* that a zero-initialised queue is invalid and, after marking
		* all entries as valid, they become invalid again when we
		* wrap.
		*/
		if (set) {
		atomic_long_xor(mask, ptr);
		} else { /* Poll */
		unsigned long valid;

		/*
		* ULONG_MAX + 0 keeps all mask bits set (wrap bit clear);
		* ULONG_MAX + 1 wraps to 0, so all mask bits are expected
		* clear (wrap bit set).
		*/
		valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
		atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
		}

		/* Advance the cursor past the bits handled in this word */
		llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
		}
		}

		/*
		* Mark all entries in the range [sprod, eprod) as valid.
		* Thin wrapper around __arm_smmu_cmdq_poll_set_valid_map() with set == true,
		* which toggles the corresponding valid_map bits.
		*/
		static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
		u32 sprod, u32 eprod)
		{
		__arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
		}

		/*
		* Wait for all entries in the range [sprod, eprod) to become valid.
		* Thin wrapper around __arm_smmu_cmdq_poll_set_valid_map() with set == false,
		* which polls the corresponding valid_map bits.
		*/
		static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
		u32 sprod, u32 eprod)
		{
		__arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
		}

		/* Wait for the command queue to become non-full */
		static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
		struct arm_smmu_ll_queue *llq)
		{
		u64 cmd[CMDQ_ENT_DWORDS];
		unsigned long flags;
		struct arm_smmu_queue_poll qp;
		struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
		int ret = 0;

		if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
		ent->opcode);
		return;
		/*
		* Try to update our copy of cons by grabbing exclusive cmdq access. If
		* that fails, spin until somebody else updates it for us.
		*/
		if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
		WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
		arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
		llq->val = READ_ONCE(cmdq->q.llq.val);
		return 0;
		}

		spin_lock_irqsave(&smmu->cmdq.lock, flags);
		arm_smmu_cmdq_insert_cmd(smmu, cmd);
		spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
		queue_poll_init(smmu, &qp);
		do {
		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
		if (!queue_full(llq))
		break;

		ret = queue_poll(&qp);
		} while (!ret);

		return ret;
		}

		/*
		* The difference between val and sync_idx is bounded by the maximum size of
		* a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
		* Wait until the SMMU signals a CMD_SYNC completion MSI.
		* Must be called with the cmdq lock held in some capacity.
		*/
		static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
		static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
		struct arm_smmu_ll_queue *llq)
		{
		ktime_t timeout;
		u32 val;
		int ret = 0;
		struct arm_smmu_queue_poll qp;
		struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
		u32 cmd = (u32 )(Q_ENT(&cmdq->q, llq->prod));

		timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
		val = smp_cond_load_acquire(&smmu->sync_count,
		(int)(VAL - sync_idx) >= 0 \|\|
		!ktime_before(ktime_get(), timeout));
		queue_poll_init(smmu, &qp);

		return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
		/*
		* The MSI won't generate an event, since it's being written back
		* into the command queue.
		*/
		qp.wfe = false;
		smp_cond_load_relaxed(cmd, !VAL \|\| (ret = queue_poll(&qp)));
		llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
		return ret;
		}

		static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
		/*
		* Wait until the SMMU cons index passes llq->prod.
		* Must be called with the cmdq lock held in some capacity.
		*/
		static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
		struct arm_smmu_ll_queue *llq)
		{
		u64 cmd[CMDQ_ENT_DWORDS];
		unsigned long flags;
		struct arm_smmu_cmdq_ent ent = {
		.opcode = CMDQ_OP_CMD_SYNC,
		.sync = {
		.msiaddr = virt_to_phys(&smmu->sync_count),
		},
		};
		struct arm_smmu_queue_poll qp;
		struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
		u32 prod = llq->prod;
		int ret = 0;

		spin_lock_irqsave(&smmu->cmdq.lock, flags);
		queue_poll_init(smmu, &qp);
		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
		do {
		if (queue_consumed(llq, prod))
		break;

		/* Piggy-back on the previous command if it's a SYNC */
		if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
		ent.sync.msidata = smmu->sync_nr;
		} else {
		ent.sync.msidata = ++smmu->sync_nr;
		arm_smmu_cmdq_build_cmd(cmd, &ent);
		arm_smmu_cmdq_insert_cmd(smmu, cmd);
		ret = queue_poll(&qp);

		/*
		* This needs to be a readl() so that our subsequent call
		* to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
		*
		* Specifically, we need to ensure that we observe all
		* shared_lock()s by other CMD_SYNCs that share our owner,
		* so that a failing call to tryunlock() means that we're
		* the last one out and therefore we can safely advance
		* cmdq->q.llq.cons. Roughly speaking:
		*
		* CPU 0 CPU1 CPU2 (us)
		*
		* if (sync)
		* shared_lock();
		*
		* dma_wmb();
		* set_valid_map();
		*
		* if (owner) {
		* poll_valid_map();
		* <control dependency>
		* writel(prod_reg);
		*
		* readl(cons_reg);
		* tryunlock();
		*
		* Requires us to see CPU 0's shared_lock() acquisition.
		*/
		llq->cons = readl(cmdq->q.cons_reg);
		} while (!ret);

		return ret;
		}

		spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
		static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
		struct arm_smmu_ll_queue *llq)
		{
		if (smmu->features & ARM_SMMU_FEAT_MSI &&
		smmu->features & ARM_SMMU_FEAT_COHERENCY)
		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);

		return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
		return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
		}

		static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
		static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq cmdq, u64 cmds,
		u32 prod, int n)
		{
		u64 cmd[CMDQ_ENT_DWORDS];
		int i;
		struct arm_smmu_ll_queue llq = {
		.max_n_shift = cmdq->q.llq.max_n_shift,
		.prod = prod,
		};

		for (i = 0; i < n; ++i) {
		u64 cmd = &cmds[i CMDQ_ENT_DWORDS];

		prod = queue_inc_prod_n(&llq, i);
		queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
		}
		}

		static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
		u64 *cmds, int n, bool sync)
		{
		u64 cmd_sync[CMDQ_ENT_DWORDS];
		u32 prod;
		unsigned long flags;
		bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
		struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
		int ret;
		bool owner;
		struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
		struct arm_smmu_ll_queue llq = {
		.max_n_shift = cmdq->q.llq.max_n_shift,
		}, head = llq;
		int ret = 0;

		arm_smmu_cmdq_build_cmd(cmd, &ent);
		/* 1. Allocate some space in the queue */
		local_irq_save(flags);
		llq.val = READ_ONCE(cmdq->q.llq.val);
		do {
		u64 old;

		while (!queue_has_space(&llq, n + sync)) {
		local_irq_restore(flags);
		if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
		dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
		local_irq_save(flags);
		}

		head.cons = llq.cons;
		head.prod = queue_inc_prod_n(&llq, n + sync) \|
		CMDQ_PROD_OWNED_FLAG;

		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
		if (old == llq.val)
		break;

		spin_lock_irqsave(&smmu->cmdq.lock, flags);
		arm_smmu_cmdq_insert_cmd(smmu, cmd);
		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
		spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
		llq.val = old;
		} while (1);
		owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
		head.prod &= ~CMDQ_PROD_OWNED_FLAG;
		llq.prod &= ~CMDQ_PROD_OWNED_FLAG;

		/*
		* 2. Write our commands into the queue
		* Dependency ordering from the cmpxchg() loop above.
		*/
		arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
		if (sync) {
		prod = queue_inc_prod_n(&llq, n);
		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
		queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);

		/*
		* In order to determine completion of our CMD_SYNC, we must
		* ensure that the queue can't wrap twice without us noticing.
		* We achieve that by taking the cmdq lock as shared before
		* marking our slot as valid.
		*/
		arm_smmu_cmdq_shared_lock(cmdq);
		}

		/* 3. Mark our slots as valid, ensuring commands are visible first */
		dma_wmb();
		arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);

		/* 4. If we are the owner, take control of the SMMU hardware */
		if (owner) {
		/* a. Wait for previous owner to finish */
		atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);

		/* b. Stop gathering work by clearing the owned flag */
		prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
		&cmdq->q.llq.atomic.prod);
		prod &= ~CMDQ_PROD_OWNED_FLAG;

		/*
		* c. Wait for any gathered work to be written to the queue.
		* Note that we read our own entries so that we have the control
		* dependency required by (d).
		*/
		arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);

		/*
		* d. Advance the hardware prod pointer
		* Control dependency ordering from the entries becoming valid.
		*/
		writel_relaxed(prod, cmdq->q.prod_reg);

		/*
		* e. Tell the next owner we're done
		* Make sure we've updated the hardware first, so that we don't
		* race to update prod and potentially move it backwards.
		*/
		atomic_set_release(&cmdq->owner_prod, prod);
		}

		/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
		if (sync) {
		llq.prod = queue_inc_prod_n(&llq, n);
		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
		if (ret) {
		dev_err_ratelimited(smmu->dev,
		"CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
		llq.prod,
		readl_relaxed(cmdq->q.prod_reg),
		readl_relaxed(cmdq->q.cons_reg));
		}

		/*
		* Try to unlock the cmdq lock. This will fail if we're the last
		* reader, in which case we can safely update cmdq->q.llq.cons
		*/
		if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
		WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
		arm_smmu_cmdq_shared_unlock(cmdq);
		}
		}

		local_irq_restore(flags);
		return ret;
		}

		static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
		static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
		struct arm_smmu_cmdq_ent *ent)
		{
		int ret;
		bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
		(smmu->features & ARM_SMMU_FEAT_COHERENCY);
		u64 cmd[CMDQ_ENT_DWORDS];

		ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
		: __arm_smmu_cmdq_issue_sync(smmu);
		if (ret)
		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
		return ret;
		if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
		ent->opcode);
		return -EINVAL;
		}

		return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
		}

		/*
		* Issue a CMD_SYNC on its own: calls arm_smmu_cmdq_issue_cmdlist() with no
		* commands (NULL, 0) and sync == true, and returns its result.
		*/
		static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
		{
		return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
		}

		/* Context descriptor manipulation functions */
		@@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie)
		/*
		* NOTE: when io-pgtable is in non-strict mode, we may get here with
		* PTEs previously cleared by unmaps on the current CPU not yet visible
		* to the SMMU. We are relying on the DSB implicit in
		* queue_sync_prod_out() to guarantee those are observed before the
		* TLBI. Do be careful, 007.
		* to the SMMU. We are relying on the dma_wmb() implicit during cmd
		* insertion to guarantee those are observed before the TLBI. Do be
		* careful, 007.
		*/
		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
		arm_smmu_cmdq_issue_sync(smmu);
		@@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
		return 0;
		}

		/*
		 * devm action callback: @data is the bitmap registered by
		 * arm_smmu_cmdq_init(); release it with bitmap_free().
		 */
		static void arm_smmu_cmdq_free_bitmap(void *data)
		{
			bitmap_free(data);
		}

		/*
		 * Initialise the software state of the command queue: reset owner_prod and
		 * the cmdq lock counter, and allocate the valid_map bitmap with one bit per
		 * queue entry.
		 *
		 * The bitmap is released through a devm action. The action is registered
		 * with devm_add_action_or_reset() so that a registration failure frees the
		 * bitmap immediately and is reported to the caller instead of being ignored.
		 *
		 * Returns 0 on success, -ENOMEM if the bitmap cannot be allocated, or the
		 * error from devm_add_action_or_reset().
		 */
		static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
		{
			struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
			unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
			atomic_long_t *bitmap;
			int ret;

			atomic_set(&cmdq->owner_prod, 0);
			atomic_set(&cmdq->lock, 0);

			bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
			if (!bitmap) {
				dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
				return -ENOMEM;
			}

			/* On failure the action runs at once, so the bitmap is not leaked */
			ret = devm_add_action_or_reset(smmu->dev, arm_smmu_cmdq_free_bitmap,
						       bitmap);
			if (ret)
				return ret;

			cmdq->valid_map = bitmap;
			return 0;
		}

		static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
		{
		int ret;

		/* cmdq */
		spin_lock_init(&smmu->cmdq.lock);
		ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
		ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
		"cmdq");
		if (ret)
		return ret;

		ret = arm_smmu_cmdq_init(smmu);
		if (ret)
		return ret;

		/* evtq */
		ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
		ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
		@@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
		/* Queue sizes, capped to ensure natural alignment */
		smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
		FIELD_GET(IDR1_CMDQS, reg));
		if (!smmu->cmdq.q.llq.max_n_shift) {
		/* Odd alignment restrictions on the base, so ignore for now */
		dev_err(smmu->dev, "unit-length command queue not supported\n");
		if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) {
		/*
		* The cmdq valid_map relies on the total number of entries
		* being a multiple of BITS_PER_LONG. There's also no way
		* we can handle the weird alignment restrictions on the
		* base pointer for a unit-length queue.
		*/
		dev_err(smmu->dev, "command queue size < %d entries not supported\n",
		BITS_PER_LONG);
		return -ENXIO;
		}

Admin message