Commit a98d73c7 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay
Browse files

habanalabs: Replace dma-fence mechanism with completions



habanalabs driver uses dma-fence mechanism for synchronization.
dma-fence mechanism was designed solely for GPUs, hence we purpose
a simpler mechanism based on completions to replace current
dma-fence objects.

Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent b71590ef
Loading
Loading
Loading
Loading
+47 −48
Original line number Diff line number Diff line
@@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
			hw_sob->q_idx, hw_sob->sob_id);
}

static const char *hl_fence_get_driver_name(struct dma_fence *fence)
{
	return "HabanaLabs";
}

static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
{
	struct hl_cs_compl *hl_cs_compl =
		container_of(fence, struct hl_cs_compl, base_fence);

	return dev_name(hl_cs_compl->hdev->dev);
}

static bool hl_fence_enable_signaling(struct dma_fence *fence)
{
	return true;
}

static void hl_fence_release(struct dma_fence *fence)
static void hl_fence_release(struct kref *kref)
{
	struct hl_fence *fence =
		container_of(kref, struct hl_fence, refcount);
	struct hl_cs_compl *hl_cs_cmpl =
		container_of(fence, struct hl_cs_compl, base_fence);
	struct hl_device *hdev = hl_cs_cmpl->hdev;
@@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
	}

free:
	kfree_rcu(hl_cs_cmpl, base_fence.rcu);
	kfree(hl_cs_cmpl);
}

static const struct dma_fence_ops hl_fence_ops = {
	.get_driver_name = hl_fence_get_driver_name,
	.get_timeline_name = hl_fence_get_timeline_name,
	.enable_signaling = hl_fence_enable_signaling,
	.release = hl_fence_release
};
void hl_fence_put(struct hl_fence *fence)
{
	if (fence)
		kref_put(&fence->refcount, hl_fence_release);
}

void hl_fence_get(struct hl_fence *fence)
{
	if (fence)
		kref_get(&fence->refcount);
}

static void hl_fence_init(struct hl_fence *fence)
{
	kref_init(&fence->refcount);
	fence->error = 0;
	init_completion(&fence->completion);
}

static void cs_get(struct hl_cs *cs)
{
@@ -336,7 +332,7 @@ static void cs_do_release(struct kref *ref)
		 * In case the wait for signal CS was submitted, the put occurs
		 * in init_signal_wait_cs() right before hanging on the PQ.
		 */
		dma_fence_put(cs->signal_fence);
		hl_fence_put(cs->signal_fence);
	}

	/*
@@ -348,19 +344,18 @@ static void cs_do_release(struct kref *ref)
	hl_ctx_put(cs->ctx);

	/* We need to mark an error for not submitted because in that case
	 * the dma fence release flow is different. Mainly, we don't need
	 * the hl fence release flow is different. Mainly, we don't need
	 * to handle hw_sob for signal/wait
	 */
	if (cs->timedout)
		dma_fence_set_error(cs->fence, -ETIMEDOUT);
		cs->fence->error = -ETIMEDOUT;
	else if (cs->aborted)
		dma_fence_set_error(cs->fence, -EIO);
		cs->fence->error = -EIO;
	else if (!cs->submitted)
		dma_fence_set_error(cs->fence, -EBUSY);

	dma_fence_signal(cs->fence);
	dma_fence_put(cs->fence);
		cs->fence->error = -EBUSY;

	complete_all(&cs->fence->completion);
	hl_fence_put(cs->fence);
	cs_counters_aggregate(hdev, cs->ctx);

	kfree(cs->jobs_in_queue_cnt);
@@ -401,7 +396,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
			enum hl_cs_type cs_type, struct hl_cs **cs_new)
{
	struct hl_cs_compl *cs_cmpl;
	struct dma_fence *other = NULL;
	struct hl_fence *other = NULL;
	struct hl_cs *cs;
	int rc;

@@ -434,7 +429,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
	cs_cmpl->cs_seq = ctx->cs_sequence;
	other = ctx->cs_pending[cs_cmpl->cs_seq &
				(hdev->asic_prop.max_pending_cs - 1)];
	if ((other) && (!dma_fence_is_signaled(other))) {

	if (other && !completion_done(&other->completion)) {
		dev_dbg(hdev->dev,
			"Rejecting CS because of too many in-flights CS\n");
		rc = -EAGAIN;
@@ -448,8 +444,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
		goto free_fence;
	}

	dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
			ctx->asid, ctx->cs_sequence);
	/* init hl_fence */
	hl_fence_init(&cs_cmpl->base_fence);

	cs->sequence = cs_cmpl->cs_seq;

@@ -458,9 +454,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
							&cs_cmpl->base_fence;
	ctx->cs_sequence++;

	dma_fence_get(&cs_cmpl->base_fence);
	hl_fence_get(&cs_cmpl->base_fence);

	dma_fence_put(other);
	hl_fence_put(other);

	spin_unlock(&ctx->cs_lock);

@@ -773,7 +769,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
	struct hl_ctx *ctx = hpriv->ctx;
	struct hl_cs_chunk *cs_chunk_array, *chunk;
	struct hw_queue_properties *hw_queue_prop;
	struct dma_fence *sig_fence = NULL;
	struct hl_fence *sig_fence = NULL;
	struct hl_cs_job *job;
	struct hl_cs *cs;
	struct hl_cb *cb;
@@ -883,14 +879,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
			dev_err(hdev->dev,
				"CS seq 0x%llx is not of a signal CS\n",
				signal_seq);
			dma_fence_put(sig_fence);
			hl_fence_put(sig_fence);
			rc = -EINVAL;
			goto free_signal_seq_array;
		}

		if (dma_fence_is_signaled(sig_fence)) {
		if (completion_done(&sig_fence->completion)) {
			/* signal CS already finished */
			dma_fence_put(sig_fence);
			hl_fence_put(sig_fence);
			rc = 0;
			goto free_signal_seq_array;
		}
@@ -902,7 +898,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
	rc = allocate_cs(hdev, ctx, cs_type, &cs);
	if (rc) {
		if (cs_type == CS_TYPE_WAIT)
			dma_fence_put(sig_fence);
			hl_fence_put(sig_fence);
		hl_ctx_put(ctx);
		goto free_signal_seq_array;
	}
@@ -1162,7 +1158,7 @@ out:
static long _hl_cs_wait_ioctl(struct hl_device *hdev,
		struct hl_ctx *ctx, u64 timeout_us, u64 seq)
{
	struct dma_fence *fence;
	struct hl_fence *fence;
	unsigned long timeout;
	long rc;

@@ -1181,12 +1177,15 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
				"Can't wait on CS %llu because current CS is at seq %llu\n",
				seq, ctx->cs_sequence);
	} else if (fence) {
		rc = dma_fence_wait_timeout(fence, true, timeout);
		rc = wait_for_completion_interruptible_timeout(
				&fence->completion, timeout);

		if (fence->error == -ETIMEDOUT)
			rc = -ETIMEDOUT;
		else if (fence->error == -EIO)
			rc = -EIO;
		dma_fence_put(fence);

		hl_fence_put(fence);
	} else {
		dev_dbg(hdev->dev,
			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+7 −6
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
	 */

	for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
		dma_fence_put(ctx->cs_pending[i]);
		hl_fence_put(ctx->cs_pending[i]);

	kfree(ctx->cs_pending);

@@ -128,7 +128,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
	atomic_set(&ctx->thread_ctx_switch_token, 1);
	ctx->thread_ctx_switch_wait_token = 0;
	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
				sizeof(struct dma_fence *),
				sizeof(struct hl_fence *),
				GFP_KERNEL);
	if (!ctx->cs_pending)
		return -ENOMEM;
@@ -184,10 +184,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
	return kref_put(&ctx->refcount, hl_ctx_do_release);
}

struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
	struct dma_fence *fence;
	struct hl_fence *fence;

	spin_lock(&ctx->cs_lock);

@@ -201,8 +201,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
		return NULL;
	}

	fence = dma_fence_get(
			ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
	fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
	hl_fence_get(fence);

	spin_unlock(&ctx->cs_lock);

	return fence;
+22 −8
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
#include <linux/dma-fence.h>
#include <linux/dma-direction.h>
#include <linux/scatterlist.h>
#include <linux/hashtable.h>
@@ -342,9 +341,22 @@ struct asic_fixed_properties {
	u8				completion_queues_count;
};

/**
 * struct hl_fence - software synchronization primitive
 * @completion: fence is implemented using completion
 * @refcount: refcount for this fence
 * @error: mark this fence with error
 *
 */
struct hl_fence {
	struct completion	completion;
	struct kref		refcount;
	int			error;
};

/**
 * struct hl_cs_compl - command submission completion object.
 * @base_fence: kernel fence object.
 * @base_fence: hl fence object.
 * @lock: spinlock to protect fence.
 * @hdev: habanalabs device structure.
 * @hw_sob: the H/W SOB used in this signal/wait CS.
@@ -353,7 +365,7 @@ struct asic_fixed_properties {
 * @sob_val: the SOB value that is used in this signal/wait CS.
 */
struct hl_cs_compl {
	struct dma_fence	base_fence;
	struct hl_fence		base_fence;
	spinlock_t		lock;
	struct hl_device	*hdev;
	struct hl_hw_sob	*hw_sob;
@@ -800,7 +812,7 @@ struct hl_va_range {
 * @hdev: pointer to the device structure.
 * @refcount: reference counter for the context. Context is released only when
 *		this hits 0l. It is incremented on CS and CS_WAIT.
 * @cs_pending: array of DMA fence objects representing pending CS.
 * @cs_pending: array of hl fence objects representing pending CS.
 * @host_va_range: holds available virtual addresses for host mappings.
 * @host_huge_va_range: holds available virtual addresses for host mappings
 *                      with huge pages.
@@ -832,7 +844,7 @@ struct hl_ctx {
	struct hl_fpriv		*hpriv;
	struct hl_device	*hdev;
	struct kref		refcount;
	struct dma_fence	**cs_pending;
	struct hl_fence		**cs_pending;
	struct hl_va_range	*host_va_range;
	struct hl_va_range	*host_huge_va_range;
	struct hl_va_range	*dram_va_range;
@@ -919,8 +931,8 @@ struct hl_cs {
	struct list_head	job_list;
	spinlock_t		job_lock;
	struct kref		refcount;
	struct dma_fence	*fence;
	struct dma_fence	*signal_fence;
	struct hl_fence		*fence;
	struct hl_fence		*signal_fence;
	struct work_struct	finish_work;
	struct delayed_work	work_tdr;
	struct list_head	mirror_node;
@@ -1739,7 +1751,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
void hl_ctx_do_release(struct kref *ref);
void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx);
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);

@@ -1781,6 +1793,8 @@ void hl_cs_rollback_all(struct hl_device *hdev);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);

void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
+1 −1
Original line number Diff line number Diff line
@@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
		 * wait CS was submitted.
		 */
		mb();
		dma_fence_put(cs->signal_fence);
		hl_fence_put(cs->signal_fence);
		cs->signal_fence = NULL;
	}
}