Commit c16d45f4 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay
Browse files

habanalabs: Use pending CS amount per ASIC



Training schemes requires much more concurrent command submissions than
inference does. In addition, training command submissions can be completed
in a non serialized manner. Hence, we add support in which each ASIC will
be able to configure the amount of concurrent pending command submissions,
rather than use a predefined amount. This change will enhance performance
by allowing the user to add more concurrent work without waiting for the
previous work to be completed.

Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent 0b168c8f
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -418,7 +418,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
	spin_lock(&ctx->cs_lock);

	cs_cmpl->cs_seq = ctx->cs_sequence;
	other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
	other = ctx->cs_pending[cs_cmpl->cs_seq &
				(hdev->asic_prop.max_pending_cs - 1)];
	if ((other) && (!dma_fence_is_signaled(other))) {
		spin_unlock(&ctx->cs_lock);
		dev_dbg(hdev->dev,
@@ -432,7 +433,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,

	cs->sequence = cs_cmpl->cs_seq;

	ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
	ctx->cs_pending[cs_cmpl->cs_seq &
			(hdev->asic_prop.max_pending_cs - 1)] =
							&cs_cmpl->base_fence;
	ctx->cs_sequence++;

+11 −3
Original line number Diff line number Diff line
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
	 * to this function unless the ref count is 0
	 */

	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
	for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
		dma_fence_put(ctx->cs_pending[i]);

	kfree(ctx->cs_pending);

	if (ctx->asid != HL_KERNEL_ASID_ID) {
		/* The engines are stopped as there is no executing CS, but the
		 * Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
	spin_lock_init(&ctx->cs_lock);
	atomic_set(&ctx->thread_ctx_switch_token, 1);
	ctx->thread_ctx_switch_wait_token = 0;
	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
				sizeof(struct dma_fence *),
				GFP_KERNEL);
	if (!ctx->cs_pending)
		return -ENOMEM;

	if (is_kernel_ctx) {
		ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)

struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
	struct dma_fence *fence;

	spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
		return ERR_PTR(-EINVAL);
	}

	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
	if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
		spin_unlock(&ctx->cs_lock);
		return NULL;
	}

	fence = dma_fence_get(
			ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
			ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
	spin_unlock(&ctx->cs_lock);

	return fence;
+2 −0
Original line number Diff line number Diff line
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
	strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
					CARD_NAME_MAX_LEN);

	prop->max_pending_cs = GAUDI_MAX_PENDING_CS;

	return 0;
}

+6 −0
Original line number Diff line number Diff line
@@ -57,6 +57,12 @@

#define GAUDI_DEFAULT_CARD_NAME		"HL2000"

#define GAUDI_MAX_PENDING_CS		1024

#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
#endif

#define PCI_DMA_NUMBER_OF_CHNLS		3
#define HBM_DMA_NUMBER_OF_CHNLS		5
#define DMA_NUMBER_OF_CHNLS		(PCI_DMA_NUMBER_OF_CHNLS + \
+2 −0
Original line number Diff line number Diff line
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)

	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
		CARD_NAME_MAX_LEN);

	prop->max_pending_cs = GOYA_MAX_PENDING_CS;
}

/*
Loading