Commit b78cda79 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.5:

- MMU code improvements that include:
  - Distinguish between "normal" unmapping and unmapping that is done as
    part of the tear-down of a user process. This improves unmapping
    performance during device reset.
  - Add future ASIC support in generic MMU code.

- Improve device reset code by adding more protection around accessing the
  device during the reset process.

- Add new H/W queue type for future ASIC support

- Add more information to be retrieved by users through the INFO IOCTL:
  - clock rate
  - board name
  - reset counters (see the user-space sketch after this list)

- Small bug fixes and minor improvements to code.
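
For the INFO IOCTL additions, a minimal user-space sketch of reading the new reset counters. The opcode and struct names (HL_INFO_RESET_COUNT, struct hl_info_reset_count, struct hl_info_args) follow the series' additions to include/uapi/misc/habanalabs.h; the exact field layout should be verified against that header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>

/* fd is an open file descriptor on /dev/hl<N> */
static int read_reset_counters(int fd)
{
	struct hl_info_reset_count reset_count;
	struct hl_info_args args;

	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_RESET_COUNT;
	args.return_pointer = (uint64_t) (uintptr_t) &reset_count;
	args.return_size = sizeof(reset_count);

	if (ioctl(fd, HL_IOCTL_INFO, &args))
		return -1;

	printf("hard resets: %u, soft resets: %u\n",
		reset_count.hard_reset_cnt, reset_count.soft_reset_cnt);
	return 0;
}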

* tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux: (31 commits)
  habanalabs: add more protection of device during reset
  habanalabs: flush EQ workers in hard reset
  habanalabs: make the reset code more consistent
  habanalabs: expose reset counters via existing INFO IOCTL
  habanalabs: make code more concise
  habanalabs: use defines for F/W files
  habanalabs: remove prints on successful device initialization
  habanalabs: remove unnecessary checks
  habanalabs: invalidate MMU cache only once
  habanalabs: skip VA block list update in reset flow
  habanalabs: optimize MMU unmap
  habanalabs: prevent read/write from/to the device during hard reset
  habanalabs: split MMU properties to PCI/DRAM
  habanalabs: re-factor MMU masks and documentation
  habanalabs: type specific MMU cache invalidation
  habanalabs: re-factor memory module code
  habanalabs: export uapi defines to user-space
  habanalabs: don't print error when queues are full
  habanalabs: increase max jobs number to 512
  habanalabs: set ETR as non-secured
  ...
parents 599ea01c 5feccddc
drivers/misc/habanalabs/command_submission.c (+87 −40)
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	/*
+	 * Patched CB is created for external queues jobs, and for H/W queues
+	 * jobs if the user CB was allocated by driver and MMU is disabled.
+	 */
+	return (job->queue_type == QUEUE_TYPE_EXT ||
+			(job->queue_type == QUEUE_TYPE_HW &&
+					job->is_kernel_allocated_cb &&
+					!hdev->mmu_enable));
+}
+
 /*
  * cs_parser - parse the user command submission
  *
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	parser.patched_cb = NULL;
 	parser.user_cb = job->user_cb;
 	parser.user_cb_size = job->user_cb_size;
-	parser.ext_queue = job->ext_queue;
+	parser.queue_type = job->queue_type;
+	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
 	job->patched_cb = NULL;
 
 	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
-	if (job->ext_queue) {
+
+	if (is_cb_patched(hdev, job)) {
 		if (!rc) {
 			job->patched_cb = parser.patched_cb;
 			job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job)) {
 		hl_userptr_delete_list(hdev, &job->userptr_list);
 
 		/*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 		}
 	}
 
+	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+	 * enabled, the user CB isn't released in cs_parser() and thus should be
+	 * released here.
+	 */
+	if (job->queue_type == QUEUE_TYPE_HW &&
+			job->is_kernel_allocated_cb && hdev->mmu_enable) {
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+
+		hl_cb_put(job->user_cb);
+	}
+
 	/*
 	 * This is the only place where there can be multiple threads
 	 * modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
 	hl_debugfs_remove_job(hdev, job);
 
-	if (job->ext_queue)
+	if (job->queue_type == QUEUE_TYPE_EXT ||
+			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
 	kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
 	free_job(hdev, job);
 }
 
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
-					struct hl_cb_mgr *cb_mgr,
-					struct hl_cs_chunk *chunk,
-					bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+				struct hl_cs_chunk *chunk,
+				enum hl_queue_type *queue_type,
+				bool *is_kernel_allocated_cb)
 {
 	struct asic_fixed_properties *asic = &hdev->asic_prop;
 	struct hw_queue_properties *hw_queue_prop;
-	u32 cb_handle;
-	struct hl_cb *cb;
-
-	/* Assume external queue */
-	*ext_queue = true;
 
 	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
 
@@ -406,20 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
 		dev_err(hdev->dev, "Queue index %d is invalid\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
 	if (hw_queue_prop->driver_only) {
 		dev_err(hdev->dev,
 			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
-		return NULL;
-	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
-		*ext_queue = false;
-		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
+		return -EINVAL;
 	}
 
-	/* Retrieve CB object */
+	*queue_type = hw_queue_prop->type;
+	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+	return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk)
+{
+	struct hl_cb *cb;
+	u32 cb_handle;
+
 	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
 
 	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -444,7 +476,8 @@ release_cb:
 	return NULL;
 }
 
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
 {
 	struct hl_cs_job *job;
 
@@ -452,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
 	if (!job)
 		return NULL;
 
-	job->ext_queue = ext_queue;
+	job->queue_type = queue_type;
+	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job))
 		INIT_LIST_HEAD(&job->userptr_list);
+
+	if (job->queue_type == QUEUE_TYPE_EXT)
 		INIT_WORK(&job->finish_work, job_wq_completion);
-	}
 
 	return job;
 }
@@ -470,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs_job *job;
 	struct hl_cs *cs;
 	struct hl_cb *cb;
-	bool ext_queue_present = false;
+	bool int_queues_only = true;
 	u32 size_to_copy;
 	int rc, i, parse_cnt;
 
@@ -514,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	/* Validate ALL the CS chunks before submitting the CS */
 	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
 		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
-		bool ext_queue;
+		enum hl_queue_type queue_type;
+		bool is_kernel_allocated_cb;
 
-		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
-					&ext_queue);
-		if (ext_queue) {
-			ext_queue_present = true;
+		rc = validate_queue_index(hdev, chunk, &queue_type,
+						&is_kernel_allocated_cb);
+		if (rc)
+			goto free_cs_object;
+
+		if (is_kernel_allocated_cb) {
+			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
 			if (!cb) {
 				rc = -EINVAL;
 				goto free_cs_object;
 			}
+		} else {
+			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 		}
 
-		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+			int_queues_only = false;
+
+		job = hl_cs_allocate_job(hdev, queue_type,
+						is_kernel_allocated_cb);
 		if (!job) {
 			dev_err(hdev->dev, "Failed to allocate a new job\n");
 			rc = -ENOMEM;
-			if (ext_queue)
+			if (is_kernel_allocated_cb)
 				goto release_cb;
 			else
 				goto free_cs_object;
@@ -540,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		job->cs = cs;
 		job->user_cb = cb;
 		job->user_cb_size = chunk->cb_size;
-		if (job->ext_queue)
+		if (is_kernel_allocated_cb)
 			job->job_cb_size = cb->size;
 		else
 			job->job_cb_size = chunk->cb_size;
@@ -553,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		/*
 		 * Increment CS reference. When CS reference is 0, CS is
 		 * done and can be signaled to user and free all its resources
-		 * Only increment for JOB on external queues, because only
-		 * for those JOBs we get completion
+		 * Only increment for JOB on external or H/W queues, because
+		 * only for those JOBs we get completion
 		 */
-		if (job->ext_queue)
+		if (job->queue_type == QUEUE_TYPE_EXT ||
+				job->queue_type == QUEUE_TYPE_HW)
 			cs_get(cs);
 
 		hl_debugfs_add_job(hdev, job);
@@ -570,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		}
 	}
 
-	if (!ext_queue_present) {
+	if (int_queues_only) {
 		dev_err(hdev->dev,
-			"Reject CS %d.%llu because no external queues jobs\n",
+			"Reject CS %d.%llu because only internal queues jobs are present\n",
 			cs->ctx->asid, cs->sequence);
 		rc = -EINVAL;
 		goto free_cs_object;
@@ -580,6 +626,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to submit CS %d.%llu to H/W queues, error %d\n",
-			cs->ctx->asid, cs->sequence, rc);
+		if (rc != -EAGAIN)
+			dev_err(hdev->dev,
+				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
+				cs->ctx->asid, cs->sequence, rc);
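
The command_submission.c changes above replace the single ext_queue flag with an explicit queue type plus an is_kernel_allocated_cb flag. A condensed restatement of the resulting rules (hypothetical helper for illustration, not driver code):

/*
 * Per queue type, as encoded by the refactor above:
 *   QUEUE_TYPE_EXT - CB looked up via get_cb_from_cs_chunk(), job gets a
 *                    completion
 *   QUEUE_TYPE_HW  - CB source depends on requires_kernel_cb, job gets a
 *                    completion
 *   QUEUE_TYPE_INT - chunk->cb_handle is used as an address, no completion
 */
static bool job_gets_completion(enum hl_queue_type type)
{
	/* only these two types take a CS reference (cs_get()/cs_put()) */
	return type == QUEUE_TYPE_EXT || type == QUEUE_TYPE_HW;
}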
drivers/misc/habanalabs/debugfs.c (+80 −32)
@@ -307,45 +307,57 @@ static inline u64 get_hop0_addr(struct hl_ctx *ctx)
 			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
 }
 
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
+					u64 virt_addr, u64 mask, u64 shift)
 {
 	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
+			((virt_addr & mask) >> shift);
 }
 
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop0_mask,
+					mmu_specs->hop0_shift);
 }
 
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop1_mask,
+					mmu_specs->hop1_shift);
 }
 
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop2_mask,
+					mmu_specs->hop2_shift);
 }
 
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-		u64 virt_addr)
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop3_mask,
+					mmu_specs->hop3_shift);
+}
+
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop4_mask,
+					mmu_specs->hop4_shift);
 }
 
 static inline u64 get_next_hop_addr(u64 curr_pte)
 {
 	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & PHYS_ADDR_MASK;
+		return curr_pte & HOP_PHYS_ADDR_MASK;
 	else
 		return ULLONG_MAX;
 }
@@ -355,7 +367,10 @@ static int mmu_show(struct seq_file *s, void *data)
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	struct hl_ctx *ctx;
+	bool is_dram_addr;
 
 	u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
 		hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
@@ -377,33 +392,39 @@ static int mmu_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* the following lookup is copied from unmap() in mmu.c */
 
 	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
 	hop0_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
 	hop1_addr = get_next_hop_addr(hop0_pte);
 
 	if (hop1_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
 	hop1_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
 	hop2_addr = get_next_hop_addr(hop1_pte);
 
 	if (hop2_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
 	hop2_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
 	hop3_addr = get_next_hop_addr(hop2_pte);
 
 	if (hop3_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 	hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
 
 	if (!(hop3_pte & LAST_MASK)) {
@@ -412,7 +433,8 @@ static int mmu_show(struct seq_file *s, void *data)
 		if (hop4_addr == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
 		hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
 		if (!(hop4_pte & PAGE_PRESENT_MASK))
 			goto not_mapped;
@@ -506,6 +528,12 @@ static int engines_show(struct seq_file *s, void *data)
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev,
+				"Can't check device idle during reset\n");
+		return 0;
+	}
+
 	hdev->asic_funcs->is_device_idle(hdev, NULL, s);
 
 	return 0;
@@ -534,41 +562,50 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
 	struct hl_ctx *ctx = hdev->compute_ctx;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 hop_addr, hop_pte_addr, hop_pte;
-	u64 offset_mask = HOP4_MASK | OFFSET_MASK;
+	u64 offset_mask = HOP4_MASK | FLAGS_MASK;
 	int rc = 0;
+	bool is_dram_addr;
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return -EINVAL;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+				prop->va_space_dram_start_address,
+				prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* hop 0 */
 	hop_addr = get_hop0_addr(ctx);
-	hop_pte_addr = get_hop0_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 1 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop1_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 2 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop2_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 3 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop3_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	if (!(hop_pte & LAST_MASK)) {
@@ -576,10 +613,11 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 		hop_addr = get_next_hop_addr(hop_pte);
 		if (hop_addr == ULLONG_MAX)
 			goto not_mapped;
-		hop_pte_addr = get_hop4_pte_addr(ctx, hop_addr, virt_addr);
+		hop_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop_addr,
							virt_addr);
 		hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
-		offset_mask = OFFSET_MASK;
+		offset_mask = FLAGS_MASK;
 	}
 
 	if (!(hop_pte & PAGE_PRESENT_MASK))
@@ -608,6 +646,11 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	u32 val;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
+		return 0;
+	}
+
 	if (*ppos)
 		return 0;
 
@@ -637,6 +680,11 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	u32 value;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
+		return 0;
+	}
+
 	rc = kstrtouint_from_user(buf, count, 16, &value);
 	if (rc)
 		return rc;
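
The debugfs walkers above now take their masks and shifts from struct hl_mmu_properties, selected per address (DRAM vs. host). Purely as an illustration of what the split enables, the unrolled walk could be written as a table-driven loop; this is a sketch only (the driver keeps the unrolled form, and testing LAST_MASK at every level is a simplification):

static u64 walk_hops_sketch(struct hl_ctx *ctx,
			struct hl_mmu_properties *mmu_prop, u64 virt_addr)
{
	struct hl_device *hdev = ctx->hdev;
	u64 masks[] = { mmu_prop->hop0_mask, mmu_prop->hop1_mask,
			mmu_prop->hop2_mask, mmu_prop->hop3_mask,
			mmu_prop->hop4_mask };
	u64 shifts[] = { mmu_prop->hop0_shift, mmu_prop->hop1_shift,
			mmu_prop->hop2_shift, mmu_prop->hop3_shift,
			mmu_prop->hop4_shift };
	u64 hop_addr = get_hop0_addr(ctx), pte_addr, pte = 0;
	int i;

	for (i = 0 ; i < 5 ; i++) {
		if (hop_addr == ULLONG_MAX)
			return 0;			/* not mapped */
		pte_addr = get_hopN_pte_addr(ctx, hop_addr, virt_addr,
						masks[i], shifts[i]);
		pte = hdev->asic_funcs->read_pte(hdev, pte_addr);
		if (pte & LAST_MASK)			/* last-level PTE */
			break;
		hop_addr = get_next_hop_addr(pte);
	}

	return (pte & PAGE_PRESENT_MASK) ? pte : 0;
}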
drivers/misc/habanalabs/device.c (+11 −7)
@@ -42,12 +42,10 @@ static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
-	struct hl_ctx *ctx;
 
 	hpriv = container_of(ref, struct hl_fpriv, refcount);
 
 	hdev = hpriv->hdev;
-	ctx = hpriv->ctx;
 
 	put_pid(hpriv->taskpid);
 
@@ -889,13 +887,19 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
-	/* Kill processes here after CS rollback. This is because the process
-	 * can't really exit until all its CSs are done, which is what we
-	 * do in cs rollback
-	 */
-	if (from_hard_reset_thread)
+	if (hard_reset) {
+		/* Kill processes here after CS rollback. This is because the
+		 * process can't really exit until all its CSs are done, which
+		 * is what we do in cs rollback
+		 */
 		device_kill_open_processes(hdev);
 
+		/* Flush the Event queue workers to make sure no other thread is
+		 * reading or writing to registers during the reset
+		 */
+		flush_workqueue(hdev->eq_wq);
+	}
+
 	/* Release kernel context */
 	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;
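
The device.c hunk pairs with the debugfs guards above: hard reset now kills user processes and flushes the event-queue workers, and every register-touching entry point bails out while a reset is in flight. The guard itself, as used in the hunks above (helper name and message string are illustrative, not driver code):

static bool hl_device_accessible(struct hl_device *hdev)
{
	if (atomic_read(&hdev->in_reset)) {
		dev_warn_ratelimited(hdev->dev,
				"Can't access device during reset\n");
		return false;	/* skip the access instead of racing reset */
	}

	return true;
}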
drivers/misc/habanalabs/firmware_if.c (+1 −4)
@@ -143,10 +143,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
 			sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
 
 	if (!rc) {
-		if (result == ARMCP_PACKET_FENCE_VAL)
-			dev_info(hdev->dev,
-				"queue test on CPU queue succeeded\n");
-		else
+		if (result != ARMCP_PACKET_FENCE_VAL)
 			dev_err(hdev->dev,
 				"CPU queue test failed (0x%08lX)\n", result);
 	} else {
drivers/misc/habanalabs/goya/goya.c (+53 −25)
@@ -72,6 +72,9 @@
  *
  */
 
+#define GOYA_UBOOT_FW_FILE	"habanalabs/goya/goya-u-boot.bin"
+#define GOYA_LINUX_FW_FILE	"habanalabs/goya/goya-fit.itb"
+
 #define GOYA_MMU_REGS_NUM		63
 
 #define GOYA_DMA_POOL_BLK_SIZE		0x100		/* 256 bytes */
@@ -337,17 +340,20 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 1;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
 		prop->hw_queues_props[i].driver_only = 1;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
 			NUMBER_OF_INT_HW_QUEUES; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < HL_MAX_QUEUES; i++)
@@ -377,6 +383,23 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 
+	prop->dmmu.hop0_shift = HOP0_SHIFT;
+	prop->dmmu.hop1_shift = HOP1_SHIFT;
+	prop->dmmu.hop2_shift = HOP2_SHIFT;
+	prop->dmmu.hop3_shift = HOP3_SHIFT;
+	prop->dmmu.hop4_shift = HOP4_SHIFT;
+	prop->dmmu.hop0_mask = HOP0_MASK;
+	prop->dmmu.hop1_mask = HOP1_MASK;
+	prop->dmmu.hop2_mask = HOP2_MASK;
+	prop->dmmu.hop3_mask = HOP3_MASK;
+	prop->dmmu.hop4_mask = HOP4_MASK;
+	prop->dmmu.huge_page_size = PAGE_SIZE_2MB;
+
+	/* No difference between PMMU and DMMU except of page size */
+	memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
+	prop->dmmu.page_size = PAGE_SIZE_2MB;
+	prop->pmmu.page_size = PAGE_SIZE_4KB;
+
 	prop->va_space_host_start_address = VA_HOST_SPACE_START;
 	prop->va_space_host_end_address = VA_HOST_SPACE_END;
 	prop->va_space_dram_start_address = VA_DDR_SPACE_START;
@@ -393,6 +416,9 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
 	prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
 	prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
+
+	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
+		CARD_NAME_MAX_LEN);
 }
 
 /*
@@ -1454,6 +1480,9 @@ static void goya_init_golden_registers(struct hl_device *hdev)
 				1 << TPC0_NRTR_SCRAMB_EN_VAL_SHIFT);
 		WREG32(mmTPC0_NRTR_NON_LIN_SCRAMB + offset,
 				1 << TPC0_NRTR_NON_LIN_SCRAMB_EN_SHIFT);
+
+		WREG32_FIELD(TPC0_CFG_MSS_CONFIG, offset,
+				ICACHE_FETCH_LINE_NUM, 2);
 	}
 
 	WREG32(mmDMA_NRTR_SCRAMB_EN, 1 << DMA_NRTR_SCRAMB_EN_VAL_SHIFT);
@@ -1533,7 +1562,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	u32 mtr_base_lo, mtr_base_hi;
 	u32 so_base_lo, so_base_hi;
 	u32 gic_base_lo, gic_base_hi;
-	u64 qman_base_addr;
 
 	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
 	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
@@ -1545,9 +1573,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	gic_base_hi =
 		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
 
-	qman_base_addr = hdev->asic_prop.sram_base_address +
-				MME_QMAN_BASE_OFFSET;
-
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO,	so_base_lo);
@@ -2141,13 +2166,11 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
  */
 static int goya_push_uboot_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
 	dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_UBOOT_FW_FILE, dst);
 }
 
 /*
@@ -2160,13 +2183,11 @@ static int goya_push_uboot_to_device(struct hl_device *hdev)
  */
 static int goya_push_linux_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
 	dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_LINUX_FW_FILE, dst);
 }
 
 static int goya_pldm_init_cpu(struct hl_device *hdev)
@@ -2291,6 +2312,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		10000,
 		cpu_timeout);
 
+	/* Read U-Boot version now in case we will later fail */
+	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
+	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
+
 	if (rc) {
 		dev_err(hdev->dev, "Error in ARM u-boot!");
 		switch (status) {
@@ -2328,6 +2353,11 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 				"ARM status %d - u-boot stopped by user\n",
 				status);
 			break;
+		case CPU_BOOT_STATUS_TS_INIT_FAIL:
+			dev_err(hdev->dev,
+				"ARM status %d - Thermal Sensor initialization failed\n",
+				status);
+			break;
 		default:
 			dev_err(hdev->dev,
 				"ARM status %d - Invalid status code\n",
@@ -2337,10 +2367,6 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		return -EIO;
 	}
 
-	/* Read U-Boot version now in case we will later fail */
-	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
-	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
-
 	if (!hdev->fw_loading) {
 		dev_info(hdev->dev, "Skip loading FW\n");
 		goto out;
@@ -2453,7 +2479,8 @@ int goya_mmu_init(struct hl_device *hdev)
 	WREG32_AND(mmSTLB_STLB_FEATURE_EN,
 			(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+					VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
 
 	WREG32(mmMMU_MMU_ENABLE, 1);
 	WREG32(mmMMU_SPI_MASK, 0xF);
@@ -2978,9 +3005,6 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
 			"H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
 			hw_queue_id, (unsigned long long) fence_dma_addr, tmp);
 		rc = -EIO;
-	} else {
-		dev_info(hdev->dev, "queue test on H/W queue %d succeeded\n",
-			hw_queue_id);
 	}
 
 free_pkt:
@@ -3925,7 +3949,7 @@ static int goya_parse_cb_no_ext_queue(struct hl_device *hdev,
 		return 0;
 
 	dev_err(hdev->dev,
-		"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
+		"Internal CB address 0x%px + 0x%x is not in SRAM nor in DRAM\n",
 		parser->user_cb, parser->user_cb_size);
 
 	return -EFAULT;
@@ -3935,7 +3959,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (!parser->ext_queue)
+	if (parser->queue_type == QUEUE_TYPE_INT)
 		return goya_parse_cb_no_ext_queue(hdev, parser);
 
 	if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4606,7 +4630,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 		lin_dma_pkt++;
 	} while (--lin_dma_pkts_cnt);
 
-	job = hl_cs_allocate_job(hdev, true);
+	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
 	if (!job) {
 		dev_err(hdev->dev, "Failed to allocate a new job\n");
 		rc = -ENOMEM;
@@ -4835,13 +4859,15 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
 		goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
 }
 
-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
+static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+					u32 flags)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 status, timeout_usec;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */
@@ -4880,7 +4906,8 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 	u32 status, timeout_usec, inv_data, pi;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */
@@ -5137,7 +5164,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.init_iatu = goya_init_iatu,
 	.rreg = hl_rreg,
 	.wreg = hl_wreg,
-	.halt_coresight = goya_halt_coresight
+	.halt_coresight = goya_halt_coresight,
+	.get_clk_rate = goya_get_clk_rate
 };
 
 /*
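
The last goya.c hunk wires up the new get_clk_rate ASIC callback, which backs the INFO IOCTL's clock-rate query mentioned in the tag message. A simplified sketch of the consuming side (HL_INFO_CLK_RATE and struct hl_info_clk_rate follow the series' uapi; the handler body is an assumption, not verbatim driver code):

static int clk_rate_info(struct hl_device *hdev, struct hl_info_args *args)
{
	struct hl_info_clk_rate clk_rate = {0};
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
	u32 size = min_t(u32, args->return_size, sizeof(clk_rate));
	int rc;

	/* for Goya this lands in goya_get_clk_rate(), added by this series */
	rc = hdev->asic_funcs->get_clk_rate(hdev, &clk_rate.cur_clk_rate_mhz,
					&clk_rate.max_clk_rate_mhz);
	if (rc)
		return rc;

	return copy_to_user(out, &clk_rate, size) ? -EFAULT : 0;
}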