Commit 09c34e8d authored by Felix Kuehling's avatar Felix Kuehling Committed by Alex Deucher
Browse files

drm/amdkfd: Improve HWS hang detection and handling



Move HWS hang detection into unmap_queues_cpsch to catch hangs in all
cases. If this happens during a reset, don't schedule another reset
because the reset already in progress is expected to take care of it.

Signed-off-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Tested-by: default avatarEmily Deng <Emily.Deng@amd.com>
Reviewed-by: default avatarshaoyunl <shaoyun.liu@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 63e088ac
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{
	if (!kfd->init_complete)
		return 0;

	kfd->dqm->ops.pre_reset(kfd->dqm);

	kgd2kfd_suspend(kfd);

	kfd_signal_reset_event(kfd);
+21 −6
Original line number Diff line number Diff line
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
	return 0;
}

static void pre_reset(struct device_queue_manager *dqm)
{
	dqm_lock(dqm);
	dqm->is_resetting = true;
	dqm_unlock(dqm);
}

static int allocate_sdma_queue(struct device_queue_manager *dqm,
				struct queue *q)
{
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
	dqm_lock(dqm);
	/* clear hang status when driver try to start the hw scheduler */
	dqm->is_hws_hang = false;
	dqm->is_resetting = false;
	dqm->sched_running = true;
	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
	dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
	/* should be timed out */
	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
				queue_preemption_timeout_ms);
	if (retval)
	if (retval) {
		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
		dqm->is_hws_hang = true;
		/* It's possible we're detecting a HWS hang in the
		 * middle of a GPU reset. No need to schedule another
		 * reset in this case.
		 */
		if (!dqm->is_resetting)
			schedule_work(&dqm->hw_exception_work);
		return retval;
	}

	pm_release_ib(&dqm->packets);
	dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
	if (dqm->is_hws_hang)
		return -EIO;
	retval = unmap_queues_cpsch(dqm, filter, filter_param);
	if (retval) {
		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
		dqm->is_hws_hang = true;
		schedule_work(&dqm->hw_exception_work);
	if (retval)
		return retval;
	}

	return map_queues_cpsch(dqm);
}
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		dqm->ops.initialize = initialize_cpsch;
		dqm->ops.start = start_cpsch;
		dqm->ops.stop = stop_cpsch;
		dqm->ops.pre_reset = pre_reset;
		dqm->ops.destroy_queue = destroy_queue_cpsch;
		dqm->ops.update_queue = update_queue;
		dqm->ops.register_process = register_process;
@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		/* initialize dqm for no cp scheduling */
		dqm->ops.start = start_nocpsch;
		dqm->ops.stop = stop_nocpsch;
		dqm->ops.pre_reset = pre_reset;
		dqm->ops.create_queue = create_queue_nocpsch;
		dqm->ops.destroy_queue = destroy_queue_nocpsch;
		dqm->ops.update_queue = update_queue;
+2 −0
Original line number Diff line number Diff line
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
	int	(*initialize)(struct device_queue_manager *dqm);
	int	(*start)(struct device_queue_manager *dqm);
	int	(*stop)(struct device_queue_manager *dqm);
	void	(*pre_reset)(struct device_queue_manager *dqm);
	void	(*uninitialize)(struct device_queue_manager *dqm);
	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
					struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {

	/* hw exception  */
	bool			is_hws_hang;
	bool			is_resetting;
	struct work_struct	hw_exception_work;
	struct kfd_mem_obj	hiq_sdma_mqd;
	bool			sched_running;