Commit 2440ff2c authored by Junwei Zhang's avatar Junwei Zhang Committed by Alex Deucher
Browse files

drm/amdgpu: add timer to fence to detect scheduler lockup



Change-Id: I67e987db0efdca28faa80b332b75571192130d33
Signed-off-by: Junwei Zhang <Jerry.Zhang@amd.com>
Reviewed-by: David Zhou <david1.zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
parent d6c10f6b
Loading
Loading
Loading
Loading
+13 −1
Original line number Diff line number Diff line
@@ -628,8 +628,20 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
	init_waitqueue_head(&ring->fence_drv.fence_queue);

	if (amdgpu_enable_scheduler) {
		long timeout = msecs_to_jiffies(amdgpu_lockup_timeout);
		if (timeout == 0) {
			/*
			 * FIXME:
			 * Delayed workqueue cannot use it directly,
			 * so the scheduler will not use delayed workqueue if
			 * MAX_SCHEDULE_TIMEOUT is set.
			 * Currently keep it simple and silly.
			 */
			timeout = MAX_SCHEDULE_TIMEOUT;
		}
		r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
				   amdgpu_sched_hw_submission, ring->name);
				   amdgpu_sched_hw_submission,
				   timeout, ring->name);
		if (r) {
			DRM_ERROR("Failed to create scheduler on ring %s.\n",
				  ring->name);
+42 −1
Original line number Diff line number Diff line
@@ -327,19 +327,49 @@ static void amd_sched_process_job(struct fence *f, struct fence_cb *cb)
	struct amd_sched_fence *s_fence =
		container_of(cb, struct amd_sched_fence, cb);
	struct amd_gpu_scheduler *sched = s_fence->sched;
	unsigned long flags;

	atomic_dec(&sched->hw_rq_count);
	amd_sched_fence_signal(s_fence);
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
		cancel_delayed_work_sync(&s_fence->dwork);
		spin_lock_irqsave(&sched->fence_list_lock, flags);
		list_del_init(&s_fence->list);
		spin_unlock_irqrestore(&sched->fence_list_lock, flags);
	}
	fence_put(&s_fence->base);
	wake_up_interruptible(&sched->wake_up_worker);
}

/*
 * Timeout watchdog for scheduler fences.
 *
 * Runs as the delayed-work handler armed in amd_sched_main() when the
 * scheduler was initialized with a finite timeout (sched->timeout !=
 * MAX_SCHEDULE_TIMEOUT).  Reaching this function means at least one
 * submitted fence failed to signal within sched->timeout jiffies, i.e.
 * the hardware ring appears locked up.  It reports the lockup and then
 * force-retires every fence still tracked on sched->fence_list.
 *
 * NOTE(review): the list_for_each_entry_safe() walk reads the list
 * without fence_list_lock held; amd_sched_process_job() can run
 * concurrently and list_del_init() entries — the lock is only taken
 * around each individual delete below.  Looks racy; confirm against the
 * full scheduler locking scheme.
 *
 * NOTE(review): when 'entity' is the fence whose dwork fired, this
 * calls cancel_delayed_work_sync() on the currently executing work
 * item; the workqueue core is documented to tolerate self-cancel, but
 * verify this path cannot deadlock.
 */
static void amd_sched_fence_work_func(struct work_struct *work)
{
	/* Recover the owning fence from the embedded delayed_work. */
	struct amd_sched_fence *s_fence =
		container_of(work, struct amd_sched_fence, dwork.work);
	struct amd_gpu_scheduler *sched = s_fence->sched;
	struct amd_sched_fence *entity, *tmp;
	unsigned long flags;

	DRM_ERROR("[%s] scheduler is timeout!\n", sched->name);

	/* Clean all pending fences */
	list_for_each_entry_safe(entity, tmp, &sched->fence_list, list) {
		/* NOTE(review): fence seqno is conventionally unsigned;
		 * %u would be the matching format specifier — confirm
		 * against the fence base struct. */
		DRM_ERROR("  fence no %d\n", entity->base.seqno);
		/* Stop this fence's own watchdog before retiring it. */
		cancel_delayed_work_sync(&entity->dwork);
		spin_lock_irqsave(&sched->fence_list_lock, flags);
		list_del_init(&entity->list);
		spin_unlock_irqrestore(&sched->fence_list_lock, flags);
		/* Drop the reference taken when the job was pushed to hw. */
		fence_put(&entity->base);
	}
}

static int amd_sched_main(void *param)
{
	struct sched_param sparam = {.sched_priority = 1};
	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
	int r, count;

	spin_lock_init(&sched->fence_list_lock);
	INIT_LIST_HEAD(&sched->fence_list);
	sched_setscheduler(current, SCHED_FIFO, &sparam);

	while (!kthread_should_stop()) {
@@ -347,6 +377,7 @@ static int amd_sched_main(void *param)
		struct amd_sched_fence *s_fence;
		struct amd_sched_job *sched_job;
		struct fence *fence;
		unsigned long flags;

		wait_event_interruptible(sched->wake_up_worker,
			kthread_should_stop() ||
@@ -357,6 +388,15 @@ static int amd_sched_main(void *param)

		entity = sched_job->s_entity;
		s_fence = sched_job->s_fence;

		if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
			INIT_DELAYED_WORK(&s_fence->dwork, amd_sched_fence_work_func);
			schedule_delayed_work(&s_fence->dwork, sched->timeout);
			spin_lock_irqsave(&sched->fence_list_lock, flags);
			list_add_tail(&s_fence->list, &sched->fence_list);
			spin_unlock_irqrestore(&sched->fence_list_lock, flags);
		}

		atomic_inc(&sched->hw_rq_count);
		fence = sched->ops->run_job(sched_job);
		if (fence) {
@@ -392,11 +432,12 @@ static int amd_sched_main(void *param)
*/
int amd_sched_init(struct amd_gpu_scheduler *sched,
		   struct amd_sched_backend_ops *ops,
		   unsigned hw_submission, const char *name)
		   unsigned hw_submission, long timeout, const char *name)
{
	sched->ops = ops;
	sched->hw_submission_limit = hw_submission;
	sched->name = name;
	sched->timeout = timeout;
	amd_sched_rq_init(&sched->sched_rq);
	amd_sched_rq_init(&sched->kernel_rq);

+6 −1
Original line number Diff line number Diff line
@@ -68,6 +68,8 @@ struct amd_sched_fence {
	struct amd_gpu_scheduler	*sched;
	spinlock_t			lock;
	void                            *owner;
	struct delayed_work		dwork;
	struct list_head		list;
};

struct amd_sched_job {
@@ -103,18 +105,21 @@ struct amd_sched_backend_ops {
/*
 * One GPU scheduler instance, typically one per hardware ring.
 * Owns two run queues (normal and kernel priority), the worker kthread
 * that drains them, and — when a finite timeout is configured — the
 * bookkeeping used to detect a ring lockup.
 */
struct amd_gpu_scheduler {
	struct amd_sched_backend_ops	*ops;
	/* max jobs allowed in flight on the hardware at once */
	uint32_t			hw_submission_limit;
	/* lockup timeout in jiffies; MAX_SCHEDULE_TIMEOUT disables the
	 * per-fence watchdog (see amd_sched_main / amd_sched_process_job) */
	long				timeout;
	/* ring name, used in error reports */
	const char			*name;
	struct amd_sched_rq		sched_rq;
	struct amd_sched_rq		kernel_rq;
	/* woken when a job completes or new work is queued */
	wait_queue_head_t		wake_up_worker;
	wait_queue_head_t		job_scheduled;
	/* jobs currently submitted to hw: ++ on run_job, -- on signal */
	atomic_t			hw_rq_count;
	/* fences awaiting completion, tracked only when timeout is finite;
	 * initialized in amd_sched_main, drained by the timeout handler */
	struct list_head		fence_list;
	/* protects fence_list add/remove (irqsave — touched from fence cb) */
	spinlock_t			fence_list_lock;
	/* scheduler worker kthread running amd_sched_main */
	struct task_struct		*thread;
};

int amd_sched_init(struct amd_gpu_scheduler *sched,
		   struct amd_sched_backend_ops *ops,
		   uint32_t hw_submission, const char *name);
		   uint32_t hw_submission, long timeout, const char *name);
void amd_sched_fini(struct amd_gpu_scheduler *sched);

int amd_sched_entity_init(struct amd_gpu_scheduler *sched,