Commit 48f05f29 authored by Monk Liu, committed by Alex Deucher

amd/scheduler: implement job skip feature (v3)



jobs are skipped in two cases:

1) when the entity behind the job is marked guilty, the job
popped from that entity's queue is dropped in the sched_main loop.

2) in job_recovery(), a job is skipped if its karma exceeds the
limit, and all other jobs sharing the same fence context are
skipped as well. this approach is needed because job_recovery()
cannot access job->entity, since the entity may already be dead.

v2:
some logic fixes

v3:
when an entity is detected as guilty, don't drop the job at the
popping stage; instead, set its fence error to -ECANCELED

in run_job(), skip the scheduling if either: 1) fence->error < 0,
or 2) a VRAM loss occurred on this job.
this way the job skipping logic is unified.

with this feature we can introduce a new GPU recovery feature.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3a393cf9
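Before the diffs, it may help to see the unified skip path in isolation: the job's finished fence gets tagged with -ECANCELED (at pop time for a guilty entity, or at run time on VRAM loss), and run_job() then treats any negative fence error as "skip". The following is a minimal user-space sketch of that flow, not driver code; the stand-in fence/job structs, counter values, and messages are illustrative only:

#include <errno.h>
#include <stdio.h>

struct fence { int error; };            /* stand-in for struct dma_fence */

struct job {
	struct fence finished;          /* stand-in for s_fence->finished */
	unsigned int vram_lost_counter; /* snapshot taken at submit time  */
};

static unsigned int adev_vram_lost_counter = 1;  /* device lost VRAM once */

/* like dma_fence_set_error(): only the first error sticks */
static void fence_set_error(struct fence *f, int error)
{
	if (!f->error)
		f->error = error;
}

/* mirrors the shape of the reworked amdgpu_job_run(): one check covers
 * both the guilty-entity case (error set at pop time) and VRAM loss */
static void run_job(struct job *job)
{
	if (job->vram_lost_counter != adev_vram_lost_counter)
		fence_set_error(&job->finished, -ECANCELED);

	if (job->finished.error < 0)
		printf("Skip scheduling IBs!\n");
	else
		printf("submitting IBs\n");
}

int main(void)
{
	struct job stale = { .vram_lost_counter = 0 };  /* predates VRAM loss */
	struct job fresh = { .vram_lost_counter = 1 };

	run_job(&stale);  /* skipped */
	run_job(&fresh);  /* runs normally */
	return 0;
}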
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  +8 −5
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
 
 static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 {
-	struct dma_fence *fence = NULL;
+	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_device *adev;
 	struct amdgpu_job *job;
 	int r;
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 		return NULL;
 	}
 	job = to_amdgpu_job(sched_job);
+	finished = &job->base.s_fence->finished;
 	adev = job->adev;
 
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 
 	trace_amdgpu_sched_run_job(job);
-	/* skip ib schedule when vram is lost */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
-		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-		DRM_ERROR("Skip scheduling IBs!\n");
+
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+		dma_fence_set_error(finished, -ECANCELED); /* skip IB as well if VRAM lost */
+
+	if (finished->error < 0) {
+		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
 				       &fence);
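The scheduler changes below combine two small mechanisms: a per-job karma counter bumped whenever the job is implicated in a hang, and an optional shared guilty flag on the owning entity. Here is a hedged user-space sketch of their interplay, with C11 atomics standing in for the kernel's atomic_t and a made-up HANG_LIMIT:

#include <stdatomic.h>
#include <stdio.h>

#define HANG_LIMIT 1    /* illustrative stand-in for sched->hang_limit */

struct sched_job    { atomic_int karma; };
struct sched_entity { atomic_int *guilty; };  /* optional; may be NULL */

/* same shape as the logic now inlined in amd_sched_hw_job_reset():
 * bump the bad job's karma and, once past the limit, mark the owning
 * entity guilty (if it provided a guilty flag) */
static void punish(struct sched_job *bad, struct sched_entity *owner)
{
	if (atomic_fetch_add(&bad->karma, 1) + 1 > HANG_LIMIT)
		if (owner->guilty)
			atomic_store(owner->guilty, 1);
}

int main(void)
{
	atomic_int guilty = 0;
	struct sched_entity entity = { .guilty = &guilty };
	struct sched_job job;

	atomic_init(&job.karma, 0);

	punish(&job, &entity);  /* karma 1: at the limit, entity still innocent */
	punish(&job, &entity);  /* karma 2: past the limit, entity marked guilty */

	/* amd_sched_entity_pop_job() would now cancel every job it pops */
	if (atomic_load(entity.guilty))
		printf("popped jobs get their fence error set to -ECANCELED\n");
	return 0;
}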
drivers/gpu/drm/amd/scheduler/gpu_scheduler.c  +23 −16
@@ -345,6 +345,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	/* skip jobs from an entity marked guilty */
+	if (entity->guilty && atomic_read(entity->guilty))
+		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
 	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
@@ -441,14 +445,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
-				 struct amd_sched_entity *s_entity)
-{
-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-		if (s_entity->guilty)
-			atomic_set(s_entity->guilty, 1);
-}
-
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
@@ -468,21 +464,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 	spin_unlock(&sched->job_list_lock);
 
 	if (bad) {
-		bool found = false;
-
-		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) {
+		/* don't increase @bad's karma if it's from KERNEL RQ,
+		 * because a GPU hang can corrupt kernel jobs (like VM updating
+		 * jobs), but keep in mind that kernel jobs are always considered good.
+		 */
+		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
 			struct amd_sched_rq *rq = &sched->sched_rq[i];
 
 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context == entity->fence_context) {
-					found = true;
-					amd_sched_set_guilty(bad, entity);
+					if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
 			spin_unlock(&rq->lock);
-			if (found)
+			if (&entity->list != &rq->entities)
 				break;
 		}
 	}
@@ -500,6 +499,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;
 
 	spin_lock(&sched->job_list_lock);
@@ -511,6 +511,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
@@ -526,7 +535,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -664,7 +672,6 @@ static int amd_sched_main(void *param)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}