Commit f1403342 authored by Christian König, committed by Alex Deucher
Browse files

drm/amdgpu: revert "fix system hang issue during GPU reset"



The whole approach wasn't thought through till the end.

We already had a reset lock like this in the past and it caused the same problems as this one.

Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary.

This reverts commit df9c8d1a.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 05f39286
Loading
Loading
Loading
Loading
+2 −7
Original line number Diff line number Diff line
@@ -949,9 +949,9 @@ struct amdgpu_device {
	bool                            in_suspend;
	bool				in_hibernate;

	atomic_t                        in_gpu_reset;
	bool                            in_gpu_reset;
	enum pp_mp1_state               mp1_state;
	struct rw_semaphore	reset_sem;
	struct mutex  lock_reset;
	struct amdgpu_doorbell_index doorbell_index;

	struct mutex			notifier_lock;
@@ -1266,9 +1266,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
       return adev->gmc.tmz_enabled;
}

static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->in_gpu_reset) ? true : false;
}

#endif
+3 −37
Original line number Diff line number Diff line
@@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
	if (cp_mqd_gfx9)
		bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;

	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r) {
		dev_err(adev->dev,
			"failed to allocate BO for amdkfd (%d)\n", r);
		goto err;
		return r;
	}

	/* map the buffer */
@@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,

	amdgpu_bo_unreserve(bo);

	up_read(&adev->reset_sem);
	return 0;

allocate_mem_kmap_bo_failed:
@@ -295,25 +291,19 @@ allocate_mem_pin_bo_failed:
	amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed:
	amdgpu_bo_unref(&bo);
err:
	up_read(&adev->reset_sem);

	return r;
}

void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;

	down_read(&adev->reset_sem);

	amdgpu_bo_reserve(bo, true);
	amdgpu_bo_kunmap(bo);
	amdgpu_bo_unpin(bo);
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unref(&(bo));

	up_read(&adev->reset_sem);
}

int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -345,14 +335,9 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,

void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;

	down_read(&adev->reset_sem);

	amdgpu_bo_unref(&bo);

	up_read(&adev->reset_sem);
}

uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -626,15 +611,8 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
	job->vmid = vmid;

	if (!down_read_trylock(&adev->reset_sem)) {
		ret = -EIO;
		goto err_ib_sched;
	}

	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);

	up_read(&adev->reset_sem);

	if (ret) {
		DRM_ERROR("amdgpu: failed to schedule IB.\n");
		goto err_ib_sched;
@@ -670,9 +648,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	if (adev->family == AMDGPU_FAMILY_AI) {
		int i;

@@ -682,8 +657,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
	}

	up_read(&adev->reset_sem);

	return 0;
}

@@ -692,18 +665,11 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	const uint32_t flush_type = 0;
	bool all_hub = false;
	int ret = -EIO;

	if (adev->family == AMDGPU_FAMILY_AI)
		all_hub = true;

	if (down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
					pasid, flush_type, all_hub);
		up_read(&adev->reset_sem);
	}

	return ret;
	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
}

bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
+1 −1
Original line number Diff line number Diff line
@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;

#if 0
+1 −1
Original line number Diff line number Diff line
@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	unsigned long flags, end_jiffies;
	int retry;

	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);
+1 −1
Original line number Diff line number Diff line
@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	int retry;
	struct vi_mqd *m = get_mqd(mqd);

	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);
Loading