Commit df9c8d1a authored by Dennis Li's avatar Dennis Li Committed by Alex Deucher
Browse files

drm/amdgpu: fix system hang issue during GPU reset



when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover,
the atomic adev->in_gpu_reset and hive->in_reset are used to avoid
re-entering GPU recovery.

During GPU reset and resume, it is unsafe that other threads access GPU,
which maybe cause GPU reset failed. Therefore the new rw_semaphore
adev->reset_sem is introduced, which protect GPU from being accessed by
external threads during recovery.

v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
re-enter GPU recovery for the same GPU hang.

v3:
1. change back to use adev->reset_sem to protect kfd callback
functions, because dqm_lock couldn't protect all codes, for example:
free_mqd must be called outside of dqm_lock;

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce atomic hive->in_reset, to avoid
re-enter GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove comment codes in amdgpu_device.c
3. add more detailed comment in commit message
4. define a wrap function amdgpu_in_reset

v5:
1. Fix some style issues.

Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Suggested-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: default avatarChristian König <christian.koenig@amd.com>
Suggested-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Suggested-by: default avatarLijo Lazar <Lijo.Lazar@amd.com>
Suggested-by: default avatarLuben Tukov <luben.tuikov@amd.com>
Signed-off-by: default avatarDennis Li <Dennis.Li@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent c5079f35
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -961,9 +961,9 @@ struct amdgpu_device {
	bool                            in_suspend;
	bool				in_hibernate;

	bool                            in_gpu_reset;
	atomic_t                        in_gpu_reset;
	enum pp_mp1_state               mp1_state;
	struct mutex  lock_reset;
	struct rw_semaphore	reset_sem;
	struct amdgpu_doorbell_index doorbell_index;

	struct mutex			notifier_lock;
@@ -1278,4 +1278,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
       return adev->gmc.tmz_enabled;
}

static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->in_gpu_reset) ? true : false;
}

#endif
+37 −3
Original line number Diff line number Diff line
@@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
	if (cp_mqd_gfx9)
		bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;

	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r) {
		dev_err(adev->dev,
			"failed to allocate BO for amdkfd (%d)\n", r);
		return r;
		goto err;
	}

	/* map the buffer */
@@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,

	amdgpu_bo_unreserve(bo);

	up_read(&adev->reset_sem);
	return 0;

allocate_mem_kmap_bo_failed:
@@ -291,19 +295,25 @@ allocate_mem_pin_bo_failed:
	amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed:
	amdgpu_bo_unref(&bo);

err:
	up_read(&adev->reset_sem);
	return r;
}

void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;

	down_read(&adev->reset_sem);

	amdgpu_bo_reserve(bo, true);
	amdgpu_bo_kunmap(bo);
	amdgpu_bo_unpin(bo);
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unref(&(bo));

	up_read(&adev->reset_sem);
}

int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -335,9 +345,14 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,

void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;

	down_read(&adev->reset_sem);

	amdgpu_bo_unref(&bo);

	up_read(&adev->reset_sem);
}

uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -611,12 +626,19 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
	job->vmid = vmid;

	if (!down_read_trylock(&adev->reset_sem)) {
		ret = -EIO;
		goto err_ib_sched;
	}

	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
	if (ret) {
		DRM_ERROR("amdgpu: failed to schedule IB.\n");
		goto err_ib_sched;
	}

	up_read(&adev->reset_sem);

	ret = dma_fence_wait(f, false);

err_ib_sched:
@@ -647,6 +669,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	if (adev->family == AMDGPU_FAMILY_AI) {
		int i;

@@ -656,6 +681,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
	}

	up_read(&adev->reset_sem);

	return 0;
}

@@ -664,11 +691,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	const uint32_t flush_type = 0;
	bool all_hub = false;
	int ret = -EIO;

	if (adev->family == AMDGPU_FAMILY_AI)
		all_hub = true;

	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
	if (down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
					pasid, flush_type, all_hub);
		up_read(&adev->reset_sem);
	}

	return ret;
}

bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
+1 −1
Original line number Diff line number Diff line
@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

	if (adev->in_gpu_reset)
	if (amdgpu_in_reset(adev))
		return -EIO;

#if 0
+1 −1
Original line number Diff line number Diff line
@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	unsigned long flags, end_jiffies;
	int retry;

	if (adev->in_gpu_reset)
	if (amdgpu_in_reset(adev))
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);
+1 −1
Original line number Diff line number Diff line
@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	int retry;
	struct vi_mqd *m = get_mqd(mqd);

	if (adev->in_gpu_reset)
	if (amdgpu_in_reset(adev))
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);
Loading