Commit 733fa1f7 authored by Yong Zhao's avatar Yong Zhao Committed by Oded Gabbay
Browse files

drm/amdkfd: Fix suspend/resume issue on Carrizo v2



When we do suspend/resume through "sudo pm-suspend" while there is
HSA activity running, upon resume we will encounter HWS hanging, which
is caused by memory read/write failures. The root cause is that upon
suspend, we neglected to unbind the pasid from the kfd device.

Another major change is that binding/unbinding is now performed on a
per-process basis, instead of depending on whether there are queues
in the dqm.

v2:
- free IOMMU device if kfd_bind_processes_to_device fails in kfd_resume
- add comments to kfd_bind/unbind_processes_to/from_device
- minor cleanups

Signed-off-by: default avatarYong Zhao <yong.zhao@amd.com>
Signed-off-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent b8935a7c
Loading
Loading
Loading
Loading
+16 −7
Original line number Diff line number Diff line
@@ -184,7 +184,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);

	if (dev)
		kfd_unbind_process_from_device(dev, pasid);
		kfd_process_iommu_unbind_callback(dev, pasid);
}

/*
@@ -332,13 +332,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)

void kgd2kfd_suspend(struct kfd_dev *kfd)
{
	if (kfd->init_complete) {
	if (!kfd->init_complete)
		return;

	kfd->dqm->ops.stop(kfd->dqm);

	kfd_unbind_processes_from_device(kfd);

	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
	amd_iommu_free_device(kfd->pdev);
}
}

int kgd2kfd_resume(struct kfd_dev *kfd)
{
@@ -362,6 +366,10 @@ static int kfd_resume(struct kfd_dev *kfd)
	amd_iommu_set_invalid_ppr_cb(kfd->pdev,
				     iommu_invalid_ppr_cb);

	err = kfd_bind_processes_to_device(kfd);
	if (err)
		goto processes_bind_error;

	err = kfd->dqm->ops.start(kfd->dqm);
	if (err) {
		dev_err(kfd_device,
@@ -373,6 +381,7 @@ static int kfd_resume(struct kfd_dev *kfd)
	return err;

dqm_start_error:
processes_bind_error:
	amd_iommu_free_device(kfd->pdev);

	return err;
+0 −13
Original line number Diff line number Diff line
@@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)

static int start_cpsch(struct device_queue_manager *dqm)
{
	struct device_process_node *node;
	int retval;

	retval = 0;
@@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)

	init_interrupts(dqm);

	list_for_each_entry(node, &dqm->queues, list)
		if (node->qpd->pqm->process && dqm->dev)
			kfd_bind_process_to_device(dqm->dev,
						node->qpd->pqm->process);

	execute_queues_cpsch(dqm, true);

	return 0;
@@ -714,15 +708,8 @@ fail_packet_manager_init:

static int stop_cpsch(struct device_queue_manager *dqm)
{
	struct device_process_node *node;
	struct kfd_process_device *pdd;

	destroy_queues_cpsch(dqm, true, true);

	list_for_each_entry(node, &dqm->queues, list) {
		pdd = qpd_to_pdd(node->qpd);
		pdd->bound = false;
	}
	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
	pm_uninit(&dqm->packets);

+12 −3
Original line number Diff line number Diff line
@@ -432,6 +432,13 @@ struct qcm_process_device {
	uint32_t sh_hidden_private_base;
};


enum kfd_pdd_bound {
	PDD_UNBOUND = 0,
	PDD_BOUND,
	PDD_BOUND_SUSPENDED,
};

/* Data that is per-process-per device. */
struct kfd_process_device {
	/*
@@ -456,7 +463,7 @@ struct kfd_process_device {
	uint64_t scratch_limit;

	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
	bool bound;
	enum kfd_pdd_bound bound;

	/* This flag tells if we should reset all
	 * wavefronts on process termination
@@ -548,7 +555,9 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);

struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
						struct kfd_process *p);
void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
int kfd_bind_processes_to_device(struct kfd_dev *dev);
void kfd_unbind_processes_from_device(struct kfd_dev *dev);
void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
							struct kfd_process *p);
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+81 −16
Original line number Diff line number Diff line
@@ -174,9 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work)
		if (pdd->reset_wavefronts)
			dbgdev_wave_reset_wavefronts(pdd->dev, p);

		if (pdd->bound == PDD_BOUND)
			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
		list_del(&pdd->per_device_list);

		list_del(&pdd->per_device_list);
		kfree(pdd);
	}

@@ -351,9 +352,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,

	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
		if (pdd->dev == dev)
			break;

			return pdd;

	return NULL;
}

struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -368,6 +369,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
		INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
		pdd->qpd.dqm = dev->dqm;
		pdd->reset_wavefronts = false;
		pdd->bound = PDD_UNBOUND;
		list_add(&pdd->per_device_list, &p->per_device_data);
	}

@@ -393,19 +395,91 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
		return ERR_PTR(-ENOMEM);
	}

	if (pdd->bound)
	if (pdd->bound == PDD_BOUND) {
		return pdd;
	} else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
		pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
		return ERR_PTR(-EINVAL);
	}

	err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
	if (err < 0)
		return ERR_PTR(err);

	pdd->bound = true;
	pdd->bound = PDD_BOUND;

	return pdd;
}

void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
/*
 * Bind processes to the device that have been temporarily unbound
 * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device.
 */
int kfd_bind_processes_to_device(struct kfd_dev *dev)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;
	unsigned int temp;
	int err = 0;

	int idx = srcu_read_lock(&kfd_processes_srcu);

	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		mutex_lock(&p->mutex);
		pdd = kfd_get_process_device_data(dev, p);
		if (pdd->bound != PDD_BOUND_SUSPENDED) {
			mutex_unlock(&p->mutex);
			continue;
		}

		err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
				p->lead_thread);
		if (err < 0) {
			pr_err("unexpected pasid %d binding failure\n",
					p->pasid);
			mutex_unlock(&p->mutex);
			break;
		}

		pdd->bound = PDD_BOUND;
		mutex_unlock(&p->mutex);
	}

	srcu_read_unlock(&kfd_processes_srcu, idx);

	return err;
}

/*
 * Temporarily unbind currently bound processes from the device and
 * mark them as PDD_BOUND_SUSPENDED. These processes will be restored
 * to PDD_BOUND state in kfd_bind_processes_to_device.
 */
void kfd_unbind_processes_from_device(struct kfd_dev *dev)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;
	unsigned int temp, temp_bound, temp_pasid;

	int idx = srcu_read_lock(&kfd_processes_srcu);

	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		mutex_lock(&p->mutex);
		pdd = kfd_get_process_device_data(dev, p);
		temp_bound = pdd->bound;
		temp_pasid = p->pasid;
		if (pdd->bound == PDD_BOUND)
			pdd->bound = PDD_BOUND_SUSPENDED;
		mutex_unlock(&p->mutex);

		if (temp_bound == PDD_BOUND)
			amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
	}

	srcu_read_unlock(&kfd_processes_srcu, idx);
}

void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd;
@@ -438,15 +512,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
		pdd->reset_wavefronts = false;
	}

	/*
	 * Just mark pdd as unbound, because we still need it
	 * to call amd_iommu_unbind_pasid() when the
	 * process exits.
	 * We don't call amd_iommu_unbind_pasid() here
	 * because the IOMMU called us.
	 */
	pdd->bound = false;

	mutex_unlock(&p->mutex);
}