Commit c6a6e2db authored by Andrey Grodzovsky, committed by Alex Deucher
Browse files

drm/amdgpu: Redo XGMI reset synchronization.



Use the task barrier in the XGMI hive to synchronize ASIC resets
across all devices in that hive.

v2: Return right away with a warning if no xgmi hive, update doc.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Le Ma <Le.Ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f33a8770
Loading
Loading
Loading
Loading
+31 −6
Original line number Diff line number Diff line
@@ -66,6 +66,7 @@
#include "amdgpu_pmu.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2664,14 +2665,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
	struct amdgpu_device *adev =
		container_of(__work, struct amdgpu_device, xgmi_reset_work);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);

	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
		adev->asic_reset_res = (adev->in_baco == false) ?
				amdgpu_device_baco_enter(adev->ddev) :
				amdgpu_device_baco_exit(adev->ddev);
	else
	/* It's a bug to not have a hive within this function */
	if (WARN_ON(!hive))
		return;

	/*
	 * Use task barrier to synchronize all xgmi reset works across the
	 * hive. task_barrier_enter and task_barrier_exit will block
	 * until all the threads running the xgmi reset works reach
	 * those points. task_barrier_full will do both blocks.
	 */
	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

		task_barrier_enter(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;

		task_barrier_exit(&hive->tb);
		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

		if (adev->asic_reset_res)
			goto fail;
	} else {

		task_barrier_full(&hive->tb);
		adev->asic_reset_res =  amdgpu_asic_reset(adev);
	}

fail:
	if (adev->asic_reset_res)
		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
			 adev->asic_reset_res, adev->ddev->unique);