Merge tag 'drm-next-5.5-2019-11-08' of git://people.freedesktop.org/~agd5f/linux into drm-next (0990ca23) · Commits · 戴 / test

Documentation/gpu/amdgpu.rst

+35 −0

Original line number	Diff line number	Diff line
		@@ -82,12 +82,21 @@ AMDGPU XGMI Support
		AMDGPU RAS Support
		==================

		The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and
		debugfs (for error injection).

		RAS debugfs/sysfs Control and Error Injection Interfaces
		--------------------------------------------------------

		.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
		:doc: AMDGPU RAS debugfs control interface

		RAS Reboot Behavior for Unrecoverable Errors
		--------------------------------------------------------

		.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
		:doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors

		RAS Error Count sysfs Interface
		-------------------------------

		@@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface
		.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
		:internal:

		Sample Code
		-----------
		Sample code for testing error injection can be found here:
		https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c

		This is part of the libdrm amdgpu unit tests which cover several areas of the GPU.
		There are four sets of tests:

		RAS Basic Test

		The test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files
		are present.

		RAS Query Test

		This test checks the RAS availability and enablement status for each supported IP block as well as
		the error counts.

		RAS Inject Test

		This test injects errors for each IP block.

		RAS Disable Test

		This test verifies that RAS features can be disabled for each IP block.


		GPU Power/Thermal Controls and Monitoring
		=========================================

drivers/gpu/drm/amd/amdgpu/amdgpu.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -977,6 +977,9 @@ struct amdgpu_device {

		uint64_t unique_id;
		uint64_t df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];

		/* device pstate */
		int pstate;
		};

		static inline struct amdgpu_device amdgpu_ttm_adev(struct ttm_bo_device bdev)

drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c

+2 −4

Original line number	Diff line number	Diff line
		@@ -33,7 +33,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
		{
		unsigned long start_jiffies;
		unsigned long end_jiffies;
		struct dma_fence *fence = NULL;
		struct dma_fence *fence;
		int i, r;

		start_jiffies = jiffies;
		@@ -44,16 +44,14 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
		if (r)
		goto exit_do_move;
		r = dma_fence_wait(fence, false);
		dma_fence_put(fence);
		if (r)
		goto exit_do_move;
		dma_fence_put(fence);
		}
		end_jiffies = jiffies;
		r = jiffies_to_msecs(end_jiffies - start_jiffies);

		exit_do_move:
		if (fence)
		dma_fence_put(fence);
		return r;
		}

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

+10 −0

Original line number	Diff line number	Diff line
		@@ -859,6 +859,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file m, void data)
		struct amdgpu_device *adev = dev->dev_private;
		int r = 0, i;

		/* Avoid accidentally unparking the sched thread during GPU reset */
		mutex_lock(&adev->lock_reset);

		/* hold on the scheduler */
		for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
		struct amdgpu_ring *ring = adev->rings[i];
		@@ -884,6 +887,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file m, void data)
		kthread_unpark(ring->sched.thread);
		}

		mutex_unlock(&adev->lock_reset);

		return 0;
		}

		@@ -1036,6 +1041,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
		if (!fences)
		return -ENOMEM;

		/* Avoid accidentally unparking the sched thread during GPU reset */
		mutex_lock(&adev->lock_reset);

		/* stop the scheduler */
		kthread_park(ring->sched.thread);

		@@ -1075,6 +1083,8 @@ failure:
		/* restart the scheduler */
		kthread_unpark(ring->sched.thread);

		mutex_unlock(&adev->lock_reset);

		ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

		kfree(fences);

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+34 −2

Original line number	Diff line number	Diff line
		@@ -2057,6 +2057,7 @@ out:
		*/
		static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
		{
		struct amdgpu_gpu_instance *gpu_instance;
		int i = 0, r;

		for (i = 0; i < adev->num_ip_blocks; i++) {
		@@ -2082,8 +2083,39 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
		if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

		/* set to low pstate by default */
		amdgpu_xgmi_set_pstate(adev, 0);

		if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		* Reset device p-state to low as this was booted with high.
		*
		* This should be performed only after all devices from the same
		* hive get initialized.
		*
		* However, the number of devices in the hive is not known in advance,
		* as it is counted one by one during device initialization.
		*
		* So, we wait until all XGMI interlinked devices are initialized.
		* This may bring some delays as those devices may come from
		* different hives. But that should be OK.
		*/
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
		for (i = 0; i < mgpu_info.num_gpu; i++) {
		gpu_instance = &(mgpu_info.gpu_ins[i]);
		if (gpu_instance->adev->flags & AMD_IS_APU)
		continue;

		r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
		if (r) {
		DRM_ERROR("pstate setting failed (%d).\n", r);
		break;
		}
		}
		}

		mutex_unlock(&mgpu_info.mutex);
		}

		return 0;
		}

Admin message