Commit 0990ca23 authored by Dave Airlie's avatar Dave Airlie
Browse files

Merge tag 'drm-next-5.5-2019-11-08' of git://people.freedesktop.org/~agd5f/linux into drm-next



drm-next-5.5-2019-11-08:

amdgpu:
- Enable VCN dynamic powergating on RV/RV2
- Fixes for Navi14
- Misc Navi fixes
- Fix MSI-X tear down
- Misc Arturus fixes
- Fix xgmi powerstate handling
- Documentation fixes

scheduler:
- Fix static code checker warning
- Fix possible thread reactivation while thread is stopped
- Avoid cleanup if thread is parked

radeon:
- SI dpm fix ported from amdgpu

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191108212713.5078-1-alexander.deucher@amd.com
parents 77e0723b 53dbc27a
Loading
Loading
Loading
Loading
+35 −0
Original line number Diff line number Diff line
@@ -82,12 +82,21 @@ AMDGPU XGMI Support
AMDGPU RAS Support
==================

The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and
debugfs (for error injection).

RAS debugfs/sysfs Control and Error Injection Interfaces
--------------------------------------------------------

.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
   :doc: AMDGPU RAS debugfs control interface

RAS Reboot Behavior for Unrecoverable Errors
--------------------------------------------

.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
   :doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors

RAS Error Count sysfs Interface
-------------------------------

@@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
   :internal:

Sample Code
-----------
Sample code for testing error injection can be found here:
https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c

This is part of the libdrm amdgpu unit tests which cover several areas of the GPU.
There are four sets of tests:

RAS Basic Test

The test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files
are present.

RAS Query Test

This test checks the RAS availability and enablement status for each supported IP block as well as
the error counts.

RAS Inject Test

This test injects errors for each IP block.

RAS Disable Test

This test verifies that RAS features can be disabled for each IP block.


GPU Power/Thermal Controls and Monitoring
=========================================
+3 −0
Original line number Diff line number Diff line
@@ -977,6 +977,9 @@ struct amdgpu_device {

	uint64_t			unique_id;
	uint64_t	df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];

	/* device pstate */
	int				pstate;
};

static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
+2 −4
Original line number Diff line number Diff line
@@ -33,7 +33,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
{
	unsigned long start_jiffies;
	unsigned long end_jiffies;
	struct dma_fence *fence = NULL;
	struct dma_fence *fence;
	int i, r;

	start_jiffies = jiffies;
@@ -44,16 +44,14 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
		if (r)
			goto exit_do_move;
		r = dma_fence_wait(fence, false);
		dma_fence_put(fence);
		if (r)
			goto exit_do_move;
		dma_fence_put(fence);
	}
	end_jiffies = jiffies;
	r = jiffies_to_msecs(end_jiffies - start_jiffies);

exit_do_move:
	if (fence)
		dma_fence_put(fence);
	return r;
}

+10 −0
Original line number Diff line number Diff line
@@ -859,6 +859,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
	struct amdgpu_device *adev = dev->dev_private;
	int r = 0, i;

	/* Avoid accidentally unparking the sched thread during GPU reset */
	mutex_lock(&adev->lock_reset);

	/* hold on the scheduler */
	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
		struct amdgpu_ring *ring = adev->rings[i];
@@ -884,6 +887,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
		kthread_unpark(ring->sched.thread);
	}

	mutex_unlock(&adev->lock_reset);

	return 0;
}

@@ -1036,6 +1041,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
	if (!fences)
		return -ENOMEM;

	/* Avoid accidentally unparking the sched thread during GPU reset */
	mutex_lock(&adev->lock_reset);

	/* stop the scheduler */
	kthread_park(ring->sched.thread);

@@ -1075,6 +1083,8 @@ failure:
	/* restart the scheduler */
	kthread_unpark(ring->sched.thread);

	mutex_unlock(&adev->lock_reset);

	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

	kfree(fences);
+34 −2
Original line number Diff line number Diff line
@@ -2057,6 +2057,7 @@ out:
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -2082,8 +2083,39 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* set to low pstate by default */
	amdgpu_xgmi_set_pstate(adev, 0);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance, as they are counted one by one during device
		 * initialization.
		 *
		 * So, we wait until all XGMI interlinked devices are
		 * initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
		 */
		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
			for (i = 0; i < mgpu_info.num_gpu; i++) {
				gpu_instance = &(mgpu_info.gpu_ins[i]);
				if (gpu_instance->adev->flags & AMD_IS_APU)
					continue;

				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
				if (r) {
					DRM_ERROR("pstate setting failed (%d).\n", r);
					break;
				}
			}
		}

		mutex_unlock(&mgpu_info.mutex);
	}

	return 0;
}
Loading