Commit ec671737 authored by Christian König, committed by Alex Deucher
Browse files

drm/amdgpu: add graceful VM fault handling v3



Next step towards HMM support. For now just silence the retry fault and
optionally redirect the request to the dummy page.

v2: make sure the VM is not destroyed while we handle the fault.
v3: fix VM destroy check, cleanup comments

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent b65709a9
Loading
Loading
Loading
Loading
+73 −0
Original line number Diff line number Diff line
@@ -3126,3 +3126,76 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
		}
	}
}

/**
 * amdgpu_vm_handle_fault - graceful handling of VM faults.
 * @adev: amdgpu device pointer
 * @pasid: PASID of the VM
 * @addr: Address of the fault
 *
 * Try to gracefully handle a VM fault. Return true if the fault was handled and
 * shouldn't be reported any more.
 */
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
			    uint64_t addr)
{
	bool ret = false;
	struct amdgpu_bo *root;
	uint64_t value, flags;
	struct amdgpu_vm *vm;
	long r;

	/* Look up the VM by PASID and take a reference on its root PD BO so
	 * the page tables can't be freed while we handle the fault.
	 */
	spin_lock(&adev->vm_manager.pasid_lock);
	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
	if (vm)
		root = amdgpu_bo_ref(vm->root.base.bo);
	else
		root = NULL;
	spin_unlock(&adev->vm_manager.pasid_lock);

	if (!root)
		return false;

	r = amdgpu_bo_reserve(root, true);
	if (r)
		goto error_unref;

	/* Double check that the VM still exists; the VM may have been
	 * destroyed and the PASID reused while we waited for the reservation.
	 */
	spin_lock(&adev->vm_manager.pasid_lock);
	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
	if (vm && vm->root.base.bo != root)
		vm = NULL;
	spin_unlock(&adev->vm_manager.pasid_lock);
	if (!vm)
		goto error_unlock;

	addr /= AMDGPU_GPU_PAGE_SIZE;
	flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED |
		AMDGPU_PTE_SYSTEM;

	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
		/* Redirect the access to the dummy page */
		value = adev->dummy_page_addr;
		flags |= AMDGPU_PTE_EXECUTABLE | AMDGPU_PTE_READABLE |
			AMDGPU_PTE_WRITEABLE;
	} else {
		/* Let the hw retry silently on the PTE */
		value = 0;
	}

	r = amdgpu_vm_bo_update_mapping(adev, vm, true, NULL, addr, addr + 1,
					flags, value, NULL, NULL);
	if (r)
		goto error_unlock;

	r = amdgpu_vm_update_pdes(adev, vm, true);
	if (r)
		goto error_unlock;

	/* Fault handled successfully; tell the caller to suppress it.
	 * (Previously this function always returned false, so the caller's
	 * suppression path could never trigger, contradicting the kernel-doc.)
	 */
	ret = true;

error_unlock:
	amdgpu_bo_unreserve(root);
	if (r < 0)
		DRM_ERROR("Can't handle page fault (%ld)\n", r);

error_unref:
	amdgpu_bo_unref(&root);

	return ret;
}
+2 −0
Original line number Diff line number Diff line
@@ -413,6 +413,8 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);

void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
			     struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
			    uint64_t addr);

void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

+4 −0
Original line number Diff line number Diff line
@@ -380,6 +380,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
	}

	/* If it's the first fault for this address, process it normally */
	if (retry_fault && !in_interrupt() &&
	    amdgpu_vm_handle_fault(adev, entry->pasid, addr))
		return 1; /* This also prevents sending it to KFD */

	if (!amdgpu_sriov_vf(adev)) {
		/*
		 * Issue a dummy read to wait for the status register to