Commit e38bfd30 authored by Oded Gabbay's avatar Oded Gabbay
Browse files

habanalabs: set clock gating per engine



For debugging purposes, we need to allow the root user better control of
the clock gating feature of the DMA and compute engines. Therefore, change
the clock gating debugfs interface to be bitmask instead of true/false.
Each bit represents a different engine, according to gaudi_engine_id enum.

See debugfs documentation for more details.

Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: default avatarOmer Shpigelman <oshpigelman@habana.ai>
parent 2edc66e2
Loading
Loading
Loading
Loading
+10 −1
Original line number Diff line number Diff line
@@ -16,7 +16,16 @@ Description: Allow the root user to disable/enable in runtime the clock
                gating mechanism in Gaudi. Due to how Gaudi is built, the
                clock gating needs to be disabled in order to access the
                registers of the TPC and MME engines. This is sometimes needed
                during debug by the user and hence the user needs this option
                during debug by the user and hence the user needs this option.
                The user can supply a bitmask value, each bit represents
                a different engine to disable/enable its clock gating feature.
                The bitmask is composed of 20 bits:
                0  -  7 : DMA channels
                8  - 11 : MME engines
                12 - 19 : TPC engines
                The bit's location of a specific engine can be determined
                using (1 << GAUDI_ENGINE_ID_*). GAUDI_ENGINE_ID_* values
                are defined in uapi habanalabs.h file in enum gaudi_engine_id

What:           /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date:           Jan 2019
+5 −12
Original line number Diff line number Diff line
@@ -981,7 +981,7 @@ static ssize_t hl_clk_gate_read(struct file *f, char __user *buf,
	if (*ppos)
		return 0;

	sprintf(tmp_buf, "%d\n", hdev->clock_gating);
	sprintf(tmp_buf, "0x%llx\n", hdev->clock_gating_mask);
	rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
			strlen(tmp_buf) + 1);

@@ -993,7 +993,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
{
	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
	struct hl_device *hdev = entry->hdev;
	u32 value;
	u64 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
@@ -1002,19 +1002,12 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
		return 0;
	}

	rc = kstrtouint_from_user(buf, count, 10, &value);
	rc = kstrtoull_from_user(buf, count, 16, &value);
	if (rc)
		return rc;

	if (value) {
		hdev->clock_gating = 1;
		if (hdev->asic_funcs->enable_clock_gating)
			hdev->asic_funcs->enable_clock_gating(hdev);
	} else {
		if (hdev->asic_funcs->disable_clock_gating)
			hdev->asic_funcs->disable_clock_gating(hdev);
		hdev->clock_gating = 0;
	}
	hdev->clock_gating_mask = value;
	hdev->asic_funcs->set_clock_gating(hdev);

	return count;
}
+1 −1
Original line number Diff line number Diff line
@@ -608,7 +608,7 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
		hdev->in_debug = 0;

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->enable_clock_gating(hdev);
			hdev->asic_funcs->set_clock_gating(hdev);

		goto out;
	}
+74 −38
Original line number Diff line number Diff line
@@ -98,6 +98,11 @@

#define GAUDI_ARB_WDT_TIMEOUT		0x1000000

#define GAUDI_CLK_GATE_DEBUGFS_MASK	(\
		BIT(GAUDI_ENGINE_ID_MME_0) |\
		BIT(GAUDI_ENGINE_ID_MME_2) |\
		GENMASK_ULL(GAUDI_ENGINE_ID_TPC_7, GAUDI_ENGINE_ID_TPC_0))

static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
		"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
		"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -106,14 +111,14 @@ static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
};

static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = {
	[GAUDI_PCI_DMA_1] = 0,
	[GAUDI_PCI_DMA_2] = 1,
	[GAUDI_PCI_DMA_3] = 5,
	[GAUDI_HBM_DMA_1] = 2,
	[GAUDI_HBM_DMA_2] = 3,
	[GAUDI_HBM_DMA_3] = 4,
	[GAUDI_HBM_DMA_4] = 6,
	[GAUDI_HBM_DMA_5] = 7
	[GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0,
	[GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1,
	[GAUDI_PCI_DMA_3] = GAUDI_ENGINE_ID_DMA_5,
	[GAUDI_HBM_DMA_1] = GAUDI_ENGINE_ID_DMA_2,
	[GAUDI_HBM_DMA_2] = GAUDI_ENGINE_ID_DMA_3,
	[GAUDI_HBM_DMA_3] = GAUDI_ENGINE_ID_DMA_4,
	[GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_6,
	[GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_7
};

static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = {
@@ -1819,7 +1824,7 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)

	gaudi_init_rate_limiter(hdev);

	gaudi_disable_clock_gating(hdev);
	hdev->asic_funcs->disable_clock_gating(hdev);

	for (tpc_id = 0, tpc_offset = 0;
				tpc_id < TPC_NUMBER_OF_ENGINES;
@@ -2531,46 +2536,55 @@ static void gaudi_tpc_stall(struct hl_device *hdev)
	WREG32(mmTPC7_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT);
}

static void gaudi_enable_clock_gating(struct hl_device *hdev)
static void gaudi_set_clock_gating(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	u32 qman_offset;
	int i;

	if (!hdev->clock_gating)
		return;

	if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE)
		return;

	/* In case we are during debug session, don't enable the clock gate
	 * as it may interfere
	 */
	if (hdev->in_debug)
		return;

	for (i = 0, qman_offset = 0 ; i < PCI_DMA_NUMBER_OF_CHNLS ; i++) {
	for (i = GAUDI_PCI_DMA_1, qman_offset = 0 ; i < GAUDI_HBM_DMA_1 ; i++) {
		if (!(hdev->clock_gating_mask &
					(BIT_ULL(gaudi_dma_assignment[i]))))
			continue;

		qman_offset = gaudi_dma_assignment[i] * DMA_QMAN_OFFSET;
		WREG32(mmDMA0_QM_CGM_CFG1 + qman_offset, QMAN_CGM1_PWR_GATE_EN);
		WREG32(mmDMA0_QM_CGM_CFG + qman_offset,
				QMAN_UPPER_CP_CGM_PWR_GATE_EN);
	}

	for (; i < HBM_DMA_NUMBER_OF_CHNLS ; i++) {
	for (i = GAUDI_HBM_DMA_1 ; i < GAUDI_DMA_MAX ; i++) {
		if (!(hdev->clock_gating_mask &
					(BIT_ULL(gaudi_dma_assignment[i]))))
			continue;

		qman_offset = gaudi_dma_assignment[i] * DMA_QMAN_OFFSET;
		WREG32(mmDMA0_QM_CGM_CFG1 + qman_offset, QMAN_CGM1_PWR_GATE_EN);
		WREG32(mmDMA0_QM_CGM_CFG + qman_offset,
				QMAN_COMMON_CP_CGM_PWR_GATE_EN);
	}

	if (hdev->clock_gating_mask & (BIT_ULL(GAUDI_ENGINE_ID_MME_0))) {
		WREG32(mmMME0_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
	WREG32(mmMME0_QM_CGM_CFG,
			QMAN_COMMON_CP_CGM_PWR_GATE_EN);
		WREG32(mmMME0_QM_CGM_CFG, QMAN_COMMON_CP_CGM_PWR_GATE_EN);
	}

	if (hdev->clock_gating_mask & (BIT_ULL(GAUDI_ENGINE_ID_MME_2))) {
		WREG32(mmMME2_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
	WREG32(mmMME2_QM_CGM_CFG,
			QMAN_COMMON_CP_CGM_PWR_GATE_EN);
		WREG32(mmMME2_QM_CGM_CFG, QMAN_COMMON_CP_CGM_PWR_GATE_EN);
	}

	for (i = 0, qman_offset = 0 ; i < TPC_NUMBER_OF_ENGINES ; i++) {
		if (!(hdev->clock_gating_mask &
					(BIT_ULL(GAUDI_ENGINE_ID_TPC_0 + i))))
			continue;

		WREG32(mmTPC0_QM_CGM_CFG1 + qman_offset,
				QMAN_CGM1_PWR_GATE_EN);
		WREG32(mmTPC0_QM_CGM_CFG + qman_offset,
@@ -2663,7 +2677,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
	gaudi_stop_hbm_dma_qmans(hdev);
	gaudi_stop_pci_dma_qmans(hdev);

	gaudi_disable_clock_gating(hdev);
	hdev->asic_funcs->disable_clock_gating(hdev);

	msleep(wait_timeout_ms);

@@ -3003,7 +3017,7 @@ static int gaudi_hw_init(struct hl_device *hdev)

	gaudi_init_tpc_qmans(hdev);

	gaudi_enable_clock_gating(hdev);
	hdev->asic_funcs->set_clock_gating(hdev);

	gaudi_enable_timestamp(hdev);

@@ -3112,7 +3126,9 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
					HW_CAP_HBM_DMA | HW_CAP_PLL |
					HW_CAP_MMU |
					HW_CAP_SRAM_SCRAMBLER |
					HW_CAP_HBM_SCRAMBLER);
					HW_CAP_HBM_SCRAMBLER |
					HW_CAP_CLK_GATE);

	memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
}

@@ -4526,13 +4542,18 @@ static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
	int rc = 0;

	if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
		if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {

		if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
				(hdev->clock_gating_mask &
						GAUDI_CLK_GATE_DEBUGFS_MASK)) {

			dev_err_ratelimited(hdev->dev,
				"Can't read register - clock gating is enabled!\n");
			rc = -EFAULT;
		} else {
			*val = RREG32(addr - CFG_BASE);
		}

	} else if ((addr >= SRAM_BASE_ADDR) &&
			(addr < SRAM_BASE_ADDR + SRAM_BAR_SIZE)) {
		*val = readl(hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4568,13 +4589,18 @@ static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
	int rc = 0;

	if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
		if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {

		if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
				(hdev->clock_gating_mask &
						GAUDI_CLK_GATE_DEBUGFS_MASK)) {

			dev_err_ratelimited(hdev->dev,
				"Can't write register - clock gating is enabled!\n");
			rc = -EFAULT;
		} else {
			WREG32(addr - CFG_BASE, val);
		}

	} else if ((addr >= SRAM_BASE_ADDR) &&
			(addr < SRAM_BASE_ADDR + SRAM_BAR_SIZE)) {
		writel(val, hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4610,7 +4636,11 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
	int rc = 0;

	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
		if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {

		if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
				(hdev->clock_gating_mask &
						GAUDI_CLK_GATE_DEBUGFS_MASK)) {

			dev_err_ratelimited(hdev->dev,
				"Can't read register - clock gating is enabled!\n");
			rc = -EFAULT;
@@ -4620,6 +4650,7 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)

			*val = (((u64) val_h) << 32) | val_l;
		}

	} else if ((addr >= SRAM_BASE_ADDR) &&
		   (addr <= SRAM_BASE_ADDR + SRAM_BAR_SIZE - sizeof(u64))) {
		*val = readq(hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4656,7 +4687,11 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
	int rc = 0;

	if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
		if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {

		if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
				(hdev->clock_gating_mask &
						GAUDI_CLK_GATE_DEBUGFS_MASK)) {

			dev_err_ratelimited(hdev->dev,
				"Can't write register - clock gating is enabled!\n");
			rc = -EFAULT;
@@ -4665,6 +4700,7 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
			WREG32(addr + sizeof(u32) - CFG_BASE,
				upper_32_bits(val));
		}

	} else if ((addr >= SRAM_BASE_ADDR) &&
		   (addr <= SRAM_BASE_ADDR + SRAM_BAR_SIZE - sizeof(u64))) {
		writeq(val, hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4886,7 +4922,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
	gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid);
	gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid);

	hdev->asic_funcs->enable_clock_gating(hdev);
	hdev->asic_funcs->set_clock_gating(hdev);

	mutex_unlock(&gaudi->clk_gate_mutex);
}
@@ -5267,7 +5303,7 @@ static void gaudi_print_ecc_info_generic(struct hl_device *hdev,
	}

	if (disable_clock_gating) {
		hdev->asic_funcs->enable_clock_gating(hdev);
		hdev->asic_funcs->set_clock_gating(hdev);
		mutex_unlock(&gaudi->clk_gate_mutex);
	}
}
@@ -5754,7 +5790,7 @@ static bool gaudi_tpc_read_interrupts(struct hl_device *hdev, u8 tpc_id,
	/* Clear interrupts */
	WREG32(mmTPC0_CFG_TPC_INTR_CAUSE + tpc_offset, 0);

	hdev->asic_funcs->enable_clock_gating(hdev);
	hdev->asic_funcs->set_clock_gating(hdev);

	mutex_unlock(&gaudi->clk_gate_mutex);

@@ -6270,7 +6306,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
	if (s)
		seq_puts(s, "\n");

	hdev->asic_funcs->enable_clock_gating(hdev);
	hdev->asic_funcs->set_clock_gating(hdev);

	mutex_unlock(&gaudi->clk_gate_mutex);

@@ -6371,7 +6407,7 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
		dev_err(hdev->dev,
			"Timeout while waiting for TPC%d icache prefetch\n",
			tpc_id);
		hdev->asic_funcs->enable_clock_gating(hdev);
		hdev->asic_funcs->set_clock_gating(hdev);
		mutex_unlock(&gaudi->clk_gate_mutex);
		return -EIO;
	}
@@ -6400,7 +6436,7 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
		1000,
		kernel_timeout);

	hdev->asic_funcs->enable_clock_gating(hdev);
	hdev->asic_funcs->set_clock_gating(hdev);
	mutex_unlock(&gaudi->clk_gate_mutex);

	if (rc) {
@@ -6741,7 +6777,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
	.mmu_invalidate_cache = gaudi_mmu_invalidate_cache,
	.mmu_invalidate_cache_range = gaudi_mmu_invalidate_cache_range,
	.send_heartbeat = gaudi_send_heartbeat,
	.enable_clock_gating = gaudi_enable_clock_gating,
	.set_clock_gating = gaudi_set_clock_gating,
	.disable_clock_gating = gaudi_disable_clock_gating,
	.debug_coresight = gaudi_debug_coresight,
	.is_device_idle = gaudi_is_device_idle,
+4 −4
Original line number Diff line number Diff line
@@ -5028,14 +5028,14 @@ int goya_armcp_info_get(struct hl_device *hdev)
	return 0;
}

static void goya_enable_clock_gating(struct hl_device *hdev)
static void goya_set_clock_gating(struct hl_device *hdev)
{

	/* clock gating not supported in Goya */
}

static void goya_disable_clock_gating(struct hl_device *hdev)
{

	/* clock gating not supported in Goya */
}

static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
@@ -5259,7 +5259,7 @@ static const struct hl_asic_funcs goya_funcs = {
	.mmu_invalidate_cache = goya_mmu_invalidate_cache,
	.mmu_invalidate_cache_range = goya_mmu_invalidate_cache_range,
	.send_heartbeat = goya_send_heartbeat,
	.enable_clock_gating = goya_enable_clock_gating,
	.set_clock_gating = goya_set_clock_gating,
	.disable_clock_gating = goya_disable_clock_gating,
	.debug_coresight = goya_debug_coresight,
	.is_device_idle = goya_is_device_idle,
Loading