Commit 0a068add authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay
Browse files

habanalabs: add information about PCIe controller



Update firmware header with new API for getting pcie info
such as tx/rx throughput and replay counter.
These counters are needed by customers for monitor and maintenance
of multiple devices.
Add new opcodes to the INFO ioctl to retrieve these counters.

Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent a98d73c7
Loading
Loading
Loading
Loading
+48 −0
Original line number Diff line number Diff line
@@ -363,6 +363,54 @@ out:
	return rc;
}

int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
		struct hl_info_pci_counters *counters)
{
	struct armcp_packet pkt = {};
	long result;
	int rc;

	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_THROUGHPUT_GET <<
			ARMCP_PKT_CTL_OPCODE_SHIFT);

	/* Fetch PCI rx counter */
	pkt.index = cpu_to_le32(armcp_pcie_throughput_rx);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_ARMCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->rx_throughput = result;

	/* Fetch PCI tx counter */
	pkt.index = cpu_to_le32(armcp_pcie_throughput_tx);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_ARMCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->tx_throughput = result;

	/* Fetch PCI replay counter */
	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_REPLAY_CNT_GET <<
			ARMCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_ARMCP_INFO_TIMEOUT_USEC, &result);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to handle ArmCP PCI info pkt, error %d\n", rc);
		return rc;
	}
	counters->replay_cnt = (u32) result;

	return rc;
}

static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
{
	u32 err_val;
+4 −0
Original line number Diff line number Diff line
@@ -1483,6 +1483,7 @@ struct hl_device_idle_busy_ts {
 * @soft_reset_cnt: number of soft reset since the driver was loaded.
 * @hard_reset_cnt: number of hard reset since the driver was loaded.
 * @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
 * @clk_throttling_reason: bitmask represents the current clk throttling reasons
 * @id: device minor.
 * @id_control: minor of the control device
 * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -1587,6 +1588,7 @@ struct hl_device {
	u32				soft_reset_cnt;
	u32				hard_reset_cnt;
	u32				idle_busy_ts_idx;
	u32				clk_throttling_reason;
	u16				id;
	u16				id_control;
	u16				cpu_pci_msb_addr;
@@ -1841,6 +1843,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_armcp_info_get(struct hl_device *hdev);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
		struct hl_info_pci_counters *counters);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
			u32 boot_err0_reg, bool skip_bmc,
+41 −0
Original line number Diff line number Diff line
@@ -276,6 +276,41 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
		min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}

static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_info_pci_counters pci_counters = {0};
	u32 max_size = args->return_size;
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
	int rc;

	if ((!max_size) || (!out))
		return -EINVAL;

	rc = hl_fw_armcp_pci_counters_get(hdev, &pci_counters);
	if (rc)
		return rc;

	return copy_to_user(out, &pci_counters,
		min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
}

static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_info_clk_throttle clk_throttle = {0};
	u32 max_size = args->return_size;
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;

	if ((!max_size) || (!out))
		return -EINVAL;

	clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;

	return copy_to_user(out, &clk_throttle,
		min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
}

static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
@@ -360,6 +395,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
	case HL_INFO_CS_COUNTERS:
		return cs_counters_info(hpriv, args);

	case HL_INFO_PCI_COUNTERS:
		return pci_counters_info(hpriv, args);

	case HL_INFO_CLK_THROTTLE_REASON:
		return clk_throttle_info(hpriv, args);

	default:
		dev_err(dev, "Invalid request %d\n", args->op);
		rc = -ENOTTY;
+4 −0
Original line number Diff line number Diff line
@@ -5653,21 +5653,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
{
	switch (event_type) {
	case GAUDI_EVENT_FIX_POWER_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to power consumption\n");
		break;

	case GAUDI_EVENT_FIX_POWER_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
		dev_info_ratelimited(hdev->dev,
			"Power envelop is safe, back to optimal clock\n");
		break;

	case GAUDI_EVENT_FIX_THERMAL_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to overheating\n");
		break;

	case GAUDI_EVENT_FIX_THERMAL_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
		dev_info_ratelimited(hdev->dev,
			"Thermal envelop is safe, back to optimal clock\n");
		break;
+4 −0
Original line number Diff line number Diff line
@@ -4580,18 +4580,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
{
	switch (event_type) {
	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to power consumption\n");
		break;
	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
		dev_info_ratelimited(hdev->dev,
			"Power envelop is safe, back to optimal clock\n");
		break;
	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
		dev_info_ratelimited(hdev->dev,
			"Clock throttling due to overheating\n");
		break;
	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
		dev_info_ratelimited(hdev->dev,
			"Thermal envelop is safe, back to optimal clock\n");
		break;
Loading