Commit 788cacf3 authored by Oded Gabbay's avatar Oded Gabbay
Browse files

habanalabs: set 4s timeout for message to device CPU



We see that sometimes the CPU in GOYA and GAUDI is occupied by the
power/thermal loop and can't answer requests from the driver fast enough.

Therefore, to avoid false notifications on timeouts, increase the timeout
to 4 seconds on each message sent to the device CPU.

Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: default avatarTomer Tayar <ttayar@habana.ai>
parent e38bfd30
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
	pkt.i2c_reg = i2c_reg;

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_DEVICE_TIMEOUT_USEC, (long *) val);
						0, (long *) val);

	if (rc)
		dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
	pkt.value = cpu_to_le64(val);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
					HL_DEVICE_TIMEOUT_USEC, NULL);
						0, NULL);

	if (rc)
		dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
@@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
	pkt.value = cpu_to_le64(state);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
						HL_DEVICE_TIMEOUT_USEC, NULL);
						0, NULL);

	if (rc)
		dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
+5 −5
Original line number Diff line number Diff line
@@ -61,7 +61,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
	pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);

	return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
				sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
						sizeof(pkt), 0, NULL);
}

int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
@@ -144,7 +144,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
	pkt.value = cpu_to_le64(event_type);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_DEVICE_TIMEOUT_USEC, &result);
						0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
@@ -183,7 +183,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
						ARMCP_PKT_CTL_OPCODE_SHIFT);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
			total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
						total_pkt_size, 0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -204,7 +204,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
	test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
			sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
						sizeof(test_pkt), 0, &result);

	if (!rc) {
		if (result != ARMCP_PACKET_FENCE_VAL)
@@ -248,7 +248,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
	hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
			sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
						sizeof(hb_pkt), 0, &result);

	if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
		rc = -EIO;
+4 −0
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@
#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC	(HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC	(HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC	1000000		/* 1s */
#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC	4000000		/* 4s */

#define GAUDI_QMAN0_FENCE_VAL		0x72E91AB9

@@ -3479,6 +3480,9 @@ static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg,
		return 0;
	}

	if (!timeout)
		timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC;

	return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len,
						timeout, result);
}
+8 −4
Original line number Diff line number Diff line
@@ -88,6 +88,7 @@
#define GOYA_PLDM_MMU_TIMEOUT_USEC	(MMU_CONFIG_TIMEOUT_USEC * 100)
#define GOYA_PLDM_QMAN0_TIMEOUT_USEC	(HL_DEVICE_TIMEOUT_USEC * 30)
#define GOYA_BOOT_FIT_REQ_TIMEOUT_USEC	1000000		/* 1s */
#define GOYA_MSG_TO_CPU_TIMEOUT_USEC	4000000		/* 4s */

#define GOYA_QMAN0_FENCE_VAL		0xD169B243

@@ -2830,6 +2831,9 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
		return 0;
	}

	if (!timeout)
		timeout = GOYA_MSG_TO_CPU_TIMEOUT_USEC;

	return hl_fw_send_cpu_message(hdev, GOYA_QUEUE_ID_CPU_PQ, msg, len,
					timeout, result);
}
@@ -4431,8 +4435,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
	pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
						ARMCP_PKT_CTL_OPCODE_SHIFT);

	rc = goya_send_cpu_message(hdev, (u32 *) pkt, total_pkt_size,
			HL_DEVICE_TIMEOUT_USEC, &result);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
						total_pkt_size,	0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -4464,8 +4468,8 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
				ARMCP_PKT_CTL_OPCODE_SHIFT);
	pkt.value = cpu_to_le64(event_type);

	rc = goya_send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
			HL_DEVICE_TIMEOUT_USEC, &result);
	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
						0, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
+5 −1
Original line number Diff line number Diff line
@@ -588,7 +588,11 @@ enum hl_pll_frequency {
 * @hw_queues_unlock: release H/W queues lock.
 * @get_pci_id: retrieve PCI ID.
 * @get_eeprom_data: retrieve EEPROM data from F/W.
 * @send_cpu_message: send buffer to ArmCP.
 * @send_cpu_message: send message to F/W. If the message is timedout, the
 *                    driver will eventually reset the device. The timeout can
 *                    be determined by the calling function or it can be 0 and
 *                    then the timeout is the default timeout for the specific
 *                    ASIC
 * @get_hw_state: retrieve the H/W state
 * @pci_bars_map: Map PCI BARs.
 * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
Loading