summaryrefslogtreecommitdiffstats
path: root/drivers/misc
diff options
context:
space:
mode:
authorOfir Bitton <obitton@habana.ai>2020-07-21 10:49:51 +0300
committerOded Gabbay <oded.gabbay@gmail.com>2020-09-22 18:49:49 +0300
commit0a068adde505a90ece23caaf19b77567e1d18298 (patch)
treebb4a8b951eef94143747d05ac8ef68c19d1549ba /drivers/misc
parenta98d73c7fae486f7fea83bdec8599d4fccb6807d (diff)
habanalabs: add information about PCIe controller
Update firmware header with new API for getting pcie info such as tx/rx throughput and replay counter. These counters are needed by customers for monitor and maintenance of multiple devices. Add new opcodes to the INFO ioctl to retrieve these counters. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Diffstat (limited to 'drivers/misc')
-rw-r--r--drivers/misc/habanalabs/common/firmware_if.c48
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h4
-rw-r--r--drivers/misc/habanalabs/common/habanalabs_ioctl.c41
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c4
-rw-r--r--drivers/misc/habanalabs/goya/goya.c4
-rw-r--r--drivers/misc/habanalabs/include/common/armcp_if.h10
6 files changed, 111 insertions, 0 deletions
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index f52bc690dfc5..61f5edc96e16 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -363,6 +363,54 @@ out:
return rc;
}
+int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
+ struct hl_info_pci_counters *counters)
+{
+ struct armcp_packet pkt = {};
+ long result;
+ int rc;
+
+ pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_THROUGHPUT_GET <<
+ ARMCP_PKT_CTL_OPCODE_SHIFT);
+
+ /* Fetch PCI rx counter */
+ pkt.index = cpu_to_le32(armcp_pcie_throughput_rx);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+ HL_ARMCP_INFO_TIMEOUT_USEC, &result);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to handle ArmCP PCI info pkt, error %d\n", rc);
+ return rc;
+ }
+ counters->rx_throughput = result;
+
+ /* Fetch PCI tx counter */
+ pkt.index = cpu_to_le32(armcp_pcie_throughput_tx);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+ HL_ARMCP_INFO_TIMEOUT_USEC, &result);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to handle ArmCP PCI info pkt, error %d\n", rc);
+ return rc;
+ }
+ counters->tx_throughput = result;
+
+ /* Fetch PCI replay counter */
+ pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_REPLAY_CNT_GET <<
+ ARMCP_PKT_CTL_OPCODE_SHIFT);
+
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+ HL_ARMCP_INFO_TIMEOUT_USEC, &result);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to handle ArmCP PCI info pkt, error %d\n", rc);
+ return rc;
+ }
+ counters->replay_cnt = (u32) result;
+
+ return rc;
+}
+
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
{
u32 err_val;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index f97eebc64979..2c9fcb513215 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1483,6 +1483,7 @@ struct hl_device_idle_busy_ts {
* @soft_reset_cnt: number of soft reset since the driver was loaded.
* @hard_reset_cnt: number of hard reset since the driver was loaded.
* @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
+ * @clk_throttling_reason: bitmask represents the current clk throttling reasons
* @id: device minor.
* @id_control: minor of the control device
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -1587,6 +1588,7 @@ struct hl_device {
u32 soft_reset_cnt;
u32 hard_reset_cnt;
u32 idle_busy_ts_idx;
+ u32 clk_throttling_reason;
u16 id;
u16 id_control;
u16 cpu_pci_msb_addr;
@@ -1841,6 +1843,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_armcp_info_get(struct hl_device *hdev);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
+int hl_fw_armcp_pci_counters_get(struct hl_device *hdev,
+ struct hl_info_pci_counters *counters);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 5af1c03da473..4d838b1a3bbe 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -276,6 +276,41 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}
+static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ struct hl_info_pci_counters pci_counters = {0};
+ u32 max_size = args->return_size;
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+ int rc;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ rc = hl_fw_armcp_pci_counters_get(hdev, &pci_counters);
+ if (rc)
+ return rc;
+
+ return copy_to_user(out, &pci_counters,
+ min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
+}
+
+static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ struct hl_info_clk_throttle clk_throttle = {0};
+ u32 max_size = args->return_size;
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
+
+ return copy_to_user(out, &clk_throttle,
+ min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
+}
+
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
@@ -360,6 +395,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_CS_COUNTERS:
return cs_counters_info(hpriv, args);
+ case HL_INFO_PCI_COUNTERS:
+ return pci_counters_info(hpriv, args);
+
+ case HL_INFO_CLK_THROTTLE_REASON:
+ return clk_throttle_info(hpriv, args);
+
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4009b7df4caf..adb5c5594ac1 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5653,21 +5653,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
{
switch (event_type) {
case GAUDI_EVENT_FIX_POWER_ENV_S:
+ hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GAUDI_EVENT_FIX_POWER_ENV_E:
+ hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
+ hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
+ hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 33cd2ae653d2..954f2c022d33 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4580,18 +4580,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
{
switch (event_type) {
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
+ hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
+ hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
+ hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
+ hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
diff --git a/drivers/misc/habanalabs/include/common/armcp_if.h b/drivers/misc/habanalabs/include/common/armcp_if.h
index 07f9972db28d..1403c937253c 100644
--- a/drivers/misc/habanalabs/include/common/armcp_if.h
+++ b/drivers/misc/habanalabs/include/common/armcp_if.h
@@ -243,6 +243,8 @@ enum armcp_packet_id {
ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */
ARMCP_PACKET_VOLTAGE_SET, /* sysfs */
ARMCP_PACKET_CURRENT_SET, /* sysfs */
+ ARMCP_PACKET_PCIE_THROUGHPUT_GET, /* internal */
+ ARMCP_PACKET_PCIE_REPLAY_CNT_GET, /* internal */
};
#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
@@ -277,6 +279,9 @@ struct armcp_packet {
__u8 pad; /* unused */
};
+ /* For any general request */
+ __le32 index;
+
/* For frequency get/set */
__le32 pll_index;
@@ -344,6 +349,11 @@ enum armcp_pwm_attributes {
armcp_pwm_enable
};
+enum armcp_pcie_throughput_attributes {
+ armcp_pcie_throughput_tx,
+ armcp_pcie_throughput_rx
+};
+
/* Event Queue Packets */
struct eq_generic_event {