From 9494a8dd8d22cbff8ce358aaa223fffe1b070cb0 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 16 Feb 2019 00:39:17 +0200 Subject: habanalabs: add h/w queues module This patch adds the H/W queues module and the code to initialize Goya's various compute and DMA engines and their queues. Goya has 5 DMA channels, 8 TPC engines and a single MME engine. For each channel/engine, there is a H/W queue logic which is used to pass commands from the user to the H/W. That logic is called QMAN. There are two types of QMANs: external and internal. The DMA QMANs are considered external while the TPC and MME QMANs are considered internal. For each external queue there is a completion queue, which is located on the Host memory. The differences between external and internal QMANs are: 1. The location of the queue's memory. External QMANs are located on the Host memory while internal QMANs are located on the on-chip memory. 2. The external QMAN write an entry to a completion queue and sends an MSI-X interrupt upon completion of a command buffer that was given to it. The internal QMAN doesn't do that. Reviewed-by: Mike Rapoport Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/Makefile | 2 +- drivers/misc/habanalabs/device.c | 75 +- drivers/misc/habanalabs/goya/goya.c | 1527 ++++++++++++++++++-- drivers/misc/habanalabs/goya/goyaP.h | 7 + drivers/misc/habanalabs/habanalabs.h | 174 ++- drivers/misc/habanalabs/habanalabs_drv.c | 5 + drivers/misc/habanalabs/hw_queue.c | 400 +++++ drivers/misc/habanalabs/include/armcp_if.h | 292 ++++ .../habanalabs/include/goya/goya_async_events.h | 186 +++ .../misc/habanalabs/include/goya/goya_packets.h | 129 ++ drivers/misc/habanalabs/include/qman_if.h | 56 + drivers/misc/habanalabs/irq.c | 149 ++ include/uapi/misc/habanalabs.h | 29 + 13 files changed, 2915 insertions(+), 116 deletions(-) create mode 100644 drivers/misc/habanalabs/hw_queue.c create mode 100644 drivers/misc/habanalabs/include/goya/goya_async_events.h create mode 100644 drivers/misc/habanalabs/include/goya/goya_packets.h create mode 100644 drivers/misc/habanalabs/include/qman_if.h create mode 100644 drivers/misc/habanalabs/irq.c diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 2530c9b78ca4..c07f3ccb57dc 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -5,7 +5,7 @@ obj-m := habanalabs.o habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ - command_buffer.o + command_buffer.o hw_queue.o irq.o include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 5eefc2952be5..06e2b7f32499 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -174,13 +174,23 @@ static int device_early_init(struct hl_device *hdev) if (rc) goto early_fini; + hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0); + if (hdev->cq_wq == NULL) { + dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); + rc = -ENOMEM; + goto asid_fini; + } + hl_cb_mgr_init(&hdev->kernel_cb_mgr); mutex_init(&hdev->fd_open_cnt_lock); + mutex_init(&hdev->send_cpu_message_lock); atomic_set(&hdev->fd_open_cnt, 0); return 0; +asid_fini: + hl_asid_fini(hdev); early_fini: if (hdev->asic_funcs->early_fini) hdev->asic_funcs->early_fini(hdev); @@ -196,9 +206,12 @@ early_fini: */ static void device_early_fini(struct hl_device *hdev) { + mutex_destroy(&hdev->send_cpu_message_lock); hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr); + destroy_workqueue(hdev->cq_wq); + hl_asid_fini(hdev); if (hdev->asic_funcs->early_fini) @@ -277,7 +290,7 @@ int hl_device_resume(struct hl_device *hdev) */ int hl_device_init(struct hl_device *hdev, struct class *hclass) { - int rc; + int i, rc, cq_ready_cnt; /* Create device */ rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops); @@ -298,11 +311,48 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) if (rc) goto early_fini; + /* + * Initialize the H/W queues. Must be done before hw_init, because + * there the addresses of the kernel queue are being written to the + * registers of the device + */ + rc = hl_hw_queues_create(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize kernel queues\n"); + goto sw_fini; + } + + /* + * Initialize the completion queues. Must be done before hw_init, + * because there the addresses of the completion queues are being + * passed as arguments to request_irq + */ + hdev->completion_queue = + kcalloc(hdev->asic_prop.completion_queues_count, + sizeof(*hdev->completion_queue), GFP_KERNEL); + + if (!hdev->completion_queue) { + dev_err(hdev->dev, "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } + + for (i = 0, cq_ready_cnt = 0; + i < hdev->asic_prop.completion_queues_count; + i++, cq_ready_cnt++) { + rc = hl_cq_init(hdev, &hdev->completion_queue[i], i); + if (rc) { + dev_err(hdev->dev, + "failed to initialize completion queue\n"); + goto cq_fini; + } + } + /* Allocate the kernel context */ hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); if (!hdev->kernel_ctx) { rc = -ENOMEM; - goto sw_fini; + goto cq_fini; } hdev->user_ctx = NULL; @@ -328,6 +378,14 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->disabled = false; + /* Check that the communication with the device is working */ + rc = hdev->asic_funcs->test_queues(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to detect if device is alive\n"); + rc = 0; + goto out_disabled; + } + dev_notice(hdev->dev, "Successfully added device to habanalabs driver\n"); @@ -339,6 +397,12 @@ release_ctx: "kernel ctx is still alive on initialization failure\n"); free_ctx: kfree(hdev->kernel_ctx); +cq_fini: + for (i = 0 ; i < cq_ready_cnt ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); +hw_queues_destroy: + hl_hw_queues_destroy(hdev); sw_fini: hdev->asic_funcs->sw_fini(hdev); early_fini: @@ -368,6 +432,7 @@ out_disabled: */ void hl_device_fini(struct hl_device *hdev) { + int i; dev_info(hdev->dev, "Removing device\n"); /* Mark device as disabled */ @@ -382,6 +447,12 @@ void hl_device_fini(struct hl_device *hdev) /* Reset the H/W. It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, true); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); + + hl_hw_queues_destroy(hdev); + /* Call ASIC S/W finalize function */ hdev->asic_funcs->sw_fini(hdev); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1fa5b91fd703..a45a183d4e5c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -90,6 +90,26 @@ static void goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + int i; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; + prop->hw_queues_props[i].kmd_only = 0; + } + + for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_CPU; + prop->hw_queues_props[i].kmd_only = 1; + } + + for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES + + NUMBER_OF_INT_HW_QUEUES; i++) { + prop->hw_queues_props[i].type = QUEUE_TYPE_INT; + prop->hw_queues_props[i].kmd_only = 0; + } + + for (; i < HL_MAX_QUEUES; i++) + prop->hw_queues_props[i].type = QUEUE_TYPE_NA; prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; @@ -118,6 +138,18 @@ static void goya_get_fixed_properties(struct hl_device *hdev) prop->high_pll = PLL_HIGH_DEFAULT; } +int goya_send_pci_access_msg(struct hl_device *hdev, u32 opcode) +{ + struct armcp_packet pkt; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = opcode << ARMCP_PKT_CTL_OPCODE_SHIFT; + + return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, + sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL); +} + /* * goya_pci_bars_map - Map PCI BARS of Goya device * @@ -510,6 +542,8 @@ static int goya_sw_init(struct hl_device *hdev) if (!goya) return -ENOMEM; + goya->test_cpu_queue = goya_test_cpu_queue; + /* according to goya_init_iatu */ goya->ddr_bar_cur_addr = DRAM_PHYS_BASE; hdev->asic_specific = goya; @@ -596,6 +630,311 @@ int goya_sw_fini(struct hl_device *hdev) return 0; } +static void goya_init_dma_qman(struct hl_device *hdev, int dma_id, + dma_addr_t bus_address) +{ + struct goya_device *goya = hdev->asic_specific; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmDMA_QM_0_PQ_BASE_LO + reg_off, lower_32_bits(bus_address)); + WREG32(mmDMA_QM_0_PQ_BASE_HI + reg_off, upper_32_bits(bus_address)); + + WREG32(mmDMA_QM_0_PQ_SIZE + reg_off, ilog2(HL_QUEUE_LENGTH)); + WREG32(mmDMA_QM_0_PQ_PI + reg_off, 0); + WREG32(mmDMA_QM_0_PQ_CI + reg_off, 0); + + WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmDMA_QM_0_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmDMA_QM_0_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmDMA_QM_0_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + WREG32(mmDMA_QM_0_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_DMA0_QM + dma_id); + + /* PQ has buffer of 2 cache lines, while CQ has 8 lines */ + WREG32(mmDMA_QM_0_PQ_CFG1 + reg_off, 0x00020002); + WREG32(mmDMA_QM_0_CQ_CFG1 + reg_off, 0x00080008); + + if (dma_id == 0) + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED); + else + if (goya->hw_cap_initialized & HW_CAP_MMU) + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, + QMAN_DMA_PARTLY_TRUSTED); + else + WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, + QMAN_DMA_FULLY_TRUSTED); + + WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN); + WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE); +} + +static void goya_init_dma_ch(struct hl_device *hdev, int dma_id) +{ + u32 gic_base_lo, gic_base_hi; + u64 sob_addr; + u32 reg_off = dma_id * (mmDMA_CH_1_CFG1 - mmDMA_CH_0_CFG1); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmDMA_CH_0_ERRMSG_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmDMA_CH_0_ERRMSG_ADDR_HI + reg_off, gic_base_hi); + WREG32(mmDMA_CH_0_ERRMSG_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_DMA0_CH + dma_id); + + if (dma_id) { + sob_addr = CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1000 + + (dma_id - 1) * 4; + WREG32(mmDMA_CH_0_WR_COMP_ADDR_LO + reg_off, + lower_32_bits(sob_addr)); + WREG32(mmDMA_CH_0_WR_COMP_ADDR_HI + reg_off, + upper_32_bits(sob_addr)); + WREG32(mmDMA_CH_0_WR_COMP_WDATA + reg_off, 0x80000001); + } +} + +/* + * goya_init_dma_qmans - Initialize QMAN DMA registers + * + * @hdev: pointer to hl_device structure + * + * Initialize the H/W registers of the QMAN DMA channels + * + */ +static void goya_init_dma_qmans(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + struct hl_hw_queue *q; + dma_addr_t bus_address; + int i; + + if (goya->hw_cap_initialized & HW_CAP_DMA) + return; + + q = &hdev->kernel_queues[0]; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) { + bus_address = q->bus_address + + hdev->asic_prop.host_phys_base_address; + + goya_init_dma_qman(hdev, i, bus_address); + goya_init_dma_ch(hdev, i); + } + + goya->hw_cap_initialized |= HW_CAP_DMA; +} + +/* + * goya_disable_external_queues - Disable external queues + * + * @hdev: pointer to hl_device structure + * + */ +static void goya_disable_external_queues(struct hl_device *hdev) +{ + WREG32(mmDMA_QM_0_GLBL_CFG0, 0); + WREG32(mmDMA_QM_1_GLBL_CFG0, 0); + WREG32(mmDMA_QM_2_GLBL_CFG0, 0); + WREG32(mmDMA_QM_3_GLBL_CFG0, 0); + WREG32(mmDMA_QM_4_GLBL_CFG0, 0); +} + +static int goya_stop_queue(struct hl_device *hdev, u32 cfg_reg, + u32 cp_sts_reg, u32 glbl_sts0_reg) +{ + int rc; + u32 status; + + /* use the values of TPC0 as they are all the same*/ + + WREG32(cfg_reg, 1 << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + + status = RREG32(cp_sts_reg); + if (status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK) { + rc = hl_poll_timeout( + hdev, + cp_sts_reg, + status, + !(status & TPC0_QM_CP_STS_FENCE_IN_PROGRESS_MASK), + 1000, + QMAN_FENCE_TIMEOUT_USEC); + + /* if QMAN is stuck in fence no need to check for stop */ + if (rc) + return 0; + } + + rc = hl_poll_timeout( + hdev, + glbl_sts0_reg, + status, + (status & TPC0_QM_GLBL_STS0_CP_IS_STOP_MASK), + 1000, + QMAN_STOP_TIMEOUT_USEC); + + if (rc) { + dev_err(hdev->dev, + "Timeout while waiting for QMAN to stop\n"); + return -EINVAL; + } + + return 0; +} + +/* + * goya_stop_external_queues - Stop external queues + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +static int goya_stop_external_queues(struct hl_device *hdev) +{ + int rc, retval = 0; + + rc = goya_stop_queue(hdev, + mmDMA_QM_0_GLBL_CFG1, + mmDMA_QM_0_CP_STS, + mmDMA_QM_0_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 0\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_1_GLBL_CFG1, + mmDMA_QM_1_CP_STS, + mmDMA_QM_1_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 1\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_2_GLBL_CFG1, + mmDMA_QM_2_CP_STS, + mmDMA_QM_2_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 2\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_3_GLBL_CFG1, + mmDMA_QM_3_CP_STS, + mmDMA_QM_3_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 3\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmDMA_QM_4_GLBL_CFG1, + mmDMA_QM_4_CP_STS, + mmDMA_QM_4_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop DMA QMAN 4\n"); + retval = -EIO; + } + + return retval; +} + +static void goya_resume_external_queues(struct hl_device *hdev) +{ + WREG32(mmDMA_QM_0_GLBL_CFG1, 0); + WREG32(mmDMA_QM_1_GLBL_CFG1, 0); + WREG32(mmDMA_QM_2_GLBL_CFG1, 0); + WREG32(mmDMA_QM_3_GLBL_CFG1, 0); + WREG32(mmDMA_QM_4_GLBL_CFG1, 0); +} + +/* + * goya_init_cpu_queues - Initialize PQ/CQ/EQ of CPU + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +int goya_init_cpu_queues(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + dma_addr_t bus_address; + u32 status; + struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; + int err; + + if (!hdev->cpu_queues_enable) + return 0; + + if (goya->hw_cap_initialized & HW_CAP_CPU_Q) + return 0; + + bus_address = cpu_pq->bus_address + + hdev->asic_prop.host_phys_base_address; + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address)); + + bus_address = hdev->cpu_accessible_dma_address + + hdev->asic_prop.host_phys_base_address; + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address)); + + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_10, CPU_ACCESSIBLE_MEM_SIZE); + + /* Used for EQ CI */ + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, 0); + + WREG32(mmCPU_IF_PF_PQ_PI, 0); + + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, PQ_INIT_STATUS_READY_FOR_CP); + + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_PI_UPDATE); + + err = hl_poll_timeout( + hdev, + mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, + status, + (status == PQ_INIT_STATUS_READY_FOR_HOST), + 1000, + GOYA_CPU_TIMEOUT_USEC); + + if (err) { + dev_err(hdev->dev, + "Failed to communicate with ARM CPU (ArmCP timeout)\n"); + return -EIO; + } + + goya->hw_cap_initialized |= HW_CAP_CPU_Q; + return 0; +} + static void goya_set_pll_refclk(struct hl_device *hdev) { WREG32(mmCPU_PLL_DIV_SEL_0, 0x0); @@ -1023,144 +1362,644 @@ static void goya_init_golden_registers(struct hl_device *hdev) goya->hw_cap_initialized |= HW_CAP_GOLDEN; } - -/* - * goya_push_fw_to_device - Push FW code to device - * - * @hdev: pointer to hl_device structure - * - * Copy fw code from firmware file to device memory. - * Returns 0 on success - * - */ -static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name, - void __iomem *dst) +static void goya_init_mme_qman(struct hl_device *hdev) { - const struct firmware *fw; - const u64 *fw_data; - size_t fw_size, i; - int rc; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; - rc = request_firmware(&fw, fw_name, hdev->dev); + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - if (rc) { - dev_err(hdev->dev, "Failed to request %s\n", fw_name); - goto out; - } + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); - fw_size = fw->size; - if ((fw_size % 4) != 0) { - dev_err(hdev->dev, "illegal %s firmware size %zu\n", - fw_name, fw_size); - rc = -EINVAL; - goto out; - } + qman_base_addr = hdev->asic_prop.sram_base_address + + MME_QMAN_BASE_OFFSET; - dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size); + WREG32(mmMME_QM_PQ_BASE_LO, lower_32_bits(qman_base_addr)); + WREG32(mmMME_QM_PQ_BASE_HI, upper_32_bits(qman_base_addr)); + WREG32(mmMME_QM_PQ_SIZE, ilog2(MME_QMAN_LENGTH)); + WREG32(mmMME_QM_PQ_PI, 0); + WREG32(mmMME_QM_PQ_CI, 0); + WREG32(mmMME_QM_CP_LDMA_SRC_BASE_LO_OFFSET, 0x10C0); + WREG32(mmMME_QM_CP_LDMA_SRC_BASE_HI_OFFSET, 0x10C4); + WREG32(mmMME_QM_CP_LDMA_TSIZE_OFFSET, 0x10C8); + WREG32(mmMME_QM_CP_LDMA_COMMIT_OFFSET, 0x10CC); - fw_data = (const u64 *) fw->data; + WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_LO, mtr_base_lo); + WREG32(mmMME_QM_CP_MSG_BASE0_ADDR_HI, mtr_base_hi); + WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_LO, so_base_lo); + WREG32(mmMME_QM_CP_MSG_BASE1_ADDR_HI, so_base_hi); - if ((fw->size % 8) != 0) - fw_size -= 8; + /* QMAN CQ has 8 cache lines */ + WREG32(mmMME_QM_CQ_CFG1, 0x00080008); - for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { - if (!(i & (0x80000 - 1))) { - dev_dbg(hdev->dev, - "copied so far %zu out of %zu for %s firmware", - i, fw_size, fw_name); - usleep_range(20, 100); - } + WREG32(mmMME_QM_GLBL_ERR_ADDR_LO, gic_base_lo); + WREG32(mmMME_QM_GLBL_ERR_ADDR_HI, gic_base_hi); - writeq(*fw_data, dst); - } + WREG32(mmMME_QM_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_QM); - if ((fw->size % 8) != 0) - writel(*(const u32 *) fw_data, dst); + WREG32(mmMME_QM_GLBL_ERR_CFG, QMAN_MME_ERR_MSG_EN); -out: - release_firmware(fw); - return rc; + WREG32(mmMME_QM_GLBL_PROT, QMAN_MME_ERR_PROT); + + WREG32(mmMME_QM_GLBL_CFG0, QMAN_MME_ENABLE); } -static int goya_pldm_init_cpu(struct hl_device *hdev) +static void goya_init_mme_cmdq(struct hl_device *hdev) { - char fw_name[200]; - void __iomem *dst; - u32 val, unit_rst_val; - int rc; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; - /* Must initialize SRAM scrambler before pushing u-boot to SRAM */ - goya_init_golden_registers(hdev); + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - /* Put ARM cores into reset */ - WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT); - val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); - /* Reset the CA53 MACRO */ - unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); - WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET); - val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); - WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val); - val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + qman_base_addr = hdev->asic_prop.sram_base_address + + MME_QMAN_BASE_OFFSET; - snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin"); - dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET; - rc = goya_push_fw_to_device(hdev, fw_name, dst); - if (rc) - return rc; + WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo); + WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi); + WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO, so_base_lo); + WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_HI, so_base_hi); - snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb"); - dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET; - rc = goya_push_fw_to_device(hdev, fw_name, dst); - if (rc) - return rc; + /* CMDQ CQ has 20 cache lines */ + WREG32(mmMME_CMDQ_CQ_CFG1, 0x00140014); - WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY); - WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA); + WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_LO, gic_base_lo); + WREG32(mmMME_CMDQ_GLBL_ERR_ADDR_HI, gic_base_hi); - WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0, - lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); - WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0, - upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + WREG32(mmMME_CMDQ_GLBL_ERR_WDATA, GOYA_ASYNC_EVENT_ID_MME_CMDQ); - /* Release ARM core 0 from reset */ - WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, - CPU_RESET_CORE0_DEASSERT); - val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + WREG32(mmMME_CMDQ_GLBL_ERR_CFG, CMDQ_MME_ERR_MSG_EN); - return 0; + WREG32(mmMME_CMDQ_GLBL_PROT, CMDQ_MME_ERR_PROT); + + WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE); } -/* - * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx. - * The version string should be located by that offset. - */ -static void goya_read_device_fw_version(struct hl_device *hdev, - enum goya_fw_component fwc) +static void goya_init_mme_qmans(struct hl_device *hdev) { - const char *name; - u32 ver_off; - char *dest; + struct goya_device *goya = hdev->asic_specific; + u32 so_base_lo, so_base_hi; - switch (fwc) { - case FW_COMP_UBOOT: - ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29); - dest = hdev->asic_prop.uboot_ver; - name = "U-Boot"; - break; - case FW_COMP_PREBOOT: - ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28); - dest = hdev->asic_prop.preboot_ver; - name = "Preboot"; - break; - default: - dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + if (goya->hw_cap_initialized & HW_CAP_MME) return; - } - ver_off &= ~((u32)SRAM_BASE_ADDR); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); - if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) { + WREG32(mmMME_SM_BASE_ADDRESS_LOW, so_base_lo); + WREG32(mmMME_SM_BASE_ADDRESS_HIGH, so_base_hi); + + goya_init_mme_qman(hdev); + goya_init_mme_cmdq(hdev); + + goya->hw_cap_initialized |= HW_CAP_MME; +} + +static void goya_init_tpc_qman(struct hl_device *hdev, u32 base_off, int tpc_id) +{ + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u64 qman_base_addr; + u32 reg_off = tpc_id * (mmTPC1_QM_PQ_PI - mmTPC0_QM_PQ_PI); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + qman_base_addr = hdev->asic_prop.sram_base_address + base_off; + + WREG32(mmTPC0_QM_PQ_BASE_LO + reg_off, lower_32_bits(qman_base_addr)); + WREG32(mmTPC0_QM_PQ_BASE_HI + reg_off, upper_32_bits(qman_base_addr)); + WREG32(mmTPC0_QM_PQ_SIZE + reg_off, ilog2(TPC_QMAN_LENGTH)); + WREG32(mmTPC0_QM_PQ_PI + reg_off, 0); + WREG32(mmTPC0_QM_PQ_CI + reg_off, 0); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET + reg_off, 0x10C0); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_HI_OFFSET + reg_off, 0x10C4); + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET + reg_off, 0x10C8); + WREG32(mmTPC0_QM_CP_LDMA_COMMIT_OFFSET + reg_off, 0x10CC); + + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + + WREG32(mmTPC0_QM_CQ_CFG1 + reg_off, 0x00080008); + + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + + WREG32(mmTPC0_QM_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_TPC0_QM + tpc_id); + + WREG32(mmTPC0_QM_GLBL_ERR_CFG + reg_off, QMAN_TPC_ERR_MSG_EN); + + WREG32(mmTPC0_QM_GLBL_PROT + reg_off, QMAN_TPC_ERR_PROT); + + WREG32(mmTPC0_QM_GLBL_CFG0 + reg_off, QMAN_TPC_ENABLE); +} + +static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id) +{ + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 gic_base_lo, gic_base_hi; + u32 reg_off = tpc_id * (mmTPC1_CMDQ_CQ_CFG1 - mmTPC0_CMDQ_CQ_CFG1); + + mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + gic_base_lo = + lower_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + gic_base_hi = + upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR); + + WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_LO + reg_off, mtr_base_lo); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE0_ADDR_HI + reg_off, mtr_base_hi); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_LO + reg_off, so_base_lo); + WREG32(mmTPC0_CMDQ_CP_MSG_BASE1_ADDR_HI + reg_off, so_base_hi); + + WREG32(mmTPC0_CMDQ_CQ_CFG1 + reg_off, 0x00140014); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_LO + reg_off, gic_base_lo); + WREG32(mmTPC0_CMDQ_GLBL_ERR_ADDR_HI + reg_off, gic_base_hi); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_WDATA + reg_off, + GOYA_ASYNC_EVENT_ID_TPC0_CMDQ + tpc_id); + + WREG32(mmTPC0_CMDQ_GLBL_ERR_CFG + reg_off, CMDQ_TPC_ERR_MSG_EN); + + WREG32(mmTPC0_CMDQ_GLBL_PROT + reg_off, CMDQ_TPC_ERR_PROT); + + WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE); +} + +static void goya_init_tpc_qmans(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + u32 so_base_lo, so_base_hi; + u32 cfg_off = mmTPC1_CFG_SM_BASE_ADDRESS_LOW - + mmTPC0_CFG_SM_BASE_ADDRESS_LOW; + int i; + + if (goya->hw_cap_initialized & HW_CAP_TPC) + return; + + so_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + + for (i = 0 ; i < TPC_MAX_NUM ; i++) { + WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_LOW + i * cfg_off, + so_base_lo); + WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_HIGH + i * cfg_off, + so_base_hi); + } + + goya_init_tpc_qman(hdev, TPC0_QMAN_BASE_OFFSET, 0); + goya_init_tpc_qman(hdev, TPC1_QMAN_BASE_OFFSET, 1); + goya_init_tpc_qman(hdev, TPC2_QMAN_BASE_OFFSET, 2); + goya_init_tpc_qman(hdev, TPC3_QMAN_BASE_OFFSET, 3); + goya_init_tpc_qman(hdev, TPC4_QMAN_BASE_OFFSET, 4); + goya_init_tpc_qman(hdev, TPC5_QMAN_BASE_OFFSET, 5); + goya_init_tpc_qman(hdev, TPC6_QMAN_BASE_OFFSET, 6); + goya_init_tpc_qman(hdev, TPC7_QMAN_BASE_OFFSET, 7); + + for (i = 0 ; i < TPC_MAX_NUM ; i++) + goya_init_tpc_cmdq(hdev, i); + + goya->hw_cap_initialized |= HW_CAP_TPC; +} + +/* + * goya_disable_internal_queues - Disable internal queues + * + * @hdev: pointer to hl_device structure + * + */ +static void goya_disable_internal_queues(struct hl_device *hdev) +{ + WREG32(mmMME_QM_GLBL_CFG0, 0); + WREG32(mmMME_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC0_QM_GLBL_CFG0, 0); + WREG32(mmTPC0_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC1_QM_GLBL_CFG0, 0); + WREG32(mmTPC1_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC2_QM_GLBL_CFG0, 0); + WREG32(mmTPC2_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC3_QM_GLBL_CFG0, 0); + WREG32(mmTPC3_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC4_QM_GLBL_CFG0, 0); + WREG32(mmTPC4_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC5_QM_GLBL_CFG0, 0); + WREG32(mmTPC5_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC6_QM_GLBL_CFG0, 0); + WREG32(mmTPC6_CMDQ_GLBL_CFG0, 0); + + WREG32(mmTPC7_QM_GLBL_CFG0, 0); + WREG32(mmTPC7_CMDQ_GLBL_CFG0, 0); +} + +/* + * goya_stop_internal_queues - Stop internal queues + * + * @hdev: pointer to hl_device structure + * + * Returns 0 on success + * + */ +static int goya_stop_internal_queues(struct hl_device *hdev) +{ + int rc, retval = 0; + + /* + * Each queue (QMAN) is a separate H/W logic. That means that each + * QMAN can be stopped independently and failure to stop one does NOT + * mandate we should not try to stop other QMANs + */ + + rc = goya_stop_queue(hdev, + mmMME_QM_GLBL_CFG1, + mmMME_QM_CP_STS, + mmMME_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop MME QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmMME_CMDQ_GLBL_CFG1, + mmMME_CMDQ_CP_STS, + mmMME_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop MME CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC0_QM_GLBL_CFG1, + mmTPC0_QM_CP_STS, + mmTPC0_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 0 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC0_CMDQ_GLBL_CFG1, + mmTPC0_CMDQ_CP_STS, + mmTPC0_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 0 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC1_QM_GLBL_CFG1, + mmTPC1_QM_CP_STS, + mmTPC1_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 1 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC1_CMDQ_GLBL_CFG1, + mmTPC1_CMDQ_CP_STS, + mmTPC1_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 1 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC2_QM_GLBL_CFG1, + mmTPC2_QM_CP_STS, + mmTPC2_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 2 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC2_CMDQ_GLBL_CFG1, + mmTPC2_CMDQ_CP_STS, + mmTPC2_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 2 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC3_QM_GLBL_CFG1, + mmTPC3_QM_CP_STS, + mmTPC3_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 3 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC3_CMDQ_GLBL_CFG1, + mmTPC3_CMDQ_CP_STS, + mmTPC3_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 3 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC4_QM_GLBL_CFG1, + mmTPC4_QM_CP_STS, + mmTPC4_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 4 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC4_CMDQ_GLBL_CFG1, + mmTPC4_CMDQ_CP_STS, + mmTPC4_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 4 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC5_QM_GLBL_CFG1, + mmTPC5_QM_CP_STS, + mmTPC5_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 5 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC5_CMDQ_GLBL_CFG1, + mmTPC5_CMDQ_CP_STS, + mmTPC5_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 5 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC6_QM_GLBL_CFG1, + mmTPC6_QM_CP_STS, + mmTPC6_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 6 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC6_CMDQ_GLBL_CFG1, + mmTPC6_CMDQ_CP_STS, + mmTPC6_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 6 CMDQ\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC7_QM_GLBL_CFG1, + mmTPC7_QM_CP_STS, + mmTPC7_QM_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 7 QMAN\n"); + retval = -EIO; + } + + rc = goya_stop_queue(hdev, + mmTPC7_CMDQ_GLBL_CFG1, + mmTPC7_CMDQ_CP_STS, + mmTPC7_CMDQ_GLBL_STS0); + + if (rc) { + dev_err(hdev->dev, "failed to stop TPC 7 CMDQ\n"); + retval = -EIO; + } + + return retval; +} + +static void goya_resume_internal_queues(struct hl_device *hdev) +{ + WREG32(mmMME_QM_GLBL_CFG1, 0); + WREG32(mmMME_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC0_QM_GLBL_CFG1, 0); + WREG32(mmTPC0_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC1_QM_GLBL_CFG1, 0); + WREG32(mmTPC1_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC2_QM_GLBL_CFG1, 0); + WREG32(mmTPC2_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC3_QM_GLBL_CFG1, 0); + WREG32(mmTPC3_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC4_QM_GLBL_CFG1, 0); + WREG32(mmTPC4_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC5_QM_GLBL_CFG1, 0); + WREG32(mmTPC5_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC6_QM_GLBL_CFG1, 0); + WREG32(mmTPC6_CMDQ_GLBL_CFG1, 0); + + WREG32(mmTPC7_QM_GLBL_CFG1, 0); + WREG32(mmTPC7_CMDQ_GLBL_CFG1, 0); +} + + +/* + * goya_push_fw_to_device - Push FW code to device + * + * @hdev: pointer to hl_device structure + * + * Copy fw code from firmware file to device memory. + * Returns 0 on success + * + */ +static int goya_push_fw_to_device(struct hl_device *hdev, const char *fw_name, + void __iomem *dst) +{ + const struct firmware *fw; + const u64 *fw_data; + size_t fw_size, i; + int rc; + + rc = request_firmware(&fw, fw_name, hdev->dev); + + if (rc) { + dev_err(hdev->dev, "Failed to request %s\n", fw_name); + goto out; + } + + fw_size = fw->size; + if ((fw_size % 4) != 0) { + dev_err(hdev->dev, "illegal %s firmware size %zu\n", + fw_name, fw_size); + rc = -EINVAL; + goto out; + } + + dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size); + + fw_data = (const u64 *) fw->data; + + if ((fw->size % 8) != 0) + fw_size -= 8; + + for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { + if (!(i & (0x80000 - 1))) { + dev_dbg(hdev->dev, + "copied so far %zu out of %zu for %s firmware", + i, fw_size, fw_name); + usleep_range(20, 100); + } + + writeq(*fw_data, dst); + } + + if ((fw->size % 8) != 0) + writel(*(const u32 *) fw_data, dst); + +out: + release_firmware(fw); + return rc; +} + +static int goya_pldm_init_cpu(struct hl_device *hdev) +{ + char fw_name[200]; + void __iomem *dst; + u32 val, unit_rst_val; + int rc; + + /* Must initialize SRAM scrambler before pushing u-boot to SRAM */ + goya_init_golden_registers(hdev); + + /* Put ARM cores into reset */ + WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, CPU_RESET_ASSERT); + val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + + /* Reset the CA53 MACRO */ + unit_rst_val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, CA53_RESET); + val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + WREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N, unit_rst_val); + val = RREG32(mmPSOC_GLOBAL_CONF_UNIT_RST_N); + + snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin"); + dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET; + rc = goya_push_fw_to_device(hdev, fw_name, dst); + if (rc) + return rc; + + snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb"); + dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET; + rc = goya_push_fw_to_device(hdev, fw_name, dst); + if (rc) + return rc; + + WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_FIT_RDY); + WREG32(mmPSOC_GLOBAL_CONF_WARM_REBOOT, CPU_BOOT_STATUS_NA); + + WREG32(mmCPU_CA53_CFG_RST_ADDR_LSB_0, + lower_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + WREG32(mmCPU_CA53_CFG_RST_ADDR_MSB_0, + upper_32_bits(SRAM_BASE_ADDR + UBOOT_FW_OFFSET)); + + /* Release ARM core 0 from reset */ + WREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL, + CPU_RESET_CORE0_DEASSERT); + val = RREG32(mmCPU_CA53_CFG_ARM_RST_CONTROL); + + return 0; +} + +/* + * FW component passes an offset from SRAM_BASE_ADDR in SCRATCHPAD_xx. + * The version string should be located by that offset. + */ +static void goya_read_device_fw_version(struct hl_device *hdev, + enum goya_fw_component fwc) +{ + const char *name; + u32 ver_off; + char *dest; + + switch (fwc) { + case FW_COMP_UBOOT: + ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29); + dest = hdev->asic_prop.uboot_ver; + name = "U-Boot"; + break; + case FW_COMP_PREBOOT: + ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28); + dest = hdev->asic_prop.preboot_ver; + name = "Preboot"; + break; + default: + dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + return; + } + + ver_off &= ~((u32)SRAM_BASE_ADDR); + + if (ver_off < SRAM_SIZE - VERSION_MAX_LEN) { memcpy_fromio(dest, hdev->pcie_bar[SRAM_CFG_BAR_ID] + ver_off, VERSION_MAX_LEN); } else { @@ -1344,6 +2183,19 @@ static int goya_hw_init(struct hl_device *hdev) goya_init_security(hdev); + goya_init_dma_qmans(hdev); + + goya_init_mme_qmans(hdev); + + goya_init_tpc_qmans(hdev); + + rc = goya_init_cpu_queues(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n", + rc); + goto disable_queues; + } + /* CPU initialization is finished, we can now move to 48 bit DMA mask */ rc = pci_set_dma_mask(hdev->pdev, DMA_BIT_MASK(48)); if (rc) { @@ -1352,7 +2204,7 @@ static int goya_hw_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Unable to set pci dma mask to 32 bits\n"); - return rc; + goto disable_pci_access; } } @@ -1364,7 +2216,7 @@ static int goya_hw_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Unable to set pci consistent dma mask to 32 bits\n"); - return rc; + goto disable_pci_access; } } @@ -1372,6 +2224,14 @@ static int goya_hw_init(struct hl_device *hdev) val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); return 0; + +disable_pci_access: + goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS); +disable_queues: + goya_disable_internal_queues(hdev); + goya_disable_external_queues(hdev); + + return rc; } /* @@ -1460,12 +2320,40 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) int goya_suspend(struct hl_device *hdev) { - return 0; + int rc; + + rc = goya_stop_internal_queues(hdev); + + if (rc) { + dev_err(hdev->dev, "failed to stop internal queues\n"); + return rc; + } + + rc = goya_stop_external_queues(hdev); + + if (rc) { + dev_err(hdev->dev, "failed to stop external queues\n"); + return rc; + } + + rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS); + if (rc) + dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); + + return rc; } int goya_resume(struct hl_device *hdev) { - return 0; + int rc; + + goya_resume_external_queues(hdev); + goya_resume_internal_queues(hdev); + + rc = goya_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS); + if (rc) + dev_err(hdev->dev, "Failed to enable PCI access from CPU\n"); + return rc; } int goya_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) @@ -1489,6 +2377,101 @@ int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, return rc; } +void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) +{ + u32 db_reg_offset, db_value; + bool invalid_queue = false; + + switch (hw_queue_id) { + case GOYA_QUEUE_ID_DMA_0: + db_reg_offset = mmDMA_QM_0_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_1: + db_reg_offset = mmDMA_QM_1_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_2: + db_reg_offset = mmDMA_QM_2_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_3: + db_reg_offset = mmDMA_QM_3_PQ_PI; + break; + + case GOYA_QUEUE_ID_DMA_4: + db_reg_offset = mmDMA_QM_4_PQ_PI; + break; + + case GOYA_QUEUE_ID_CPU_PQ: + if (hdev->cpu_queues_enable) + db_reg_offset = mmCPU_IF_PF_PQ_PI; + else + invalid_queue = true; + break; + + case GOYA_QUEUE_ID_MME: + db_reg_offset = mmMME_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC0: + db_reg_offset = mmTPC0_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC1: + db_reg_offset = mmTPC1_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC2: + db_reg_offset = mmTPC2_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC3: + db_reg_offset = mmTPC3_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC4: + db_reg_offset = mmTPC4_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC5: + db_reg_offset = mmTPC5_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC6: + db_reg_offset = mmTPC6_QM_PQ_PI; + break; + + case GOYA_QUEUE_ID_TPC7: + db_reg_offset = mmTPC7_QM_PQ_PI; + break; + + default: + invalid_queue = true; + } + + if (invalid_queue) { + /* Should never get here */ + dev_err(hdev->dev, "h/w queue %d is invalid. Can't set pi\n", + hw_queue_id); + return; + } + + db_value = pi; + + /* ring the doorbell */ + WREG32(db_reg_offset, db_value); + + if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_PI_UPDATE); +} + +void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val) +{ + /* Not needed in Goya */ +} + void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { @@ -1501,6 +2484,315 @@ void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr, dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle); } +void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, + dma_addr_t *dma_handle, u16 *queue_len) +{ + void *base; + u32 offset; + + *dma_handle = hdev->asic_prop.sram_base_address; + + base = hdev->pcie_bar[SRAM_CFG_BAR_ID]; + + switch (queue_id) { + case GOYA_QUEUE_ID_MME: + offset = MME_QMAN_BASE_OFFSET; + *queue_len = MME_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC0: + offset = TPC0_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC1: + offset = TPC1_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC2: + offset = TPC2_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC3: + offset = TPC3_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC4: + offset = TPC4_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC5: + offset = TPC5_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC6: + offset = TPC6_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + case GOYA_QUEUE_ID_TPC7: + offset = TPC7_QMAN_BASE_OFFSET; + *queue_len = TPC_QMAN_LENGTH; + break; + default: + dev_err(hdev->dev, "Got invalid queue id %d\n", queue_id); + return NULL; + } + + base += offset; + *dma_handle += offset; + + return base; +} + +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, + u32 timeout, long *result) +{ + struct goya_device *goya = hdev->asic_specific; + struct armcp_packet *pkt; + dma_addr_t pkt_dma_addr; + u32 tmp; + int rc = 0; + + if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) { + if (result) + *result = 0; + return 0; + } + + if (len > CPU_CB_SIZE) { + dev_err(hdev->dev, "Invalid CPU message size of %d bytes\n", + len); + return -ENOMEM; + } + + pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len, + &pkt_dma_addr); + if (!pkt) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for packet to CPU\n"); + return -ENOMEM; + } + + memcpy(pkt, msg, len); + + mutex_lock(&hdev->send_cpu_message_lock); + + if (hdev->disabled) + goto out; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len, + pkt_dma_addr); + if (rc) { + dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc); + goto out; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) &pkt->fence, + timeout, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, + "Timeout while waiting for CPU packet fence\n"); + goto out; + } + + if (tmp == ARMCP_PACKET_FENCE_VAL) { + rc = (pkt->ctl & ARMCP_PKT_CTL_RC_MASK) >> + ARMCP_PKT_CTL_RC_SHIFT; + if (rc) { + dev_err(hdev->dev, + "F/W ERROR %d for CPU packet %d\n", + rc, (pkt->ctl & ARMCP_PKT_CTL_OPCODE_MASK) + >> ARMCP_PKT_CTL_OPCODE_SHIFT); + rc = -EINVAL; + } else if (result) { + *result = pkt->result; + } + } else { + dev_err(hdev->dev, "CPU packet wrong fence value\n"); + rc = -EINVAL; + } + +out: + mutex_unlock(&hdev->send_cpu_message_lock); + + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt); + + return rc; +} + +int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) +{ + struct packet_msg_prot *fence_pkt; + dma_addr_t pkt_dma_addr; + u32 fence_val, tmp; + dma_addr_t fence_dma_addr; + u32 *fence_ptr; + int rc; + + fence_val = GOYA_QMAN0_FENCE_VAL; + + fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate memory for queue testing\n"); + return -ENOMEM; + } + + *fence_ptr = 0; + + fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev, + sizeof(struct packet_msg_prot), + GFP_KERNEL, &pkt_dma_addr); + if (!fence_pkt) { + dev_err(hdev->dev, + "Failed to allocate packet for queue testing\n"); + rc = -ENOMEM; + goto free_fence_ptr; + } + + fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + fence_pkt->value = fence_val; + fence_pkt->addr = fence_dma_addr + + hdev->asic_prop.host_phys_base_address; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, + sizeof(struct packet_msg_prot), + pkt_dma_addr); + if (rc) { + dev_err(hdev->dev, + "Failed to send fence packet\n"); + goto free_pkt; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr, + GOYA_TEST_QUEUE_WAIT_USEC, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); + + if ((!rc) && (tmp == fence_val)) { + dev_info(hdev->dev, + "queue test on H/W queue %d succeeded\n", + hw_queue_id); + } else { + dev_err(hdev->dev, + "H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n", + hw_queue_id, (unsigned long long) fence_dma_addr, tmp); + rc = -EINVAL; + } + +free_pkt: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt, + pkt_dma_addr); +free_fence_ptr: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + fence_dma_addr); + return rc; +} + +int goya_test_cpu_queue(struct hl_device *hdev) +{ + struct armcp_packet test_pkt; + long result; + int rc; + + /* cpu_queues_enable flag is always checked in send cpu message */ + + memset(&test_pkt, 0, sizeof(test_pkt)); + + test_pkt.ctl = ARMCP_PACKET_TEST << ARMCP_PKT_CTL_OPCODE_SHIFT; + test_pkt.value = ARMCP_PACKET_FENCE_VAL; + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt, + sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result); + + if (!rc) + dev_info(hdev->dev, "queue test on CPU queue succeeded\n"); + else + dev_err(hdev->dev, "CPU queue test failed (0x%08lX)\n", result); + + return rc; +} + +static int goya_test_queues(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + int i, rc, ret_val = 0; + + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { + rc = goya_test_queue(hdev, i); + if (rc) + ret_val = -EINVAL; + } + + if (hdev->cpu_queues_enable) { + rc = goya->test_cpu_queue(hdev); + if (rc) + ret_val = -EINVAL; + } + + return ret_val; +} + +void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size, gfp_t mem_flags, + dma_addr_t *dma_handle) +{ + if (size > GOYA_DMA_POOL_BLK_SIZE) + return NULL; + + return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle); +} + +void goya_dma_pool_free(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr) +{ + dma_pool_free(hdev->dma_pool, vaddr, dma_addr); +} + +void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle) +{ + u64 kernel_addr; + + /* roundup to CPU_PKT_SIZE */ + size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK; + + kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size); + + *dma_handle = hdev->cpu_accessible_dma_address + + (kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem); + + return (void *) (uintptr_t) kernel_addr; +} + +void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, + void *vaddr) +{ + /* roundup to CPU_PKT_SIZE */ + size = (size + (CPU_PKT_SIZE - 1)) & CPU_PKT_MASK; + + gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr, + size); +} + + +static void goya_hw_queues_lock(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + + spin_lock(&goya->hw_queues_lock); +} + +static void goya_hw_queues_unlock(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + + spin_unlock(&goya->hw_queues_lock); +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -1512,8 +2804,19 @@ static const struct hl_asic_funcs goya_funcs = { .resume = goya_resume, .mmap = goya_mmap, .cb_mmap = goya_cb_mmap, + .ring_doorbell = goya_ring_doorbell, + .flush_pq_write = goya_flush_pq_write, .dma_alloc_coherent = goya_dma_alloc_coherent, .dma_free_coherent = goya_dma_free_coherent, + .get_int_queue_base = goya_get_int_queue_base, + .test_queues = goya_test_queues, + .dma_pool_zalloc = goya_dma_pool_zalloc, + .dma_pool_free = goya_dma_pool_free, + .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc, + .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free, + .hw_queues_lock = goya_hw_queues_lock, + .hw_queues_unlock = goya_hw_queues_unlock, + .send_cpu_message = goya_send_cpu_message }; /* diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 65cbb45d7083..791605cbecfe 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -11,7 +11,9 @@ #include #include "habanalabs.h" #include "include/hl_boot_if.h" +#include "include/goya/goya_packets.h" #include "include/goya/goya.h" +#include "include/goya/goya_async_events.h" #include "include/goya/goya_fw_if.h" #define NUMBER_OF_CMPLT_QUEUES 5 @@ -145,12 +147,17 @@ enum goya_fw_component { }; struct goya_device { + int (*test_cpu_queue)(struct hl_device *hdev); + /* TODO: remove hw_queues_lock after moving to scheduler code */ spinlock_t hw_queues_lock; u64 ddr_bar_cur_addr; u32 hw_cap_initialized; }; +int goya_test_cpu_queue(struct hl_device *hdev); +int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, + u32 timeout, long *result); void goya_init_security(struct hl_device *hdev); #endif /* GOYAP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e099f7a9dac2..2121babbebdc 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -9,6 +9,7 @@ #define HABANALABSP_H_ #include "include/armcp_if.h" +#include "include/qman_if.h" #define pr_fmt(fmt) "habanalabs: " fmt @@ -26,9 +27,36 @@ struct hl_device; struct hl_fpriv; +/** + * enum hl_queue_type - Supported QUEUE types. + * @QUEUE_TYPE_NA: queue is not available. + * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the + * host. + * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's + * memories and/or operates the compute engines. + * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU. + */ +enum hl_queue_type { + QUEUE_TYPE_NA, + QUEUE_TYPE_EXT, + QUEUE_TYPE_INT, + QUEUE_TYPE_CPU +}; + +/** + * struct hw_queue_properties - queue information. + * @type: queue type. + * @kmd_only: true if only KMD is allowed to send a job to this queue, false + * otherwise. + */ +struct hw_queue_properties { + enum hl_queue_type type; + u8 kmd_only; +}; /** * struct asic_fixed_properties - ASIC specific immutable properties. + * @hw_queues_props: H/W queues properties. * @uboot_ver: F/W U-boot version. * @preboot_ver: F/W Preboot version. * @sram_base_address: SRAM physical start address. @@ -59,6 +87,7 @@ struct hl_fpriv; * @tpc_enabled_mask: which TPCs are enabled. */ struct asic_fixed_properties { + struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES]; char uboot_ver[VERSION_MAX_LEN]; char preboot_ver[VERSION_MAX_LEN]; u64 sram_base_address; @@ -132,7 +161,89 @@ struct hl_cb { }; +/* + * QUEUES + */ + +struct hl_cs_job; + +/* + * Currently, there are two limitations on the maximum length of a queue: + * + * 1. The memory footprint of the queue. The current allocated space for the + * queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE, + * the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE, + * which currently is 4096/16 = 256 entries. + * + * To increase that, we need either to decrease the size of the + * BD (difficult), or allocate more than a single page (easier). + * + * 2. Because the size of the JOB handle field in the BD CTL / completion queue + * is 10-bit, we can have up to 1024 open jobs per hardware queue. + * Therefore, each queue can hold up to 1024 entries. + * + * HL_QUEUE_LENGTH is in units of struct hl_bd. + * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE + */ + +#define HL_PAGE_SIZE 4096 /* minimum page size */ +/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */ #define HL_QUEUE_LENGTH 256 +#define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) + +/* + * HL_CQ_LENGTH is in units of struct hl_cq_entry. + * HL_CQ_LENGTH should be <= HL_PAGE_SIZE + */ +#define HL_CQ_LENGTH HL_QUEUE_LENGTH +#define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) + + + +/** + * struct hl_hw_queue - describes a H/W transport queue. + * @shadow_queue: pointer to a shadow queue that holds pointers to jobs. + * @queue_type: type of queue. + * @kernel_address: holds the queue's kernel virtual address. + * @bus_address: holds the queue's DMA address. + * @pi: holds the queue's pi value. + * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci). + * @hw_queue_id: the id of the H/W queue. + * @int_queue_len: length of internal queue (number of entries). + * @valid: is the queue valid (we have array of 32 queues, not all of them + * exists). + */ +struct hl_hw_queue { + struct hl_cs_job **shadow_queue; + enum hl_queue_type queue_type; + u64 kernel_address; + dma_addr_t bus_address; + u32 pi; + u32 ci; + u32 hw_queue_id; + u16 int_queue_len; + u8 valid; +}; + +/** + * struct hl_cq - describes a completion queue + * @hdev: pointer to the device structure + * @kernel_address: holds the queue's kernel virtual address + * @bus_address: holds the queue's DMA address + * @hw_queue_id: the id of the matching H/W queue + * @ci: ci inside the queue + * @pi: pi inside the queue + * @free_slots_cnt: counter of free slots in queue + */ +struct hl_cq { + struct hl_device *hdev; + u64 kernel_address; + dma_addr_t bus_address; + u32 hw_queue_id; + u32 ci; + u32 pi; + atomic_t free_slots_cnt; +}; /* @@ -164,6 +275,8 @@ enum hl_asic_type { * @resume: handles IP specific H/W or SW changes for resume. * @mmap: mmap function, does nothing. * @cb_mmap: maps a CB. + * @ring_doorbell: increment PI on a given QMAN. + * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. * @dma_alloc_coherent: Allocate coherent DMA memory by calling * dma_alloc_coherent(). This is ASIC function because its * implementation is not trivial when the driver is loaded @@ -172,6 +285,16 @@ enum hl_asic_type { * This is ASIC function because its implementation is not * trivial when the driver is loaded in simulation mode * (not upstreamed). + * @get_int_queue_base: get the internal queue base address. + * @test_queues: run simple test on all queues for sanity check. + * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. + * size of allocation is HL_DMA_POOL_BLK_SIZE. + * @dma_pool_free: free small DMA allocation from pool. + * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool. + * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool. + * @hw_queues_lock: acquire H/W queues lock. + * @hw_queues_unlock: release H/W queues lock. + * @send_cpu_message: send buffer to ArmCP. */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -185,10 +308,27 @@ struct hl_asic_funcs { int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma); int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, u64 kaddress, phys_addr_t paddress, u32 size); + void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); + void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); void (*dma_free_coherent)(struct hl_device *hdev, size_t size, void *cpu_addr, dma_addr_t dma_handle); + void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id, + dma_addr_t *dma_handle, u16 *queue_len); + int (*test_queues)(struct hl_device *hdev); + void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size, + gfp_t mem_flags, dma_addr_t *dma_handle); + void (*dma_pool_free)(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr); + void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev, + size_t size, dma_addr_t *dma_handle); + void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, + size_t size, void *vaddr); + void (*hw_queues_lock)(struct hl_device *hdev); + void (*hw_queues_unlock)(struct hl_device *hdev); + int (*send_cpu_message)(struct hl_device *hdev, u32 *msg, + u16 len, u32 timeout, long *result); }; @@ -224,6 +364,17 @@ struct hl_ctx_mgr { }; + + +/** + * struct hl_cs_job - command submission job. + * @finish_work: workqueue object to run when job is completed. + * @id: the id of this job inside a CS. + */ +struct hl_cs_job { + struct work_struct finish_work; + u32 id; +}; /* * FILE PRIVATE STRUCTURE */ @@ -298,7 +449,11 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * @dev: realted kernel basic device structure. * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. + * @completion_queue: array of hl_cq. + * @cq_wq: work queue of completion queues for executing work in process context + * @eq_wq: work queue of event queue for executing work in process context. * @kernel_ctx: KMD context structure. + * @kernel_queues: array of hl_hw_queue. * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs. * @dma_pool: DMA pool for small allocations. * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address. @@ -312,6 +467,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * only a single process at a time. In addition, we need a * lock here so we can flush user processes which are opening * the device while we are trying to hard reset it + * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue. * @asic_prop: ASIC specific immutable properties. * @asic_funcs: ASIC specific functions. * @asic_specific: ASIC specific information to use only from ASIC files. @@ -331,7 +487,10 @@ struct hl_device { struct device *dev; char asic_name[16]; enum hl_asic_type asic_type; + struct hl_cq *completion_queue; + struct workqueue_struct *cq_wq; struct hl_ctx *kernel_ctx; + struct hl_hw_queue *kernel_queues; struct hl_cb_mgr kernel_cb_mgr; struct dma_pool *dma_pool; void *cpu_accessible_dma_mem; @@ -341,6 +500,7 @@ struct hl_device { struct mutex asid_mutex; /* TODO: remove fd_open_cnt_lock for multiple process support */ struct mutex fd_open_cnt_lock; + struct mutex send_cpu_message_lock; struct asic_fixed_properties asic_prop; const struct hl_asic_funcs *asic_funcs; void *asic_specific; @@ -358,6 +518,7 @@ struct hl_device { /* Parameters for bring-up */ u8 cpu_enable; u8 reset_pcilink; + u8 cpu_queues_enable; u8 fw_loading; u8 pldm; }; @@ -400,7 +561,18 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, u32 timeout_us, u32 *val); int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr, u32 timeout_us, u32 *val); - +int hl_hw_queues_create(struct hl_device *hdev); +void hl_hw_queues_destroy(struct hl_device *hdev); +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, + u32 cb_size, u64 cb_ptr); +u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); + +#define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1) +#define hl_pi_2_offset(pi) ((pi) & (HL_QUEUE_LENGTH - 1)) + +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id); +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q); int hl_asid_init(struct hl_device *hdev); void hl_asid_fini(struct hl_device *hdev); unsigned long hl_asid_alloc(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 59c2fd196659..93576249307b 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -169,6 +169,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, /* Parameters for bring-up - set them to defaults */ hdev->cpu_enable = 1; hdev->reset_pcilink = 0; + hdev->cpu_queues_enable = 1; hdev->fw_loading = 1; hdev->pldm = 0; @@ -176,6 +177,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, if (!hdev->cpu_enable) hdev->fw_loading = 0; + /* If we don't load FW, no need to initialize CPU queues */ + if (!hdev->fw_loading) + hdev->cpu_queues_enable = 0; + hdev->disabled = true; hdev->pdev = pdev; /* can be NULL in case of simulator device */ diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c new file mode 100644 index 000000000000..2ec43f36cdb8 --- /dev/null +++ b/drivers/misc/habanalabs/hw_queue.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include + +/* + * hl_queue_add_ptr - add to pi or ci and checks if it wraps around + * + * @ptr: the current pi/ci value + * @val: the amount to add + * + * Add val to ptr. It can go until twice the queue length. + */ +inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) +{ + ptr += val; + ptr &= ((HL_QUEUE_LENGTH << 1) - 1); + return ptr; +} + +static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) +{ + int delta = (q->pi - q->ci); + + if (delta >= 0) + return (queue_len - delta); + else + return (abs(delta) - queue_len); +} + +/* + * ext_queue_submit_bd - Submit a buffer descriptor to an external queue + * + * @hdev: pointer to habanalabs device structure + * @q: pointer to habanalabs queue structure + * @ctl: BD's control word + * @len: BD's length + * @ptr: BD's pointer + * + * This function assumes there is enough space on the queue to submit a new + * BD to it. It initializes the next BD and calls the device specific + * function to set the pi (and doorbell) + * + * This function must be called when the scheduler mutex is taken + * + */ +static void ext_queue_submit_bd(struct hl_devi