diff options
author | Catherine Sullivan <csully@google.com> | 2019-07-01 15:57:53 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2019-07-01 19:36:35 -0700 |
commit | f5cedc84a30d2d3d0e0a7f3eb53fbd66d9bf5517 (patch) | |
tree | 7fc062c823dde5a109a823b0d80f62cb90113ae1 | |
parent | 893ce44df56580fb878ca5af9c4a5fd87567da50 (diff) |
gve: Add transmit and receive support
Add support for passing traffic.
Signed-off-by: Catherine Sullivan <csully@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Jon Olson <jonolson@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Luigi Rizzo <lrizzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/device_drivers/google/gve.rst | 30 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/Makefile | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve.h | 260 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.c | 138 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.h | 83 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc.h | 113 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_main.c | 573 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx.c | 443 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx.c | 584 |
9 files changed, 2221 insertions, 5 deletions
diff --git a/Documentation/networking/device_drivers/google/gve.rst b/Documentation/networking/device_drivers/google/gve.rst index 7397c82f4c8f..df8974fb3270 100644 --- a/Documentation/networking/device_drivers/google/gve.rst +++ b/Documentation/networking/device_drivers/google/gve.rst @@ -42,6 +42,8 @@ The driver interacts with the device in the following ways: - See description below - Interrupts - See supported interrupts below + - Transmit and Receive Queues + - See description below Registers --------- @@ -80,3 +82,31 @@ Notification Block Interrupts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The notification block interrupts are used to tell the driver to poll the queues associated with that interrupt. + +The handler for these irqs schedule the napi for that block to run +and poll the queues. + +Traffic Queues +-------------- +gVNIC's queues are composed of a descriptor ring and a buffer and are +assigned to a notification block. + +The descriptor rings are power-of-two-sized ring buffers consisting of +fixed-size descriptors. They advance their head pointer using a __be32 +doorbell located in Bar2. The tail pointers are advanced by consuming +descriptors in-order and updating a __be32 counter. Both the doorbell +and the counter overflow to zero. + +Each queue's buffers must be registered in advance with the device as a +queue page list, and packet data can only be put in those pages. + +Transmit +~~~~~~~~ +gve maps the buffers for transmit rings into a FIFO and copies the packets +into the FIFO before sending them to the NIC. + +Receive +~~~~~~~ +The buffers for receive rings are put into a data ring that is the same +length as the descriptor ring and the head and tail pointers advance over +the rings together. diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index cec03ee6d931..a1890c93705b 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,4 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_adminq.o +gve-objs := gve_main.o gve_tx.o gve_rx.o gve_adminq.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 47fb86e5aeff..110ae46adc69 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -10,6 +10,8 @@ #include <linux/dma-mapping.h> #include <linux/netdevice.h> #include <linux/pci.h> +#include <linux/u64_stats_sync.h> +#include "gve_desc.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 @@ -20,18 +22,152 @@ #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 -/* 1 for management */ +/* Driver can alloc up to 2 segments for the header and 2 for the payload. */ +#define GVE_TX_MAX_IOVEC 4 +/* 1 for management, 1 for rx, 1 for tx */ #define GVE_MIN_MSIX 3 +/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */ +struct gve_rx_desc_queue { + struct gve_rx_desc *desc_ring; /* the descriptor ring */ + dma_addr_t bus; /* the bus for the desc_ring */ + u32 cnt; /* free-running total number of completed packets */ + u32 fill_cnt; /* free-running total number of descriptors posted */ + u32 mask; /* masks the cnt to the size of the ring */ + u8 seqno; /* the next expected seqno for this desc*/ +}; + +/* The page info for a single slot in the RX data queue */ +struct gve_rx_slot_page_info { + struct page *page; + void *page_address; + u32 page_offset; /* offset to write to in page */ +}; + +/* A list of pages registered with the device during setup and used by a queue + * as buffers + */ +struct gve_queue_page_list { + u32 id; /* unique id */ + u32 num_entries; + struct page **pages; /* list of num_entries pages */ + dma_addr_t *page_buses; /* the dma addrs of the pages */ +}; + +/* Each slot in the data ring has a 1:1 mapping to a slot in the desc ring */ +struct gve_rx_data_queue { + struct gve_rx_data_slot *data_ring; /* read by NIC */ + dma_addr_t data_bus; /* dma mapping of the slots */ + struct gve_rx_slot_page_info *page_info; /* page info of the buffers */ + struct gve_queue_page_list *qpl; /* qpl assigned to this queue */ + u32 mask; /* masks the cnt to the size of the ring */ + u32 cnt; /* free-running total number of completed packets */ +}; + +struct gve_priv; + +/* An RX ring that contains a power-of-two sized desc and data ring. */ +struct gve_rx_ring { + struct gve_priv *gve; + struct gve_rx_desc_queue desc; + struct gve_rx_data_queue data; + u64 rbytes; /* free-running bytes received */ + u64 rpackets; /* free-running packets received */ + u32 q_num; /* queue index */ + u32 ntfy_id; /* notification block index */ + struct gve_queue_resources *q_resources; /* head and tail pointer idx */ + dma_addr_t q_resources_bus; /* dma address for the queue resources */ + struct u64_stats_sync statss; /* sync stats for 32bit archs */ +}; + +/* A TX desc ring entry */ +union gve_tx_desc { + struct gve_tx_pkt_desc pkt; /* first desc for a packet */ + struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ +}; + +/* Tracks the memory in the fifo occupied by a segment of a packet */ +struct gve_tx_iovec { + u32 iov_offset; /* offset into this segment */ + u32 iov_len; /* length */ + u32 iov_padding; /* padding associated with this segment */ +}; + +/* Tracks the memory in the fifo occupied by the skb. Mapped 1:1 to a desc + * ring entry but only used for a pkt_desc not a seg_desc + */ +struct gve_tx_buffer_state { + struct sk_buff *skb; /* skb for this pkt */ + struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ +}; + +/* A TX buffer - each queue has one */ +struct gve_tx_fifo { + void *base; /* address of base of FIFO */ + u32 size; /* total size */ + atomic_t available; /* how much space is still available */ + u32 head; /* offset to write at */ + struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */ +}; + +/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */ +struct gve_tx_ring { + /* Cacheline 0 -- Accessed & dirtied during transmit */ + struct gve_tx_fifo tx_fifo; + u32 req; /* driver tracked head pointer */ + u32 done; /* driver tracked tail pointer */ + + /* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */ + __be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */ + u64 pkt_done; /* free-running - total packets completed */ + u64 bytes_done; /* free-running - total bytes completed */ + + /* Cacheline 2 -- Read-mostly fields */ + union gve_tx_desc *desc ____cacheline_aligned; + struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */ + struct netdev_queue *netdev_txq; + struct gve_queue_resources *q_resources; /* head and tail pointer idx */ + u32 mask; /* masks req and done down to queue size */ + + /* Slow-path fields */ + u32 q_num ____cacheline_aligned; /* queue idx */ + u32 stop_queue; /* count of queue stops */ + u32 wake_queue; /* count of queue wakes */ + u32 ntfy_id; /* notification block index */ + dma_addr_t bus; /* dma address of the descr ring */ + dma_addr_t q_resources_bus; /* dma address of the queue resources */ + struct u64_stats_sync statss; /* sync stats for 32bit archs */ +} ____cacheline_aligned; + +/* Wraps the info for one irq including the napi struct and the queues + * associated with that irq. + */ struct gve_notify_block { __be32 irq_db_index; /* idx into Bar2 - set by device, must be 1st */ char name[IFNAMSIZ + 16]; /* name registered with the kernel */ struct napi_struct napi; /* kernel napi struct for this block */ struct gve_priv *priv; + struct gve_tx_ring *tx; /* tx rings on this block */ + struct gve_rx_ring *rx; /* rx rings on this block */ } ____cacheline_aligned; +/* Tracks allowed and current queue settings */ +struct gve_queue_config { + u16 max_queues; + u16 num_queues; /* current */ +}; + +/* Tracks the available and used qpl IDs */ +struct gve_qpl_config { + u32 qpl_map_size; /* map memory size */ + unsigned long *qpl_id_map; /* bitmap of used qpl ids */ +}; + struct gve_priv { struct net_device *dev; + struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ + struct gve_rx_ring *rx; /* array of rx_cfg.num_queues */ + struct gve_queue_page_list *qpls; /* array of num qpls */ struct gve_notify_block *ntfy_blocks; /* array of num_ntfy_blks */ dma_addr_t ntfy_block_bus; struct msix_entry *msix_vectors; /* array of num_ntfy_blks + 1 */ @@ -41,7 +177,18 @@ struct gve_priv { dma_addr_t counter_array_bus; u16 num_event_counters; + u16 tx_desc_cnt; /* num desc per ring */ + u16 rx_desc_cnt; /* num desc per ring */ + u16 tx_pages_per_qpl; /* tx buffer length */ + u16 rx_pages_per_qpl; /* rx buffer length */ + u64 max_registered_pages; + u64 num_registered_pages; /* num pages registered with NIC */ + u32 rx_copybreak; /* copy packets smaller than this */ + u16 default_num_queues; /* default num queues to set up */ + struct gve_queue_config tx_cfg; + struct gve_queue_config rx_cfg; + struct gve_qpl_config qpl_cfg; /* map used QPL ids */ u32 num_ntfy_blks; /* spilt between TX and RX so must be even */ struct gve_registers __iomem *reg_bar0; /* see gve_register.h */ @@ -49,6 +196,9 @@ struct gve_priv { u32 msg_enable; /* level for netif* netdev print macros */ struct pci_dev *pdev; + /* metrics */ + u32 tx_timeo_cnt; + /* Admin queue - see gve_adminq.h*/ union gve_adminq_command *adminq; dma_addr_t adminq_bus_addr; @@ -132,4 +282,112 @@ static inline __be32 __iomem *gve_irq_doorbell(struct gve_priv *priv, { return &priv->db_bar2[be32_to_cpu(block->irq_db_index)]; } + +/* Returns the index into ntfy_blocks of the given tx ring's block + */ +static inline u32 gve_tx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) +{ + return queue_idx; +} + +/* Returns the index into ntfy_blocks of the given rx ring's block + */ +static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) +{ + return (priv->num_ntfy_blks / 2) + queue_idx; +} + +/* Returns the number of tx queue page lists + */ +static inline u32 gve_num_tx_qpls(struct gve_priv *priv) +{ + return priv->tx_cfg.num_queues; +} + +/* Returns the number of rx queue page lists + */ +static inline u32 gve_num_rx_qpls(struct gve_priv *priv) +{ + return priv->rx_cfg.num_queues; +} + +/* Returns a pointer to the next available tx qpl in the list of qpls + */ +static inline +struct gve_queue_page_list *gve_assign_tx_qpl(struct gve_priv *priv) +{ + int id = find_first_zero_bit(priv->qpl_cfg.qpl_id_map, + priv->qpl_cfg.qpl_map_size); + + /* we are out of tx qpls */ + if (id >= gve_num_tx_qpls(priv)) + return NULL; + + set_bit(id, priv->qpl_cfg.qpl_id_map); + return &priv->qpls[id]; +} + +/* Returns a pointer to the next available rx qpl in the list of qpls + */ +static inline +struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) +{ + int id = find_next_zero_bit(priv->qpl_cfg.qpl_id_map, + priv->qpl_cfg.qpl_map_size, + gve_num_tx_qpls(priv)); + + /* we are out of rx qpls */ + if (id == priv->qpl_cfg.qpl_map_size) + return NULL; + + set_bit(id, priv->qpl_cfg.qpl_id_map); + return &priv->qpls[id]; +} + +/* Unassigns the qpl with the given id + */ +static inline void gve_unassign_qpl(struct gve_priv *priv, int id) +{ + clear_bit(id, priv->qpl_cfg.qpl_id_map); +} + +/* Returns the correct dma direction for tx and rx qpls + */ +static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv, + int id) +{ + if (id < gve_num_tx_qpls(priv)) + return DMA_TO_DEVICE; + else + return DMA_FROM_DEVICE; +} + +/* Returns true if the max mtu allows page recycling */ +static inline bool gve_can_recycle_pages(struct net_device *dev) +{ + /* We can't recycle the pages if we can't fit a packet into half a + * page. + */ + return dev->max_mtu <= PAGE_SIZE / 2; +} + +/* buffers */ +int gve_alloc_page(struct device *dev, struct page **page, dma_addr_t *dma, + enum dma_data_direction); +void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, + enum dma_data_direction); +/* tx handling */ +netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev); +bool gve_tx_poll(struct gve_notify_block *block, int budget); +int gve_tx_alloc_rings(struct gve_priv *priv); +void gve_tx_free_rings(struct gve_priv *priv); +__be32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx); +/* rx handling */ +void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx); +bool gve_rx_poll(struct gve_notify_block *block, int budget); +int gve_rx_alloc_rings(struct gve_priv *priv); +void gve_rx_free_rings(struct gve_priv *priv); +bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, + netdev_features_t feat); #endif /* _GVE_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 4bf32e4e2810..c3ba7baf0107 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -190,6 +190,72 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv) return gve_adminq_execute_cmd(priv, &cmd); } +int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) +{ + struct gve_tx_ring *tx = &priv->tx[queue_index]; + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE); + cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { + .queue_id = cpu_to_be32(queue_index), + .reserved = 0, + .queue_resources_addr = cpu_to_be64(tx->q_resources_bus), + .tx_ring_addr = cpu_to_be64(tx->bus), + .queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id), + .ntfy_id = cpu_to_be32(tx->ntfy_id), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) +{ + struct gve_rx_ring *rx = &priv->rx[queue_index]; + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE); + cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { + .queue_id = cpu_to_be32(queue_index), + .index = cpu_to_be32(queue_index), + .reserved = 0, + .ntfy_id = cpu_to_be32(rx->ntfy_id), + .queue_resources_addr = cpu_to_be64(rx->q_resources_bus), + .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus), + .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus), + .queue_page_list_id = cpu_to_be32(rx->data.qpl->id), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE); + cmd.destroy_tx_queue = (struct gve_adminq_destroy_tx_queue) { + .queue_id = cpu_to_be32(queue_index), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE); + cmd.destroy_rx_queue = (struct gve_adminq_destroy_rx_queue) { + .queue_id = cpu_to_be32(queue_index), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + int gve_adminq_describe_device(struct gve_priv *priv) { struct gve_device_descriptor *descriptor; @@ -215,6 +281,25 @@ int gve_adminq_describe_device(struct gve_priv *priv) if (err) goto free_device_descriptor; + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { + netif_err(priv, drv, priv->dev, "Tx desc count %d too low\n", + priv->tx_desc_cnt); + err = -EINVAL; + goto free_device_descriptor; + } + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) + < PAGE_SIZE || + priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0]) + < PAGE_SIZE) { + netif_err(priv, drv, priv->dev, "Rx desc count %d too low\n", + priv->rx_desc_cnt); + err = -EINVAL; + goto free_device_descriptor; + } + priv->max_registered_pages = + be64_to_cpu(descriptor->max_registered_pages); mtu = be16_to_cpu(descriptor->mtu); if (mtu < ETH_MIN_MTU) { netif_err(priv, drv, priv->dev, "MTU %d below minimum MTU\n", @@ -227,6 +312,14 @@ int gve_adminq_describe_device(struct gve_priv *priv) ether_addr_copy(priv->dev->dev_addr, descriptor->mac); mac = descriptor->mac; netif_info(priv, drv, priv->dev, "MAC addr: %pM\n", mac); + priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl); + priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl); + if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) { + netif_err(priv, drv, priv->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", + priv->rx_pages_per_qpl); + priv->rx_desc_cnt = priv->rx_pages_per_qpl; + } + priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues); free_device_descriptor: dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor, @@ -234,6 +327,51 @@ free_device_descriptor: return err; } +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl) +{ + struct device *hdev = &priv->pdev->dev; + u32 num_entries = qpl->num_entries; + u32 size = num_entries * sizeof(qpl->page_buses[0]); + union gve_adminq_command cmd; + dma_addr_t page_list_bus; + __be64 *page_list; + int err; + int i; + + memset(&cmd, 0, sizeof(cmd)); + page_list = dma_alloc_coherent(hdev, size, &page_list_bus, GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + page_list[i] = cpu_to_be64(qpl->page_buses[i]); + + cmd.opcode = cpu_to_be32(GVE_ADMINQ_REGISTER_PAGE_LIST); + cmd.reg_page_list = (struct gve_adminq_register_page_list) { + .page_list_id = cpu_to_be32(qpl->id), + .num_pages = cpu_to_be32(num_entries), + .page_address_list_addr = cpu_to_be64(page_list_bus), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + dma_free_coherent(hdev, size, page_list, page_list_bus); + return err; +} + +int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_UNREGISTER_PAGE_LIST); + cmd.unreg_page_list = (struct gve_adminq_unregister_page_list) { + .page_list_id = cpu_to_be32(page_list_id), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu) { union gve_adminq_command cmd; diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index b860b0d69393..4dfa06edc0f8 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -13,6 +13,12 @@ enum gve_adminq_opcodes { GVE_ADMINQ_DESCRIBE_DEVICE = 0x1, GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES = 0x2, + GVE_ADMINQ_REGISTER_PAGE_LIST = 0x3, + GVE_ADMINQ_UNREGISTER_PAGE_LIST = 0x4, + GVE_ADMINQ_CREATE_TX_QUEUE = 0x5, + GVE_ADMINQ_CREATE_RX_QUEUE = 0x6, + GVE_ADMINQ_DESTROY_TX_QUEUE = 0x7, + GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, }; @@ -89,6 +95,70 @@ struct gve_adminq_configure_device_resources { static_assert(sizeof(struct gve_adminq_configure_device_resources) == 32); +struct gve_adminq_register_page_list { + __be32 page_list_id; + __be32 num_pages; + __be64 page_address_list_addr; +}; + +static_assert(sizeof(struct gve_adminq_register_page_list) == 16); + +struct gve_adminq_unregister_page_list { + __be32 page_list_id; +}; + +static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4); + +struct gve_adminq_create_tx_queue { + __be32 queue_id; + __be32 reserved; + __be64 queue_resources_addr; + __be64 tx_ring_addr; + __be32 queue_page_list_id; + __be32 ntfy_id; +}; + +static_assert(sizeof(struct gve_adminq_create_tx_queue) == 32); + +struct gve_adminq_create_rx_queue { + __be32 queue_id; + __be32 index; + __be32 reserved; + __be32 ntfy_id; + __be64 queue_resources_addr; + __be64 rx_desc_ring_addr; + __be64 rx_data_ring_addr; + __be32 queue_page_list_id; + u8 padding[4]; +}; + +static_assert(sizeof(struct gve_adminq_create_rx_queue) == 48); + +/* Queue resources that are shared with the device */ +struct gve_queue_resources { + union { + struct { + __be32 db_index; /* Device -> Guest */ + __be32 counter_index; /* Device -> Guest */ + }; + u8 reserved[64]; + }; +}; + +static_assert(sizeof(struct gve_queue_resources) == 64); + +struct gve_adminq_destroy_tx_queue { + __be32 queue_id; +}; + +static_assert(sizeof(struct gve_adminq_destroy_tx_queue) == 4); + +struct gve_adminq_destroy_rx_queue { + __be32 queue_id; +}; + +static_assert(sizeof(struct gve_adminq_destroy_rx_queue) == 4); + /* GVE Set Driver Parameter Types */ enum gve_set_driver_param_types { GVE_SET_PARAM_MTU = 0x1, @@ -109,7 +179,13 @@ union gve_adminq_command { union { struct gve_adminq_configure_device_resources configure_device_resources; + struct gve_adminq_create_tx_queue create_tx_queue; + struct gve_adminq_create_rx_queue create_rx_queue; + struct gve_adminq_destroy_tx_queue destroy_tx_queue; + struct gve_adminq_destroy_rx_queue destroy_rx_queue; struct gve_adminq_describe_device describe_device; + struct gve_adminq_register_page_list reg_page_list; + struct gve_adminq_unregister_page_list unreg_page_list; struct gve_adminq_set_driver_parameter set_driver_param; }; }; @@ -130,5 +206,12 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv, dma_addr_t db_array_bus_addr, u32 num_ntfy_blks); int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); +int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_id); +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl); +int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id); int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); #endif /* _GVE_ADMINQ_H */ diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h new file mode 100644 index 000000000000..54779871d52e --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_desc.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +/* GVE Transmit Descriptor formats */ + +#ifndef _GVE_DESC_H_ +#define _GVE_DESC_H_ + +#include <linux/build_bug.h> + +/* A note on seg_addrs + * + * Base addresses encoded in seg_addr are not assumed to be physical + * addresses. The ring format assumes these come from some linear address + * space. This could be physical memory, kernel virtual memory, user virtual + * memory. gVNIC uses lists of registered pages. Each queue is assumed + * to be associated with a single such linear address space to ensure a + * consistent meaning for seg_addrs posted to its rings. + */ + +struct gve_tx_pkt_desc { + u8 type_flags; /* desc type is lower 4 bits, flags upper */ + u8 l4_csum_offset; /* relative offset of L4 csum word */ + u8 l4_hdr_offset; /* Offset of start of L4 headers in packet */ + u8 desc_cnt; /* Total descriptors for this packet */ + __be16 len; /* Total length of this packet (in bytes) */ + __be16 seg_len; /* Length of this descriptor's segment */ + __be64 seg_addr; /* Base address (see note) of this segment */ +} __packed; + +struct gve_tx_seg_desc { + u8 type_flags; /* type is lower 4 bits, flags upper */ + u8 l3_offset; /* TSO: 2 byte units to start of IPH */ + __be16 reserved; + __be16 mss; /* TSO MSS */ + __be16 seg_len; + __be64 seg_addr; +} __packed; + +/* GVE Transmit Descriptor Types */ +#define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */ +#define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */ +#define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */ + +/* GVE Transmit Descriptor Flags for Std Pkts */ +#define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */ +#define GVE_TXF_TSTAMP BIT(2) /* Timestamp required */ + +/* GVE Transmit Descriptor Flags for TSO Segs */ +#define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */ + +/* GVE Receive Packet Descriptor */ +/* The start of an ethernet packet comes 2 bytes into the rx buffer. + * gVNIC adds this padding so that both the DMA and the L3/4 protocol header + * access is aligned. + */ +#define GVE_RX_PAD 2 + +struct gve_rx_desc { + u8 padding[48]; + __be32 rss_hash; /* Receive-side scaling hash (Toeplitz for gVNIC) */ + __be16 mss; + __be16 reserved; /* Reserved to zero */ + u8 hdr_len; /* Header length (L2-L4) including padding */ + u8 hdr_off; /* 64-byte-scaled offset into RX_DATA entry */ + __sum16 csum; /* 1's-complement partial checksum of L3+ bytes */ + __be16 len; /* Length of the received packet */ + __be16 flags_seq; /* Flags [15:3] and sequence number [2:0] (1-7) */ +} __packed; +static_assert(sizeof(struct gve_rx_desc) == 64); + +/* As with the Tx ring format, the qpl_offset entries below are offsets into an + * ordered list of registered pages. + */ +struct gve_rx_data_slot { + /* byte offset into the rx registered segment of this slot */ + __be64 qpl_offset; +}; + +/* GVE Recive Packet Descriptor Seq No */ +#define GVE_SEQNO(x) (be16_to_cpu(x) & 0x7) + +/* GVE Recive Packet Descriptor Flags */ +#define GVE_RXFLG(x) cpu_to_be16(1 << (3 + (x))) +#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ +#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ +#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */ +#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */ +#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */ +#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */ + +/* GVE IRQ */ +#define GVE_IRQ_ACK BIT(31) +#define GVE_IRQ_MASK BIT(30) +#define GVE_IRQ_EVENT BIT(29) + +static inline bool gve_needs_rss(__be16 flag) +{ + if (flag & GVE_RXF_FRAG) + return false; + if (flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) + return true; + return false; +} + +static inline u8 gve_next_seqno(u8 seq) +{ + return (seq + 1) == 8 ? 1 : seq + 1; +} +#endif /* _GVE_DESC_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 41d8cfa8f998..44462dce0254 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -16,6 +16,8 @@ #include "gve_adminq.h" #include "gve_register.h" +#define GVE_DEFAULT_RX_COPYBREAK (256) + #define DEFAULT_MSG_LEVEL (NETIF_MSG_DRV | NETIF_MSG_LINK) #define GVE_VERSION "1.0.0" #define GVE_VERSION_PREFIX "GVE-" @@ -23,6 +25,34 @@ static const char gve_version_str[] = GVE_VERSION; static const char gve_version_prefix[] = GVE_VERSION_PREFIX; +static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + struct gve_priv *priv = netdev_priv(dev); + unsigned int start; + int ring; + + if (priv->rx) { + for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { + do { + u64_stats_fetch_begin(&priv->rx[ring].statss); + s->rx_packets += priv->rx[ring].rpackets; + s->rx_bytes += priv->rx[ring].rbytes; + } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + start)); + } + } + if (priv->tx) { + for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { + do { + u64_stats_fetch_begin(&priv->tx[ring].statss); + s->tx_packets += priv->tx[ring].pkt_done; + s->tx_bytes += priv->tx[ring].bytes_done; + } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + start)); + } + } +} + static int gve_alloc_counter_array(struct gve_priv *priv) { priv->counter_array = @@ -52,9 +82,50 @@ static irqreturn_t gve_mgmnt_intr(int irq, void *arg) static irqreturn_t gve_intr(int irq, void *arg) { + struct gve_notify_block *block = arg; + struct gve_priv *priv = block->priv; + + iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); + napi_schedule_irqoff(&block->napi); return IRQ_HANDLED; } +static int gve_napi_poll(struct napi_struct *napi, int budget) +{ + struct gve_notify_block *block; + __be32 __iomem *irq_doorbell; + bool reschedule = false; + struct gve_priv *priv; + + block = container_of(napi, struct gve_notify_block, napi); + priv = block->priv; + + if (block->tx) + reschedule |= gve_tx_poll(block, budget); + if (block->rx) + reschedule |= gve_rx_poll(block, budget); + + if (reschedule) + return budget; + + napi_complete(napi); + irq_doorbell = gve_irq_doorbell(priv, block); + iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); + + /* Double check we have no extra work. + * Ensure unmask synchronizes with checking for work. + */ + dma_rmb(); + if (block->tx) + reschedule |= gve_tx_poll(block, -1); + if (block->rx) + reschedule |= gve_rx_poll(block, -1); + if (reschedule && napi_reschedule(napi)) + iowrite32be(GVE_IRQ_MASK, irq_doorbell); + + return 0; +} + static int gve_alloc_notify_blocks(struct gve_priv *priv) { int num_vecs_requested = priv->num_ntfy_blks + 1; @@ -79,10 +150,23 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) goto abort_with_msix_vectors; } if (vecs_enabled != num_vecs_requested) { - priv->num_ntfy_blks = (vecs_enabled - 1) & ~0x1; + int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1; + int vecs_per_type = new_num_ntfy_blks / 2; + int vecs_left = new_num_ntfy_blks % 2; + + priv->num_ntfy_blks = new_num_ntfy_blks; + priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues, + vecs_per_type); + priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues, + vecs_per_type + vecs_left); dev_err(&priv->pdev->dev, - "Only received %d msix. Lowering number of notification blocks to %d\n", - vecs_enabled, priv->num_ntfy_blks); + "Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n", + vecs_enabled, priv->tx_cfg.max_queues, + priv->rx_cfg.max_queues); + if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues) + priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; + if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues) + priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; } /* Half the notification blocks go to TX and half to RX */ active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus()); @@ -219,6 +303,463 @@ static void gve_teardown_device_resources(struct gve_priv *priv) gve_clear_device_resources_ok(priv); } +static void gve_add_napi(struct gve_priv *priv, int ntfy_idx) +{ + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + netif_napi_add(priv->dev, &block->napi, gve_napi_poll, + NAPI_POLL_WEIGHT); +} + +static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx) +{ + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + netif_napi_del(&block->napi); +} + +static int gve_register_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to register queue page list %d\n", + priv->qpls[i].id); + return err; + } + } + return 0; +} + +static int gve_unregister_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to unregister queue page list %d\n", + priv->qpls[i].id); + return err; + } + } + return 0; +} + +static int gve_create_rings(struct gve_priv *priv) +{ + int err; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_adminq_create_tx_queue(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create tx queue %d\n", + i); + return err; + } + netif_dbg(priv, drv, priv->dev, "created tx queue %d\n", i); + } + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_adminq_create_rx_queue(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create rx queue %d\n", + i); + return err; + } + /* Rx data ring has been prefilled with packet buffers at + * queue allocation ti |