diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 08412c279297bf..8d44493cc89bee 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -676,6 +676,18 @@ operations: reply: attributes: - id + - + name: bind-tx + doc: Bind dmabuf to netdev for TX + attribute-set: dmabuf + do: + request: + attributes: + - ifindex + - fd + reply: + attributes: + - id kernel-family: headers: [ "linux/list.h"] diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst index d9536364533149..cf0904a3397aff 100644 --- a/Documentation/networking/devmem.rst +++ b/Documentation/networking/devmem.rst @@ -62,7 +62,7 @@ More Info https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ -Interface +Rx Interface ========= @@ -235,6 +235,147 @@ can be less than the tokens provided by the user in case of: (a) an internal kernel leak bug. (b) the user passed more than 1024 frags. +TX Interface +============ + + +Example +------- + +./tools/testing/selftests/drivers/net/hw/ncdevmem:do_client shows an example of +setting up the TX path of this API. + + +NIC Setup +--------- + +The user must bind a TX dmabuf to a given NIC using the netlink API:: + + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + + tx_dmabuf_id = rsp->id; + + +The netlink API returns a dmabuf_id: a unique ID that refers to this dmabuf +that has been bound. + +The user can unbind the dmabuf from the netdevice by closing the netlink socket +that established the binding. We do this so that the binding is automatically +unbound even if the userspace process crashes. + +Note that any reasonably well-behaved dmabuf from any exporter should work with +devmem TCP, even if the dmabuf is not actually backed by devmem. An example of +this is udmabuf, which wraps user memory (non-devmem) in a dmabuf. + +Socket Setup +------------ + +The user application must use MSG_ZEROCOPY flag when sending devmem TCP. Devmem +cannot be copied by the kernel, so the semantics of the devmem TX are similar +to the semantics of MSG_ZEROCOPY:: + + setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + +It is also recommended that the user binds the TX socket to the same interface +the dma-buf has been bound to via SO_BINDTODEVICE:: + + setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname) + 1); + + +Sending data +------------ + +Devmem data is sent using the SCM_DEVMEM_DMABUF cmsg. + +The user should create a msghdr where, + +* iov_base is set to the offset into the dmabuf to start sending from +* iov_len is set to the number of bytes to be sent from the dmabuf + +The user passes the dma-buf id to send from via the dmabuf_tx_cmsg.dmabuf_id. + +The example below sends 1024 bytes from offset 100 into the dmabuf, and 2048 +from offset 2000 into the dmabuf. The dmabuf to send from is tx_dmabuf_id:: + + char ctrl_data[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))]; + struct dmabuf_tx_cmsg ddmabuf; + struct msghdr msg = {}; + struct cmsghdr *cmsg; + struct iovec iov[2]; + + iov[0].iov_base = (void*)100; + iov[0].iov_len = 1024; + iov[1].iov_base = (void*)2000; + iov[1].iov_len = 2048; + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct dmabuf_tx_cmsg)); + + ddmabuf.dmabuf_id = tx_dmabuf_id; + + *((struct dmabuf_tx_cmsg *)CMSG_DATA(cmsg)) = ddmabuf; + + sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + + +Reusing TX dmabufs +------------------ + +Similar to MSG_ZEROCOPY with regular memory, the user should not modify the +contents of the dma-buf while a send operation is in progress. This is because +the kernel does not keep a copy of the dmabuf contents. Instead, the kernel +will pin and send data from the buffer available to the userspace. + +Just as in MSG_ZEROCOPY, the kernel notifies the userspace of send completions +using MSG_ERRQUEUE:: + + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + int retries = 10; + __u32 hi, lo; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + serr = (void *)CMSG_DATA(cm); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stdout, "tx complete [%d,%d]\n", lo, hi); + } + } + +After the associated sendmsg has been completed, the dmabuf can be reused by +the userspace. + Implementation & Caveats ======================== diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 803dfc1efb7518..86aefb5938afcf 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -85,6 +85,7 @@ Contents: netdevices netfilter-sysctl netif-msg + netmem nexthop-group-resilient nf_conntrack-sysctl nf_flowtable diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a45..2d43d80af55ae4 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -9,6 +9,7 @@ Type Name fastpath_tx_access ..struct ..net_device unsigned_long:32 priv_flags read_mostly - __dev_queue_xmit(tx) unsigned_long:1 lltx read_mostly - HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx) +unsigned long:1 netmem_tx:1; read_mostly char name[16] - - struct_netdev_name_node* name_node struct_dev_ifalias* ifalias diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index 5014f7cc1398ba..02bd7536fc0ca2 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -188,3 +188,8 @@ Redundancy) frames from one port to another in hardware. This should be set for devices which duplicate outgoing HSR (High-availability Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically frames in hardware. + +* netmem-tx + +This should be set for devices which support netmem TX. See +Documentation/networking/netmem.rst diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst new file mode 100644 index 00000000000000..fa541a75c1f86c --- /dev/null +++ b/Documentation/networking/netmem.rst @@ -0,0 +1,98 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Netmem Support for Network Drivers +================================== + +This document outlines the requirements for network drivers to support netmem, +an abstract memory type that enables features like device memory TCP. By +supporting netmem, drivers can work with various underlying memory types +with little to no modification. + +Benefits of Netmem : + +* Flexibility: Netmem can be backed by different memory types (e.g., struct + page, DMA-buf), allowing drivers to support various use cases such as device + memory TCP. +* Future-proof: Drivers with netmem support are ready for upcoming + features that rely on it. +* Simplified Development: Drivers interact with a consistent API, + regardless of the underlying memory implementation. + +Driver Rx Requirements +====================== + +1. The driver must support page_pool. + +2. The driver must support the tcp-data-split ethtool option. + +3. The driver must use the page_pool netmem APIs for payload memory. The netmem + APIs currently 1-to-1 correspond with page APIs. Conversion to netmem should + be achievable by switching the page APIs to netmem APIs and tracking memory + via netmem_refs in the driver rather than struct page * : + + - page_pool_alloc -> page_pool_alloc_netmem + - page_pool_get_dma_addr -> page_pool_get_dma_addr_netmem + - page_pool_put_page -> page_pool_put_netmem + + Not all page APIs have netmem equivalents at the moment. If your driver + relies on a missing netmem API, feel free to add and propose to netdev@, or + reach out to the maintainers and/or almasrymina@google.com for help adding + the netmem API. + +4. The driver must use the following PP_FLAGS: + + - PP_FLAG_DMA_MAP: netmem is not dma-mappable by the driver. The driver + must delegate the dma mapping to the page_pool, which knows when + dma-mapping is (or is not) appropriate. + - PP_FLAG_DMA_SYNC_DEV: netmem dma addr is not necessarily dma-syncable + by the driver. The driver must delegate the dma syncing to the page_pool, + which knows when dma-syncing is (or is not) appropriate. + - PP_FLAG_ALLOW_UNREADABLE_NETMEM. The driver must specify this flag iff + tcp-data-split is enabled. + +5. The driver must not assume the netmem is readable and/or backed by pages. + The netmem returned by the page_pool may be unreadable, in which case + netmem_address() will return NULL. The driver must correctly handle + unreadable netmem, i.e. don't attempt to handle its contents when + netmem_address() is NULL. + + Ideally, drivers should not have to check the underlying netmem type via + helpers like netmem_is_net_iov() or convert the netmem to any of its + underlying types via netmem_to_page() or netmem_to_net_iov(). In most cases, + netmem or page_pool helpers that abstract this complexity are provided + (and more can be added). + +6. The driver must use page_pool_dma_sync_netmem_for_cpu() in lieu of + dma_sync_single_range_for_cpu(). For some memory providers, dma_syncing for + CPU will be done by the page_pool, for others (particularly dmabuf memory + provider), dma syncing for CPU is the responsibility of the userspace using + dmabuf APIs. The driver must delegate the entire dma-syncing operation to + the page_pool which will do it correctly. + +7. Avoid implementing driver-specific recycling on top of the page_pool. Drivers + cannot hold onto a struct page to do their own recycling as the netmem may + not be backed by a struct page. However, you may hold onto a page_pool + reference with page_pool_fragment_netmem() or page_pool_ref_netmem() for + that purpose, but be mindful that some netmem types might have longer + circulation times, such as when userspace holds a reference in zerocopy + scenarios. + +Driver TX Requirements +====================== + +1. The Driver must not pass the netmem dma_addr to any of the dma-mapping APIs + directly. This is because netmem dma_addrs may come from a source like + dma-buf that is not compatible with the dma-mapping APIs. + + Helpers like netmem_dma_unmap_page_attrs() & netmem_dma_unmap_addr_set() + should be used in lieu of dma_unmap_page[_attrs](), dma_unmap_addr_set(). + The netmem variants will handle netmem dma_addrs correctly regardless of the + source, delegating to the dma-mapping APIs when appropriate. + + Not all dma-mapping APIs have netmem equivalents at the moment. If your + driver relies on a missing netmem API, feel free to add and propose to + netdev@, or reach out to the maintainers and/or almasrymina@google.com for + help adding the netmem API. + +2. Driver should declare support by setting `netdev->netmem_tx = true` diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index 8641a00f8e6385..564862a57124e2 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -18,6 +18,7 @@ if NET_VENDOR_GOOGLE config GVE tristate "Google Virtual NIC (gVNIC) support" depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN)) + select PAGE_POOL help This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 9ed07080b38a25..4520f1c07a638f 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,5 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o \ + gve_buffer_mgmt_dqo.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 8ddd366d9fde54..93d8dde2b07805 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "gve_desc.h" @@ -60,11 +61,16 @@ #define GVE_DEFAULT_RX_BUFFER_OFFSET 2048 +#define GVE_PAGE_POOL_SIZE_MULTIPLIER 4 + #define GVE_FLOW_RULES_CACHE_SIZE \ (GVE_ADMINQ_BUFFER_SIZE / sizeof(struct gve_adminq_queried_flow_rule)) #define GVE_FLOW_RULE_IDS_CACHE_SIZE \ (GVE_ADMINQ_BUFFER_SIZE / sizeof(((struct gve_adminq_queried_flow_rule *)0)->location)) +#define GVE_RSS_KEY_SIZE 40 +#define GVE_RSS_INDIR_SIZE 128 + #define GVE_XDP_ACTIONS 5 #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 @@ -99,9 +105,13 @@ struct gve_rx_desc_queue { /* The page info for a single slot in the RX data queue */ struct gve_rx_slot_page_info { - struct page *page; + union { + struct page *page; /* QPL mode */ + netmem_ref netmem; /* RDA mode */ + }; void *page_address; u32 page_offset; /* offset to write to in page */ + unsigned int buf_size; int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ u16 pad; /* adjustment for rx padding */ u8 can_flip; /* tracks if the networking stack is using the page */ @@ -273,6 +283,8 @@ struct gve_rx_ring { /* Address info of the buffers for header-split */ struct gve_header_buf hdr_bufs; + + struct page_pool *page_pool; } dqo; }; @@ -283,6 +295,8 @@ struct gve_rx_ring { u32 fill_cnt; /* free-running total number of descs and buffs posted */ u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ u64 rx_hsplit_pkt; /* free-running packets with headers split */ + u64 rx_devmem_pkt; /* devmem packets processed */ + u64 rx_devmem_dropped; /* devmem pkts dropped */ u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ u64 rx_copied_pkt; /* free-running total number of copied packets */ u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ @@ -666,6 +680,7 @@ struct gve_rx_alloc_rings_cfg { u16 packet_buffer_size; bool raw_addressing; bool enable_header_split; + bool reset_rss; /* Allocated resources are returned here */ struct gve_rx_ring *rx; @@ -716,6 +731,11 @@ struct gve_flow_rules_cache { u32 rule_ids_cache_num; }; +struct gve_rss_config { + u8 *hash_key; + u32 *hash_lut; +}; + struct gve_priv { struct net_device *dev; struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ @@ -836,6 +856,8 @@ struct gve_priv { u16 rss_key_size; u16 rss_lut_size; + bool cache_rss_config; + struct gve_rss_config rss_config; }; enum gve_service_task_flags_bit { @@ -1173,6 +1195,36 @@ void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx); u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hplit); bool gve_header_split_supported(const struct gve_priv *priv); int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split); +/* rx buffer handling */ +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs); +void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, + bool free_page); +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx); +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list); +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx); +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct); +int gve_alloc_qpl_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state); +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc); +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx); + /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); @@ -1184,13 +1236,16 @@ int gve_adjust_config(struct gve_priv *priv, struct gve_rx_alloc_rings_cfg *rx_alloc_cfg); int gve_adjust_queues(struct gve_priv *priv, struct gve_queue_config new_rx_config, - struct gve_queue_config new_tx_config); + struct gve_queue_config new_tx_config, + bool reset_rss); /* flow steering rule */ int gve_get_flow_rule_entry(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_get_flow_rule_ids(struct gve_priv *priv, struct ethtool_rxnfc *cmd, u32 *rule_locs); int gve_add_flow_rule(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_del_flow_rule(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_flow_rules_reset(struct gve_priv *priv); +/* RSS config */ +int gve_init_rss_config(struct gve_priv *priv, u16 num_queues); /* report stats handling */ void gve_handle_report_stats(struct gve_priv *priv); /* exported by ethtool.c */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 060e0e6749380f..b7585a0f4221b1 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -885,6 +885,15 @@ static void gve_set_default_desc_cnt(struct gve_priv *priv, priv->min_rx_desc_cnt = priv->rx_desc_cnt; } +static void gve_set_default_rss_sizes(struct gve_priv *priv) +{ + if (!gve_is_gqi(priv)) { + priv->rss_key_size = GVE_RSS_KEY_SIZE; + priv->rss_lut_size = GVE_RSS_INDIR_SIZE; + priv->cache_rss_config = true; + } +} + static void gve_enable_supported_features(struct gve_priv *priv, u32 supported_features_mask, const struct gve_device_option_jumbo_frames @@ -968,6 +977,10 @@ static void gve_enable_supported_features(struct gve_priv *priv, be16_to_cpu(dev_op_rss_config->hash_key_size); priv->rss_lut_size = be16_to_cpu(dev_op_rss_config->hash_lut_size); + priv->cache_rss_config = false; + dev_dbg(&priv->pdev->dev, + "RSS device option enabled with key size of %u, lut size of %u.\n", + priv->rss_key_size, priv->rss_lut_size); } } @@ -1052,6 +1065,8 @@ int gve_adminq_describe_device(struct gve_priv *priv) /* set default descriptor counts */ gve_set_default_desc_cnt(priv, descriptor); + gve_set_default_rss_sizes(priv); + /* DQO supports LRO. */ if (!gve_is_gqi(priv)) priv->dev->hw_features |= NETIF_F_LRO; @@ -1290,8 +1305,9 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv) int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *rxfh) { + const u32 *hash_lut_to_config = NULL; + const u8 *hash_key_to_config = NULL; dma_addr_t lut_bus = 0, key_bus = 0; - u16 key_size = 0, lut_size = 0; union gve_adminq_command cmd; __be32 *lut = NULL; u8 hash_alg = 0; @@ -1301,7 +1317,7 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r switch (rxfh->hfunc) { case ETH_RSS_HASH_NO_CHANGE: - break; + fallthrough; case ETH_RSS_HASH_TOP: hash_alg = ETH_RSS_HASH_TOP; break; @@ -1310,27 +1326,46 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r } if (rxfh->indir) { - lut_size = priv->rss_lut_size; + if (rxfh->indir_size != priv->rss_lut_size) + return -EINVAL; + + hash_lut_to_config = rxfh->indir; + } else if (priv->cache_rss_config) { + hash_lut_to_config = priv->rss_config.hash_lut; + } + + if (hash_lut_to_config) { lut = dma_alloc_coherent(&priv->pdev->dev, - lut_size * sizeof(*lut), + priv->rss_lut_size * sizeof(*lut), &lut_bus, GFP_KERNEL); if (!lut) return -ENOMEM; for (i = 0; i < priv->rss_lut_size; i++) - lut[i] = cpu_to_be32(rxfh->indir[i]); + lut[i] = cpu_to_be32(hash_lut_to_config[i]); } if (rxfh->key) { - key_size = priv->rss_key_size; + if (rxfh->key_size != priv->rss_key_size) { + err = -EINVAL; + goto out; + } + + hash_key_to_config = rxfh->key; + } else if (priv->cache_rss_config) { + hash_key_to_config = priv->rss_config.hash_key; + } + + if (hash_key_to_config) { key = dma_alloc_coherent(&priv->pdev->dev, - key_size, &key_bus, GFP_KERNEL); + priv->rss_key_size, + &key_bus, GFP_KERNEL); if (!key) { err = -ENOMEM; goto out; } - memcpy(key, rxfh->key, key_size); + memcpy(key, hash_key_to_config, priv->rss_key_size); } /* Zero-valued fields in the cmd.configure_rss instruct the device to @@ -1344,8 +1379,10 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r BIT(GVE_RSS_HASH_TCPV6) | BIT(GVE_RSS_HASH_UDPV6)), .hash_alg = hash_alg, - .hash_key_size = cpu_to_be16(key_size), - .hash_lut_size = cpu_to_be16(lut_size), + .hash_key_size = + cpu_to_be16((key_bus) ? priv->rss_key_size : 0), + .hash_lut_size = + cpu_to_be16((lut_bus) ? priv->rss_lut_size : 0), .hash_key_addr = cpu_to_be64(key_bus), .hash_lut_addr = cpu_to_be64(lut_bus), }; @@ -1355,11 +1392,11 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r out: if (lut) dma_free_coherent(&priv->pdev->dev, - lut_size * sizeof(*lut), + priv->rss_lut_size * sizeof(*lut), lut, lut_bus); if (key) dma_free_coherent(&priv->pdev->dev, - key_size, key, key_bus); + priv->rss_key_size, key, key_bus); return err; } @@ -1463,12 +1500,15 @@ static int gve_adminq_process_rss_query(struct gve_priv *priv, rxfh->hfunc = descriptor->hash_alg; rss_info_addr = (void *)(descriptor + 1); - if (rxfh->key) + if (rxfh->key) { + rxfh->key_size = priv->rss_key_size; memcpy(rxfh->key, rss_info_addr, priv->rss_key_size); + } rss_info_addr += priv->rss_key_size; lut = (__be32 *)rss_info_addr; if (rxfh->indir) { + rxfh->indir_size = priv->rss_lut_size; for (i = 0; i < priv->rss_lut_size; i++) rxfh->indir[i] = be32_to_cpu(lut[i]); } diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c new file mode 100644 index 00000000000000..1d11b043fd820f --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2024 Google, Inc. + */ + +#include "gve.h" +#include "gve_utils.h" + +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ + return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = rx->dqo.free_buf_states; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from free list */ + rx->dqo.free_buf_states = buf_state->next; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + return buf_state->next == buffer_id; +} + +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = rx->dqo.free_buf_states; + rx->dqo.free_buf_states = buffer_id; +} + +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = list->head; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from list */ + list->head = buf_state->next; + if (buf_state->next == -1) + list->tail = -1; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = -1; + + if (list->head == -1) { + list->head = buffer_id; + list->tail = buffer_id; + } else { + int tail = list->tail; + + rx->dqo.buf_states[tail].next = buffer_id; + list->tail = buffer_id; + } +} + +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + int i; + + /* Recycled buf states are immediately usable. */ + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); + if (likely(buf_state)) + return buf_state; + + if (unlikely(rx->dqo.used_buf_states.head == -1)) + return NULL; + + /* Used buf states are only usable when ref count reaches 0, which means + * no SKBs refer to them. + * + * Search a limited number before giving up. + */ + for (i = 0; i < 5; i++) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) { + rx->dqo.used_buf_states_cnt--; + return buf_state; + } + + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + } + + return NULL; +} + +int gve_alloc_qpl_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + u32 idx; + + idx = rx->dqo.next_qpl_page_idx; + if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { + net_err_ratelimited("%s: Out of QPL pages\n", + priv->dev->name); + return -ENOMEM; + } + buf_state->page_info.page = rx->dqo.qpl->pages[idx]; + buf_state->addr = rx->dqo.qpl->page_buses[idx]; + rx->dqo.next_qpl_page_idx++; + buf_state->page_info.page_offset = 0; + buf_state->page_info.page_address = + page_address(buf_state->page_info.page); + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; + buf_state->last_single_ref_offset = 0; + + /* The page already has 1 ref. */ + page_ref_add(buf_state->page_info.page, INT_MAX - 1); + buf_state->page_info.pagecnt_bias = INT_MAX; + + return 0; +} + +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state) +{ + if (!buf_state->page_info.page) + return; + + page_ref_sub(buf_state->page_info.page, + buf_state->page_info.pagecnt_bias - 1); + buf_state->page_info.page = NULL; +} + +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + const u16 data_buffer_size = priv->data_buffer_size_dqo; + int pagecount; + + /* Can't reuse if we only fit one buffer per page */ + if (data_buffer_size * 2 > PAGE_SIZE) + goto mark_used; + + pagecount = gve_buf_ref_cnt(buf_state); + + /* Record the offset when we have a single remaining reference. + * + * When this happens, we know all of the other offsets of the page are + * usable. + */ + if (pagecount == 1) { + buf_state->last_single_ref_offset = + buf_state->page_info.page_offset; + } + + /* Use the next buffer sized chunk in the page. */ + buf_state->page_info.page_offset += data_buffer_size; + buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + + /* If we wrap around to the same offset without ever dropping to 1 + * reference, then we don't know if this offset was ever freed. + */ + if (buf_state->page_info.page_offset == + buf_state->last_single_ref_offset) { + goto mark_used; + } + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return; + +mark_used: + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + rx->dqo.used_buf_states_cnt++; +} + +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct) +{ + netmem_ref netmem = buf_state->page_info.netmem; + + if (!netmem) + return; + + page_pool_put_full_netmem(rx->dqo.page_pool, netmem, allow_direct); + buf_state->page_info.netmem = 0; +} + +static int gve_alloc_from_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + netmem_ref netmem; + + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; + netmem = page_pool_alloc_netmem(rx->dqo.page_pool, + &buf_state->page_info.page_offset, + &buf_state->page_info.buf_size, + GFP_ATOMIC); + + if (!netmem) + return -ENOMEM; + + buf_state->page_info.netmem = netmem; + buf_state->page_info.page_address = netmem_address(netmem); + buf_state->addr = page_pool_get_dma_addr_netmem(netmem); + + return 0; +} + +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx) +{ + u32 ntfy_id = gve_rx_idx_to_ntfy(priv, rx->q_num); + struct page_pool_params pp = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + .order = 0, + .pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt, + .dev = &priv->pdev->dev, + .netdev = priv->dev, + .napi = &priv->ntfy_blocks[ntfy_id].napi, + .max_len = PAGE_SIZE, + .dma_dir = DMA_FROM_DEVICE, + }; + + if (priv->header_split_enabled) { + pp.flags |= PP_FLAG_ALLOW_UNREADABLE_NETMEM; + pp.queue_idx = rx->q_num; + } + + return page_pool_create(&pp); +} + +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + gve_free_to_page_pool(rx, buf_state, true); + gve_free_buf_state(rx, buf_state); + } else { + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + } +} + +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + buf_state->page_info.netmem = 0; + gve_free_buf_state(rx, buf_state); + } else { + gve_dec_pagecnt_bias(&buf_state->page_info); + gve_try_recycle_buf(rx->gve, rx, buf_state); + } +} + +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc) +{ + struct gve_rx_buf_state_dqo *buf_state; + + if (rx->dqo.page_pool) { + buf_state = gve_alloc_buf_state(rx); + if (WARN_ON_ONCE(!buf_state)) + return -ENOMEM; + + if (gve_alloc_from_page_pool(rx, buf_state)) + goto free_buf_state; + } else { + buf_state = gve_get_recycled_buf_state(rx); + if (unlikely(!buf_state)) { + buf_state = gve_alloc_buf_state(rx); + if (unlikely(!buf_state)) + return -ENOMEM; + + if (unlikely(gve_alloc_qpl_page_dqo(rx, buf_state))) + goto free_buf_state; + } + } + desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); + desc->buf_addr = cpu_to_le64(buf_state->addr + + buf_state->page_info.page_offset); + + return 0; + +free_buf_state: + gve_free_buf_state(rx, buf_state); + return -ENOMEM; +} diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index bdfc6e77b2af56..425fe753602a30 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -5,6 +5,7 @@ */ #include +#include #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" @@ -40,7 +41,8 @@ static u32 gve_get_msglevel(struct net_device *netdev) * as declared in enum xdp_action inside file uapi/linux/bpf.h . */ static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = { - "rx_packets", "rx_hsplit_pkt", "tx_packets", "rx_bytes", + "rx_packets", "rx_hsplit_pkt", "rx_devmem_pkts", + "rx_devmem_dropped", "tx_packets", "rx_bytes", "tx_bytes", "rx_dropped", "tx_dropped", "tx_timeouts", "rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt", "rx_hsplit_unsplit_pkt", @@ -149,13 +151,14 @@ static void gve_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *data) { - u64 tmp_rx_pkts, tmp_rx_hsplit_pkt, tmp_rx_bytes, tmp_rx_hsplit_bytes, + u64 tmp_rx_pkts, tmp_rx_hsplit_pkt, tmp_rx_devmem_pkt, + tmp_rx_devmem_dropped, tmp_rx_bytes, tmp_rx_hsplit_bytes, tmp_rx_skb_alloc_fail, tmp_rx_buf_alloc_fail, tmp_rx_desc_err_dropped_pkt, tmp_rx_hsplit_unsplit_pkt, tmp_tx_pkts, tmp_tx_bytes; u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_hsplit_unsplit_pkt, - rx_pkts, rx_hsplit_pkt, rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, - tx_dropped; + rx_pkts, rx_hsplit_pkt, rx_devmem_pkt, rx_devmem_dropped, + rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, tx_dropped; int stats_idx, base_stats_idx, max_stats_idx; struct stats *report_stats; int *rx_qid_to_stats_idx; @@ -196,6 +199,7 @@ gve_get_ethtool_stats(struct net_device *netdev, } for (rx_pkts = 0, rx_bytes = 0, rx_hsplit_pkt = 0, + rx_devmem_pkt = 0, rx_devmem_dropped = 0, rx_skb_alloc_fail = 0, rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, rx_hsplit_unsplit_pkt = 0, ring = 0; @@ -208,6 +212,8 @@ gve_get_ethtool_stats(struct net_device *netdev, u64_stats_fetch_begin(&priv->rx[ring].statss); tmp_rx_pkts = rx->rpackets; tmp_rx_hsplit_pkt = rx->rx_hsplit_pkt; + tmp_rx_devmem_pkt = rx->rx_devmem_pkt; + tmp_rx_devmem_dropped = rx->rx_devmem_dropped; tmp_rx_bytes = rx->rbytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; @@ -219,6 +225,9 @@ gve_get_ethtool_stats(struct net_device *netdev, start)); rx_pkts += tmp_rx_pkts; rx_hsplit_pkt += tmp_rx_hsplit_pkt; + rx_devmem_pkt += tmp_rx_devmem_pkt; + rx_devmem_dropped += tmp_rx_devmem_dropped; + rx_bytes += tmp_rx_bytes; rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; @@ -245,6 +254,8 @@ gve_get_ethtool_stats(struct net_device *netdev, i = 0; data[i++] = rx_pkts; data[i++] = rx_hsplit_pkt; + data[i++] = rx_devmem_pkt; + data[i++] = rx_devmem_dropped; data[i++] = tx_pkts; data[i++] = rx_bytes; data[i++] = tx_bytes; @@ -482,6 +493,7 @@ static int gve_set_channels(struct net_device *netdev, struct ethtool_channels old_settings; int new_tx = cmd->tx_count; int new_rx = cmd->rx_count; + bool reset_rss = false; gve_get_channels(netdev, &old_settings); @@ -498,16 +510,14 @@ static int gve_set_channels(struct net_device *netdev, return -EINVAL; } - if (!netif_running(netdev)) { - priv->tx_cfg.num_queues = new_tx; - priv->rx_cfg.num_queues = new_rx; - return 0; - } + if (new_rx != priv->rx_cfg.num_queues && + priv->cache_rss_config && !netif_is_rxfh_configured(netdev)) + reset_rss = true; new_tx_cfg.num_queues = new_tx; new_rx_cfg.num_queues = new_rx; - return gve_adjust_queues(priv, new_rx_cfg, new_tx_cfg); + return gve_adjust_queues(priv, new_rx_cfg, new_tx_cfg, reset_rss); } static void gve_get_ringparam(struct net_device *netdev, @@ -855,6 +865,25 @@ static u32 gve_get_rxfh_indir_size(struct net_device *netdev) return priv->rss_lut_size; } +static void gve_get_rss_config_cache(struct gve_priv *priv, + struct ethtool_rxfh_param *rxfh) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + rxfh->hfunc = ETH_RSS_HASH_TOP; + + if (rxfh->key) { + rxfh->key_size = priv->rss_key_size; + memcpy(rxfh->key, rss_config->hash_key, priv->rss_key_size); + } + + if (rxfh->indir) { + rxfh->indir_size = priv->rss_lut_size; + memcpy(rxfh->indir, rss_config->hash_lut, + priv->rss_lut_size * sizeof(*rxfh->indir)); + } +} + static int gve_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) { struct gve_priv *priv = netdev_priv(netdev); @@ -862,18 +891,46 @@ static int gve_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rx if (!priv->rss_key_size || !priv->rss_lut_size) return -EOPNOTSUPP; + if (priv->cache_rss_config) { + gve_get_rss_config_cache(priv, rxfh); + return 0; + } + return gve_adminq_query_rss_config(priv, rxfh); } +static void gve_set_rss_config_cache(struct gve_priv *priv, + struct ethtool_rxfh_param *rxfh) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + if (rxfh->key) + memcpy(rss_config->hash_key, rxfh->key, priv->rss_key_size); + + if (rxfh->indir) + memcpy(rss_config->hash_lut, rxfh->indir, + priv->rss_lut_size * sizeof(*rxfh->indir)); +} + static int gve_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { struct gve_priv *priv = netdev_priv(netdev); + int err; if (!priv->rss_key_size || !priv->rss_lut_size) return -EOPNOTSUPP; - return gve_adminq_configure_rss(priv, rxfh); + err = gve_adminq_configure_rss(priv, rxfh); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Fail to configure RSS config"); + return err; + } + + if (priv->cache_rss_config) + gve_set_rss_config_cache(priv, rxfh); + + return 0; } const struct ethtool_ops gve_ethtool_ops = { diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 862c4575701fec..13b9fe651d10e2 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -184,6 +184,43 @@ static void gve_free_flow_rule_caches(struct gve_priv *priv) flow_rules_cache->rules_cache = NULL; } +static int gve_alloc_rss_config_cache(struct gve_priv *priv) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + if (!priv->cache_rss_config) + return 0; + + rss_config->hash_key = kcalloc(priv->rss_key_size, + sizeof(rss_config->hash_key[0]), + GFP_KERNEL); + if (!rss_config->hash_key) + return -ENOMEM; + + rss_config->hash_lut = kcalloc(priv->rss_lut_size, + sizeof(rss_config->hash_lut[0]), + GFP_KERNEL); + if (!rss_config->hash_lut) + goto free_rss_key_cache; + + return 0; + +free_rss_key_cache: + kfree(rss_config->hash_key); + rss_config->hash_key = NULL; + return -ENOMEM; +} + +static void gve_free_rss_config_cache(struct gve_priv *priv) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + kfree(rss_config->hash_key); + kfree(rss_config->hash_lut); + + memset(rss_config, 0, sizeof(*rss_config)); +} + static int gve_alloc_counter_array(struct gve_priv *priv) { priv->counter_array = @@ -575,9 +612,12 @@ static int gve_setup_device_resources(struct gve_priv *priv) err = gve_alloc_flow_rule_caches(priv); if (err) return err; - err = gve_alloc_counter_array(priv); + err = gve_alloc_rss_config_cache(priv); if (err) goto abort_with_flow_rule_caches; + err = gve_alloc_counter_array(priv); + if (err) + goto abort_with_rss_config_cache; err = gve_alloc_notify_blocks(priv); if (err) goto abort_with_counter; @@ -611,6 +651,12 @@ static int gve_setup_device_resources(struct gve_priv *priv) } } + err = gve_init_rss_config(priv, priv->rx_cfg.num_queues); + if (err) { + dev_err(&priv->pdev->dev, "Failed to init RSS config"); + goto abort_with_ptype_lut; + } + err = gve_adminq_report_stats(priv, priv->stats_report_len, priv->stats_report_bus, GVE_STATS_REPORT_TIMER_PERIOD); @@ -629,6 +675,8 @@ static int gve_setup_device_resources(struct gve_priv *priv) gve_free_notify_blocks(priv); abort_with_counter: gve_free_counter_array(priv); +abort_with_rss_config_cache: + gve_free_rss_config_cache(priv); abort_with_flow_rule_caches: gve_free_flow_rule_caches(priv); @@ -669,6 +717,7 @@ static void gve_teardown_device_resources(struct gve_priv *priv) priv->ptype_lut_dqo = NULL; gve_free_flow_rule_caches(priv); + gve_free_rss_config_cache(priv); gve_free_counter_array(priv); gve_free_notify_blocks(priv); gve_free_stats_report(priv); @@ -1390,6 +1439,12 @@ static int gve_queues_start(struct gve_priv *priv, if (err) goto stop_and_free_rings; + if (rx_alloc_cfg->reset_rss) { + err = gve_init_rss_config(priv, priv->rx_cfg.num_queues); + if (err) + goto reset; + } + err = gve_register_qpls(priv); if (err) goto reset; @@ -1786,6 +1841,26 @@ static int gve_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } +int gve_init_rss_config(struct gve_priv *priv, u16 num_queues) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + struct ethtool_rxfh_param rxfh = {0}; + u16 i; + + if (!priv->cache_rss_config) + return 0; + + for (i = 0; i < priv->rss_lut_size; i++) + rss_config->hash_lut[i] = + ethtool_rxfh_indir_default(i, num_queues); + + netdev_rss_key_fill(rss_config->hash_key, priv->rss_key_size); + + rxfh.hfunc = ETH_RSS_HASH_TOP; + + return gve_adminq_configure_rss(priv, &rxfh); +} + int gve_flow_rules_reset(struct gve_priv *priv) { if (!priv->max_flow_rules) @@ -1834,7 +1909,8 @@ int gve_adjust_config(struct gve_priv *priv, int gve_adjust_queues(struct gve_priv *priv, struct gve_queue_config new_rx_config, - struct gve_queue_config new_tx_config) + struct gve_queue_config new_tx_config, + bool reset_rss) { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; @@ -1847,6 +1923,7 @@ int gve_adjust_queues(struct gve_priv *priv, tx_alloc_cfg.qcfg = &new_tx_config; rx_alloc_cfg.qcfg_tx = &new_tx_config; rx_alloc_cfg.qcfg = &new_rx_config; + rx_alloc_cfg.reset_rss = reset_rss; tx_alloc_cfg.num_rings = new_tx_config.num_queues; /* Add dedicated XDP TX queues if enabled. */ @@ -1858,6 +1935,11 @@ int gve_adjust_queues(struct gve_priv *priv, return err; } /* Set the config for the next up. */ + if (reset_rss) { + err = gve_init_rss_config(priv, new_rx_config.num_queues); + if (err) + return err; + } priv->tx_cfg = new_tx_config; priv->rx_cfg = new_rx_config; @@ -2561,6 +2643,54 @@ static const struct netdev_queue_mgmt_ops gve_queue_mgmt_ops = { .ndo_queue_stop = gve_rx_queue_stop, }; +static void gve_get_rx_queue_stats(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *rx_stats) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_rx_ring *rx = &priv->rx[idx]; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&rx->statss); + rx_stats->packets = rx->rpackets; + rx_stats->bytes = rx->rbytes; + rx_stats->alloc_fail = rx->rx_skb_alloc_fail + + rx->rx_buf_alloc_fail; + } while (u64_stats_fetch_retry(&rx->statss, start)); +} + +static void gve_get_tx_queue_stats(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *tx_stats) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx = &priv->tx[idx]; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&tx->statss); + tx_stats->packets = tx->pkt_done; + tx_stats->bytes = tx->bytes_done; + } while (u64_stats_fetch_retry(&tx->statss, start)); +} + +static void gve_get_base_stats(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx) +{ + rx->packets = 0; + rx->bytes = 0; + rx->alloc_fail = 0; + + tx->packets = 0; + tx->bytes = 0; +} + +static const struct netdev_stat_ops gve_stat_ops = { + .get_queue_stats_rx = gve_get_rx_queue_stats, + .get_queue_stats_tx = gve_get_tx_queue_stats, + .get_base_stats = gve_get_base_stats, +}; + static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { int max_tx_queues, max_rx_queues; @@ -2616,6 +2746,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev->ethtool_ops = &gve_ethtool_ops; dev->netdev_ops = &gve_netdev_ops; dev->queue_mgmt_ops = &gve_queue_mgmt_ops; + dev->stat_ops = &gve_stat_ops; /* Set default and supported features. * @@ -2669,6 +2800,10 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format); + + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) + dev->netmem_tx = true; + gve_clear_probe_in_progress(priv); queue_work(priv->gve_wq, &priv->service_task); return 0; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 1154c1d8f66f05..9abbacb32bbce8 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -14,191 +14,11 @@ #include #include #include +#include +#include +#include #include -static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) -{ - return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; -} - -static void gve_free_page_dqo(struct gve_priv *priv, - struct gve_rx_buf_state_dqo *bs, - bool free_page) -{ - page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); - if (free_page) - gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, - DMA_FROM_DEVICE); - bs->page_info.page = NULL; -} - -static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = rx->dqo.free_buf_states; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from free list */ - rx->dqo.free_buf_states = buf_state->next; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - return buf_state->next == buffer_id; -} - -static void gve_free_buf_state(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = rx->dqo.free_buf_states; - rx->dqo.free_buf_states = buffer_id; -} - -static struct gve_rx_buf_state_dqo * -gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = list->head; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from list */ - list->head = buf_state->next; - if (buf_state->next == -1) - list->tail = -1; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static void gve_enqueue_buf_state(struct gve_rx_ring *rx, - struct gve_index_list *list, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = -1; - - if (list->head == -1) { - list->head = buffer_id; - list->tail = buffer_id; - } else { - int tail = list->tail; - - rx->dqo.buf_states[tail].next = buffer_id; - list->tail = buffer_id; - } -} - -static struct gve_rx_buf_state_dqo * -gve_get_recycled_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - int i; - - /* Recycled buf states are immediately usable. */ - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); - if (likely(buf_state)) - return buf_state; - - if (unlikely(rx->dqo.used_buf_states.head == -1)) - return NULL; - - /* Used buf states are only usable when ref count reaches 0, which means - * no SKBs refer to them. - * - * Search a limited number before giving up. - */ - for (i = 0; i < 5; i++) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) { - rx->dqo.used_buf_states_cnt--; - return buf_state; - } - - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - } - - /* For QPL, we cannot allocate any new buffers and must - * wait for the existing ones to be available. - */ - if (rx->dqo.qpl) - return NULL; - - /* If there are no free buf states discard an entry from - * `used_buf_states` so it can be used. - */ - if (unlikely(rx->dqo.free_buf_states == -1)) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) - return buf_state; - - gve_free_page_dqo(rx->gve, buf_state, true); - gve_free_buf_state(rx, buf_state); - } - - return NULL; -} - -static int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - struct gve_priv *priv = rx->gve; - u32 idx; - - if (!rx->dqo.qpl) { - int err; - - err = gve_alloc_page(priv, &priv->pdev->dev, - &buf_state->page_info.page, - &buf_state->addr, - DMA_FROM_DEVICE, GFP_ATOMIC); - if (err) - return err; - } else { - idx = rx->dqo.next_qpl_page_idx; - if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { - net_err_ratelimited("%s: Out of QPL pages\n", - priv->dev->name); - return -ENOMEM; - } - buf_state->page_info.page = rx->dqo.qpl->pages[idx]; - buf_state->addr = rx->dqo.qpl->page_buses[idx]; - rx->dqo.next_qpl_page_idx++; - } - buf_state->page_info.page_offset = 0; - buf_state->page_info.page_address = - page_address(buf_state->page_info.page); - buf_state->last_single_ref_offset = 0; - - /* The page already has 1 ref. */ - page_ref_add(buf_state->page_info.page, INT_MAX - 1); - buf_state->page_info.pagecnt_bias = INT_MAX; - - return 0; -} - static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx) { struct device *hdev = &priv->pdev->dev; @@ -278,8 +98,10 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } } @@ -290,10 +112,12 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_rx_ring *rx = &priv->rx[idx]; if (!gve_rx_was_added_to_block(priv, idx)) return; + page_pool_disable_direct_recycling(rx->dqo.page_pool); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); @@ -321,9 +145,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - /* Only free page for RDA. QPL pages are freed in gve_main. */ - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } if (rx->dqo.qpl) { @@ -350,6 +176,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, kvfree(rx->dqo.buf_states); rx->dqo.buf_states = NULL; + if (rx->dqo.page_pool) { + page_pool_destroy(rx->dqo.page_pool); + rx->dqo.page_pool = NULL; + } + gve_rx_free_hdr_bufs(priv, rx); netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); @@ -382,6 +213,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) { struct device *hdev = &priv->pdev->dev; + struct page_pool *pool; int qpl_page_cnt; size_t size; u32 qpl_id; @@ -395,8 +227,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, rx->gve = priv; rx->q_num = idx; - rx->dqo.num_buf_states = cfg->raw_addressing ? - min_t(s16, S16_MAX, buffer_queue_slots * 4) : + rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots : gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, sizeof(rx->dqo.buf_states[0]), @@ -424,7 +255,13 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, if (!rx->dqo.bufq.desc_ring) goto err; - if (!cfg->raw_addressing) { + if (cfg->raw_addressing) { + pool = gve_rx_create_page_pool(priv, rx); + if (IS_ERR(pool)) + goto err; + + rx->dqo.page_pool = pool; + } else { qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num); qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); @@ -521,26 +358,14 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); while (num_posted < num_avail_slots) { struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; - struct gve_rx_buf_state_dqo *buf_state; - - buf_state = gve_get_recycled_buf_state(rx); - if (unlikely(!buf_state)) { - buf_state = gve_alloc_buf_state(rx); - if (unlikely(!buf_state)) - break; - - if (unlikely(gve_alloc_page_dqo(rx, buf_state))) { - u64_stats_update_begin(&rx->statss); - rx->rx_buf_alloc_fail++; - u64_stats_update_end(&rx->statss); - gve_free_buf_state(rx, buf_state); - break; - } + + if (unlikely(gve_alloc_buffer(rx, desc))) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + break; } - desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); - desc->buf_addr = cpu_to_le64(buf_state->addr + - buf_state->page_info.page_offset); if (rx->dqo.hdr_bufs.data) desc->header_buf_addr = cpu_to_le64(rx->dqo.hdr_bufs.addr + @@ -557,48 +382,6 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) rx->fill_cnt += num_posted; } -static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - const u16 data_buffer_size = priv->data_buffer_size_dqo; - int pagecount; - - /* Can't reuse if we only fit one buffer per page */ - if (data_buffer_size * 2 > PAGE_SIZE) - goto mark_used; - - pagecount = gve_buf_ref_cnt(buf_state); - - /* Record the offset when we have a single remaining reference. - * - * When this happens, we know all of the other offsets of the page are - * usable. - */ - if (pagecount == 1) { - buf_state->last_single_ref_offset = - buf_state->page_info.page_offset; - } - - /* Use the next buffer sized chunk in the page. */ - buf_state->page_info.page_offset += data_buffer_size; - buf_state->page_info.page_offset &= (PAGE_SIZE - 1); - - /* If we wrap around to the same offset without ever dropping to 1 - * reference, then we don't know if this offset was ever freed. - */ - if (buf_state->page_info.page_offset == - buf_state->last_single_ref_offset) { - goto mark_used; - } - - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); - return; - -mark_used: - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - rx->dqo.used_buf_states_cnt++; -} - static void gve_rx_skb_csum(struct sk_buff *skb, const struct gve_rx_compl_desc_dqo *desc, struct gve_ptype ptype) @@ -696,6 +479,24 @@ static int gve_rx_copy_ondemand(struct gve_rx_ring *rx, return 0; } +static void gve_skb_add_rx_frag(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + int num_frags, u16 buf_len) +{ + if (rx->dqo.qpl) { + skb_add_rx_frag(rx->ctx.skb_tail, num_frags, + buf_state->page_info.page, + buf_state->page_info.page_offset, + buf_len, buf_state->page_info.buf_size); + } else { + skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags, + buf_state->page_info.netmem, + buf_state->page_info.page_offset, + buf_len, + buf_state->page_info.buf_size); + } +} + /* Chains multi skbs for single rx packet. * Returns 0 if buffer is appended, -1 otherwise. */ @@ -713,6 +514,9 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (!skb) return -1; + if (rx->dqo.page_pool) + skb_mark_for_recycle(skb); + if (rx->ctx.skb_tail == rx->ctx.skb_head) skb_shinfo(rx->ctx.skb_head)->frag_list = skb; else @@ -723,23 +527,15 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (rx->ctx.skb_tail != rx->ctx.skb_head) { rx->ctx.skb_head->len += buf_len; rx->ctx.skb_head->data_len += buf_len; - rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo; + rx->ctx.skb_head->truesize += buf_state->page_info.buf_size; } /* Trigger ondemand page allocation if we are running low on buffers */ if (gve_rx_should_trigger_copy_ondemand(rx)) return gve_rx_copy_ondemand(rx, buf_state, buf_len); - skb_add_rx_frag(rx->ctx.skb_tail, num_frags, - buf_state->page_info.page, - buf_state->page_info.page_offset, - buf_len, priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); - - /* Advances buffer page-offset if page is partially used. - * Marks buffer as used if page is full. - */ - gve_try_recycle_buf(priv, rx, buf_state); + gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len); + gve_reuse_buffer(rx, buf_state); return 0; } @@ -773,8 +569,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, } if (unlikely(compl_desc->rx_error)) { - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return -EINVAL; } @@ -784,7 +579,31 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, /* Page might have not been used for awhile and was likely last written * by a different thread. */ - prefetch(buf_state->page_info.page); + if (rx->dqo.qpl) { + prefetch(buf_state->page_info.page); + } else { + if (!netmem_is_net_iov(buf_state->page_info.netmem)) + prefetch(netmem_to_page(buf_state->page_info.netmem)); + } + + if (!hsplit && !rx->ctx.skb_head && + netmem_is_net_iov(buf_state->page_info.netmem)) { + /* !hsplit indicates the packet is not split, and the header went + * to the packet buffer. If the packet buffer is a dma_buf + * page, those can't be easily mapped into the kernel space to + * access the header required to process the packet. + * + * In the future we may be able to map the dma_buf page to + * kernel space to access the header for dma_buf providers that + * support that, but for now, simply drop the packet. We expect + * the TCP packets that we care about to be header split + * anyway. + */ + rx->rx_devmem_dropped++; + + gve_free_buffer(rx, buf_state); + return -EFAULT; + } /* Copy the header into the skb in the case of header split */ if (hsplit) { @@ -798,6 +617,9 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, if (unlikely(!rx->ctx.skb_head)) goto error; rx->ctx.skb_tail = rx->ctx.skb_head; + + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); } else { unsplit = 1; } @@ -809,9 +631,10 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, } /* Sync the portion of dma buffer for CPU to read. */ - dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr, - buf_state->page_info.page_offset, - buf_len, DMA_FROM_DEVICE); + page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool, + buf_state->page_info.netmem, + buf_state->page_info.page_offset, + buf_len); /* Append to current skb if one exists. */ if (rx->ctx.skb_head) { @@ -822,7 +645,10 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, return 0; } - if (eop && buf_len <= priv->rx_copybreak) { + /* We can't copy dma-buf pages. Ignore any copybreak setting. */ + if (eop && buf_len <= priv->rx_copybreak && + (!netmem_is_net_iov(buf_state->page_info.netmem) || !buf_len)) { + rx->ctx.skb_head = gve_rx_copy(priv->dev, napi, &buf_state->page_info, buf_len); if (unlikely(!rx->ctx.skb_head)) @@ -834,8 +660,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, rx->rx_copybreak_pkt++; u64_stats_update_end(&rx->statss); - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return 0; } @@ -850,16 +675,15 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, return 0; } - skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, - buf_state->page_info.page_offset, buf_len, - priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); - gve_try_recycle_buf(priv, rx, buf_state); + gve_skb_add_rx_frag(rx, buf_state, 0, buf_len); + gve_reuse_buffer(rx, buf_state); return 0; error: - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + gve_free_buffer(rx, buf_state); return -ENOMEM; } @@ -914,10 +738,16 @@ static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi, return err; } - if (skb_headlen(rx->ctx.skb_head) == 0) + if (skb_headlen(rx->ctx.skb_head) == 0) { + if (!skb_frags_readable(napi_get_frags(napi))) + rx->rx_devmem_pkt++; napi_gro_frags(napi); - else + } + else { + if (!skb_frags_readable(rx->ctx.skb_head)) + rx->rx_devmem_pkt++; napi_gro_receive(napi, rx->ctx.skb_head); + } return 0; } diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index f879426cb5523a..8127cbef3e94a6 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -667,7 +667,8 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, goto err; dma_unmap_len_set(pkt, len[pkt->num_bufs], len); - dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); + netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt, + dma[pkt->num_bufs], addr); ++pkt->num_bufs; gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr, @@ -1045,8 +1046,9 @@ static void gve_unmap_packet(struct device *dev, dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); for (i = 1; i < pkt->num_bufs; i++) { - dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), - dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); + netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), + DMA_TO_DEVICE, 0); } pkt->num_bufs = 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 35b886385f3298..f9a0093d41ad23 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1723,6 +1723,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2024,6 +2025,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39f1d16f362887..492f23695ba68e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1698,13 +1698,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); + +struct net_devmem_dmabuf_binding; void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1712,12 +1715,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3651,6 +3656,11 @@ static inline dma_addr_t skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } + return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488ab..9e49372ef1a05e 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** diff --git a/include/net/netmem.h b/include/net/netmem.h index 8a6e20be4b9d30..9f4e436d7ca62a 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,6 +8,7 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include #include #include @@ -171,4 +172,26 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 60a5347922becc..35dfeb05362ef1 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -116,22 +116,22 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -static inline struct page *page_pool_alloc(struct page_pool *pool, - unsigned int *offset, - unsigned int *size, gfp_t gfp) +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page; + netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; - return page_pool_alloc_pages(pool, gfp); + return page_pool_alloc_netmems(pool, gfp); } - page = page_pool_alloc_frag(pool, offset, *size, gfp); - if (unlikely(!page)) - return NULL; + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize @@ -142,7 +142,14 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, pool->frag_offset = max_size; } - return page; + return netmem; +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** @@ -418,7 +425,21 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); } /** @@ -437,10 +458,21 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { - dma_sync_single_range_for_cpu(pool->p.dev, - page_pool_get_dma_addr(page), - offset + pool->p.offset, dma_sync_size, - page_pool_get_dma_dir(pool)); + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe39d..19924504efca0c 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -164,7 +164,8 @@ struct page_pool { bool has_init_callback:1; /* slow::init_callback is set */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif @@ -242,7 +243,7 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, diff --git a/include/net/sock.h b/include/net/sock.h index fa055cf1785efd..84bcf1740e9250 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1804,6 +1804,7 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7c308f04e7a063..56f7b87f8bd0ca 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -199,6 +199,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece46..09c74a1d836b66 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -63,6 +63,8 @@ #include #include +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -692,9 +694,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. + */ + if (iov_iter_type(from) != ITER_IOVEC) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -702,6 +744,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (unlikely(binding)) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -735,7 +779,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/dev.c b/net/core/dev.c index c761f862bc5a2d..ac5c81d378c792 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3765,6 +3765,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device skb = validate_xmit_xfrm(skb, features, again); + if (!skb_frags_readable(skb) && !dev->netmem_tx) + goto out_kfree_skb; + return skb; out_kfree_skb: diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee1135..df6ff563919883 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -44,8 +45,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -63,8 +66,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -109,6 +114,14 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase + */ + + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -122,8 +135,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -174,8 +185,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack) +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; @@ -218,22 +230,33 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... */ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -277,6 +300,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, niov->owner = owner; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->base_virtual / PAGE_SIZE + i] = niov; } virtual += len; @@ -288,6 +313,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); @@ -302,6 +329,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + void dev_dmabuf_uninstall(struct net_device *dev) { struct net_devmem_dmabuf_binding *binding; @@ -322,6 +364,63 @@ void dev_dmabuf_uninstall(struct net_device *dev) } } +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + size_t idx; + + if (virt_addr >= binding->dmabuf->size) + return NULL; + + idx = virt_addr / PAGE_SIZE; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[idx]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -331,11 +430,11 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482f5..3334a85292696b 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -21,12 +21,20 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -42,6 +50,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -64,15 +80,17 @@ struct dmabuf_genpool_chunk_owner { struct net_devmem_dmabuf_binding *binding; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack); +net_devmem_bind_dmabuf(struct net_device *dev, enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); void dev_dmabuf_uninstall(struct net_device *dev); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_iov_owner(const struct net_iov *niov) @@ -104,10 +122,10 @@ static inline u32 net_iov_binding_id(const struct net_iov *niov) return net_iov_owner(niov)->binding->id; } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -115,29 +133,58 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) { if (!refcount_dec_and_test(&binding->ref)) return; - - __net_devmem_dmabuf_binding_free(binding); + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + +static inline void +__net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -175,6 +222,19 @@ static inline u32 net_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8614988fc67b9a..b9bbd8a19ad9bd 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -87,6 +87,12 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +177,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd04296..dc742b6410e91f 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index ad426b3a03b526..32503a32fb89a3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -782,7 +782,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -839,6 +839,76 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) return err; } +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct list_head *sock_binding_list; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + sock_binding_list = genl_sk_priv_get(&netdev_nl_family, + NETLINK_CB(skb).sk); + if (IS_ERR(sock_binding_list)) + return PTR_ERR(sock_binding_list); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (!netdev || !netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock; + } + + list_add(&binding->list, sock_binding_list); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + rtnl_unlock(); + + return genlmsg_reply(rsp, info); + +err_unlock: + rtnl_unlock(); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + void netdev_nl_sock_priv_init(struct list_head *priv) { INIT_LIST_HEAD(priv); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d213536..dbef64d841e5c6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -201,6 +201,7 @@ static int page_pool_init(struct page_pool *pool, memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) @@ -287,6 +288,9 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_priv) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, @@ -574,7 +578,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; @@ -590,11 +594,11 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } -EXPORT_SYMBOL(page_pool_alloc_netmem); +EXPORT_SYMBOL(page_pool_alloc_netmems); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { - return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); @@ -956,7 +960,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, } if (!netmem) { - netmem = page_pool_alloc_netmem(pool, gfp); + netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 61a950f13a91c7..1b524c81f40a33 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -89,6 +89,7 @@ #include #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -1698,7 +1699,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1713,7 +1715,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1736,7 +1738,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1766,7 +1768,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1781,7 +1784,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1885,7 +1888,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1904,7 +1908,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; @@ -7294,3 +7299,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) { + /* Assume any net_iov is devmem and route it to + * net_devmem_get_net_iov. As new net_iov types are added they + * need to be checked here. + */ + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) { + /* Assume any net_iov is devmem and route it to + * net_devmem_put_net_iov. As new net_iov types are added they + * need to be checked here. + */ + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/sock.c b/net/core/sock.c index a83f64a1d96a29..1ab81cbe89bd4b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2940,6 +2940,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 49811c9281d424..3587e82a70ca5d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1023,7 +1023,7 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68cb6a966b18b8..c1bd9d04448f16 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1050,6 +1050,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1057,11 +1058,24 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + bool sockc_valid = true; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags), + .dmabuf_id = 0 }; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + /* Don't return error until MSG_FASTOPEN has been + * processed; that may succeed even if the cmsg is + * invalid. + */ + sockc_valid = false; + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1069,7 +1083,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + sockc_valid && !!sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1078,12 +1093,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (sockc_valid && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (sockc_valid && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1122,14 +1152,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { - err = -EINVAL; - goto out_err; - } - } + if (!sockc_valid) + goto out_err; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -1247,7 +1271,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1328,6 +1353,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1345,6 +1372,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 434ddf263b88a3..ad4632c3e795ba 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1519,7 +1519,7 @@ static int __ip6_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d8809655..9bfaa530ebb3d1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -108,7 +108,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, &info->msg->msg_iter, - len); + len, NULL); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7c308f04e7a063..56f7b87f8bd0ca 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -199,6 +199,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore new file mode 100644 index 00000000000000..e9fe6ede681ab8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -0,0 +1 @@ +ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index c9f2f48fc30f4d..1c6a77480923c4 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -3,6 +3,7 @@ TEST_PROGS = \ csum.py \ devlink_port_split.py \ + devmem.py \ ethtool.sh \ ethtool_extended_state.sh \ ethtool_mm.sh \ @@ -26,4 +27,12 @@ TEST_INCLUDES := \ ../../../net/forwarding/tc_common.sh \ # +# YNL files, must be before "include ..lib.mk" +YNL_GEN_FILES := ncdevmem +TEST_GEN_FILES += $(YNL_GEN_FILES) + include ../../../lib.mk + +# YNL build +YNL_GENS := ethtool netdev +include ../../../net/ynl.mk diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py new file mode 100755 index 00000000000000..39696aaf219d47 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, KsftSkipEx +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, rand_port, wait_port_listen +from lib.py import ksft_disruptive + + +def require_devmem(cfg): + if not hasattr(cfg, "_devmem_probed"): + probe_command = f"{cfg.bin_local} -f {cfg.ifname}" + cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 + cfg._devmem_probed = True + + if not cfg._devmem_supported: + raise KsftSkipEx("Test requires devmem support") + + +@ksft_disruptive +def check_rx(cfg) -> None: + cfg.require_v6() + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}" + + with bkg(listen_cmd) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.v6}]:{port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") + +@ksft_disruptive +def check_tx(cfg) -> None: + cfg.require_ipver("6") + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"socat -U - TCP6-LISTEN:{port}" + + with bkg(listen_cmd, exit_wait=True) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_remote} -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run([check_rx, check_tx], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c new file mode 100644 index 00000000000000..fe8a5ceaa98038 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -0,0 +1,1065 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. + * + * Usage: + * + * On server: + * ncdevmem -s [-c ] -f eth1 -l -p 5201 + * + * On client: + * echo -n "hello\nworld" | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. + * + * Test data validation (devmem TCP on RX only): + * + * On server: + * ncdevmem -s [-c ] -f eth1 -l -p 5201 -v 7 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ + * head -c 1G | \ + * nc 5201 -p 5201 + * + * Test data validation (devmem TCP on RX and TX, validation happens on RX): + * + * On server: + * ncdevmem -s [-c ] -l -p 5201 -v 8 -f eth1 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06\\x07) | \ + * head -c 1M | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 + */ +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include +#include +#include +#include +#include +#include +#include +#define __iovec_defined +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netdev-user.h" +#include "ethtool-user.h" +#include + +#define PAGE_SHIFT 12 +#define TEST_PREFIX "ncdevmem" +#define NUM_PAGES 16000 + +#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 +#endif + +static char *server_ip; +static char *client_ip; +static char *port; +static size_t do_validation; +static int start_queue = -1; +static int num_queues = -1; +static char *ifname; +static unsigned int ifindex; +static unsigned int dmabuf_id; +static uint32_t tx_dmabuf_id; +static int waittime_ms = 500; + +struct memory_buffer { + int fd; + size_t size; + + int devfd; + int memfd; + char *buf_mem; +}; + +struct memory_provider { + struct memory_buffer *(*alloc)(size_t size); + void (*free)(struct memory_buffer *ctx); + void (*memcpy_to_device)(struct memory_buffer *dst, size_t off, + void *src, int n); + void (*memcpy_from_device)(void *dst, struct memory_buffer *src, + size_t off, int n); +}; + +static struct memory_buffer *udmabuf_alloc(size_t size) +{ + struct udmabuf_create create; + struct memory_buffer *ctx; + int ret; + + ctx = malloc(sizeof(*ctx)); + if (!ctx) + error(1, ENOMEM, "malloc failed"); + + ctx->size = size; + + ctx->devfd = open("/dev/udmabuf", O_RDWR); + if (ctx->devfd < 0) + error(1, errno, + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + + ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (ctx->memfd < 0) + error(1, errno, "%s: [skip,no-memfd]\n", TEST_PREFIX); + + ret = fcntl(ctx->memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) + error(1, errno, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + + ret = ftruncate(ctx->memfd, size); + if (ret == -1) + error(1, errno, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + + memset(&create, 0, sizeof(create)); + + create.memfd = ctx->memfd; + create.offset = 0; + create.size = size; + ctx->fd = ioctl(ctx->devfd, UDMABUF_CREATE, &create); + if (ctx->fd < 0) + error(1, errno, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); + + ctx->buf_mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, 0); + if (ctx->buf_mem == MAP_FAILED) + error(1, errno, "%s: [FAIL, map udmabuf]\n", TEST_PREFIX); + + return ctx; +} + +static void udmabuf_free(struct memory_buffer *ctx) +{ + munmap(ctx->buf_mem, ctx->size); + close(ctx->fd); + close(ctx->memfd); + close(ctx->devfd); + free(ctx); +} + +static void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, + void *src, int n) +{ + struct dma_buf_sync sync = {}; + + sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); + + memcpy(dst->buf_mem + off, src, n); + + sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); +} + +static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, + size_t off, int n) +{ + struct dma_buf_sync sync = {}; + + sync.flags = DMA_BUF_SYNC_START; + ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync); + + memcpy(dst, src->buf_mem + off, n); + + sync.flags = DMA_BUF_SYNC_END; + ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync); +} + +static struct memory_provider udmabuf_memory_provider = { + .alloc = udmabuf_alloc, + .free = udmabuf_free, + .memcpy_to_device = udmabuf_memcpy_to_device, + .memcpy_from_device = udmabuf_memcpy_from_device, +}; + +static struct memory_provider *provider = &udmabuf_memory_provider; + +static void print_nonzero_bytes(void *ptr, size_t size) +{ + unsigned char *p = ptr; + unsigned int i; + + for (i = 0; i < size; i++) + putchar(p[i]); +} + +void validate_buffer(void *line, size_t size) +{ + static unsigned char seed = 1; + unsigned char *ptr = line; + unsigned char expected; + static int errors; + size_t i; + + for (i = 0; i < size; i++) { + expected = seed ? seed : '\n'; + if (ptr[i] != expected) { + fprintf(stderr, + "Failed validation: expected=%u, actual=%u, index=%lu\n", + expected, ptr[i], i); + errors++; + if (errors > 20) + error(1, 0, "validation failed."); + } + seed++; + if (seed == do_validation) + seed = 0; + } + + fprintf(stdout, "Validated buffer\n"); +} + +static int rxq_num(int ifindex) +{ + struct ethtool_channels_get_req *req; + struct ethtool_channels_get_rsp *rsp; + struct ynl_error yerr; + struct ynl_sock *ys; + int num = -1; + + ys = ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = ethtool_channels_get_req_alloc(); + ethtool_channels_get_req_set_header_dev_index(req, ifindex); + rsp = ethtool_channels_get(ys, req); + if (rsp) + num = rsp->rx_count + rsp->combined_count; + ethtool_channels_get_req_free(req); + ethtool_channels_get_rsp_free(rsp); + + ynl_sock_destroy(ys); + + return num; +} + +#define run_command(cmd, ...) \ + ({ \ + char command[256]; \ + memset(command, 0, sizeof(command)); \ + snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ + fprintf(stderr, "Running: %s\n", command); \ + system(command); \ + }) + +static int reset_flow_steering(void) +{ + /* Depending on the NIC, toggling ntuple off and on might not + * be allowed. Additionally, attempting to delete existing filters + * will fail if no filters are present. Therefore, do not enforce + * the exit status. + */ + + run_command("sudo ethtool -K %s ntuple off >&2", ifname); + run_command("sudo ethtool -K %s ntuple on >&2", ifname); + run_command( + "sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 ethtool -N %s delete >&2", + ifname, ifname); + return 0; +} + +static const char *tcp_data_split_str(int val) +{ + switch (val) { + case 0: + return "off"; + case 1: + return "auto"; + case 2: + return "on"; + default: + return "?"; + } +} + +static int configure_headersplit(bool on) +{ + struct ethtool_rings_get_req *get_req; + struct ethtool_rings_get_rsp *get_rsp; + struct ethtool_rings_set_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + int ret; + + ys = ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = ethtool_rings_set_req_alloc(); + ethtool_rings_set_req_set_header_dev_index(req, ifindex); + /* 0 - off, 1 - auto, 2 - on */ + ethtool_rings_set_req_set_tcp_data_split(req, on ? 2 : 0); + ret = ethtool_rings_set(ys, req); + if (ret < 0) + fprintf(stderr, "YNL failed: %s\n", ys->err.msg); + ethtool_rings_set_req_free(req); + + if (ret == 0) { + get_req = ethtool_rings_get_req_alloc(); + ethtool_rings_get_req_set_header_dev_index(get_req, ifindex); + get_rsp = ethtool_rings_get(ys, get_req); + ethtool_rings_get_req_free(get_req); + if (get_rsp) + fprintf(stderr, "TCP header split: %s\n", + tcp_data_split_str(get_rsp->tcp_data_split)); + ethtool_rings_get_rsp_free(get_rsp); + } + + ynl_sock_destroy(ys); + + return ret; +} + +static int configure_rss(void) +{ + return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue); +} + +static int configure_channels(unsigned int rx, unsigned int tx) +{ + return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx); +} + +static int configure_flow_steering(struct sockaddr_in6 *server_sin) +{ + const char *type = "tcp6"; + const char *server_addr; + char buf[40]; + + inet_ntop(AF_INET6, &server_sin->sin6_addr, buf, sizeof(buf)); + server_addr = buf; + + if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) { + type = "tcp4"; + server_addr = strrchr(server_addr, ':') + 1; + } + + return run_command("sudo ethtool -N %s flow-type %s %s %s dst-ip %s %s %s dst-port %s queue %d >&2", + ifname, + type, + client_ip ? "src-ip" : "", + client_ip ?: "", + server_addr, + client_ip ? "src-port" : "", + client_ip ? port : "", + port, start_queue); +} + +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + struct netdev_queue_id *queues, + unsigned int n_queue_index, struct ynl_sock **ys) +{ + struct netdev_bind_rx_req *req = NULL; + struct netdev_bind_rx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = netdev_bind_rx_req_alloc(); + netdev_bind_rx_req_set_ifindex(req, ifindex); + netdev_bind_rx_req_set_fd(req, dmabuf_fd); + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); + + rsp = netdev_bind_rx(*ys, req); + if (!rsp) { + perror("netdev_bind_rx"); + goto err_close; + } + + if (!rsp->_present.id) { + perror("id not present"); + goto err_close; + } + + fprintf(stderr, "got dmabuf id=%d\n", rsp->id); + dmabuf_id = rsp->id; + + netdev_bind_rx_req_free(req); + netdev_bind_rx_rsp_free(rsp); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_rx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + +static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + struct ynl_sock **ys) +{ + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + if (!rsp) { + perror("netdev_bind_tx"); + goto err_close; + } + + if (!rsp->_present.id) { + perror("id not present"); + goto err_close; + } + + fprintf(stderr, "got tx dmabuf id=%d\n", rsp->id); + tx_dmabuf_id = rsp->id; + + netdev_bind_tx_req_free(req); + netdev_bind_tx_rsp_free(rsp); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_tx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + +static void enable_reuseaddr(int fd) +{ + int opt = 1; + int ret; + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + if (ret) + error(1, errno, "%s: [FAIL, SO_REUSEPORT]\n", TEST_PREFIX); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + if (ret) + error(1, errno, "%s: [FAIL, SO_REUSEADDR]\n", TEST_PREFIX); +} + +static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) +{ + int ret; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + + ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr); + if (ret != 1) { + /* fallback to plain IPv4 */ + ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]); + if (ret != 1) + return -1; + + /* add ::ffff prefix */ + sin6->sin6_addr.s6_addr32[0] = 0; + sin6->sin6_addr.s6_addr32[1] = 0; + sin6->sin6_addr.s6_addr16[4] = 0; + sin6->sin6_addr.s6_addr16[5] = 0xffff; + } + + return 0; +} + +static int do_server(struct memory_buffer *mem) +{ + char ctrl_data[sizeof(int) * 20000]; + struct netdev_queue_id *queues; + size_t non_page_aligned_frags = 0; + struct sockaddr_in6 client_addr; + struct sockaddr_in6 server_sin; + size_t page_aligned_frags = 0; + size_t total_received = 0; + socklen_t client_addr_len; + bool is_devmem = false; + char *tmp_mem = NULL; + struct ynl_sock *ys; + char iobuf[819200]; + char buffer[256]; + int socket_fd; + int client_fd; + size_t i = 0; + int ret; + + ret = parse_address(server_ip, atoi(port), &server_sin); + if (ret < 0) + error(1, 0, "parse server address"); + + if (reset_flow_steering()) + error(1, 0, "Failed to reset flow steering\n"); + + if (configure_headersplit(1)) + error(1, 0, "Failed to enable TCP header split\n"); + + /* Configure RSS to divert all traffic from our devmem queues */ + if (configure_rss()) + error(1, 0, "Failed to configure rss\n"); + + /* Flow steer our devmem flows to start_queue */ + if (configure_flow_steering(&server_sin)) + error(1, 0, "Failed to configure flow steering\n"); + + sleep(1); + + queues = malloc(sizeof(*queues) * num_queues); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + error(1, 0, "Failed to bind\n"); + + tmp_mem = malloc(mem->size); + if (!tmp_mem) + error(1, ENOMEM, "malloc failed"); + + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (socket_fd < 0) + error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); + + enable_reuseaddr(socket_fd); + + fprintf(stderr, "binding to address %s:%d\n", server_ip, + ntohs(server_sin.sin6_port)); + + ret = bind(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) + error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); + + ret = listen(socket_fd, 1); + if (ret) + error(1, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); + + client_addr_len = sizeof(client_addr); + + inet_ntop(AF_INET6, &server_sin.sin6_addr, buffer, + sizeof(buffer)); + fprintf(stderr, "Waiting or connection on %s:%d\n", buffer, + ntohs(server_sin.sin6_port)); + client_fd = accept(socket_fd, &client_addr, &client_addr_len); + + inet_ntop(AF_INET6, &client_addr.sin6_addr, buffer, + sizeof(buffer)); + fprintf(stderr, "Got connection from %s:%d\n", buffer, + ntohs(client_addr.sin6_port)); + + while (1) { + struct iovec iov = { .iov_base = iobuf, + .iov_len = sizeof(iobuf) }; + struct dmabuf_cmsg *dmabuf_cmsg = NULL; + struct cmsghdr *cm = NULL; + struct msghdr msg = { 0 }; + struct dmabuf_token token; + ssize_t ret; + + is_devmem = false; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); + fprintf(stderr, "recvmsg ret=%ld\n", ret); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + continue; + if (ret < 0) { + perror("recvmsg"); + continue; + } + if (ret == 0) { + fprintf(stderr, "client exited\n"); + goto cleanup; + } + + i++; + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_SOCKET || + (cm->cmsg_type != SCM_DEVMEM_DMABUF && + cm->cmsg_type != SCM_DEVMEM_LINEAR)) { + fprintf(stderr, "skipping non-devmem cmsg\n"); + continue; + } + + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm); + is_devmem = true; + + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) { + /* TODO: process data copied from skb's linear + * buffer. + */ + fprintf(stderr, + "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n", + dmabuf_cmsg->frag_size); + + continue; + } + + token.token_start = dmabuf_cmsg->frag_token; + token.token_count = 1; + + total_received += dmabuf_cmsg->frag_size; + fprintf(stderr, + "received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", + dmabuf_cmsg->frag_offset >> PAGE_SHIFT, + dmabuf_cmsg->frag_offset % getpagesize(), + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size, dmabuf_cmsg->frag_token, + total_received, dmabuf_cmsg->dmabuf_id); + + if (dmabuf_cmsg->dmabuf_id != dmabuf_id) + error(1, 0, + "received on wrong dmabuf_id: flow steering error\n"); + + if (dmabuf_cmsg->frag_size % getpagesize()) + non_page_aligned_frags++; + else + page_aligned_frags++; + + provider->memcpy_from_device(tmp_mem, mem, + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size); + + if (do_validation) + validate_buffer(tmp_mem, + dmabuf_cmsg->frag_size); + else + print_nonzero_bytes(tmp_mem, + dmabuf_cmsg->frag_size); + + ret = setsockopt(client_fd, SOL_SOCKET, + SO_DEVMEM_DONTNEED, &token, + sizeof(token)); + if (ret != 1) + error(1, 0, + "SO_DEVMEM_DONTNEED not enough tokens"); + } + if (!is_devmem) + error(1, 0, "flow steering error\n"); + + fprintf(stderr, "total_received=%lu\n", total_received); + } + + fprintf(stderr, "%s: ok\n", TEST_PREFIX); + + fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + + fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + page_aligned_frags, non_page_aligned_frags); + +cleanup: + + free(tmp_mem); + close(client_fd); + close(socket_fd); + ynl_sock_destroy(ys); + + return 0; +} + +void run_devmem_tests(void) +{ + struct netdev_queue_id *queues; + struct memory_buffer *mem; + struct ynl_sock *ys; + size_t i = 0; + + mem = provider->alloc(getpagesize() * NUM_PAGES); + + /* Configure RSS to divert all traffic from our devmem queues */ + if (configure_rss()) + error(1, 0, "rss error\n"); + + queues = calloc(num_queues, sizeof(*queues)); + + if (configure_headersplit(1)) + error(1, 0, "Failed to configure header split\n"); + + if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + error(1, 0, "Binding empty queues array should have failed\n"); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (configure_headersplit(0)) + error(1, 0, "Failed to configure header split\n"); + + if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + error(1, 0, "Configure dmabuf with header split off should have failed\n"); + + if (configure_headersplit(1)) + error(1, 0, "Failed to configure header split\n"); + + for (i = 0; i < num_queues; i++) { + queues[i]._present.type = 1; + queues[i]._present.id = 1; + queues[i].type = NETDEV_QUEUE_TYPE_RX; + queues[i].id = start_queue + i; + } + + if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) + error(1, 0, "Failed to bind\n"); + + /* Deactivating a bound queue should not be legal */ + if (!configure_channels(num_queues, num_queues - 1)) + error(1, 0, "Deactivating a bound queue should be illegal.\n"); + + /* Closing the netlink socket does an implicit unbind */ + ynl_sock_destroy(ys); + + provider->free(mem); +} + +static uint64_t gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000ULL); +} + +static int do_poll(int fd) +{ + struct pollfd pfd; + int ret; + + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, waittime_ms); + if (ret == -1) + error(1, errno, "poll"); + + return ret && (pfd.revents & POLLERR); +} + +static void wait_compl(int fd) +{ + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + __u32 hi, lo; + int ret; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) + continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret < 0) { + if (errno == EAGAIN) + continue; + error(1, errno, "recvmsg(MSG_ERRQUEUE)"); + return; + } + if (msg.msg_flags & MSG_CTRUNC) + error(1, 0, "MSG_CTRUNC\n"); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_IP && + cm->cmsg_level != SOL_IPV6) + continue; + if (cm->cmsg_level == SOL_IP && + cm->cmsg_type != IP_RECVERR) + continue; + if (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type != IPV6_RECVERR) + continue; + + serr = (void *)CMSG_DATA(cm); + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "wrong origin %u", serr->ee_origin); + if (serr->ee_errno != 0) + error(1, 0, "wrong errno %d", serr->ee_errno); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stderr, "tx complete [%d,%d]\n", lo, hi); + return; + } + } + + error(1, 0, "did not receive tx completion"); +} + +static int do_client(struct memory_buffer *mem) +{ + char ctrl_data[CMSG_SPACE(sizeof(__u32))]; + struct sockaddr_in6 server_sin; + struct sockaddr_in6 client_sin; + struct ynl_sock *ys = NULL; + struct msghdr msg = {}; + ssize_t line_size = 0; + struct cmsghdr *cmsg; + struct iovec iov[2]; + char *line = NULL; + unsigned long mid; + size_t len = 0; + int socket_fd; + __u32 ddmabuf; + int opt = 1; + int ret; + + ret = parse_address(server_ip, atoi(port), &server_sin); + if (ret < 0) + error(1, 0, "parse server address"); + + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (socket_fd < 0) + error(1, socket_fd, "create socket"); + + enable_reuseaddr(socket_fd); + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, + strlen(ifname) + 1); + if (ret) + error(1, errno, "bindtodevice"); + + if (bind_tx_queue(ifindex, mem->fd, &ys)) + error(1, 0, "Failed to bind\n"); + + if (client_ip) { + ret = parse_address(client_ip, atoi(port), &client_sin); + if (ret < 0) + error(1, 0, "parse client address"); + + ret = bind(socket_fd, &client_sin, sizeof(client_sin)); + if (ret) + error(1, errno, "bind"); + } + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + if (ret) + error(1, errno, "set sock opt"); + + fprintf(stderr, "Connect to %s %d (via %s)\n", server_ip, + ntohs(server_sin.sin6_port), ifname); + + ret = connect(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) + error(1, errno, "connect"); + + while (1) { + free(line); + line = NULL; + line_size = getline(&line, &len, stdin); + + if (line_size < 0) + break; + + mid = (line_size / 2) + 1; + + iov[0].iov_base = (void *)1; + iov[0].iov_len = mid; + iov[1].iov_base = (void *)(mid + 2); + iov[1].iov_len = line_size - mid; + + provider->memcpy_to_device(mem, (size_t)iov[0].iov_base, line, + iov[0].iov_len); + provider->memcpy_to_device(mem, (size_t)iov[1].iov_base, + line + iov[0].iov_len, + iov[1].iov_len); + + fprintf(stderr, + "read line_size=%ld iov[0].iov_base=%lu, iov[0].iov_len=%lu, iov[1].iov_base=%lu, iov[1].iov_len=%lu\n", + line_size, (unsigned long)iov[0].iov_base, + iov[0].iov_len, (unsigned long)iov[1].iov_base, + iov[1].iov_len); + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + + ddmabuf = tx_dmabuf_id; + + *((__u32 *)CMSG_DATA(cmsg)) = ddmabuf; + + ret = sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + if (ret < 0) + error(1, errno, "Failed sendmsg"); + + fprintf(stderr, "sendmsg_ret=%d\n", ret); + + if (ret != line_size) + error(1, errno, "Did not send all bytes"); + + wait_compl(socket_fd); + } + + fprintf(stderr, "%s: tx ok\n", TEST_PREFIX); + + free(line); + close(socket_fd); + + if (ys) + ynl_sock_destroy(ys); + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct memory_buffer *mem; + int is_server = 0, opt; + int ret; + + while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) { + switch (opt) { + case 'l': + is_server = 1; + break; + case 's': + server_ip = optarg; + break; + case 'c': + client_ip = optarg; + break; + case 'p': + port = optarg; + break; + case 'v': + do_validation = atoll(optarg); + break; + case 'q': + num_queues = atoi(optarg); + break; + case 't': + start_queue = atoi(optarg); + break; + case 'f': + ifname = optarg; + break; + case '?': + fprintf(stderr, "unknown option: %c\n", optopt); + break; + } + } + + if (!ifname) + error(1, 0, "Missing -f argument\n"); + + ifindex = if_nametoindex(ifname); + + fprintf(stderr, "using ifindex=%u\n", ifindex); + + if (!server_ip && !client_ip) { + if (start_queue < 0 && num_queues < 0) { + num_queues = rxq_num(ifindex); + if (num_queues < 0) + error(1, 0, "couldn't detect number of queues\n"); + if (num_queues < 2) + error(1, 0, + "number of device queues is too low\n"); + /* make sure can bind to multiple queues */ + start_queue = num_queues / 2; + num_queues /= 2; + } + + if (start_queue < 0 || num_queues < 0) + error(1, 0, "Both -t and -q are required\n"); + + run_devmem_tests(); + return 0; + } + + if (start_queue < 0 && num_queues < 0) { + num_queues = rxq_num(ifindex); + if (num_queues < 2) + error(1, 0, "number of device queues is too low\n"); + + num_queues = 1; + start_queue = rxq_num(ifindex) - num_queues; + + if (start_queue < 0) + error(1, 0, "couldn't detect number of queues\n"); + + fprintf(stderr, "using queues %d..%d\n", start_queue, start_queue + num_queues); + } + + for (; optind < argc; optind++) + fprintf(stderr, "extra arguments: %s\n", argv[optind]); + + if (start_queue < 0) + error(1, 0, "Missing -t argument\n"); + + if (num_queues < 0) + error(1, 0, "Missing -q argument\n"); + + if (!server_ip) + error(1, 0, "Missing -s argument\n"); + + if (!port) + error(1, 0, "Missing -p argument\n"); + + mem = provider->alloc(getpagesize() * NUM_PAGES); + ret = is_server ? do_server(mem) : do_client(mem); + provider->free(mem); + + return ret; +} diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 59fe07ee2df9d9..cec3e1296aa271 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -18,7 +18,6 @@ ipv6_flowlabel_mgr log.txt msg_oob msg_zerocopy -ncdevmem netlink-dumps nettest psock_fanout diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2c4b6e404a7c7f..0466ce43409fa2 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -101,8 +101,6 @@ TEST_PROGS += ipv6_route_update_soft_lockup.sh # YNL files, must be before "include ..lib.mk" EXTRA_CLEAN += $(OUTPUT)/libynl.a -YNL_GEN_FILES := ncdevmem -TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh @@ -113,10 +111,6 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk -# YNL build -YNL_GENS := netdev -include ynl.mk - $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c deleted file mode 100644 index 64d6805381c50c..00000000000000 --- a/tools/testing/selftests/net/ncdevmem.c +++ /dev/null @@ -1,570 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#define __EXPORTED_HEADERS__ - -#include -#include -#include -#include -#include -#include -#include -#define __iovec_defined -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "netdev-user.h" -#include - -#define PAGE_SHIFT 12 -#define TEST_PREFIX "ncdevmem" -#define NUM_PAGES 16000 - -#ifndef MSG_SOCK_DEVMEM -#define MSG_SOCK_DEVMEM 0x2000000 -#endif - -/* - * tcpdevmem netcat. Works similarly to netcat but does device memory TCP - * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. - * - * Usage: - * - * On server: - * ncdevmem -s -c -f eth1 -l -p 5201 -v 7 - * - * On client: - * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ - * tr \\n \\0 | \ - * head -c 5G | \ - * nc 5201 -p 5201 - * - * Note this is compatible with regular netcat. i.e. the sender or receiver can - * be replaced with regular netcat to test the RX or TX path in isolation. - */ - -static char *server_ip = "192.168.1.4"; -static char *client_ip = "192.168.1.2"; -static char *port = "5201"; -static size_t do_validation; -static int start_queue = 8; -static int num_queues = 8; -static char *ifname = "eth1"; -static unsigned int ifindex; -static unsigned int dmabuf_id; - -void print_bytes(void *ptr, size_t size) -{ - unsigned char *p = ptr; - int i; - - for (i = 0; i < size; i++) - printf("%02hhX ", p[i]); - printf("\n"); -} - -void print_nonzero_bytes(void *ptr, size_t size) -{ - unsigned char *p = ptr; - unsigned int i; - - for (i = 0; i < size; i++) - putchar(p[i]); - printf("\n"); -} - -void validate_buffer(void *line, size_t size) -{ - static unsigned char seed = 1; - unsigned char *ptr = line; - int errors = 0; - size_t i; - - for (i = 0; i < size; i++) { - if (ptr[i] != seed) { - fprintf(stderr, - "Failed validation: expected=%u, actual=%u, index=%lu\n", - seed, ptr[i], i); - errors++; - if (errors > 20) - error(1, 0, "validation failed."); - } - seed++; - if (seed == do_validation) - seed = 0; - } - - fprintf(stdout, "Validated buffer\n"); -} - -#define run_command(cmd, ...) \ - ({ \ - char command[256]; \ - memset(command, 0, sizeof(command)); \ - snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ - printf("Running: %s\n", command); \ - system(command); \ - }) - -static int reset_flow_steering(void) -{ - int ret = 0; - - ret = run_command("sudo ethtool -K %s ntuple off", ifname); - if (ret) - return ret; - - return run_command("sudo ethtool -K %s ntuple on", ifname); -} - -static int configure_headersplit(bool on) -{ - return run_command("sudo ethtool -G %s tcp-data-split %s", ifname, - on ? "on" : "off"); -} - -static int configure_rss(void) -{ - return run_command("sudo ethtool -X %s equal %d", ifname, start_queue); -} - -static int configure_channels(unsigned int rx, unsigned int tx) -{ - return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx); -} - -static int configure_flow_steering(void) -{ - return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", - ifname, client_ip, server_ip, port, port, start_queue); -} - -static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, - struct netdev_queue_id *queues, - unsigned int n_queue_index, struct ynl_sock **ys) -{ - struct netdev_bind_rx_req *req = NULL; - struct netdev_bind_rx_rsp *rsp = NULL; - struct ynl_error yerr; - - *ys = ynl_sock_create(&ynl_netdev_family, &yerr); - if (!*ys) { - fprintf(stderr, "YNL: %s\n", yerr.msg); - return -1; - } - - req = netdev_bind_rx_req_alloc(); - netdev_bind_rx_req_set_ifindex(req, ifindex); - netdev_bind_rx_req_set_fd(req, dmabuf_fd); - __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); - - rsp = netdev_bind_rx(*ys, req); - if (!rsp) { - perror("netdev_bind_rx"); - goto err_close; - } - - if (!rsp->_present.id) { - perror("id not present"); - goto err_close; - } - - printf("got dmabuf id=%d\n", rsp->id); - dmabuf_id = rsp->id; - - netdev_bind_rx_req_free(req); - netdev_bind_rx_rsp_free(rsp); - - return 0; - -err_close: - fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); - netdev_bind_rx_req_free(req); - ynl_sock_destroy(*ys); - return -1; -} - -static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) -{ - struct udmabuf_create create; - int ret; - - *devfd = open("/dev/udmabuf", O_RDWR); - if (*devfd < 0) { - error(70, 0, - "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", - TEST_PREFIX); - } - - *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); - if (*memfd < 0) - error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX); - - /* Required for udmabuf */ - ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); - if (ret < 0) - error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); - - ret = ftruncate(*memfd, dmabuf_size); - if (ret == -1) - error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); - - memset(&create, 0, sizeof(create)); - - create.memfd = *memfd; - create.offset = 0; - create.size = dmabuf_size; - *buf = ioctl(*devfd, UDMABUF_CREATE, &create); - if (*buf < 0) - error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); -} - -int do_server(void) -{ - char ctrl_data[sizeof(int) * 20000]; - struct netdev_queue_id *queues; - size_t non_page_aligned_frags = 0; - struct sockaddr_in client_addr; - struct sockaddr_in server_sin; - size_t page_aligned_frags = 0; - int devfd, memfd, buf, ret; - size_t total_received = 0; - socklen_t client_addr_len; - bool is_devmem = false; - char *buf_mem = NULL; - struct ynl_sock *ys; - size_t dmabuf_size; - char iobuf[819200]; - char buffer[256]; - int socket_fd; - int client_fd; - size_t i = 0; - int opt = 1; - - dmabuf_size = getpagesize() * NUM_PAGES; - - create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); - - if (reset_flow_steering()) - error(1, 0, "Failed to reset flow steering\n"); - - /* Configure RSS to divert all traffic from our devmem queues */ - if (configure_rss()) - error(1, 0, "Failed to configure rss\n"); - - /* Flow steer our devmem flows to start_queue */ - if (configure_flow_steering()) - error(1, 0, "Failed to configure flow steering\n"); - - sleep(1); - - queues = malloc(sizeof(*queues) * num_queues); - - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - - if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) - error(1, 0, "Failed to bind\n"); - - buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED, - buf, 0); - if (buf_mem == MAP_FAILED) - error(1, 0, "mmap()"); - - server_sin.sin_family = AF_INET; - server_sin.sin_port = htons(atoi(port)); - - ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); - if (socket < 0) - error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX); - - socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); - if (socket < 0) - error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); - - ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, - sizeof(opt)); - if (ret) - error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); - - ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, - sizeof(opt)); - if (ret) - error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); - - printf("binding to address %s:%d\n", server_ip, - ntohs(server_sin.sin_port)); - - ret = bind(socket_fd, &server_sin, sizeof(server_sin)); - if (ret) - error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); - - ret = listen(socket_fd, 1); - if (ret) - error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); - - client_addr_len = sizeof(client_addr); - - inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, - sizeof(buffer)); - printf("Waiting or connection on %s:%d\n", buffer, - ntohs(server_sin.sin_port)); - client_fd = accept(socket_fd, &client_addr, &client_addr_len); - - inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, - sizeof(buffer)); - printf("Got connection from %s:%d\n", buffer, - ntohs(client_addr.sin_port)); - - while (1) { - struct iovec iov = { .iov_base = iobuf, - .iov_len = sizeof(iobuf) }; - struct dmabuf_cmsg *dmabuf_cmsg = NULL; - struct dma_buf_sync sync = { 0 }; - struct cmsghdr *cm = NULL; - struct msghdr msg = { 0 }; - struct dmabuf_token token; - ssize_t ret; - - is_devmem = false; - printf("\n\n"); - - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = ctrl_data; - msg.msg_controllen = sizeof(ctrl_data); - ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); - printf("recvmsg ret=%ld\n", ret); - if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) - continue; - if (ret < 0) { - perror("recvmsg"); - continue; - } - if (ret == 0) { - printf("client exited\n"); - goto cleanup; - } - - i++; - for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { - if (cm->cmsg_level != SOL_SOCKET || - (cm->cmsg_type != SCM_DEVMEM_DMABUF && - cm->cmsg_type != SCM_DEVMEM_LINEAR)) { - fprintf(stdout, "skipping non-devmem cmsg\n"); - continue; - } - - dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm); - is_devmem = true; - - if (cm->cmsg_type == SCM_DEVMEM_LINEAR) { - /* TODO: process data copied from skb's linear - * buffer. - */ - fprintf(stdout, - "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n", - dmabuf_cmsg->frag_size); - - continue; - } - - token.token_start = dmabuf_cmsg->frag_token; - token.token_count = 1; - - total_received += dmabuf_cmsg->frag_size; - printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", - dmabuf_cmsg->frag_offset >> PAGE_SHIFT, - dmabuf_cmsg->frag_offset % getpagesize(), - dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size, - dmabuf_cmsg->frag_token, total_received, - dmabuf_cmsg->dmabuf_id); - - if (dmabuf_cmsg->dmabuf_id != dmabuf_id) - error(1, 0, - "received on wrong dmabuf_id: flow steering error\n"); - - if (dmabuf_cmsg->frag_size % getpagesize()) - non_page_aligned_frags++; - else - page_aligned_frags++; - - sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - if (do_validation) - validate_buffer( - ((unsigned char *)buf_mem) + - dmabuf_cmsg->frag_offset, - dmabuf_cmsg->frag_size); - else - print_nonzero_bytes( - ((unsigned char *)buf_mem) + - dmabuf_cmsg->frag_offset, - dmabuf_cmsg->frag_size); - - sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - ret = setsockopt(client_fd, SOL_SOCKET, - SO_DEVMEM_DONTNEED, &token, - sizeof(token)); - if (ret != 1) - error(1, 0, - "SO_DEVMEM_DONTNEED not enough tokens"); - } - if (!is_devmem) - error(1, 0, "flow steering error\n"); - - printf("total_received=%lu\n", total_received); - } - - fprintf(stdout, "%s: ok\n", TEST_PREFIX); - - fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", - page_aligned_frags, non_page_aligned_frags); - - fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", - page_aligned_frags, non_page_aligned_frags); - -cleanup: - - munmap(buf_mem, dmabuf_size); - close(client_fd); - close(socket_fd); - close(buf); - close(memfd); - close(devfd); - ynl_sock_destroy(ys); - - return 0; -} - -void run_devmem_tests(void) -{ - struct netdev_queue_id *queues; - int devfd, memfd, buf; - struct ynl_sock *ys; - size_t dmabuf_size; - size_t i = 0; - - dmabuf_size = getpagesize() * NUM_PAGES; - - create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); - - /* Configure RSS to divert all traffic from our devmem queues */ - if (configure_rss()) - error(1, 0, "rss error\n"); - - queues = calloc(num_queues, sizeof(*queues)); - - if (configure_headersplit(1)) - error(1, 0, "Failed to configure header split\n"); - - if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) - error(1, 0, "Binding empty queues array should have failed\n"); - - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - - if (configure_headersplit(0)) - error(1, 0, "Failed to configure header split\n"); - - if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) - error(1, 0, "Configure dmabuf with header split off should have failed\n"); - - if (configure_headersplit(1)) - error(1, 0, "Failed to configure header split\n"); - - for (i = 0; i < num_queues; i++) { - queues[i]._present.type = 1; - queues[i]._present.id = 1; - queues[i].type = NETDEV_QUEUE_TYPE_RX; - queues[i].id = start_queue + i; - } - - if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) - error(1, 0, "Failed to bind\n"); - - /* Deactivating a bound queue should not be legal */ - if (!configure_channels(num_queues, num_queues - 1)) - error(1, 0, "Deactivating a bound queue should be illegal.\n"); - - /* Closing the netlink socket does an implicit unbind */ - ynl_sock_destroy(ys); -} - -int main(int argc, char *argv[]) -{ - int is_server = 0, opt; - - while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) { - switch (opt) { - case 'l': - is_server = 1; - break; - case 's': - server_ip = optarg; - break; - case 'c': - client_ip = optarg; - break; - case 'p': - port = optarg; - break; - case 'v': - do_validation = atoll(optarg); - break; - case 'q': - num_queues = atoi(optarg); - break; - case 't': - start_queue = atoi(optarg); - break; - case 'f': - ifname = optarg; - break; - case '?': - printf("unknown option: %c\n", optopt); - break; - } - } - - ifindex = if_nametoindex(ifname); - - for (; optind < argc; optind++) - printf("extra arguments: %s\n", argv[optind]); - - run_devmem_tests(); - - if (is_server) - return do_server(); - - return 0; -}