google · gfantom · Aug 23, 2023 · Aug 25, 2023 · Sep 12, 2023 · Sep 12, 2023
diff --git a/Makefile b/Makefile
@@ -18,7 +18,20 @@
 
 all: binaries
 
-CFLAGS = -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA
+CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA
+
+HEADERS_DIR := usr/include
+
+# Optional devmem TCP backends, enabled with e.g. `make WITH_TCPDEVMEM_CUDA=1`.
+# The bodies are indented with spaces rather than tabs so that they can never
+# be parsed as recipe lines of a preceding rule.
+ifdef WITH_TCPDEVMEM_CUDA
+  CFLAGS += -DWITH_TCPDEVMEM_CUDA -I $(HEADERS_DIR)
+endif
+ifdef WITH_TCPDEVMEM_UDMABUF
+  CFLAGS += -DWITH_TCPDEVMEM_UDMABUF -DNDEBUG=1 -static -I $(HEADERS_DIR)
+  LDFLAGS += -static
+endif
+
+# $(call ifndef_any_of,VARS): non-empty if any variable named in VARS is undefined.
+# $(call ifdef_any_of,VARS):  non-empty if any variable named in VARS has an
+# origin other than "undefined" (unlike `ifdef`, an empty value still counts).
+ifndef_any_of = $(filter undefined,$(foreach v,$(1),$(origin $(v))))
+ifdef_any_of = $(filter-out undefined,$(foreach v,$(1),$(origin $(v))))
 
 lib := \
 	check_all_options.o \
@@ -48,6 +61,16 @@ lib := \
 tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib)
 
 tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib)
+# Add the object files of the devmem TCP backends selected at build time.
+ifdef WITH_TCPDEVMEM_CUDA
+  tcp_stream-objs += tcpdevmem_cuda.o
+endif
+ifdef WITH_TCPDEVMEM_UDMABUF
+  tcp_stream-objs += tcpdevmem_udmabuf.o
+endif
+# tcpdevmem.o is linked whenever either backend is enabled.  The flags' values
+# are tested here (rather than their $(origin)) so that a defined-but-empty
+# flag is treated as "off" by all three conditionals alike.
+ifneq ($(strip $(WITH_TCPDEVMEM_CUDA) $(WITH_TCPDEVMEM_UDMABUF)),)
+  tcp_stream-objs += tcpdevmem.o
+endif
+
 
 tcp_crr-objs := tcp_crr_main.o tcp_crr.o rr.o $(lib)
 
@@ -63,11 +86,18 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib)
 
 ext-libs := -lm -lrt -lpthread
 
+# CUDA compiler driver; override with `make NVCC=/path/to/nvcc`.
+NVCC ?= nvcc
+
+# Compile only the .cu source ($<), so that any additional prerequisites of
+# this object are never passed to the compiler as inputs.
+tcpdevmem_cuda.o: tcpdevmem_cuda.cu
+	$(NVCC) -arch=sm_90 -O3 -g -I $(HEADERS_DIR) -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $<
+
 tcp_rr: $(tcp_rr-objs)
 	$(CC) $(LDFLAGS) -o $@ $^ $(ext-libs)
 
 tcp_stream: $(tcp_stream-objs)
+# CUDA builds link with the C++ compiler driver; $(CXX) defaults to g++ in
+# GNU Make and can be overridden from the command line.
+ifdef WITH_TCPDEVMEM_CUDA
+	$(CXX) $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda
+else
 	$(CC) $(LDFLAGS) -o $@ $^ $(ext-libs)
+endif
 
 tcp_crr: $(tcp_crr-objs)
 	$(CC) $(LDFLAGS) -o $@ $^ $(ext-libs)

diff --git a/README_tcpdevmem.md b/README_tcpdevmem.md
@@ -0,0 +1,212 @@
+# Neper with TCPDevmem run instructions
+
+Table of Contents
+- [Neper with TCPDevmem run instructions](#neper-with-tcpdevmem-run-instructions)
+  - [TCPDevmem UDMABUF: Compiling tcp\_stream](#tcpdevmem-udmabuf-compiling-tcp_stream)
+    - [Manually specifying kernel headers directory (i.e. NOT in `usr/include`)](#manually-specifying-kernel-headers-directory-ie-not-in-usrinclude)
+  - [Running tcp\_stream](#running-tcp_stream)
+    - [Added flags](#added-flags)
+    - [Running tcp\_stream via `multi_neper.py`](#running-tcp_stream-via-multi_neperpy)
+      - [Example of successful output](#example-of-successful-output)
+    - [Running tcp\_stream directly](#running-tcp_stream-directly)
+
+
+## TCPDevmem UDMABUF: Compiling tcp_stream
+
+**UDMABUF-capable tcp_stream can be built statically on a workstation.**
+
+Neper can be built statically on a host with UDMABUF header files.
+
+```
+# clone the Neper repository and checkout the tcpd branch
+git clone -b tcpd https://github.com/google/neper.git
+cd neper
+
+# copy kernel header files to Neper working directory
+# (assumed to be found in ~/kernel/usr/include)
+mkdir usr
+cp -r ~/kernel/usr/include/ ./usr/
+
+make tcp_stream WITH_TCPDEVMEM_UDMABUF=1
+
+# copy the binary to your hosts
+scp tcp_stream root@${HOST1}:~/
+scp multi_neper.py root@${HOST1}:~/
+
+scp tcp_stream root@${HOST2}:~/
+scp multi_neper.py root@${HOST2}:~/
+```
+
+### Manually specifying kernel headers directory (i.e. NOT in `usr/include`)
+
+Copying the header files is unnecessary if you override the `HEADERS_DIR` variable when running make. The default value for this variable is `usr/include`.
+
+```
+git clone -b tcpd https://github.com/google/neper.git
+cd neper
+
+make tcp_stream WITH_TCPDEVMEM_UDMABUF=1 HEADERS_DIR=~/kernel/usr/include
+```
+
+
+## Running tcp_stream
+
+
+### Added flags
+
+In general, these flags will be automatically populated by `multi_neper.py`.
+
+```
+--tcpd-validate     # payload validation - must pass to both Tx/Rx if enabled
+--tcpd-rx-cpy       # copies payload to another buffer (but doesn't validate)
+--tcpd-nic-pci-addr
+--tcpd-gpu-pci-addr
+--tcpd-phys-len     # CUDA mode allows for a much larger value than UDMABUF mode
+--tcpd-src-ip
+--tcpd-dst-ip
+--tcpd-link-name
+--queue-start
+--queue-num
+```
+
+`--tcpd-validate`: the client fills the send buffer with the repeating sequence 1 through 111 (inclusive), and the host verifies that the received data follows the same sequence.
+
+
+### Running tcp_stream via `multi_neper.py`
+
+`multi_neper.py` is a Python script that runs multiple tcp_streams in parallel, which is useful when running tcp_stream across multiple pairs of NICs.
+
+The script also calls ethtool commands on the receiver (host) before spawning tcp_streams, to set the receiver into a TCPDevmem-capable state.
+
+To view all of `multi_neper.py`’s accepted flags, run `multi_neper.py --help`.
+
+
+```
+# Rx (host)
+FLOWS=2
+BUF_SIZE=409600
+DEVS=eth1,eth2,eth3,eth4
+DSTS=192.168.1.26,192.168.2.26,192.168.3.26,192.168.4.26 # host IP addresses
+SRCS=192.168.1.23,192.168.2.23,192.168.3.23,192.168.4.23 # client IP addresses
+./multi_neper.py --hosts $DSTS \
+  --devices $DEVS --buffer-size $BUF_SIZE \
+  --flows $FLOWS --threads $FLOWS \
+  --src-ips $SRCS --log DEBUG \
+  --q-num $FLOWS --phys-len 2147483648 \
+  --mode cuda
+
+
+# Tx (client)
+FLOWS=2
+BUF_SIZE=409600
+DEVS=eth1,eth2,eth3,eth4
+DSTS=192.168.1.26,192.168.2.26,192.168.3.26,192.168.4.26
+SRCS=192.168.1.23,192.168.2.23,192.168.3.23,192.168.4.23
+./multi_neper.py --hosts $DSTS \
+  --devices $DEVS --buffer-size $BUF_SIZE \
+  --flows $FLOWS --threads $FLOWS \
+  --src-ips $SRCS --log DEBUG \
+  --q-num $FLOWS --phys-len 2147483648 \
+  --client \
+  --mode cuda
+```
+
+#### Example of successful output
+
+```
+DEBUG:root:minflt_end=6037
+DEBUG:root:majflt_start=0
+DEBUG:root:majflt_end=0
+DEBUG:root:nvcsw_start=653
+DEBUG:root:nvcsw_end=675141
+DEBUG:root:nivcsw_start=2
+DEBUG:root:nivcsw_end=1018
+DEBUG:root:num_samples=155
+DEBUG:root:time_end=613529.729042674
+DEBUG:root:correlation_coefficient=1.00
+DEBUG:root:throughput=193669.32
+DEBUG:root:throughput_units=Mbit/s
+DEBUG:root:local_throughput=193669323769
+DEBUG:root:remote_throughput=0
+DEBUG:root:
+[eth1] Throughput (Mb/s): 193551.94
+[eth2] Throughput (Mb/s): 193652.69
+[eth3] Throughput (Mb/s): 193640.21
+[eth4] Throughput (Mb/s): 193669.32
+```
+
+
+
+### Running tcp_stream directly
+
+**If you’re running Neper outside of the container, make sure to run**
+
+```
+sudo -s
+```
+
+**before everything. `ethtool` commands and queue binding are only available to the superuser.**
+
+Before running tcp_stream, the ethtool commands that `multi_neper.py` runs should also be run:
+
+```
+# run as superuser, if running Neper as root
+sudo -s
+
+res_link() {
+ethtool --set-priv-flags $1 enable-strict-header-split on
+ethtool --set-priv-flags $1 enable-strict-header-split off
+ethtool --set-priv-flags $1 enable-header-split off
+ethtool --set-rxfh-indir $1 equal 16
+ethtool -K $1 ntuple off
+ethtool --set-priv-flags $1 enable-strict-header-split off
+ethtool --set-priv-flags $1 enable-header-split off
+ethtool -K $1 ntuple off
+ethtool --set-priv-flags $1 enable-max-rx-buffer-size on
+ethtool -K $1 ntuple on
+}
+
+# call on each link you plan to run tcp_stream across
+res_link eth1
+```
+
+
+You can then run `multi_neper.py` with the `--dry-run` flag, to see what tcp_stream commands the script would run:
+
+
+```
+$ FLOWS=1
+$ BUF_SIZE=409600
+$ DEVS=eth1
+$ DSTS=192.168.1.26
+$ SRCS=192.168.1.23
+$ ./multi_neper.py --hosts $DSTS \
+  --devices $DEVS --buffer-size $BUF_SIZE \
+  --flows $FLOWS --threads $FLOWS \
+  --src-ips $SRCS --log DEBUG \
+  --q-num $FLOWS --phys-len 2147483648 \
+  --client \
+  --mode cuda \
+  --dry-run
+
+DEBUG:root:running on ['eth1']
+DEBUG:root:('taskset --cpu-list 2-2 ./tcp_stream -T 1 -F 1 --port 12345 --source-port 12345 --control-port 12866 --buffer-size 409600  -l 10 --num-ports 1 --tcpd-phys-len 2147483648 --tcpd-nic-pci-addr 0000:06:00.0 --tcpd-gpu-pci-addr 0000:04:00.0 -c -H 192.168.1.26', {'CUDA_VISIBLE_DEVICES': '0', ...
+```
+
+The script will print the tcp_stream command, as well as the environment variables. The only environment variable that matters is `CUDA_VISIBLE_DEVICES` if running in `cuda` mode, which tells tcp_stream which GPU it should allocate memory on.
+
+You can then reset the receiver, and copy/paste the command:
+
+```
+# on Rx (host)
+res_link eth1
+./multi_neper.py --dry-run ${other_rx_args}
+
+CUDA_VISIBLE_DEVICES=0 ./tcp_stream # copy cmd from previous line
+
+
+# on Tx (client)
+./multi_neper.py --dry-run ${other_tx_args}
+
+CUDA_VISIBLE_DEVICES=0 ./tcp_stream # copy cmd from previous line
+```
diff --git a/check_all_options.c b/check_all_options.c
@@ -103,6 +103,34 @@ void check_options_tcp_rr(struct options *opts, struct callbacks *cb)
 
 void check_options_tcp_stream(struct options *opts, struct callbacks *cb)
 {
+#ifdef WITH_TCPDEVMEM_CUDA
+        /* Checks that only apply when a GPU PCI address was given. */
+        if (opts->tcpd_gpu_pci_addr) {
+                CHECK(cb, opts->tcpd_nic_pci_addr,
+                      "Must provide NIC PCI address if GPU PCI address was provided.");
+        }
+        /* With a GPU PCI address, the rx copy flag is rejected on clients. */
+        if (opts->tcpd_gpu_pci_addr && opts->client) {
+                CHECK(cb, !opts->tcpd_rx_cpy,
+                      "Copying CUDA buffer to userspace only allowed on hosts.");
+        }
+#endif /* WITH_TCPDEVMEM_CUDA */
+#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF)
+        /* Checks that apply whenever a NIC PCI address was given. */
+        if (opts->tcpd_nic_pci_addr) {
+                CHECK(cb, opts->tcpd_phys_len > 0,
+                      "Must provide non-zero --tcpd-phys-len flag when running in devmem TCP mode.");
+                CHECK(cb, opts->num_flows == opts->num_threads,
+                      "Thread/Flow count must be equal when running in devmem TCP mode.");
+                CHECK(cb, opts->num_flows == opts->num_ports,
+                      "Number of ports should equal number of flows when running in devmem TCP mode.");
+        }
+        /* The non-client side must additionally name both IP addresses. */
+        if (opts->tcpd_nic_pci_addr && !opts->client) {
+                CHECK(cb, opts->tcpd_src_ip,
+                      "Must provide source IP address for devmem TCP host.");
+                CHECK(cb, opts->tcpd_dst_ip,
+                      "Must provide destination IP address for devmem TCP host.");
+        }
+#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */
 }
 
 void check_options_udp_rr(struct options *opts, struct callbacks *cb)

diff --git a/define_all_flags.c b/define_all_flags.c
@@ -145,6 +145,18 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp)
         DEFINE_FLAG(fp, bool,          split_bidir ,    false,    0,  "Bidirectional using separate tx/rx sockets");
         DEFINE_FLAG(fp, bool,          enable_tcp_maerts,    false,   'M', "Enables TCP_MAERTS test (server writes and client reads). It overrides enable_read, and enable_write");
         DEFINE_FLAG(fp, bool,          async_connect,   false,   0,  "use non blocking connect");
+#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF)
+        DEFINE_FLAG(fp, bool,          tcpd_validate,   false,  0, "Validates that received data is a repeating sequence of 1 to 111 inclusive");
+        DEFINE_FLAG(fp, bool,          tcpd_rx_cpy,     false,  0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer");
+        DEFINE_FLAG(fp, const char *,  tcpd_nic_pci_addr, 0,    0, "NIC PCI addr, e.x. 0000:06:00.0");
+        DEFINE_FLAG(fp, const char *,  tcpd_gpu_pci_addr, 0,    0, "GPU PCI addr, e.x. 0000:04:00.0");
+        DEFINE_FLAG(fp, unsigned long long, tcpd_phys_len, 0,   0, "Remote memory length for tcpdevmem");
+        DEFINE_FLAG(fp, const char *,  tcpd_src_ip,     0,      0, "Src ip address for tcpdevmem");
+        DEFINE_FLAG(fp, const char *,  tcpd_dst_ip,     0,      0, "Dst ip address for tcpdevmem");
+        DEFINE_FLAG(fp, const char *,  tcpd_link_name,  "eth1", 0, "Link name to bind DMA buffer_pages for Rx");
+        DEFINE_FLAG(fp, int,           queue_start,     8,      0, "Queue to start flow-steering at");
+        DEFINE_FLAG(fp, int,           queue_num,       4,      0, "Number of queues to flow-steer to");
+#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */
 
         /* Return the updated fp */
         return (fp);

diff --git a/flags.c b/flags.c
@@ -157,6 +157,8 @@ static void default_parser(const char *type, char *arg, void *out,
                 *(unsigned long *)out = strtoul(arg, NULL, 0);
         else if (strcmp(type, "double") == 0)
                 *(double *)out = atof(arg);
+        else if (strcmp(type, "unsigned long long") == 0)
+                *(unsigned long long *)out = strtoull(arg, NULL, 0);
         else
                 LOG_ERROR(cb, "Unknown type `%s' for arg `%s'.", type, arg);
 }
@@ -339,6 +341,8 @@ static void print_flag(const struct flag *flag, struct callbacks *cb)
                 PRINT(cb, name, "%f", *(double *)var);
         else if (strcmp(type, "long long") == 0)
                 PRINT(cb, name, "%lld", *(long long *)var);
+        else if (strcmp(type, "unsigned long long") == 0)
+                PRINT(cb, name, "%llu", *(unsigned long long *)var);
         else
                 LOG_ERROR(cb, "Unknown type `%s' for variable %s", type, name);
 }

diff --git a/flow.c b/flow.c
@@ -19,6 +19,12 @@
 #include "socket.h"
 #include "thread.h"
 #include "stats.h"
+#ifdef WITH_TCPDEVMEM_CUDA
+#include "tcpdevmem_cuda.h"
+#endif /* WITH_TCPDEVMEM_CUDA */
+#ifdef WITH_TCPDEVMEM_UDMABUF
+#include "tcpdevmem_udmabuf.h"
+#endif /* WITH_TCPDEVMEM_UDMABUF */
 
 /*
  * We define the flow struct locally to this file to force outside users to go
@@ -271,6 +277,16 @@ void flow_delete(struct flow *f)
                 thread_clear_flow_or_die(f->f_thread, f);
         }
 
+#ifdef WITH_TCPDEVMEM_CUDA
+        if (flow_thread(f)->opts->tcpd_gpu_pci_addr)
+                cuda_flow_cleanup(f->f_mbuf);
+#endif /* WITH_TCPDEVMEM_CUDA */
+#ifdef WITH_TCPDEVMEM_UDMABUF
+        if (flow_thread(f)->opts->tcpd_nic_pci_addr
+            && !flow_thread(f)->opts->tcpd_gpu_pci_addr)
+                udmabuf_flow_cleanup(f->f_mbuf);
+#endif /* WITH_TCPDEVMEM_UDMABUF */
+
 /* TODO: need to free the stat struct here for crr tests */
         free(f->f_opaque);
         /* Right now the test is always false, but let's leave it in case

diff --git a/lib.h b/lib.h
@@ -108,6 +108,18 @@ struct options {
         bool async_connect;
 
         /* tcp_stream */
+#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF)
+        bool tcpd_validate;
+        bool tcpd_rx_cpy;
+        const char *tcpd_nic_pci_addr;
+        const char *tcpd_gpu_pci_addr;
+        unsigned long long tcpd_phys_len;
+        const char *tcpd_src_ip;
+        const char *tcpd_dst_ip;
+        const char *tcpd_link_name;
+        int queue_start;
+        int queue_num;
+#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */
         bool split_bidir;  /* implies enable_read, enable_write, split rx/tx */
         bool enable_read;
         bool enable_write;