[PATCH v10 5/9] net/tap: rewrite the RSS BPF program
    Stephen Hemminger 
    stephen at networkplumber.org
       
    Wed May  1 18:12:04 CEST 2024
    
    
  
Rewrite of the BPF program used to do queue based RSS.
Important changes:
	- uses newer BPF map format BTF
	- accepts key as parameter rather than constant default
	- can do L3 or L4 hashing
	- supports IPv4 options
	- supports IPv6 extension headers
	- restructured for readability
The usage of BPF is different as well:
	- the incoming configuration is looked up based on
	  class parameters rather than patching the BPF code.
	- the resulting queue is placed in skb by using skb mark
	  rather than requiring a second pass through classifier step.
Note: This version only works with a later patch to enable it on
the DPDK driver side. It is submitted as an incremental patch
to allow for easier review. Bisection still works because
the old instructions are still present for now.
Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
---
 .gitignore                            |   3 -
 drivers/net/tap/bpf/Makefile          |  19 --
 drivers/net/tap/bpf/README            |  49 +++++
 drivers/net/tap/bpf/bpf_api.h         | 276 --------------------------
 drivers/net/tap/bpf/bpf_elf.h         |  53 -----
 drivers/net/tap/bpf/bpf_extract.py    |  85 --------
 drivers/net/tap/bpf/meson.build       |  81 ++++++++
 drivers/net/tap/bpf/tap_bpf_program.c | 255 ------------------------
 drivers/net/tap/bpf/tap_rss.c         | 264 ++++++++++++++++++++++++
 9 files changed, 394 insertions(+), 691 deletions(-)
 delete mode 100644 drivers/net/tap/bpf/Makefile
 create mode 100644 drivers/net/tap/bpf/README
 delete mode 100644 drivers/net/tap/bpf/bpf_api.h
 delete mode 100644 drivers/net/tap/bpf/bpf_elf.h
 delete mode 100644 drivers/net/tap/bpf/bpf_extract.py
 create mode 100644 drivers/net/tap/bpf/meson.build
 delete mode 100644 drivers/net/tap/bpf/tap_bpf_program.c
 create mode 100644 drivers/net/tap/bpf/tap_rss.c
diff --git a/.gitignore b/.gitignore
index 3f444dcace..01a47a7606 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,9 +36,6 @@ TAGS
 # ignore python bytecode files
 *.pyc
 
-# ignore BPF programs
-drivers/net/tap/bpf/tap_bpf_program.o
-
 # DTS results
 dts/output
 
diff --git a/drivers/net/tap/bpf/Makefile b/drivers/net/tap/bpf/Makefile
deleted file mode 100644
index 9efeeb1bc7..0000000000
--- a/drivers/net/tap/bpf/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# This file is not built as part of normal DPDK build.
-# It is used to generate the eBPF code for TAP RSS.
-
-CLANG=clang
-CLANG_OPTS=-O2
-TARGET=../tap_bpf_insns.h
-
-all: $(TARGET)
-
-clean:
-	rm tap_bpf_program.o $(TARGET)
-
-tap_bpf_program.o: tap_bpf_program.c
-	$(CLANG) $(CLANG_OPTS) -emit-llvm -c $< -o - | \
-	llc -march=bpf -filetype=obj -o $@
-
-$(TARGET): tap_bpf_program.o
-	python3 bpf_extract.py -stap_bpf_program.c -o $@ $<
diff --git a/drivers/net/tap/bpf/README b/drivers/net/tap/bpf/README
new file mode 100644
index 0000000000..181f76a134
--- /dev/null
+++ b/drivers/net/tap/bpf/README
@@ -0,0 +1,49 @@
+This is the BPF program used to implement Receive Side Scaling (RSS)
+across multiple queues if required by a flow action. The program is
+loaded into the kernel when the first RSS flow rule is created and is never unloaded.
+
+When flow rules are used with the TAP device, packets are first handled by the
+ingress queue discipline that then runs a series of classifier filter rules.
+The first stage is the flow based classifier (flower); for RSS queue
+action the second stage is the kernel skbedit action which sets
+the skb mark to a key based on the flow id; the final stage
+is this BPF program which then maps flow id and packet header
+into a queue id.
+
+This version is built with the BPF Compile Once — Run Everywhere (CO-RE)
+framework and uses libbpf and bpftool.
+
+Limitations
+-----------
+- requires libbpf to run
+
+- rebuilding the BPF requires the clang compiler with bpf available
+  as a target architecture and bpftool to convert object to headers.
+
+  Some older versions of Ubuntu do not have a working bpftool package.
+
+- only standard Toeplitz hash with standard 40 byte key is supported.
+
+- the number of flow rules using RSS is limited to 32.
+
+Building
+--------
+During the DPDK build process the meson build file checks that
+libbpf, bpftool, and clang are available. If everything works then
+BPF RSS is enabled.
+
+The steps are:
+
+1. Uses clang to compile tap_rss.c to produce tap_rss.bpf.o
+
+2. Uses bpftool to generate a skeleton header file tap_rss.skel.h
+   from tap_rss.bpf.o. This header contains wrapper functions for
+   managing the BPF and the actual BPF code as a large byte array.
+
+3. The header file is included in tap_flow.c so that it can load
+   the BPF code (via libbpf).
+
+References
+----------
+BPF and XDP reference guide
+https://docs.cilium.io/en/latest/bpf/progtypes/
diff --git a/drivers/net/tap/bpf/bpf_api.h b/drivers/net/tap/bpf/bpf_api.h
deleted file mode 100644
index 4cd25fa593..0000000000
--- a/drivers/net/tap/bpf/bpf_api.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
-
-#ifndef __BPF_API__
-#define __BPF_API__
-
-/* Note:
- *
- * This file can be included into eBPF kernel programs. It contains
- * a couple of useful helper functions, map/section ABI (bpf_elf.h),
- * misc macros and some eBPF specific LLVM built-ins.
- */
-
-#include <stdint.h>
-
-#include <linux/pkt_cls.h>
-#include <linux/bpf.h>
-#include <linux/filter.h>
-
-#include <asm/byteorder.h>
-
-#include "bpf_elf.h"
-
-/** libbpf pin type. */
-enum libbpf_pin_type {
-	LIBBPF_PIN_NONE,
-	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
-	LIBBPF_PIN_BY_NAME,
-};
-
-/** Type helper macros. */
-
-#define __uint(name, val) int (*name)[val]
-#define __type(name, val) typeof(val) *name
-#define __array(name, val) typeof(val) *name[]
-
-/** Misc macros. */
-
-#ifndef __stringify
-# define __stringify(X)		#X
-#endif
-
-#ifndef __maybe_unused
-# define __maybe_unused		__attribute__((__unused__))
-#endif
-
-#ifndef offsetof
-# define offsetof(TYPE, MEMBER)	__builtin_offsetof(TYPE, MEMBER)
-#endif
-
-#ifndef likely
-# define likely(X)		__builtin_expect(!!(X), 1)
-#endif
-
-#ifndef unlikely
-# define unlikely(X)		__builtin_expect(!!(X), 0)
-#endif
-
-#ifndef htons
-# define htons(X)		__constant_htons((X))
-#endif
-
-#ifndef ntohs
-# define ntohs(X)		__constant_ntohs((X))
-#endif
-
-#ifndef htonl
-# define htonl(X)		__constant_htonl((X))
-#endif
-
-#ifndef ntohl
-# define ntohl(X)		__constant_ntohl((X))
-#endif
-
-#ifndef __inline__
-# define __inline__		__attribute__((always_inline))
-#endif
-
-/** Section helper macros. */
-
-#ifndef __section
-# define __section(NAME)						\
-	__attribute__((section(NAME), used))
-#endif
-
-#ifndef __section_tail
-# define __section_tail(ID, KEY)					\
-	__section(__stringify(ID) "/" __stringify(KEY))
-#endif
-
-#ifndef __section_xdp_entry
-# define __section_xdp_entry						\
-	__section(ELF_SECTION_PROG)
-#endif
-
-#ifndef __section_cls_entry
-# define __section_cls_entry						\
-	__section(ELF_SECTION_CLASSIFIER)
-#endif
-
-#ifndef __section_act_entry
-# define __section_act_entry						\
-	__section(ELF_SECTION_ACTION)
-#endif
-
-#ifndef __section_lwt_entry
-# define __section_lwt_entry						\
-	__section(ELF_SECTION_PROG)
-#endif
-
-#ifndef __section_license
-# define __section_license						\
-	__section(ELF_SECTION_LICENSE)
-#endif
-
-#ifndef __section_maps
-# define __section_maps							\
-	__section(ELF_SECTION_MAPS)
-#endif
-
-/** Declaration helper macros. */
-
-#ifndef BPF_LICENSE
-# define BPF_LICENSE(NAME)						\
-	char ____license[] __section_license = NAME
-#endif
-
-/** Classifier helper */
-
-#ifndef BPF_H_DEFAULT
-# define BPF_H_DEFAULT	-1
-#endif
-
-/** BPF helper functions for tc. Individual flags are in linux/bpf.h */
-
-#ifndef __BPF_FUNC
-# define __BPF_FUNC(NAME, ...)						\
-	(* NAME)(__VA_ARGS__) __maybe_unused
-#endif
-
-#ifndef BPF_FUNC
-# define BPF_FUNC(NAME, ...)						\
-	__BPF_FUNC(NAME, __VA_ARGS__) = (void *) BPF_FUNC_##NAME
-#endif
-
-/* Map access/manipulation */
-static void *BPF_FUNC(map_lookup_elem, void *map, const void *key);
-static int BPF_FUNC(map_update_elem, void *map, const void *key,
-		    const void *value, uint32_t flags);
-static int BPF_FUNC(map_delete_elem, void *map, const void *key);
-
-/* Time access */
-static uint64_t BPF_FUNC(ktime_get_ns);
-
-/* Debugging */
-
-/* FIXME: __attribute__ ((format(printf, 1, 3))) not possible unless
- * llvm bug https://llvm.org/bugs/show_bug.cgi?id=26243 gets resolved.
- * It would require ____fmt to be made const, which generates a reloc
- * entry (non-map).
- */
-static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...);
-
-#ifndef printt
-# define printt(fmt, ...)						\
-	__extension__ ({						\
-		char ____fmt[] = fmt;					\
-		trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__);	\
-	})
-#endif
-
-/* Random numbers */
-static uint32_t BPF_FUNC(get_prandom_u32);
-
-/* Tail calls */
-static void BPF_FUNC(tail_call, struct __sk_buff *skb, void *map,
-		     uint32_t index);
-
-/* System helpers */
-static uint32_t BPF_FUNC(get_smp_processor_id);
-static uint32_t BPF_FUNC(get_numa_node_id);
-
-/* Packet misc meta data */
-static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb);
-static int BPF_FUNC(skb_under_cgroup, void *map, uint32_t index);
-
-static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb);
-static uint32_t BPF_FUNC(get_hash_recalc, struct __sk_buff *skb);
-static uint32_t BPF_FUNC(set_hash_invalid, struct __sk_buff *skb);
-
-/* Packet redirection */
-static int BPF_FUNC(redirect, int ifindex, uint32_t flags);
-static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex,
-		    uint32_t flags);
-
-/* Packet manipulation */
-static int BPF_FUNC(skb_load_bytes, struct __sk_buff *skb, uint32_t off,
-		    void *to, uint32_t len);
-static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off,
-		    const void *from, uint32_t len, uint32_t flags);
-
-static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off,
-		    uint32_t from, uint32_t to, uint32_t flags);
-static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off,
-		    uint32_t from, uint32_t to, uint32_t flags);
-static int BPF_FUNC(csum_diff, const void *from, uint32_t from_size,
-		    const void *to, uint32_t to_size, uint32_t seed);
-static int BPF_FUNC(csum_update, struct __sk_buff *skb, uint32_t wsum);
-
-static int BPF_FUNC(skb_change_type, struct __sk_buff *skb, uint32_t type);
-static int BPF_FUNC(skb_change_proto, struct __sk_buff *skb, uint32_t proto,
-		    uint32_t flags);
-static int BPF_FUNC(skb_change_tail, struct __sk_buff *skb, uint32_t nlen,
-		    uint32_t flags);
-
-static int BPF_FUNC(skb_pull_data, struct __sk_buff *skb, uint32_t len);
-
-/* Event notification */
-static int __BPF_FUNC(skb_event_output, struct __sk_buff *skb, void *map,
-		      uint64_t index, const void *data, uint32_t size) =
-		      (void *) BPF_FUNC_perf_event_output;
-
-/* Packet vlan encap/decap */
-static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto,
-		    uint16_t vlan_tci);
-static int BPF_FUNC(skb_vlan_pop, struct __sk_buff *skb);
-
-/* Packet tunnel encap/decap */
-static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb,
-		    struct bpf_tunnel_key *to, uint32_t size, uint32_t flags);
-static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb,
-		    const struct bpf_tunnel_key *from, uint32_t size,
-		    uint32_t flags);
-
-static int BPF_FUNC(skb_get_tunnel_opt, struct __sk_buff *skb,
-		    void *to, uint32_t size);
-static int BPF_FUNC(skb_set_tunnel_opt, struct __sk_buff *skb,
-		    const void *from, uint32_t size);
-
-/** LLVM built-ins, mem*() routines work for constant size */
-
-#ifndef lock_xadd
-# define lock_xadd(ptr, val)	((void) __sync_fetch_and_add(ptr, val))
-#endif
-
-#ifndef memset
-# define memset(s, c, n)	__builtin_memset((s), (c), (n))
-#endif
-
-#ifndef memcpy
-# define memcpy(d, s, n)	__builtin_memcpy((d), (s), (n))
-#endif
-
-#ifndef memmove
-# define memmove(d, s, n)	__builtin_memmove((d), (s), (n))
-#endif
-
-/* FIXME: __builtin_memcmp() is not yet fully usable unless llvm bug
- * https://llvm.org/bugs/show_bug.cgi?id=26218 gets resolved. Also
- * this one would generate a reloc entry (non-map), otherwise.
- */
-#if 0
-#ifndef memcmp
-# define memcmp(a, b, n)	__builtin_memcmp((a), (b), (n))
-#endif
-#endif
-
-unsigned long long load_byte(void *skb, unsigned long long off)
-	asm ("llvm.bpf.load.byte");
-
-unsigned long long load_half(void *skb, unsigned long long off)
-	asm ("llvm.bpf.load.half");
-
-unsigned long long load_word(void *skb, unsigned long long off)
-	asm ("llvm.bpf.load.word");
-
-#endif /* __BPF_API__ */
diff --git a/drivers/net/tap/bpf/bpf_elf.h b/drivers/net/tap/bpf/bpf_elf.h
deleted file mode 100644
index ea8a11c95c..0000000000
--- a/drivers/net/tap/bpf/bpf_elf.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
-#ifndef __BPF_ELF__
-#define __BPF_ELF__
-
-#include <asm/types.h>
-
-/* Note:
- *
- * Below ELF section names and bpf_elf_map structure definition
- * are not (!) kernel ABI. It's rather a "contract" between the
- * application and the BPF loader in tc. For compatibility, the
- * section names should stay as-is. Introduction of aliases, if
- * needed, are a possibility, though.
- */
-
-/* ELF section names, etc */
-#define ELF_SECTION_LICENSE	"license"
-#define ELF_SECTION_MAPS	"maps"
-#define ELF_SECTION_PROG	"prog"
-#define ELF_SECTION_CLASSIFIER	"classifier"
-#define ELF_SECTION_ACTION	"action"
-
-#define ELF_MAX_MAPS		64
-#define ELF_MAX_LICENSE_LEN	128
-
-/* Object pinning settings */
-#define PIN_NONE		0
-#define PIN_OBJECT_NS		1
-#define PIN_GLOBAL_NS		2
-
-/* ELF map definition */
-struct bpf_elf_map {
-	__u32 type;
-	__u32 size_key;
-	__u32 size_value;
-	__u32 max_elem;
-	__u32 flags;
-	__u32 id;
-	__u32 pinning;
-	__u32 inner_id;
-	__u32 inner_idx;
-};
-
-#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val)		\
-	struct ____btf_map_##name {				\
-		type_key key;					\
-		type_val value;					\
-	};							\
-	struct ____btf_map_##name				\
-	    __attribute__ ((section(".maps." #name), used))	\
-	    ____btf_map_##name = { }
-
-#endif /* __BPF_ELF__ */
diff --git a/drivers/net/tap/bpf/bpf_extract.py b/drivers/net/tap/bpf/bpf_extract.py
deleted file mode 100644
index 73c4dafe4e..0000000000
--- a/drivers/net/tap/bpf/bpf_extract.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright (c) 2023 Stephen Hemminger <stephen at networkplumber.org>
-
-import argparse
-import sys
-import struct
-from tempfile import TemporaryFile
-from elftools.elf.elffile import ELFFile
-
-
-def load_sections(elffile):
-    """Get sections of interest from ELF"""
-    result = []
-    parts = [("cls_q", "cls_q_insns"), ("l3_l4", "l3_l4_hash_insns")]
-    for name, tag in parts:
-        section = elffile.get_section_by_name(name)
-        if section:
-            insns = struct.iter_unpack('<BBhL', section.data())
-            result.append([tag, insns])
-    return result
-
-
-def dump_section(name, insns, out):
-    """Dump the array of BPF instructions"""
-    print(f'\nstatic struct bpf_insn {name}[] = {{', file=out)
-    for bpf in insns:
-        code = bpf[0]
-        src = bpf[1] >> 4
-        dst = bpf[1] & 0xf
-        off = bpf[2]
-        imm = bpf[3]
-        print(f'\t{{{code:#04x}, {dst:4d}, {src:4d}, {off:8d}, {imm:#010x}}},',
-              file=out)
-    print('};', file=out)
-
-
-def parse_args():
-    """Parse command line arguments"""
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-s',
-                        '--source',
-                        type=str,
-                        help="original source file")
-    parser.add_argument('-o', '--out', type=str, help="output C file path")
-    parser.add_argument("file",
-                        nargs='+',
-                        help="object file path or '-' for stdin")
-    return parser.parse_args()
-
-
-def open_input(path):
-    """Open the file or stdin"""
-    if path == "-":
-        temp = TemporaryFile()
-        temp.write(sys.stdin.buffer.read())
-        return temp
-    return open(path, 'rb')
-
-
-def write_header(out, source):
-    """Write file intro header"""
-    print("/* SPDX-License-Identifier: BSD-3-Clause", file=out)
-    if source:
-        print(f' * Auto-generated from {source}', file=out)
-    print(" * This not the original source file. Do NOT edit it.", file=out)
-    print(" */\n", file=out)
-
-
-def main():
-    '''program main function'''
-    args = parse_args()
-
-    with open(args.out, 'w',
-              encoding="utf-8") if args.out else sys.stdout as out:
-        write_header(out, args.source)
-        for path in args.file:
-            elffile = ELFFile(open_input(path))
-            sections = load_sections(elffile)
-            for name, insns in sections:
-                dump_section(name, insns, out)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/drivers/net/tap/bpf/meson.build b/drivers/net/tap/bpf/meson.build
new file mode 100644
index 0000000000..f2c03a19fd
--- /dev/null
+++ b/drivers/net/tap/bpf/meson.build
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2024 Stephen Hemminger <stephen at networkplumber.org>
+
+enable_tap_rss = false
+
+libbpf = dependency('libbpf', required: false, method: 'pkg-config')
+if not libbpf.found()
+    message('net/tap: no RSS support missing libbpf')
+    subdir_done()
+endif
+
+# Debian installs this in /usr/sbin which is not in $PATH
+bpftool = find_program('bpftool', '/usr/sbin/bpftool', required: false, version: '>= 5.6.0')
+if not bpftool.found()
+    message('net/tap: no RSS support missing bpftool')
+    subdir_done()
+endif
+
+clang_supports_bpf = false
+clang = find_program('clang', required: false)
+if clang.found()
+    clang_supports_bpf = run_command(clang, '-target', 'bpf', '--print-supported-cpus',
+                                     check: false).returncode() == 0
+endif
+
+if not clang_supports_bpf
+    message('net/tap: no RSS support missing clang BPF')
+    subdir_done()
+endif
+
+enable_tap_rss = true
+
+libbpf_include_dir = libbpf.get_variable(pkgconfig : 'includedir')
+
+# The include files <linux/bpf.h> and others include <asm/types.h>
+# but <asm/types.h> is not defined for multi-lib environment target.
+# Workaround by using include directory from the host build environment.
+machine_name = run_command('uname', '-m', check: true).stdout().strip()
+march_include_dir = '/usr/include/' + machine_name + '-linux-gnu'
+
+clang_flags = [
+    '-O2',
+    '-Wall',
+    '-Wextra',
+    '-target',
+    'bpf',
+    '-g',
+    '-c',
+]
+
+bpf_o_cmd = [
+    clang,
+    clang_flags,
+    '-idirafter',
+    libbpf_include_dir,
+    '-idirafter',
+    march_include_dir,
+    '@INPUT@',
+    '-o',
+    '@OUTPUT@'
+]
+
+skel_h_cmd = [
+    bpftool,
+    'gen',
+    'skeleton',
+    '@INPUT@'
+]
+
+tap_rss_o = custom_target(
+    'tap_rss.bpf.o',
+    input: 'tap_rss.c',
+    output: 'tap_rss.o',
+    command: bpf_o_cmd)
+
+tap_rss_skel_h = custom_target(
+    'tap_rss.skel.h',
+    input: tap_rss_o,
+    output: 'tap_rss.skel.h',
+    command: skel_h_cmd,
+    capture: true)
diff --git a/drivers/net/tap/bpf/tap_bpf_program.c b/drivers/net/tap/bpf/tap_bpf_program.c
deleted file mode 100644
index f05aed021c..0000000000
--- a/drivers/net/tap/bpf/tap_bpf_program.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
- * Copyright 2017 Mellanox Technologies, Ltd
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <asm/types.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/if_ether.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/if_tunnel.h>
-#include <linux/filter.h>
-
-#include "bpf_api.h"
-#include "bpf_elf.h"
-#include "../tap_rss.h"
-
-/** Create IPv4 address */
-#define IPv4(a, b, c, d) ((__u32)(((a) & 0xff) << 24) | \
-		(((b) & 0xff) << 16) | \
-		(((c) & 0xff) << 8)  | \
-		((d) & 0xff))
-
-#define PORT(a, b) ((__u16)(((a) & 0xff) << 8) | \
-		((b) & 0xff))
-
-/*
- * The queue number is offset by a unique QUEUE_OFFSET, to distinguish
- * packets that have gone through this rule (skb->cb[1] != 0) from others.
- */
-#define QUEUE_OFFSET		0x7cafe800
-#define PIN_GLOBAL_NS		2
-
-#define KEY_IDX			0
-#define BPF_MAP_ID_KEY	1
-
-struct vlan_hdr {
-	__be16 proto;
-	__be16 tci;
-};
-
-struct bpf_elf_map __attribute__((section("maps"), used))
-map_keys = {
-	.type           =       BPF_MAP_TYPE_HASH,
-	.id             =       BPF_MAP_ID_KEY,
-	.size_key       =       sizeof(__u32),
-	.size_value     =       sizeof(struct rss_key),
-	.max_elem       =       256,
-	.pinning        =       PIN_GLOBAL_NS,
-};
-
-__section("cls_q") int
-match_q(struct __sk_buff *skb)
-{
-	__u32 queue = skb->cb[1];
-	/* queue is set by tap_flow_bpf_cls_q() before load */
-	volatile __u32 q = 0xdeadbeef;
-	__u32 match_queue = QUEUE_OFFSET + q;
-
-	/* printt("match_q$i() queue = %d\n", queue); */
-
-	if (queue != match_queue)
-		return TC_ACT_OK;
-
-	/* queue match */
-	skb->cb[1] = 0;
-	return TC_ACT_UNSPEC;
-}
-
-
-struct ipv4_l3_l4_tuple {
-	__u32    src_addr;
-	__u32    dst_addr;
-	__u16    dport;
-	__u16    sport;
-} __attribute__((packed));
-
-struct ipv6_l3_l4_tuple {
-	__u8        src_addr[16];
-	__u8        dst_addr[16];
-	__u16       dport;
-	__u16       sport;
-} __attribute__((packed));
-
-static const __u8 def_rss_key[TAP_RSS_HASH_KEY_SIZE] = {
-	0xd1, 0x81, 0xc6, 0x2c,
-	0xf7, 0xf4, 0xdb, 0x5b,
-	0x19, 0x83, 0xa2, 0xfc,
-	0x94, 0x3e, 0x1a, 0xdb,
-	0xd9, 0x38, 0x9e, 0x6b,
-	0xd1, 0x03, 0x9c, 0x2c,
-	0xa7, 0x44, 0x99, 0xad,
-	0x59, 0x3d, 0x56, 0xd9,
-	0xf3, 0x25, 0x3c, 0x06,
-	0x2a, 0xdc, 0x1f, 0xfc,
-};
-
-static __u32  __attribute__((always_inline))
-rte_softrss_be(const __u32 *input_tuple, const uint8_t *rss_key,
-		__u8 input_len)
-{
-	__u32 i, j, hash = 0;
-#pragma unroll
-	for (j = 0; j < input_len; j++) {
-#pragma unroll
-		for (i = 0; i < 32; i++) {
-			if (input_tuple[j] & (1U << (31 - i))) {
-				hash ^= ((const __u32 *)def_rss_key)[j] << i |
-				(__u32)((uint64_t)
-				(((const __u32 *)def_rss_key)[j + 1])
-					>> (32 - i));
-			}
-		}
-	}
-	return hash;
-}
-
-static int __attribute__((always_inline))
-rss_l3_l4(struct __sk_buff *skb)
-{
-	void *data_end = (void *)(long)skb->data_end;
-	void *data = (void *)(long)skb->data;
-	__u16 proto = (__u16)skb->protocol;
-	__u32 key_idx = 0xdeadbeef;
-	__u32 hash;
-	struct rss_key *rsskey;
-	__u64 off = ETH_HLEN;
-	int j;
-	__u8 *key = 0;
-	__u32 len;
-	__u32 queue = 0;
-	bool mf = 0;
-	__u16 frag_off = 0;
-
-	rsskey = map_lookup_elem(&map_keys, &key_idx);
-	if (!rsskey) {
-		printt("hash(): rss key is not configured\n");
-		return TC_ACT_OK;
-	}
-	key = (__u8 *)rsskey->key;
-
-	/* Get correct proto for 802.1ad */
-	if (skb->vlan_present && skb->vlan_proto == htons(ETH_P_8021AD)) {
-		if (data + ETH_ALEN * 2 + sizeof(struct vlan_hdr) +
-		    sizeof(proto) > data_end)
-			return TC_ACT_OK;
-		proto = *(__u16 *)(data + ETH_ALEN * 2 +
-				   sizeof(struct vlan_hdr));
-		off += sizeof(struct vlan_hdr);
-	}
-
-	if (proto == htons(ETH_P_IP)) {
-		if (data + off + sizeof(struct iphdr) + sizeof(__u32)
-			> data_end)
-			return TC_ACT_OK;
-
-		__u8 *src_dst_addr = data + off + offsetof(struct iphdr, saddr);
-		__u8 *frag_off_addr = data + off + offsetof(struct iphdr, frag_off);
-		__u8 *prot_addr = data + off + offsetof(struct iphdr, protocol);
-		__u8 *src_dst_port = data + off + sizeof(struct iphdr);
-		struct ipv4_l3_l4_tuple v4_tuple = {
-			.src_addr = IPv4(*(src_dst_addr + 0),
-					*(src_dst_addr + 1),
-					*(src_dst_addr + 2),
-					*(src_dst_addr + 3)),
-			.dst_addr = IPv4(*(src_dst_addr + 4),
-					*(src_dst_addr + 5),
-					*(src_dst_addr + 6),
-					*(src_dst_addr + 7)),
-			.sport = 0,
-			.dport = 0,
-		};
-		/** Fetch the L4-payer port numbers only in-case of TCP/UDP
-		 ** and also if the packet is not fragmented. Since fragmented
-		 ** chunks do not have L4 TCP/UDP header.
-		 **/
-		if (*prot_addr == IPPROTO_UDP || *prot_addr == IPPROTO_TCP) {
-			frag_off = PORT(*(frag_off_addr + 0),
-					*(frag_off_addr + 1));
-			mf = frag_off & 0x2000;
-			frag_off = frag_off & 0x1fff;
-			if (mf == 0 && frag_off == 0) {
-				v4_tuple.sport = PORT(*(src_dst_port + 0),
-						*(src_dst_port + 1));
-				v4_tuple.dport = PORT(*(src_dst_port + 2),
-						*(src_dst_port + 3));
-			}
-		}
-		__u8 input_len = sizeof(v4_tuple) / sizeof(__u32);
-		if (rsskey->hash_fields & (1 << HASH_FIELD_IPV4_L3))
-			input_len--;
-		hash = rte_softrss_be((__u32 *)&v4_tuple, key, 3);
-	} else if (proto == htons(ETH_P_IPV6)) {
-		if (data + off + sizeof(struct ipv6hdr) +
-					sizeof(__u32) > data_end)
-			return TC_ACT_OK;
-		__u8 *src_dst_addr = data + off +
-					offsetof(struct ipv6hdr, saddr);
-		__u8 *src_dst_port = data + off +
-					sizeof(struct ipv6hdr);
-		__u8 *next_hdr = data + off +
-					offsetof(struct ipv6hdr, nexthdr);
-
-		struct ipv6_l3_l4_tuple v6_tuple;
-		for (j = 0; j < 4; j++)
-			*((uint32_t *)&v6_tuple.src_addr + j) =
-				__builtin_bswap32(*((uint32_t *)
-						src_dst_addr + j));
-		for (j = 0; j < 4; j++)
-			*((uint32_t *)&v6_tuple.dst_addr + j) =
-				__builtin_bswap32(*((uint32_t *)
-						src_dst_addr + 4 + j));
-
-		/** Fetch the L4 header port-numbers only if next-header
-		 * is TCP/UDP **/
-		if (*next_hdr == IPPROTO_UDP || *next_hdr == IPPROTO_TCP) {
-			v6_tuple.sport = PORT(*(src_dst_port + 0),
-				      *(src_dst_port + 1));
-			v6_tuple.dport = PORT(*(src_dst_port + 2),
-				      *(src_dst_port + 3));
-		} else {
-			v6_tuple.sport = 0;
-			v6_tuple.dport = 0;
-		}
-
-		__u8 input_len = sizeof(v6_tuple) / sizeof(__u32);
-		if (rsskey->hash_fields & (1 << HASH_FIELD_IPV6_L3))
-			input_len--;
-		hash = rte_softrss_be((__u32 *)&v6_tuple, key, 9);
-	} else {
-		return TC_ACT_PIPE;
-	}
-
-	queue = rsskey->queues[(hash % rsskey->nb_queues) &
-				       (TAP_MAX_QUEUES - 1)];
-	skb->cb[1] = QUEUE_OFFSET + queue;
-	/* printt(">>>>> rss_l3_l4 hash=0x%x queue=%u\n", hash, queue); */
-
-	return TC_ACT_RECLASSIFY;
-}
-
-#define RSS(L)						\
-	__section(#L) int				\
-		L ## _hash(struct __sk_buff *skb)	\
-	{						\
-		return rss_ ## L (skb);			\
-	}
-
-RSS(l3_l4)
-
-BPF_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/tap/bpf/tap_rss.c b/drivers/net/tap/bpf/tap_rss.c
new file mode 100644
index 0000000000..888b3bdc24
--- /dev/null
+++ b/drivers/net/tap/bpf/tap_rss.c
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ * Copyright 2017 Mellanox Technologies, Ltd
+ */
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "../tap_rss.h"
+
+/*
+ * This map provides configuration information about flows which need BPF RSS.
+ * It is a hash map keyed by the 32-bit skb mark; each value is a struct rss_key
+ * and at most TAP_RSS_MAX entries can be stored.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct rss_key));
+	__uint(max_entries, TAP_RSS_MAX);
+} rss_map SEC(".maps");
+
+#define IP_MF		0x2000		/** IP header Flags **/
+#define IP_OFFSET	0x1FFF		/** IP header fragment offset **/
+
+/*
+ * Compute Toeplitz hash over input_len words of input_tuple; reads key[0..input_len].
+ * This is same as rte_softrss_be in lib/hash but loop is set up for BPF restrictions.
+ * key[j + 1] is widened to 64 bits because at i == 0 it is shifted right by 32.
+ */
+static __u32 __attribute__((always_inline))
+softrss_be(const __u32 *input_tuple, __u32 input_len, const __u32 *key)
+{
+	__u32 i, j, hash = 0;
+
+#pragma unroll
+	for (j = 0; j < input_len; j++) {
+#pragma unroll
+		for (i = 0; i < 32; i++) {
+			if (input_tuple[j] & (1U << (31 - i)))
+				hash ^= key[j] << i | (__u32)((__u64)key[j + 1] >> (32 - i));
+		}
+	}
+	return hash;
+}
+
+/*
+ * Compute RSS hash for IPv4 packet; hash_type selects L3 (addresses) or L4 (plus ports).
+ * Returns 0 if a header cannot be read, or for fragments and non TCP/UDP when L4 hashing.
+ */
+static __u32 __attribute__((always_inline))
+parse_ipv4(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
+{
+	struct iphdr iph;
+	__u32 off = 0;
+
+	if (bpf_skb_load_bytes_relative(skb, off, &iph, sizeof(iph), BPF_HDR_START_NET))
+		return 0;	/* no IP header present */
+
+	struct {
+		__u32    src_addr;
+		__u32    dst_addr;
+		__u16    dport;
+		__u16    sport;
+	} v4_tuple = {
+		.src_addr = bpf_ntohl(iph.saddr),
+		.dst_addr = bpf_ntohl(iph.daddr),
+	};
+
+	/* If only calculating L3 hash, do it now over the address words (ports excluded) */
+	if (hash_type & (1 << HASH_FIELD_IPV4_L3))
+		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32) - 1, key);
+
+	/* If packet is fragmented (MF set or non-zero offset) then no L4 hash is possible */
+	if ((iph.frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
+		return 0;
+
+	/* Do RSS on UDP or TCP protocols */
+	if (iph.protocol == IPPROTO_UDP || iph.protocol == IPPROTO_TCP) {
+		__u16 src_dst_port[2];
+
+		off += iph.ihl * 4;	/* ihl counts 32-bit words, so this also skips options */
+		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
+						BPF_HDR_START_NET))
+			return 0; /* TCP or UDP header missing */
+
+		v4_tuple.sport = bpf_ntohs(src_dst_port[0]);
+		v4_tuple.dport = bpf_ntohs(src_dst_port[1]);
+		return softrss_be((__u32 *)&v4_tuple, sizeof(v4_tuple) / sizeof(__u32), key);
+	}
+
+	/* Other protocol */
+	return 0;
+}
+
+/*
+ * Walk at most MAX_EXT_HDRS IPv6 extension headers, advancing *off past each one.
+ * Sets *frag on a fragment header. Returns next proto, or -1 if unreadable or too many.
+ */
+static int __attribute__((always_inline))
+skip_ip6_ext(__u16 proto, const struct __sk_buff *skb, __u32 *off, int *frag)
+{
+	struct ext_hdr {
+		__u8 next_hdr;
+		__u8 len;
+	} xh;
+	unsigned int i;
+
+	*frag = 0;
+
+#define MAX_EXT_HDRS 5
+#pragma unroll
+	for (i = 0; i < MAX_EXT_HDRS; i++) {
+		switch (proto) {
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_ROUTING:
+		case IPPROTO_DSTOPTS:
+			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
+							BPF_HDR_START_NET))
+				return -1;
+
+			*off += (xh.len + 1) * 8;	/* len is in 8 byte units after the first 8 */
+			proto = xh.next_hdr;
+			break;
+		case IPPROTO_FRAGMENT:
+			if (bpf_skb_load_bytes_relative(skb, *off, &xh, sizeof(xh),
+							BPF_HDR_START_NET))
+				return -1;
+
+			*off += 8;	/* fixed size, the len byte is not used here */
+			proto = xh.next_hdr;
+			*frag = 1;
+			return proto; /* this is always the last ext hdr */
+		default:
+			return proto;
+		}
+	}
+
+	/* too many extension headers give up */
+	return -1;
+}
+
+/*
+ * Compute RSS hash for IPv6 packet; hash_type selects L3 (addresses) or L4 (plus ports).
+ * Returns 0 if a header cannot be parsed, or for fragments and non TCP/UDP when L4 hashing.
+ */
+static __u32 __attribute__((always_inline))
+parse_ipv6(const struct __sk_buff *skb, __u32 hash_type, const __u32 *key)
+{
+	struct {
+		__u32       src_addr[4];
+		__u32       dst_addr[4];
+		__u16       dport;
+		__u16       sport;
+	} v6_tuple = { };
+	struct ipv6hdr ip6h;
+	__u32 off = 0, j;
+	int proto, frag;
+
+	if (bpf_skb_load_bytes_relative(skb, off, &ip6h, sizeof(ip6h), BPF_HDR_START_NET))
+		return 0;	/* missing IPv6 header */
+
+#pragma unroll
+	for (j = 0; j < 4; j++) {
+		v6_tuple.src_addr[j] = bpf_ntohl(ip6h.saddr.in6_u.u6_addr32[j]);
+		v6_tuple.dst_addr[j] = bpf_ntohl(ip6h.daddr.in6_u.u6_addr32[j]);
+	}
+
+	/* If only doing L3 hash, do it now over the address words (ports excluded) */
+	if (hash_type & (1 << HASH_FIELD_IPV6_L3))
+		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32) - 1, key);
+
+	/* Skip extension headers if present */
+	off += sizeof(ip6h);
+	proto = skip_ip6_ext(ip6h.nexthdr, skb, &off, &frag);
+	if (proto < 0)
+		return 0;	/* unreadable or too many extension headers */
+
+	/* If packet is a fragment then no L4 hash is possible */
+	if (frag)
+		return 0;
+
+	/* Do RSS on UDP or TCP */
+	if (proto == IPPROTO_UDP || proto == IPPROTO_TCP) {
+		__u16 src_dst_port[2];
+
+		if (bpf_skb_load_bytes_relative(skb, off, &src_dst_port, sizeof(src_dst_port),
+						BPF_HDR_START_NET))
+			return 0;	/* TCP or UDP header missing */
+
+		v6_tuple.sport = bpf_ntohs(src_dst_port[0]);
+		v6_tuple.dport = bpf_ntohs(src_dst_port[1]);
+
+		return softrss_be((__u32 *)&v6_tuple, sizeof(v6_tuple) / sizeof(__u32), key);
+	}
+
+	return 0;	/* other protocol */
+}
+
+/*
+ * Pick the hash routine from the skb protocol: IPv4 and IPv6 are handled,
+ * anything else yields 0 meaning no hash is possible.
+ */
+static __u32 __attribute__((always_inline))
+calculate_rss_hash(const struct __sk_buff *skb, const struct rss_key *rsskey)
+{
+	const __u32 *toeplitz_key = (const __u32 *)rsskey->key;
+	const __u32 l3_proto = skb->protocol;
+
+	if (l3_proto == bpf_htons(ETH_P_IP))
+		return parse_ipv4(skb, rsskey->hash_fields, toeplitz_key);
+	if (l3_proto == bpf_htons(ETH_P_IPV6))
+		return parse_ipv6(skb, rsskey->hash_fields, toeplitz_key);
+	return 0;
+}
+
+/* Map val onto [0, n) via the high word of val * n; val should span all of u32. */
+static __u32  __attribute__((always_inline))
+reciprocal_scale(__u32 val, __u32 n)
+{
+	const __u64 product = (__u64)val * n;
+
+	/* upper 32 bits of the 64-bit product */
+	return (__u32)(product >> 32);
+}
+
+/*
+ * When this BPF program is run by tc from the filter classifier,
+ * it is able to read skb metadata and packet data.
+ *
+ * The RSS configuration is looked up in rss_map using skb->mark. If there is no
+ * entry, or the computed hash is 0, just return TC_ACT_OK. Otherwise set
+ * skb->queue_mapping to the hash scaled by nb_queues and return TC_ACT_PIPE.
+ *
+ * This should be BPF_PROG_TYPE_SCHED_ACT so section needs to be "action"
+ */
+SEC("action") int
+rss_flow_action(struct __sk_buff *skb)
+{
+	const struct rss_key *rsskey;
+	__u32 mark = skb->mark;
+	__u32 hash;
+
+	/* Lookup RSS configuration for that BPF class */
+	rsskey = bpf_map_lookup_elem(&rss_map, &mark);
+	if (rsskey == NULL)
+		return TC_ACT_OK;
+
+	hash = calculate_rss_hash(skb, rsskey);
+	if (!hash)
+		return TC_ACT_OK;	/* 0 means no hash could be computed */
+
+	/* Fold hash to the number of queues configured */
+	skb->queue_mapping = reciprocal_scale(hash, rsskey->nb_queues);
+	return TC_ACT_PIPE;
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
-- 
2.43.0
    
    
More information about the dev
mailing list