diff --git a/patches/v2/v2-0000-cover-letter.patch b/patches/v2/v2-0000-cover-letter.patch new file mode 100644 index 0000000..5e047bf --- /dev/null +++ b/patches/v2/v2-0000-cover-letter.patch @@ -0,0 +1,107 @@ +From: Christian Hopps +Subject: [PATCH ipsec-next v2 0/17] Add IP-TFS mode to xfrm + +Summary of Changes +------------------ + +This patchset adds a new xfrm mode implementing on-demand IP-TFS. IP-TFS +(AggFrag encapsulation) has been standardized in RFC9347. + + Link: https://www.rfc-editor.org/rfc/rfc9347.txt + +This feature supports demand-driven (i.e., non-constant send rate) +IP-TFS to take advantage of the AGGFRAG ESP payload encapsulation. This +payload type supports aggregation and fragmentation of the inner IP +packet stream, which in turn yields higher small-packet bandwidth as well +as reduced MTU/PMTU issues. Congestion control is unimplemented, as +the send rate is demand driven rather than constant. + +In order to allow loading this functionality as a module, a set of +callbacks, xfrm_mode_cbs, has been added to xfrm as well. + +Patchset Changes: +----------------- + + 23 files changed, 3252 insertions(+), 19 deletions(-) + Documentation/networking/xfrm_sysctl.rst | 30 + + include/net/netns/xfrm.h | 6 + + include/net/xfrm.h | 40 + + include/uapi/linux/in.h | 2 + + include/uapi/linux/ip.h | 16 + + include/uapi/linux/ipsec.h | 3 +- + include/uapi/linux/snmp.h | 3 + + include/uapi/linux/xfrm.h | 9 +- + net/ipv4/esp4.c | 3 +- + net/ipv6/esp6.c | 3 +- + net/netfilter/nft_xfrm.c | 3 +- + net/xfrm/Makefile | 1 + + net/xfrm/trace_iptfs.h | 218 +++ + net/xfrm/xfrm_compat.c | 10 +- + net/xfrm/xfrm_device.c | 4 +- + net/xfrm/xfrm_input.c | 14 +- + net/xfrm/xfrm_iptfs.c | 2741 ++++++++++++++++++++++++++++++ + net/xfrm/xfrm_output.c | 6 + + net/xfrm/xfrm_policy.c | 26 +- + net/xfrm/xfrm_proc.c | 3 + + net/xfrm/xfrm_state.c | 60 + + net/xfrm/xfrm_sysctl.c | 38 + + net/xfrm/xfrm_user.c | 32 + + +Patchset Structure: +------------------- + +The first 7 commits are changes to the xfrm infrastructure to support +the callbacks as well as more generic IP-TFS additions that may be used +outside the actual IP-TFS implementation. + + - iptfs: config: add CONFIG_XFRM_IPTFS + - iptfs: uapi: ip: add ip_tfs_*_hdr packet formats + - iptfs: uapi: IPPROTO_AGGFRAG AGGFRAG in ESP + - iptfs: sysctl: allow configuration of global default values + - iptfs: netlink: add config (netlink) options + - iptfs: xfrm: Add mode_cbs module functionality + - iptfs: xfrm: add generic iptfs defines and functionality + +The last 9+1 commits constitute the IP-TFS implementation, constructed in +layers to make review easier. The first 9 commits all apply to a single +file `net/xfrm/xfrm_iptfs.c`; the last commit adds a new tracepoint +header file along with the use of these new tracepoint calls.
+ + - iptfs: impl: add new iptfs xfrm mode impl + - iptfs: impl: add user packet (tunnel ingress) handling + - iptfs: impl: share page fragments of inner packets + - iptfs: impl: add fragmenting of larger than MTU user packets + - iptfs: impl: add basic receive packet (tunnel egress) handling + - iptfs: impl: handle received fragmented inner packets + - iptfs: impl: add reusing received skb for the tunnel egress packet + - iptfs: impl: add skb-fragment sharing code + - iptfs: impl: handle reordering of received packets + - iptfs: impl: add tracepoint functionality + +Patchset History: +----------------- + +RFCv1 (11/10/2023) + +RFCv1 -> RFCv2 (11/12/2023) + + Updates based on feedback from Simon Horman, Antony, + Michael Richardson, and kernel test robot. + +RFCv2 -> v1 (2/19/2024) + + Updates based on feedback from Sabrina Dubroca and kernel test robot. + +v1 -> v2 (5/19/2024) + + Updates based on feedback from Sabrina Dubroca, Simon Horman, Antony. + + o Add handling of new netlink SA direction attribute (Antony). + o Split single patch/commit of xfrm_iptfs.c (the actual IP-TFS impl) + into 9+1 distinct layered functionality commits to aid review. + - xfrm: fix return check on clone() callback + - xfrm: add sa_len() callback in xfrm_mode_cbs for copy to user + - iptfs: remove unneeded skb free count variable + - iptfs: remove unused variable and "breadcrumb" for future code. + - iptfs: use do_div() to avoid "__udivdi3 missing" link failure. + - iptfs: remove some BUG_ON() assertions questioned in review. \ No newline at end of file diff --git a/patches/v2/v2-0001-iptfs-config-add-CONFIG_XFRM_IPTFS.patch b/patches/v2/v2-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch similarity index 89% rename from patches/v2/v2-0001-iptfs-config-add-CONFIG_XFRM_IPTFS.patch rename to patches/v2/v2-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch index ff7ee0d..94f3a2a 100644 --- a/patches/v2/v2-0001-iptfs-config-add-CONFIG_XFRM_IPTFS.patch +++ b/patches/v2/v2-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch @@ -1,7 +1,7 @@ -From 53a3a402912b9373cf757d9d613bbc270a66881e Mon Sep 17 00:00:00 2001 +From 5624f2ed73dc421981acfdd4978962dcc9c47a14 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Sun, 12 Nov 2023 06:28:49 -0500 -Subject: [PATCH ipsec-next v2 1/8] iptfs: config: add CONFIG_XFRM_IPTFS +Subject: [PATCH ipsec-next v2 01/17] xfrm: config: add CONFIG_XFRM_IPTFS Add new Kconfig option to enable IP-TFS (RFC9347) functionality. @@ -38,5 +38,5 @@ index d7b16f2c23e9..f0157702718f 100644 bool -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0002-iptfs-uapi-ip-add-ip_tfs_-_hdr-packet-formats.patch b/patches/v2/v2-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch similarity index 86% rename from patches/v2/v2-0002-iptfs-uapi-ip-add-ip_tfs_-_hdr-packet-formats.patch rename to patches/v2/v2-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch index 553ca66..aae71fc 100644 --- a/patches/v2/v2-0002-iptfs-uapi-ip-add-ip_tfs_-_hdr-packet-formats.patch +++ b/patches/v2/v2-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch @@ -1,7 +1,7 @@ -From 4cdbf70870800953a9e42d3f936097cc87553e17 Mon Sep 17 00:00:00 2001 +From 648cd1fb4681e18c6e5c519218cd9535c21a14e6 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Wed, 20 Apr 2022 13:15:20 -0400 -Subject: [PATCH ipsec-next v2 2/8] iptfs: uapi: ip: add ip_tfs_*_hdr packet +Subject: [PATCH ipsec-next v2 02/17] include: uapi: add ip_tfs_*_hdr packet formats Add the on-wire basic and congestion-control IP-TFS packet headers.
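For context, the basic (sub-type 0) AGGFRAG payload header is only four octets in front of the aggregated/fragmented inner-packet data blocks. The sketch below shows the kind of uapi structure this commit adds; the struct and field names here are illustrative assumptions, not quoted from the patch, so see include/uapi/linux/ip.h in the applied patch for the authoritative definitions:

    /* Sketch of the basic AGGFRAG/IP-TFS payload header (per RFC9347).
     * Names are assumptions for illustration only.
     */
    struct ip_iptfs_hdr {
            __u8    subtype;        /* 0: basic, 1: congestion control */
            __u8    flags;
            __be16  block_offset;   /* octets of continuation data before
                                     * the next data block begins */
    };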
@@ -39,5 +39,5 @@ index 283dec7e3645..5bd7ce934d74 100644 enum { -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0003-iptfs-uapi-IPPROTO_AGGFRAG-AGGFRAG-in-ESP.patch b/patches/v2/v2-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch similarity index 83% rename from patches/v2/v2-0003-iptfs-uapi-IPPROTO_AGGFRAG-AGGFRAG-in-ESP.patch rename to patches/v2/v2-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch index ffffc7a..dcb2057 100644 --- a/patches/v2/v2-0003-iptfs-uapi-IPPROTO_AGGFRAG-AGGFRAG-in-ESP.patch +++ b/patches/v2/v2-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch @@ -1,7 +1,8 @@ -From e0b405306b76c78e9b3ef5e2a76a594a13b9d8f0 Mon Sep 17 00:00:00 2001 +From 362a550bcdb56935c48bcc9532468af191f0525f Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Sat, 27 Aug 2022 02:26:52 +0000 -Subject: [PATCH ipsec-next v2 3/8] iptfs: uapi: IPPROTO_AGGFRAG AGGFRAG in ESP +Subject: [PATCH ipsec-next v2 03/17] include: uapi: add IPPROTO_AGGFRAG for + AGGFRAG in ESP Add the RFC-assigned IP protocol number for AGGFRAG. @@ -24,5 +25,5 @@ index e682ab628dfa..e6a1f3e4c58c 100644 #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MPTCP = 262, /* Multipath TCP connection */ -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0004-iptfs-sysctl-allow-configuration-of-global-defaul.patch b/patches/v2/v2-0004-xfrm-sysctl-allow-configuration-of-global-default.patch similarity index 96% rename from patches/v2/v2-0004-iptfs-sysctl-allow-configuration-of-global-defaul.patch rename to patches/v2/v2-0004-xfrm-sysctl-allow-configuration-of-global-default.patch index b405b6d..f15edd4 100644 --- a/patches/v2/v2-0004-iptfs-sysctl-allow-configuration-of-global-defaul.patch +++ b/patches/v2/v2-0004-xfrm-sysctl-allow-configuration-of-global-default.patch @@ -1,7 +1,7 @@ -From f78a7f448129d65028fff3c9257a3d0477e8b2d8 Mon Sep 17 00:00:00 2001 +From 75b39d8517b8b16aa9225caa7548954b3a8d972d Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Wed, 26 Jul 2023 14:20:52 +0000 -Subject: [PATCH ipsec-next v2 4/8] iptfs: sysctl: allow configuration of +Subject: [PATCH ipsec-next v2 04/17] xfrm: sysctl: allow configuration of global default values Add sysctls for changing the IPTFS default SA values. @@ -132,5 +132,5 @@ index 7fdeafc838a7..dddb1025b7de 100644 /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0005-iptfs-netlink-add-config-netlink-options.patch b/patches/v2/v2-0005-iptfs-netlink-add-config-netlink-options.patch deleted file mode 100644 index e62d49b..0000000 --- a/patches/v2/v2-0005-iptfs-netlink-add-config-netlink-options.patch +++ /dev/null @@ -1,109 +0,0 @@ -From 8451b570ef7b726939aed17aeeaa72af96067f94 Mon Sep 17 00:00:00 2001 -From: Christian Hopps -Date: Sun, 12 Nov 2023 06:02:21 -0500 -Subject: [PATCH ipsec-next v2 5/8] iptfs: netlink: add config (netlink) - options - -Add netlink options for configuring IP-TFS SAs.
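These XFRMA_IPTFS_* attributes are validated by the nla_policy entries added below and then consumed by the IP-TFS mode implementation. As a hedged sketch of the consuming side (the helper name here is hypothetical; struct xfrm_iptfs_config with these fields appears later in net/xfrm/xfrm_iptfs.c, and the real parsing is done in the IP-TFS module via its xfrm_mode_cbs netlink callback):

    /* Illustrative only: copy validated IP-TFS attributes into the
     * mode's config using the standard nla_get_*() helpers.
     */
    static void iptfs_load_config(struct xfrm_iptfs_config *xc,
                                  struct nlattr **attrs)
    {
            if (attrs[XFRMA_IPTFS_PKT_SIZE])
                    xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
            if (attrs[XFRMA_IPTFS_MAX_QSIZE])
                    xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
            if (attrs[XFRMA_IPTFS_DONT_FRAG])
                    xc->dont_frag = true;   /* NLA_FLAG: presence means set */
            if (attrs[XFRMA_IPTFS_REORDER_WINDOW])
                    xc->reorder_win_size =
                            nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]);
    }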
- -Signed-off-by: Christian Hopps ---- - include/uapi/linux/xfrm.h | 9 ++++++++- - net/xfrm/xfrm_compat.c | 10 ++++++++-- - net/xfrm/xfrm_user.c | 16 ++++++++++++++++ - 3 files changed, 32 insertions(+), 3 deletions(-) - -diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h -index 6a77328be114..fcb8f1583284 100644 ---- a/include/uapi/linux/xfrm.h -+++ b/include/uapi/linux/xfrm.h -@@ -153,7 +153,8 @@ enum { - #define XFRM_MODE_ROUTEOPTIMIZATION 2 - #define XFRM_MODE_IN_TRIGGER 3 - #define XFRM_MODE_BEET 4 --#define XFRM_MODE_MAX 5 -+#define XFRM_MODE_IPTFS 5 -+#define XFRM_MODE_MAX 6 - - /* Netlink configuration messages. */ - enum { -@@ -315,6 +316,12 @@ enum xfrm_attr_type_t { - XFRMA_SET_MARK_MASK, /* __u32 */ - XFRMA_IF_ID, /* __u32 */ - XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ -+ XFRMA_IPTFS_PKT_SIZE, /* __u32 Size of outer packet, 0 for PMTU */ -+ XFRMA_IPTFS_MAX_QSIZE, /* __u32 max ingress queue size */ -+ XFRMA_IPTFS_DONT_FRAG, /* don't use fragmentation */ -+ XFRMA_IPTFS_DROP_TIME, /* __u32 usec to wait for next seq */ -+ XFRMA_IPTFS_REORDER_WINDOW, /* __u16 reorder window size */ -+ XFRMA_IPTFS_INIT_DELAY, /* __u32 initial packet wait delay (usec) */ - __XFRMA_MAX - - #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ -diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c -index 655fe4ff8621..0e533b89b17f 100644 ---- a/net/xfrm/xfrm_compat.c -+++ b/net/xfrm/xfrm_compat.c -@@ -277,9 +277,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) - case XFRMA_SET_MARK_MASK: - case XFRMA_IF_ID: - case XFRMA_MTIMER_THRESH: -+ case XFRMA_IPTFS_PKT_SIZE: -+ case XFRMA_IPTFS_MAX_QSIZE: -+ case XFRMA_IPTFS_DONT_FRAG: -+ case XFRMA_IPTFS_DROP_TIME: -+ case XFRMA_IPTFS_REORDER_WINDOW: -+ case XFRMA_IPTFS_INIT_DELAY: - return xfrm_nla_cpy(dst, src, nla_len(src)); - default: -- BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); -+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_INIT_DELAY); - pr_warn_once("unsupported nla_type %d\n", src->nla_type); - return -EOPNOTSUPP; - } -@@ -434,7 +440,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, - int err; - - if (type > XFRMA_MAX) { -- BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); -+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_INIT_DELAY); - NL_SET_ERR_MSG(extack, "Bad attribute"); - return -EOPNOTSUPP; - } -diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c -index ad01997c3aa9..d4c88d29703e 100644 ---- a/net/xfrm/xfrm_user.c -+++ b/net/xfrm/xfrm_user.c -@@ -272,6 +272,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, - NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); - goto out; - } -+ if ((attrs[XFRMA_IPTFS_PKT_SIZE] || -+ attrs[XFRMA_IPTFS_MAX_QSIZE] || -+ attrs[XFRMA_IPTFS_DONT_FRAG] || -+ attrs[XFRMA_IPTFS_DROP_TIME] || -+ attrs[XFRMA_IPTFS_REORDER_WINDOW] || -+ attrs[XFRMA_IPTFS_INIT_DELAY]) && -+ p->mode != XFRM_MODE_IPTFS) { -+ NL_SET_ERR_MSG(extack, "IPTFS options can only be used in IPTFS mode"); -+ goto out; -+ } - break; - - case IPPROTO_COMP: -@@ -3046,6 +3056,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { - [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, - [XFRMA_IF_ID] = { .type = NLA_U32 }, - [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, -+ [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, -+ [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, -+ [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, -+ [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, -+ [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, -+ 
[XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, - }; - EXPORT_SYMBOL_GPL(xfrma_policy); - --- -2.43.2 - diff --git a/patches/v2/v2-0005-xfrm-netlink-add-config-netlink-options.patch b/patches/v2/v2-0005-xfrm-netlink-add-config-netlink-options.patch new file mode 100644 index 0000000..6eff4a9 --- /dev/null +++ b/patches/v2/v2-0005-xfrm-netlink-add-config-netlink-options.patch @@ -0,0 +1,159 @@ +From 463acd2b1d5eefd6cab011f39a1e6c68496be678 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Sun, 12 Nov 2023 06:02:21 -0500 +Subject: [PATCH ipsec-next v2 05/17] xfrm: netlink: add config (netlink) + options + +Add netlink options for configuring IP-TFS SAs. + +Signed-off-by: Christian Hopps +--- + include/uapi/linux/xfrm.h | 9 ++++++- + net/xfrm/xfrm_compat.c | 10 ++++++-- + net/xfrm/xfrm_user.c | 52 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 68 insertions(+), 3 deletions(-) + +diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h +index 18ceaba8486e..3bd1f810e079 100644 +--- a/include/uapi/linux/xfrm.h ++++ b/include/uapi/linux/xfrm.h +@@ -158,7 +158,8 @@ enum { + #define XFRM_MODE_ROUTEOPTIMIZATION 2 + #define XFRM_MODE_IN_TRIGGER 3 + #define XFRM_MODE_BEET 4 +-#define XFRM_MODE_MAX 5 ++#define XFRM_MODE_IPTFS 5 ++#define XFRM_MODE_MAX 6 + + /* Netlink configuration messages. */ + enum { +@@ -321,6 +322,12 @@ enum xfrm_attr_type_t { + XFRMA_IF_ID, /* __u32 */ + XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ + XFRMA_SA_DIR, /* __u8 */ ++ XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */ ++ XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size */ ++ XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */ ++ XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */ ++ XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size */ ++ XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */ + __XFRMA_MAX + + #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ +diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c +index 703d4172c7d7..a28b9f6503e5 100644 +--- a/net/xfrm/xfrm_compat.c ++++ b/net/xfrm/xfrm_compat.c +@@ -280,9 +280,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) + case XFRMA_IF_ID: + case XFRMA_MTIMER_THRESH: + case XFRMA_SA_DIR: ++ case XFRMA_IPTFS_PKT_SIZE: ++ case XFRMA_IPTFS_MAX_QSIZE: ++ case XFRMA_IPTFS_DONT_FRAG: ++ case XFRMA_IPTFS_DROP_TIME: ++ case XFRMA_IPTFS_REORDER_WINDOW: ++ case XFRMA_IPTFS_INIT_DELAY: + return xfrm_nla_cpy(dst, src, nla_len(src)); + default: +- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); ++ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_INIT_DELAY); + pr_warn_once("unsupported nla_type %d\n", src->nla_type); + return -EOPNOTSUPP; + } +@@ -437,7 +443,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, + int err; + + if (type > XFRMA_MAX) { +- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); ++ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_INIT_DELAY); + NL_SET_ERR_MSG(extack, "Bad attribute"); + return -EOPNOTSUPP; + } +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index e83c687bd64e..6537bd520363 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -297,6 +297,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); + goto out; + } ++ if ((attrs[XFRMA_IPTFS_DROP_TIME] || ++ attrs[XFRMA_IPTFS_REORDER_WINDOW] || ++ attrs[XFRMA_IPTFS_DONT_FRAG] || ++ attrs[XFRMA_IPTFS_INIT_DELAY] || ++ attrs[XFRMA_IPTFS_MAX_QSIZE] 
|| ++ attrs[XFRMA_IPTFS_PKT_SIZE]) && ++ p->mode != XFRM_MODE_IPTFS) { ++ NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode"); ++ goto out; ++ } + break; + + case IPPROTO_COMP: +@@ -417,6 +427,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + goto out; + } + ++ if (attrs[XFRMA_IPTFS_DROP_TIME]) { ++ NL_SET_ERR_MSG(extack, "Drop time should not be set for output SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) { ++ NL_SET_ERR_MSG(extack, "Reorder window should not be set for output SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ + if (attrs[XFRMA_REPLAY_VAL]) { + struct xfrm_replay_state *replay; + +@@ -454,6 +476,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + } + + } ++ ++ if (attrs[XFRMA_IPTFS_DONT_FRAG]) { ++ NL_SET_ERR_MSG(extack, "Don't fragment should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_INIT_DELAY]) { ++ NL_SET_ERR_MSG(extack, "Initial delay should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) { ++ NL_SET_ERR_MSG(extack, "Max queue size should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_PKT_SIZE]) { ++ NL_SET_ERR_MSG(extack, "Packet size should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } + } + + out: +@@ -3165,6 +3211,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { + [XFRMA_IF_ID] = { .type = NLA_U32 }, + [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), ++ [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, ++ [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, ++ [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, + }; + EXPORT_SYMBOL_GPL(xfrma_policy); + +-- +2.45.1 + diff --git a/patches/v2/v2-0006-iptfs-xfrm-Add-mode_cbs-module-functionality.patch b/patches/v2/v2-0006-xfrm-add-mode_cbs-module-functionality.patch similarity index 80% rename from patches/v2/v2-0006-iptfs-xfrm-Add-mode_cbs-module-functionality.patch rename to patches/v2/v2-0006-xfrm-add-mode_cbs-module-functionality.patch index c1ef349..dc37e84 100644 --- a/patches/v2/v2-0006-iptfs-xfrm-Add-mode_cbs-module-functionality.patch +++ b/patches/v2/v2-0006-xfrm-add-mode_cbs-module-functionality.patch @@ -1,8 +1,7 @@ -From e23c61a038e7bca3d8b23edfa696e977f04879b4 Mon Sep 17 00:00:00 2001 +From bbb79d6fa6900e22cbff9499f1d4ce78b907ba41 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 3 Aug 2023 15:15:00 +0000 -Subject: [PATCH ipsec-next v2 6/8] iptfs: xfrm: Add mode_cbs module - functionality +Subject: [PATCH ipsec-next v2 06/17] xfrm: add mode_cbs module functionality Add a set of callbacks xfrm_mode_cbs to xfrm_state. These callbacks enable the addition of new xfrm modes, such as IP-TFS, to be defined in modules.
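To make the callback indirection concrete: a mode module fills in an xfrm_mode_cbs and registers it against its XFRM_MODE_* number at module load. A minimal sketch follows, using only callbacks visible in this diff; the handler names are placeholders, and the registration function name is an assumption since the registration API itself is not shown in this excerpt:

    /* Hypothetical skeleton of a mode module plugging into xfrm_mode_cbs. */
    static const struct xfrm_mode_cbs iptfs_mode_cbs = {
            .clone          = iptfs_clone_state,    /* placeholder handlers */
            .copy_to_user   = iptfs_copy_to_user,
            .sa_len         = iptfs_sa_len,
            .get_inner_mtu  = iptfs_get_inner_mtu,
    };

    static int __init xfrm_iptfs_init(void)
    {
            /* assumed registration helper added alongside this struct */
            return xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs);
    }
    module_init(xfrm_iptfs_init);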
Signed-off-by: Christian Hopps --- - include/net/xfrm.h | 38 ++++++++++++++++++++++++++++++++++ + include/net/xfrm.h | 39 ++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_device.c | 3 ++- - net/xfrm/xfrm_input.c | 14 +++++++++++-- + net/xfrm/xfrm_input.c | 14 ++++++++++-- net/xfrm/xfrm_output.c | 2 ++ - net/xfrm/xfrm_policy.c | 18 +++++++++------- - net/xfrm/xfrm_state.c | 47 ++++++++++++++++++++++++++++++++++++++++++ - net/xfrm/xfrm_user.c | 10 +++++++++ - 7 files changed, 122 insertions(+), 10 deletions(-) + net/xfrm/xfrm_policy.c | 18 ++++++++++------ + net/xfrm/xfrm_state.c | 48 ++++++++++++++++++++++++++++++++++++++++++ + net/xfrm/xfrm_user.c | 13 ++++++++++++ + 7 files changed, 127 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h -index 1d107241b901..f1d5e99f0a47 100644 +index 7c9be06f8302..cb75ec2993bf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h -@@ -204,6 +204,7 @@ struct xfrm_state { +@@ -206,6 +206,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; @@ -31,17 +30,17 @@ index 1d107241b901..f1d5e99f0a47 100644 int trailer_len; u32 extra_flags; struct xfrm_mark smark; -@@ -289,6 +290,9 @@ struct xfrm_state { - /* Private data of this transformer, format is opaque, +@@ -292,6 +293,9 @@ struct xfrm_state { * interpreted by xfrm_type methods. */ void *data; + u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) -@@ -441,6 +445,40 @@ struct xfrm_type_offload { +@@ -444,6 +448,41 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); @@ -57,6 +56,7 @@ index 1d107241b901..f1d5e99f0a47 100644 + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + int (*clone)(struct xfrm_state *x, struct xfrm_state *orig); ++ unsigned int (*sa_len)(const struct xfrm_state *x); + + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + @@ -83,7 +83,7 @@ index 1d107241b901..f1d5e99f0a47 100644 { switch(family) { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c -index 3784534c9185..8b848540ea47 100644 +index 2455a76a1cff..f91b2bee8190 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, @@ -97,10 +97,10 @@ index 3784534c9185..8b848540ea47 100644 static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c -index bd4ce21d76d7..824f7b7f90e0 100644 +index 71b42de6e3c9..8ef1af2d39bf 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c -@@ -437,6 +437,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, +@@ -438,6 +438,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, WARN_ON_ONCE(1); break; default: @@ -110,7 +110,7 @@ index bd4ce21d76d7..824f7b7f90e0 100644 WARN_ON_ONCE(1); break; } -@@ -479,6 +482,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +@@ -485,6 +488,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = x->props.family; @@ -121,7 +121,7 @@ index bd4ce21d76d7..824f7b7f90e0 100644 /* An encap_type of -1 indicates async resumption. 
*/ if (encap_type == -1) { async = 1; -@@ -660,11 +667,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +@@ -672,11 +679,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; @@ -139,7 +139,7 @@ index bd4ce21d76d7..824f7b7f90e0 100644 decaps = 1; break; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c -index 662c83beb345..8f98e42d4252 100644 +index e5722c95b8bb..ef81359e4038 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) @@ -152,10 +152,10 @@ index 662c83beb345..8f98e42d4252 100644 break; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c -index 53b7ce4a4db0..f3cd8483d427 100644 +index 298b3a9eb48d..a3b50e8bc85a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c -@@ -2713,13 +2713,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, +@@ -2721,13 +2721,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; @@ -181,7 +181,7 @@ index 53b7ce4a4db0..f3cd8483d427 100644 xdst_prev = xdst; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c -index bda5327bf34d..2b58e35bea63 100644 +index 649bb739df0d..e9ea5c5dd183 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -513,6 +513,36 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) @@ -238,19 +238,20 @@ index bda5327bf34d..2b58e35bea63 100644 } return x; } -@@ -1745,6 +1778,11 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, - x->new_mapping = 0; +@@ -1747,6 +1780,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->new_mapping_sport = 0; + x->dir = orig->dir; ++ x->mode_cbs = orig->mode_cbs; + if (x->mode_cbs && x->mode_cbs->clone) { -+ if (!x->mode_cbs->clone(x, orig)) ++ if (x->mode_cbs->clone(x, orig)) + goto error; + } + return x; error: -@@ -2765,6 +2803,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +@@ -2786,6 +2825,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) case XFRM_MODE_TUNNEL: break; default: @@ -260,7 +261,7 @@ index bda5327bf34d..2b58e35bea63 100644 WARN_ON_ONCE(1); break; } -@@ -2850,6 +2891,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, +@@ -2871,6 +2913,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, goto error; } @@ -274,10 +275,10 @@ index bda5327bf34d..2b58e35bea63 100644 return err; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c -index d4c88d29703e..92d11f2306e7 100644 +index 6537bd520363..dfd52637abed 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c -@@ -779,6 +779,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, +@@ -914,6 +914,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } @@ -290,7 +291,7 @@ index d4c88d29703e..92d11f2306e7 100644 return x; error: -@@ -1192,6 +1198,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, +@@ -1327,6 +1333,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } @@ -298,9 +299,19 @@ index d4c88d29703e..92d11f2306e7 100644 + ret = x->mode_cbs->copy_to_user(x, skb); + if (ret) + goto out; - if (x->mapping_maxage) + if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); - out: + if (ret) +@@ -3526,6 +3536,9 @@ static inline unsigned int 
xfrm_sa_len(struct xfrm_state *x) + if (x->dir) + l += nla_total_size(sizeof(x->dir)); + ++ if (x->mode_cbs && x->mode_cbs->sa_len) ++ l += x->mode_cbs->sa_len(x); ++ + return l; + } + -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0007-iptfs-xfrm-add-generic-iptfs-defines-and-function.patch b/patches/v2/v2-0007-xfrm-add-generic-iptfs-defines-and-functionality.patch similarity index 77% rename from patches/v2/v2-0007-iptfs-xfrm-add-generic-iptfs-defines-and-function.patch rename to patches/v2/v2-0007-xfrm-add-generic-iptfs-defines-and-functionality.patch index b7056d5..c87d2e7 100644 --- a/patches/v2/v2-0007-iptfs-xfrm-add-generic-iptfs-defines-and-function.patch +++ b/patches/v2/v2-0007-xfrm-add-generic-iptfs-defines-and-functionality.patch @@ -1,7 +1,7 @@ -From 894fe2a5f9fe6b4c18d0a30f73c97011f7ccada3 Mon Sep 17 00:00:00 2001 +From 431a8a04ba62da7f671c3f930ac069320f46f70a Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Tue, 7 Nov 2023 13:23:27 -0500 -Subject: [PATCH ipsec-next v2 7/8] iptfs: xfrm: add generic iptfs defines and +Subject: [PATCH ipsec-next v2 07/17] xfrm: add generic iptfs defines and functionality Define `XFRM_MODE_IPTFS` and `IPSEC_MODE_IPTFS` constants, and add these to @@ -11,20 +11,20 @@ Signed-off-by: Christian Hopps --- include/net/xfrm.h | 1 + include/uapi/linux/ipsec.h | 3 ++- - include/uapi/linux/snmp.h | 2 ++ + include/uapi/linux/snmp.h | 3 +++ net/ipv4/esp4.c | 3 ++- net/ipv6/esp6.c | 3 ++- net/netfilter/nft_xfrm.c | 3 ++- net/xfrm/xfrm_device.c | 1 + net/xfrm/xfrm_output.c | 4 ++++ net/xfrm/xfrm_policy.c | 8 ++++++-- - net/xfrm/xfrm_proc.c | 2 ++ + net/xfrm/xfrm_proc.c | 3 +++ net/xfrm/xfrm_state.c | 12 ++++++++++++ - net/xfrm/xfrm_user.c | 3 +++ - 12 files changed, 39 insertions(+), 6 deletions(-) + net/xfrm/xfrm_user.c | 10 ++++++++++ + 12 files changed, 48 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h -index f1d5e99f0a47..1e2af0b27631 100644 +index cb75ec2993bf..0d8abf7bd32e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -37,6 +37,7 @@ @@ -50,23 +50,24 @@ index 50d8ee1791e2..696b790f4346 100644 enum { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h -index a0819c6a5988..06907abbdab9 100644 +index adf5fd78dd50..77eb078f06a6 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h -@@ -337,6 +337,8 @@ enum - LINUX_MIB_XFRMFWDHDRERROR, /* XfrmFwdHdrError*/ - LINUX_MIB_XFRMOUTSTATEINVALID, /* XfrmOutStateInvalid */ +@@ -339,6 +339,9 @@ enum LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ + LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */ + LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */ ++ LINUX_MIB_XFRMNOSKBERROR, /* XfrmNoSkbError */ + LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */ + LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */ __LINUX_MIB_XFRMMAX }; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c -index 4ccfc104f13a..7607698690f9 100644 +index 7d38ddd64115..2da9fd4efb70 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c -@@ -818,7 +818,8 @@ int esp_input_done2(struct sk_buff *skb, int err) +@@ -809,7 +809,8 @@ int esp_input_done2(struct sk_buff *skb, int err) } skb_pull_rcsum(skb, hlen); @@ -77,10 +78,10 @@ index 4ccfc104f13a..7607698690f9 100644 else skb_set_transport_header(skb, -ihl); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c -index 2cc1a45742d8..81bdaceead27 100644 +index 27df148530a6..0e50ff3eccdb 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c -@@ -867,7 +867,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) +@@ -858,7 
+858,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); @@ -91,7 +92,7 @@ index 2cc1a45742d8..81bdaceead27 100644 else skb_set_transport_header(skb, -hdr_len); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c -index 452f8587adda..291b029391cd 100644 +index 1c866757db55..620238c6ef4c 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) @@ -105,7 +106,7 @@ index 452f8587adda..291b029391cd 100644 static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c -index 8b848540ea47..a40f5e09829e 100644 +index f91b2bee8190..b73af6918028 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -69,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, @@ -117,7 +118,7 @@ index 8b848540ea47..a40f5e09829e 100644 if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c -index 8f98e42d4252..938b81c0a8d3 100644 +index ef81359e4038..b5025cf6136e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -677,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) @@ -132,7 +133,7 @@ index 8f98e42d4252..938b81c0a8d3 100644 /* non-Tunnel Mode */ if (!skb->encapsulation) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c -index f3cd8483d427..ed19e3d19296 100644 +index a3b50e8bc85a..dd58f84b4c13 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2474,6 +2474,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, @@ -143,7 +144,7 @@ index f3cd8483d427..ed19e3d19296 100644 tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; -@@ -3258,7 +3259,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, +@@ -3266,7 +3267,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && @@ -153,7 +154,7 @@ index f3cd8483d427..ed19e3d19296 100644 dst->flags |= DST_XFRM_TUNNEL; return dst; -@@ -4495,6 +4497,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm +@@ -4503,6 +4505,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: @@ -161,7 +162,7 @@ index f3cd8483d427..ed19e3d19296 100644 if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, -@@ -4537,7 +4540,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, +@@ -4545,7 +4548,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && @@ -172,20 +173,21 @@ index f3cd8483d427..ed19e3d19296 100644 /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c -index fee9b5cf37a7..d92b1b760749 100644 +index eeb984be03a7..e851b388995a 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c -@@ -41,6 +41,8 @@ static const struct snmp_mib xfrm_mib_list[] = { - SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), - SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), +@@ -43,6 +43,9 @@ static const struct snmp_mib xfrm_mib_list[] = 
{ SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), + SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), ++ SNMP_MIB_ITEM("XfrmNoSkbError", LINUX_MIB_XFRMNOSKBERROR), + SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), + SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), SNMP_MIB_SENTINEL }; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c -index 2b58e35bea63..eaa21456bbe2 100644 +index e9ea5c5dd183..dad36ff30510 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -465,6 +465,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { @@ -212,7 +214,7 @@ index 2b58e35bea63..eaa21456bbe2 100644 }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) -@@ -2089,6 +2099,7 @@ static int __xfrm6_state_sort_cmp(const void *p) +@@ -2111,6 +2121,7 @@ static int __xfrm6_state_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: @@ -220,7 +222,7 @@ index 2b58e35bea63..eaa21456bbe2 100644 return 4; } return 5; -@@ -2115,6 +2126,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p) +@@ -2137,6 +2148,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: @@ -229,18 +231,25 @@ index 2b58e35bea63..eaa21456bbe2 100644 } return 4; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c -index 92d11f2306e7..80617c0d040f 100644 +index dfd52637abed..177dc0b23002 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c -@@ -353,6 +353,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, - case XFRM_MODE_TUNNEL: +@@ -379,6 +379,14 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_BEET: -+ case XFRM_MODE_IPTFS: break; ++ case XFRM_MODE_IPTFS: ++ if (sa_dir == 0) { ++ NL_SET_ERR_MSG( ++ extack, ++ "IP-TFS mode requires in or out direction attribute"); ++ goto out; ++ } ++ break; default: -@@ -1830,6 +1831,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, + NL_SET_ERR_MSG(extack, "Unsupported mode"); +@@ -1973,6 +1981,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, return -EINVAL; } break; @@ -250,5 +259,5 @@ index 92d11f2306e7..80617c0d040f 100644 if (ut[i].family != prev_family) { NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); -- -2.43.2 +2.45.1 diff --git a/patches/v2/v2-0008-iptfs-impl-add-new-iptfs-xfrm-mode-impl.patch b/patches/v2/v2-0008-iptfs-impl-add-new-iptfs-xfrm-mode-impl.patch deleted file mode 100644 index e122e07..0000000 --- a/patches/v2/v2-0008-iptfs-impl-add-new-iptfs-xfrm-mode-impl.patch +++ /dev/null @@ -1,3026 +0,0 @@ -From 46c387bf464c5826e5830c4a35bfae87c13fe38c Mon Sep 17 00:00:00 2001 -From: Christian Hopps -Date: Sat, 29 Jul 2023 07:43:26 +0000 -Subject: [PATCH ipsec-next v2 8/8] iptfs: impl: add new iptfs xfrm mode impl - -Add a new xfrm mode implementing AggFrag/IP-TFS from RFC9347. - -This utilizes the new xfrm_mode_cbs to implement demand-driven IP-TFS -functionality. This functionality can be used to increase bandwidth -utilization through small packet aggregation, as well as help solve PMTU -issues through it's efficient use of fragmentation. 
- -Link: https://www.rfc-editor.org/rfc/rfc9347.txt - -Signed-off-by: Christian Hopps ---- - net/xfrm/Makefile | 1 + - net/xfrm/trace_iptfs.h | 218 ++++ - net/xfrm/xfrm_iptfs.c | 2759 ++++++++++++++++++++++++++++++++++++++++ - 3 files changed, 2978 insertions(+) - create mode 100644 net/xfrm/trace_iptfs.h - create mode 100644 net/xfrm/xfrm_iptfs.c - -diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile -index 547cec77ba03..cd6520d4d777 100644 ---- a/net/xfrm/Makefile -+++ b/net/xfrm/Makefile -@@ -20,5 +20,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o - obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o - obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o - obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o -+obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o - obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o - obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o -diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h -new file mode 100644 -index 000000000000..ae126eb2bf15 ---- /dev/null -+++ b/net/xfrm/trace_iptfs.h -@@ -0,0 +1,218 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* xfrm_trace_iptfs.h -+ * -+ * August 12 2023, Christian Hopps -+ * -+ * Copyright (c) 2023, LabN Consulting, L.L.C. -+ */ -+ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM iptfs -+ -+#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_IPTFS_H -+ -+#include -+#include -+#include -+#include -+ -+struct xfrm_iptfs_data; -+ -+TRACE_EVENT(iptfs_egress_recv, -+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff), -+ TP_ARGS(skb, xtfs, blkoff), -+ TP_STRUCT__entry(__field(struct sk_buff *, skb) -+ __field(void *, head) -+ __field(void *, head_pg_addr) -+ __field(void *, pg0addr) -+ __field(u32, skb_len) -+ __field(u32, data_len) -+ __field(u32, headroom) -+ __field(u32, tailroom) -+ __field(u32, tail) -+ __field(u32, end) -+ __field(u32, pg0off) -+ __field(u8, head_frag) -+ __field(u8, frag_list) -+ __field(u8, nr_frags) -+ __field(u16, blkoff)), -+ TP_fast_assign(__entry->skb = skb; -+ __entry->head = skb->head; -+ __entry->skb_len = skb->len; -+ __entry->data_len = skb->data_len; -+ __entry->headroom = skb_headroom(skb); -+ __entry->tailroom = skb_tailroom(skb); -+ __entry->tail = skb->tail; -+ __entry->end = skb->end; -+ __entry->head_frag = skb->head_frag; -+ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; -+ __entry->nr_frags = skb_shinfo(skb)->nr_frags; -+ __entry->blkoff = blkoff; -+ __entry->head_pg_addr = page_address(virt_to_head_page(skb->head)); -+ __entry->pg0addr = (__entry->nr_frags -+ ? page_address(skb_shinfo(skb)->frags[0].bv_page) -+ : NULL); -+ __entry->pg0off = (__entry->nr_frags -+ ? 
skb_shinfo(skb)->frags[0].bv_offset -+ : 0); -+ ), -+ TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", -+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, -+ __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, -+ __entry->tailroom, __entry->tail, __entry->end, __entry->head, -+ __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, -+ __entry->pg0off) -+ ) -+ -+DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, -+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, -+ u32 pmtu, u8 was_gso), -+ TP_ARGS(skb, xtfs, pmtu, was_gso), -+ TP_STRUCT__entry(__field(struct sk_buff *, skb) -+ __field(u32, skb_len) -+ __field(u32, data_len) -+ __field(u32, pmtu) -+ __field(u32, queue_size) -+ __field(u32, proto_seq) -+ __field(u8, proto) -+ __field(u8, was_gso) -+ ), -+ TP_fast_assign(__entry->skb = skb; -+ __entry->skb_len = skb->len; -+ __entry->data_len = skb->data_len; -+ __entry->queue_size = -+ xtfs->cfg.max_queue_size - xtfs->queue_size; -+ __entry->proto = __trace_ip_proto(ip_hdr(skb)); -+ __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); -+ __entry->pmtu = pmtu; -+ __entry->was_gso = was_gso; -+ ), -+ TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", -+ __entry->skb, __entry->skb_len, __entry->data_len, -+ __entry->queue_size, __entry->proto, __entry->proto_seq, -+ __entry->pmtu, __entry->was_gso)); -+ -+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, -+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), -+ TP_ARGS(skb, xtfs, pmtu, was_gso)); -+ -+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, -+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), -+ TP_ARGS(skb, xtfs, pmtu, was_gso)); -+ -+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, -+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), -+ TP_ARGS(skb, xtfs, pmtu, was_gso)); -+ -+DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, -+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), -+ TP_ARGS(skb, mtu, blkoff, iph), -+ TP_STRUCT__entry(__field(struct sk_buff *, skb) -+ __field(u32, skb_len) -+ __field(u32, data_len) -+ __field(u32, mtu) -+ __field(u32, proto_seq) -+ __field(u16, blkoff) -+ __field(u8, proto)), -+ TP_fast_assign(__entry->skb = skb; -+ __entry->skb_len = skb->len; -+ __entry->data_len = skb->data_len; -+ __entry->mtu = mtu; -+ __entry->blkoff = blkoff; -+ __entry->proto = iph ? __trace_ip_proto(iph) : 0; -+ __entry->proto_seq = iph ? 
__trace_ip_proto_seq(iph) : 0; -+ ), -+ TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u", -+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu, -+ __entry->blkoff, __entry->proto, __entry->proto_seq)); -+ -+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue, -+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, -+ struct iphdr *iph), -+ TP_ARGS(skb, mtu, blkoff, iph)); -+ -+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting, -+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, -+ struct iphdr *iph), -+ TP_ARGS(skb, mtu, blkoff, iph)); -+ -+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment, -+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, -+ struct iphdr *iph), -+ TP_ARGS(skb, mtu, blkoff, iph)); -+ -+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig, -+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, -+ struct iphdr *iph), -+ TP_ARGS(skb, mtu, blkoff, iph)); -+ -+TRACE_EVENT(iptfs_ingress_nth_peek, -+ TP_PROTO(struct sk_buff *skb, u32 remaining), -+ TP_ARGS(skb, remaining), -+ TP_STRUCT__entry(__field(struct sk_buff *, skb) -+ __field(u32, skb_len) -+ __field(u32, remaining)), -+ TP_fast_assign(__entry->skb = skb; -+ __entry->skb_len = skb->len; -+ __entry->remaining = remaining; -+ ), -+ TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u", -+ __entry->skb, __entry->skb_len, __entry->remaining)); -+ -+TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok), -+ TP_ARGS(skb, share_ok), -+ TP_STRUCT__entry(__field(struct sk_buff *, skb) -+ __field(u32, skb_len) -+ __field(u32, data_len) -+ __field(u8, share_ok) -+ __field(u8, head_frag) -+ __field(u8, pp_recycle) -+ __field(u8, cloned) -+ __field(u8, shared) -+ __field(u8, nr_frags) -+ __field(u8, frag_list) -+ ), -+ TP_fast_assign(__entry->skb = skb; -+ __entry->skb_len = skb->len; -+ __entry->data_len = skb->data_len; -+ __entry->share_ok = share_ok; -+ __entry->head_frag = skb->head_frag; -+ __entry->pp_recycle = skb->pp_recycle; -+ __entry->cloned = skb_cloned(skb); -+ __entry->shared = skb_shared(skb); -+ __entry->nr_frags = skb_shinfo(skb)->nr_frags; -+ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; -+ ), -+ TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u", -+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok, -+ __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared, -+ __entry->nr_frags, __entry->frag_list)); -+ -+DECLARE_EVENT_CLASS(iptfs_timer_event, -+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), -+ TP_ARGS(xtfs, time_val), -+ TP_STRUCT__entry(__field(u64, time_val) -+ __field(u64, set_time)), -+ TP_fast_assign(__entry->time_val = time_val; -+ __entry->set_time = xtfs->iptfs_settime; -+ ), -+ TP_printk("TIMER: set_time=%llu time_val=%llu", -+ __entry->set_time, __entry->time_val)); -+ -+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start, -+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), -+ TP_ARGS(xtfs, time_val)); -+ -+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire, -+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), -+ TP_ARGS(xtfs, time_val)); -+ -+#endif /* _TRACE_IPTFS_H */ -+ -+/* This part must be outside protection */ -+#undef TRACE_INCLUDE_PATH -+#define TRACE_INCLUDE_PATH ../../net/xfrm -+#undef TRACE_INCLUDE_FILE -+#define TRACE_INCLUDE_FILE trace_iptfs -+#include -diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c -new 
file mode 100644 -index 000000000000..5e16f1682ea2 ---- /dev/null -+++ b/net/xfrm/xfrm_iptfs.c -@@ -0,0 +1,2759 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* xfrm_iptfs: IPTFS encapsulation support -+ * -+ * April 21 2022, Christian Hopps -+ * -+ * Copyright (c) 2022, LabN Consulting, L.L.C. -+ * -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "xfrm_inout.h" -+#include "trace_iptfs.h" -+ -+/* IPTFS encap (header) values. */ -+#define IPTFS_SUBTYPE_BASIC 0 -+#define IPTFS_SUBTYPE_CC 1 -+ -+/* 1) skb->head should be cache aligned. -+ * 2) when resv is for L2 headers (i.e., ethernet) we want the cacheline to -+ * start -16 from data. -+ * 3) when resv is for L3+L2 headers IOW skb->data points at the IPTFS payload -+ * we want data to be cache line aligned so all the pushed headers will be in -+ * another cacheline. -+ */ -+#define XFRM_IPTFS_MIN_L3HEADROOM 128 -+#define XFRM_IPTFS_MIN_L2HEADROOM (64 + 16) -+#define IPTFS_FRAG_COPY_MAX 256 /* max for copying to create iptfs frags */ -+#define IPTFS_PKT_SHARE_MIN 129 /* min to try to share vs copy pkt data */ -+#define NSECS_IN_USEC 1000 -+ -+#define IPTFS_TYPE_NOCC 0 -+#define IPTFS_TYPE_CC 1 -+ -+#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT -+ -+struct skb_wseq { -+ struct sk_buff *skb; -+ u64 drop_time; -+}; -+ -+struct xfrm_iptfs_config { -+ bool dont_frag : 1; -+ u16 reorder_win_size; -+ u32 pkt_size; /* outer_packet_size or 0 */ -+ u32 max_queue_size; /* octets */ -+}; -+ -+struct xfrm_iptfs_data { -+ struct xfrm_iptfs_config cfg; -+ -+ /* Ingress User Input */ -+ struct xfrm_state *x; /* owning state */ -+ struct sk_buff_head queue; /* output queue */ -+ u32 queue_size; /* octets */ -+ u32 ecn_queue_size; /* octets above which ECN mark */ -+ u64 init_delay_ns; /* nanoseconds */ -+ struct hrtimer iptfs_timer; /* output timer */ -+ time64_t iptfs_settime; /* time timer was set */ -+ u32 payload_mtu; /* max payload size */ -+ -+ /* Tunnel input reordering */ -+ bool w_seq_set; /* true after first seq received */ -+ u64 w_wantseq; /* expected next sequence */ -+ struct skb_wseq *w_saved; /* the saved buf array */ -+ u32 w_savedlen; /* the saved len (not size) */ -+ spinlock_t drop_lock; -+ struct hrtimer drop_timer; -+ u64 drop_time_ns; -+ -+ /* Tunnel input reassembly */ -+ struct sk_buff *ra_newskb; /* new pkt being reassembled */ -+ u64 ra_wantseq; /* expected next sequence */ -+ u8 ra_runt[6]; /* last pkt bytes from last skb */ -+ u8 ra_runtlen; /* count of ra_runt */ -+}; -+ -+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); -+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); -+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); -+ -+/* ================= */ -+/* Utility Functions */ -+/* ================= */ -+ -+static u32 __trace_ip_proto(struct iphdr *iph) -+{ -+ if (iph->version == 4) -+ return iph->protocol; -+ return ((struct ipv6hdr *)iph)->nexthdr; -+} -+ -+static u32 __trace_ip_proto_seq(struct iphdr *iph) -+{ -+ void *nexthdr; -+ u32 protocol = 0; -+ -+ if (iph->version == 4) { -+ nexthdr = (void *)(iph + 1); -+ protocol = iph->protocol; -+ } else if (iph->version == 6) { -+ nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); -+ protocol = ((struct ipv6hdr *)(iph))->nexthdr; -+ } -+ switch (protocol) { -+ case IPPROTO_ICMP: -+ return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); -+ case IPPROTO_ICMPV6: -+ return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); -+ case IPPROTO_TCP: -+ return 
ntohl(((struct tcphdr *)nexthdr)->seq); -+ case IPPROTO_UDP: -+ return ntohs(((struct udphdr *)nexthdr)->source); -+ default: -+ return 0; -+ } -+} -+ -+static u64 __esp_seq(struct sk_buff *skb) -+{ -+ u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); -+ -+ return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; -+} -+ -+/* ================= */ -+/* SK_BUFF Functions */ -+/* ================= */ -+ -+/** -+ * iptfs_alloc_skb() - Allocate a new `skb` using a meta-data template. -+ * @tpl: the template to copy the new `skb`'s meta-data from. -+ * @len: the linear length of the head data, zero is fine. -+ * @l3resv: true if reserve needs to support pushing L3 headers -+ * -+ * A new `skb` is allocated and its meta-data is initialized from `tpl`; the -+ * head data is sized to `len` + reserved space set according to the @l3resv -+ * boolean. When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM, which -+ * arranges for `skb->data - 16` (etherhdr space) to be the start of a cacheline. -+ * Otherwise, @l3resv is true and resv is either the size of headroom from `tpl` or -+ * XFRM_IPTFS_MIN_L3HEADROOM, whichever is greater; this tries to align -+ * skb->data to a cacheline as all headers will be pushed on the previous -+ * cacheline bytes. -+ * -+ * When copying meta-data from the @tpl, the sk_buff->headers are not copied. -+ * -+ * Zero-length skbs are allocated when we only need a head skb to hold new -+ * packet headers (basically the mac header) that sit on top of existing shared -+ * packet data. -+ * -+ * Return: the new skb or NULL. -+ */ -+static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, -+ bool l3resv) -+{ -+ struct sk_buff *skb; -+ u32 resv; -+ -+ if (!l3resv) { -+ resv = XFRM_IPTFS_MIN_L2HEADROOM; -+ } else { -+ resv = skb_headroom(tpl); -+ if (resv < XFRM_IPTFS_MIN_L3HEADROOM) -+ resv = XFRM_IPTFS_MIN_L3HEADROOM; -+ } -+ -+ skb = alloc_skb(len + resv, GFP_ATOMIC); -+ if (!skb) { -+ XFRM_INC_STATS(dev_net(tpl->dev), LINUX_MIB_XFRMINERROR); -+ return NULL; -+ } -+ -+ skb_reserve(skb, resv); -+ -+ /* Code from __copy_skb_header() -- we do not want any of the -+ * tpl->headers copied over, so we aren't using `skb_copy_header()`. -+ */ -+ skb->tstamp = tpl->tstamp; -+ skb->dev = tpl->dev; -+ memcpy(skb->cb, tpl->cb, sizeof(skb->cb)); -+ skb_dst_copy(skb, tpl); -+ __skb_ext_copy(skb, tpl); -+ __nf_copy(skb, tpl, false); -+ -+ return skb; -+} -+ -+/** -+ * skb_head_to_frag() - initialize a skb_frag_t based on skb head data -+ * @skb: skb with the head data -+ * @frag: frag to initialize -+ */ -+static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) -+{ -+ struct page *page = virt_to_head_page(skb->data); -+ unsigned char *addr = (unsigned char *)page_address(page); -+ -+ BUG_ON(!skb->head_frag); -+ skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); -+} -+ -+/** -+ * struct skb_frag_walk - use to track a walk through fragments -+ * @fragi: current fragment index -+ * @past: length of data in fragments before @fragi -+ * @total: length of data in all fragments -+ * @nr_frags: number of fragments present in array -+ * @initial_offset: the value passed in to skb_prepare_frag_walk() -+ * @pp_recycle: copy of skb->pp_recycle -+ * @frags: the page fragments inc. room for head page -+ */ -+struct skb_frag_walk { -+ u32 fragi; -+ u32 past; -+ u32 total; -+ u32 nr_frags; -+ u32 initial_offset; -+ bool pp_recycle; -+ skb_frag_t frags[MAX_SKB_FRAGS + 1]; -+}; -+ -+/** -+ * skb_prepare_frag_walk() - initialize a frag walk over an skb. 
-+ * @skb: the skb to walk. -+ * @initial_offset: start the walk @initial_offset into the skb. -+ * @walk: the walk to initialize -+ * -+ * Future calls to skb_add_frags() will expect the @offset value to be at -+ * least @initial_offset large. -+ */ -+static void skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, -+ struct skb_frag_walk *walk) -+{ -+ struct skb_shared_info *shinfo = skb_shinfo(skb); -+ skb_frag_t *frag, *from; -+ u32 i; -+ -+ walk->initial_offset = initial_offset; -+ walk->fragi = 0; -+ walk->past = 0; -+ walk->total = 0; -+ walk->nr_frags = 0; -+ walk->pp_recycle = skb->pp_recycle; -+ -+ if (skb->head_frag) { -+ if (initial_offset >= skb_headlen(skb)) { -+ initial_offset -= skb_headlen(skb); -+ } else { -+ frag = &walk->frags[walk->nr_frags++]; -+ skb_head_to_frag(skb, frag); -+ frag->bv_offset += initial_offset; -+ frag->bv_len -= initial_offset; -+ walk->total += frag->bv_len; -+ initial_offset = 0; -+ } -+ } else { -+ BUG_ON(skb_headlen(skb) > initial_offset); -+ initial_offset -= skb_headlen(skb); -+ } -+ -+ for (i = 0; i < shinfo->nr_frags; i++) { -+ from = &shinfo->frags[i]; -+ if (initial_offset >= from->bv_len) { -+ initial_offset -= from->bv_len; -+ continue; -+ } -+ frag = &walk->frags[walk->nr_frags++]; -+ *frag = *from; -+ if (initial_offset) { -+ frag->bv_offset += initial_offset; -+ frag->bv_len -= initial_offset; -+ initial_offset = 0; -+ } -+ walk->total += frag->bv_len; -+ } -+ BUG_ON(initial_offset != 0); -+} -+ -+static u32 __skb_reset_frag_walk(struct skb_frag_walk *walk, u32 offset) -+{ -+ /* Adjust offset to refer to internal walk values */ -+ BUG_ON(offset < walk->initial_offset); -+ offset -= walk->initial_offset; -+ -+ /* Get to the correct fragment for offset */ -+ while (offset < walk->past) { -+ walk->past -= walk->frags[--walk->fragi].bv_len; -+ if (offset >= walk->past) -+ break; -+ BUG_ON(walk->fragi == 0); -+ } -+ while (offset >= walk->past + walk->frags[walk->fragi].bv_len) -+ walk->past += walk->frags[walk->fragi++].bv_len; -+ -+ /* offset now relative to this current frag */ -+ offset -= walk->past; -+ return offset; -+} -+ -+/** -+ * skb_can_add_frags() - check if ok to add frags from walk to skb -+ * @skb: skb to check for adding frags to -+ * @walk: the walk that will be used as source for frags. -+ * @offset: offset from beginning of original skb to start from. -+ * @len: amount of data to add frag references to in @skb. -+ */ -+static bool skb_can_add_frags(const struct sk_buff *skb, -+ struct skb_frag_walk *walk, u32 offset, u32 len) -+{ -+ struct skb_shared_info *shinfo = skb_shinfo(skb); -+ u32 fragi, nr_frags, fraglen; -+ -+ if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) -+ return false; -+ -+ /* Make offset relative to current frag after setting that */ -+ offset = __skb_reset_frag_walk(walk, offset); -+ -+ /* Verify we have array space for the fragments we need to add */ -+ fragi = walk->fragi; -+ nr_frags = shinfo->nr_frags; -+ while (len && fragi < walk->nr_frags) { -+ skb_frag_t *frag = &walk->frags[fragi]; -+ -+ fraglen = frag->bv_len; -+ if (offset) { -+ fraglen -= offset; -+ offset = 0; -+ } -+ if (++nr_frags > MAX_SKB_FRAGS) -+ return false; -+ if (len <= fraglen) -+ return true; -+ len -= fraglen; -+ fragi++; -+ } -+ /* We may not copy all @len but what we have will fit. */ -+ return true; -+} -+ -+/** -+ * skb_add_frags() - add a range of fragment references into an skb -+ * @skb: skb to add references into -+ * @walk: the walk to add referenced fragments from. 
-+ * @offset: offset from beginning of original skb to start from. -+ * @len: amount of data to add frag references to in @skb. -+ * -+ * skb_can_add_frags() should be called before this function to verify that the -+ * destination @skb is compatible with the walk and has space in the array for -+ * the to be added frag references. -+ * -+ * Return: The number of bytes not added to @skb b/c we reached the end of the -+ * walk before adding all of @len. -+ */ -+static int skb_add_frags(struct sk_buff *skb, struct skb_frag_walk *walk, -+ u32 offset, u32 len) -+{ -+ struct skb_shared_info *shinfo = skb_shinfo(skb); -+ u32 fraglen; -+ -+ BUG_ON(skb->pp_recycle != walk->pp_recycle); -+ if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) -+ return len; -+ -+ /* make offset relative to current frag after setting that */ -+ offset = __skb_reset_frag_walk(walk, offset); -+ BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); -+ -+ while (len && walk->fragi < walk->nr_frags) { -+ skb_frag_t *frag = &walk->frags[walk->fragi]; -+ skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; -+ -+ *tofrag = *frag; -+ if (offset) { -+ tofrag->bv_offset += offset; -+ tofrag->bv_len -= offset; -+ offset = 0; -+ } -+ __skb_frag_ref(tofrag); -+ shinfo->nr_frags++; -+ BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS); -+ -+ /* see if we are done */ -+ fraglen = tofrag->bv_len; -+ if (len < fraglen) { -+ tofrag->bv_len = len; -+ skb->len += len; -+ skb->data_len += len; -+ return 0; -+ } -+ /* advance to next source fragment */ -+ len -= fraglen; /* careful, use dst bv_len */ -+ skb->len += fraglen; /* careful, " " " */ -+ skb->data_len += fraglen; /* careful, " " " */ -+ walk->past += -+ frag->bv_len; /* careful, use src bv_len */ -+ walk->fragi++; -+ } -+ return len; -+} -+ -+/** -+ * skb_copy_bits_seq() - copy bits from a skb_seq_state to kernel buffer -+ * @st: source skb_seq_state -+ * @offset: offset in source -+ * @to: destination buffer -+ * @len: number of bytes to copy -+ * -+ * Copy @len bytes from @offset bytes into the source @st to the destination -+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent -+ * call to this function. If offset needs to decrease from the previous use, `st` -+ * should be reset first. -+ */ -+static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, int len) -+{ -+ const u8 *data; -+ u32 sqlen; -+ -+ for (;;) { -+ sqlen = skb_seq_read(offset, &data, st); -+ if (sqlen == 0) -+ return -ENOMEM; -+ if (sqlen >= len) { -+ memcpy(to, data, len); -+ return 0; -+ } -+ memcpy(to, data, sqlen); -+ to += sqlen; -+ offset += sqlen; -+ len -= sqlen; -+ } -+} -+ -+/* ================================== */ -+/* IPTFS Trace Event Definitions */ -+/* ================================== */ -+ -+#define CREATE_TRACE_POINTS -+#include "trace_iptfs.h" -+ -+/* ================================== */ -+/* IPTFS Receiving (egress) Functions */ -+/* ================================== */ -+ -+/** -+ * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. -+ * @tpl: template to create new skb from. -+ * @walk: The source for fragments to add. -+ * @off: The offset into @walk to add frags from, also used with @st and -+ * @copy_len. -+ * @len: The length of data to add covering frags from @walk into @skb. -+ * This must be <= @skblen. -+ * @st: The sequence state to copy from into the new head skb. -+ * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb -+ * linear space. -+ * -+ * Create a new sk_buff `skb` using the template @tpl. 
Copy @copy_len bytes from -+ * @st into the new skb linear space, and then add shared fragments from the -+ * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). -+ * -+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. -+ */ -+static struct sk_buff *iptfs_pskb_add_frags(struct sk_buff *tpl, -+ struct skb_frag_walk *walk, u32 off, -+ u32 len, struct skb_seq_state *st, -+ u32 copy_len) -+{ -+ struct sk_buff *skb; -+ -+ skb = iptfs_alloc_skb(tpl, copy_len, false); -+ if (!skb) -+ return NULL; -+ -+ /* this should not normally be happening */ -+ if (!skb_can_add_frags(skb, walk, off + copy_len, len - copy_len)) { -+ kfree_skb(skb); -+ return NULL; -+ } -+ -+ if (copy_len && -+ skb_copy_bits_seq(st, off, skb_put(skb, copy_len), copy_len)) { -+ XFRM_INC_STATS(dev_net(st->root_skb->dev), -+ LINUX_MIB_XFRMINERROR); -+ kfree_skb(skb); -+ return NULL; -+ } -+ -+ skb_add_frags(skb, walk, off + copy_len, len - copy_len); -+ return skb; -+} -+ -+/** -+ * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. -+ * @skblen: the total data size for `skb`. -+ * @st: The source for the rest of the data to copy into `skb`. -+ * @off: The offset into @st to copy data from. -+ * @len: The length of data to copy from @st into `skb`. This must be <= -+ * @skblen. -+ * -+ * Create a new sk_buff `skb` with @skblen of packet data space. If non-zero, -+ * copy @rlen bytes of @runt into `skb`. Then using seq functions copy @len -+ * bytes from @st into `skb` starting from @off. -+ * -+ * It is an error for @len to be greater than the amount of data left in @st. -+ * -+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. -+ */ -+static struct sk_buff * -+iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) -+{ -+ struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false); -+ -+ if (!skb) -+ return NULL; -+ if (skb_copy_bits_seq(st, off, skb_put(skb, len), len)) { -+ XFRM_INC_STATS(dev_net(st->root_skb->dev), -+ LINUX_MIB_XFRMINERROR); -+ kfree_skb(skb); -+ return NULL; -+ } -+ return skb; -+} -+ -+/** -+ * iptfs_input_save_runt() - save data in xtfs runt space. -+ * @xtfs: xtfs state -+ * @seq: the current sequence -+ * @buf: packet data -+ * @len: length of packet data -+ * -+ * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data -+ * runt space. -+ */ -+static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, -+ u8 *buf, int len) -+{ -+ BUG_ON(xtfs->ra_newskb); /* we won't have a new SKB yet */ -+ -+ memcpy(xtfs->ra_runt, buf, len); -+ -+ xtfs->ra_runtlen = len; -+ xtfs->ra_wantseq = seq + 1; -+} -+ -+/** -+ * __iptfs_iphlen() - return the v4/v6 header length using packet data. -+ * @data: pointer at octet with version nibble -+ * -+ * The version data is expected to be valid (i.e., either 4 or 6). -+ */ -+static u32 __iptfs_iphlen(u8 *data) -+{ -+ struct iphdr *iph = (struct iphdr *)data; -+ -+ if (iph->version == 0x4) -+ return sizeof(*iph); -+ BUG_ON(iph->version != 0x6); -+ return sizeof(struct ipv6hdr); -+} -+ -+/** -+ * __iptfs_iplen() - return the v4/v6 length using packet data. -+ * @data: pointer to ip (v4/v6) packet header -+ * -+ * Grab the IPv4 or IPv6 length value in the start of the inner packet header -+ * pointed to by `data`. Assumes data len is enough for the length field only. -+ * -+ * The version data is expected to be valid (i.e., either 4 or 6). 
-+ */
-+static u32 __iptfs_iplen(u8 *data)
-+{
-+ struct iphdr *iph = (struct iphdr *)data;
-+
-+ if (iph->version == 0x4)
-+ return ntohs(iph->tot_len);
-+ BUG_ON(iph->version != 0x6);
-+ return ntohs(((struct ipv6hdr *)iph)->payload_len) +
-+ sizeof(struct ipv6hdr);
-+}
-+
-+/**
-+ * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv.
-+ * @x: xfrm state
-+ * @skb: the inner packet
-+ *
-+ * Finish the standard xfrm processing on the inner packet prior to sending back
-+ * through gro_cells_receive. We do this separately b/c we are building a list
-+ * of packets in the hopes that one day a list will be taken by
-+ * xfrm_input.
-+ */
-+static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
-+{
-+ skb_reset_network_header(skb);
-+
-+ /* The packet is going back through gro_cells_receive, no need to
-+ * set this.
-+ */
-+ skb_reset_transport_header(skb);
-+
-+ /* Packet already has checksum value set. */
-+ skb->ip_summed = CHECKSUM_NONE;
-+
-+ /* Our skb will contain the header data copied from the outer packet
-+ * which contained the start of this inner packet. This is true
-+ * when we allocate a new skb as well as when we reuse the existing skb.
-+ */
-+ if (ip_hdr(skb)->version == 0x4) {
-+ struct iphdr *iph = ip_hdr(skb);
-+
-+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-+ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
-+ if (!(x->props.flags & XFRM_STATE_NOECN))
-+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
-+ IP_ECN_set_ce(iph);
-+
-+ skb->protocol = htons(ETH_P_IP);
-+ } else {
-+ struct ipv6hdr *iph = ipv6_hdr(skb);
-+
-+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-+ ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
-+ if (!(x->props.flags & XFRM_STATE_NOECN))
-+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
-+ IP6_ECN_set_ce(skb, iph);
-+
-+ skb->protocol = htons(ETH_P_IPV6);
-+ }
-+}
-+
-+static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
-+{
-+ assert_spin_locked(&xtfs->drop_lock);
-+
-+ /* We don't care if it works; locking takes care of things */
-+ hrtimer_try_to_cancel(&xtfs->drop_timer);
-+ if (free)
-+ kfree_skb(xtfs->ra_newskb);
-+ xtfs->ra_newskb = NULL;
-+}
-+
-+/**
-+ * iptfs_reassem_abort() - In-progress packet is aborted, free the state.
-+ * @xtfs: xtfs state
-+ */
-+static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
-+{
-+ __iptfs_reassem_done(xtfs, true);
-+}
-+
-+/**
-+ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
-+ * @xtfs: xtfs state
-+ */
-+static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
-+{
-+ __iptfs_reassem_done(xtfs, false);
-+}
-+
-+/**
-+ * iptfs_reassem_cont() - Continue the reassembly of an inner packet.
-+ * @xtfs: xtfs state
-+ * @seq: sequence of current packet
-+ * @st: seq read state for current packet
-+ * @skb: current packet
-+ * @data: offset into sequential packet data
-+ * @blkoff: packet blkoff value
-+ * @list: list of skbs to enqueue completed packet on
-+ *
-+ * Process an IPTFS payload that has a non-zero `blkoff`, or one received while
-+ * we are expecting a continuation b/c we have a runt or in-progress packet.
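-+ *
-+ * A worked example (sizes are hypothetical): suppose the previous
-+ * payload carried only the first 100 bytes of a 1400-byte inner
-+ * packet. Then 1300 bytes of that packet remain, and this payload is
-+ * expected with seq == ra_wantseq and, if a new data block follows the
-+ * continuation, blkoff == 1300:
-+ *
-+ *   [ IPTFS hdr | 1300 continuation bytes | next inner packet ... ]
-+ *                 ^-- blkoff counts these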
-+ */ -+static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, -+ struct skb_seq_state *st, struct sk_buff *skb, -+ u32 data, u32 blkoff, struct list_head *list) -+{ -+ struct skb_frag_walk _fragwalk; -+ struct skb_frag_walk *fragwalk = NULL; -+ struct sk_buff *newskb = xtfs->ra_newskb; -+ u32 remaining = skb->len - data; -+ u32 runtlen = xtfs->ra_runtlen; -+ u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem; -+ -+ /* Handle packet fragment we aren't expecting */ -+ if (!runtlen && !xtfs->ra_newskb) -+ return data + min(blkoff, remaining); -+ -+ /* Important to remember that input to this function is an ordered -+ * packet stream (unless the user disabled the reorder window). Thus if -+ * we are waiting for, and expecting the next packet so we can continue -+ * assembly, a newer sequence number indicates older ones are not coming -+ * (or if they do should be ignored). Technically we can receive older -+ * ones when the reorder window is disabled; however, the user should -+ * have disabled fragmentation in this case, and regardless we don't -+ * deal with it. -+ * -+ * blkoff could be zero if the stream is messed up (or it's an all pad -+ * insertion) be careful to handle that case in each of the below -+ */ -+ -+ /* Too old case: This can happen when the reorder window is disabled so -+ * ordering isn't actually guaranteed. -+ */ -+ if (seq < xtfs->ra_wantseq) -+ return data + remaining; -+ -+ /* Too new case: We missed what we wanted cleanup. */ -+ if (seq > xtfs->ra_wantseq) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ -+ if (blkoff == 0) { -+ if ((*skb->data & 0xF0) != 0) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ /* Handle all pad case, advance expected sequence number. -+ * (RFC 9347 S2.2.3) -+ */ -+ xtfs->ra_wantseq++; -+ /* will end parsing */ -+ return data + remaining; -+ } -+ -+ if (runtlen) { -+ BUG_ON(xtfs->ra_newskb); -+ -+ /* Regardless of what happens we're done with the runt */ -+ xtfs->ra_runtlen = 0; -+ -+ /* The start of this inner packet was at the very end of the last -+ * iptfs payload which didn't include enough for the ip header -+ * length field. We must have *at least* that now. -+ */ -+ rrem = sizeof(xtfs->ra_runt) - runtlen; -+ if (remaining < rrem || blkoff < rrem) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ -+ /* fill in the runt data */ -+ if (skb_copy_bits_seq(st, data, &xtfs->ra_runt[runtlen], -+ rrem)) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINBUFFERERROR); -+ goto abandon; -+ } -+ -+ /* We have enough data to get the ip length value now, -+ * allocate an in progress skb -+ */ -+ ipremain = __iptfs_iplen(xtfs->ra_runt); -+ if (ipremain < sizeof(xtfs->ra_runt)) { -+ /* length has to be at least runtsize large */ -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ -+ /* For the runt case we don't attempt sharing currently. NOTE: -+ * Currently, this IPTFS implementation will not create runts. -+ */ -+ -+ newskb = iptfs_alloc_skb(skb, ipremain, false); -+ if (!newskb) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINERROR); -+ goto abandon; -+ } -+ xtfs->ra_newskb = newskb; -+ -+ /* Copy the runt data into the buffer, but leave data -+ * pointers the same as normal non-runt case. The extra `rrem` -+ * recopied bytes are basically cacheline free. Allows using -+ * same logic below to complete. 
-+ */ -+ memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, -+ sizeof(xtfs->ra_runt)); -+ } -+ -+ /* Continue reassembling the packet */ -+ ipremain = __iptfs_iplen(newskb->data); -+ iphlen = __iptfs_iphlen(newskb->data); -+ -+ /* Sanity check, we created the newskb knowing the IP length so the IP -+ * length can't now be shorter. -+ */ -+ BUG_ON(newskb->len > ipremain); -+ -+ ipremain -= newskb->len; -+ if (blkoff < ipremain) { -+ /* Corrupt data, we don't have enough to complete the packet */ -+ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ -+ /* We want the IP header in linear space */ -+ if (newskb->len < iphlen) { -+ iphremain = iphlen - newskb->len; -+ if (blkoff < iphremain) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINIPTFSERROR); -+ goto abandon; -+ } -+ fraglen = min(blkoff, remaining); -+ copylen = min(fraglen, iphremain); -+ BUG_ON(skb_tailroom(newskb) < copylen); -+ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINBUFFERERROR); -+ goto abandon; -+ } -+ /* this is a silly condition that might occur anyway */ -+ if (copylen < iphremain) { -+ xtfs->ra_wantseq++; -+ return data + fraglen; -+ } -+ /* update data and things derived from it */ -+ data += copylen; -+ blkoff -= copylen; -+ remaining -= copylen; -+ ipremain -= copylen; -+ } -+ -+ fraglen = min(blkoff, remaining); -+ copylen = min(fraglen, ipremain); -+ -+ /* If we may have the opportunity to share prepare a fragwalk. */ -+ if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) && -+ (skb->head_frag || skb->len == skb->data_len) && -+ skb->pp_recycle == newskb->pp_recycle) { -+ fragwalk = &_fragwalk; -+ skb_prepare_frag_walk(skb, data, fragwalk); -+ } -+ -+ /* Try share then copy. */ -+ if (fragwalk && skb_can_add_frags(newskb, fragwalk, data, copylen)) { -+ u32 leftover; -+ -+ leftover = skb_add_frags(newskb, fragwalk, data, copylen); -+ BUG_ON(leftover != 0); -+ } else { -+ /* We verified this was true in the main receive routine */ -+ BUG_ON(skb_tailroom(newskb) < copylen); -+ -+ /* copy fragment data into newskb */ -+ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMINBUFFERERROR); -+ goto abandon; -+ } -+ } -+ -+ if (copylen < ipremain) { -+ xtfs->ra_wantseq++; -+ } else { -+ /* We are done with packet reassembly! */ -+ BUG_ON(copylen != ipremain); -+ iptfs_reassem_done(xtfs); -+ iptfs_complete_inner_skb(xtfs->x, newskb); -+ list_add_tail(&newskb->list, list); -+ } -+ -+ /* will continue on to new data block or end */ -+ return data + fraglen; -+ -+abandon: -+ if (xtfs->ra_newskb) { -+ iptfs_reassem_abort(xtfs); -+ } else { -+ xtfs->ra_runtlen = 0; -+ xtfs->ra_wantseq = 0; -+ } -+ /* skip past fragment, maybe to end */ -+ return data + min(blkoff, remaining); -+} -+ -+/** -+ * iptfs_input_ordered() - handle next in order IPTFS payload. -+ * @x: xfrm state -+ * @skb: current packet -+ * -+ * Process the IPTFS payload in `skb` and consume it afterwards. 
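-+ *
-+ * For illustration, a basic (non-CC) AGGFRAG payload generally has the
-+ * form:
-+ *
-+ *   [ IPTFS hdr | tail frag of prev inner pkt | inner pkt(s) | pad ]
-+ *     block_offset --^ (octets up to the first new data block)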
-+ */ -+static int iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ u8 hbytes[sizeof(struct ipv6hdr)]; -+ struct ip_iptfs_cc_hdr iptcch; -+ struct skb_seq_state skbseq; -+ struct skb_frag_walk _fragwalk; -+ struct skb_frag_walk *fragwalk = NULL; -+ struct list_head sublist; /* rename this it's just a list */ -+ struct sk_buff *first_skb, *defer, *next; -+ const unsigned char *old_mac; -+ struct xfrm_iptfs_data *xtfs; -+ struct ip_iptfs_hdr *ipth; -+ struct iphdr *iph; -+ struct net *net; -+ u32 remaining, first_iplen, iplen, iphlen, data, tail; -+ u32 blkoff, capturelen; -+ u64 seq; -+ -+ xtfs = x->mode_data; -+ net = dev_net(skb->dev); -+ first_skb = NULL; -+ defer = NULL; -+ -+ seq = __esp_seq(skb); -+ -+ /* Large enough to hold both types of header */ -+ ipth = (struct ip_iptfs_hdr *)&iptcch; -+ -+ /* Save the old mac header if set */ -+ old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL; -+ -+ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); -+ -+ /* Get the IPTFS header and validate it */ -+ -+ if (skb_copy_bits_seq(&skbseq, 0, ipth, sizeof(*ipth))) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); -+ goto done; -+ } -+ data = sizeof(*ipth); -+ -+ trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset)); -+ -+ /* Set data past the basic header */ -+ if (ipth->subtype == IPTFS_SUBTYPE_CC) { -+ /* Copy the rest of the CC header */ -+ remaining = sizeof(iptcch) - sizeof(*ipth); -+ if (skb_copy_bits_seq(&skbseq, data, ipth + 1, remaining)) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); -+ goto done; -+ } -+ data += remaining; -+ } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); -+ goto done; -+ } -+ -+ if (ipth->flags != 0) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); -+ goto done; -+ } -+ -+ INIT_LIST_HEAD(&sublist); -+ -+ /* Handle fragment at start of payload, and/or waiting reassembly. */ -+ -+ blkoff = ntohs(ipth->block_offset); -+ /* check before locking i.e., maybe */ -+ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { -+ spin_lock(&xtfs->drop_lock); -+ -+ /* check again after lock */ -+ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { -+ data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, -+ blkoff, &sublist); -+ } -+ -+ spin_unlock(&xtfs->drop_lock); -+ } -+ -+ /* New packets */ -+ -+ tail = skb->len; -+ BUG_ON(xtfs->ra_newskb && data < tail); -+ -+ while (data < tail) { -+ __be16 protocol = 0; -+ -+ /* Gather information on the next data block. -+ * `data` points to the start of the data block. 
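-+ *
-+ * Peeking at most 6 bytes suffices to learn the inner packet length:
-+ * IPv4 carries tot_len in octets 2-3 of its header, while IPv6
-+ * carries payload_len in octets 4-5.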
-+ */ -+ remaining = tail - data; -+ -+ /* try and copy enough bytes to read length from ipv4/ipv6 */ -+ iphlen = min_t(u32, remaining, 6); -+ if (skb_copy_bits_seq(&skbseq, data, hbytes, iphlen)) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); -+ goto done; -+ } -+ -+ iph = (struct iphdr *)hbytes; -+ if (iph->version == 0x4) { -+ /* must have at least tot_len field present */ -+ if (remaining < 4) { -+ /* save the bytes we have, advance data and exit */ -+ iptfs_input_save_runt(xtfs, seq, hbytes, -+ remaining); -+ data += remaining; -+ break; -+ } -+ -+ iplen = be16_to_cpu(iph->tot_len); -+ iphlen = iph->ihl << 2; -+ protocol = cpu_to_be16(ETH_P_IP); -+ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; -+ } else if (iph->version == 0x6) { -+ /* must have at least payload_len field present */ -+ if (remaining < 6) { -+ /* save the bytes we have, advance data and exit */ -+ iptfs_input_save_runt(xtfs, seq, hbytes, -+ remaining); -+ data += remaining; -+ break; -+ } -+ -+ iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); -+ iplen += sizeof(struct ipv6hdr); -+ iphlen = sizeof(struct ipv6hdr); -+ protocol = cpu_to_be16(ETH_P_IPV6); -+ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = -+ ipv6_get_dsfield((struct ipv6hdr *)iph); -+ } else if (iph->version == 0x0) { -+ /* pad */ -+ data = tail; -+ break; -+ } else { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); -+ goto done; -+ } -+ -+ if (unlikely(skbseq.stepped_offset)) { -+ /* We need to reset our seq read, it can't backup at -+ * this point. -+ */ -+ struct sk_buff *save = skbseq.root_skb; -+ -+ skb_abort_seq_read(&skbseq); -+ skb_prepare_seq_read(save, data, tail, &skbseq); -+ } -+ -+ if (first_skb) { -+ skb = NULL; -+ } else { -+ first_skb = skb; -+ first_iplen = iplen; -+ fragwalk = NULL; -+ -+ /* We are going to skip over `data` bytes to reach the -+ * start of the IP header of `iphlen` len for `iplen` -+ * inner packet. -+ */ -+ -+ if (skb_has_frag_list(skb)) { -+ defer = skb; -+ skb = NULL; -+ } else if (data + iphlen <= skb_headlen(skb) && -+ /* make sure our header is 32-bit aligned? */ -+ /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */ -+ skb_tailroom(skb) + tail - data >= iplen) { -+ /* Reuse the received skb. -+ * -+ * We have enough headlen to pull past any -+ * initial fragment data, leaving at least the -+ * IP header in the linear buffer space. -+ * -+ * For linear buffer space we only require that -+ * linear buffer space is large enough to -+ * eventually hold the entire reassembled -+ * packet (by including tailroom in the check). -+ * -+ * For non-linear tailroom is 0 and so we only -+ * re-use if the entire packet is present -+ * already. -+ * -+ * NOTE: there are many more options for -+ * sharing, KISS for now. Also, this can produce -+ * skb's with the IP header unaligned to 32 -+ * bits. If that ends up being a problem then a -+ * check should be added to the conditional -+ * above that the header lies on a 32-bit -+ * boundary as well. 
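-+ *
-+ * To summarize, the first inner packet takes one of three paths:
-+ * reuse this skb in place (this branch), walk and share its page
-+ * fragments (the head_frag branch below), or fall back to copying
-+ * into a freshly allocated skb.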
-+ */ -+ skb_pull(skb, data); -+ -+ /* our range just changed */ -+ data = 0; -+ tail = skb->len; -+ remaining = skb->len; -+ -+ skb->protocol = protocol; -+ skb_mac_header_rebuild(skb); -+ if (skb->mac_len) -+ eth_hdr(skb)->h_proto = skb->protocol; -+ -+ /* all pointers could be changed now reset walk */ -+ skb_abort_seq_read(&skbseq); -+ skb_prepare_seq_read(skb, data, tail, &skbseq); -+ } else if (skb->head_frag && -+ /* We have the IP header right now */ -+ remaining >= iphlen) { -+ fragwalk = &_fragwalk; -+ skb_prepare_frag_walk(skb, data, fragwalk); -+ defer = skb; -+ skb = NULL; -+ } else { -+ /* We couldn't reuse the input skb so allocate a -+ * new one. -+ */ -+ defer = skb; -+ skb = NULL; -+ } -+ -+ /* Don't trim `first_skb` until the end as we are -+ * walking that data now. -+ */ -+ } -+ -+ capturelen = min(iplen, remaining); -+ if (!skb) { -+ if (!fragwalk || -+ /* Large enough to be worth sharing */ -+ iplen < IPTFS_PKT_SHARE_MIN || -+ /* Have IP header + some data to share. */ -+ capturelen <= iphlen || -+ /* Try creating skb and adding frags */ -+ !(skb = iptfs_pskb_add_frags(first_skb, fragwalk, -+ data, capturelen, -+ &skbseq, iphlen))) { -+ skb = iptfs_pskb_extract_seq(iplen, &skbseq, -+ data, capturelen); -+ } -+ if (!skb) { -+ /* skip to next packet or done */ -+ data += capturelen; -+ continue; -+ } -+ BUG_ON(skb->len != capturelen); -+ -+ skb->protocol = protocol; -+ if (old_mac) { -+ /* rebuild the mac header */ -+ skb_set_mac_header(skb, -first_skb->mac_len); -+ memcpy(skb_mac_header(skb), old_mac, -+ first_skb->mac_len); -+ eth_hdr(skb)->h_proto = skb->protocol; -+ } -+ } -+ -+ data += capturelen; -+ -+ if (skb->len < iplen) { -+ BUG_ON(data != tail); -+ BUG_ON(xtfs->ra_newskb); -+ -+ /* Start reassembly */ -+ spin_lock(&xtfs->drop_lock); -+ -+ xtfs->ra_newskb = skb; -+ xtfs->ra_wantseq = seq + 1; -+ if (!hrtimer_is_queued(&xtfs->drop_timer)) { -+ /* softirq blocked lest the timer fire and interrupt us */ -+ BUG_ON(!in_interrupt()); -+ hrtimer_start(&xtfs->drop_timer, -+ xtfs->drop_time_ns, -+ IPTFS_HRTIMER_MODE); -+ } -+ -+ spin_unlock(&xtfs->drop_lock); -+ -+ break; -+ } -+ -+ iptfs_complete_inner_skb(x, skb); -+ list_add_tail(&skb->list, &sublist); -+ } -+ -+ if (data != tail) -+ /* this should not happen from the above code */ -+ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); -+ -+ if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) { -+ /* first_skb is queued b/c !defer and not partial */ -+ if (pskb_trim(first_skb, first_iplen)) { -+ /* error trimming */ -+ list_del(&first_skb->list); -+ defer = first_skb; -+ } -+ first_skb->ip_summed = CHECKSUM_NONE; -+ } -+ -+ /* Send the packets! */ -+ list_for_each_entry_safe(skb, next, &sublist, list) { -+ BUG_ON(skb == defer); -+ skb_list_del_init(skb); -+ if (xfrm_input(skb, 0, 0, -3)) -+ kfree_skb(skb); -+ } -+ -+done: -+ skb = skbseq.root_skb; -+ skb_abort_seq_read(&skbseq); -+ -+ if (defer) { -+ consume_skb(defer); -+ } else if (!first_skb) { -+ /* skb is the original passed in skb, but we didn't get far -+ * enough to process it as the first_skb, if we had it would -+ * either be save in ra_newskb, trimmed and sent on as an skb or -+ * placed in defer to be freed. 
-+ */ -+ BUG_ON(!skb); -+ kfree_skb(skb); -+ } -+ -+ return 0; -+} -+ -+/* ------------------------------- */ -+/* Input (Egress) Re-ordering Code */ -+/* ------------------------------- */ -+ -+static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift) -+{ -+ u32 savedlen = xtfs->w_savedlen; -+ -+ if (shift > savedlen) -+ shift = savedlen; -+ if (shift != savedlen) -+ memcpy(xtfs->w_saved, xtfs->w_saved + shift, -+ (savedlen - shift) * sizeof(*xtfs->w_saved)); -+ memset(xtfs->w_saved + savedlen - shift, 0, -+ shift * sizeof(*xtfs->w_saved)); -+ xtfs->w_savedlen -= shift; -+} -+ -+static int __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, -+ struct list_head *freelist, u32 *fcount) -+{ -+ list_add_tail(&inskb->list, freelist); -+ (*fcount)++; -+ return 0; -+} -+ -+static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list) -+ -+{ -+ struct skb_wseq *s, *se; -+ const u32 savedlen = xtfs->w_savedlen; -+ time64_t now = ktime_get_raw_fast_ns(); -+ u32 count = 0; -+ u32 scount = 0; -+ -+ BUG_ON(!savedlen); -+ if (xtfs->w_saved[0].drop_time > now) -+ goto set_timer; -+ -+ ++xtfs->w_wantseq; -+ -+ /* Keep flushing packets until we reach a drop time greater than now. */ -+ s = xtfs->w_saved; -+ se = s + savedlen; -+ do { -+ /* Walking past empty slots until we reach a packet */ -+ for (; s < se && !s->skb; s++) -+ if (s->drop_time > now) -+ goto outerdone; -+ /* Sending packets until we hit another empty slot. */ -+ for (; s < se && s->skb; scount++, s++) -+ list_add_tail(&s->skb->list, list); -+ } while (s < se); -+outerdone: -+ -+ count = s - xtfs->w_saved; -+ if (count) { -+ xtfs->w_wantseq += count; -+ -+ /* Shift handled slots plus final empty slot into slot 0. */ -+ __vec_shift(xtfs, count); -+ } -+ -+ if (xtfs->w_savedlen) { -+set_timer: -+ /* Drifting is OK */ -+ hrtimer_start(&xtfs->drop_timer, -+ xtfs->w_saved[0].drop_time - now, -+ IPTFS_HRTIMER_MODE); -+ } -+ return scount; -+} -+ -+static u32 __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, -+ struct list_head *list) -+{ -+ struct skb_wseq *s, *se; -+ const u32 savedlen = xtfs->w_savedlen; -+ u32 count = 0; -+ -+ /* Got what we wanted. */ -+ list_add_tail(&inskb->list, list); -+ ++xtfs->w_wantseq; -+ if (!savedlen) -+ return 1; -+ -+ /* Flush remaining consecutive packets. */ -+ -+ /* Keep sending until we hit another missed pkt. */ -+ for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) -+ list_add_tail(&s->skb->list, list); -+ count = s - xtfs->w_saved; -+ if (count) -+ xtfs->w_wantseq += count; -+ -+ /* Shift handled slots plus final empty slot into slot 0. */ -+ __vec_shift(xtfs, count + 1); -+ -+ return count + 1; -+} -+ -+/* Set the slot's drop time and all the empty slots below it until reaching a -+ * filled slot which will already be set. -+ */ -+static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) -+{ -+ const u32 savedlen = xtfs->w_savedlen; -+ struct skb_wseq *s = xtfs->w_saved; -+ time64_t drop_time; -+ -+ assert_spin_locked(&xtfs->drop_lock); -+ -+ if (savedlen > index + 1) { -+ /* we are below another, our drop time and the timer are already set */ -+ BUG_ON(xtfs->w_saved[index + 1].drop_time != -+ xtfs->w_saved[index].drop_time); -+ return; -+ } -+ /* we are the most future so get a new drop time. 
*/ -+ drop_time = ktime_get_raw_fast_ns(); -+ drop_time += xtfs->drop_time_ns; -+ -+ /* Walk back through the array setting drop times as we go */ -+ s[index].drop_time = drop_time; -+ while (index-- > 0 && !s[index].skb) -+ s[index].drop_time = drop_time; -+ -+ /* If we walked all the way back, schedule the drop timer if needed */ -+ if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer)) -+ hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns, -+ IPTFS_HRTIMER_MODE); -+} -+ -+static u32 __reorder_future_fits(struct xfrm_iptfs_data *xtfs, -+ struct sk_buff *inskb, -+ struct list_head *freelist, u32 *fcount) -+{ -+ const u32 nslots = xtfs->cfg.reorder_win_size + 1; -+ const u64 inseq = __esp_seq(inskb); -+ const u64 wantseq = xtfs->w_wantseq; -+ const u64 distance = inseq - wantseq; -+ const u32 savedlen = xtfs->w_savedlen; -+ const u32 index = distance - 1; -+ -+ BUG_ON(distance >= nslots); -+ -+ /* Handle future sequence number received which fits in the window. -+ * -+ * We know we don't have the seq we want so we won't be able to flush -+ * anything. -+ */ -+ -+ /* slot count is 4, saved size is 3 savedlen is 2 -+ * -+ * "window boundary" is based on the fixed window size -+ * distance is also slot number -+ * index is an array index (i.e., - 1 of slot) -+ * : : - implicit NULL after array len -+ * -+ * +--------- used length (savedlen == 2) -+ * | +----- array size (nslots - 1 == 3) -+ * | | + window boundary (nslots == 4) -+ * V V | V -+ * | -+ * 0 1 2 3 | slot number -+ * --- 0 1 2 | array index -+ * [-] [b] : :| array -+ * -+ * "2" "3" "4" *5*| seq numbers -+ * -+ * We receive seq number 5 -+ * distance == 3 [inseq(5) - w_wantseq(2)] -+ * index == 2 [distance(6) - 1] -+ */ -+ -+ if (xtfs->w_saved[index].skb) { -+ /* a dup of a future */ -+ list_add_tail(&inskb->list, freelist); -+ (*fcount)++; -+ return 0; -+ } -+ -+ xtfs->w_saved[index].skb = inskb; -+ xtfs->w_savedlen = max(savedlen, index + 1); -+ iptfs_set_window_drop_times(xtfs, index); -+ -+ return 0; -+} -+ -+static u32 __reorder_future_shifts(struct xfrm_iptfs_data *xtfs, -+ struct sk_buff *inskb, -+ struct list_head *list, -+ struct list_head *freelist, u32 *fcount) -+{ -+ const u32 nslots = xtfs->cfg.reorder_win_size + 1; -+ const u64 inseq = __esp_seq(inskb); -+ u32 savedlen = xtfs->w_savedlen; -+ u64 wantseq = xtfs->w_wantseq; -+ struct sk_buff *slot0 = NULL; -+ u64 last_drop_seq = xtfs->w_wantseq; -+ u64 distance, extra_drops, s0seq; -+ u32 count = 0; -+ struct skb_wseq *wnext; -+ u32 beyond, shifting, slot; -+ -+ BUG_ON(inseq <= wantseq); -+ distance = inseq - wantseq; -+ BUG_ON(distance <= nslots - 1); -+ beyond = distance - (nslots - 1); -+ -+ /* Handle future sequence number received. -+ * -+ * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1 -+ * b/c we are beyond the window boundary. -+ * -+ * We know we don't have the wantseq so that counts as a drop. -+ */ -+ -+ /* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the -+ * missing sequence number. -+ * -+ * the final slot at savedlen (index savedlen - 1) is always occupied. -+ * -+ * beyond is "beyond array size" not savedlen. 
-+ * -+ * +--------- array length (savedlen == 2) -+ * | +----- array size (nslots - 1 == 3) -+ * | | +- window boundary (nslots == 4) -+ * V V | V -+ * | -+ * 0 1 2 3 | slot number -+ * --- 0 1 2 | array index -+ * [b] [c] : :| array -+ * | -+ * "2" "3" "4" "5"|*6* seq numbers -+ * -+ * We receive seq number 6 -+ * distance == 4 [inseq(6) - w_wantseq(2)] -+ * newslot == distance -+ * index == 3 [distance(4) - 1] -+ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] -+ * shifting == 1 [min(savedlen(2), beyond(1)] -+ * slot0_skb == [b], and should match w_wantseq -+ * -+ * +--- window boundary (nslots == 4) -+ * 0 1 2 3 | 4 slot number -+ * --- 0 1 2 | 3 array index -+ * [b] : : : :| array -+ * "2" "3" "4" "5" *6* seq numbers -+ * -+ * We receive seq number 6 -+ * distance == 4 [inseq(6) - w_wantseq(2)] -+ * newslot == distance -+ * index == 3 [distance(4) - 1] -+ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] -+ * shifting == 1 [min(savedlen(1), beyond(1)] -+ * slot0_skb == [b] and should match w_wantseq -+ * -+ * +-- window boundary (nslots == 4) -+ * 0 1 2 3 | 4 5 6 slot number -+ * --- 0 1 2 | 3 4 5 array index -+ * [-] [c] : :| array -+ * "2" "3" "4" "5" "6" "7" *8* seq numbers -+ * -+ * savedlen = 2, beyond = 3 -+ * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-] -+ * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c] -+ * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1) -+ * -+ * We receive seq number 8 -+ * distance == 6 [inseq(8) - w_wantseq(2)] -+ * newslot == distance -+ * index == 5 [distance(6) - 1] -+ * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))] -+ * shifting == 2 [min(savedlen(2), beyond(3)] -+ * -+ * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true. -+ */ -+ -+ /* Now send any packets that are being shifted out of saved, and account -+ * for missing packets that are exiting the window as we shift it. -+ */ -+ -+ /* If savedlen > beyond we are shifting some, else all. */ -+ shifting = min(savedlen, beyond); -+ -+ /* slot0 is the buf that just shifted out and into slot0 */ -+ slot0 = NULL; -+ s0seq = wantseq; -+ last_drop_seq = s0seq; -+ wnext = xtfs->w_saved; -+ for (slot = 1; slot <= shifting; slot++, wnext++) { -+ /* handle what was in slot0 before we occupy it */ -+ if (!slot0) { -+ last_drop_seq = s0seq; -+ } else { -+ list_add_tail(&slot0->list, list); -+ count++; -+ } -+ s0seq++; -+ slot0 = wnext->skb; -+ wnext->skb = NULL; -+ } -+ -+ /* slot0 is now either NULL (in which case it's what we now are waiting -+ * for, or a buf in which case we need to handle it like we received it; -+ * however, we may be advancing past that buffer as well.. -+ */ -+ -+ /* Handle case where we need to shift more than we had saved, slot0 will -+ * be NULL iff savedlen is 0, otherwise slot0 will always be -+ * non-NULL b/c we shifted the final element, which is always set if -+ * there is any saved, into slot0. 
-+ */ -+ if (savedlen < beyond) { -+ extra_drops = beyond - savedlen; -+ if (savedlen == 0) { -+ BUG_ON(slot0); -+ s0seq += extra_drops; -+ last_drop_seq = s0seq - 1; -+ } else { -+ extra_drops--; /* we aren't dropping what's in slot0 */ -+ BUG_ON(!slot0); -+ list_add_tail(&slot0->list, list); -+ /* if extra_drops then we are going past this slot0 -+ * so we can safely advance last_drop_seq -+ */ -+ if (extra_drops) -+ last_drop_seq = s0seq + extra_drops; -+ s0seq += extra_drops + 1; -+ count++; -+ } -+ slot0 = NULL; -+ /* slot0 has had an empty slot pushed into it */ -+ } -+ (void)last_drop_seq; /* we want this for CC code */ -+ -+ /* Remove the entries */ -+ __vec_shift(xtfs, beyond); -+ -+ /* Advance want seq */ -+ xtfs->w_wantseq += beyond; -+ -+ /* Process drops here when implementing congestion control */ -+ -+ /* We've shifted. plug the packet in at the end. */ -+ xtfs->w_savedlen = nslots - 1; -+ xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb; -+ iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1); -+ -+ /* if we don't have a slot0 then we must wait for it */ -+ if (!slot0) -+ return count; -+ -+ /* If slot0, seq must match new want seq */ -+ BUG_ON(xtfs->w_wantseq != __esp_seq(slot0)); -+ -+ /* slot0 is valid, treat like we received expected. */ -+ count += __reorder_this(xtfs, slot0, list); -+ return count; -+} -+ -+/* Receive a new packet into the reorder window. Return a list of ordered -+ * packets from the window. -+ */ -+static u32 iptfs_input_reorder(struct xfrm_iptfs_data *xtfs, -+ struct sk_buff *inskb, struct list_head *list, -+ struct list_head *freelist, u32 *fcount) -+{ -+ const u32 nslots = xtfs->cfg.reorder_win_size + 1; -+ u64 inseq = __esp_seq(inskb); -+ u64 wantseq; -+ -+ assert_spin_locked(&xtfs->drop_lock); -+ -+ if (unlikely(!xtfs->w_seq_set)) { -+ xtfs->w_seq_set = true; -+ xtfs->w_wantseq = inseq; -+ } -+ wantseq = xtfs->w_wantseq; -+ -+ if (likely(inseq == wantseq)) -+ return __reorder_this(xtfs, inskb, list); -+ else if (inseq < wantseq) -+ return __reorder_past(xtfs, inskb, freelist, fcount); -+ else if ((inseq - wantseq) < nslots) -+ return __reorder_future_fits(xtfs, inskb, freelist, fcount); -+ else -+ return __reorder_future_shifts(xtfs, inskb, list, freelist, -+ fcount); -+} -+ -+/** -+ * iptfs_drop_timer() - Handle drop timer expiry. -+ * @me: the timer -+ * -+ * This is similar to our input function. -+ * -+ * The drop timer is set when we start an in progress reassembly, and also when -+ * we save a future packet in the window saved array. -+ * -+ * NOTE packets in the save window are always newer WRT drop times as -+ * they get further in the future. i.e. for: -+ * -+ * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`, -+ * then D(n-1) <= D(n). -+ * -+ * So, regardless of why the timer is firing we can always discard any inprogress -+ * fragment; either it's the reassembly timer, or slot 0 is going to be -+ * dropped as S0 must have the most recent drop time, and slot 0 holds the -+ * continuation fragment of the in progress packet. 
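-+ *
-+ * (The invariant holds because iptfs_set_window_drop_times() assigns
-+ * the newest drop time only to the new slot and any empty slots
-+ * directly below it, never disturbing an older occupied slot.)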
-+ */ -+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) -+{ -+ struct sk_buff *skb, *next; -+ struct list_head freelist, list; -+ struct xfrm_iptfs_data *xtfs; -+ struct xfrm_state *x; -+ u32 count; -+ -+ xtfs = container_of(me, typeof(*xtfs), drop_timer); -+ x = xtfs->x; -+ -+ spin_lock(&xtfs->drop_lock); -+ -+ INIT_LIST_HEAD(&list); -+ INIT_LIST_HEAD(&freelist); -+ -+ /* Drop any in progress packet */ -+ -+ if (xtfs->ra_newskb) { -+ kfree_skb(xtfs->ra_newskb); -+ xtfs->ra_newskb = NULL; -+ } -+ -+ /* Now drop as many packets as we should from the reordering window -+ * saved array -+ */ -+ count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0; -+ -+ spin_unlock(&xtfs->drop_lock); -+ -+ if (count) { -+ list_for_each_entry_safe(skb, next, &list, list) { -+ skb_list_del_init(skb); -+ (void)iptfs_input_ordered(x, skb); -+ } -+ } -+ return HRTIMER_NORESTART; -+} -+ -+/** -+ * iptfs_input() - handle receipt of iptfs payload -+ * @x: xfrm state -+ * @skb: the packet -+ * -+ * We have an IPTFS payload order it if needed, then process newly in order -+ * packets. -+ */ -+static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ struct list_head freelist, list; -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct sk_buff *next; -+ u32 count, fcount; -+ -+ /* Fast path for no reorder window. */ -+ if (xtfs->cfg.reorder_win_size == 0) { -+ iptfs_input_ordered(x, skb); -+ goto done; -+ } -+ -+ /* Fetch list of in-order packets from the reordering window as well as -+ * a list of buffers we need to now free. -+ */ -+ INIT_LIST_HEAD(&list); -+ INIT_LIST_HEAD(&freelist); -+ fcount = 0; -+ -+ spin_lock(&xtfs->drop_lock); -+ count = iptfs_input_reorder(xtfs, skb, &list, &freelist, &fcount); -+ spin_unlock(&xtfs->drop_lock); -+ -+ if (count) { -+ list_for_each_entry_safe(skb, next, &list, list) { -+ skb_list_del_init(skb); -+ (void)iptfs_input_ordered(x, skb); -+ } -+ } -+ -+ if (fcount) { -+ list_for_each_entry_safe(skb, next, &freelist, list) { -+ skb_list_del_init(skb); -+ kfree_skb(skb); -+ } -+ } -+done: -+ /* We always have dealt with the input SKB, either we are re-using it, -+ * or we have freed it. Return EINPROGRESS so that xfrm_input stops -+ * processing it. -+ */ -+ return -EINPROGRESS; -+} -+ -+/* ================================= */ -+/* IPTFS Sending (ingress) Functions */ -+/* ================================= */ -+ -+/* ------------------------- */ -+/* Enqueue to send functions */ -+/* ------------------------- */ -+ -+/** -+ * iptfs_enqueue() - enqueue packet if ok to send. -+ * @xtfs: xtfs state -+ * @skb: the packet -+ * -+ * Return: true if packet enqueued. 
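-+ *
-+ * (Example with hypothetical values: with cfg.max_queue_size of
-+ * 1,000,000 bytes, ecn_queue_size is 950,000 (95%); a packet pushing
-+ * the queue past 950,000 bytes is CE marked, one pushing it past
-+ * 1,000,000 is not enqueued.)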
-+ */ -+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb) -+{ -+ u64 newsz = xtfs->queue_size + skb->len; -+ struct iphdr *iph; -+ -+ assert_spin_locked(&xtfs->x->lock); -+ -+ if (newsz > xtfs->cfg.max_queue_size) -+ return false; -+ -+ /* Set ECN CE if we are above our ECN queue threshold */ -+ if (newsz > xtfs->ecn_queue_size) { -+ iph = ip_hdr(skb); -+ if (iph->version == 4) -+ IP_ECN_set_ce(iph); -+ else if (iph->version == 6) -+ IP6_ECN_set_ce(skb, ipv6_hdr(skb)); -+ } -+ -+ __skb_queue_tail(&xtfs->queue, skb); -+ xtfs->queue_size += skb->len; -+ return true; -+} -+ -+static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs, -+ struct sk_buff *skb) -+{ -+ struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); -+ u32 payload_mtu = xtfs->payload_mtu; -+ u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached); -+ -+ if (payload_mtu && payload_mtu < pmtu) -+ pmtu = payload_mtu; -+ -+ return pmtu; -+} -+ -+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu) -+{ -+ if (skb->len <= pmtu) -+ return 0; -+ -+ /* We only send ICMP too big if the user has configured us as -+ * dont-fragment. -+ */ -+ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR); -+ -+ if (sk) { -+ xfrm_local_error(skb, pmtu); -+ } else if (ip_hdr(skb)->version == 4) { -+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -+ htonl(pmtu)); -+ } else { -+ WARN_ON_ONCE(ip_hdr(skb)->version != 6); -+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu); -+ } -+ return 1; -+} -+ -+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload -+ * (i.e., aggregating or fragmenting as appropriate). -+ * This is set in dst->output for an SA. -+ */ -+static int iptfs_output_collect(struct net *net, struct sock *sk, -+ struct sk_buff *skb) -+{ -+ struct dst_entry *dst = skb_dst(skb); -+ struct xfrm_state *x = dst->xfrm; -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct sk_buff *segs, *nskb; -+ u32 pmtu = 0; -+ bool ok = true; -+ bool was_gso; -+ -+ /* We have hooked into dst_entry->output which means we have skipped the -+ * protocol specific netfilter (see xfrm4_output, xfrm6_output). -+ * when our timer runs we will end up calling xfrm_output directly on -+ * the encapsulated traffic. -+ * -+ * For both cases this is the NF_INET_POST_ROUTING hook which allows -+ * changing the skb->dst entry which then may not be xfrm based anymore -+ * in which case a REROUTED flag is set. and dst_output is called. -+ * -+ * For IPv6 we are also skipping fragmentation handling for local -+ * sockets, which may or may not be good depending on our tunnel DF -+ * setting. Normally with fragmentation supported we want to skip this -+ * fragmentation. -+ */ -+ -+ BUG_ON(!xtfs); -+ -+ if (xtfs->cfg.dont_frag) -+ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); -+ -+ /* Break apart GSO skbs. If the queue is nearing full then we want the -+ * accounting and queuing to be based on the individual packets not on the -+ * aggregate GSO buffer. -+ */ -+ was_gso = skb_is_gso(skb); -+ if (!was_gso) { -+ segs = skb; -+ } else { -+ segs = skb_gso_segment(skb, 0); -+ if (IS_ERR_OR_NULL(segs)) { -+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); -+ kfree_skb(skb); -+ return PTR_ERR(segs); -+ } -+ consume_skb(skb); -+ skb = NULL; -+ } -+ -+ /* We can be running on multiple cores and from the network softirq or -+ * from user context depending on where the packet is coming from. 
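-+ *
-+ * This is why the _bh variant of the state lock is taken below: the
-+ * lock must exclude softirq execution as well as process-context
-+ * callers, on this CPU and others.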
-+ */ -+ spin_lock_bh(&x->lock); -+ -+ skb_list_walk_safe(segs, skb, nskb) { -+ skb_mark_not_on_list(skb); -+ -+ /* Once we drop due to no queue space we continue to drop the -+ * rest of the packets from that GRO. -+ */ -+ if (!ok) { -+nospace: -+ trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); -+ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTNOQSPACE); -+ kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); -+ continue; -+ } -+ -+ /* If the user indicated no iptfs fragmenting check before -+ * enqueue. -+ */ -+ if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { -+ trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); -+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); -+ continue; -+ } -+ -+ /* Enqueue to send in tunnel */ -+ ok = iptfs_enqueue(xtfs, skb); -+ if (!ok) -+ goto nospace; -+ -+ trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); -+ } -+ -+ /* Start a delay timer if we don't have one yet */ -+ if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { -+ hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, -+ IPTFS_HRTIMER_MODE); -+ xtfs->iptfs_settime = ktime_get_raw_fast_ns(); -+ trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); -+ } -+ -+ spin_unlock_bh(&x->lock); -+ return 0; -+} -+ -+/* -------------------------- */ -+/* Dequeue and send functions */ -+/* -------------------------- */ -+ -+static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) -+{ -+ struct ip_iptfs_hdr *h; -+ size_t hsz = sizeof(*h); -+ -+ /* now reset values to be pointing at the rest of the packets */ -+ h = skb_push(skb, hsz); -+ memset(h, 0, hsz); -+ if (blkoff) -+ h->block_offset = htons(blkoff); -+ -+ /* network_header current points at the inner IP packet -+ * move it to the iptfs header -+ */ -+ skb->transport_header = skb->network_header; -+ skb->network_header -= hsz; -+ -+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; -+ -+ /* xfrm[46]_prepare_output sets skb->protocol here, but the resulting -+ * called ip[6]_output functions also set this value as appropriate so -+ * seems unnecessary -+ * -+ * skb->protocol = htons(ETH_P_IP) or htons(ETH_P_IPV6); -+ */ -+} -+ -+/** -+ * iptfs_copy_create_frag() - create an inner fragment skb. -+ * @st: The source packet data. -+ * @offset: offset in @st of the new fragment data. -+ * @copy_len: the amount of data to copy from @st. -+ * -+ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must -+ * not be greater than the max fragment size. -+ * -+ * Return: the new fragment skb or an ERR_PTR(). -+ */ -+static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, -+ u32 offset, u32 copy_len) -+{ -+ struct sk_buff *src = st->root_skb; -+ struct sk_buff *skb; -+ int err; -+ -+ skb = iptfs_alloc_skb(src, copy_len, true); -+ if (!skb) -+ return ERR_PTR(-ENOMEM); -+ -+ /* Now copy `copy_len` data from src */ -+ err = skb_copy_bits_seq(st, offset, skb_put(skb, copy_len), copy_len); -+ if (err) { -+ XFRM_INC_STATS(dev_net(src->dev), LINUX_MIB_XFRMOUTERROR); -+ kfree_skb(skb); -+ return ERR_PTR(err); -+ } -+ -+ return skb; -+} -+ -+/** -+ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. -+ * @skbp: the source packet skb (IN), skb holding the last fragment in -+ * the fragment stream (OUT). -+ * @xtfs: IPTFS SA state. -+ * @mtu: the max IPTFS fragment size. -+ * -+ * This function is responsible for fragmenting a larger inner packet into a -+ * sequence of IPTFS payload packets. 
The last fragment is returned rather than -+ * being sent so that the caller can append more inner packets (aggregation) if -+ * there is room. -+ */ -+static int iptfs_copy_create_frags(struct sk_buff **skbp, -+ struct xfrm_iptfs_data *xtfs, u32 mtu) -+{ -+ struct skb_seq_state skbseq; -+ struct list_head sublist; -+ struct sk_buff *skb = *skbp; -+ struct sk_buff *nskb = *skbp; -+ u32 copy_len, offset; -+ u32 to_copy = skb->len - mtu; -+ u32 blkoff = 0; -+ int err = 0; -+ -+ INIT_LIST_HEAD(&sublist); -+ -+ BUG_ON(skb->len <= mtu); -+ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); -+ -+ /* A trimmed `skb` will be sent as the first fragment, later. */ -+ offset = mtu; -+ to_copy = skb->len - offset; -+ while (to_copy) { -+ /* Send all but last fragment to allow agg. append */ -+ trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL); -+ list_add_tail(&nskb->list, &sublist); -+ -+ /* FUTURE: if the packet has an odd/non-aligning length we could -+ * send less data in the penultimate fragment so that the last -+ * fragment then ends on an aligned boundary. -+ */ -+ copy_len = to_copy <= mtu ? to_copy : mtu; -+ nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len); -+ if (IS_ERR(nskb)) { -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMOUTERROR); -+ skb_abort_seq_read(&skbseq); -+ err = PTR_ERR(nskb); -+ nskb = NULL; -+ break; -+ } -+ iptfs_output_prepare_skb(nskb, to_copy); -+ offset += copy_len; -+ to_copy -= copy_len; -+ blkoff = to_copy; -+ } -+ skb_abort_seq_read(&skbseq); -+ -+ /* return last fragment that will be unsent (or NULL) */ -+ *skbp = nskb; -+ if (nskb) -+ trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL); -+ -+ /* trim the original skb to MTU */ -+ if (!err) -+ err = pskb_trim(skb, mtu); -+ -+ if (err) { -+ /* Free all frags. Don't bother sending a partial packet we will -+ * never complete. -+ */ -+ kfree_skb(nskb); -+ list_for_each_entry_safe(skb, nskb, &sublist, list) { -+ skb_list_del_init(skb); -+ kfree_skb(skb); -+ } -+ return err; -+ } -+ -+ /* prepare the initial fragment with an iptfs header */ -+ iptfs_output_prepare_skb(skb, 0); -+ -+ /* Send all but last fragment, if we fail to send a fragment then free -+ * the rest -- no point in sending a packet that can't be reassembled. -+ */ -+ list_for_each_entry_safe(skb, nskb, &sublist, list) { -+ skb_list_del_init(skb); -+ if (!err) -+ err = xfrm_output(NULL, skb); -+ else -+ kfree_skb(skb); -+ } -+ if (err) -+ kfree_skb(*skbp); -+ return err; -+} -+ -+/** -+ * iptfs_first_should_copy() - determine if we should copy packet data. -+ * @first_skb: the first skb in the packet -+ * @mtu: the MTU. -+ * -+ * Determine if we should create subsequent skbs to hold the remaining data from -+ * a large inner packet by copying the packet data, or cloning the original skb -+ * and adjusting the offsets. -+ */ -+static bool iptfs_first_should_copy(struct sk_buff *first_skb, u32 mtu) -+{ -+ u32 frag_copy_max; -+ -+ /* If we have less than frag_copy_max for remaining packet we copy -+ * those tail bytes as it is more efficient. -+ */ -+ frag_copy_max = mtu <= IPTFS_FRAG_COPY_MAX ? 
mtu : IPTFS_FRAG_COPY_MAX;
-+ if ((int)first_skb->len - (int)mtu < (int)frag_copy_max)
-+ return true;
-+
-+ /* If we have a non-linear skb just use copy */
-+ if (skb_is_nonlinear(first_skb))
-+ return true;
-+
-+ /* So we have a simple linear skb, easy to clone and share */
-+ return false;
-+}
-+
-+/**
-+ * iptfs_first_skb() - handle the first dequeued inner packet for output
-+ * @skbp: the source packet skb (IN), skb holding the last fragment in
-+ * the fragment stream (OUT).
-+ * @xtfs: IPTFS SA state.
-+ * @mtu: the max IPTFS fragment size.
-+ *
-+ * This function is responsible for fragmenting a larger inner packet into a
-+ * sequence of IPTFS payload packets. If it needs to fragment into subsequent
-+ * skb's, it will either do so by copying or cloning.
-+ *
-+ * The last fragment is returned rather than being sent so that the caller can
-+ * append more inner packets (aggregation) if there is room.
-+ */
-+static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs,
-+ u32 mtu)
-+{
-+ struct sk_buff *skb = *skbp;
-+ int err;
-+
-+ /* Classic ESP skips the don't fragment ICMP error if DF is clear on
-+ * the inner packet or ignore_df is set. Otherwise it will send an ICMP
-+ * or local error if the inner packet won't fit its MTU.
-+ *
-+ * With IPTFS we do not care about the inner packet DF bit. If the
-+ * tunnel is configured to "don't fragment" we error back if things
-+ * don't fit in our max packet size. Otherwise we iptfs-fragment as
-+ * normal.
-+ */
-+
-+ /* The opportunity for HW offload has ended */
-+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
-+ err = skb_checksum_help(skb);
-+ if (err)
-+ return err;
-+ }
-+
-+ /* We've split these up before queuing */
-+ BUG_ON(skb_is_gso(skb));
-+
-+ trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb));
-+
-+ /* Simple case -- it fits. `mtu` accounted for all the overhead
-+ * including the basic IPTFS header.
-+ */
-+ if (skb->len <= mtu) {
-+ iptfs_output_prepare_skb(skb, 0);
-+ return 0;
-+ }
-+
-+ if (iptfs_first_should_copy(skb, mtu))
-+ return iptfs_copy_create_frags(skbp, xtfs, mtu);
-+
-+ /* For now we always copy; the clone path is future work */
-+ return iptfs_copy_create_frags(skbp, xtfs, mtu);
-+}
-+
-+static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp,
-+ struct sk_buff *child)
-+{
-+ u32 fllen = 0;
-+
-+ /* It might be possible to account for a frag list in addition to page
-+ * fragments if it's a valid state to be in. The page fragments' size
-+ * should be kept as data_len so only the frag_list size is removed;
-+ * this must be done above as well.
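-+ *
-+ * For example (hypothetical chain): appending a child skb that itself
-+ * carries a frag_list splices the grandchildren onto the parent's
-+ * chain through the returned nextp, while the child's len/data_len
-+ * shrink by the spliced amount.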
-+ */ -+ BUG_ON(skb_shinfo(child)->nr_frags); -+ *nextp = skb_shinfo(child)->frag_list; -+ while (*nextp) { -+ fllen += (*nextp)->len; -+ nextp = &(*nextp)->next; -+ } -+ skb_frag_list_init(child); -+ BUG_ON(fllen > child->data_len); -+ child->len -= fllen; -+ child->data_len -= fllen; -+ -+ return nextp; -+} -+ -+static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) -+{ -+ struct skb_shared_info *fromi = skb_shinfo(from); -+ struct skb_shared_info *toi = skb_shinfo(to); -+ unsigned int new_truesize; -+ -+ /* If we have data in a head page, grab it */ -+ if (!skb_headlen(from)) { -+ new_truesize = SKB_TRUESIZE(skb_end_offset(from)); -+ } else { -+ skb_head_to_frag(from, &toi->frags[toi->nr_frags]); -+ skb_frag_ref(to, toi->nr_frags++); -+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); -+ } -+ -+ /* Move any other page fragments rather than copy */ -+ memcpy(&toi->frags[toi->nr_frags], fromi->frags, -+ sizeof(fromi->frags[0]) * fromi->nr_frags); -+ toi->nr_frags += fromi->nr_frags; -+ fromi->nr_frags = 0; -+ from->data_len = 0; -+ from->len = 0; -+ to->truesize += from->truesize - new_truesize; -+ from->truesize = new_truesize; -+ -+ /* We are done with this SKB */ -+ consume_skb(from); -+} -+ -+static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) -+{ -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct sk_buff *skb, *skb2, **nextp; -+ struct skb_shared_info *shi, *shi2; -+ -+ /* For now we are just outputting packets as fast as we can, so if we -+ * are fragmenting we will do so until the last inner packet has been -+ * consumed. -+ * -+ * When we are fragmenting we need to output all outer packets that -+ * contain the fragments of a single inner packet, consecutively (ESP -+ * seq-wise). Since this output function is always running from a timer -+ * we do not need a lock to provide this guarantee. We will output our -+ * packets consecutively before the timer is allowed to run again on -+ * some other CPU. -+ */ -+ -+ /* NOTE: for the future, for timed packet sends, if our queue is not -+ * growing longer (i.e., we are keeping up) and a packet we are about to -+ * fragment will not fragment in then next outer packet, we might consider -+ * holding on to it to send whole in the next slot. The question then is -+ * does this introduce a continuous delay in the inner packet stream -+ * with certain packet rates and sizes? -+ */ -+ -+ /* and send them on their way */ -+ -+ while ((skb = __skb_dequeue(list))) { -+ u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); -+ bool share_ok = true; -+ int remaining; -+ -+ /* protocol comes to us cleared sometimes */ -+ skb->protocol = x->outer_mode.family == AF_INET ? -+ htons(ETH_P_IP) : -+ htons(ETH_P_IPV6); -+ -+ if (skb->len > mtu && xtfs->cfg.dont_frag) { -+ /* We handle this case before enqueueing so we are only -+ * here b/c MTU changed after we enqueued before we -+ * dequeued, just drop these. -+ */ -+ XFRM_INC_STATS(dev_net(skb->dev), -+ LINUX_MIB_XFRMOUTERROR); -+ -+ trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); -+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); -+ continue; -+ } -+ -+ /* iptfs_first_skb will free skb on error as well */ -+ if (iptfs_first_skb(&skb, xtfs, mtu)) -+ continue; -+ -+ /* The returned skb is the last IPTFS fragment, it has it's -+ * IPTFS header included and it's blkoff set just past the end -+ * fragment data if needed. The space remaining to send more -+ * inner packet data is `mtu` - (skb->len - sizeof iptfs -+ * header). 
This is b/c the `mtu` value has the basic IPTFS -+ * header len accounted for, and we added that header to the skb -+ * so it is a part of skb->len, thus we subtract it from the skb -+ * length. -+ */ -+ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); -+ -+ /* Re-home nested fragment lists. */ -+ shi = skb_shinfo(skb); -+ nextp = &shi->frag_list; -+ while (*nextp) { -+ if (skb_has_frag_list(*nextp)) -+ nextp = iptfs_rehome_fraglist(&(*nextp)->next, -+ *nextp); -+ else -+ nextp = &(*nextp)->next; -+ } -+ -+ if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) -+ share_ok = false; -+ -+ /* See if we have enough space to simply append. -+ * -+ * NOTE: Maybe do not append if we will be mis-aligned, -+ * SW-based endpoints will probably have to copy in this -+ * case. -+ */ -+ while ((skb2 = skb_peek(list))) { -+ trace_iptfs_ingress_nth_peek(skb2, remaining); -+ if (skb2->len > remaining) -+ break; -+ -+ __skb_unlink(skb2, list); -+ -+ /* The opportunity for HW offload has ended, if we -+ * don't have a cksum in the packet we need to add one -+ * before encap and transmit. -+ */ -+ if (skb2->ip_summed == CHECKSUM_PARTIAL) { -+ if (skb_checksum_help(skb2)) { -+ XFRM_INC_STATS(dev_net(skb_dst(skb2)->dev), -+ LINUX_MIB_XFRMOUTERROR); -+ kfree_skb(skb2); -+ continue; -+ } -+ } -+ -+ /* skb->pp_recycle is passed to __skb_flag_unref for all -+ * frag pages so we can only share pages with skb's who -+ * match ourselves. -+ */ -+ shi2 = skb_shinfo(skb2); -+ if (share_ok && -+ (shi2->frag_list || -+ (!skb2->head_frag && skb_headlen(skb)) || -+ skb->pp_recycle != skb2->pp_recycle || -+ skb_zcopy(skb2) || -+ (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS))) -+ share_ok = false; -+ -+ /* do acct so we can free skb2 in share case */ -+ skb->data_len += skb2->len; -+ skb->len += skb2->len; -+ remaining -= skb2->len; -+ -+ trace_iptfs_ingress_nth_add(skb2, share_ok); -+ -+ if (share_ok) { -+ iptfs_consume_frags(skb, skb2); -+ } else { -+ /* link on the frag_list */ -+ *nextp = skb2; -+ nextp = &skb2->next; -+ BUG_ON(*nextp); -+ if (skb_has_frag_list(skb2)) -+ nextp = iptfs_rehome_fraglist(nextp, -+ skb2); -+ skb->truesize += skb2->truesize; -+ } -+ } -+ -+ /* Consider fragmenting this skb2 that didn't fit. For demand -+ * driven variable sized IPTFS pkts, though this isn't buying -+ * a whole lot, especially if we are doing a copy which waiting -+ * to send in a new pkt would not. -+ */ -+ -+ xfrm_output(NULL, skb); -+ } -+} -+ -+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) -+{ -+ struct sk_buff_head list; -+ struct xfrm_iptfs_data *xtfs; -+ struct xfrm_state *x; -+ time64_t settime; -+ -+ xtfs = container_of(me, typeof(*xtfs), iptfs_timer); -+ x = xtfs->x; -+ -+ /* Process all the queued packets -+ * -+ * softirq execution order: timer > tasklet > hrtimer -+ * -+ * Network rx will have run before us giving one last chance to queue -+ * ingress packets for us to process and transmit. -+ */ -+ -+ spin_lock(&x->lock); -+ __skb_queue_head_init(&list); -+ skb_queue_splice_init(&xtfs->queue, &list); -+ xtfs->queue_size = 0; -+ settime = xtfs->iptfs_settime; -+ spin_unlock(&x->lock); -+ -+ /* After the above unlock, packets can begin queuing again, and the -+ * timer can be set again, from another CPU either in softirq or user -+ * context (not from this one since we are running at softirq level -+ * already). 
-+ */ -+ -+ trace_iptfs_timer_expire(xtfs, -+ (unsigned long long)(ktime_get_raw_fast_ns() - settime)); -+ -+ iptfs_output_queued(x, &list); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/** -+ * iptfs_encap_add_ipv4() - add outer encaps -+ * @x: xfrm state -+ * @skb: the packet -+ * -+ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the -+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set -+ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do -+ * anything with inner headers as they aren't pointing into a normal IP -+ * singleton inner packet. -+ */ -+static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ struct dst_entry *dst = skb_dst(skb); -+ struct iphdr *top_iph; -+ -+ skb_reset_inner_network_header(skb); -+ skb_reset_inner_transport_header(skb); -+ -+ skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len)); -+ skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); -+ skb->transport_header = skb->network_header + sizeof(*top_iph); -+ -+ top_iph = ip_hdr(skb); -+ top_iph->ihl = 5; -+ top_iph->version = 4; -+ top_iph->protocol = IPPROTO_AGGFRAG; -+ -+ /* As we have 0, fractional, 1 or N inner packets there's no obviously -+ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 -+ * 3.1. -+ */ -+ top_iph->tos = 0; -+ -+ top_iph->frag_off = htons(IP_DF); -+ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); -+ top_iph->saddr = x->props.saddr.a4; -+ top_iph->daddr = x->id.daddr.a4; -+ ip_select_ident(dev_net(dst->dev), skb, NULL); -+ -+ return 0; -+} -+ -+/** -+ * iptfs_encap_add_ipv6() - add outer encaps -+ * @x: xfrm state -+ * @skb: the packet -+ * -+ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the -+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set -+ * the flow label and TOS/DSCP bits. It also sets the protocol to a different -+ * value and doesn't do anything with inner headers as they aren't pointing into -+ * a normal IP singleton inner packet. -+ */ -+static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ struct dst_entry *dst = skb_dst(skb); -+ struct ipv6hdr *top_iph; -+ int dsfield; -+ -+ skb_reset_inner_network_header(skb); -+ skb_reset_inner_transport_header(skb); -+ -+ skb_set_network_header(skb, -+ -x->props.header_len + x->props.enc_hdr_len); -+ skb->mac_header = skb->network_header + -+ offsetof(struct ipv6hdr, nexthdr); -+ skb->transport_header = skb->network_header + sizeof(*top_iph); -+ -+ top_iph = ipv6_hdr(skb); -+ top_iph->version = 6; -+ top_iph->priority = 0; -+ memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); -+ top_iph->nexthdr = IPPROTO_AGGFRAG; -+ -+ /* As we have 0, fractional, 1 or N inner packets there's no obviously -+ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 -+ * 3.1. 
-+ */ -+ dsfield = 0; -+ ipv6_change_dsfield(top_iph, 0, dsfield); -+ -+ top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); -+ top_iph->saddr = *(struct in6_addr *)&x->props.saddr; -+ top_iph->daddr = *(struct in6_addr *)&x->id.daddr; -+ -+ return 0; -+} -+ -+/** -+ * iptfs_prepare_output() - prepare the skb for output -+ * @x: xfrm state -+ * @skb: the packet -+ * -+ * Return: Error value, if 0 then skb values should be as follows: -+ * - transport_header should point at ESP header -+ * - network_header should point at Outer IP header -+ * - mac_header should point at protocol/nexthdr of the outer IP -+ */ -+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ if (x->outer_mode.family == AF_INET) -+ return iptfs_encap_add_ipv4(x, skb); -+ if (x->outer_mode.family == AF_INET6) { -+#if IS_ENABLED(CONFIG_IPV6) -+ return iptfs_encap_add_ipv6(x, skb); -+#else -+ WARN_ON_ONCE(1); -+ return -EAFNOSUPPORT; -+#endif -+ } -+ WARN_ON_ONCE(1); -+ return -EOPNOTSUPP; -+} -+ -+/* ========================== */ -+/* State Management Functions */ -+/* ========================== */ -+ -+/** -+ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation. -+ * @x: xfrm state. -+ * @outer_mtu: the outer mtu -+ */ -+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) -+{ -+ struct crypto_aead *aead; -+ u32 blksize; -+ -+ aead = x->data; -+ blksize = ALIGN(crypto_aead_blocksize(aead), 4); -+ return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) & -+ ~(blksize - 1)) - 2; -+} -+ -+/** -+ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm. -+ * @x: xfrm state. -+ * @outer_mtu: Outer MTU for the encapsulated packet. -+ * -+ * Return: Correct MTU taking in to account the encap overhead. -+ */ -+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) -+{ -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ -+ /* If not dont-frag we have no MTU */ -+ if (!xtfs->cfg.dont_frag) -+ return x->outer_mode.family == AF_INET ? IP_MAX_MTU : -+ IP6_MAX_MTU; -+ return __iptfs_get_inner_mtu(x, outer_mtu); -+} -+ -+/** -+ * iptfs_user_init() - initialize the SA with IPTFS options from netlink. 
-+ * @net: the net data -+ * @x: xfrm state -+ * @attrs: netlink attributes -+ * @extack: extack return data -+ */ -+static int iptfs_user_init(struct net *net, struct xfrm_state *x, -+ struct nlattr **attrs, -+ struct netlink_ext_ack *extack) -+{ -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct xfrm_iptfs_config *xc; -+ -+ xc = &xtfs->cfg; -+ xc->reorder_win_size = net->xfrm.sysctl_iptfs_reorder_window; -+ xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize; -+ xtfs->init_delay_ns = net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC; -+ xtfs->drop_time_ns = net->xfrm.sysctl_iptfs_drop_time * NSECS_IN_USEC; -+ -+ if (attrs[XFRMA_IPTFS_DONT_FRAG]) -+ xc->dont_frag = true; -+ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) -+ xc->reorder_win_size = -+ nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); -+ /* saved array is for saving 1..N seq nums from wantseq */ -+ if (xc->reorder_win_size) -+ xtfs->w_saved = kcalloc(xc->reorder_win_size, -+ sizeof(*xtfs->w_saved), GFP_KERNEL); -+ if (attrs[XFRMA_IPTFS_PKT_SIZE]) { -+ xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); -+ if (!xc->pkt_size) { -+ xtfs->payload_mtu = 0; -+ } else if (xc->pkt_size > x->props.header_len) { -+ xtfs->payload_mtu = xc->pkt_size - x->props.header_len; -+ } else { -+ NL_SET_ERR_MSG(extack, -+ "Packet size must be 0 or greater than IPTFS/ESP header length"); -+ return -EINVAL; -+ } -+ } -+ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) -+ xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); -+ if (attrs[XFRMA_IPTFS_DROP_TIME]) -+ xtfs->drop_time_ns = nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * -+ NSECS_IN_USEC; -+ if (attrs[XFRMA_IPTFS_INIT_DELAY]) -+ xtfs->init_delay_ns = -+ nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * -+ NSECS_IN_USEC; -+ -+ xtfs->ecn_queue_size = (u64)xc->max_queue_size * 95 / 100; -+ -+ return 0; -+} -+ -+static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) -+{ -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct xfrm_iptfs_config *xc = &xtfs->cfg; -+ int ret; -+ -+ if (xc->dont_frag) { -+ ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); -+ if (ret) -+ return ret; -+ } -+ ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, xc->reorder_win_size); -+ if (ret) -+ return ret; -+ ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); -+ if (ret) -+ return ret; -+ ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size); -+ if (ret) -+ return ret; -+ ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, -+ do_div(xtfs->drop_time_ns, NSECS_IN_USEC)); -+ if (ret) -+ return ret; -+ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, -+ do_div(xtfs->init_delay_ns, NSECS_IN_USEC)); -+ return ret; -+} -+ -+static int __iptfs_init_state(struct xfrm_state *x, -+ struct xfrm_iptfs_data *xtfs) -+{ -+ __skb_queue_head_init(&xtfs->queue); -+ hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); -+ xtfs->iptfs_timer.function = iptfs_delay_timer; -+ -+ spin_lock_init(&xtfs->drop_lock); -+ hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); -+ xtfs->drop_timer.function = iptfs_drop_timer; -+ -+ /* Modify type (esp) adjustment values */ -+ -+ if (x->props.family == AF_INET) -+ x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr); -+ else if (x->props.family == AF_INET6) -+ x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); -+ x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); -+ -+ /* Always have a module reference if x->mode_data is set */ -+ if (!try_module_get(x->mode_cbs->owner)) -+ return -EINVAL; -+ -+ 
x->mode_data = xtfs; -+ xtfs->x = x; -+ -+ return 0; -+} -+ -+static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) -+{ -+ struct xfrm_iptfs_data *xtfs; -+ int err; -+ -+ xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); -+ if (!xtfs) -+ return -ENOMEM; -+ -+ xtfs->ra_newskb = NULL; -+ if (xtfs->cfg.reorder_win_size) { -+ xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, -+ sizeof(*xtfs->w_saved), GFP_KERNEL); -+ if (!xtfs->w_saved) { -+ kfree_sensitive(xtfs); -+ return -ENOMEM; -+ } -+ } -+ -+ err = __iptfs_init_state(x, xtfs); -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+static int iptfs_create_state(struct xfrm_state *x) -+{ -+ struct xfrm_iptfs_data *xtfs; -+ int err; -+ -+ xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); -+ if (!xtfs) -+ return -ENOMEM; -+ -+ err = __iptfs_init_state(x, xtfs); -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+static void iptfs_delete_state(struct xfrm_state *x) -+{ -+ struct xfrm_iptfs_data *xtfs = x->mode_data; -+ struct skb_wseq *s, *se; -+ -+ if (!xtfs) -+ return; -+ -+ spin_lock_bh(&xtfs->drop_lock); -+ hrtimer_cancel(&xtfs->iptfs_timer); -+ hrtimer_cancel(&xtfs->drop_timer); -+ spin_unlock_bh(&xtfs->drop_lock); -+ -+ if (xtfs->ra_newskb) -+ kfree_skb(xtfs->ra_newskb); -+ -+ for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) -+ if (s->skb) -+ kfree_skb(s->skb); -+ -+ kfree_sensitive(xtfs->w_saved); -+ kfree_sensitive(xtfs); -+ -+ module_put(x->mode_cbs->owner); -+} -+ -+static const struct xfrm_mode_cbs iptfs_mode_cbs = { -+ .owner = THIS_MODULE, -+ .create_state = iptfs_create_state, -+ .delete_state = iptfs_delete_state, -+ .user_init = iptfs_user_init, -+ .copy_to_user = iptfs_copy_to_user, -+ .clone = iptfs_clone, -+ .get_inner_mtu = iptfs_get_inner_mtu, -+ .input = iptfs_input, -+ .output = iptfs_output_collect, -+ .prepare_output = iptfs_prepare_output, -+}; -+ -+static int __init xfrm_iptfs_init(void) -+{ -+ int err; -+ -+ pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n"); -+ -+ err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs); -+ if (err < 0) -+ pr_info("%s: can't register IP-TFS\n", __func__); -+ -+ return err; -+} -+ -+static void __exit xfrm_iptfs_fini(void) -+{ -+ xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS); -+} -+ -+module_init(xfrm_iptfs_init); -+module_exit(xfrm_iptfs_fini); -+MODULE_LICENSE("GPL"); --- -2.43.2 - diff --git a/patches/v2/v2-0008-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch b/patches/v2/v2-0008-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch new file mode 100644 index 0000000..19f408d --- /dev/null +++ b/patches/v2/v2-0008-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch @@ -0,0 +1,268 @@ +From a9a68bc0279b39fc79809f1cc1856780c176bba1 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 11:57:48 -0400 +Subject: [PATCH ipsec-next v2 08/17] xfrm: iptfs: add new iptfs xfrm mode impl + +Add a new xfrm mode implementing AggFrag/IP-TFS from RFC9347. + +This utilizes the new xfrm_mode_cbs to implement demand-driven IP-TFS +functionality. This functionality can be used to increase bandwidth +utilization through small packet aggregation, as well as help solve PMTU +issues through its efficient use of fragmentation.
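+ +For example (illustrative numbers, not taken from this patchset): several +small inner packets, e.g. ten 100 octet payloads, can ride in a single +outer IPTFS packet instead of ten separate ESP packets, while a 4000 octet +inner packet can cross a 1400 octet path MTU as three AGGFRAG fragments +without resorting to inner IP fragmentation.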
+ + Link: https://www.rfc-editor.org/rfc/rfc9347.txt + +Multiple commits follow to build the functionality into xfrm_iptfs.c + +Signed-off-by: Christian Hopps +--- + net/xfrm/Makefile | 1 + + net/xfrm/xfrm_iptfs.c | 225 ++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 226 insertions(+) + create mode 100644 net/xfrm/xfrm_iptfs.c + +diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile +index 547cec77ba03..cd6520d4d777 100644 +--- a/net/xfrm/Makefile ++++ b/net/xfrm/Makefile +@@ -20,5 +20,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o + obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o + obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o + obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o ++obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o + obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o + obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +new file mode 100644 +index 000000000000..e7b5546e1f6a +--- /dev/null ++++ b/net/xfrm/xfrm_iptfs.c +@@ -0,0 +1,225 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* xfrm_iptfs: IPTFS encapsulation support ++ * ++ * April 21 2022, Christian Hopps ++ * ++ * Copyright (c) 2022, LabN Consulting, L.L.C. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "xfrm_inout.h" ++ ++struct xfrm_iptfs_config { ++ u32 pkt_size; /* outer_packet_size or 0 */ ++}; ++ ++struct xfrm_iptfs_data { ++ struct xfrm_iptfs_config cfg; ++ ++ /* Ingress User Input */ ++ struct xfrm_state *x; /* owning state */ ++ u32 payload_mtu; /* max payload size */ ++}; ++ ++/* ========================== */ ++/* State Management Functions */ ++/* ========================== */ ++ ++/** ++ * iptfs_get_inner_mtu() - return inner MTU with no fragmentation. ++ * @x: xfrm state. ++ * @outer_mtu: the outer mtu ++ */ ++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) ++{ ++ struct crypto_aead *aead; ++ u32 blksize; ++ ++ aead = x->data; ++ blksize = ALIGN(crypto_aead_blocksize(aead), 4); ++ return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) & ++ ~(blksize - 1)) - 2; ++} ++ ++/** ++ * iptfs_user_init() - initialize the SA with IPTFS options from netlink. 
++ * @net: the net data ++ * @x: xfrm state ++ * @attrs: netlink attributes ++ * @extack: extack return data ++ */ ++static int iptfs_user_init(struct net *net, struct xfrm_state *x, ++ struct nlattr **attrs, ++ struct netlink_ext_ack *extack) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc; ++ ++ xc = &xtfs->cfg; ++ ++ if (attrs[XFRMA_IPTFS_PKT_SIZE]) { ++ xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); ++ if (!xc->pkt_size) { ++ xtfs->payload_mtu = 0; ++ } else if (xc->pkt_size > x->props.header_len) { ++ xtfs->payload_mtu = xc->pkt_size - x->props.header_len; ++ } else { ++ NL_SET_ERR_MSG(extack, ++ "Packet size must be 0 or greater than IPTFS/ESP header length"); ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++static unsigned int iptfs_sa_len(const struct xfrm_state *x) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc = &xtfs->cfg; ++ unsigned int l = 0; ++ ++ l += nla_total_size(0); ++ l += nla_total_size(sizeof(u16)); ++ l += nla_total_size(sizeof(xc->pkt_size)); ++ l += nla_total_size(sizeof(u32)); ++ l += nla_total_size(sizeof(u32)); /* drop time usec */ ++ l += nla_total_size(sizeof(u32)); /* init delay usec */ ++ ++ return l; ++} ++ ++static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc = &xtfs->cfg; ++ int ret; ++ ++ ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); ++ if (ret) ++ return ret; ++ ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, 0); ++ if (ret) ++ return ret; ++ ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); ++ if (ret) ++ return ret; ++ ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, 0); ++ if (ret) ++ return ret; ++ ++ ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, 0); ++ if (ret) ++ return ret; ++ ++ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, 0); ++ ++ return ret; ++} ++ ++static int __iptfs_init_state(struct xfrm_state *x, ++ struct xfrm_iptfs_data *xtfs) ++{ ++ /* Modify type (esp) adjustment values */ ++ ++ if (x->props.family == AF_INET) ++ x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr); ++ else if (x->props.family == AF_INET6) ++ x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); ++ x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); ++ ++ /* Always have a module reference if x->mode_data is set */ ++ if (!try_module_get(x->mode_cbs->owner)) ++ return -EINVAL; ++ ++ x->mode_data = xtfs; ++ xtfs->x = x; ++ ++ return 0; ++} ++ ++static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) ++{ ++ struct xfrm_iptfs_data *xtfs; ++ int err; ++ ++ xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); ++ if (!xtfs) ++ return -ENOMEM; ++ ++ err = __iptfs_init_state(x, xtfs); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static int iptfs_create_state(struct xfrm_state *x) ++{ ++ struct xfrm_iptfs_data *xtfs; ++ int err; ++ ++ xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); ++ if (!xtfs) ++ return -ENOMEM; ++ ++ err = __iptfs_init_state(x, xtfs); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static void iptfs_delete_state(struct xfrm_state *x) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ ++ if (!xtfs) ++ return; ++ ++ kfree_sensitive(xtfs); ++ ++ module_put(x->mode_cbs->owner); ++} ++ ++static const struct xfrm_mode_cbs iptfs_mode_cbs = { ++ .owner = THIS_MODULE, ++ .create_state = iptfs_create_state, ++ .delete_state = iptfs_delete_state, ++ .user_init = iptfs_user_init, ++ 
.copy_to_user = iptfs_copy_to_user, ++ .sa_len = iptfs_sa_len, ++ .clone = iptfs_clone, ++ .get_inner_mtu = iptfs_get_inner_mtu, ++}; ++ ++static int __init xfrm_iptfs_init(void) ++{ ++ int err; ++ ++ pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n"); ++ ++ err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs); ++ if (err < 0) ++ pr_info("%s: can't register IP-TFS\n", __func__); ++ ++ return err; ++} ++ ++static void __exit xfrm_iptfs_fini(void) ++{ ++ xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS); ++} ++ ++module_init(xfrm_iptfs_init); ++module_exit(xfrm_iptfs_fini); ++MODULE_LICENSE("GPL"); +-- +2.45.1 + diff --git a/patches/v2/v2-0009-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch b/patches/v2/v2-0009-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch new file mode 100644 index 0000000..569eaba --- /dev/null +++ b/patches/v2/v2-0009-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch @@ -0,0 +1,622 @@ +From 0c0295727e9fced94770b10bdd9ce7a80495924c Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Thu, 4 Apr 2024 02:27:59 -0400 +Subject: [PATCH ipsec-next v2 09/17] xfrm: iptfs: add user packet (tunnel + ingress) handling + +Add tunnel packet output functionality. This code handles +the ingress to the tunnel. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 519 +++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 516 insertions(+), 3 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index e7b5546e1f6a..561483fc83f9 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -19,8 +19,13 @@ + + #include "xfrm_inout.h" + ++#define NSECS_IN_USEC 1000 ++ ++#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT ++ + struct xfrm_iptfs_config { + u32 pkt_size; /* outer_packet_size or 0 */ ++ u32 max_queue_size; /* octets */ + }; + + struct xfrm_iptfs_data { +@@ -28,9 +33,495 @@ struct xfrm_iptfs_data { + + /* Ingress User Input */ + struct xfrm_state *x; /* owning state */ ++ struct sk_buff_head queue; /* output queue */ ++ u32 queue_size; /* octets */ ++ u32 ecn_queue_size; /* octets above which ECN mark */ ++ u64 init_delay_ns; /* nanoseconds */ ++ struct hrtimer iptfs_timer; /* output timer */ ++ time64_t iptfs_settime; /* time timer was set */ + u32 payload_mtu; /* max payload size */ + }; + ++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); ++static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); ++ ++/* ================================= */ ++/* IPTFS Sending (ingress) Functions */ ++/* ================================= */ ++ ++/* ------------------------- */ ++/* Enqueue to send functions */ ++/* ------------------------- */ ++ ++/** ++ * iptfs_enqueue() - enqueue packet if ok to send. ++ * @xtfs: xtfs state ++ * @skb: the packet ++ * ++ * Return: true if packet enqueued.
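++ * ++ * For example (hypothetical configuration): ecn_queue_size is set at init ++ * to 95% of max_queue_size, so with a 1048576 octet queue limit CE marking ++ * begins once queued data would exceed 996147 octets, while enqueue fails ++ * only past 1048576 octets.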
++ */ ++static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb) ++{ ++ u64 newsz = xtfs->queue_size + skb->len; ++ struct iphdr *iph; ++ ++ assert_spin_locked(&xtfs->x->lock); ++ ++ if (newsz > xtfs->cfg.max_queue_size) ++ return false; ++ ++ /* Set ECN CE if we are above our ECN queue threshold */ ++ if (newsz > xtfs->ecn_queue_size) { ++ iph = ip_hdr(skb); ++ if (iph->version == 4) ++ IP_ECN_set_ce(iph); ++ else if (iph->version == 6) ++ IP6_ECN_set_ce(skb, ipv6_hdr(skb)); ++ } ++ ++ __skb_queue_tail(&xtfs->queue, skb); ++ xtfs->queue_size += skb->len; ++ return true; ++} ++ ++static int iptfs_get_cur_pmtu(struct xfrm_state *x, ++ struct xfrm_iptfs_data *xtfs, struct sk_buff *skb) ++{ ++ struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); ++ u32 payload_mtu = xtfs->payload_mtu; ++ u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached); ++ ++ if (payload_mtu && payload_mtu < pmtu) ++ pmtu = payload_mtu; ++ ++ return pmtu; ++} ++ ++static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu) ++{ ++ if (skb->len <= pmtu) ++ return 0; ++ ++ /* We only send ICMP too big if the user has configured us as ++ * dont-fragment. ++ */ ++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR); ++ ++ if (sk) { ++ xfrm_local_error(skb, pmtu); ++ } else if (ip_hdr(skb)->version == 4) { ++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, ++ htonl(pmtu)); ++ } else { ++ WARN_ON_ONCE(ip_hdr(skb)->version != 6); ++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu); ++ } ++ return 1; ++} ++ ++/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload ++ * (i.e., aggregating or fragmenting as appropriate). ++ * This is set in dst->output for an SA. ++ */ ++static int iptfs_output_collect(struct net *net, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct xfrm_state *x = dst->xfrm; ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct sk_buff *segs, *nskb; ++ u32 pmtu = 0; ++ bool ok = true; ++ bool was_gso; ++ ++ /* We have hooked into dst_entry->output which means we have skipped the ++ * protocol specific netfilter (see xfrm4_output, xfrm6_output). ++ * when our timer runs we will end up calling xfrm_output directly on ++ * the encapsulated traffic. ++ * ++ * For both cases this is the NF_INET_POST_ROUTING hook which allows ++ * changing the skb->dst entry which then may not be xfrm based anymore ++ * in which case a REROUTED flag is set. and dst_output is called. ++ * ++ * For IPv6 we are also skipping fragmentation handling for local ++ * sockets, which may or may not be good depending on our tunnel DF ++ * setting. Normally with fragmentation supported we want to skip this ++ * fragmentation. ++ */ ++ ++ BUG_ON(!xtfs); ++ ++ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ ++ /* Break apart GSO skbs. If the queue is nearing full then we want the ++ * accounting and queuing to be based on the individual packets not on the ++ * aggregate GSO buffer. ++ */ ++ was_gso = skb_is_gso(skb); ++ if (!was_gso) { ++ segs = skb; ++ } else { ++ segs = skb_gso_segment(skb, 0); ++ if (IS_ERR_OR_NULL(segs)) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb); ++ return PTR_ERR(segs); ++ } ++ consume_skb(skb); ++ skb = NULL; ++ } ++ ++ /* We can be running on multiple cores and from the network softirq or ++ * from user context depending on where the packet is coming from. 
++ */ ++ spin_lock_bh(&x->lock); ++ ++ skb_list_walk_safe(segs, skb, nskb) ++ { ++ skb_mark_not_on_list(skb); ++ ++ /* Once we drop due to no queue space we continue to drop the ++ * rest of the packets from that GRO. ++ */ ++ if (!ok) { ++nospace: ++ XFRM_INC_STATS(dev_net(skb->dev), ++ LINUX_MIB_XFRMOUTNOQSPACE); ++ kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); ++ continue; ++ } ++ ++ /* Fragmenting handled in following commits. */ ++ if (iptfs_is_too_big(sk, skb, pmtu)) { ++ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); ++ continue; ++ } ++ ++ /* Enqueue to send in tunnel */ ++ ok = iptfs_enqueue(xtfs, skb); ++ if (!ok) ++ goto nospace; ++ } ++ ++ /* Start a delay timer if we don't have one yet */ ++ if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { ++ hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, ++ IPTFS_HRTIMER_MODE); ++ xtfs->iptfs_settime = ktime_get_raw_fast_ns(); ++ } ++ ++ spin_unlock_bh(&x->lock); ++ return 0; ++} ++ ++/* -------------------------- */ ++/* Dequeue and send functions */ ++/* -------------------------- */ ++ ++static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) ++{ ++ struct ip_iptfs_hdr *h; ++ size_t hsz = sizeof(*h); ++ ++ /* now reset values to be pointing at the rest of the packets */ ++ h = skb_push(skb, hsz); ++ memset(h, 0, hsz); ++ if (blkoff) ++ h->block_offset = htons(blkoff); ++ ++ /* network_header current points at the inner IP packet ++ * move it to the iptfs header ++ */ ++ skb->transport_header = skb->network_header; ++ skb->network_header -= hsz; ++ ++ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; ++} ++ ++static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, ++ struct sk_buff *child) ++{ ++ u32 fllen = 0; ++ ++ /* It might be possible to account for a frag list in addition to page ++ * fragment if it's a valid state to be in. The page fragments size ++ * should be kept as data_len so only the frag_list size is removed, ++ * this must be done above as well. ++ */ ++ BUG_ON(skb_shinfo(child)->nr_frags); ++ *nextp = skb_shinfo(child)->frag_list; ++ while (*nextp) { ++ fllen += (*nextp)->len; ++ nextp = &(*nextp)->next; ++ } ++ skb_frag_list_init(child); ++ BUG_ON(fllen > child->data_len); ++ child->len -= fllen; ++ child->data_len -= fllen; ++ ++ return nextp; ++} ++ ++static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct sk_buff *skb, *skb2, **nextp; ++ struct skb_shared_info *shi; ++ ++ while ((skb = __skb_dequeue(list))) { ++ u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ int remaining; ++ ++ /* protocol comes to us cleared sometimes */ ++ skb->protocol = x->outer_mode.family == AF_INET ? ++ htons(ETH_P_IP) : ++ htons(ETH_P_IPV6); ++ ++ if (skb->len > mtu) { ++ /* We handle this case before enqueueing so we are only ++ * here b/c MTU changed after we enqueued before we ++ * dequeued, just drop these. ++ */ ++ XFRM_INC_STATS(dev_net(skb->dev), ++ LINUX_MIB_XFRMOUTERROR); ++ ++ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); ++ continue; ++ } ++ ++ /* If we don't have a cksum in the packet we need to add one ++ * before encapsulation. 
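++ * (CHECKSUM_PARTIAL means the checksum was deferred to hardware offload; ++ * the opportunity for that offload ends once we encapsulate the packet.)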
++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_help(skb)) { ++ XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), ++ LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb); ++ continue; ++ } ++ } ++ ++ /* Convert first inner packet into an outer IPTFS packet */ ++ iptfs_output_prepare_skb(skb, 0); ++ ++ /* The space remaining to send more inner packet data is `mtu` - ++ * (skb->len - sizeof iptfs header). This is b/c the `mtu` value ++ * has the basic IPTFS header len accounted for, and we added ++ * that header to the skb so it is a part of skb->len, thus we ++ * subtract it from the skb length. ++ */ ++ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); ++ ++ /* Re-home (un-nest) nested fragment lists. We need to do this ++ * b/c we will simply be appending any following aggregated ++ * inner packets to the frag list. ++ */ ++ shi = skb_shinfo(skb); ++ nextp = &shi->frag_list; ++ while (*nextp) { ++ if (skb_has_frag_list(*nextp)) ++ nextp = iptfs_rehome_fraglist(&(*nextp)->next, ++ *nextp); ++ else ++ nextp = &(*nextp)->next; ++ } ++ ++ /* See if we have enough space to simply append. ++ * ++ * NOTE: Maybe do not append if we will be mis-aligned, ++ * SW-based endpoints will probably have to copy in this ++ * case. ++ */ ++ while ((skb2 = skb_peek(list))) { ++ if (skb2->len > remaining) ++ break; ++ ++ __skb_unlink(skb2, list); ++ ++ /* If we don't have a cksum in the packet we need to add ++ * one before encapsulation. ++ */ ++ if (skb2->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_help(skb2)) { ++ XFRM_INC_STATS( ++ dev_net(skb_dst(skb2)->dev), ++ LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb2); ++ continue; ++ } ++ } ++ ++ /* Do accounting */ ++ skb->data_len += skb2->len; ++ skb->len += skb2->len; ++ remaining -= skb2->len; ++ ++ /* Append to the frag_list */ ++ *nextp = skb2; ++ nextp = &skb2->next; ++ BUG_ON(*nextp); ++ if (skb_has_frag_list(skb2)) ++ nextp = iptfs_rehome_fraglist(nextp, skb2); ++ skb->truesize += skb2->truesize; ++ } ++ ++ xfrm_output(NULL, skb); ++ } ++} ++ ++static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) ++{ ++ struct sk_buff_head list; ++ struct xfrm_iptfs_data *xtfs; ++ struct xfrm_state *x; ++ time64_t settime; ++ ++ xtfs = container_of(me, typeof(*xtfs), iptfs_timer); ++ x = xtfs->x; ++ ++ /* Process all the queued packets ++ * ++ * softirq execution order: timer > tasklet > hrtimer ++ * ++ * Network rx will have run before us giving one last chance to queue ++ * ingress packets for us to process and transmit. ++ */ ++ ++ spin_lock(&x->lock); ++ __skb_queue_head_init(&list); ++ skb_queue_splice_init(&xtfs->queue, &list); ++ xtfs->queue_size = 0; ++ settime = xtfs->iptfs_settime; ++ spin_unlock(&x->lock); ++ ++ /* After the above unlock, packets can begin queuing again, and the ++ * timer can be set again, from another CPU either in softirq or user ++ * context (not from this one since we are running at softirq level ++ * already). ++ */ ++ ++ iptfs_output_queued(x, &list); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/** ++ * iptfs_encap_add_ipv4() - add outer encaps ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the ++ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set ++ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do ++ * anything with inner headers as they aren't pointing into a normal IP ++ * singleton inner packet. 
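++ * ++ * The resulting outer header is a minimal 20 octet IPv4 header (ihl of 5) ++ * with protocol set to IPPROTO_AGGFRAG, DF set, and TOS zeroed as ++ * described in the body below.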
++ */ ++static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct iphdr *top_iph; ++ ++ skb_reset_inner_network_header(skb); ++ skb_reset_inner_transport_header(skb); ++ ++ skb_set_network_header(skb, ++ -(x->props.header_len - x->props.enc_hdr_len)); ++ skb->mac_header = ++ skb->network_header + offsetof(struct iphdr, protocol); ++ skb->transport_header = skb->network_header + sizeof(*top_iph); ++ ++ top_iph = ip_hdr(skb); ++ top_iph->ihl = 5; ++ top_iph->version = 4; ++ top_iph->protocol = IPPROTO_AGGFRAG; ++ ++ /* As we have 0, fractional, 1 or N inner packets there's no obviously ++ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 ++ * 3.1. ++ */ ++ top_iph->tos = 0; ++ ++ top_iph->frag_off = htons(IP_DF); ++ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); ++ top_iph->saddr = x->props.saddr.a4; ++ top_iph->daddr = x->id.daddr.a4; ++ ip_select_ident(dev_net(dst->dev), skb, NULL); ++ ++ return 0; ++} ++ ++/** ++ * iptfs_encap_add_ipv6() - add outer encaps ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the ++ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set ++ * the flow label and TOS/DSCP bits. It also sets the protocol to a different ++ * value and doesn't do anything with inner headers as they aren't pointing into ++ * a normal IP singleton inner packet. ++ */ ++static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct ipv6hdr *top_iph; ++ int dsfield; ++ ++ skb_reset_inner_network_header(skb); ++ skb_reset_inner_transport_header(skb); ++ ++ skb_set_network_header(skb, ++ -x->props.header_len + x->props.enc_hdr_len); ++ skb->mac_header = ++ skb->network_header + offsetof(struct ipv6hdr, nexthdr); ++ skb->transport_header = skb->network_header + sizeof(*top_iph); ++ ++ top_iph = ipv6_hdr(skb); ++ top_iph->version = 6; ++ top_iph->priority = 0; ++ memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); ++ top_iph->nexthdr = IPPROTO_AGGFRAG; ++ ++ /* As we have 0, fractional, 1 or N inner packets there's no obviously ++ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 ++ * 3.1. 
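++ * For example, if one aggregated inner packet is marked EF and another is ++ * best-effort, no single outer value is correct for both, so 0 is used.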
++ */ ++ dsfield = 0; ++ ipv6_change_dsfield(top_iph, 0, dsfield); ++ ++ top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); ++ top_iph->saddr = *(struct in6_addr *)&x->props.saddr; ++ top_iph->daddr = *(struct in6_addr *)&x->id.daddr; ++ ++ return 0; ++} ++ ++/** ++ * iptfs_prepare_output() - prepare the skb for output ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * Return: Error value, if 0 then skb values should be as follows: ++ * - transport_header should point at ESP header ++ * - network_header should point at Outer IP header ++ * - mac_header should point at protocol/nexthdr of the outer IP ++ */ ++static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ if (x->outer_mode.family == AF_INET) ++ return iptfs_encap_add_ipv4(x, skb); ++ if (x->outer_mode.family == AF_INET6) { ++#if IS_ENABLED(CONFIG_IPV6) ++ return iptfs_encap_add_ipv6(x, skb); ++#else ++ WARN_ON_ONCE(1); ++ return -EAFNOSUPPORT; ++#endif ++ } ++ WARN_ON_ONCE(1); ++ return -EOPNOTSUPP; ++} ++ + /* ========================== */ + /* State Management Functions */ + /* ========================== */ +@@ -66,6 +557,9 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + struct xfrm_iptfs_config *xc; + + xc = &xtfs->cfg; ++ xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize; ++ xtfs->init_delay_ns = ++ (u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); +@@ -79,6 +573,15 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + return -EINVAL; + } + } ++ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) ++ xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); ++ if (attrs[XFRMA_IPTFS_INIT_DELAY]) ++ xtfs->init_delay_ns = ++ (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * ++ NSECS_IN_USEC; ++ ++ xtfs->ecn_queue_size = (u64)xc->max_queue_size * 95 / 100; ++ + return 0; + } + +@@ -91,7 +594,7 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + l += nla_total_size(0); + l += nla_total_size(sizeof(u16)); + l += nla_total_size(sizeof(xc->pkt_size)); +- l += nla_total_size(sizeof(u32)); ++ l += nla_total_size(sizeof(xc->max_queue_size)); + l += nla_total_size(sizeof(u32)); /* drop time usec */ + l += nla_total_size(sizeof(u32)); /* init delay usec */ + +@@ -103,6 +606,7 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + int ret; ++ u64 q; + + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); + if (ret) +@@ -113,7 +617,7 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); + if (ret) + return ret; +- ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, 0); ++ ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size); + if (ret) + return ret; + +@@ -121,7 +625,9 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + if (ret) + return ret; + +- ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, 0); ++ q = xtfs->init_delay_ns; ++ (void)do_div(q, NSECS_IN_USEC); ++ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); + + return ret; + } +@@ -129,6 +635,10 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + static int __iptfs_init_state(struct xfrm_state *x, + struct xfrm_iptfs_data *xtfs) + { ++ __skb_queue_head_init(&xtfs->queue); ++ hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); ++ 
xtfs->iptfs_timer.function = iptfs_delay_timer; ++ + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) +@@ -186,6 +696,7 @@ static void iptfs_delete_state(struct xfrm_state *x) + if (!xtfs) + return; + ++ hrtimer_cancel(&xtfs->iptfs_timer); + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +@@ -200,6 +711,8 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = { + .sa_len = iptfs_sa_len, + .clone = iptfs_clone, + .get_inner_mtu = iptfs_get_inner_mtu, ++ .output = iptfs_output_collect, ++ .prepare_output = iptfs_prepare_output, + }; + + static int __init xfrm_iptfs_init(void) +-- +2.45.1 + diff --git a/patches/v2/v2-0010-xfrm-iptfs-share-page-fragments-of-inner-packets.patch b/patches/v2/v2-0010-xfrm-iptfs-share-page-fragments-of-inner-packets.patch new file mode 100644 index 0000000..18e922a --- /dev/null +++ b/patches/v2/v2-0010-xfrm-iptfs-share-page-fragments-of-inner-packets.patch @@ -0,0 +1,157 @@ +From 862fabe4c25a98d8af52e2e1b1cc812aa76d425b Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Thu, 4 Apr 2024 02:33:03 -0400 +Subject: [PATCH ipsec-next v2 10/17] xfrm: iptfs: share page fragments of + inner packets + +When possible rather than appending secondary (aggregated) inner packets +to the fragment list, share their page fragments with the outer IPTFS +packet. This allows for more efficient packet transmission. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 88 ++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 79 insertions(+), 9 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index 561483fc83f9..87ba643edfb7 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -45,6 +45,24 @@ struct xfrm_iptfs_data { + static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + ++/* ================= */ ++/* SK_BUFF Functions */ ++/* ================= */ ++ ++/** ++ * skb_head_to_frag() - initialize a skb_frag_t based on skb head data ++ * @skb: skb with the head data ++ * @frag: frag to initialize ++ */ ++static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) ++{ ++ struct page *page = virt_to_head_page(skb->data); ++ unsigned char *addr = (unsigned char *)page_address(page); ++ ++ BUG_ON(!skb->head_frag); ++ skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -262,14 +280,44 @@ static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, + return nextp; + } + ++static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) ++{ ++ struct skb_shared_info *fromi = skb_shinfo(from); ++ struct skb_shared_info *toi = skb_shinfo(to); ++ unsigned int new_truesize; ++ ++ /* If we have data in a head page, grab it */ ++ if (!skb_headlen(from)) { ++ new_truesize = SKB_TRUESIZE(skb_end_offset(from)); ++ } else { ++ skb_head_to_frag(from, &toi->frags[toi->nr_frags]); ++ skb_frag_ref(to, toi->nr_frags++); ++ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ } ++ ++ /* Move any other page fragments rather than copy */ ++ memcpy(&toi->frags[toi->nr_frags], fromi->frags, ++ sizeof(fromi->frags[0]) * fromi->nr_frags); ++ toi->nr_frags += fromi->nr_frags; ++ fromi->nr_frags = 0; ++ from->data_len = 0; ++ from->len = 0; ++ to->truesize += from->truesize - new_truesize; ++ from->truesize = new_truesize; ++ ++ /* We are 
done with this SKB */ ++ consume_skb(from); ++} ++ + static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + { + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *skb, *skb2, **nextp; +- struct skb_shared_info *shi; ++ struct skb_shared_info *shi, *shi2; + + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ bool share_ok = true; + int remaining; + + /* protocol comes to us cleared sometimes */ +@@ -314,7 +362,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + + /* Re-home (un-nest) nested fragment lists. We need to do this + * b/c we will simply be appending any following aggregated +- * inner packets to the frag list. ++ * inner packets using the frag list. + */ + shi = skb_shinfo(skb); + nextp = &shi->frag_list; +@@ -326,6 +374,9 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + nextp = &(*nextp)->next; + } + ++ if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) ++ share_ok = false; ++ + /* See if we have enough space to simply append. + * + * NOTE: Maybe do not append if we will be mis-aligned, +@@ -351,18 +402,37 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + } + } + ++ /* skb->pp_recycle is passed to __skb_flag_unref for all ++ * frag pages so we can only share pages with skb's who ++ * match ourselves. ++ */ ++ shi2 = skb_shinfo(skb2); ++ if (share_ok && ++ (shi2->frag_list || ++ (!skb2->head_frag && skb_headlen(skb)) || ++ skb->pp_recycle != skb2->pp_recycle || ++ skb_zcopy(skb2) || ++ (shi->nr_frags + shi2->nr_frags + 1 > ++ MAX_SKB_FRAGS))) ++ share_ok = false; ++ + /* Do accounting */ + skb->data_len += skb2->len; + skb->len += skb2->len; + remaining -= skb2->len; + +- /* Append to the frag_list */ +- *nextp = skb2; +- nextp = &skb2->next; +- BUG_ON(*nextp); +- if (skb_has_frag_list(skb2)) +- nextp = iptfs_rehome_fraglist(nextp, skb2); +- skb->truesize += skb2->truesize; ++ if (share_ok) { ++ iptfs_consume_frags(skb, skb2); ++ } else { ++ /* Append to the frag_list */ ++ *nextp = skb2; ++ nextp = &skb2->next; ++ BUG_ON(*nextp); ++ if (skb_has_frag_list(skb2)) ++ nextp = iptfs_rehome_fraglist(nextp, ++ skb2); ++ skb->truesize += skb2->truesize; ++ } + } + + xfrm_output(NULL, skb); +-- +2.45.1 + diff --git a/patches/v2/v2-0011-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch b/patches/v2/v2-0011-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch new file mode 100644 index 0000000..4c66ecc --- /dev/null +++ b/patches/v2/v2-0011-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch @@ -0,0 +1,546 @@ +From 23953763386cb76cb7221becb893b5db926fd486 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Sat, 18 May 2024 08:19:39 -0400 +Subject: [PATCH ipsec-next v2 11/17] xfrm: iptfs: add fragmenting of larger + than MTU user packets + +Add support for tunneling user (inner) packets that are larger than the +tunnel's path MTU (outer) using IP-TFS fragmentation. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 404 ++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 374 insertions(+), 30 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index 87ba643edfb7..cabe3e1ffbff 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -19,11 +19,22 @@ + + #include "xfrm_inout.h" + ++/* 1) skb->head should be cache aligned. ++ * 2) when resv is for L2 headers (i.e., ethernet) we want the cacheline to ++ * start -16 from data. 
++ * 3) when resv is for L3+L2 headers IOW skb->data points at the IPTFS payload ++ * we want data to be cache line aligned so all the pushed headers will be in ++ * another cacheline. ++ */ ++#define XFRM_IPTFS_MIN_L3HEADROOM 128 ++#define XFRM_IPTFS_MIN_L2HEADROOM (64 + 16) ++#define IPTFS_FRAG_COPY_MAX 256 /* max for copying to create iptfs frags */ + #define NSECS_IN_USEC 1000 + + #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT + + struct xfrm_iptfs_config { ++ bool dont_frag : 1; + u32 pkt_size; /* outer_packet_size or 0 */ + u32 max_queue_size; /* octets */ + }; +@@ -42,13 +53,71 @@ struct xfrm_iptfs_data { + u32 payload_mtu; /* max payload size */ + }; + +-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); ++static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + + /* ================= */ + /* SK_BUFF Functions */ + /* ================= */ + ++/** ++ * iptfs_alloc_skb() - Allocate a new `skb` using a meta-data template. ++ * @tpl: the template to copy the new `skb`s meta-data from. ++ * @len: the linear length of the head data, zero is fine. ++ * @l3resv: true if reserve needs to support pushing L3 headers ++ * ++ * A new `skb` is allocated and it's meta-data is initialized from `tpl`, the ++ * head data is sized to `len` + reserved space set according to the @l3resv ++ * boolean. When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which ++ * arranges for `skb->data - 16` (etherhdr space) to be the start of a cacheline. ++ * Otherwise, @l3resv is true and resv is either the size of headroom from `tpl` or ++ * XFRM_IPTFS_MIN_L3HEADROOM whichever is greater, which tries to align ++ * skb->data to a cacheline as all headers will be pushed on the previous ++ * cacheline bytes. ++ * ++ * When copying meta-data from the @tpl, the sk_buff->headers are not copied. ++ * ++ * Zero length skbs are allocated when we only need a head skb to hold new ++ * packet headers (basically the mac header) that sit on top of existing shared ++ * packet data. ++ * ++ * Return: the new skb or NULL. ++ */ ++static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, ++ bool l3resv) ++{ ++ struct sk_buff *skb; ++ u32 resv; ++ ++ if (!l3resv) { ++ resv = XFRM_IPTFS_MIN_L2HEADROOM; ++ } else { ++ resv = skb_headroom(tpl); ++ if (resv < XFRM_IPTFS_MIN_L3HEADROOM) ++ resv = XFRM_IPTFS_MIN_L3HEADROOM; ++ } ++ ++ skb = alloc_skb(len + resv, GFP_ATOMIC); ++ if (!skb) { ++ XFRM_INC_STATS(dev_net(tpl->dev), LINUX_MIB_XFRMINERROR); ++ return NULL; ++ } ++ ++ skb_reserve(skb, resv); ++ ++ /* Code from __copy_skb_header() -- we do not want any of the ++ * tpl->headers copied over, so we aren't using `skb_copy_header()`. 
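++ * (Only the fields assigned below are carried over from @tpl; the header ++ * offsets are left to be set for the new packet's own layout.)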
++ */ ++ skb->tstamp = tpl->tstamp; ++ skb->dev = tpl->dev; ++ memcpy(skb->cb, tpl->cb, sizeof(skb->cb)); ++ skb_dst_copy(skb, tpl); ++ __skb_ext_copy(skb, tpl); ++ __nf_copy(skb, tpl, false); ++ ++ return skb; ++} ++ + /** + * skb_head_to_frag() - initialize a skb_frag_t based on skb head data + * @skb: skb with the head data +@@ -63,6 +132,39 @@ static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); + } + ++/** ++ * skb_copy_bits_seq - copy bits from a skb_seq_state to kernel buffer ++ * @st: source skb_seq_state ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy @len bytes from @offset bytes into the source @st to the destination ++ * buffer @to. `offset` should increase (or be unchanged) with each subsequent ++ * call to this function. If offset needs to decrease from the previous use `st` ++ * should be reset first. ++ */ ++static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, ++ int len) ++{ ++ const u8 *data; ++ u32 sqlen; ++ ++ for (;;) { ++ sqlen = skb_seq_read(offset, &data, st); ++ if (sqlen == 0) ++ return -ENOMEM; ++ if (sqlen >= len) { ++ memcpy(to, data, len); ++ return 0; ++ } ++ memcpy(to, data, sqlen); ++ to += sqlen; ++ offset += sqlen; ++ len -= sqlen; ++ } ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -107,7 +209,7 @@ static int iptfs_get_cur_pmtu(struct xfrm_state *x, + { + struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); + u32 payload_mtu = xtfs->payload_mtu; +- u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached); ++ u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached); + + if (payload_mtu && payload_mtu < pmtu) + pmtu = payload_mtu; +@@ -169,7 +271,8 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + + BUG_ON(!xtfs); + +- pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ if (xtfs->cfg.dont_frag) ++ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); + + /* Break apart GSO skbs. If the queue is nearing full then we want the + * accounting and queuing to be based on the individual packets not on the +@@ -209,8 +312,10 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + continue; + } + +- /* Fragmenting handled in following commits. */ +- if (iptfs_is_too_big(sk, skb, pmtu)) { ++ /* If the user indicated no iptfs fragmenting check before ++ * enqueue. ++ */ ++ if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -256,6 +361,217 @@ static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + } + ++/** ++ * iptfs_copy_create_frag() - create an inner fragment skb. ++ * @st: The source packet data. ++ * @offset: offset in @st of the new fragment data. ++ * @copy_len: the amount of data to copy from @st. ++ * ++ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must ++ * not be greater than the max fragment size. ++ * ++ * Return: the new fragment skb or an ERR_PTR(). 
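++ * ++ * For example (hypothetical sizes): fragmenting a 3000 octet inner packet ++ * at a 1400 octet fragment size invokes this twice, once for the 1400 ++ * octets at offset 1400 and once for the 200 octets at offset 2800, while ++ * the original skb is trimmed in place to the first 1400 octets.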
++ */ ++static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, ++ u32 offset, u32 copy_len) ++{ ++ struct sk_buff *src = st->root_skb; ++ struct sk_buff *skb; ++ int err; ++ ++ skb = iptfs_alloc_skb(src, copy_len, true); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ /* Now copy `copy_len` data from src */ ++ err = skb_copy_bits_seq(st, offset, skb_put(skb, copy_len), copy_len); ++ if (err) { ++ XFRM_INC_STATS(dev_net(src->dev), LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ ++ return skb; ++} ++ ++/** ++ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. ++ * @skbp: the source packet skb (IN), skb holding the last fragment in ++ * the fragment stream (OUT). ++ * @xtfs: IPTFS SA state. ++ * @mtu: the max IPTFS fragment size. ++ * ++ * This function is responsible for fragmenting a larger inner packet into a ++ * sequence of IPTFS payload packets. The last fragment is returned rather than ++ * being sent so that the caller can append more inner packets (aggregation) if ++ * there is room. ++ */ ++static int iptfs_copy_create_frags(struct sk_buff **skbp, ++ struct xfrm_iptfs_data *xtfs, u32 mtu) ++{ ++ struct skb_seq_state skbseq; ++ struct list_head sublist; ++ struct sk_buff *skb = *skbp; ++ struct sk_buff *nskb = *skbp; ++ u32 copy_len, offset; ++ u32 to_copy = skb->len - mtu; ++ u32 blkoff = 0; ++ int err = 0; ++ ++ INIT_LIST_HEAD(&sublist); ++ ++ BUG_ON(skb->len <= mtu); ++ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); ++ ++ /* A trimmed `skb` will be sent as the first fragment, later. */ ++ offset = mtu; ++ to_copy = skb->len - offset; ++ while (to_copy) { ++ /* Send all but last fragment to allow agg. append */ ++ list_add_tail(&nskb->list, &sublist); ++ ++ /* FUTURE: if the packet has an odd/non-aligning length we could ++ * send less data in the penultimate fragment so that the last ++ * fragment then ends on an aligned boundary. ++ */ ++ copy_len = to_copy <= mtu ? to_copy : mtu; ++ nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len); ++ if (IS_ERR(nskb)) { ++ XFRM_INC_STATS(dev_net(skb->dev), ++ LINUX_MIB_XFRMOUTERROR); ++ skb_abort_seq_read(&skbseq); ++ err = PTR_ERR(nskb); ++ nskb = NULL; ++ break; ++ } ++ iptfs_output_prepare_skb(nskb, to_copy); ++ offset += copy_len; ++ to_copy -= copy_len; ++ blkoff = to_copy; ++ } ++ skb_abort_seq_read(&skbseq); ++ ++ /* return last fragment that will be unsent (or NULL) */ ++ *skbp = nskb; ++ ++ /* trim the original skb to MTU */ ++ if (!err) ++ err = pskb_trim(skb, mtu); ++ ++ if (err) { ++ /* Free all frags. Don't bother sending a partial packet we will ++ * never complete. ++ */ ++ kfree_skb(nskb); ++ list_for_each_entry_safe(skb, nskb, &sublist, list) { ++ skb_list_del_init(skb); ++ kfree_skb(skb); ++ } ++ return err; ++ } ++ ++ /* prepare the initial fragment with an iptfs header */ ++ iptfs_output_prepare_skb(skb, 0); ++ ++ /* Send all but last fragment, if we fail to send a fragment then free ++ * the rest -- no point in sending a packet that can't be reassembled. ++ */ ++ list_for_each_entry_safe(skb, nskb, &sublist, list) { ++ skb_list_del_init(skb); ++ if (!err) ++ err = xfrm_output(NULL, skb); ++ else ++ kfree_skb(skb); ++ } ++ if (err) ++ kfree_skb(*skbp); ++ return err; ++} ++ ++/** ++ * iptfs_first_should_copy() - determine if we should copy packet data. ++ * @first_skb: the first skb in the packet ++ * @mtu: the MTU. 
++ * ++ * Determine if we should create subsequent skbs to hold the remaining data from ++ * a large inner packet by copying the packet data, or cloning the original skb ++ * and adjusting the offsets. ++ */ ++static bool iptfs_first_should_copy(struct sk_buff *first_skb, u32 mtu) ++{ ++ u32 frag_copy_max; ++ ++ /* If we have less than frag_copy_max for remaining packet we copy ++ * those tail bytes as it is more efficient. ++ */ ++ frag_copy_max = mtu <= IPTFS_FRAG_COPY_MAX ? mtu : IPTFS_FRAG_COPY_MAX; ++ if ((int)first_skb->len - (int)mtu < (int)frag_copy_max) ++ return true; ++ ++ /* If we have non-linear skb just use copy */ ++ if (skb_is_nonlinear(first_skb)) ++ return true; ++ ++ /* So we have a simple linear skb, easy to clone and share */ ++ return false; ++} ++ ++/** ++ * iptfs_first_skb() - handle the first dequeued inner packet for output ++ * @skbp: the source packet skb (IN), skb holding the last fragment in ++ * the fragment stream (OUT). ++ * @xtfs: IPTFS SA state. ++ * @mtu: the max IPTFS fragment size. ++ * ++ * This function is responsible for fragmenting a larger inner packet into a ++ * sequence of IPTFS payload packets. If it needs to fragment into subsequent ++ * skbs, it will either do so by copying or cloning. ++ * ++ * The last fragment is returned rather than being sent so that the caller can ++ * append more inner packets (aggregation) if there is room. ++ * ++ */ ++static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, ++ u32 mtu) ++{ ++ struct sk_buff *skb = *skbp; ++ int err; ++ ++ /* Classic ESP skips the don't fragment ICMP error if DF is clear on ++ * the inner packet or ignore_df is set. Otherwise it will send an ICMP ++ * or local error if the inner packet won't fit its MTU. ++ * ++ * With IPTFS we do not care about the inner packet DF bit. If the ++ * tunnel is configured to "don't fragment" we error back if things ++ * don't fit in our max packet size. Otherwise we iptfs-fragment as ++ * normal. ++ */ ++ ++ /* The opportunity for HW offload has ended */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ err = skb_checksum_help(skb); ++ if (err) ++ return err; ++ } ++ ++ /* We've split these up before queuing */ ++ BUG_ON(skb_is_gso(skb)); ++ ++ /* Simple case -- it fits. `mtu` accounted for all the overhead ++ * including the basic IPTFS header. ++ */ ++ if (skb->len <= mtu) { ++ iptfs_output_prepare_skb(skb, 0); ++ return 0; ++ } ++ ++ if (iptfs_first_should_copy(skb, mtu)) ++ return iptfs_copy_create_frags(skbp, xtfs, mtu); ++ ++ /* For now we always copy */ ++ return iptfs_copy_create_frags(skbp, xtfs, mtu); ++} ++ + static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, + struct sk_buff *child) + { +@@ -315,6 +631,15 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + struct sk_buff *skb, *skb2, **nextp; + struct skb_shared_info *shi, *shi2; + ++ /* If we are fragmenting due to a large inner packet we will output all ++ * the outer IPTFS packets required to contain the fragments of the ++ * single large inner packet. These outer packets need to be sent ++ * consecutively (ESP seq-wise). Since this output function is always ++ * running from a timer we do not need a lock to provide this guarantee. ++ * We will output our packets consecutively before the timer is allowed ++ * to run again on some other CPU.
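++ * For example (hypothetical sizes), a 4000 octet inner packet sent at a ++ * 1400 octet fragment size occupies three consecutive ESP sequence ++ * numbers, with nothing else transmitted on this SA in between them.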
++ */ ++ + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); + bool share_ok = true; +@@ -325,7 +650,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + htons(ETH_P_IP) : + htons(ETH_P_IPV6); + +- if (skb->len > mtu) { ++ if (skb->len > mtu && xtfs->cfg.dont_frag) { + /* We handle this case before enqueueing so we are only + * here b/c MTU changed after we enqueued before we + * dequeued, just drop these. +@@ -337,26 +662,22 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + continue; + } + +- /* If we don't have a cksum in the packet we need to add one +- * before encapsulation. ++ /* Convert first inner packet into an outer IPTFS packet, ++ * dealing with any fragmentation into multiple outer packets ++ * if necessary. + */ +- if (skb->ip_summed == CHECKSUM_PARTIAL) { +- if (skb_checksum_help(skb)) { +- XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), +- LINUX_MIB_XFRMOUTERROR); +- kfree_skb(skb); +- continue; +- } +- } +- +- /* Convert first inner packet into an outer IPTFS packet */ +- iptfs_output_prepare_skb(skb, 0); ++ if (iptfs_first_skb(&skb, xtfs, mtu)) ++ continue; + +- /* The space remaining to send more inner packet data is `mtu` - +- * (skb->len - sizeof iptfs header). This is b/c the `mtu` value +- * has the basic IPTFS header len accounted for, and we added +- * that header to the skb so it is a part of skb->len, thus we +- * subtract it from the skb length. ++ /* If fragmentation was required the returned skb is the last ++ * IPTFS fragment in the chain, and it's IPTFS header blkoff has ++ * been set just past the end of the fragment data. ++ * ++ * In either case the space remaining to send more inner packet ++ * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c ++ * the `mtu` value has the basic IPTFS header len accounted for, ++ * and we added that header to the skb so it is a part of ++ * skb->len, thus we subtract it from the skb length. + */ + remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); + +@@ -597,11 +918,11 @@ static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) + /* ========================== */ + + /** +- * iptfs_get_inner_mtu() - return inner MTU with no fragmentation. ++ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation. + * @x: xfrm state. + * @outer_mtu: the outer mtu + */ +-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) ++static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) + { + struct crypto_aead *aead; + u32 blksize; +@@ -612,6 +933,24 @@ static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) + ~(blksize - 1)) - 2; + } + ++/** ++ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm. ++ * @x: xfrm state. ++ * @outer_mtu: Outer MTU for the encapsulated packet. ++ * ++ * Return: Correct MTU taking in to account the encap overhead. ++ */ ++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ ++ /* If not dont-frag we have no MTU */ ++ if (!xtfs->cfg.dont_frag) ++ return x->outer_mode.family == AF_INET ? IP_MAX_MTU : ++ IP6_MAX_MTU; ++ return __iptfs_get_inner_mtu(x, outer_mtu); ++} ++ + /** + * iptfs_user_init() - initialize the SA with IPTFS options from netlink. 
+ * @net: the net data +@@ -631,6 +970,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + xtfs->init_delay_ns = + (u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC; + ++ if (attrs[XFRMA_IPTFS_DONT_FRAG]) ++ xc->dont_frag = true; + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { +@@ -661,7 +1002,8 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + +- l += nla_total_size(0); ++ if (xc->dont_frag) ++ l += nla_total_size(0); + l += nla_total_size(sizeof(u16)); + l += nla_total_size(sizeof(xc->pkt_size)); + l += nla_total_size(sizeof(xc->max_queue_size)); +@@ -678,9 +1020,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + int ret; + u64 q; + +- ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); +- if (ret) +- return ret; ++ if (xc->dont_frag) { ++ ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); ++ if (ret) ++ return ret; ++ } + ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, 0); + if (ret) + return ret; +-- +2.45.1 + diff --git a/patches/v2/v2-0012-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch b/patches/v2/v2-0012-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch new file mode 100644 index 0000000..efdadf4 --- /dev/null +++ b/patches/v2/v2-0012-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch @@ -0,0 +1,332 @@ +From 2efd1b058cd10742413f8fbfa4a41884138575b2 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 12:00:25 -0400 +Subject: [PATCH ipsec-next v2 12/17] xfrm: iptfs: add basic receive packet + (tunnel egress) handling + +Add handling of packets received from the tunnel. This implements +tunnel egress functionality. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 283 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 283 insertions(+) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index cabe3e1ffbff..bad0d7d2c547 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -19,6 +19,10 @@ + + #include "xfrm_inout.h" + ++/* IPTFS encap (header) values. */ ++#define IPTFS_SUBTYPE_BASIC 0 ++#define IPTFS_SUBTYPE_CC 1 ++ + /* 1) skb->head should be cache aligned. + * 2) when resv is for L2 headers (i.e., ethernet) we want the cacheline to + * start -16 from data. +@@ -56,6 +60,17 @@ struct xfrm_iptfs_data { + static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + ++/* ================= */ ++/* Utility Functions */ ++/* ================= */ ++ ++static u64 __esp_seq(struct sk_buff *skb) ++{ ++ u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); ++ ++ return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; ++} ++ + /* ================= */ + /* SK_BUFF Functions */ + /* ================= */ +@@ -165,6 +180,273 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, + } + } + ++/* ================================== */ ++/* IPTFS Receiving (egress) Functions */ ++/* ================================== */ ++ ++/** ++ * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. ++ * @skblen: the total data size for `skb`. ++ * @st: The source for the rest of the data to copy into `skb`. ++ * @off: The offset into @st to copy data from. ++ * @len: The length of data to copy from @st into `skb`. This must be <= ++ * @skblen. 
++ *
++ * Create a new sk_buff `skb` with @skblen of packet data space. Then using
++ * seq functions copy @len bytes from @st into `skb` starting from @off.
++ *
++ * It is an error for @len to be greater than the amount of data left in @st.
++ *
++ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
++ */
++static struct sk_buff *
++iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len)
++{
++	struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false);
++
++	if (!skb)
++		return NULL;
++	if (skb_copy_bits_seq(st, off, skb_put(skb, len), len)) {
++		XFRM_INC_STATS(dev_net(st->root_skb->dev),
++			       LINUX_MIB_XFRMINERROR);
++		kfree_skb(skb);
++		return NULL;
++	}
++	return skb;
++}
++
++/**
++ * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv.
++ * @x: xfrm state
++ * @skb: the inner packet
++ *
++ * Finish the standard xfrm processing on the inner packet prior to sending back
++ * through gro_cells_receive. We do this separately b/c we are building a list
++ * of packets in the hopes that one day a list will be taken by
++ * xfrm_input.
++ */
++static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
++{
++	skb_reset_network_header(skb);
++
++	/* The packet is going back through gro_cells_receive, no need to
++	 * set this.
++	 */
++	skb_reset_transport_header(skb);
++
++	/* Packet already has checksum value set. */
++	skb->ip_summed = CHECKSUM_NONE;
++
++	/* Our skb will contain the header data copied from the outer packet
++	 * which contained the start of this inner packet. This is true
++	 * when we allocate a new skb as well as when we reuse the existing skb.
++	 */
++	if (ip_hdr(skb)->version == 0x4) {
++		struct iphdr *iph = ip_hdr(skb);
++
++		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
++			ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
++		if (!(x->props.flags & XFRM_STATE_NOECN))
++			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
++				IP_ECN_set_ce(iph);
++
++		skb->protocol = htons(ETH_P_IP);
++	} else {
++		struct ipv6hdr *iph = ipv6_hdr(skb);
++
++		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
++			ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
++		if (!(x->props.flags & XFRM_STATE_NOECN))
++			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
++				IP6_ECN_set_ce(skb, iph);
++
++		skb->protocol = htons(ETH_P_IPV6);
++	}
++}
++
++/**
++ * iptfs_input() - handle receipt of iptfs payload
++ * @x: xfrm state
++ * @skb: the packet
++ *
++ * Process the IPTFS payload in `skb` and consume it afterwards.
++ */
++static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
++{
++	u8 hbytes[sizeof(struct ipv6hdr)];
++	struct ip_iptfs_cc_hdr iptcch;
++	struct skb_seq_state skbseq;
++	struct list_head sublist; /* rename this it's just a list */
++	struct sk_buff *first_skb, *next;
++	const unsigned char *old_mac;
++	struct xfrm_iptfs_data *xtfs;
++	struct ip_iptfs_hdr *ipth;
++	struct iphdr *iph;
++	struct net *net;
++	u32 remaining, iplen, iphlen, data, tail;
++	u32 blkoff;
++	u64 seq;
++
++	xtfs = x->mode_data;
++	net = dev_net(skb->dev);
++	first_skb = NULL;
++
++	seq = __esp_seq(skb);
++
++	/* Large enough to hold both types of header */
++	ipth = (struct ip_iptfs_hdr *)&iptcch;
++
++	/* Save the old mac header if set */
++	old_mac = skb_mac_header_was_set(skb) ?
skb_mac_header(skb) : NULL; ++ ++ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); ++ ++ /* Get the IPTFS header and validate it */ ++ ++ if (skb_copy_bits_seq(&skbseq, 0, ipth, sizeof(*ipth))) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ data = sizeof(*ipth); ++ ++ /* Set data past the basic header */ ++ if (ipth->subtype == IPTFS_SUBTYPE_CC) { ++ /* Copy the rest of the CC header */ ++ remaining = sizeof(iptcch) - sizeof(*ipth); ++ if (skb_copy_bits_seq(&skbseq, data, ipth + 1, remaining)) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ data += remaining; ++ } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); ++ goto done; ++ } ++ ++ if (ipth->flags != 0) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); ++ goto done; ++ } ++ ++ INIT_LIST_HEAD(&sublist); ++ ++ /* Fragment handling in following commits */ ++ blkoff = ntohs(ipth->block_offset); ++ data += blkoff; ++ ++ /* New packets */ ++ tail = skb->len; ++ while (data < tail) { ++ __be16 protocol = 0; ++ ++ /* Gather information on the next data block. ++ * `data` points to the start of the data block. ++ */ ++ remaining = tail - data; ++ ++ /* try and copy enough bytes to read length from ipv4/ipv6 */ ++ iphlen = min_t(u32, remaining, 6); ++ if (skb_copy_bits_seq(&skbseq, data, hbytes, iphlen)) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ ++ iph = (struct iphdr *)hbytes; ++ if (iph->version == 0x4) { ++ /* must have at least tot_len field present */ ++ if (remaining < 4) ++ break; ++ ++ iplen = be16_to_cpu(iph->tot_len); ++ iphlen = iph->ihl << 2; ++ protocol = cpu_to_be16(ETH_P_IP); ++ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; ++ } else if (iph->version == 0x6) { ++ /* must have at least payload_len field present */ ++ if (remaining < 6) ++ break; ++ ++ iplen = be16_to_cpu( ++ ((struct ipv6hdr *)hbytes)->payload_len); ++ iplen += sizeof(struct ipv6hdr); ++ iphlen = sizeof(struct ipv6hdr); ++ protocol = cpu_to_be16(ETH_P_IPV6); ++ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = ++ ipv6_get_dsfield((struct ipv6hdr *)iph); ++ } else if (iph->version == 0x0) { ++ /* pad */ ++ break; ++ } else { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ ++ if (unlikely(skbseq.stepped_offset)) { ++ /* We need to reset our seq read, it can't backup at ++ * this point. ++ */ ++ struct sk_buff *save = skbseq.root_skb; ++ ++ skb_abort_seq_read(&skbseq); ++ skb_prepare_seq_read(save, data, tail, &skbseq); ++ } ++ ++ if (!first_skb) ++ first_skb = skb; ++ ++ /* Fragment handling in following commits */ ++ if (iplen > remaining) ++ break; ++ ++ skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, iplen); ++ if (!skb) { ++ /* skip to next packet or done */ ++ data += iplen; ++ continue; ++ } ++ ++ skb->protocol = protocol; ++ if (old_mac) { ++ /* rebuild the mac header */ ++ skb_set_mac_header(skb, -first_skb->mac_len); ++ memcpy(skb_mac_header(skb), old_mac, ++ first_skb->mac_len); ++ eth_hdr(skb)->h_proto = skb->protocol; ++ } ++ ++ data += iplen; ++ iptfs_complete_inner_skb(x, skb); ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* Send the packets! 
*/ ++ list_for_each_entry_safe(skb, next, &sublist, list) { ++ skb_list_del_init(skb); ++ if (xfrm_input(skb, 0, 0, -3)) ++ kfree_skb(skb); ++ } ++ ++done: ++ skb = skbseq.root_skb; ++ skb_abort_seq_read(&skbseq); ++ ++ if (first_skb) { ++ consume_skb(first_skb); ++ } else { ++ /* skb is the original passed in skb, but we didn't get far ++ * enough to process it as the first_skb. ++ */ ++ kfree_skb(skb); ++ } ++ ++ /* We always have dealt with the input SKB, either we are re-using it, ++ * or we have freed it. Return EINPROGRESS so that xfrm_input stops ++ * processing it. ++ */ ++ return -EINPROGRESS; ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -1125,6 +1407,7 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = { + .sa_len = iptfs_sa_len, + .clone = iptfs_clone, + .get_inner_mtu = iptfs_get_inner_mtu, ++ .input = iptfs_input, + .output = iptfs_output_collect, + .prepare_output = iptfs_prepare_output, + }; +-- +2.45.1 + diff --git a/patches/v2/v2-0013-xfrm-iptfs-handle-received-fragmented-inner-packe.patch b/patches/v2/v2-0013-xfrm-iptfs-handle-received-fragmented-inner-packe.patch new file mode 100644 index 0000000..ff2d645 --- /dev/null +++ b/patches/v2/v2-0013-xfrm-iptfs-handle-received-fragmented-inner-packe.patch @@ -0,0 +1,597 @@ +From 9be0394a586415550faff92637d4a84ecce212ed Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 01:07:22 -0400 +Subject: [PATCH ipsec-next v2 13/17] xfrm: iptfs: handle received fragmented + inner packets + +Add support for handling receipt of partial inner packets that have +been fragmented across multiple outer IP-TFS tunnel packets. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 438 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 425 insertions(+), 13 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index bad0d7d2c547..958aa3d0522f 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -55,10 +55,22 @@ struct xfrm_iptfs_data { + struct hrtimer iptfs_timer; /* output timer */ + time64_t iptfs_settime; /* time timer was set */ + u32 payload_mtu; /* max payload size */ ++ ++ /* Tunnel egress */ ++ spinlock_t drop_lock; ++ struct hrtimer drop_timer; ++ u64 drop_time_ns; ++ ++ /* Tunnel egress reassembly */ ++ struct sk_buff *ra_newskb; /* new pkt being reassembled */ ++ u64 ra_wantseq; /* expected next sequence */ ++ u8 ra_runt[6]; /* last pkt bytes from last skb */ ++ u8 ra_runtlen; /* count of ra_runt */ + }; + + static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); ++static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + + /* ================= */ + /* Utility Functions */ +@@ -216,6 +228,63 @@ iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) + return skb; + } + ++/** ++ * iptfs_input_save_runt() - save data in xtfs runt space. ++ * @xtfs: xtfs state ++ * @seq: the current sequence ++ * @buf: packet data ++ * @len: length of packet data ++ * ++ * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data ++ * runt space. 
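++ *
++ * (A runt here is a leading fragment too short to even contain the
++ * IPv4/IPv6 length field; up to sizeof(xtfs->ra_runt) (6) octets are
++ * kept so parsing can resume once the next payload supplies the
++ * remaining header bytes.)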
++ */
++static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq,
++				  u8 *buf, int len)
++{
++	BUG_ON(xtfs->ra_newskb); /* we won't have a new SKB yet */
++
++	memcpy(xtfs->ra_runt, buf, len);
++
++	xtfs->ra_runtlen = len;
++	xtfs->ra_wantseq = seq + 1;
++}
++
++/**
++ * __iptfs_iphlen() - return the v4/v6 header length using packet data.
++ * @data: pointer at octet with version nibble
++ *
++ * The version data is expected to be valid (i.e., either 4 or 6).
++ */
++static u32 __iptfs_iphlen(u8 *data)
++{
++	struct iphdr *iph = (struct iphdr *)data;
++
++	if (iph->version == 0x4)
++		return sizeof(*iph);
++	BUG_ON(iph->version != 0x6);
++	return sizeof(struct ipv6hdr);
++}
++
++/**
++ * __iptfs_iplen() - return the v4/v6 length using packet data.
++ * @data: pointer to ip (v4/v6) packet header
++ *
++ * Grab the IPv4 or IPv6 length value in the start of the inner packet header
++ * pointed to by `data`. Assumes data len is enough for the length field only.
++ *
++ * The version data is expected to be valid (i.e., either 4 or 6).
++ */
++static u32 __iptfs_iplen(u8 *data)
++{
++	struct iphdr *iph = (struct iphdr *)data;
++
++	if (iph->version == 0x4)
++		return ntohs(iph->tot_len);
++	BUG_ON(iph->version != 0x6);
++	return ntohs(((struct ipv6hdr *)iph)->payload_len) +
++	       sizeof(struct ipv6hdr);
++}
++
+ /**
+  * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv.
+  * @x: xfrm state
+@@ -265,6 +334,238 @@ static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
+ 	}
+ }
+
++static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
++{
++	assert_spin_locked(&xtfs->drop_lock);
++
++	/* We don't care if it works, locking takes care of things */
++	hrtimer_try_to_cancel(&xtfs->drop_timer);
++	if (free)
++		kfree_skb(xtfs->ra_newskb);
++	xtfs->ra_newskb = NULL;
++}
++
++/**
++ * iptfs_reassem_abort() - In-progress packet is aborted, free the state.
++ * @xtfs: xtfs state
++ */
++static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
++{
++	__iptfs_reassem_done(xtfs, true);
++}
++
++/**
++ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
++ * @xtfs: xtfs state
++ */
++static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
++{
++	__iptfs_reassem_done(xtfs, false);
++}
++
++/**
++ * iptfs_reassem_cont() - Continue the reassembly of an inner packet.
++ * @xtfs: xtfs state
++ * @seq: sequence of current packet
++ * @st: seq read stat for current packet
++ * @skb: current packet
++ * @data: offset into sequential packet data
++ * @blkoff: packet blkoff value
++ * @list: list of skbs to enqueue completed packet on
++ *
++ * Process an IPTFS payload that has a non-zero `blkoff` or when we are
++ * expecting the continuation b/c we have a runt or in-progress packet.
++ */
++static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
++			      struct skb_seq_state *st, struct sk_buff *skb,
++			      u32 data, u32 blkoff, struct list_head *list)
++{
++	struct sk_buff *newskb = xtfs->ra_newskb;
++	u32 remaining = skb->len - data;
++	u32 runtlen = xtfs->ra_runtlen;
++	u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem;
++
++	/* Handle packet fragment we aren't expecting */
++	if (!runtlen && !xtfs->ra_newskb)
++		return data + min(blkoff, remaining);
++
++	/* Important to remember that input to this function is an ordered
++	 * packet stream (unless the user disabled the reorder window).
++	 * Thus if we are waiting for, and expecting the next packet so we can
++	 * continue assembly, a newer sequence number indicates older ones are
++	 * not coming (or if they do they should be ignored). Technically we
++	 * can receive older ones when the reorder window is disabled; however,
++	 * the user should have disabled fragmentation in this case, and
++	 * regardless we don't deal with it.
++	 *
++	 * blkoff could be zero if the stream is messed up (or it's an all-pad
++	 * insertion); be careful to handle that case in each of the cases
++	 * below.
++	 */
++
++	/* Too old case: This can happen when the reorder window is disabled so
++	 * ordering isn't actually guaranteed.
++	 */
++	if (seq < xtfs->ra_wantseq)
++		return data + remaining;
++
++	/* Too new case: we missed what we wanted, clean up. */
++	if (seq > xtfs->ra_wantseq) {
++		XFRM_INC_STATS(dev_net(skb->dev),
++			       LINUX_MIB_XFRMINIPTFSERROR);
++		goto abandon;
++	}
++
++	if (blkoff == 0) {
++		if ((*skb->data & 0xF0) != 0) {
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINIPTFSERROR);
++			goto abandon;
++		}
++		/* Handle all pad case, advance expected sequence number.
++		 * (RFC 9347 S2.2.3)
++		 */
++		xtfs->ra_wantseq++;
++		/* will end parsing */
++		return data + remaining;
++	}
++
++	if (runtlen) {
++		BUG_ON(xtfs->ra_newskb);
++
++		/* Regardless of what happens we're done with the runt */
++		xtfs->ra_runtlen = 0;
++
++		/* The start of this inner packet was at the very end of the last
++		 * iptfs payload which didn't include enough for the ip header
++		 * length field. We must have *at least* that now.
++		 */
++		rrem = sizeof(xtfs->ra_runt) - runtlen;
++		if (remaining < rrem || blkoff < rrem) {
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINIPTFSERROR);
++			goto abandon;
++		}
++
++		/* fill in the runt data */
++		if (skb_copy_bits_seq(st, data, &xtfs->ra_runt[runtlen],
++				      rrem)) {
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINBUFFERERROR);
++			goto abandon;
++		}
++
++		/* We have enough data to get the ip length value now,
++		 * allocate an in progress skb
++		 */
++		ipremain = __iptfs_iplen(xtfs->ra_runt);
++		if (ipremain < sizeof(xtfs->ra_runt)) {
++			/* length has to be at least runtsize large */
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINIPTFSERROR);
++			goto abandon;
++		}
++
++		/* For the runt case we don't attempt sharing currently. NOTE:
++		 * Currently, this IPTFS implementation will not create runts.
++		 */
++
++		newskb = iptfs_alloc_skb(skb, ipremain, false);
++		if (!newskb) {
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINERROR);
++			goto abandon;
++		}
++		xtfs->ra_newskb = newskb;
++
++		/* Copy the runt data into the buffer, but leave data
++		 * pointers the same as normal non-runt case. The extra `rrem`
++		 * recopied bytes are basically cacheline free. Allows using
++		 * same logic below to complete.
++		 */
++		memcpy(skb_put(newskb, runtlen), xtfs->ra_runt,
++		       sizeof(xtfs->ra_runt));
++	}
++
++	/* Continue reassembling the packet */
++	ipremain = __iptfs_iplen(newskb->data);
++	iphlen = __iptfs_iphlen(newskb->data);
++
++	/* Sanity check, we created the newskb knowing the IP length so the IP
++	 * length can't now be shorter.
++ */ ++ BUG_ON(newskb->len > ipremain); ++ ++ ipremain -= newskb->len; ++ if (blkoff < ipremain) { ++ /* Corrupt data, we don't have enough to complete the packet */ ++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ ++ /* We want the IP header in linear space */ ++ if (newskb->len < iphlen) { ++ iphremain = iphlen - newskb->len; ++ if (blkoff < iphremain) { ++ XFRM_INC_STATS(dev_net(skb->dev), ++ LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ fraglen = min(blkoff, remaining); ++ copylen = min(fraglen, iphremain); ++ BUG_ON(skb_tailroom(newskb) < copylen); ++ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { ++ XFRM_INC_STATS(dev_net(skb->dev), ++ LINUX_MIB_XFRMINBUFFERERROR); ++ goto abandon; ++ } ++ /* this is a silly condition that might occur anyway */ ++ if (copylen < iphremain) { ++ xtfs->ra_wantseq++; ++ return data + fraglen; ++ } ++ /* update data and things derived from it */ ++ data += copylen; ++ blkoff -= copylen; ++ remaining -= copylen; ++ ipremain -= copylen; ++ } ++ ++ fraglen = min(blkoff, remaining); ++ copylen = min(fraglen, ipremain); ++ ++ /* We verified this was true in the main receive routine */ ++ BUG_ON(skb_tailroom(newskb) < copylen); ++ ++ /* copy fragment data into newskb */ ++ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { ++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); ++ goto abandon; ++ } ++ ++ if (copylen < ipremain) { ++ xtfs->ra_wantseq++; ++ } else { ++ /* We are done with packet reassembly! */ ++ BUG_ON(copylen != ipremain); ++ iptfs_reassem_done(xtfs); ++ iptfs_complete_inner_skb(xtfs->x, newskb); ++ list_add_tail(&newskb->list, list); ++ } ++ ++ /* will continue on to new data block or end */ ++ return data + fraglen; ++ ++abandon: ++ if (xtfs->ra_newskb) { ++ iptfs_reassem_abort(xtfs); ++ } else { ++ xtfs->ra_runtlen = 0; ++ xtfs->ra_wantseq = 0; ++ } ++ /* skip past fragment, maybe to end */ ++ return data + min(blkoff, remaining); ++} ++ + /** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state +@@ -285,7 +586,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + struct iphdr *iph; + struct net *net; + u32 remaining, iplen, iphlen, data, tail; +- u32 blkoff; ++ u32 blkoff, capturelen; + u64 seq; + + xtfs = x->mode_data; +@@ -331,12 +632,27 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + + INIT_LIST_HEAD(&sublist); + +- /* Fragment handling in following commits */ ++ /* Handle fragment at start of payload, and/or waiting reassembly. 
*/ ++ + blkoff = ntohs(ipth->block_offset); +- data += blkoff; ++ /* check before locking i.e., maybe */ ++ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { ++ spin_lock(&xtfs->drop_lock); ++ ++ /* check again after lock */ ++ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { ++ data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, ++ blkoff, &sublist); ++ } ++ ++ spin_unlock(&xtfs->drop_lock); ++ } + + /* New packets */ ++ + tail = skb->len; ++ BUG_ON(xtfs->ra_newskb && data < tail); ++ + while (data < tail) { + __be16 protocol = 0; + +@@ -355,8 +671,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + iph = (struct iphdr *)hbytes; + if (iph->version == 0x4) { + /* must have at least tot_len field present */ +- if (remaining < 4) ++ if (remaining < 4) { ++ /* save the bytes we have, advance data and exit */ ++ iptfs_input_save_runt(xtfs, seq, hbytes, ++ remaining); ++ data += remaining; + break; ++ } + + iplen = be16_to_cpu(iph->tot_len); + iphlen = iph->ihl << 2; +@@ -364,8 +685,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; + } else if (iph->version == 0x6) { + /* must have at least payload_len field present */ +- if (remaining < 6) ++ if (remaining < 6) { ++ /* save the bytes we have, advance data and exit */ ++ iptfs_input_save_runt(xtfs, seq, hbytes, ++ remaining); ++ data += remaining; + break; ++ } + + iplen = be16_to_cpu( + ((struct ipv6hdr *)hbytes)->payload_len); +@@ -376,6 +702,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + ipv6_get_dsfield((struct ipv6hdr *)iph); + } else if (iph->version == 0x0) { + /* pad */ ++ data = tail; + break; + } else { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); +@@ -395,16 +722,14 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + if (!first_skb) + first_skb = skb; + +- /* Fragment handling in following commits */ +- if (iplen > remaining) +- break; +- +- skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, iplen); ++ capturelen = min(iplen, remaining); ++ skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, capturelen); + if (!skb) { + /* skip to next packet or done */ +- data += iplen; ++ data += capturelen; + continue; + } ++ BUG_ON(skb->len != capturelen); + + skb->protocol = protocol; + if (old_mac) { +@@ -415,11 +740,38 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + eth_hdr(skb)->h_proto = skb->protocol; + } + +- data += iplen; ++ data += capturelen; ++ ++ if (skb->len < iplen) { ++ BUG_ON(data != tail); ++ BUG_ON(xtfs->ra_newskb); ++ ++ /* Start reassembly */ ++ spin_lock(&xtfs->drop_lock); ++ ++ xtfs->ra_newskb = skb; ++ xtfs->ra_wantseq = seq + 1; ++ if (!hrtimer_is_queued(&xtfs->drop_timer)) { ++ /* softirq blocked lest the timer fire and interrupt us */ ++ BUG_ON(!in_interrupt()); ++ hrtimer_start(&xtfs->drop_timer, ++ xtfs->drop_time_ns, ++ IPTFS_HRTIMER_MODE); ++ } ++ ++ spin_unlock(&xtfs->drop_lock); ++ ++ break; ++ } ++ + iptfs_complete_inner_skb(x, skb); + list_add_tail(&skb->list, &sublist); + } + ++ if (data != tail) ++ /* this should not happen from the above code */ ++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR); ++ + /* Send the packets! */ + list_for_each_entry_safe(skb, next, &sublist, list) { + skb_list_del_init(skb); +@@ -447,6 +799,45 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + return -EINPROGRESS; + } + ++/** ++ * iptfs_drop_timer() - Handle drop timer expiry. 
++ * @me: the timer
++ *
++ * This is similar to our input function.
++ *
++ * The drop timer is set when we start an in progress reassembly, and also when
++ * we save a future packet in the window saved array.
++ *
++ * NOTE: packets in the save window are always newer WRT drop times as
++ * they get further in the future. i.e., for:
++ *
++ * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`,
++ * then D(n-1) <= D(n).
++ *
++ * So, regardless of why the timer is firing, we can always discard any
++ * in-progress fragment; either it's the reassembly timer, or slot 0 is
++ * going to be dropped as S0 must have the most recent drop time, and slot 0
++ * holds the continuation fragment of the in progress packet.
++ */
++static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
++{
++	struct xfrm_iptfs_data *xtfs;
++	struct xfrm_state *x;
++
++	xtfs = container_of(me, typeof(*xtfs), drop_timer);
++	x = xtfs->x;
++
++	/* Drop any in progress packet */
++	spin_lock(&xtfs->drop_lock);
++	if (xtfs->ra_newskb) {
++		kfree_skb(xtfs->ra_newskb);
++		xtfs->ra_newskb = NULL;
++	}
++	spin_unlock(&xtfs->drop_lock);
++
++	return HRTIMER_NORESTART;
++}
++
+ /* ================================= */
+ /* IPTFS Sending (ingress) Functions */
+ /* ================================= */
+@@ -1251,6 +1642,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,
+ 	xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize;
+ 	xtfs->init_delay_ns =
+ 		(u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC;
++	xtfs->drop_time_ns =
++		(u64)net->xfrm.sysctl_iptfs_drop_time * NSECS_IN_USEC;
+
+ 	if (attrs[XFRMA_IPTFS_DONT_FRAG])
+ 		xc->dont_frag = true;
+@@ -1268,6 +1661,10 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,
+ 	}
+ 	if (attrs[XFRMA_IPTFS_MAX_QSIZE])
+ 		xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
++	if (attrs[XFRMA_IPTFS_DROP_TIME])
++		xtfs->drop_time_ns =
++			(u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) *
++			NSECS_IN_USEC;
+ 	if (attrs[XFRMA_IPTFS_INIT_DELAY])
+ 		xtfs->init_delay_ns =
+ 			(u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) *
+@@ -1317,7 +1714,9 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
+ 	if (ret)
+ 		return ret;
+
+-	ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, 0);
++	q = xtfs->drop_time_ns;
++	(void)do_div(q, NSECS_IN_USEC);
++	ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q);
+ 	if (ret)
+ 		return ret;
+
+@@ -1335,6 +1734,10 @@ static int __iptfs_init_state(struct xfrm_state *x,
+ 	hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+ 	xtfs->iptfs_timer.function = iptfs_delay_timer;
+
++	spin_lock_init(&xtfs->drop_lock);
++	hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
++	xtfs->drop_timer.function = iptfs_drop_timer;
++
+ 	/* Modify type (esp) adjustment values */
+
+ 	if (x->props.family == AF_INET)
+@@ -1362,6 +1765,8 @@ static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig)
+ 	if (!xtfs)
+ 		return -ENOMEM;
+
++	xtfs->ra_newskb = NULL;
++
+ 	err = __iptfs_init_state(x, xtfs);
+ 	if (err)
+ 		return err;
+@@ -1392,7 +1797,14 @@ static void iptfs_delete_state(struct xfrm_state *x)
+ 	if (!xtfs)
+ 		return;
+
++	spin_lock_bh(&xtfs->drop_lock);
+ 	hrtimer_cancel(&xtfs->iptfs_timer);
++	hrtimer_cancel(&xtfs->drop_timer);
++	spin_unlock_bh(&xtfs->drop_lock);
++
++	if (xtfs->ra_newskb)
++		kfree_skb(xtfs->ra_newskb);
++
+ 	kfree_sensitive(xtfs);
+
+ 	module_put(x->mode_cbs->owner);
+--
+2.45.1
+
diff --git a/patches/v2/v2-0014-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch b/patches/v2/v2-0014-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch
new file mode 100644
index 0000000..73aca6a
--- /dev/null
+++ b/patches/v2/v2-0014-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch
@@ -0,0 +1,194 @@
+From 76a289b04ab44f381f5091870d4dfae17e54141d Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 3 Apr 2024 01:08:37 -0400
+Subject: [PATCH ipsec-next v2 14/17] xfrm: iptfs: add reusing received skb for
+ the tunnel egress packet
+
+Add an optimization that re-uses the received outer tunnel skb as the
+skb for the inner packet, avoiding an skb allocation and copy.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 126 +++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 105 insertions(+), 21 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 958aa3d0522f..a0d7abe4f0d0 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -579,19 +579,20 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 	struct ip_iptfs_cc_hdr iptcch;
+ 	struct skb_seq_state skbseq;
+ 	struct list_head sublist; /* rename this it's just a list */
+-	struct sk_buff *first_skb, *next;
++	struct sk_buff *first_skb, *defer, *next;
+ 	const unsigned char *old_mac;
+ 	struct xfrm_iptfs_data *xtfs;
+ 	struct ip_iptfs_hdr *ipth;
+ 	struct iphdr *iph;
+ 	struct net *net;
+-	u32 remaining, iplen, iphlen, data, tail;
++	u32 remaining, first_iplen, iplen, iphlen, data, tail;
+ 	u32 blkoff, capturelen;
+ 	u64 seq;
+
+ 	xtfs = x->mode_data;
+ 	net = dev_net(skb->dev);
+ 	first_skb = NULL;
++	defer = NULL;
+
+ 	seq = __esp_seq(skb);
+
+@@ -719,25 +720,94 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 		skb_prepare_seq_read(save, data, tail, &skbseq);
+ 	}
+
+-	if (!first_skb)
++	if (first_skb) {
++		skb = NULL;
++	} else {
+ 		first_skb = skb;
++		first_iplen = iplen;
++
++		/* We are going to skip over `data` bytes to reach the
++		 * start of the IP header of `iphlen` len for `iplen`
++		 * inner packet.
++		 */
++
++		if (skb_has_frag_list(skb)) {
++			defer = skb;
++			skb = NULL;
++		} else if (data + iphlen <= skb_headlen(skb) &&
++			   /* make sure our header is 32-bit aligned? */
++			   /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */
++			   skb_tailroom(skb) + tail - data >= iplen) {
++			/* Reuse the received skb.
++			 *
++			 * We have enough headlen to pull past any
++			 * initial fragment data, leaving at least the
++			 * IP header in the linear buffer space.
++			 *
++			 * For linear buffer space we only require that
++			 * linear buffer space is large enough to
++			 * eventually hold the entire reassembled
++			 * packet (by including tailroom in the check).
++			 *
++			 * For non-linear tailroom is 0 and so we only
++			 * re-use if the entire packet is present
++			 * already.
++			 *
++			 * NOTE: there are many more options for
++			 * sharing, KISS for now. Also, this can produce
++			 * skb's with the IP header unaligned to 32
++			 * bits. If that ends up being a problem then a
++			 * check should be added to the conditional
++			 * above that the header lies on a 32-bit
++			 * boundary as well.
++			 */
++			skb_pull(skb, data);
++
++			/* our range just changed */
++			data = 0;
++			tail = skb->len;
++			remaining = skb->len;
++
++			skb->protocol = protocol;
++			skb_mac_header_rebuild(skb);
++			if (skb->mac_len)
++				eth_hdr(skb)->h_proto = skb->protocol;
++
++			/* all pointers could be changed now reset walk */
++			skb_abort_seq_read(&skbseq);
++			skb_prepare_seq_read(skb, data, tail, &skbseq);
++		} else {
++			/* We couldn't reuse the input skb so allocate a
++			 * new one.
++			 */
++			defer = skb;
++			skb = NULL;
++		}
++
++		/* Don't trim `first_skb` until the end as we are
++		 * walking that data now.
++		 */
++	}
+
+ 		capturelen = min(iplen, remaining);
+-		skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, capturelen);
+ 		if (!skb) {
+-			/* skip to next packet or done */
+-			data += capturelen;
+-			continue;
+-		}
+-		BUG_ON(skb->len != capturelen);
+-
+-		skb->protocol = protocol;
+-		if (old_mac) {
+-			/* rebuild the mac header */
+-			skb_set_mac_header(skb, -first_skb->mac_len);
+-			memcpy(skb_mac_header(skb), old_mac,
+-			       first_skb->mac_len);
+-			eth_hdr(skb)->h_proto = skb->protocol;
++			skb = iptfs_pskb_extract_seq(iplen, &skbseq, data,
++						     capturelen);
++			if (!skb) {
++				/* skip to next packet or done */
++				data += capturelen;
++				continue;
++			}
++			BUG_ON(skb->len != capturelen);
++
++			skb->protocol = protocol;
++			if (old_mac) {
++				/* rebuild the mac header */
++				skb_set_mac_header(skb, -first_skb->mac_len);
++				memcpy(skb_mac_header(skb), old_mac,
++				       first_skb->mac_len);
++				eth_hdr(skb)->h_proto = skb->protocol;
++			}
+ 		}
+
+ 		data += capturelen;
+@@ -772,8 +842,19 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 		/* this should not happen from the above code */
+ 		XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINIPTFSERROR);
+
++	if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) {
++		/* first_skb is queued b/c !defer and not partial */
++		if (pskb_trim(first_skb, first_iplen)) {
++			/* error trimming */
++			list_del(&first_skb->list);
++			defer = first_skb;
++		}
++		first_skb->ip_summed = CHECKSUM_NONE;
++	}
++
+ 	/* Send the packets! */
+ 	list_for_each_entry_safe(skb, next, &sublist, list) {
++		BUG_ON(skb == defer);
+ 		skb_list_del_init(skb);
+ 		if (xfrm_input(skb, 0, 0, -3))
+ 			kfree_skb(skb);
+@@ -783,12 +864,15 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 	skb = skbseq.root_skb;
+ 	skb_abort_seq_read(&skbseq);
+
+-	if (first_skb) {
+-		consume_skb(first_skb);
+-	} else {
++	if (defer) {
++		consume_skb(defer);
++	} else if (!first_skb) {
+ 		/* skb is the original passed in skb, but we didn't get far
+-		 * enough to process it as the first_skb.
++		 * enough to process it as the first_skb; if we had, it would
++		 * either be saved in ra_newskb, trimmed and sent on as an skb,
++		 * or placed in defer to be freed.
+ 		 */
++		BUG_ON(!skb);
+ 		kfree_skb(skb);
+ 	}
+
+--
+2.45.1
+
diff --git a/patches/v2/v2-0015-xfrm-iptfs-add-skb-fragment-sharing-code.patch b/patches/v2/v2-0015-xfrm-iptfs-add-skb-fragment-sharing-code.patch
new file mode 100644
index 0000000..2d85719
--- /dev/null
+++ b/patches/v2/v2-0015-xfrm-iptfs-add-skb-fragment-sharing-code.patch
@@ -0,0 +1,388 @@
+From 7209bc3a4b24d421b6cc5e2f632bd0f437a1354c Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 3 Apr 2024 12:53:38 -0400
+Subject: [PATCH ipsec-next v2 15/17] xfrm: iptfs: add skb-fragment sharing
+ code
+
+Avoid copying the inner packet data by sharing the skb data fragments
+from the outer packet skb into the new inner packet skb.
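+
+As an aside for reviewers, the core idea can be reduced to a minimal
+sketch (illustrative only, not part of the patch; the helper name is
+made up, and truesize accounting is elided): instead of copying payload
+bytes, the new skb takes an extra reference on the source page fragment:
+
+	/* Hypothetical sketch: share one source frag range into nskb,
+	 * assuming off + len fits within the source fragment.
+	 */
+	static void share_one_frag(struct sk_buff *nskb,
+				   const skb_frag_t *src, u32 off, u32 len)
+	{
+		struct skb_shared_info *shinfo = skb_shinfo(nskb);
+		skb_frag_t *dst = &shinfo->frags[shinfo->nr_frags++];
+
+		*dst = *src;			/* copy page/offset/size descriptor */
+		skb_frag_off_add(dst, off);	/* narrow to the shared range */
+		skb_frag_size_set(dst, len);
+		__skb_frag_ref(dst);		/* hold the page while nskb lives */
+		nskb->len += len;
+		nskb->data_len += len;
+	}
+
+The skb_prepare_frag_walk()/skb_can_add_frags()/skb_add_frags() helpers
+added below generalize this across the source skb's head and page
+fragments.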
+ +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 303 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 295 insertions(+), 8 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index a0d7abe4f0d0..f8b7cf6f4b01 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -33,6 +33,7 @@ + #define XFRM_IPTFS_MIN_L3HEADROOM 128 + #define XFRM_IPTFS_MIN_L2HEADROOM (64 + 16) + #define IPTFS_FRAG_COPY_MAX 256 /* max for copying to create iptfs frags */ ++#define IPTFS_PKT_SHARE_MIN 129 /* min to try to share vs copy pkt data */ + #define NSECS_IN_USEC 1000 + + #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT +@@ -159,6 +160,205 @@ static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); + } + ++/** ++ * struct skb_frag_walk - use to track a walk through fragments ++ * @fragi: current fragment index ++ * @past: length of data in fragments before @fragi ++ * @total: length of data in all fragments ++ * @nr_frags: number of fragments present in array ++ * @initial_offset: the value passed in to skb_prepare_frag_walk() ++ * @pp_recycle: copy of skb->pp_recycle ++ * @frags: the page fragments inc. room for head page ++ */ ++struct skb_frag_walk { ++ u32 fragi; ++ u32 past; ++ u32 total; ++ u32 nr_frags; ++ u32 initial_offset; ++ bool pp_recycle; ++ skb_frag_t frags[MAX_SKB_FRAGS + 1]; ++}; ++ ++/** ++ * skb_prepare_frag_walk() - initialize a frag walk over an skb. ++ * @skb: the skb to walk. ++ * @initial_offset: start the walk @initial_offset into the skb. ++ * @walk: the walk to initialize ++ * ++ * Future calls to skb_add_frags() will expect the @offset value to be at ++ * least @initial_offset large. ++ */ ++static void skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, ++ struct skb_frag_walk *walk) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frag, *from; ++ u32 i; ++ ++ walk->initial_offset = initial_offset; ++ walk->fragi = 0; ++ walk->past = 0; ++ walk->total = 0; ++ walk->nr_frags = 0; ++ walk->pp_recycle = skb->pp_recycle; ++ ++ if (skb->head_frag) { ++ if (initial_offset >= skb_headlen(skb)) { ++ initial_offset -= skb_headlen(skb); ++ } else { ++ frag = &walk->frags[walk->nr_frags++]; ++ skb_head_to_frag(skb, frag); ++ frag->offset += initial_offset; ++ frag->len -= initial_offset; ++ walk->total += frag->len; ++ initial_offset = 0; ++ } ++ } else { ++ BUG_ON(skb_headlen(skb) > initial_offset); ++ initial_offset -= skb_headlen(skb); ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) { ++ from = &shinfo->frags[i]; ++ if (initial_offset >= from->len) { ++ initial_offset -= from->len; ++ continue; ++ } ++ frag = &walk->frags[walk->nr_frags++]; ++ *frag = *from; ++ if (initial_offset) { ++ frag->offset += initial_offset; ++ frag->len -= initial_offset; ++ initial_offset = 0; ++ } ++ walk->total += frag->len; ++ } ++ BUG_ON(initial_offset != 0); ++} ++ ++static u32 __skb_reset_frag_walk(struct skb_frag_walk *walk, u32 offset) ++{ ++ /* Adjust offset to refer to internal walk values */ ++ BUG_ON(offset < walk->initial_offset); ++ offset -= walk->initial_offset; ++ ++ /* Get to the correct fragment for offset */ ++ while (offset < walk->past) { ++ walk->past -= walk->frags[--walk->fragi].len; ++ if (offset >= walk->past) ++ break; ++ BUG_ON(walk->fragi == 0); ++ } ++ while (offset >= walk->past + walk->frags[walk->fragi].len) ++ walk->past += walk->frags[walk->fragi++].len; ++ ++ /* offset now relative to this 
current frag */ ++ offset -= walk->past; ++ return offset; ++} ++ ++/** ++ * skb_can_add_frags() - check if ok to add frags from walk to skb ++ * @skb: skb to check for adding frags to ++ * @walk: the walk that will be used as source for frags. ++ * @offset: offset from beginning of original skb to start from. ++ * @len: amount of data to add frag references to in @skb. ++ */ ++static bool skb_can_add_frags(const struct sk_buff *skb, ++ struct skb_frag_walk *walk, u32 offset, u32 len) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ u32 fragi, nr_frags, fraglen; ++ ++ if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) ++ return false; ++ ++ /* Make offset relative to current frag after setting that */ ++ offset = __skb_reset_frag_walk(walk, offset); ++ ++ /* Verify we have array space for the fragments we need to add */ ++ fragi = walk->fragi; ++ nr_frags = shinfo->nr_frags; ++ while (len && fragi < walk->nr_frags) { ++ skb_frag_t *frag = &walk->frags[fragi]; ++ ++ fraglen = frag->len; ++ if (offset) { ++ fraglen -= offset; ++ offset = 0; ++ } ++ if (++nr_frags > MAX_SKB_FRAGS) ++ return false; ++ if (len <= fraglen) ++ return true; ++ len -= fraglen; ++ fragi++; ++ } ++ /* We may not copy all @len but what we have will fit. */ ++ return true; ++} ++ ++/** ++ * skb_add_frags() - add a range of fragment references into an skb ++ * @skb: skb to add references into ++ * @walk: the walk to add referenced fragments from. ++ * @offset: offset from beginning of original skb to start from. ++ * @len: amount of data to add frag references to in @skb. ++ * ++ * skb_can_add_frags() should be called before this function to verify that the ++ * destination @skb is compatible with the walk and has space in the array for ++ * the to be added frag references. ++ * ++ * Return: The number of bytes not added to @skb b/c we reached the end of the ++ * walk before adding all of @len. 
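++ *
++ * A typical calling sequence (as used by iptfs_pskb_add_frags() below)
++ * is to first verify space with skb_can_add_frags() and then call this
++ * function to take the shared page references.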
++ */ ++static int skb_add_frags(struct sk_buff *skb, struct skb_frag_walk *walk, ++ u32 offset, u32 len) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ u32 fraglen; ++ ++ BUG_ON(skb->pp_recycle != walk->pp_recycle); ++ if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) ++ return len; ++ ++ /* make offset relative to current frag after setting that */ ++ offset = __skb_reset_frag_walk(walk, offset); ++ BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); ++ ++ while (len && walk->fragi < walk->nr_frags) { ++ skb_frag_t *frag = &walk->frags[walk->fragi]; ++ skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; ++ ++ *tofrag = *frag; ++ if (offset) { ++ tofrag->offset += offset; ++ tofrag->len -= offset; ++ offset = 0; ++ } ++ __skb_frag_ref(tofrag); ++ shinfo->nr_frags++; ++ BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS); ++ ++ /* see if we are done */ ++ fraglen = tofrag->len; ++ if (len < fraglen) { ++ tofrag->len = len; ++ skb->len += len; ++ skb->data_len += len; ++ return 0; ++ } ++ /* advance to next source fragment */ ++ len -= fraglen; /* careful, use dst bv_len */ ++ skb->len += fraglen; /* careful, " " " */ ++ skb->data_len += fraglen; /* careful, " " " */ ++ walk->past += frag->len; /* careful, use src bv_len */ ++ walk->fragi++; ++ } ++ return len; ++} ++ + /** + * skb_copy_bits_seq - copy bits from a skb_seq_state to kernel buffer + * @st: source skb_seq_state +@@ -196,6 +396,53 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, + /* IPTFS Receiving (egress) Functions */ + /* ================================== */ + ++/** ++ * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. ++ * @tpl: template to create new skb from. ++ * @walk: The source for fragments to add. ++ * @off: The offset into @walk to add frags from, also used with @st and ++ * @copy_len. ++ * @len: The length of data to add covering frags from @walk into @skb. ++ * This must be <= @skblen. ++ * @st: The sequence state to copy from into the new head skb. ++ * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb ++ * linear space. ++ * ++ * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from ++ * @st into the new skb linear space, and then add shared fragments from the ++ * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). ++ * ++ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. ++ */ ++static struct sk_buff *iptfs_pskb_add_frags(struct sk_buff *tpl, ++ struct skb_frag_walk *walk, u32 off, ++ u32 len, struct skb_seq_state *st, ++ u32 copy_len) ++{ ++ struct sk_buff *skb; ++ ++ skb = iptfs_alloc_skb(tpl, copy_len, false); ++ if (!skb) ++ return NULL; ++ ++ /* this should not normally be happening */ ++ if (!skb_can_add_frags(skb, walk, off + copy_len, len - copy_len)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ if (copy_len && ++ skb_copy_bits_seq(st, off, skb_put(skb, copy_len), copy_len)) { ++ XFRM_INC_STATS(dev_net(st->root_skb->dev), ++ LINUX_MIB_XFRMINERROR); ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ skb_add_frags(skb, walk, off + copy_len, len - copy_len); ++ return skb; ++} ++ + /** + * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. + * @skblen: the total data size for `skb`. 
+@@ -380,6 +627,8 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+ 			      struct skb_seq_state *st, struct sk_buff *skb,
+ 			      u32 data, u32 blkoff, struct list_head *list)
+ {
++	struct skb_frag_walk _fragwalk;
++	struct skb_frag_walk *fragwalk = NULL;
+ 	struct sk_buff *newskb = xtfs->ra_newskb;
+ 	u32 remaining = skb->len - data;
+ 	u32 runtlen = xtfs->ra_runtlen;
+@@ -533,13 +782,31 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+ 	fraglen = min(blkoff, remaining);
+ 	copylen = min(fraglen, ipremain);
+
+-	/* We verified this was true in the main receive routine */
+-	BUG_ON(skb_tailroom(newskb) < copylen);
++	/* If we may have the opportunity to share, prepare a fragwalk. */
++	if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) &&
++	    (skb->head_frag || skb->len == skb->data_len) &&
++	    skb->pp_recycle == newskb->pp_recycle) {
++		fragwalk = &_fragwalk;
++		skb_prepare_frag_walk(skb, data, fragwalk);
++	}
+
+-	/* copy fragment data into newskb */
+-	if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) {
+-		XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR);
+-		goto abandon;
++	/* Try share then copy. */
++	if (fragwalk && skb_can_add_frags(newskb, fragwalk, data, copylen)) {
++		u32 leftover;
++
++		leftover = skb_add_frags(newskb, fragwalk, data, copylen);
++		BUG_ON(leftover != 0);
++	} else {
++		/* We verified this was true in the main receive routine */
++		BUG_ON(skb_tailroom(newskb) < copylen);
++
++		/* copy fragment data into newskb */
++		if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen),
++				      copylen)) {
++			XFRM_INC_STATS(dev_net(skb->dev),
++				       LINUX_MIB_XFRMINBUFFERERROR);
++			goto abandon;
++		}
+ 	}
+
+ 	if (copylen < ipremain) {
+@@ -578,6 +845,8 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 	u8 hbytes[sizeof(struct ipv6hdr)];
+ 	struct ip_iptfs_cc_hdr iptcch;
+ 	struct skb_seq_state skbseq;
++	struct skb_frag_walk _fragwalk;
++	struct skb_frag_walk *fragwalk = NULL;
+ 	struct list_head sublist; /* rename this it's just a list */
+ 	struct sk_buff *first_skb, *defer, *next;
+ 	const unsigned char *old_mac;
+@@ -725,6 +994,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 	} else {
+ 		first_skb = skb;
+ 		first_iplen = iplen;
++		fragwalk = NULL;
+
+ 		/* We are going to skip over `data` bytes to reach the
+ 		 * start of the IP header of `iphlen` len for `iplen`
+@@ -776,6 +1046,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 			/* all pointers could be changed now reset walk */
+ 			skb_abort_seq_read(&skbseq);
+ 			skb_prepare_seq_read(skb, data, tail, &skbseq);
++		} else if (skb->head_frag &&
++			   /* We have the IP header right now */
++			   remaining >= iphlen) {
++			fragwalk = &_fragwalk;
++			skb_prepare_frag_walk(skb, data, fragwalk);
++			defer = skb;
++			skb = NULL;
+ 		} else {
+ 			/* We couldn't reuse the input skb so allocate a
+ 			 * new one.
+@@ -791,8 +1068,18 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+
+ 		capturelen = min(iplen, remaining);
+ 		if (!skb) {
+-			skb = iptfs_pskb_extract_seq(iplen, &skbseq, data,
+-						     capturelen);
++			if (!fragwalk ||
++			    /* Large enough to be worth sharing */
++			    iplen < IPTFS_PKT_SHARE_MIN ||
++			    /* Have IP header + some data to share. */
++			    capturelen <= iphlen ||
++			    /* Try creating skb and adding frags */
++			    !(skb = iptfs_pskb_add_frags(first_skb, fragwalk,
++							 data, capturelen,
++							 &skbseq, iphlen))) {
++				skb = iptfs_pskb_extract_seq(iplen, &skbseq,
++							     data, capturelen);
++			}
+ 			if (!skb) {
+ 				/* skip to next packet or done */
+ 				data += capturelen;
+--
+2.45.1
+
diff --git a/patches/v2/v2-0016-xfrm-iptfs-handle-reordering-of-received-packets.patch b/patches/v2/v2-0016-xfrm-iptfs-handle-reordering-of-received-packets.patch
new file mode 100644
index 0000000..9e1967b
--- /dev/null
+++ b/patches/v2/v2-0016-xfrm-iptfs-handle-reordering-of-received-packets.patch
@@ -0,0 +1,623 @@
+From 941729c69756e194c7f91c98e411f7a6daa2d6ba Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 3 Apr 2024 01:10:49 -0400
+Subject: [PATCH ipsec-next v2 16/17] xfrm: iptfs: handle reordering of
+ received packets
+
+Handle the receipt of the outer tunnel packets out-of-order. Pointers to
+the out-of-order packets are saved in a window (array) awaiting the
+needed prior packets. When the required prior packets are received, the
+now in-order packets are then passed on to the regular packet receive
+code. A timer is used to consider missing earlier packets as lost so the
+algorithm will advance.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 487 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 475 insertions(+), 12 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index f8b7cf6f4b01..12b59411fbf0 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -38,8 +38,14 @@
+
+ #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
++struct skb_wseq {
++	struct sk_buff *skb;
++	u64 drop_time;
++};
++
+ struct xfrm_iptfs_config {
+ 	bool dont_frag : 1;
++	u16 reorder_win_size;
+ 	u32 pkt_size;	    /* outer_packet_size or 0 */
+ 	u32 max_queue_size; /* octets */
+ };
+@@ -57,12 +63,16 @@ struct xfrm_iptfs_data {
+ 	time64_t iptfs_settime;	    /* time timer was set */
+ 	u32 payload_mtu;	    /* max payload size */
+
+-	/* Tunnel egress */
++	/* Tunnel input reordering */
++	bool w_seq_set;		  /* true after first seq received */
++	u64 w_wantseq;		  /* expected next sequence */
++	struct skb_wseq *w_saved; /* the saved buf array */
++	u32 w_savedlen;		  /* the saved len (not size) */
+ 	spinlock_t drop_lock;
+ 	struct hrtimer drop_timer;
+ 	u64 drop_time_ns;
+
+-	/* Tunnel egress reassembly */
++	/* Tunnel input reassembly */
+ 	struct sk_buff *ra_newskb; /* new pkt being reassembled */
+ 	u64 ra_wantseq;		   /* expected next sequence */
+ 	u8 ra_runt[6];		   /* last pkt bytes from last skb */
+@@ -834,13 +844,13 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+ }
+
+ /**
+- * iptfs_input() - handle receipt of iptfs payload
++ * iptfs_input_ordered() - handle the next in-order IPTFS payload.
+  * @x: xfrm state
+- * @skb: the packet
++ * @skb: current packet
+  *
+  * Process the IPTFS payload in `skb` and consume it afterwards.
+  */
+-static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
++static int iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
+ {
+ 	u8 hbytes[sizeof(struct ipv6hdr)];
+ 	struct ip_iptfs_cc_hdr iptcch;
+@@ -1163,11 +1173,375 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 		kfree_skb(skb);
+ 	}
+
+-	/* We always have dealt with the input SKB, either we are re-using it,
+-	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
+-	 * processing it.
++ return 0; ++} ++ ++/* ------------------------------- */ ++/* Input (Egress) Re-ordering Code */ ++/* ------------------------------- */ ++ ++static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift) ++{ ++ u32 savedlen = xtfs->w_savedlen; ++ ++ if (shift > savedlen) ++ shift = savedlen; ++ if (shift != savedlen) ++ memcpy(xtfs->w_saved, xtfs->w_saved + shift, ++ (savedlen - shift) * sizeof(*xtfs->w_saved)); ++ memset(xtfs->w_saved + savedlen - shift, 0, ++ shift * sizeof(*xtfs->w_saved)); ++ xtfs->w_savedlen -= shift; ++} ++ ++static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, ++ struct list_head *freelist) ++{ ++ list_add_tail(&inskb->list, freelist); ++} ++ ++static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list) ++ ++{ ++ struct skb_wseq *s, *se; ++ const u32 savedlen = xtfs->w_savedlen; ++ time64_t now = ktime_get_raw_fast_ns(); ++ u32 count = 0; ++ u32 scount = 0; ++ ++ BUG_ON(!savedlen); ++ if (xtfs->w_saved[0].drop_time > now) ++ goto set_timer; ++ ++ ++xtfs->w_wantseq; ++ ++ /* Keep flushing packets until we reach a drop time greater than now. */ ++ s = xtfs->w_saved; ++ se = s + savedlen; ++ do { ++ /* Walking past empty slots until we reach a packet */ ++ for (; s < se && !s->skb; s++) ++ if (s->drop_time > now) ++ goto outerdone; ++ /* Sending packets until we hit another empty slot. */ ++ for (; s < se && s->skb; scount++, s++) ++ list_add_tail(&s->skb->list, list); ++ } while (s < se); ++outerdone: ++ ++ count = s - xtfs->w_saved; ++ if (count) { ++ xtfs->w_wantseq += count; ++ ++ /* Shift handled slots plus final empty slot into slot 0. */ ++ __vec_shift(xtfs, count); ++ } ++ ++ if (xtfs->w_savedlen) { ++set_timer: ++ /* Drifting is OK */ ++ hrtimer_start(&xtfs->drop_timer, ++ xtfs->w_saved[0].drop_time - now, ++ IPTFS_HRTIMER_MODE); ++ } ++ return scount; ++} ++ ++static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, ++ struct list_head *list) ++{ ++ struct skb_wseq *s, *se; ++ const u32 savedlen = xtfs->w_savedlen; ++ u32 count = 0; ++ ++ /* Got what we wanted. */ ++ list_add_tail(&inskb->list, list); ++ ++xtfs->w_wantseq; ++ if (!savedlen) ++ return; ++ ++ /* Flush remaining consecutive packets. */ ++ ++ /* Keep sending until we hit another missed pkt. */ ++ for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) ++ list_add_tail(&s->skb->list, list); ++ count = s - xtfs->w_saved; ++ if (count) ++ xtfs->w_wantseq += count; ++ ++ /* Shift handled slots plus final empty slot into slot 0. */ ++ __vec_shift(xtfs, count + 1); ++} ++ ++/* Set the slot's drop time and all the empty slots below it until reaching a ++ * filled slot which will already be set. ++ */ ++static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) ++{ ++ const u32 savedlen = xtfs->w_savedlen; ++ struct skb_wseq *s = xtfs->w_saved; ++ time64_t drop_time; ++ ++ assert_spin_locked(&xtfs->drop_lock); ++ ++ if (savedlen > index + 1) { ++ /* we are below another, our drop time and the timer are already set */ ++ BUG_ON(xtfs->w_saved[index + 1].drop_time != ++ xtfs->w_saved[index].drop_time); ++ return; ++ } ++ /* we are the most future so get a new drop time. 
++	 */
++	drop_time = ktime_get_raw_fast_ns();
++	drop_time += xtfs->drop_time_ns;
++
++	/* Walk back through the array setting drop times as we go */
++	s[index].drop_time = drop_time;
++	while (index-- > 0 && !s[index].skb)
++		s[index].drop_time = drop_time;
++
++	/* If we walked all the way back, schedule the drop timer if needed */
++	if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer))
++		hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns,
++			      IPTFS_HRTIMER_MODE);
++}
++
++static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
++				  struct sk_buff *inskb,
++				  struct list_head *freelist)
++{
++	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
++	const u64 inseq = __esp_seq(inskb);
++	const u64 wantseq = xtfs->w_wantseq;
++	const u64 distance = inseq - wantseq;
++	const u32 savedlen = xtfs->w_savedlen;
++	const u32 index = distance - 1;
++
++	BUG_ON(distance >= nslots);
++
++	/* Handle a future sequence number received which fits in the window.
++	 *
++	 * We know we don't have the seq we want so we won't be able to flush
++	 * anything.
++	 */
++
++	/* slot count is 4, saved size is 3, savedlen is 2
++	 *
++	 * "window boundary" is based on the fixed window size
++	 * distance is also slot number
++	 * index is an array index (i.e., - 1 of slot)
++	 * : : - implicit NULL after array len
++	 *
++	 *          +--------- used length (savedlen == 2)
++	 *          |   +----- array size (nslots - 1 == 3)
++	 *          |   |   +- window boundary (nslots == 4)
++	 *          V   V   | V
++	 *                  |
++	 *   0    1   2   3 |   slot number
++	 *  ---   0   1   2 |   array index
++	 *       [-] [b]  : :   array
++	 *
++	 *  "2"  "3" "4" *5*|   seq numbers
++	 *
++	 * We receive seq number 5
++	 * distance == 3 [inseq(5) - w_wantseq(2)]
++	 * index == 2 [distance(3) - 1]
++	 */
++
++	if (xtfs->w_saved[index].skb) {
++		/* a dup of a future */
++		list_add_tail(&inskb->list, freelist);
++		return;
++	}
++
++	xtfs->w_saved[index].skb = inskb;
++	xtfs->w_savedlen = max(savedlen, index + 1);
++	iptfs_set_window_drop_times(xtfs, index);
++}
++
++static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
++				    struct sk_buff *inskb,
++				    struct list_head *list,
++				    struct list_head *freelist)
++{
++	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
++	const u64 inseq = __esp_seq(inskb);
++	u32 savedlen = xtfs->w_savedlen;
++	u64 wantseq = xtfs->w_wantseq;
++	struct sk_buff *slot0 = NULL;
++	u64 distance, extra_drops, s0seq;
++	struct skb_wseq *wnext;
++	u32 beyond, shifting, slot;
++
++	BUG_ON(inseq <= wantseq);
++	distance = inseq - wantseq;
++	BUG_ON(distance <= nslots - 1);
++	beyond = distance - (nslots - 1);
++
++	/* Handle future sequence number received.
++	 *
++	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
++	 * b/c we are beyond the window boundary.
++	 *
++	 * We know we don't have the wantseq so that counts as a drop.
++	 */
++
++	/* ex: slot count is 4, array size is 3, savedlen is 2, slot 0 is the
++	 * missing sequence number.
++	 *
++	 * the final slot at savedlen (index savedlen - 1) is always occupied.
++	 *
++	 * beyond is "beyond array size" not savedlen.
++ * ++ * +--------- array length (savedlen == 2) ++ * | +----- array size (nslots - 1 == 3) ++ * | | +- window boundary (nslots == 4) ++ * V V | V ++ * | ++ * 0 1 2 3 | slot number ++ * --- 0 1 2 | array index ++ * [b] [c] : :| array ++ * | ++ * "2" "3" "4" "5"|*6* seq numbers ++ * ++ * We receive seq number 6 ++ * distance == 4 [inseq(6) - w_wantseq(2)] ++ * newslot == distance ++ * index == 3 [distance(4) - 1] ++ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] ++ * shifting == 1 [min(savedlen(2), beyond(1)] ++ * slot0_skb == [b], and should match w_wantseq ++ * ++ * +--- window boundary (nslots == 4) ++ * 0 1 2 3 | 4 slot number ++ * --- 0 1 2 | 3 array index ++ * [b] : : : :| array ++ * "2" "3" "4" "5" *6* seq numbers ++ * ++ * We receive seq number 6 ++ * distance == 4 [inseq(6) - w_wantseq(2)] ++ * newslot == distance ++ * index == 3 [distance(4) - 1] ++ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] ++ * shifting == 1 [min(savedlen(1), beyond(1)] ++ * slot0_skb == [b] and should match w_wantseq ++ * ++ * +-- window boundary (nslots == 4) ++ * 0 1 2 3 | 4 5 6 slot number ++ * --- 0 1 2 | 3 4 5 array index ++ * [-] [c] : :| array ++ * "2" "3" "4" "5" "6" "7" *8* seq numbers ++ * ++ * savedlen = 2, beyond = 3 ++ * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-] ++ * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c] ++ * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1) ++ * ++ * We receive seq number 8 ++ * distance == 6 [inseq(8) - w_wantseq(2)] ++ * newslot == distance ++ * index == 5 [distance(6) - 1] ++ * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))] ++ * shifting == 2 [min(savedlen(2), beyond(3)] ++ * ++ * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true. ++ */ ++ ++ /* Now send any packets that are being shifted out of saved, and account ++ * for missing packets that are exiting the window as we shift it. ++ */ ++ ++ /* If savedlen > beyond we are shifting some, else all. */ ++ shifting = min(savedlen, beyond); ++ ++ /* slot0 is the buf that just shifted out and into slot0 */ ++ slot0 = NULL; ++ s0seq = wantseq; ++ wnext = xtfs->w_saved; ++ for (slot = 1; slot <= shifting; slot++, wnext++) { ++ /* handle what was in slot0 before we occupy it */ ++ if (slot0) ++ list_add_tail(&slot0->list, list); ++ s0seq++; ++ slot0 = wnext->skb; ++ wnext->skb = NULL; ++ } ++ ++ /* slot0 is now either NULL (in which case it's what we now are waiting ++ * for, or a buf in which case we need to handle it like we received it; ++ * however, we may be advancing past that buffer as well.. ++ */ ++ ++ /* Handle case where we need to shift more than we had saved, slot0 will ++ * be NULL iff savedlen is 0, otherwise slot0 will always be ++ * non-NULL b/c we shifted the final element, which is always set if ++ * there is any saved, into slot0. ++ */ ++ if (savedlen < beyond) { ++ extra_drops = beyond - savedlen; ++ if (savedlen == 0) { ++ BUG_ON(slot0); ++ s0seq += extra_drops; ++ } else { ++ extra_drops--; /* we aren't dropping what's in slot0 */ ++ BUG_ON(!slot0); ++ list_add_tail(&slot0->list, list); ++ s0seq += extra_drops + 1; ++ } ++ slot0 = NULL; ++ /* slot0 has had an empty slot pushed into it */ ++ } ++ ++ /* Remove the entries */ ++ __vec_shift(xtfs, beyond); ++ ++ /* Advance want seq */ ++ xtfs->w_wantseq += beyond; ++ ++ /* Process drops here when implementing congestion control */ ++ ++ /* We've shifted. plug the packet in at the end. 
++	 */
++	xtfs->w_savedlen = nslots - 1;
++	xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
++	iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);
++
++	/* if we don't have a slot0 then we must wait for it */
++	if (!slot0)
++		return;
++
++	/* If slot0, seq must match new want seq */
++	BUG_ON(xtfs->w_wantseq != __esp_seq(slot0));
++
++	/* slot0 is valid, treat it like we received the expected packet. */
++	__reorder_this(xtfs, slot0, list);
++}
++
++/* Receive a new packet into the reorder window. Return a list of ordered
++ * packets from the window.
++ */
++static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
++				struct sk_buff *inskb, struct list_head *list,
++				struct list_head *freelist)
++{
++	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
++	u64 inseq = __esp_seq(inskb);
++	u64 wantseq;
++
++	assert_spin_locked(&xtfs->drop_lock);
++
++	if (unlikely(!xtfs->w_seq_set)) {
++		xtfs->w_seq_set = true;
++		xtfs->w_wantseq = inseq;
++	}
++	wantseq = xtfs->w_wantseq;
++
++	if (likely(inseq == wantseq))
++		__reorder_this(xtfs, inskb, list);
++	else if (inseq < wantseq)
++		__reorder_past(xtfs, inskb, freelist);
++	else if ((inseq - wantseq) < nslots)
++		__reorder_future_fits(xtfs, inskb, freelist);
++	else
++		__reorder_future_shifts(xtfs, inskb, list, freelist);
+ }
+
+ /**
+@@ -1192,23 +1566,90 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+  */
+ static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
+ {
++	struct sk_buff *skb, *next;
++	struct list_head freelist, list;
+ 	struct xfrm_iptfs_data *xtfs;
+ 	struct xfrm_state *x;
++	u32 count;
+
+ 	xtfs = container_of(me, typeof(*xtfs), drop_timer);
+ 	x = xtfs->x;
+
+-	/* Drop any in progress packet */
+ 	spin_lock(&xtfs->drop_lock);
++
++	INIT_LIST_HEAD(&list);
++	INIT_LIST_HEAD(&freelist);
++
++	/* Drop any in progress packet */
++
+ 	if (xtfs->ra_newskb) {
+ 		kfree_skb(xtfs->ra_newskb);
+ 		xtfs->ra_newskb = NULL;
+ 	}
++
++	/* Now drop as many packets as we should from the reordering window
++	 * saved array
++	 */
++	count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0;
++
+ 	spin_unlock(&xtfs->drop_lock);
+
++	if (count) {
++		list_for_each_entry_safe(skb, next, &list, list) {
++			skb_list_del_init(skb);
++			(void)iptfs_input_ordered(x, skb);
++		}
++	}
+ 	return HRTIMER_NORESTART;
+ }
+
++/**
++ * iptfs_input() - handle receipt of iptfs payload
++ * @x: xfrm state
++ * @skb: the packet
++ *
++ * We have an IPTFS payload; order it if needed, then process the newly
++ * in-order packets.
++ */
++static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
++{
++	struct list_head freelist, list;
++	struct xfrm_iptfs_data *xtfs = x->mode_data;
++	struct sk_buff *next;
++
++	/* Fast path for no reorder window. */
++	if (xtfs->cfg.reorder_win_size == 0) {
++		iptfs_input_ordered(x, skb);
++		goto done;
++	}
++
++	/* Fetch list of in-order packets from the reordering window as well as
++	 * a list of buffers we now need to free.
++	 */
++	INIT_LIST_HEAD(&list);
++	INIT_LIST_HEAD(&freelist);
++
++	spin_lock(&xtfs->drop_lock);
++	iptfs_input_reorder(xtfs, skb, &list, &freelist);
++	spin_unlock(&xtfs->drop_lock);
++
++	list_for_each_entry_safe(skb, next, &list, list) {
++		skb_list_del_init(skb);
++		(void)iptfs_input_ordered(x, skb);
++	}
++
++	list_for_each_entry_safe(skb, next, &freelist, list) {
++		skb_list_del_init(skb);
++		kfree_skb(skb);
++	}
++done:
++	/* We have always dealt with the input SKB: either we are re-using it,
++	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
++	 * processing it.
++ */ ++ return -EINPROGRESS; ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -2010,6 +2451,7 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + struct xfrm_iptfs_config *xc; + + xc = &xtfs->cfg; ++ xc->reorder_win_size = net->xfrm.sysctl_iptfs_reorder_window; + xc->max_queue_size = net->xfrm.sysctl_iptfs_max_qsize; + xtfs->init_delay_ns = + (u64)net->xfrm.sysctl_iptfs_init_delay * NSECS_IN_USEC; +@@ -2018,6 +2460,13 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) + xc->dont_frag = true; ++ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) ++ xc->reorder_win_size = ++ nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); ++ /* saved array is for saving 1..N seq nums from wantseq */ ++ if (xc->reorder_win_size) ++ xtfs->w_saved = kcalloc(xc->reorder_win_size, ++ sizeof(*xtfs->w_saved), GFP_KERNEL); + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { +@@ -2054,7 +2503,7 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + + if (xc->dont_frag) + l += nla_total_size(0); +- l += nla_total_size(sizeof(u16)); ++ l += nla_total_size(sizeof(xc->reorder_win_size)); + l += nla_total_size(sizeof(xc->pkt_size)); + l += nla_total_size(sizeof(xc->max_queue_size)); + l += nla_total_size(sizeof(u32)); /* drop time usec */ +@@ -2075,7 +2524,7 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + if (ret) + return ret; + } +- ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, 0); ++ ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, xc->reorder_win_size); + if (ret) + return ret; + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); +@@ -2137,6 +2586,14 @@ static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) + return -ENOMEM; + + xtfs->ra_newskb = NULL; ++ if (xtfs->cfg.reorder_win_size) { ++ xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, ++ sizeof(*xtfs->w_saved), GFP_KERNEL); ++ if (!xtfs->w_saved) { ++ kfree_sensitive(xtfs); ++ return -ENOMEM; ++ } ++ } + + err = __iptfs_init_state(x, xtfs); + if (err) +@@ -2164,6 +2621,7 @@ static int iptfs_create_state(struct xfrm_state *x) + static void iptfs_delete_state(struct xfrm_state *x) + { + struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct skb_wseq *s, *se; + + if (!xtfs) + return; +@@ -2176,6 +2634,11 @@ static void iptfs_delete_state(struct xfrm_state *x) + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + ++ for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) ++ if (s->skb) ++ kfree_skb(s->skb); ++ ++ kfree_sensitive(xtfs->w_saved); + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +-- +2.45.1 + diff --git a/patches/v2/v2-0017-xfrm-iptfs-add-tracepoint-functionality.patch b/patches/v2/v2-0017-xfrm-iptfs-add-tracepoint-functionality.patch new file mode 100644 index 0000000..07db10f --- /dev/null +++ b/patches/v2/v2-0017-xfrm-iptfs-add-tracepoint-functionality.patch @@ -0,0 +1,410 @@ +From fe55dee6b92d6e91df99e3cb87455738da431156 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 01:11:30 -0400 +Subject: [PATCH ipsec-next v2 17/17] xfrm: iptfs: add tracepoint functionality + +Add tracepoints to the IP-TFS code. 
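+
+As with other trace event subsystems, the new events can be enabled at
+runtime via the standard tracefs interface, e.g. (illustrative usage,
+not part of the change itself):
+
+  echo 1 > /sys/kernel/tracing/events/iptfs/enable
+  cat /sys/kernel/tracing/trace_pipe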
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/trace_iptfs.h | 218 +++++++++++++++++++++++++++++++++++++++++
+ net/xfrm/xfrm_iptfs.c  |  60 ++++++++++++
+ 2 files changed, 278 insertions(+)
+ create mode 100644 net/xfrm/trace_iptfs.h
+
+diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h
+new file mode 100644
+index 000000000000..0425f051572f
+--- /dev/null
++++ b/net/xfrm/trace_iptfs.h
+@@ -0,0 +1,218 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/* xfrm_trace_iptfs.h
++ *
++ * August 12 2023, Christian Hopps
++ *
++ * Copyright (c) 2023, LabN Consulting, L.L.C.
++ */
++
++#undef TRACE_SYSTEM
++#define TRACE_SYSTEM iptfs
++
++#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ)
++#define _TRACE_IPTFS_H
++
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <linux/tracepoint.h>
++#include <net/ip.h>
++
++struct xfrm_iptfs_data;
++
++TRACE_EVENT(iptfs_egress_recv,
++	    TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff),
++	    TP_ARGS(skb, xtfs, blkoff),
++	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
++			     __field(void *, head)
++			     __field(void *, head_pg_addr)
++			     __field(void *, pg0addr)
++			     __field(u32, skb_len)
++			     __field(u32, data_len)
++			     __field(u32, headroom)
++			     __field(u32, tailroom)
++			     __field(u32, tail)
++			     __field(u32, end)
++			     __field(u32, pg0off)
++			     __field(u8, head_frag)
++			     __field(u8, frag_list)
++			     __field(u8, nr_frags)
++			     __field(u16, blkoff)),
++	    TP_fast_assign(__entry->skb = skb;
++			   __entry->head = skb->head;
++			   __entry->skb_len = skb->len;
++			   __entry->data_len = skb->data_len;
++			   __entry->headroom = skb_headroom(skb);
++			   __entry->tailroom = skb_tailroom(skb);
++			   __entry->tail = skb->tail;
++			   __entry->end = skb->end;
++			   __entry->head_frag = skb->head_frag;
++			   __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
++			   __entry->nr_frags = skb_shinfo(skb)->nr_frags;
++			   __entry->blkoff = blkoff;
++			   __entry->head_pg_addr = page_address(virt_to_head_page(skb->head));
++			   __entry->pg0addr = (__entry->nr_frags
++					       ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem))
++					       : NULL);
++			   __entry->pg0off = (__entry->nr_frags
++					      ?
skb_shinfo(skb)->frags[0].offset ++ : 0); ++ ), ++ TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, ++ __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, ++ __entry->tailroom, __entry->tail, __entry->end, __entry->head, ++ __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, ++ __entry->pg0off) ++ ) ++ ++DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, ++ u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u32, pmtu) ++ __field(u32, queue_size) ++ __field(u32, proto_seq) ++ __field(u8, proto) ++ __field(u8, was_gso) ++ ), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->queue_size = ++ xtfs->cfg.max_queue_size - xtfs->queue_size; ++ __entry->proto = __trace_ip_proto(ip_hdr(skb)); ++ __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); ++ __entry->pmtu = pmtu; ++ __entry->was_gso = was_gso; ++ ), ++ TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, ++ __entry->queue_size, __entry->proto, __entry->proto_seq, ++ __entry->pmtu, __entry->was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u32, mtu) ++ __field(u32, proto_seq) ++ __field(u16, blkoff) ++ __field(u8, proto)), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->mtu = mtu; ++ __entry->blkoff = blkoff; ++ __entry->proto = iph ? __trace_ip_proto(iph) : 0; ++ __entry->proto_seq = iph ? 
__trace_ip_proto_seq(iph) : 0;
++			   ),
++	    TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u",
++		      __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu,
++		      __entry->blkoff, __entry->proto, __entry->proto_seq));
++
++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue,
++	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
++		      struct iphdr *iph),
++	     TP_ARGS(skb, mtu, blkoff, iph));
++
++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting,
++	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
++		      struct iphdr *iph),
++	     TP_ARGS(skb, mtu, blkoff, iph));
++
++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment,
++	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
++		      struct iphdr *iph),
++	     TP_ARGS(skb, mtu, blkoff, iph));
++
++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig,
++	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
++		      struct iphdr *iph),
++	     TP_ARGS(skb, mtu, blkoff, iph));
++
++TRACE_EVENT(iptfs_ingress_nth_peek,
++	    TP_PROTO(struct sk_buff *skb, u32 remaining),
++	    TP_ARGS(skb, remaining),
++	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
++			     __field(u32, skb_len)
++			     __field(u32, remaining)),
++	    TP_fast_assign(__entry->skb = skb;
++			   __entry->skb_len = skb->len;
++			   __entry->remaining = remaining;
++			   ),
++	    TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u",
++		      __entry->skb, __entry->skb_len, __entry->remaining));
++
++TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok),
++	    TP_ARGS(skb, share_ok),
++	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
++			     __field(u32, skb_len)
++			     __field(u32, data_len)
++			     __field(u8, share_ok)
++			     __field(u8, head_frag)
++			     __field(u8, pp_recycle)
++			     __field(u8, cloned)
++			     __field(u8, shared)
++			     __field(u8, nr_frags)
++			     __field(u8, frag_list)
++			     ),
++	    TP_fast_assign(__entry->skb = skb;
++			   __entry->skb_len = skb->len;
++			   __entry->data_len = skb->data_len;
++			   __entry->share_ok = share_ok;
++			   __entry->head_frag = skb->head_frag;
++			   __entry->pp_recycle = skb->pp_recycle;
++			   __entry->cloned = skb_cloned(skb);
++			   __entry->shared = skb_shared(skb);
++			   __entry->nr_frags = skb_shinfo(skb)->nr_frags;
++			   __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
++			   ),
++	    TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u",
++		      __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok,
++		      __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared,
++		      __entry->nr_frags, __entry->frag_list));
++
++DECLARE_EVENT_CLASS(iptfs_timer_event,
++		    TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
++		    TP_ARGS(xtfs, time_val),
++		    TP_STRUCT__entry(__field(u64, time_val)
++				     __field(u64, set_time)),
++		    TP_fast_assign(__entry->time_val = time_val;
++				   __entry->set_time = xtfs->iptfs_settime;
++				   ),
++		    TP_printk("TIMER: set_time=%llu time_val=%llu",
++			      __entry->set_time, __entry->time_val));
++
++DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start,
++	     TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
++	     TP_ARGS(xtfs, time_val));
++
++DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire,
++	     TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
++	     TP_ARGS(xtfs, time_val));
++
++#endif /* _TRACE_IPTFS_H */
++
++/* This part must be outside protection */
++#undef TRACE_INCLUDE_PATH
++#define TRACE_INCLUDE_PATH ../../net/xfrm
++#undef TRACE_INCLUDE_FILE
++#define TRACE_INCLUDE_FILE trace_iptfs
++#include <trace/define_trace.h>
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 12b59411fbf0..1173d5f9f3fd 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -18,6 +18,7 @@ + #include + + #include "xfrm_inout.h" ++#include "trace_iptfs.h" + + /* IPTFS encap (header) values. */ + #define IPTFS_SUBTYPE_BASIC 0 +@@ -87,6 +88,39 @@ static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + /* Utility Functions */ + /* ================= */ + ++static u32 __trace_ip_proto(struct iphdr *iph) ++{ ++ if (iph->version == 4) ++ return iph->protocol; ++ return ((struct ipv6hdr *)iph)->nexthdr; ++} ++ ++static u32 __trace_ip_proto_seq(struct iphdr *iph) ++{ ++ void *nexthdr; ++ u32 protocol = 0; ++ ++ if (iph->version == 4) { ++ nexthdr = (void *)(iph + 1); ++ protocol = iph->protocol; ++ } else if (iph->version == 6) { ++ nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); ++ protocol = ((struct ipv6hdr *)(iph))->nexthdr; ++ } ++ switch (protocol) { ++ case IPPROTO_ICMP: ++ return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); ++ case IPPROTO_ICMPV6: ++ return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); ++ case IPPROTO_TCP: ++ return ntohl(((struct tcphdr *)nexthdr)->seq); ++ case IPPROTO_UDP: ++ return ntohs(((struct udphdr *)nexthdr)->source); ++ default: ++ return 0; ++ } ++} ++ + static u64 __esp_seq(struct sk_buff *skb) + { + u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); +@@ -402,6 +436,13 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, + } + } + ++/* ================================== */ ++/* IPTFS Trace Event Definitions */ ++/* ================================== */ ++ ++#define CREATE_TRACE_POINTS ++#include "trace_iptfs.h" ++ + /* ================================== */ + /* IPTFS Receiving (egress) Functions */ + /* ================================== */ +@@ -891,6 +932,8 @@ static int iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) + } + data = sizeof(*ipth); + ++ trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset)); ++ + /* Set data past the basic header */ + if (ipth->subtype == IPTFS_SUBTYPE_CC) { + /* Copy the rest of the CC header */ +@@ -1791,6 +1834,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + */ + if (!ok) { + nospace: ++ trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMOUTNOQSPACE); + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); +@@ -1801,6 +1845,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + * enqueue. + */ + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { ++ trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -1809,6 +1854,8 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + ok = iptfs_enqueue(xtfs, skb); + if (!ok) + goto nospace; ++ ++ trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); + } + + /* Start a delay timer if we don't have one yet */ +@@ -1816,6 +1863,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, + IPTFS_HRTIMER_MODE); + xtfs->iptfs_settime = ktime_get_raw_fast_ns(); ++ trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); + } + + spin_unlock_bh(&x->lock); +@@ -1913,6 +1961,7 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, + to_copy = skb->len - offset; + while (to_copy) { + /* Send all but last fragment to allow agg. 
append */ ++ trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL); + list_add_tail(&nskb->list, &sublist); + + /* FUTURE: if the packet has an odd/non-aligning length we could +@@ -1938,6 +1987,8 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, + + /* return last fragment that will be unsent (or NULL) */ + *skbp = nskb; ++ if (nskb) ++ trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL); + + /* trim the original skb to MTU */ + if (!err) +@@ -2042,6 +2093,8 @@ static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, + /* We've split these up before queuing */ + BUG_ON(skb_is_gso(skb)); + ++ trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb)); ++ + /* Simple case -- it fits. `mtu` accounted for all the overhead + * including the basic IPTFS header. + */ +@@ -2143,6 +2196,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMOUTERROR); + ++ trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -2190,6 +2244,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + * case. + */ + while ((skb2 = skb_peek(list))) { ++ trace_iptfs_ingress_nth_peek(skb2, remaining); + if (skb2->len > remaining) + break; + +@@ -2227,6 +2282,8 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + skb->len += skb2->len; + remaining -= skb2->len; + ++ trace_iptfs_ingress_nth_add(skb2, share_ok); ++ + if (share_ok) { + iptfs_consume_frags(skb, skb2); + } else { +@@ -2276,6 +2333,9 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) + * already). + */ + ++ trace_iptfs_timer_expire( ++ xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime)); ++ + iptfs_output_queued(x, &list); + + return HRTIMER_NORESTART; +-- +2.45.1 +