From e23d07475fa0bb0bef50cc4cc2110fa97bce3770 Mon Sep 17 00:00:00 2001 From: Hongliang Liu <75655411+hongliangl@users.noreply.github.com> Date: Fri, 26 Apr 2024 21:41:30 +0800 Subject: [PATCH] Update OVS pipeline document (#5412) Resolves #5200 Signed-off-by: Hongliang Liu --- docs/assets/ovs-pipeline-antrea-proxy.svg | 4835 ---------------- docs/assets/ovs-pipeline.svg | 6069 +++++++-------------- docs/design/ovs-pipeline.md | 2506 ++++++--- pkg/agent/openflow/fields.go | 8 +- 4 files changed, 3561 insertions(+), 9857 deletions(-) delete mode 100644 docs/assets/ovs-pipeline-antrea-proxy.svg diff --git a/docs/assets/ovs-pipeline-antrea-proxy.svg b/docs/assets/ovs-pipeline-antrea-proxy.svg deleted file mode 100644 index 7016a665305..00000000000 --- a/docs/assets/ovs-pipeline-antrea-proxy.svg +++ /dev/null @@ -1,4835 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xmldiff --git a/docs/assets/ovs-pipeline.svg b/docs/assets/ovs-pipeline.svg index c60576a18e1..6630ac656f0 100644 --- a/docs/assets/ovs-pipeline.svg +++ b/docs/assets/ovs-pipeline.svg @@ -2,14 +2,14 @@ - + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156487" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 
5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + 
preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + 
transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> @@ -610,26 +490,25 @@ orient="auto" refY="0" refX="0" - id="marker1534-5" + id="marker1488-1" style="overflow:visible" inkscape:isstock="true"> @@ -639,3886 +518,1844 @@ orient="auto" refY="0" refX="0" - 
id="marker5914-9-9" + id="marker1644-8" style="overflow:visible" - inkscape:isstock="true" - inkscape:collect="always"> + inkscape:isstock="true"> - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + id="path1642-1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + + - - - - - + id="path1642-1-3-0-7-2-4-8" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + + + + + + + + + image/svg+xmlgressRule + EgressDefaultRule + EgressMetric + EgressMark + L3DecTTL + SNATMark + L2ForwardingCalc + SessionAffinity + NodePortMark + L3Forwarding + SNAT + TrafficControl + IngressSecurityClassifier + IngressRule + AntreaPolicyIngressRule + IngressDefaultRule + IngressMetric + ConntrackCommit + Output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - ARP packets + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + y="30.55513" + x="39.744644" + id="tspan1018-5" + sodipodi:role="line">IP packets + + + + diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index a967b123086..6f5d1d18b24 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -1,1197 +1,1899 @@ # Antrea OVS Pipeline +## Introduction + +This document outlines the Open vSwitch (OVS) pipeline Antrea uses to implement its networking functionalities. The +following assumptions are currently in place: + +- Antrea is deployed in encap mode, establishing an overlay network across all Nodes. +- All the Nodes are Linux Nodes. +- IPv6 is disabled. 
+- Option `antreaProxy.proxyAll` (referred to as `proxyAll` later in this document) is enabled. +- Two Alpha features `TrafficControl` and `L7NetworkPolicy` are enabled. +- Default settings are maintained for other features and options. + +The document references version v1.15 of Antrea. + ## Terminology -* *Node Route Controller*: the [K8s - controller](https://kubernetes.io/docs/concepts/architecture/controller/) - which is part of the Antrea Agent and watches for updates to Nodes. When a - Node is added, it updates the local networking configuration (e.g. configure - the tunnel to the new Node). When a Node is deleted, it performs the necessary - clean-ups. -* *peer Node*: this is how we refer to other Nodes in the cluster, to which the - local Node is connected through a Geneve, VXLAN, GRE, or STT tunnel. -* *Global Virtual MAC*: a virtual MAC address which is used as the destination - MAC for all tunnelled traffic across all Nodes. This simplifies networking by - enabling all Nodes to use this MAC address instead of the actual MAC address - of the appropriate remote gateway. This enables each vSwitch to act as a - "proxy" for the local gateway when receiving tunnelled traffic and directly - take care of the packet forwarding. At the moment, we use an hard-coded value - of `aa:bb:cc:dd:ee:ff`. -* *Antrea-native Policies*: Antrea ClusterNetworkPolicy and Antrea NetworkPolicy - CRDs, as documented [here](../antrea-network-policy.md). -* *`normal` action*: OpenFlow defines this action to submit a packet to "the - traditional non-OpenFlow pipeline of the switch". That is, if a flow uses this - action, then the packets in the flow go through the switch in the same way - that they would if OpenFlow was not configured on the switch. Antrea uses this - action to process ARP traffic as a regular learning L2 switch would. -* *table-miss flow entry*: a "catch-all" entry in a OpenFlow table, which is - used if no other flow is matched. 
If the table-miss flow entry does not exist, - by default packets unmatched by flow entries are dropped (discarded). -* *conjunctive match fields*: an efficient way in OVS to implement conjunctive - matches, that is a match for which we have multiple fields, each one with a - set of acceptable values. See [OVS - fields](http://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) for - more information. -* *conntrack*: a connection tracking module that can be used by OVS to match on +### Antrea / Kubernetes + +- *Node Route Controller*: the [Kubernetes controller](https://kubernetes.io/docs/concepts/architecture/controller/) + which is a part of antrea-agent and watches for updates to Nodes. When a Node is added, it updates the local + networking configurations (e.g. configure the tunnel to the new Node). When a Node is deleted, it performs the + necessary clean-ups. +- *peer Node*: this is how we refer to other Nodes in the cluster, to which the local Node is connected through a Geneve, + VXLAN, GRE, or STT tunnel. +- *Antrea-native NetworkPolicy*: Antrea ClusterNetworkPolicy and Antrea NetworkPolicy CRDs, as documented + [here](../antrea-network-policy.md). +- *Service session affinity*: a Service attribute that selects the same backend Pods for connections from a particular + client. For a K8s Service, session affinity can be enabled by setting `service.spec.sessionAffinity` to `ClientIP` + (default is `None`). See [Kubernetes Service](https://kubernetes.io/docs/concepts/services-networking/service/) for + more information about session affinity. + +### OpenFlow + +- *table-miss flow*: a "catch-all" flow in an OpenFlow table, which is used if no other flow is matched. If the table-miss + flow does not exist, by default packets unmatched by flows are dropped (discarded). 
+- *action `conjunction`*: an efficient way in OVS to implement conjunctive matches, that is, a match for which multiple fields +  are required to match conjunctively, each within a set of acceptable values. See [OVS +  fields](http://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) for more information. +- *action `normal`*: OpenFlow defines this action to submit a packet to "the traditional non-OpenFlow pipeline of +  the switch". In other words, if a flow uses this action, the packets matched by the flow traverse the switch in +  the same manner as they would if OpenFlow were not configured on the switch. Antrea uses this action to process +  ARP packets as a regular learning L2 switch would. +- *action `group`*: an action used to process forwarding decisions on multiple OVS ports. Examples include: +  load-balancing, multicast, and active/standby. See [OVS group +  action](https://docs.openvswitch.org/en/latest/ref/ovs-actions.7/#the-group-action) for more information. +- *action `IN_PORT`*: an action to output packets to the port on which they were received. This is the only standard way +  to output the packets to the input port. +- *action `ct`*: an action to commit connections to the connection tracking module, which OVS can use to match    the state of a TCP, UDP, ICMP, etc., connection. See the [OVS Conntrack -  tutorial](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for -  more information. -* *dmac table*: a traditional L2 switch has a "dmac" table which maps -  learned destination MAC address to the appropriate egress port. It is often -  the same physical table as the "smac" table (which matches on the source MAC -  address and initiate MAC learning if the address is unknown). -* *group action*: an action which is used to process forwarding decisions -  on multiple OVS ports. Examples include: load-balancing, multicast, and active/standby. 
- See [OVS group action](https://docs.openvswitch.org/en/latest/ref/ovs-actions.7/#the-group-action) - for more information. -* *IN_PORT action*: an action to output the packet to the port on which it was - received. This is the only standard way to output the packet to the input port. -* *session affinity*: a load balancer feature that always selects the same backend - Pod for connections from a particular client. For a K8s Service, session - affinity can be enabled by setting `service.spec.sessionAffinity` to `ClientIP` - (default is `None`). See [K8s Service](https://kubernetes.io/docs/concepts/services-networking/service/) - for more information about session affinity. - -**This document currently makes the following assumptions:** - -* Antrea is used in encap mode (an overlay network is created between all Nodes) -* All the Nodes are Linux Nodes -* IPv6 is disabled -* AntreaProxy is enabled -* AntreaPolicy is enabled - -## Dumping the Flows - -This guide includes a representative flow dump for every table in the pipeline, -in order to illustrate the function of each table. If you have a cluster running -Antrea, you can dump the flows for a given Node as follows: + tutorial](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for more information. +- *reg mark*: a value stored in an OVS register conveying information for a packet across the pipeline. Explore all reg + marks in the pipeline in the [OVS Registers] section. +- *ct mark*: a value stored in the field `ct_mark` of OVS conntrack, conveying information for a connection throughout + its entire lifecycle across the pipeline. Explore all values used in the pipeline in the [Ct Marks] section. +- *ct label*: a value stored in the field `ct_label` of OVS conntrack, conveying information for a connection throughout + its entire lifecycle across the pipeline. Explore all values used in the pipeline in the [Ct Labels] section. 
+- *ct zone*: a zone, stored in the field `ct_zone` of OVS conntrack, that isolates connection tracking rules. It is +  conceptually similar to the more generic Linux network namespace but is specific to conntrack and has less +  overhead. Explore all the zones used in the pipeline in the [Ct Zones] section. + +### Misc + +- *dmac table*: a traditional L2 switch has a "dmac" table that maps the learned destination MAC address to the appropriate +  egress port. It is often the same physical table as the "smac" table (which matches the source MAC address and +  initiates MAC learning if the address is unknown). +- *Global Virtual MAC*: a virtual MAC address that is used as the destination MAC for all tunneled traffic across all +  Nodes. This simplifies networking by enabling all Nodes to use this MAC address instead of the actual MAC address of +  the appropriate remote gateway. This allows each OVS to act as a "proxy" for the local gateway when receiving +  tunneled traffic and directly take care of the packet forwarding. Currently, we use a hard-coded value of +  `aa:bb:cc:dd:ee:ff`. +- *Virtual Service IP*: a virtual IP address used as the source IP address for hairpin Service connections through the +  Antrea gateway port. Currently, we use a hard-coded value of `169.254.0.253`. +- *Virtual NodePort DNAT IP*: a virtual IP address used as a DNAT IP address for NodePort Service connections through +  the Antrea gateway port. Currently, we use a hard-coded value of `169.254.0.252`. + +## Dumping the Flows / Groups + +This guide includes a representative flow dump for every table in the pipeline, to illustrate the function of each +table. If you have a cluster running Antrea, you can dump the flows or groups on a given Node as follows:   ```bash -kubectl exec -n kube-system <ANTREA_AGENT_POD_NAME> -c antrea-ovs -- ovs-ofctl dump-flows <BRIDGE_NAME> [--no-stats] [--names] +# Dump all flows. +kubectl exec -n kube-system <ANTREA_AGENT_POD_NAME> -c antrea-ovs -- ovs-ofctl dump-flows <BRIDGE_NAME> -O Openflow15 [--no-stats] [--names] + +# Dump all groups. 
+kubectl exec -n kube-system <ANTREA_AGENT_POD_NAME> -c antrea-ovs -- ovs-ofctl dump-groups <BRIDGE_NAME> -O Openflow15 [--names]  ```   -where `<ANTREA_AGENT_POD_NAME>` is the name of the Antrea Agent Pod running on -that Node and `<BRIDGE_NAME>` is the name of the bridge created by Antrea -(`br-int` by default). - -## Registers - -We use 2 32-bit OVS registers to carry information throughout the pipeline: - -* reg0 (NXM_NX_REG0): -  - bits [0..3] are used to store the traffic source (from tunnel: 0, from -    local gateway: 1, from local Pod: 2). It is set in [ClassifierTable]. -  - bit 16 is used to indicate whether the destination MAC address of a packet -    is "known", i.e. corresponds to an entry in [L2ForwardingCalcTable], which -    is essentially a "dmac" table. -  - bit 18 is used to indicate whether the packet should be output to the port -    on which it was received. It is consumed in [L2ForwardingOutTable] -    to output the packet with action `IN_PORT`. -  - bit 19 is used to indicate whether the destination and source MACs of the -    packet should be rewritten in [l3ForwardingTable]. The bit is set for -    packets received from the tunnel port in [ClassifierTable]. The -    destination MAC of such packets is the Global Virtual MAC and should be -    rewritten to the destination port's MAC before output to the port. When such -    a packet is destined to a Pod, its source MAC should be rewritten to the -    local gateway port's MAC too. -* reg1 (NXM_NX_REG1): it is used to store the egress OF port for the packet. It -  is set in [DNATTable] for traffic destined to Services and in -  [L2ForwardingCalcTable] otherwise. It is consumed in [L2ForwardingOutTable] to -  output each packet to the correct port. -* reg3 (NXM_NX_REG3): it is used to store selected Service Endpoint IPv4 address -  in OVS group entry. It is consumed in [EndpointDNATTable]. -* reg4 (NXM_NX_REG4): -  * bits [0..16] are used to store selected Service Endpoint port number in OVS -    group entry. They are consumed in [EndpointDNATTable]. 
-  * bits [17..18] are used to store the state of a Service request packet. -    Marks in this field include, -    * 0b001: packet needs to do Endpoint selection. -    * 0b010: packet has done Endpoint selection. -    * 0b011: packet has done Endpoint selection and the selection result needs to -      be cached. - -## Network Policy Implementation - -Several tables of the pipeline are dedicated to [K8s Network -Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) -implementation ([EgressRuleTable], [EgressDefaultTable], [IngressRuleTable] and -[IngressDefaultTable]). - -The Antrea implementation of K8s Network Policy, including the communication -channel between the Controller and Agents, and how a Network Policy is mapped to -OVS flows at each Node, will be described in details in a separate document. For -the present document, we will use the Network Policy example below, and explain -how these simple ingress and egress rules map to individual flows as we describe -the relevant tables of our pipeline. +where `<ANTREA_AGENT_POD_NAME>` is the name of the antrea-agent Pod running on that Node, and `<BRIDGE_NAME>` is the name +of the bridge created by Antrea (`br-int` by default). + +You can also dump the flows for a specific table or group as follows: + +```bash +# Dump flows of a table. +kubectl exec -n kube-system <ANTREA_AGENT_POD_NAME> -c antrea-ovs -- ovs-ofctl dump-flows <BRIDGE_NAME> table=<TABLE_NAME> -O Openflow15 [--no-stats] [--names] + +# Dump a group. +kubectl exec -n kube-system <ANTREA_AGENT_POD_NAME> -c antrea-ovs -- ovs-ofctl dump-groups <BRIDGE_NAME> <GROUP_ID> -O Openflow15 [--names] +``` + +where `<TABLE_NAME>` is the name of a table in the pipeline, and `<GROUP_ID>` is the ID of a group. + +## OVS Registers + +We use some OVS registers to carry information throughout the pipeline. To enhance usability, we assign friendly names +to the registers we use. 
+ +| Register | Field Range | Field Name | Reg Mark Value | Reg Mark Name | Description | +|---------------|-------------|---------------------------------|----------------|---------------------------------|------------------------------------------------------------------------------------------------------| +| NXM_NX_REG0 | bits 0-3 | PktSourceField | 0x1 | FromTunnelRegMark | Packet source is tunnel port. | +| | | | 0x2 | FromGatewayRegMark | Packet source is the local Antrea gateway port. | +| | | | 0x3 | FromPodRegMark | Packet source is local Pod port. | +| | | | 0x4 | FromUplinkRegMark | Packet source is uplink port. | +| | | | 0x5 | FromBridgeRegMark | Packet source is local bridge port. | +| | | | 0x6 | FromTCReturnRegMark | Packet source is TrafficControl return port. | +| | bits 4-7 | PktDestinationField | 0x1 | ToTunnelRegMark | Packet destination is tunnel port. | +| | | | 0x2 | ToGatewayRegMark | Packet destination is the local Antrea gateway port. | +| | | | 0x3 | ToLocalRegMark | Packet destination is local Pod port. | +| | | | 0x4 | ToUplinkRegMark | Packet destination is uplink port. | +| | | | 0x5 | ToBridgeRegMark | Packet destination is local bridge port. | +| | bit 9 | | 0b0 | NotRewriteMACRegMark | Packet's source/destination MAC address does not need to be rewritten. | +| | | | 0b1 | RewriteMACRegMark | Packet's source/destination MAC address needs to be rewritten. | +| | bit 10 | | 0b1 | APDenyRegMark | Packet denied (Drop/Reject) by Antrea NetworkPolicy. | +| | bits 11-12 | APDispositionField | 0b00 | DispositionAllowRegMark | Indicates Antrea NetworkPolicy disposition: allow. | +| | | | 0b01 | DispositionDropRegMark | Indicates Antrea NetworkPolicy disposition: drop. | +| | | | 0b11 | DispositionPassRegMark | Indicates Antrea NetworkPolicy disposition: pass. | +| | bit 13 | | 0b1 | GeneratedRejectPacketOutRegMark | Indicates packet is a generated reject response packet-out. 
| +| | bit 14 | | 0b1 | SvcNoEpRegMark | Indicates packet towards a Service without Endpoint. | +| | bit 19 | | 0b1 | RemoteSNATRegMark | Indicates packet needs SNAT on a remote Node. | +| | bit 22 | | 0b1 | L7NPRedirectRegMark | Indicates L7 Antrea NetworkPolicy disposition of redirect. | +| | bits 21-22 | OutputRegField | 0b01 | OutputToOFPortRegMark | Output packet to an OVS port. | +| | | | 0b10 | OutputToControllerRegMark | Send packet to Antrea Agent. | +| | bits 25-32 | PacketInOperationField | | | Field to store NetworkPolicy packetIn operation. | +| NXM_NX_REG1 | bits 0-31 | TargetOFPortField | | | Egress OVS port of packet. | +| NXM_NX_REG2 | bits 0-31 | SwapField | | | Swap values in flow fields in OpenFlow actions. | +| | bits 0-7 | PacketInTableField | | | OVS table where it was decided to send packets to the controller (Antrea Agent). | +| NXM_NX_REG3 | bits 0-31 | EndpointIPField | | | Field to store IPv4 address of the selected Service Endpoint. | +| | | APConjIDField | | | Field to store Conjunction ID for Antrea Policy. | +| NXM_NX_REG4 | bits 0-15 | EndpointPortField | | | Field store TCP/UDP/SCTP port of a Service's selected Endpoint. | +| | bits 16-18 | ServiceEPStateField | 0b001 | EpToSelectRegMark | Packet needs to do Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b010 | EpSelectedRegMark | Packet has done Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b011 | EpToLearnRegMark | Packet has done Service Endpoint selection and the selected Endpoint needs to be cached. | +| | bits 0-18 | EpUnionField | | | The union value of EndpointPortField and ServiceEPStateField. | +| | bit 19 | | 0b1 | ToNodePortAddressRegMark | Packet is destined for a Service of type NodePort. | +| | bit 20 | | 0b1 | AntreaFlexibleIPAMRegMark | Packet is from local Antrea IPAM Pod. | +| | bit 20 | | 0b0 | NotAntreaFlexibleIPAMRegMark | Packet is not from local Antrea IPAM Pod. 
| +| | bit 21 | | 0b1 | ToExternalAddressRegMark | Packet is destined for a Service's external IP. | +| | bits 22-23 | TrafficControlActionField | 0b01 | TrafficControlMirrorRegMark | Indicates packet needs to be mirrored (used by TrafficControl). | +| | | | 0b10 | TrafficControlRedirectRegMark | Indicates packet needs to be redirected (used by TrafficControl). | +| | bit 24 | | 0b1 | NestedServiceRegMark | Packet is destined for a Service using other Services as Endpoints. | +| | bit 25 | | 0b1 | DSRServiceRegMark | Packet is destined for a Service working in DSR mode. | +| | | | 0b0 | NotDSRServiceRegMark | Packet is destined for a Service working in non-DSR mode. | +| | bit 26 | | 0b1 | RemoteEndpointRegMark | Packet is destined for a Service selecting a remote non-hostNetwork Endpoint. | +| | bit 27 | | 0b1 | FromExternalRegMark | Packet is from Antrea gateway, but its source IP is not the gateway IP. | +| | bit 28 | | 0b1 | FromLocalRegMark | Packet is from a local Pod or the Node. | +| NXM_NX_REG5 | bits 0-31 | TFEgressConjIDField | | | Egress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG6 | bits 0-31 | TFIngressConjIDField | | | Ingress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG7 | bits 0-31 | ServiceGroupIDField | | | GroupID corresponding to the Service. | +| NXM_NX_REG8 | bits 0-11 | VLANIDField | | | VLAN ID. | +| | bits 12-15 | CtZoneTypeField | 0b0001 | IPCtZoneTypeRegMark | Ct zone type is IPv4. | +| | | | 0b0011 | IPv6CtZoneTypeRegMark | Ct zone type is IPv6. | +| | bits 0-15 | CtZoneField | | | Ct zone ID is a combination of VLANIDField and CtZoneTypeField. | +| NXM_NX_REG9 | bits 0-31 | TrafficControlTargetOFPortField | | | Field to cache the OVS port to output packets to be mirrored or redirected (used by TrafficControl). | +| NXM_NX_XXREG3 | bits 0-127 | EndpointIP6Field | | | Field to store IPv6 address of the selected Service Endpoint. 
| + +Note that reg marks that have overlapped bits will not be used at the same time, such as `SwapField` and `PacketInTableField`. + +## OVS Ct Mark + +We use some bits of the `ct_mark` field of OVS conntrack to carry information throughout the pipeline. To enhance +usability, we assign friendly names to the bits we use. + +| Field Range | Field Name | Ct Mark Value | Ct Mark Name | Description | +|-------------|-----------------------|---------------|--------------------|-----------------------------------------------------------------| +| bits 0-3 | ConnSourceCTMarkField | 0b0010 | FromGatewayCTMark | Connection source is the Antrea gateway port. | +| | | 0b0101 | FromBridgeCTMark | Connection source is the local bridge port. | +| bit 4 | | 0b1 | ServiceCTMark | Connection is for Service. | +| | | 0b0 | NotServiceCTMark | Connection is not for Service. | +| bit 5 | | 0b1 | ConnSNATCTMark | SNAT'd connection for Service. | +| bit 6 | | 0b1 | HairpinCTMark | Hair-pin connection. | +| bit 7 | | 0b1 | L7NPRedirectCTMark | Connection should be redirected to an application-aware engine. | + +## OVS Ct Label + +We use some bits of the `ct_label` field of OVS conntrack to carry information throughout the pipeline. To enhance +usability, we assign friendly names to the bits we use. + +| Field Range | Field Name | Description | +|-------------|-----------------------|------------------------------------| +| bits 0-31 | IngressRuleCTLabel | Ingress rule ID. | +| bits 32-63 | EgressRuleCTLabel | Egress rule ID. | +| bits 64-75 | L7NPRuleVlanIDCTLabel | VLAN ID for L7 NetworkPolicy rule. | + +## OVS Ct Zone + +We use some OVS conntrack zones to isolate connection tracking rules. To enhance usability, we assign friendly names to +the ct zones. + +| Zone ID | Zone Name | Description | +|---------|--------------|----------------------------------------------------| +| 65520 | CtZone | Tracking IPv4 connections that don't require SNAT. 
| +| 65521   | SNATCtZone   | Tracking IPv4 connections that require SNAT.       | + +## Kubernetes NetworkPolicy Implementation + +Several tables of the pipeline are dedicated to [Kubernetes +NetworkPolicy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) implementation (tables +[EgressRule], [EgressDefaultRule], [IngressRule], and [IngressDefaultRule]). + +Throughout this document, the following K8s NetworkPolicy example is used to demonstrate how simple ingress and egress +policy rules are mapped to OVS flows. + +This K8s NetworkPolicy is applied to Pods with the label `app: web` in the `default` Namespace. For these Pods, only TCP +traffic on port 80 from Pods with the label `app: client` and TCP traffic on port 3306 to Pods with the label `app: db` +are allowed. Because +Antrea will only install OVS flows for this K8s NetworkPolicy on Nodes that have Pods selected by the policy, we have +scheduled an `app: web` Pod on the current Node from which the sample flows in this document are dumped. The Pod has +been assigned an IP address `10.10.0.19` from the Antrea CNI, so you will see the IP address shown in the associated +flows.   ```yaml  apiVersion: networking.k8s.io/v1  kind: NetworkPolicy  metadata: -  name: test-network-policy +  name: web-app-db-network-policy    namespace: default  spec:    podSelector:      matchLabels: -      app: nginx +      app: web    policyTypes: -  - Ingress -  - Egress +    - Ingress +    - Egress    ingress: -  - from: -    - podSelector: -        matchLabels: -          app: nginx -    ports: +    - from: +        - podSelector: +            matchLabels: +              app: client +      ports: +        - protocol: TCP +          port: 80 +  egress: +    - to: +        - podSelector: +            matchLabels: +              app: db +      ports: +        - protocol: TCP +          port: 3306 +``` + +## Kubernetes Service Implementation + +Like K8s NetworkPolicy, several tables of the pipeline are dedicated to [Kubernetes +Service](https://kubernetes.io/docs/concepts/services-networking/service/) implementation (tables [NodePortMark], +[SessionAffinity], [ServiceLB], and [EndpointDNAT]). 
+ +By enabling `proxyAll`, ClusterIP, NodePort, LoadBalancer, and ExternalIP are all handled by AntreaProxy. Otherwise, +only in-cluster ClusterIP is handled. In this document, we use the sample K8s Services below. These Services select Pods +with the label `app: web` as Endpoints. + +### ClusterIP without Endpoint + +A sample Service with `clusterIP` set to `10.101.255.29` does not have any associated Endpoint. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-clusterip-no-ep +spec: + ports: - protocol: TCP port: 80 - egress: - - to: - - podSelector: - matchLabels: - app: nginx - ports: + targetPort: 80 + clusterIP: 10.101.255.29 +``` + +### ClusterIP + +A sample ClusterIP Service with `clusterIP` set to `10.105.31.235`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-clusterip +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + clusterIP: 10.105.31.235 +``` + +### NodePort + +A sample NodePort Service with `nodePort` set to `30004`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-nodeport +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + nodePort: 30004 + type: NodePort +``` + +### LoadBalancer + +A sample LoadBalancer Service with ingress IP `192.168.77.150` assigned by an ingress controller. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-loadbalancer +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + type: LoadBalancer +status: + loadBalancer: + ingress: + - ip: 192.168.77.150 +``` + +### Service with ExternalIP + +A sample Service with external IP `192.168.77.200`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-externalip +spec: + selector: + app: web + ports: - protocol: TCP port: 80 + targetPort: 80 + externalIPs: + - 192.168.77.200 ``` -This Network Policy is applied to all Pods with the `nginx` app label in the -`default` Namespace. 
For these Pods, it only allows TCP traffic on port 80 from -and to Pods which also have the `nginx` app label. Because Antrea will only -install OVS flows for this Network Policy on Nodes for which some of the Pods -are the target of the policy, we have scheduled 2 `nginx` Pods on the same -Node. They received IP addresses 10.10.1.2 and 10.10.1.3 from the Antrea CNI, so -you will see these addresses show up in the OVS flows. +### Service with Session Affinity -## Antrea-native Policies Implementation +A sample Service configured with session affinity. -In addition to the above tables created for K8s NetworkPolicy, Antrea creates -additional dedicated tables to support the [Antrea-native policies](../antrea-network-policy.md) -([AntreaPolicyEgressRuleTable] and [AntreaPolicyIngressRuleTable]). +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-session-affinity +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + clusterIP: 10.96.76.15 + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 300 +``` -Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application tier as an -example for the remainder of this document. +### Service with ExternalTrafficPolicy Local + +A sample Service with `externalTrafficPolicy` set to `Local`. Only the `externalTrafficPolicy` of a NodePort or LoadBalancer +Service can be set to `Local`. 
+ +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-etp-local +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + type: LoadBalancer + externalTrafficPolicy: Local +status: + loadBalancer: + ingress: + - ip: 192.168.77.151 +``` + +## Antrea-native NetworkPolicy Implementation + +In addition to the tables created for K8s NetworkPolicy, Antrea creates additional dedicated tables to support +[Antrea-native NetworkPolicy](../antrea-network-policy.md) (tables [AntreaPolicyEgressRule] and +[AntreaPolicyIngressRule]). + +Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application Tier as an example for the remainder of +this document. + +This ACNP is applied to all Pods with the label `app: web` in all Namespaces. For these Pods, only ingress TCP traffic on port +80 from the Pods with the label `app: client` and egress TCP traffic on port 3306 to the Pods with the label `app: db` are allowed. Similar to K8s +NetworkPolicy, Antrea will only install OVS flows for this policy on Nodes that have Pods selected by the policy. + +This policy has rules very similar to those of the K8s NetworkPolicy example shown previously. This is intentional to simplify +this document and to allow easier comparison between the flows generated for both types of policies. Additionally, we +should emphasize that this policy applies to Pods across all Namespaces, while a K8s NetworkPolicy is always scoped to +a specific Namespace (in the case of our example, the default Namespace). 
```yaml apiVersion: crd.antrea.io/v1beta1 kind: ClusterNetworkPolicy metadata: - name: cnp0 + name: web-app-db-network-policy spec: - priority: 10 - tier: application # defaults to application tier if not specified + priority: 5 + tier: application appliedTo: - podSelector: matchLabels: - app: server + app: web ingress: - - action: Drop + - action: Allow from: - podSelector: matchLabels: - app: notClient + app: client ports: - protocol: TCP port: 80 + name: AllowFromClient + - action: Drop egress: - action: Allow to: - podSelector: matchLabels: - app: dns + app: db ports: - - protocol: UDP - port: 53 + - protocol: TCP + port: 3306 + name: AllowToDB + - action: Drop ``` -This ACNP is applied to all Pods with the `app: server` label in all -Namespaces. For these Pods, it drops TCP traffic on port 80 from all -Pods which have the `app: notClient` label. In addition to the ingress rules, -this policy also allows egress UDP traffic on port 53 to all Pods with the -label `app: dns`. Similar to K8s NetworkPolicy, Antrea will only install OVS -flows for this ACNP on Nodes for which some of the Pods are the target of the -policy. Thus, we have scheduled three Pods (appServer, appDns, appNotClient) -on the same Node and they have the following IP addresses: +## Antrea-native L7 NetworkPolicy Implementation -- appServer: 10.10.1.6 -- appNotClient: 10.10.1.7 -- appDns: 10.10.1.8 +In addition to layer 3 and layer 4 policies mentioned above, [Antrea-native Layer 7 +NetworkPolicy](../antrea-l7-network-policy.md) is also supported in Antrea. The main difference is that Antrea-native L7 +NetworkPolicy uses layer 7 protocol to filter traffic, not layer 3 or layer 4 protocol. -## Tables +Consider the following Antrea-native L7 NetworkPolicy in the Application Tier as an example for the remainder of this +document. -![OVS pipeline](../assets/ovs-pipeline-antrea-proxy.svg) +This ACNP is applied to all Pods with the label `app: web` in all Namespaces. 
It allows only HTTP ingress traffic on +port 8080 from Pods with the label `app: client`, limited to the `GET` method and `/api/v2/*` path. Any other HTTP +ingress traffic on port 8080 from Pods with the label `app: client` will be dropped. -### ClassifierTable (0) +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: ClusterNetworkPolicy +metadata: + name: ingress-allow-http-request-to-api-v2 +spec: + priority: 4 + tier: application + appliedTo: + - podSelector: + matchLabels: + app: web + ingress: + - name: AllowFromClientL7 + action: Allow + from: + - podSelector: + matchLabels: + app: client + ports: + - protocol: TCP + port: 8080 + l7Protocols: + - http: + path: "/api/v2/*" + method: "GET" +``` -This table is used to determine which "category" of traffic (tunnel, local -gateway or local Pod) the packet belongs to. This is done by matching on the -ingress port for the packet. The appropriate value is then written to bits -[0..3] in NXM_NX_REG0: 0 for tunnel, 1 for local gateway and 2 for local Pod. -This information is used by matches in subsequent tables. For a packet received -from the tunnel port, bit 19 in NXM_NX_REG0 is set to 1, to indicate MAC rewrite -should be performed for the packet in [L3ForwardingTable]. +## TrafficControl Implementation -If you dump the flows for this table, you may see the following: +[TrafficControl](../traffic-control.md) is a CRD API that manages and manipulates the transmission of Pod traffic. +Antrea creates a dedicated table [TrafficControl] to implement feature `TrafficControl`. We will use the following +TrafficControls as examples for the remainder of this document. -```text -1. table=0, priority=200,in_port=32769 actions=set_field:0x1/0xf->reg0,resubmit(,10) -2. table=0, priority=200,in_port=32768 actions=set_field:0/0xf->reg0,load:0x1->NXM_NX_REG0[19],resubmit(,30) -3. table=0, priority=190,in_port=4 actions=set_field:0x2/0xf->reg0,resubmit(,10) -4. 
table=0, priority=190,in_port=32770 actions=set_field:0x2/0xf->reg0,resubmit(,10) -5. table=0, priority=0 actions=drop +### TrafficControl for Packet Redirecting + +This is a TrafficControl applied to Pods with the label `app: web`. For these Pods, both ingress and egress traffic will +be redirected to port `antrea-tc-tap0`, and returned through port `antrea-tc-tap1`. + +```yaml +apiVersion: crd.antrea.io/v1alpha2 +kind: TrafficControl +metadata: + name: redirect-web-to-local +spec: + appliedTo: + podSelector: + matchLabels: + app: web + direction: Both + action: Redirect + targetPort: + ovsInternal: + name: antrea-tc-tap0 + returnPort: + ovsInternal: + name: antrea-tc-tap1 ``` -Flow 1 is for traffic coming in on the local gateway. Flow 2 is for traffic -coming in through an overlay tunnel (i.e. from another Node). The next two -flows (3 and 4) are for local Pods. +### TrafficControl for Packet Mirroring -Local traffic then goes to [SpoofGuardTable], while tunnel traffic from other -Nodes goes to [ConntrackTable]. The table-miss flow entry will drop all -unmatched packets (in practice this flow entry should almost never be used). +This is a TrafficControl applied to Pods with the label `app: db`. For these Pods, both ingress and egress will be +mirrored (duplicated) to port `antrea-tc-tap2`. -### SpoofGuardTable (10) +```yaml +apiVersion: crd.antrea.io/v1alpha2 +kind: TrafficControl +metadata: + name: mirror-db-to-local +spec: + appliedTo: + podSelector: + matchLabels: + app: db + direction: Both + action: Mirror + targetPort: + ovsInternal: + name: antrea-tc-tap2 +``` -This table prevents IP and ARP -[spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. For -each Pod (as identified by the ingress port), we ensure that: +## Egress Implementation -* for IP traffic, the source IP and MAC addresses are correct, i.e. match the - values configured on the interface when Antrea set-up networking for the Pod. 
-* for ARP traffic, the advertised IP and MAC addresses are correct, i.e. match - the values configured on the interface when Antrea set-up networking for the - Pod. +Table [EgressMark] is dedicated to the implementation of feature `Egress`. -Because Antrea currently relies on kube-proxy to load-balance traffic destined -to Services, implementing that kind of IP spoofing check for traffic coming-in -on the local gateway port is not as trivial. Traffic from local Pods destined to -Services will first go through the gateway, get load-balanced by the kube-proxy -datapath (DNAT) then sent back through the gateway. This means that legitimate -traffic can be received on the gateway port with a source IP belonging to a -local Pod. We may add some fine-grained rules in the future to accommodate for -this, but for now we just allow all IP traffic received from the gateway. We do -have an ARP spoofing check for the gateway however, since there is no reason for -the host to advertise a different MAC address on antrea-gw0. +Consider the following Egresses as examples for the remainder of this document. -If you dump the flows for this table, you may see the following: +### Egress Applied to Web Pods -```text -1. table=10, priority=200,ip,in_port=32769 actions=resubmit(,23) -2. table=10, priority=200,arp,in_port=32769,arp_spa=10.10.0.1,arp_sha=3a:dd:79:0f:55:4c actions=resubmit(,20) -3. table=10, priority=200,arp,in_port=4,arp_spa=10.10.0.2,arp_sha=ce:99:ca:bd:62:c5 actions=resubmit(,20) -4. table=10, priority=200,arp,in_port=32770,arp_spa=10.10.0.3,arp_sha=3a:41:49:42:98:69 actions=resubmit(,20) -5. table=10, priority=200,ip,in_port=4,dl_src=ce:99:ca:bd:62:c5,nw_src=10.10.0.2 actions=resubmit(,23) -6. table=10, priority=200,ip,in_port=32770,dl_src=3a:41:49:42:98:69,nw_src=10.10.0.3 actions=resubmit(,23) -7. table=10, priority=0 actions=drop +This is an Egress applied to Pods with the label `app: web`. 
For these Pods, all egress traffic (traffic leaving the +cluster) will be SNAT'd on the Node `k8s-node-control-plane` using Egress IP `192.168.77.112`. In this context, +`k8s-node-control-plane` is known as the "Egress Node" for this Egress resource. Note that the flows presented in the +rest of this document were dumped on Node `k8s-node-control-plane`. Egress flows are different on the "source Node" +(Node running a workload Pod to which the Egress resource is applied) and on the "Egress Node" (Node enforcing the +SNAT policy). + +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: Egress +metadata: + name: egress-web +spec: + appliedTo: + podSelector: + matchLabels: + app: web + egressIP: 192.168.77.112 +status: + egressNode: k8s-node-control-plane ``` -After this table, ARP traffic goes to [ARPResponderTable], while IP -traffic goes to [ServiceHairpinTable]. Traffic which does not match -any of the rules described above will be dropped by the table-miss flow entry. +### Egress Applied to Client Pods -### ARPResponderTable (20) +This is an Egress applied to Pods with the label `app: client`. For these Pods, all egress traffic will be SNAT'd on the +Node `k8s-node-worker-1` using Egress IP `192.168.77.113`. -The main purpose of this table is to reply to ARP requests from the local -gateway asking for the MAC address of a remote peer gateway (another Node's -gateway). This ensures that the local Node can reach any remote Pod, which in -particular is required for Service traffic which has been load-balanced to a -remote Pod backend by kube-proxy. Note that the table is programmed to reply to -such ARP requests with a "Global Virtual MAC" ("Global" because it is used by -all Antrea OVS bridges), and not with the actual MAC address of the remote -gateway. This ensures that once the traffic is received by the remote OVS -bridge, it can be directly forwarded to the appropriate Pod without actually -going through the gateway. 
The Virtual MAC is used as the destination MAC -address for all the traffic being tunnelled. +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: Egress +metadata: + name: egress-client +spec: + appliedTo: + podSelector: + matchLabels: + app: client + egressIP: 192.168.77.113 +status: + egressNode: k8s-node-worker-1 +``` -If you dump the flows for this table, you may see the following: +## OVS Tables -```text -1. table=20, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],mod_dl_src:aa:bb:cc:dd:ee:ff,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],load:0xaabbccddeeff->NXM_NX_ARP_SHA[],move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],load:0xa0a0101->NXM_OF_ARP_SPA[],IN_PORT -2. table=20, priority=190,arp actions=NORMAL -3. table=20, priority=0 actions=drop -``` +![OVS pipeline](../assets/ovs-pipeline.svg) + +### PipelineRootClassifier + +This table serves as the primary entry point in the pipeline, forwarding packets to different tables based on their +respective protocols. -Flow 1 is the "ARP responder" for the peer Node whose local Pod subnet is -10.10.1.0/24. If we were to look at the routing table for the local Node, we -would see the following "onlink" route: +If you dump the flows of this table, you may see the following: ```text -10.10.1.0/24 via 10.10.1.1 dev antrea-gw0 onlink +1. table=PipelineRootClassifier, priority=200,arp actions=goto_table:ARPSpoofGuard +2. table=PipelineRootClassifier, priority=200,ip actions=goto_table:Classifier +3. table=PipelineRootClassifier, priority=0 actions=drop ``` -A similar route is installed on the gateway (antrea-gw0) interface every time the -Antrea Node Route Controller is notified that a new Node has joined the -cluster. The route must be marked as "onlink" since the kernel does not have a -route to the peer gateway 10.10.1.1: we trick the kernel into believing that -10.10.1.1 is directly connected to the local Node, even though it is on the -other side of the tunnel. 
+Flow 1 forwards ARP packets to table [ARPSpoofGuard]. -Flow 2 ensures that OVS handle the remainder of ARP traffic as a regular L2 -learning switch (using the `normal` action). In particular, this takes care of -forwarding ARP requests and replies between local Pods. +Flow 2 forwards IP packets to table [Classifier]. -The table-miss flow entry (flow 3) will drop all other packets. This flow should -never be used because only ARP traffic should go to this table, and -ARP traffic will either match flow 1 or flow 2. +Flow 3 is the table-miss flow, which drops packets of any other (unsupported) protocol. It is not normally used. -### ServiceHairpinTable (23) +### ARPSpoofGuard -When a backend Pod of a Service accesses the Service, and the Pod itself is selected -as the destination, then we have the hairpin case, in which the source IP should be -SNAT'd with a virtual hairpin IP in [hairpinSNATTable]. The source and destination -IP addresses cannot be the same, otherwise the connection will be broken. It will be -explained in detail in [hairpinSNATTable]. For response packets, the -destination IP is the virtual hairpin IP, so the destination IP should be changed back -to the IP of the backend Pod. Then the response packets can be forwarded back correctly. +This table is designed to drop ARP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) packets from local Pods or +the local Antrea gateway. We ensure that the advertised IP and MAC addresses are correct, meaning they match the values +configured on the interface when Antrea sets up networking for a local Pod or the local Antrea gateway. -If you dump the flows for this table, you should see the flows: +If you dump the flows of this table, you may see the following: ```text -1. table=23, priority=200,ip,nw_dst=169.254.169.252 actions=move:NXM_OF_IP_SRC[]->NXM_OF_IP_DST[],load:0x1->NXM_NX_REG0[18],resubmit(,30) -2. table=23, priority=0 actions=resubmit(,24) +1. 
table=ARPSpoofGuard, priority=200,arp,in_port="antrea-gw0",arp_spa=10.10.0.1,arp_sha=ba:5e:d1:55:aa:c0 actions=goto_table:ARPResponder +2. table=ARPSpoofGuard, priority=200,arp,in_port="client-6-3353ef",arp_spa=10.10.0.26,arp_sha=5e:b5:e3:a6:90:b7 actions=goto_table:ARPResponder +3. table=ARPSpoofGuard, priority=200,arp,in_port="web-7975-274540",arp_spa=10.10.0.24,arp_sha=fa:b7:53:74:21:a6 actions=goto_table:ARPResponder +4. table=ARPSpoofGuard, priority=200,arp,in_port="db-755c6-5080e3",arp_spa=10.10.0.25,arp_sha=36:48:21:a2:9d:b4 actions=goto_table:ARPResponder +5. table=ARPSpoofGuard, priority=0 actions=drop ``` -Flow 1 is used to match packet whose destination IP is virtual hairpin IP and -change the destination IP of the matched packet by loading register `NXM_OF_IP_SRC` -to `NXM_OF_IP_DST`. Bit 18 in NXM_NX_REG0 is set to 0x1, which indicates that the -packet should be output to the port on which it was received, which is done in -[L2ForwardingOutTable]. +Flow 1 matches legitimate ARP packets from the local Antrea gateway. + +Flows 2-4 match legitimate ARP packets from local Pods. + +Flow 5 is the table-miss flow to drop ARP spoofing packets, which are not matched by flows 1-4. + +### ARPResponder -### ConntrackTable (30) +The purpose of this table is to handle ARP requests from the local Antrea gateway or local Pods, addressing specific cases: -The sole purpose of this table is to invoke the `ct` action on all packets and -set the `ct_zone` (connection tracking context) to a hard-coded value, then -forward traffic to [ConntrackStateTable]. If you dump the flows for this table, -you should only see 1 flow: +1. Responding to ARP requests from the local Antrea gateway seeking the MAC address of a remote Antrea gateway located + on a different Node. This ensures that the local Node can reach any remote Pods. +2. Ensuring the normal layer 2 (L2) learning among local Pods and the local Antrea gateway. 
+ +If you dump the flows of this table, you may see the following: ```text -1. table=30, priority=200,ip actions=ct(table=31,zone=65520) +1. table=ARPResponder, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],set_field:aa:bb:cc:dd:ee:ff->eth_src,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],set_field:aa:bb:cc:dd:ee:ff->arp_sha,move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],set_field:10.10.1.1->arp_spa,IN_PORT +2. table=ARPResponder, priority=190,arp actions=NORMAL +3. table=ARPResponder, priority=0 actions=drop ``` -A `ct_zone` is simply used to isolate connection tracking rules. It is similar -in spirit to the more generic Linux network namespaces, but `ct_zone` is -specific to conntrack and has less overhead. +Flow 1 is designed for case 1, matching ARP request packets for the MAC address of a remote Antrea gateway with IP address +`10.10.1.1`. It programs an ARP reply packet and sends it back to the port where the request packet was received. Note +that both the source hardware address and the source MAC address in the ARP reply packet are set to the *Global Virtual +MAC* `aa:bb:cc:dd:ee:ff`, not the actual MAC address of the remote Antrea gateway. This ensures that once the traffic is +received by the remote OVS bridge, it can be directly forwarded to the appropriate Pod without actually going through +the local Antrea gateway. The *Global Virtual MAC* is used as the destination MAC address for all the traffic being +tunneled or routed. -After invoking the ct action, packets will be in the "tracked" (`trk`) state and -all [connection tracking -fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) will be -set to the correct value. Packets will then move on to [ConntrackStateTable]. +This flow serves as the "ARP responder" for the peer Node whose local Pod subnet is `10.10.1.0/24`. 
If we were to look +at the routing table for the local Node, we would find the following "onlink" route: -Refer to [this -document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for -more information on connection tracking in OVS. +```text +10.10.1.0/24 via 10.10.1.1 dev antrea-gw0 onlink +``` -### ConntrackStateTable (31) +A similar route is installed on the local Antrea gateway (antrea-gw0) interface every time the Antrea *Node Route Controller* +is notified that a new Node has joined the cluster. The route must be marked as "onlink" since the kernel does not have +a route to the peer gateway `10.10.1.1`. We "trick" the kernel into believing that `10.10.1.1` is directly connected to +the local Node, even though it is on the other side of the tunnel. -This table handles "tracked" packets (packets which are moved to the tracked -state by the previous table [ConntrackTable]) and "untracked" packets (packets -is not in tracked state). +Flow 2 is designed for case 2, ensuring that OVS handles the remainder of ARP traffic as a regular L2 learning switch +(using the `normal` action). In particular, this takes care of forwarding ARP requests and replies among local Pods. -This table serves the following purposes: +Flow 3 is the table-miss flow, which should never be used since ARP packets will be matched by either flow 1 or 2. -* For tracked Service packets, bit 19 in NXM_NX_REG0 will be set to 0x1, then - the tracked packet will be forwarded to [EgressRuleTable] directly. -* Drop packets reported as invalid by conntrack. -* Non-Service tracked packets goes to [EgressRuleTable] directly. -* Untracked packets goes to [SessionAffinityTable] and [ServiceLBTable]. +### Classifier -If you dump the flows for this table, you should see the following: +This table is designed to determine the "category" of IP packets by matching on their ingress port. It addresses +specific cases: + +1. 
Packets originating from the local Node through the local Antrea gateway port, requiring IP spoof legitimacy + verification. +2. Packets originating from the external network through the Antrea gateway port. +3. Packets received through an overlay tunnel. +4. Packets received through a return port defined in a user-provided TrafficControl CR (for feature `TrafficControl`). +5. Packets returned from an application-aware engine through a specific port (for feature `L7NetworkPolicy`). +6. Packets originating from local Pods, requiring IP spoof legitimacy verification. + +If you dump the flows of this table, you may see the following: ```text -1. table=31, priority=200,ct_state=-new+trk,ct_mark=0x21,ip actions=load:0x1->NXM_NX_REG0[19],resubmit(,50) -2. table=31, priority=190,ct_state=+inv+trk,ip actions=drop -3. table=31, priority=190,ct_state=-new+trk,ip actions=resubmit(,50) -4. table=31, priority=0 actions=resubmit(,40),resubmit(,41) +1. table=Classifier, priority=210,ip,in_port="antrea-gw0",nw_src=10.10.0.1 actions=set_field:0x2/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +2. table=Classifier, priority=200,in_port="antrea-gw0" actions=set_field:0x2/0xf->reg0,set_field:0x8000000/0x8000000->reg4,goto_table:SpoofGuard +3. table=Classifier, priority=200,in_port="antrea-tun0" actions=set_field:0x1/0xf->reg0,set_field:0x200/0x200->reg0,goto_table:UnSNAT +4. table=Classifier, priority=200,in_port="antrea-tc-tap2" actions=set_field:0x6/0xf->reg0,goto_table:L3Forwarding +5. table=Classifier, priority=200,in_port="antrea-l7-tap1",vlan_tci=0x1000/0x1000 actions=pop_vlan,set_field:0x6/0xf->reg0,goto_table:L3Forwarding +6. table=Classifier, priority=190,in_port="client-6-3353ef" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +7. table=Classifier, priority=190,in_port="web-7975-274540" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +8. 
table=Classifier, priority=190,in_port="db-755c6-5080e3" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +9. table=Classifier, priority=0 actions=drop ``` -Flow 1 is used to forward tracked Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. -The flow also sets bit 19 in NXM_NX_REG0 to 0x1, which indicates that the destination -and source MACs of the matched packets should be rewritten in [l3ForwardingTable]. +Flow 1 is designed for case 1, matching the source IP address `10.10.0.1` to ensure that the packets are originating from +the local Antrea gateway. The following reg marks are loaded: + +- `FromGatewayRegMark`, indicating that the packets are received on the local Antrea gateway port, which will be + consumed in tables [L3Forwarding], [L3DecTTL], [SNATMark] and [SNAT]. +- `FromLocalRegMark`, indicating that the packets are from the local Node, which will be consumed in table [ServiceLB]. + +Flow 2 is designed for case 2, matching packets originating from the external network through the Antrea gateway port +and forwarding them to table [SpoofGuard]. Since packets originating from the local Antrea gateway are matched by flow +1, flow 2 can only match packets originating from the external network. The following reg marks are loaded: + +- `FromGatewayRegMark`, the same as flow 1. +- `FromExternalRegMark`, indicating that the packets are from the external network, not the local Node. + +Flow 3 is for case 3, matching packets through an overlay tunnel (i.e., from another Node) and forwarding them to table +[UnSNAT]. This approach is based on the understanding that these packets originate from remote Nodes, potentially +bearing varying source IP addresses. These packets undergo legitimacy verification before being tunneled. As a consequence, +packets from the tunnel should be seamlessly forwarded to table [UnSNAT]. 
The following reg marks are loaded: + +- `FromTunnelRegMark`, indicating that the packets are received on a tunnel, consumed in table [L3Forwarding]. +- `RewriteMACRegMark`, indicating that the source and destination MAC addresses of the packets should be rewritten, + and consumed in table [L3Forwarding]. + +Flow 4 is for case 4, matching packets from a TrafficControl return port and forwarding them to table [L3Forwarding] +to decide the egress port. It's important to note that a forwarding decision for these packets was already made before +redirecting them to the TrafficControl target port in table [Output], and at this point, the source and destination MAC +addresses of these packets have already been set to the correct values. The only purpose of forwarding the packets to +table [L3Forwarding] is to load the tunnel destination IP for packets destined for remote Nodes. This ensures that the +returned packets destined for remote Nodes are forwarded through the tunnel. `FromTCReturnRegMark`, which will be used +in table [TrafficControl], is loaded to mark the packet source. -Flow 2 is used to drop packets which is reported as invalid by conntrack. +Flow 5 is for case 5, matching packets returned back from an application-aware engine through a specific port, stripping +the VLAN ID used by the application-aware engine, and forwarding them to table [L3Forwarding] to decide the egress port. +Like flow 4, the purpose of forwarding the packets to table [L3Forwarding] is to load the tunnel destination IP for +packets destined for remote Nodes, and `FromTCReturnRegMark` is also loaded. -Flow 3 is used to forward tracked non-Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. +Flows 6-8 are for case 6, matching packets from local Pods and forwarding them to table [SpoofGuard] to do legitimacy +verification. 
The following reg marks are loaded: -Flow 4 is used to match the first packet of untracked connection and forward it to -[SessionAffinityTable] and [ServiceLBTable]. +- `FromPodRegMark`, indicating that the packets are received on the ports connected to the local Pods, consumed in + tables [L3Forwarding] and [SNATMark]. +- `FromLocalRegMark`, indicating that the packets are from the local Pods, consumed in table [ServiceLB]. -### SessionAffinityTable (40) +Flow 9 is the table-miss flow to drop packets that are not matched by flows 1-8. -If `service.spec.sessionAffinity` of a Service is `None`, this table will set the value -of bits [16..18] in NXM_NX_REG4 to 0b001, which indicates that the Service needs to do -Endpoint selection. If you dump the flow, you should see the flow: +### SpoofGuard + +This table is crafted to prevent IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. It +addresses specific cases: + +1. Allowing all packets from the local Antrea gateway. We do not perform checks for this interface as we need to accept + external traffic with a source IP address that does not match the gateway IP. +2. Ensuring that the source IP and MAC addresses are correct, i.e., matching the values configured on the interface when + Antrea sets up networking for a Pod. + +If you dump the flows of this table, you may see the following: ```text -table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. table=SpoofGuard, priority=200,ip,in_port="antrea-gw0" actions=goto_table:UnSNAT +2. table=SpoofGuard, priority=200,ip,in_port="client-6-3353ef",dl_src=5e:b5:e3:a6:90:b7,nw_src=10.10.0.26 actions=goto_table:UnSNAT +3. table=SpoofGuard, priority=200,ip,in_port="web-7975-274540",dl_src=fa:b7:53:74:21:a6,nw_src=10.10.0.24 actions=goto_table:UnSNAT +4. table=SpoofGuard, priority=200,ip,in_port="db-755c6-5080e3",dl_src=36:48:21:a2:9d:b4,nw_src=10.10.0.25 actions=goto_table:UnSNAT +5. 
table=SpoofGuard, priority=0 actions=drop ``` -If `service.spec.sessionAffinity` of a Service is `ClientIP`, when a client accesses -the Service for the first time, a learned flow with hard timeout which equals -`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` of the Service will be -generated in this table. This will be explained in detail in chapter [ServiceLBTable]. +Flow 1 is for case 1, matching packets received on the local Antrea gateway port without checking the source IP and MAC +addresses. There are some cases where the source IP of the packets through the local Antrea gateway port is not the local +Antrea gateway IP address: + +- When Antrea is deployed with kube-proxy, and `AntreaProxy` is not enabled, packets from local Pods destined for Services + will first go through the gateway port, get load-balanced by the kube-proxy data path (undergoing DNAT) then re-enter + the OVS pipeline through the gateway port (through an "onlink" route, installed by Antrea, directing the DNAT'd packets + to the gateway port), resulting in the source IP being that of a local Pod. +- When Antrea is deployed without kube-proxy, and both `AntreaProxy` and `proxyAll` are enabled, packets from the external + network destined for Services will be routed to OVS through the gateway port without masquerading source IP. +- When Antrea is deployed with kube-proxy, packets from the external network destined for Services whose + `externalTrafficPolicy` is set to `Local` will get load-balanced by the kube-proxy data path (undergoing DNAT with a + local Endpoint selected by the kube-proxy) and then enter the OVS pipeline through the gateway (through an "onlink" + route, installed by Antrea, directing the DNAT'd packets to the gateway port) without masquerading source IP. -### ServiceLBTable (41) +Flows 2-4 are for case 2, matching legitimate IP packets from local Pods. -This table is used to implement Service Endpoint selection. 
Note that, currently, only -ClusterIP Service request from Pods is supported. NodePort, LoadBalancer and ClusterIP -whose client is from K8s Node will be supported in the future. +Flow 5 is the table-miss flow to drop IP spoofing packets. -When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `None`, if you -dump the flows, you should see the following flow: +### UnSNAT + +This table is used to undo SNAT on reply packets by invoking action `ct` on them. The packets are from SNAT'd Service +connections that have been committed to `SNATCtZone` in table [SNAT]. After invoking action `ct`, the packets will be +in a "tracked" state, restoring all [connection tracking +fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) (such as `ct_state`, `ct_mark`, `ct_label`, etc.) +to their original values. The packets with a "tracked" state are then forwarded to table [ConntrackZone]. + +If you dump the flows of this table, you may see the following: ```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 +1. table=UnSNAT, priority=200,ip,nw_dst=169.254.0.253 actions=ct(table=ConntrackZone,zone=65521,nat) +2. table=UnSNAT, priority=200,ip,nw_dst=10.10.0.1 actions=ct(table=ConntrackZone,zone=65521,nat) +3. table=UnSNAT, priority=0 actions=goto_table:ConntrackZone ``` -Among the match conditions of the above flow: +Flow 1 matches reply packets for Service connections which were SNAT'd with the *Virtual Service IP* `169.254.0.253` +and invokes action `ct` on them. + +Flow 2 matches packets for Service connections which were SNAT'd with the local Antrea gateway IP `10.10.0.1` and +invokes action `ct` on them. This flow also matches request packets destined for the local Antrea gateway IP from +local Pods by accident. 
However, this is harmless since such connections will never be committed to `SNATCtZone`, and +therefore, connection tracking fields for the packets are unset. + +Flow 3 is the table-miss flow. -* `reg4=0x10000/0x70000`, value of bits [16..18] in NXM_NX_REG4 is 0b001, which is used - to match Service packet whose state is to do Endpoint selection. The value of - bits [16..18] in NXM_NX_REG4 is set in [SessionAffinityTable] by flow `table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18]`. +For reply packets from SNAT'd connections, whose destination IP is the translated SNAT IP, after invoking action `ct`, +the destination IP of the packets will be restored to the original IP before SNAT, stored in the connection tracking +field `ct_nw_dst`. -The actions of the above flow: +### ConntrackZone -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 - to 0b002, which indicates that Endpoint selection "is performed". Note that, Endpoint - selection has not really been done yet - it will be done by group action. The current - action should have been done in target OVS group entry after Endpoint selection. However, - we set the bits here, for the purpose of supporting more Endpoints in an OVS group. - Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, - which means that the source and destination MACs need to be rewritten. -* `group:5` is used to set the target OVS group. Note that, the target group needs to be - created first before the flow is created. +The main purpose of this table is to invoke action `ct` on packets from all connections. After invoking `ct` action, +packets will be in a "tracked" state, restoring all connection tracking fields to their appropriate values. 
When invoking +action `ct` with `CtZone` to the packets that have a "tracked" state associated with `SNATCtZone`, then the "tracked" +state associated with `SNATCtZone` will be inaccessible. This transition occurs because the "tracked" state shifts to +another state associated with `CtZone`. A ct zone is similar in spirit to the more generic Linux network namespaces, +uniquely containing a "tracked" state within each ct zone. -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +If you dump the flows of this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42) +1. table=ConntrackZone, priority=200,ip actions=ct(table=ConntrackState,zone=65520,nat) +2. table=ConntrackZone, priority=0 actions=goto_table:ConntrackState ``` -For the above OVS group, there are three buckets which have the same weight. Every bucket -has the same chance to be selected since they have the same weight. The selected bucket -will load Endpoint IPv4 address to NXM_NX_REG3, Endpoint port number to bits [0..15] -in NXM_NX_REG4. Then the matched packet will be resubmitted to [EndpointDNATTable]. +Flow 1 invokes `ct` action on packets from all connections, and the packets are then forwarded to table [ConntrackState] +with the "tracked" state associated with `CtZone`. Note that for packets in an established Service (DNATed) connection, +not the first packet of a Service connection, DNAT or un-DNAT is performed on them before they are forwarded. 
-When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `ClientIP`, you may -see the following flows: +Flow 2 is the table-miss flow that should remain unused. + +### ConntrackState + +This table handles packets from the connections that have a "tracked" state associated with `CtZone`. It addresses +specific cases: + +1. Dropping invalid packets reported by conntrack. +2. Forwarding tracked packets from all connections to table [AntreaPolicyEgressRule] directly, bypassing the tables + like [PreRoutingClassifier], [NodePortMark], [SessionAffinity], [ServiceLB], and [EndpointDNAT] for Service Endpoint + selection. +3. Forwarding packets from new connections to table [PreRoutingClassifier] to start Service Endpoint selection since + Service connections are not identified at this stage. + +If you dump the flows of this table, you may see the following: ```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x3->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 -2. table=41, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=\ - learn(table=40,hard_timeout=300,priority=200,delete_learned,cookie=0x2040000000008, \ - eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],\ - load:NXM_NX_REG3[]->NXM_NX_REG3[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19]),\ - load:0x2->NXM_NX_REG4[16..18],\ - resubmit(,42) +1. table=ConntrackState, priority=200,ct_state=+inv+trk,ip actions=drop +2. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0/0x10,ip actions=goto_table:AntreaPolicyEgressRule +3. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0x10/0x10,ip actions=set_field:0x200/0x200->reg0,goto_table:AntreaPolicyEgressRule +4. 
table=ConntrackState, priority=0 actions=goto_table:PreRoutingClassifier ``` -When a client (assumed that the source IP is 10.10.0.2) accesses the ClusterIP for the first -time, the first packet of the connection will be matched by flow 1. Note that the action -`load:0x3->NXM_NX_REG4[16..18]` indicates that the Service Endpoint selection result needs -to be cached. +Flow 1 is for case 1, dropping invalid packets. + +Flow 2 is for case 2, matching packets from non-Service connections with `NotServiceCTMark` and forwarding them to +table [AntreaPolicyEgressRule] directly, bypassing the tables for Service Endpoint selection. + +Flow 3 is also for case 2, matching packets from Service connections with `ServiceCTMark` loaded in table +[EndpointDNAT] and forwarding them to table [AntreaPolicyEgressRule], bypassing the tables for Service Endpoint +selection. `RewriteMACRegMark`, which is used in table [L3Forwarding], is loaded in this flow, indicating that the +source and destination MAC addresses of the packets should be rewritten. + +Flow 4 is the table-miss flow for case 3, matching packets from all new connections and forwarding them to table +[PreRoutingClassifier] to start the processing of Service Endpoint selection. + +### PreRoutingClassifier -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +This table handles the first packet from uncommitted Service connections before Service Endpoint selection. It +sequentially resubmits the packets to tables [NodePortMark] and [SessionAffinity] to do some pre-processing, including +the loading of specific reg marks. Subsequently, it forwards the packets to table [ServiceLB] to perform Service Endpoint +selection. 
+ +If you dump the flows of this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41) +1. table=PreRoutingClassifier, priority=200,ip actions=resubmit(,NodePortMark),resubmit(,SessionAffinity),resubmit(,ServiceLB) +2. table=PreRoutingClassifier, priority=0 actions=goto_table:NodePortMark ``` -Note the action `resubmit(,41)` resubmits the first packet of a ClusterIP Service connection -back to [ServiceLBTable], not resubmits the packet to [EndpointDNATTable]. Then the -packet will be matched by flow 2 since value of bits [16..18] in NXM_NX_REG4 is 0b011. One -action of the flow is to generate a learned flow in [SessionAffinityTable], the other -action is to resubmit the packet to [EndpointDNATTable]. +Flow 1 sequentially resubmits packets to tables [NodePortMark], [SessionAffinity], and [ServiceLB]. Note that packets +are ultimately forwarded to table [ServiceLB]. In tables [NodePortMark] and [SessionAffinity], only reg marks are loaded. + +Flow 2 is the table-miss flow that should remain unused. + +### NodePortMark + +This table is designed to potentially mark packets destined for NodePort Services. It is only created when `proxyAll` is +enabled. -Now if you dump flows of table [SessionAffinityTable], you may see the following flows: +If you dump the flows of this table, you may see the following: ```text -1. table=40, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.2,nw_dst=10.107.100.231,tp_dst=443 \ - actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19] -2. table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. 
table=NodePortMark, priority=200,ip,nw_dst=192.168.77.102 actions=set_field:0x80000/0x80000->reg4 +2. table=NodePortMark, priority=200,ip,nw_dst=169.254.0.252 actions=set_field:0x80000/0x80000->reg4 +3. table=NodePortMark, priority=0 actions=goto_table:SessionAffinity ``` -Note that, flow 1 (the generated learned flow) has higher priority than flow 2 in table -[SessionAffinityTable]. When a particular client accesses the ClusterIP once again, the first -packet of the connection will be matched by flow 1 due to the match condition `nw_src=10.10.0.2`. +Flow 1 matches packets destined for the local Node from local Pods. `NodePortRegMark` is loaded, indicating that the +packets are potentially destined for NodePort Services. We assume only one valid IP address, `192.168.77.102` (the +Node's transport IP), can serve as the host IP address for NodePort based on the option `antreaProxy.nodePortAddresses`. +If there are multiple valid IP addresses specified in the option, a flow similar to flow 1 will be installed for each +IP address. -The actions of flow 1: +Flow 2 matches packets destined for the *Virtual NodePort DNAT IP*. Packets destined for NodePort Services from the local +Node or the external network are DNAT'd to the *Virtual NodePort DNAT IP* by iptables before entering the pipeline. -* `load:0xa0a0004->NXM_NX_REG3[]` is used to load Endpoint IPv4 address to NXM_NX_REG3. -* `load:0x50->NXM_NX_REG4[0..15]` is used to load Endpoint port number to bits [0..15] in - NXM_NX_REG4. -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 to - 0b010, which indicates that the Service has done Endpoint selection. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, which - indicates that the source and destination MACs need to be rewritten. +Flow 3 is the table-miss flow. 
-Note that, if the value of bits [16..18] in NXM_NX_REG4 is 0b010 (set by action `load:0x2->NXM_NX_REG4[16..18]` -in table [SessionAffinityTable]), then packet will not be matched by any flows in table -[ServiceLBTable] except the last one. The last one just forwards the packet to table -[EndpointDNATTable] without selecting target OVS group. Then connections from a particular -client will always access the same backend Pod within the session timeout setting by -`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds`. +Note that packets of NodePort Services have not been identified in this table by matching destination IP address. The +identification of NodePort Services will be done finally in table [ServiceLB] by matching `NodePortRegMark` and +the specific destination port of a NodePort. -### EndpointDNATTable (42) +### SessionAffinity -The table implements DNAT for Service traffic after Endpoint selection for the first -packet of a Service connection. +This table is designed to implement Service session affinity. The learned flows that cache the information of the +selected Endpoints are installed here. -If you dump the flows for this table, you should see flows like the following: +If you dump the flows of this table, you may see the following: ```text -1. table=42, priority=200,tcp,reg3=0xc0a84d64,reg4=0x2192b/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.100:6443),exec(load:0x21->NXM_NX_CT_MARK[])) -2. table=42, priority=200,tcp,reg3=0xc0a84d65,reg4=0x2286d/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.101:10349),exec(load:0x21->NXM_NX_CT_MARK[])) -3. table=42, priority=200,tcp,reg3=0xa0a0004,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.4:80),exec(load:0x21->NXM_NX_CT_MARK[])) -4. table=42, priority=200,tcp,reg3=0xa0a0102,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.1.2:80),exec(load:0x21->NXM_NX_CT_MARK[])) -5. 
table=42, priority=200,udp,reg3=0xa0a0002,reg4=0x20035/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.2:53),exec(load:0x21->NXM_NX_CT_MARK[])) -6. table=42, priority=190,reg4=0x20000/0x70000 actions=load:0x1->NXM_NX_REG4[16..18],resubmit(,41) -7. table=42, priority=0 actions=resubmit(,45) +1. table=SessionAffinity, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.1,nw_dst=10.96.76.15,tp_dst=80 \ + actions=set_field:0x50/0xffff->reg4,set_field:0/0x4000000->reg4,set_field:0xa0a0001->reg3,set_field:0x20000/0x70000->reg4,set_field:0x200/0x200->reg0 +2. table=SessionAffinity, priority=0 actions=set_field:0x10000/0x70000->reg4 ``` -For flow 1-5, DNAT is performed with the IPv4 address stored in NXM_NX_REG3 and port number stored in -bits[0..15] in NXM_NX_REG4 by `ct commit` action. Note that, the match condition `reg4=0x2192b/0x7ffff` -is a union value. The value of bits [0..15] is port number. The value of bits [16..18] is 0b010, -which indicates that Service has done Endpoint selection. Service ct_mark `0x21` is also marked. +Flow 1 is a learned flow generated by flow 7 in table [ServiceLB], designed for the sample Service [ClusterIP with +Session Affinity], to implement Service session affinity. Here are some details about the flow: -If none of the flows described above are hit, flow 6 is used to forward packet back to table [ServiceLBTable] -to select Endpoint again. +- The "hard timeout" of the learned flow should be equal to the value of + `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` defined in the Service. This means that until the hard + timeout expires, this flow is present in the pipeline, and the session affinity of the Service takes effect. Unlike an + "idle timeout", the "hard timeout" does not reset whenever the flow is matched. 
+- Source IP address, destination IP address, destination port, and transport protocol are used to match packets of + connections sourced from the same client and destined for the Service during the affinity time window. +- Endpoint IP address and Endpoint port are loaded into `EndpointIPField` and `EndpointPortField` respectively. +- `EpSelectedRegMark` is loaded, indicating that the Service Endpoint selection is done, and ensuring that the packets + will only match the last flow in table [ServiceLB]. +- `RewriteMACRegMark`, which will be consumed in table [L3Forwarding], is loaded here, indicating that the source and + destination MAC addresses of the packets should be rewritten. -Flow 7 is used to match non-Service packet. +Flow 2 is the table-miss flow to match the first packet of connections destined for Services. It loads +`EpToSelectRegMark`, to be consumed in table [ServiceLB], indicating that the packet needs to do Service Endpoint +selection. -### AntreaPolicyEgressRuleTable (45) +### ServiceLB -For this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) -that we are using. +This table is used to implement Service Endpoint selection. It addresses specific cases: -This table is used to implement the egress rules across all Antrea-native policies, -except for policies that are created in the Baseline Tier. Antrea-native policies -created in the Baseline Tier will be enforced after K8s NetworkPolicies, and their -egress rules are installed in the [EgressDefaultTable] and [EgressRuleTable] -respectively, i.e. +1. ClusterIP, as demonstrated in the examples [ClusterIP without Endpoint] and [ClusterIP]. +2. NodePort, as demonstrated in the example [NodePort]. +3. LoadBalancer, as demonstrated in the example [LoadBalancer]. +4. Service configured with external IPs, as demonstrated in the example [Service with ExternalIP]. +5. 
Service configured with session affinity, as demonstrated in the example [Service with session affinity]. +6. Service configured with externalTrafficPolicy to `Local`, as demonstrated in the example [Service with + ExternalTrafficPolicy Local]. + +If you dump the flows of this table, you may see the following: ```text -Baseline Tier -> EgressDefaultTable(60) -K8s NetworkPolicy -> EgressRuleTable(50) -All other Tiers -> AntreaPolicyEgressRuleTable(45) +1. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.101.255.29,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x9->reg7,group:9 +2. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.105.31.235,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xc->reg7,group:10 +3. table=ServiceLB, priority=200,tcp,reg4=0x90000/0xf0000,tp_dst=30004 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x200000/0x200000->reg4,set_field:0xc->reg7,group:12 +4. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=192.168.77.150,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xe->reg7,group:14 +5. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=192.168.77.200,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x10->reg7,group:16 +6. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x30000/0x70000->reg4,set_field:0xa->reg7,group:11 +7. 
table=ServiceLB, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=learn(table=SessionAffinity,hard_timeout=300,priority=200,delete_learned,cookie=0x203000000000a,\ + eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:NXM_NX_REG4[26]->NXM_NX_REG4[26],load:NXM_NX_REG3[]->NXM_NX_REG3[],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[9]),\ + set_field:0x20000/0x70000->reg4,goto_table:EndpointDNAT +8. table=ServiceLB, priority=210,tcp,reg4=0x10010000/0x10070000,nw_dst=192.168.77.151,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x11->reg7,group:17 +9. table=ServiceLB, priority=200,tcp,nw_dst=192.168.77.151,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x12->reg7,group:18 +10. table=ServiceLB, priority=0 actions=goto_table:EndpointDNAT ``` -Since the example ACNP resides in the Application tier, if you dump the flows for -table 45, you should see something like this: +Flow 1 and flow 2 are designed for case 1, matching the first packet of connections destined for the sample [ClusterIP +without Endpoint] or [ClusterIP]. This is achieved by matching `EpToSelectRegMark` loaded in table [SessionAffinity], +clusterIP, and port. The target of the packet matched by the flow is an OVS group where the Endpoint will be selected. +Before forwarding the packet to the OVS group, `RewriteMACRegMark`, which will be consumed in table [L3Forwarding], is +loaded, indicating that the source and destination MAC addresses of the packets should be rewritten. `EpSelectedRegMark` +, which will be consumed in table [EndpointDNAT], is also loaded, indicating that the Endpoint is selected. Note that the +Service Endpoint selection is not completed yet, as it will be done in the target OVS group. -```text -1. table=45, priority=64990,ct_state=-new+est,ip actions=resubmit(,61) -2. 
table=45, priority=14000,conj_id=1,ip actions=load:0x1->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x1->NXM_NX_CT_LABEL[32..63])) -3. table=45, priority=14000,ip,nw_src=10.10.1.6 actions=conjunction(1,1/3) -4. table=45, priority=14000,ip,nw_dst=10.10.1.8 actions=conjunction(1,2/3) -5. table=45, priority=14000,udp,tp_dst=53 actions=conjunction(1,3/3) -6. table=45, priority=0 actions=resubmit(,50) -``` +Flow 3 is for case 2, matching the first packet of connections destined for the sample [NodePort]. This is achieved by +matching `EpToSelectRegMark` loaded in table [SessionAffinity], `NodePortRegMark` loaded in table [NodePortMark], and +NodePort port. Similar to flows 1-2, `RewriteMACRegMark` and `EpSelectedRegMark` are also loaded. -Similar to [K8s NetworkPolicy implementation](#egressruletable-50), -AntreaPolicyEgressRuleTable also relies on the OVS built-in `conjunction` action to -implement policies efficiently. +Flow 4 is for case 3, processing the first packet of connections destined for the ingress IP of the sample +[LoadBalancer], similar to flow 1. -The above example flows read as follow: if the source IP address is in set -{10.10.1.6}, and the destination IP address is in the set {10.10.1.8}, and the -destination TCP port is in the set {53}, then use the `conjunction` action with -id 1, which stores the `conj_id` 1 in `ct_label[32..63]` for egress metrics collection -purposes, and forwards the packet to EgressMetricsTable, then [L3ForwardingTable]. -Otherwise, go to [EgressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy egress rules in any tier (except for the "baseline" tier). +Flow 5 is for case 4, processing the first packet of connections destined for the external IP of the sample [Service +with ExternalIP], similar to flow 1. 
-If the `conjunction` action is matched, packets are "allowed" or "dropped" -based on the `action` field of the policy rule. If allowed, they follow a similar -path as described in the following [EgressRuleTable] section. +Flow 6 is the initial process for case 5, matching the first packet of connections destined for the sample [Service with +Session Affinity]. This is achieved by matching the conditions similar to flow 1. Like flow 1, the target of the flow is +also an OVS group, and `RewriteMACRegMark` is loaded. The difference is that `EpToLearnRegMark` is loaded, rather than +`EpSelectedRegMark`, indicating that the selected Endpoint needs to be cached. -Unlike the default of K8s NetworkPolicies, Antrea-native policies have no such -default rules. Hence, they are evaluated as-is, and there is no need for a -AntreaPolicyEgressDefaultTable. +Flow 7 is the final process for case 5, matching the packet previously matched by flow 6, resubmitted back from the target OVS +group after selecting an Endpoint. Then a learned flow will be generated in table [SessionAffinity] to match the packets +of the subsequent connections from the same client IP, ensuring that the packets are always forwarded to the same Endpoint +selected the first time. `EpSelectedRegMark`, which will be consumed in table [EndpointDNAT], is loaded, indicating that +Service Endpoint selection has been done. -### EgressRuleTable (50) +Flow 8 and flow 9 are for case 6. Flow 8 has higher priority than flow 9, prioritizing matching the first +packet of connections sourced from a local Pod or the local Node with `FromLocalRegMark` loaded in table [Classifier] +and destined for the sample [Service with ExternalTrafficPolicy Local]. The target of flow 8 is an OVS group that has +all the Endpoints across the cluster, ensuring accessibility for Service connections originating from local Pods or +Nodes, even though `externalTrafficPolicy` is set to `Local` for the Service. 
Due to the existence of flow 8, consequently, +flow 9 exclusively matches packets sourced from the external network, resembling the pattern of flow 1. The target of +flow 9 is an OVS group that has only the local Endpoints since `externalTrafficPolicy` of the Service is `Local`. -For this table, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. +Flow 10 is the table-miss flow. -This table is used to implement the egress rules across all Network Policies. If -you dump the flows for this table, you should see something like this: +As mentioned above, the Service Endpoint selection is performed within OVS groups. 3 typical OVS groups are listed below: ```text -1. table=50, priority=210,ct_state=-new+est,ip actions=goto_table:70 -2. table=50, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(2,1/3) -3. table=50, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(2,1/3) -4. table=50, priority=200,ip,nw_dst=10.10.1.2 actions=conjunction(2,2/3) -5. table=50, priority=200,ip,nw_dst=10.10.1.3 actions=conjunction(2,2/3) -6. table=50, priority=200,tcp,tp_dst=80 actions=conjunction(2,3/3) -7. table=50, priority=190,conj_id=2,ip actions=load:0x2->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x2->NXM_NX_CT_LABEL[32..63])) -8. table=50, priority=0 actions=goto_table:60 +1. group_id=9,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0x4000/0x4000->reg0,resubmit(,EndpointDNAT) +2. group_id=10,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0018->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT),\ + bucket=bucket_id:1,weight:100,actions=set_field:0x4000000/0x4000000->reg4,set_field:0xa0a0106->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT) +3. 
group_id=11,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0018->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB),\ + bucket=bucket_id:1,weight:100,actions=set_field:0x4000000/0x4000000->reg4,set_field:0xa0a0106->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB) ``` -Notice how we use the OVS built-in `conjunction` action to implement policies -efficiently. This enables us to do a conjunctive match across multiple -dimensions (source IP, destination IP, port) efficiently without "exploding" the -number of flows. By definition of a conjunctive match, we have at least 2 -dimensions. For our use-case we have at most 3 dimensions. - -The only requirements on `conj_id` is for it to be a unique 32-bit integer -within the table. At the moment we use a single custom allocator, which is -common to all tables that can have NetworkPolicy flows installed (45, 50, -60, 85, 90 and 100). This is why `conj_id` is set to 2 in the above example -(1 was allocated for the egress rule of our Antrea-native NetworkPolicy example -in the previous section). - -The above example flows read as follow: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination IP address is in the set {10.10.1.2, -10.10.1.3}, and the destination TCP port is in the set {80}, then use the -`conjunction` action with id 2, which goes to [EgressMetricsTable], and then -[L3ForwardingTable]. Otherwise, packet goes to [EgressDefaultTable]. - -If the Network Policy specification includes exceptions (`except` field), then -the table will include multiple flows with conjunctive match, corresponding to -each CIDR that is present in `from` or `to` fields, but not in `except` field. -Network Policy implementation details are not covered in this document. - -If the `conjunction` action is matched, packets are "allowed" and forwarded -directly to [L3ForwardingTable]. Other packets go to [EgressDefaultTable]. 
If a -connection is established - as a reminder all connections are committed in -[ConntrackCommitTable] - its packets go straight to [L3ForwardingTable], with no -other match required (see flow 1 above, which has the highest priority). In -particular, this ensures that reply traffic is never dropped because of a -Network Policy rule. However, this also means that ongoing connections are not -affected if the K8s Network Policies are updated. - -One thing to keep in mind is that for Service traffic, these rules are applied -after the packets have gone through the local gateway and through kube-proxy. At -this point the ingress port is no longer the Pod port, but the local gateway -port. Therefore we cannot use the port as the match condition to identify if the -Pod has been applied a Network Policy - which is what we do for the -[IngressRuleTable] -, but instead have to use the source IP address. - -### EgressDefaultTable (60) - -This table complements [EgressRuleTable] for Network Policy egress rule -implementation. In K8s, when a Network Policy is applied to a set of Pods, the -default behavior for these Pods become "deny" (it becomes an [isolated Pod]( -https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). -This table is in charge of dropping traffic originating from Pods to which a Network -Policy (with an egress rule) is applied, and which did not match any of the -allowlist rules. - -Accordingly, based on our Network Policy example, we would expect to see flows -to drop traffic originating from our 2 Pods (10.10.1.2 and 10.10.1.3), which is -confirmed by dumping the flows: +The first group with `group_id` 9 is the destination of packets matched by flow 1, designed for a Service without +Endpoints. The group only has a single bucket where `SvcNoEpRegMark` which will be used in table [EndpointDNAT] is +loaded, indicating that the Service has no Endpoint, and then packets are forwarded to table [EndpointDNAT]. 
-```text -1. table=60, priority=200,ip,nw_src=10.10.1.2 actions=drop -2. table=60, priority=200,ip,nw_src=10.10.1.3 actions=drop -3. table=60, priority=0 actions=goto_table:61 -``` +The second group with `group_id` 10 is the destination of packets matched by flow 2, designed for a Service with +Endpoints. The group has 2 buckets, indicating the availability of 2 selectable Endpoints. Each bucket has an equal +chance of being chosen since they have the same weights. For every bucket, the Endpoint IP and Endpoint port are loaded +into `EndpointIPField` and `EndpointPortField`, respectively. These loaded values will be consumed in table +[EndpointDNAT] to which the packets are forwarded and in which DNAT will be performed. `RemoteEndpointRegMark` is loaded +for remote Endpoints, like the bucket with `bucket_id` 1 in this group. + +The third group with `group_id` 11 is the destination of packets matched by flow 6, designed for a Service that has +Endpoints and is configured with session affinity. The group closely resembles the group with `group_id` 10, except that +the destination of the packets is table [ServiceLB], rather than table [EndpointDNAT]. After being resubmitted back to table +[ServiceLB], they will be matched by flow 7. + +### EndpointDNAT + +The table implements DNAT for Service connections after Endpoint selection is performed in table [ServiceLB]. -This table is also used to implement Antrea-native policy egress rules that are -created in the Baseline Tier. Since the Baseline Tier is meant to be enforced -after K8s NetworkPolicies, the corresponding flows will be created at a lower -priority than K8s default drop flows. For example, a baseline rule to drop -egress traffic to 10.0.10.0/24 for a Namespace will look like the following: +If you dump the flows of this table, you may see the following: ```text -1. table=60, priority=80,ip,nw_src=10.10.1.11 actions=conjunction(5,1/2) -2.
table=60, priority=80,ip,nw_src=10.10.1.10 actions=conjunction(5,1/2) -3. table=60, priority=80,ip,nw_dst=10.0.10.0/24 actions=conjunction(5,2) -4. table=60, priority=80,conj_id=5,ip actions=load:0x3->NXM_NX_REG5[],load:0x1->NXM_NX_REG0[20],resubmit(,61) +1. table=EndpointDNAT, priority=200,reg0=0x4000/0x4000 actions=controller(reason=no_match,id=62373,userdata=04) +2. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0018,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.24:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +3. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0106,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.1.6:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +4. table=EndpointDNAT, priority=190,reg4=0x20000/0x70000 actions=set_field:0x10000/0x70000->reg4,resubmit(,ServiceLB) +5. table=EndpointDNAT, priority=0 actions=goto_table:AntreaPolicyEgressRule ``` -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table EgressMetricsTable, then ([L3ForwardingTable]). +Flow 1 is designed for Services without Endpoints. It identifies the first packet of connections destined for such Services +by matching `SvcNoEpRegMark`. Subsequently, the packet is forwarded to the OpenFlow controller (Antrea Agent). For TCP +Service traffic, the controller will send a TCP RST, and for all other cases the controller will send an ICMP Destination +Unreachable message. -### L3ForwardingTable (70) +Flows 2-3 are designed for Services that have selected an Endpoint. These flows identify the first packet of connections +destined for such Services by matching `EndpointIPField`, which stores the Endpoint IP, and `EpUnionField` (a combination +of `EndpointPortField` storing the Endpoint port and `EpSelectedRegMark`).
Then the `ct` action is invoked on the packet, +performing DNAT and forwarding it to table [AntreaPolicyEgressRule] with the "tracked" state associated with `CtZone`. +Some bits of ct mark are persisted: -This is the L3 routing table. It implements the following functionality: +- `ServiceCTMark`, to be consumed in tables [L3Forwarding] and [ConntrackCommit], indicating that the current packet and + subsequent packets of the connection are for a Service. +- The value of `PktSourceField` is persisted to `ConnSourceCTMarkField`, storing the source of the connection for the + current packet and subsequent packets of the connection. -* Tunnelled traffic coming-in from a peer Node and destined to a local Pod is - directly forwarded to the Pod. This requires setting the source MAC to the MAC - of the local gateway interface and setting the destination MAC to the Pod's - MAC address. Then the packets will go to [L3DecTTLTable] for decrementing - the IP TTL value. Such packets can be identified by bit 19 of the NXM_NX_REG0 - register (which was set to 1 in the [ClassifierTable]) and the destination IP - address (which should match the IP address of a local Pod). We therefore - install one flow for each Pod created locally on the Node. For example: +Flow 4 is to resubmit the packets which are not matched by flows 1-3 back to table [ServiceLB] to select an Endpoint again. -```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.2 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:12:9e:a6:47:d0:70,goto_table:72 -``` +Flow 5 is the table-miss flow to match non-Service packets. -* All tunnelled traffic destined to the local gateway (i.e. for which the - destination IP matches the local gateway's IP) is forwarded to the gateway - port by rewriting the destination MAC (from the Global Virtual MAC to the - local gateway's MAC).
+### AntreaPolicyEgressRule + +This table is used to implement the egress rules across all Antrea-native NetworkPolicies, except for NetworkPolicies +that are created in the Baseline Tier. Antrea-native NetworkPolicies created in the Baseline Tier will be enforced after +K8s NetworkPolicies and their egress rules are installed in tables [EgressDefaultRule] and [EgressRule] respectively, i.e. ```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.1 actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyEgressRule +K8s NetworkPolicy -> EgressRule +Antrea-native NetworkPolicy Baseline Tier -> EgressDefaultRule ``` -* All reply traffic of connections initiated through the gateway port, i.e. for - which the first packet of the connection (SYN packet for TCP) was received - through the gateway. Such packets can be identified by the packet's direction - in `ct_state` and the `ct_mark` value `0x20` which is committed in - [ConntrackCommitTable] when the first packet of the connection was handled. - A flow will overwrite the destination MAC to the local gateway MAC to ensure - that they get forwarded through the gateway port. This is required to handle - the following cases: - - reply traffic for connections from a local Pod to a ClusterIP Service, which - are handled by kube-proxy and go through DNAT. In this case the destination - IP address of the reply traffic is the Pod which initiated the connection to - the Service (no SNAT by kube-proxy). We need to make sure that these packets - are sent back through the gateway so that the source IP can be rewritten to - the ClusterIP ("undo" DNAT). If we do not use connection tracking and do not - rewrite the destination MAC, reply traffic from the backend will go directly - to the originating Pod without going first through the gateway and - kube-proxy. 
This means that the reply traffic will arrive at the originating - Pod with the incorrect source IP (it will be set to the backend's IP instead - of the Service IP). - - when hair-pinning is involved, i.e. connections between 2 local Pods, for - which NAT is performed. One example is a Pod accessing a NodePort Service - for which `externalTrafficPolicy` is set to `Local` using the local Node's - IP address, as there will be no SNAT for such traffic. Another example could - be `hostPort` support, depending on how the feature is implemented. +Antrea-native NetworkPolicy relies on the OVS built-in `conjunction` action to implement policies efficiently. This +enables us to do a conjunctive match across multiple dimensions (source IP, destination IP, port, etc.) efficiently +without "exploding" the number of flows. For our use case, we have at most 3 dimensions. -```text -table=70, priority=210,ct_state=+rpl+trk,ct_mark=0x20,ip actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 -``` +The only requirement of `conj_id` is to be a unique 32-bit integer within the table. At the moment we use a single +custom allocator, which is common to all tables that can have NetworkPolicy flows installed +([AntreaPolicyEgressRule], [EgressRule], [EgressDefaultRule], [AntreaPolicyIngressRule], [IngressRule], and +[IngressDefaultRule]). -* All traffic destined to a remote Pod is forwarded through the appropriate - tunnel. This means that we install one flow for each peer Node, each one - matching the destination IP address of the packet against the Pod subnet for - the Node. In case of a match the source MAC is set to the local gateway MAC, - the destination MAC is set to the Global Virtual MAC and we set the OF - `tun_dst` field to the appropriate value (i.e. the IP address of the remote - gateway). Traffic then goes to [L3DecTTLTable]. 
- For a given peer Node, the flow may look like this: +For this table, you will need to keep in mind the Antrea-native NetworkPolicy +[specification](#antrea-native-networkpolicy-implementation). Since the sample egress policy resides in the Application +Tier, if you dump the flows of this table, you may see the following: ```text -table=70, priority=200,ip,nw_dst=10.10.1.0/24 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80102->NXM_NX_TUN_IPV4_DST[],goto_table:72 +1. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:EgressMetric +2. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:EgressMetric +3. table=AntreaPolicyEgressRule, priority=14500,ip,nw_src=10.10.0.24 actions=conjunction(7,1/3) +4. table=AntreaPolicyEgressRule, priority=14500,ip,nw_dst=10.10.0.25 actions=conjunction(7,2/3) +5. table=AntreaPolicyEgressRule, priority=14500,tcp,tp_dst=3306 actions=conjunction(7,3/3) +6. table=AntreaPolicyEgressRule, priority=14500,conj_id=7,ip actions=set_field:0x7->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x700000000/0xffffffff00000000->ct_label)) +7. table=AntreaPolicyEgressRule, priority=14499,ip,nw_src=10.10.0.24 actions=conjunction(5,1/2) +8. table=AntreaPolicyEgressRule, priority=14499,ip actions=conjunction(5,2/2) +9. table=AntreaPolicyEgressRule, priority=14499,conj_id=5 actions=set_field:0x5->reg3,set_field:0x400/0x400->reg0,goto_table:EgressMetric +10. table=AntreaPolicyEgressRule, priority=0 actions=goto_table:EgressRule ``` -If none of the flows described above are hit, traffic goes directly to -[L2ForwardingCalcTable]. This is the case for external traffic, whose -destination is outside the cluster (such traffic has already been -forwarded to the local gateway by the local source Pod, and only L2 switching -is required), as well as for local Pod-to-Pod traffic.
+Flows 1-2, which are installed by default with the highest priority, match non-new and "tracked" packets and +forward them to table [EgressMetric] to bypass the check from egress rules. This means that if a connection is +established, its packets go straight to table [EgressMetric], with no other match required. In particular, this ensures +that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this +also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is +updated. + +The priorities of flows 3-9 installed for the egress rules are decided by the following: + +- The `spec.tier` value in an Antrea-native NetworkPolicy determines the primary level for flow priority. +- The `spec.priority` value in an Antrea-native NetworkPolicy determines the secondary level for flow priority within + the same `spec.tier`. A lower value in this field corresponds to a higher priority for the flow. +- The rule's position within an Antrea-native NetworkPolicy also influences flow priority. Rules positioned closer to + the beginning have higher priority for the flow. + +Flows 3-6, whose priorities are all 14500, are installed for the egress rule `AllowToDB` in the sample policy. These +flows are described as follows: + +- Flow 3 is used to match packets with the source IP address in set {10.10.0.24}, which has all IP addresses of the Pods + selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 7. +- Flow 4 is used to match packets with the destination IP address in set {10.10.0.25}, which has all IP addresses of + the Pods selected by the label `app: db`, constituting the second dimension for `conjunction` with `conj_id` 7. +- Flow 5 is used to match packets with the destination TCP port in set {3306} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 7. 
+- Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 7 and forward them + to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel`, which will be consumed in table [EgressMetric]. + +Flows 7-9, whose priorities are all 14499, are installed for the egress rule with a `Drop` action defined after the rule +`AllowToDB` in the sample policy, and serve as a default rule. Antrea-native NetworkPolicy does not have the same +default isolated behavior as K8s NetworkPolicy (implemented in the [EgressDefaultRule] table). As soon as a rule is +matched, we apply the corresponding action. If no rule is matched, there is no implicit drop for Pods to which an +Antrea-native NetworkPolicy applies. These flows are described as follows: + +- Flow 7 is used to match packets with the source IP address in set {10.10.0.24}, which is from the Pods selected + by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 5. +- Flow 8 is used to match any IP packets, constituting the second dimension for `conjunction` with `conj_id` 5. This + flow, which matches all IP packets, exists because we need at least 2 dimensions for a conjunctive match. +- Flow 9 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 5. `APDenyRegMark` is + loaded and will be consumed in table [EgressMetric] to which the packets are forwarded. + +Flow 10 is the table-miss flow to forward packets not matched by other flows to table [EgressRule]. + +### EgressRule + +For this table, you will need to keep in mind the K8s NetworkPolicy +[specification](#kubernetes-networkpolicy-implementation) that we are using. + +This table is used to implement the egress rules across all K8s NetworkPolicies. If you dump the flows for this table, +you may see the following: ```text -table=70, priority=0 actions=goto_table:80 +1. table=EgressRule, priority=200,ip,nw_src=10.10.0.24 actions=conjunction(2,1/3) +2.
table=EgressRule, priority=200,ip,nw_dst=10.10.0.25 actions=conjunction(2,2/3) +3. table=EgressRule, priority=200,tcp,tp_dst=3306 actions=conjunction(2,3/3) +4. table=EgressRule, priority=190,conj_id=2,ip actions=set_field:0x2->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x200000000/0xffffffff00000000->ct_label)) +5. table=EgressRule, priority=0 actions=goto_table:EgressDefaultRule ``` -When the Egress feature is enabled, extra flows will be added to -[L3ForwardingTable], which send the egress traffic from Pods to external network -to [SNATTable]. The following two flows match traffic to local Pods and traffic -to the local Node IP respectively, and keep them in the normal forwarding path -(to [L2ForwardingCalcTable]), so they will not be sent to [SNATTable]: +Flows 1-4 are installed for the egress rule in the sample K8s NetworkPolicy. These flows are described as follows: + +- Flow 1 is to match packets with the source IP address in set {10.10.0.24}, which has all IP addresses of the Pods + selected by the label `app: web` in the `default` Namespace, constituting the first dimension for `conjunction` with `conj_id` 2. +- Flow 2 is to match packets with the destination IP address in set {10.10.0.25}, which has all IP addresses of the Pods + selected by the label `app: db` in the `default` Namespace, constituting the second dimension for `conjunction` with `conj_id` 2. +- Flow 3 is to match packets with the destination TCP port in set {3306} specified in the rule, constituting the third + dimension for `conjunction` with `conj_id` 2. +- Flow 4 is to match packets meeting all the three dimensions of `conjunction` with `conj_id` 2 and forward them to + table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel`. + +Flow 5 is the table-miss flow to forward packets not matched by other flows to table [EgressDefaultRule]. + +### EgressDefaultRule + +This table complements table [EgressRule] for K8s NetworkPolicy egress rule implementation. 
When a NetworkPolicy is +applied to a set of Pods, then the default behavior for egress connections for these Pods becomes "deny" (they become [isolated +Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). +This table is in charge of dropping traffic originating from Pods to which a NetworkPolicy (with an egress rule) is +applied, and which did not match any of the "allowed" list rules. + +If you dump the flows of this table, you may see the following: ```text -table=70, priority=200,ip,reg0=0/0x80000,nw_dst=10.10.1.0/24 actions=goto_table:80 -table=70, priority=200,ip,reg0=0x2/0xffff,nw_dst=192.168.1.1 actions=goto_table:80 +1. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.0.24 actions=drop +2. table=EgressDefaultRule, priority=0 actions=goto_table:EgressMetric ``` -The following two flows send the traffic not matched by other flows to -[SNATTable]. One of the flows is for egress traffic from local Pods; another -one is for egress traffic from remote Pods, which is tunnelled to this Node to -be SNAT'd with a SNAT IP configured on the Node. In the latter case, the flow -also rewrites the destination MAC to the local gateway interface MAC. +Flow 1, based on our sample K8s NetworkPolicy, is to drop traffic originating from 10.10.0.24, an IP address associated +with a Pod selected by the label `app: web`. If there are multiple Pods being selected by the label `app: web`, you will +see multiple similar flows for each IP address. + +Flow 2 is the table-miss flow to forward packets to table [EgressMetric]. + +This table is also used to implement Antrea-native NetworkPolicy egress rules that are created in the Baseline Tier. +Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows will be created at a +lower priority than K8s NetworkPolicy default drop flows. These flows are similar to flows 3-9 in table +[AntreaPolicyEgressRule]. 
For the sake of simplicity, we have not defined any example Baseline policies in this document. + +### EgressMetric + +This table is used to collect egress metrics for Antrea-native NetworkPolicies and K8s NetworkPolicies. + +If you dump the flows of this table, you may see the following: ```text -table=70, priority=190,ip,reg0=0x2/0xf actions=goto_table:71 -table=70, priority=190,ip,reg0=0/0xf actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:71 +1. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +2. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +3. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x700000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +4. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x700000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +5. table=EgressMetric, priority=200,reg0=0x400/0x400,reg3=0x5 actions=drop +6. table=EgressMetric, priority=0 actions=goto_table:L3Forwarding ``` -### SNATTable (71) +Flows 1-2, matching packets with `EgressRuleCTLabel` set to 2, the `conj_id` allocated for the sample K8s NetworkPolicy +egress rule and loaded in table [EgressRule] flow 4, are used to collect metrics for the egress rule. + +Flows 3-4, matching packets with `EgressRuleCTLabel` set to 7, the `conj_id` allocated for the sample Antrea-native +NetworkPolicy egress rule and loaded in table [AntreaPolicyEgressRule] flow 6, are used to collect metrics for the +egress rule. + +Flow 5 serves as the drop rule for the sample Antrea-native NetworkPolicy egress rule. It drops the packets by matching +`APDenyRegMark` loaded in table [AntreaPolicyEgressRule] flow 9 and `APConjIDField` set to 5, which is the `conj_id` +allocated for the egress rule and loaded in table [AntreaPolicyEgressRule] flow 9. + +Flows 1-4 have no explicit action besides the `goto_table` action.
This is because we rely on the "implicit" flow +counters to keep track of connection / packet statistics. -This table is created only when the Egress feature is enabled. It includes flows -to implement Egresses and select the right SNAT IPs for egress traffic from Pods -to external network. +Ct label is used in flows 1-4, while reg is used in flow 5. The distinction lies in the fact that the value persisted in +the ct label can be read throughout the entire lifecycle of a connection, but the reg mark is only valid for the current +packet. For a connection permitted by a rule, all its packets should be collected for metrics, thus a ct label is used. +For a connection denied or dropped by a rule, the first packet and the subsequent retry packets will be blocked, +therefore a reg is enough. -When no Egress applies to Pods on the Node, and no SNAT IP is configured on the -Node, [SNATTable] just has two flows. One drops egress traffic tunnelled from -remote Nodes that does not match any SNAT IP configured on this Node, and the -default flow that sends egress traffic from local Pods, which do not have any -Egress applied, to [L2ForwardingCalcTable]. Such traffic will be SNAT'd with -the default SNAT IP (by an iptables masquerade rule). +Flow 6 is the table-miss flow. + +### L3Forwarding + +This table, designated as the L3 routing table, serves to assign suitable source and destination MAC addresses to +packets based on their destination IP addresses, as well as their reg marks or ct marks. + +If you dump the flows of this table, you may see the following: ```text -table=71, priority=190,ct_state=+new+trk,ip,reg0=0/0xf actions=drop -table=71, priority=0 actions=goto_table:80 +1. table=L3Forwarding, priority=210,ip,nw_dst=10.10.0.1 actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +2. 
table=L3Forwarding, priority=210,ct_state=+rpl+trk,ct_mark=0x2/0xf,ip actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +3. table=L3Forwarding, priority=200,ip,reg0=0/0x200,nw_dst=10.10.0.0/24 actions=goto_table:L2ForwardingCalc +4. table=L3Forwarding, priority=200,ip,nw_dst=10.10.1.0/24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,goto_table:L3DecTTL +5. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:fa:b7:53:74:21:a6->eth_dst,goto_table:L3DecTTL +6. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.25 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:36:48:21:a2:9d:b4->eth_dst,goto_table:L3DecTTL +7. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.26 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:5e:b5:e3:a6:90:b7->eth_dst,goto_table:L3DecTTL +8. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x3/0xf,reg4=0/0x100000 actions=goto_table:EgressMark +9. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x1/0xf actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,goto_table:EgressMark +10. table=L3Forwarding, priority=190,ct_mark=0x10/0x10,reg0=0x202/0x20f actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +11. table=L3Forwarding, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -When there is an Egress applied to a Pod on the Node, a flow will be added for -the Pod's egress traffic. If the SNAT IP of the Egress is configured on the -local Node, the flow sets an 8 bits ID allocated for the SNAT IP to pkt_mark. -The ID is for iptables SNAT rules to match the packets and perfrom SNAT with -the right SNAT IP (Antrea Agent adds an iptables SNAT rule for each local SNAT -IP that matches the ID). 
+Flow 1 matches packets destined for the local Antrea gateway IP, rewrites their destination MAC address to that of the +local Antrea gateway, loads `ToGatewayRegMark`, and forwards them to table [L3DecTTL] to decrease the TTL value. The action +of rewriting the destination MAC address is not necessary but not harmful for Pod-to-gateway request packets because the +destination MAC address is already the local gateway MAC address. In short, the action is only necessary for +`AntreaIPAM` Pods, not required by the sample NodeIPAM Pods in this document. + +Flow 2 matches reply packets with corresponding ct "tracked" states and `FromGatewayCTMark` from connections initiated +through the local Antrea gateway. In other words, these are connections for which the first packet of the connection +(SYN packet for TCP) was received through the local Antrea gateway. It rewrites the destination MAC address to +that of the local Antrea gateway, loads `ToGatewayRegMark`, and forwards them to table [L3DecTTL]. This ensures that +reply packets can be forwarded back to the local Antrea gateway in subsequent tables. This flow is required to handle +the following cases when AntreaProxy is not enabled: + +- Reply traffic for connections from a local Pod to a ClusterIP Service, which are handled by kube-proxy and go through + DNAT. In this case, the destination IP address of the reply traffic is the Pod which initiated the connection to the + Service (no SNAT by kube-proxy). These packets should be forwarded back through the local Antrea gateway to the third-party module + (e.g., kube-proxy) to complete the DNAT process. The destination MAC of the packets is rewritten in the table to + prevent them from being forwarded to the original client Pod by mistake. +- When hairpin is involved, i.e. connections between 2 local Pods, for which NAT is performed.
One example is a + Pod accessing a NodePort Service for which externalTrafficPolicy is set to `Local` using the local Node's IP address, + as there will be no SNAT for such traffic. Another example could be hostPort support, depending on how the feature + is implemented. + +Flow 3 matches packets from intra-Node connections (excluding Service connections) and marked with +`NotRewriteMACRegMark`, indicating that the destination and source MACs of packets should not be overwritten, and +forwards them to table [L2ForwardingCalc] instead of table [L3DecTTL]. The deviation is due to local Pod connections +not traversing any router device or undergoing any NAT process. For packets from Service or inter-Node connections, +`RewriteMACRegMark`, mutually exclusive with `NotRewriteMACRegMark`, is loaded. Therefore, the packets will not be +matched by the flow. + +Flow 4 is designed to match packets destined for a remote Pod CIDR. This involves installing a separate flow for each remote +Node, with each flow matching the destination IP address of the packets against the Pod subnet for the respective Node. +For the matched packets, the source MAC address is set to that of the local Antrea gateway MAC, and the destination +MAC address is set to the *Global Virtual MAC*. The OpenFlow `tun_dst` field is set to the appropriate value (i.e. +the IP address of the remote Node). Additionally, `ToTunnelRegMark` is loaded, signifying that the packets will be +forwarded to remote Nodes through a tunnel. The matched packets are then forwarded to table [L3DecTTL] to decrease the TTL +value. + +Flows 5-7 match packets destined for local Pods and marked by `RewriteMACRegMark`, which signifies that the packets may +originate from Service or inter-Node connections. For the matched packets, the source MAC address is set to that of the +local Antrea gateway MAC, and the destination MAC address is set to the associated local Pod MAC address.
The matched +packets are then forwarded to table [L3DecTTL] to decrease the TTL value. + +Flow 8 matches request packets originating from local Pods and destined for the external network, and then forwards them +to table [EgressMark] dedicated to feature `Egress`. In table [EgressMark], SNAT IPs for Egress are looked up for the packets. +To match the expected packets, `FromPodRegMark` is used to exclude packets that are not from local Pods. +Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` which is used to mark +packets from Antrea IPAM Pods, is used since Egress can only be applied to Node IPAM Pods. + +It's worth noting that packets sourced from local Pods and destined for the Services listed in the option +`antreaProxy.skipServices` are unexpectedly matched by flow 8 due to the fact that there is no flow in [ServiceLB] +to handle these Services. Consequently, the destination IP address of the packets, allocated from the Service CIDR, +is considered part of the "external network". No need to worry about the mismatch, as flow 3 in table [EgressMark] +is designed to match these packets and prevent them from undergoing SNAT by Egress. + +Flow 9 matches request packets originating from remote Pods and destined for the external network, and then forwards them +to table [EgressMark] dedicated to feature `Egress`. To match the expected packets, `FromTunnelRegMark` is used to +include packets that are from remote Pods through a tunnel. Considering that the packets from remote Pods traverse a +tunnel, the destination MAC address of the packets, represented by the *Global Virtual MAC*, needs to be rewritten to +MAC address of the local Antrea gateway. + +Flow 10 matches packets from Service connections that are originating from the local Antrea gateway and destined for the +external network. This is accomplished by matching `RewriteMACRegMark`, `FromGatewayRegMark`, and `ServiceCTMark`. 
The +destination MAC address is then set to that of the local Antrea gateway. Additionally, `ToGatewayRegMark`, which will be +used together with `FromGatewayRegMark` to identify hairpin connections in table [SNATMark], is loaded. Finally, +the packets are forwarded to table [L3DecTTL]. + +Flow 11 is the table-miss flow. It matches packets originating from local Pods and destined for the external network, and +forwards them to table [L2ForwardingCalc]. `ToGatewayRegMark` is loaded as the matched packets traverse the +local Antrea gateway. + +### EgressMark + +This table is dedicated to feature `Egress`. It includes flows to select the right SNAT IPs for egress traffic +originating from Pods and destined for the external network. + +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod1-7e503a" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=EgressMark, priority=210,ip,nw_dst=192.168.77.102 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +2. table=EgressMark, priority=210,ip,nw_dst=192.168.77.103 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +3. table=EgressMark, priority=210,ip,nw_dst=10.96.0.0/12 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +4. table=EgressMark, priority=200,ip,in_port="client-6-3353ef" actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.113->tun_dst,set_field:0x10/0xf0->reg0,set_field:0x80000/0x80000->reg0,goto_table:L2ForwardingCalc +5. table=EgressMark, priority=200,ct_state=+new+trk,ip,tun_dst=192.168.77.112 actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +6. table=EgressMark, priority=200,ct_state=+new+trk,ip,in_port="web-7975-274540" actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +7.
table=EgressMark, priority=190,ct_state=+new+trk,ip,reg0=0x1/0xf actions=drop +8. table=EgressMark, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -When the SNAT IP of the Egress is on a remote Node, the flow will tunnel the -packets to the remote Node with the tunnel's destination IP to be the SNAT IP. -The packets will be SNAT'd on the remote Node. The same as a normal tunnel flow -in [L3ForwardingTable], the flow will rewrite the packets' source and -destination MAC addresses, load the SNAT IP to NXM_NX_TUN_IPV4_DST, and send the -packets to [L3DecTTLTable]. +Flows 1-2 match packets originating from local Pods and destined for the transport IP of remote Nodes, and then forward +them to table [L2ForwardingCalc] to bypass Egress SNAT. `ToGatewayRegMark` is loaded, indicating that the output port +of the packets is the local Antrea gateway. + +Flow 3 matches packets originating from local Pods and destined for the Services listed in the option +`antreaProxy.skipServices`, and then forwards them to table [L2ForwardingCalc] to bypass Egress SNAT. Similar to flows +1-2, `ToGatewayRegMark` is also loaded. + +The packets, matched by flows 1-3, are forwarded to this table by flow 8 in table [L3Forwarding], as they are classified +as part of traffic destined for the external network. However, these packets are not intended to undergo Egress SNAT. +Consequently, flows 1-3 are used to bypass Egress SNAT for these packets. + +Flow 4 matches packets originating from local Pods selected by the sample [Egress egress-client], whose SNAT IP is configured +on a remote Node, which means that the matched packets should be forwarded to the remote Node through a tunnel. Before +sending the packets to the tunnel, the source and destination MAC addresses are set to the local Antrea gateway MAC +and the *Global Virtual MAC* respectively. 
Additionally, `ToTunnelRegMark`, indicating that the output port is a tunnel, +and `EgressSNATRegMark`, indicating that packets should undergo SNAT on a remote Node, are loaded. Finally, the packets +are forwarded to table [L2ForwardingCalc]. + +Flow 5 matches the first packet of connections originating from remote Pods selected by the sample [Egress egress-web] +whose SNAT IP is configured on the local Node, and then loads an 8-bit ID allocated for the associated SNAT IP defined +in the sample Egress to the `pkt_mark`, which will be consumed by iptables on the local Node to perform SNAT with the +SNAT IP. Subsequently, `ToGatewayRegMark`, indicating that the output port is the local Antrea gateway, is loaded. +Finally, the packets are forwarded to table [L2ForwardingCalc]. + +Flow 6 matches the first packet of connections originating from local Pods selected by the sample [Egress egress-web], +whose SNAT IP is configured on the local Node. Similar to flow 5, the 8-bit ID allocated for the SNAT IP is loaded to +`pkt_mark`, `ToGatewayRegMark` is loaded, and the packets are forwarded to table [L2ForwardingCalc] finally. + +Flow 7 drops all other packets tunneled from remote Nodes (identified with `FromTunnelRegMark`, indicating that the packets are +from remote Pods through a tunnel). The packets are not matched by any of flows 1-6, which means that they are +unexpected here and should be dropped. + +Flow 8 is the table-miss flow, which matches "tracked" and non-new packets from Egress connections and forwards +them to table [L2ForwardingCalc]. `ToGatewayRegMark` is also loaded for these packets. + +### L3DecTTL + +This is the table to decrement TTL for IP packets. 
+ +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod2-357c21" actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80a66->NXM_NX_TUN_IPV4_DST[],goto_table:72 +1. table=L3DecTTL, priority=210,ip,reg0=0x2/0xf actions=goto_table:SNATMark +2. table=L3DecTTL, priority=200,ip actions=dec_ttl,goto_table:SNATMark +3. table=L3DecTTL, priority=0 actions=goto_table:SNATMark ``` -Last, when a SNAT IP configured for Egresses is on the local Node, an additional -flow is added in [SNATTable] for egress traffic from remote Node that should -use the SNAT IP. The flow matches the tunnel destination IP (which should be -equal to the SNAT IP), and sets the 8 bits ID of the SNAT IP to pkt_mark. +Flow 1 matches packets with `FromGatewayRegMark`, which means that these packets enter the OVS pipeline from the local +Antrea gateway. As the host IP stack should have already decremented the TTL for such packets, the TTL should not be +decremented again. + +Flow 2 is to decrement TTL for packets which are not matched by flow 1. + +Flow 3 is the table-miss flow that should remain unused. + +### SNATMark + +This table marks connections requiring SNAT within the OVS pipeline, distinct from Egress SNAT handled by iptables. + +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,tun_dst="192.168.10.101" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x22/0xff actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x12/0xff,reg4=0x200000/0x2200000 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark)) +3. 
table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.23,nw_dst=10.10.0.23 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +4. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.24,nw_dst=10.10.0.24 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +5. table=SNATMark, priority=0 actions=goto_table:SNAT ``` -### L3DecTTLTable (72) +Flow 1 matches the first packet of hairpin Service connections, identified by `FromGatewayRegMark` and `ToGatewayRegMark`, +indicating that both the input and output ports of the connections are the local Antrea gateway port. Such hairpin +connections will undergo SNAT with the *Virtual Service IP* in table [SNAT]. Before forwarding the packets to table +[SNAT], `ConnSNATCTMark`, indicating that the connection requires SNAT, and `HairpinCTMark`, indicating that this is +a hairpin connection, are persisted to mark the connections. These two ct marks will be consumed in table [SNAT]. + +Flow 2 matches the first packet of Service connections requiring SNAT, identified by `FromGatewayRegMark` and +`ToTunnelRegMark`, indicating that the input port is the local Antrea gateway and the output port is a tunnel. Such +connections will undergo SNAT with the IP address of the local Antrea gateway in table [SNAT]. Before forwarding the +packets to table [SNAT], `ToExternalAddressRegMark` and `NotDSRServiceRegMark` are loaded, indicating that the packets +are destined for a Service's external IP, like NodePort, LoadBalancerIP or ExternalIP, but it is not DSR mode. +Additionally, `ConnSNATCTMark`, indicating that the connection requires SNAT, is persisted to mark the connections. + +It's worth noting that flows 1-2 are specific to `proxyAll`, but they are harmless when `proxyAll` is disabled since +these flows should never be matched by in-cluster Service traffic. 
-This is the table to decrement TTL for the IP packets destined to remote Nodes -through a tunnel, or the IP packets received from a tunnel. But for the packets -that enter the OVS pipeline from the local gateway and are destined to a remote -Node, TTL should not be decremented in OVS on the source Node, because the host -IP stack should have already decremented TTL if that is needed. +Flows 3-4 match the first packet of hairpin Service connections, identified by the same source and destination Pod IP +addresses. Such hairpin connections will undergo SNAT with the IP address of the local Antrea gateway in table [SNAT]. +Similar to flow 1, `ConnSNATCTMark` and `HairpinCTMark` are persisted to mark the connections. -If you dump the flows for this table, you should see flows like the following: +Flow 5 is the table-miss flow. + +### SNAT + +This table performs SNAT for connections requiring SNAT within the pipeline. + +If you dump the flows of this table, you may see the following: ```text -1. table=72, priority=210,ip,reg0=0x1/0xf, actions=goto_table:80 -2. table=72, priority=200,ip, actions=dec_ttl,goto_table:80 -3. table=72, priority=0, actions=goto_table:80 +1. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=169.254.0.253),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x3/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +3. table=SNAT, priority=200,ct_state=-new-rpl+trk,ct_mark=0x20/0x20,ip actions=ct(table=L2ForwardingCalc,zone=65521,nat) +4. table=SNAT, priority=190,ct_state=+new+trk,ct_mark=0x20/0x20,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark)) +5. 
table=SNAT, priority=0 actions=goto_table:L2ForwardingCalc ``` -The first flow is to bypass the TTL decrement for the packets from the gateway -port. +Flow 1 matches the first packet of hairpin Service connections through the local Antrea gateway, identified by +`HairpinCTMark` and `FromGatewayRegMark`. It performs SNAT with the *Virtual Service IP* `169.254.0.253` and forwards +the SNAT'd packets to table [L2ForwardingCalc]. Before SNAT, the "tracked" state of packets is associated with `CtZone`. +After SNAT, their "tracked" state is associated with `SNATCtZone`, and then `ServiceCTMark` and `HairpinCTMark` persisted +in `CtZone` are not accessible anymore. As a result, `ServiceCTMark` and `HairpinCTMark` need to be persisted once +again, but this time they are persisted in `SNATCtZone` for subsequent tables to consume. + +Flow 2 matches the first packet of hairpin Service connections originating from local Pods, identified by `HairpinCTMark` +and `FromPodRegMark`. It performs SNAT with the IP address of the local Antrea gateway and forwards the SNAT'd packets +to table [L2ForwardingCalc]. Similar to flow 1, `ServiceCTMark` and `HairpinCTMark` are persisted in `SNATCtZone`. -### L2ForwardingCalcTable (80) +Flow 3 matches the subsequent request packets of connections for which SNAT was performed for the first packet, and then +invokes `ct` action on the packets again to restore the "tracked" state in `SNATCtZone`. The packets with the appropriate +"tracked" state are forwarded to table [L2ForwardingCalc]. -This is essentially the "dmac" table of the switch. We program one flow for each -port (tunnel port, gateway port, and local Pod ports), as you can see if you -dump the flows: +Flow 4 matches the first packet of Service connections requiring SNAT, identified by `ConnSNATCTMark` and +`FromGatewayRegMark`, indicating the connection is destined for an external Service IP initiated through the +Antrea gateway and the Endpoint is a remote Pod. 
It performs SNAT with the IP address of the local Antrea gateway and +forwards the SNAT'd packets to table [L2ForwardingCalc]. Similar to flows 1 and 2, `ServiceCTMark` is persisted in +`SNATCtZone`. + +Flow 5 is the table-miss flow. + +### L2ForwardingCalc + +This is essentially the "dmac" table of the switch. We program one flow for each port (tunnel port, the local Antrea +gateway port, and local Pod ports). + +If you dump the flows of this table, you may see the following: ```text -1. table=80, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x8000->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -2. table=80, priority=200,dl_dst=e2:e5:a4:9b:1c:b1 actions=set_field:0x8001->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -3. table=80, priority=200,dl_dst=12:9e:a6:47:d0:70 actions=set_field:0x3->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -4. table=80, priority=200,dl_dst=ba:a8:13:ca:ed:cf actions=set_field:0x8002->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -5. table=80, priority=0 actions=goto_table:105 +1. table=L2ForwardingCalc, priority=200,dl_dst=ba:5e:d1:55:aa:c0 actions=set_field:0x2->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +2. table=L2ForwardingCalc, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +3. table=L2ForwardingCalc, priority=200,dl_dst=5e:b5:e3:a6:90:b7 actions=set_field:0x24->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +4. table=L2ForwardingCalc, priority=200,dl_dst=fa:b7:53:74:21:a6 actions=set_field:0x25->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +5. table=L2ForwardingCalc, priority=200,dl_dst=36:48:21:a2:9d:b4 actions=set_field:0x26->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +6. 
table=L2ForwardingCalc, priority=0 actions=goto_table:TrafficControl ``` -For each port flow (1 through 5 in the example above), we set bit 16 of the -NXM_NX_REG0 register to indicate that there was a matching entry for the -destination MAC address and that the packet must be forwarded. In the last table -of the pipeline ([L2ForwardingOutTable]), we will drop all packets for which -this bit is not set. We also use the NXM_NX_REG1 register to store the egress -port for the packet, which will be used as a parameter to the `output` OpenFlow -action in [L2ForwardingOutTable]. - -The packets that match local Pods' MAC entries will go to the first table -([AntreaPolicyIngressRuleTable] when AntreaPolicy is enabled, or -[IngressRuleTable] when AntreaPolicy is not enabled) for NetworkPolicy ingress -rules. Other packets will go to [ConntrackCommitTable]. Specifically, packets -to the gateway port or the tunnel port will also go to [ConntrackCommitTable] -and bypass the NetworkPolicy ingress rule tables, as NetworkPolicy ingress rules -are not enforced for these packets on the source Node. - -What about L2 multicast / broadcast traffic? ARP requests will never reach this -table, as they will be handled by the OpenFlow `normal` action in the -[ArpResponderTable]. As for the rest, if it is IP traffic, it will hit the -"last" flow in this table and go to [ConntrackCommitTable]; and finally the last -table of the pipeline ([L2ForwardingOutTable]), and get dropped there since bit -16 of the NXM_NX_REG0 will not be set. Traffic which is non-ARP and non-IP -(assuming any can be received by the switch) is actually dropped much earlier in -the pipeline ([SpoofGuardTable]). In the future, we may need to support more -cases for L2 multicast / broadcast traffic. - -### AntreaPolicyIngressRuleTable (85) - -This table is very similar to [AntreaPolicyEgressRuleTable], but implements -the ingress rules of Antrea-native Policies. 
Depending on the tier to which the policy -belongs to, the rules will be installed in a table corresponding to that tier. -The ingress table to tier mappings is as follows: +Flow 1 matches packets destined for the local Antrea gateway, identified by the destination MAC address being that of +the local Antrea gateway. It loads `OutputToOFPortRegMark`, indicating that the packets should output to an OVS port, +and also loads the port number of the local Antrea gateway to `TargetOFPortField`. Both of these two values will be consumed +in table [Output]. + +Flow 2 matches packets destined for a tunnel, identified by the destination MAC address being that of the *Global Virtual +MAC*. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the tunnel is loaded to +`TargetOFPortField`. + +Flows 3-5 match packets destined for local Pods, identified by the destination MAC address being that of one of the local +Pods. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the local Pods is loaded to +`TargetOFPortField`. + +Flow 6 is the table-miss flow. + +### TrafficControl + +This table is dedicated to `TrafficControl`. + +If you dump the flows of this table, you may see the following: ```text -Baseline Tier -> IngressDefaultTable(100) -K8s NetworkPolicy -> IngressRuleTable(90) -All other Tiers -> AntreaPolicyIngressRuleTable(85) +1. table=TrafficControl, priority=210,reg0=0x200006/0x60000f actions=goto_table:Output +2. table=TrafficControl, priority=200,reg1=0x25 actions=set_field:0x22->reg9,set_field:0x800000/0xc00000->reg4,goto_table:IngressSecurityClassifier +3. table=TrafficControl, priority=200,in_port="web-7975-274540" actions=set_field:0x22->reg9,set_field:0x800000/0xc00000->reg4,goto_table:IngressSecurityClassifier +4. table=TrafficControl, priority=200,reg1=0x26 actions=set_field:0x27->reg9,set_field:0x400000/0xc00000->reg4,goto_table:IngressSecurityClassifier +5. 
table=TrafficControl, priority=200,in_port="db-755c6-5080e3" actions=set_field:0x27->reg9,set_field:0x400000/0xc00000->reg4,goto_table:IngressSecurityClassifier +6. table=TrafficControl, priority=0 actions=goto_table:IngressSecurityClassifier ``` -Again for this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) that we are using. -Since the example ACNP resides in the Application tier, if you dump the flows -for table 85, you should see something like this: +Flow 1 matches packets returned from TrafficControl return ports and forwards them to table [Output], where the packets +are output to the port to which they are destined. To identify such packets, `OutputToOFPortRegMark`, indicating that +the packets should be output to an OVS port, and `FromTCReturnRegMark` loaded in table [Classifier], indicating that +the packets are from a TrafficControl return port, are used. + +Flows 2-3 are installed for the sample [TrafficControl redirect-web-to-local] to mark the packets associated with the +Pods labeled by `app: web` using `TrafficControlRedirectRegMark`. Flow 2 handles the ingress direction, while flow 3 +handles the egress direction. In table [Output], these packets will be redirected to a TrafficControl target port +specified in `TrafficControlTargetOFPortField`, whose value is loaded in these 2 flows. + +Flows 4-5 are installed for the sample [TrafficControl mirror-db-to-local] to mark the packets associated with the Pods +labeled by `app: db` using `TrafficControlMirrorRegMark`. Similar to flows 2-3, flows 4-5 also handle the two directions. +In table [Output], these packets will be mirrored (duplicated) to a TrafficControl target port specified in +`TrafficControlTargetOFPortField`, whose value is loaded in these 2 flows. + +Flow 6 is the table-miss flow. + +### IngressSecurityClassifier + +This table is to classify packets before they enter the tables for ingress security. 
+ +If you dump the flows of this table, you may see the following: ```text -1. table=85, priority=64990,ct_state=-new+est,ip actions=resubmit(,105) -2. table=85, priority=14000,conj_id=4,ip actions=load:0x4->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) -3. table=85, priority=14000,ip,nw_src=10.10.1.7 actions=conjunction(4,1/3) -4. table=85, priority=14000,ip,reg1=0x19c actions=conjunction(4,2/3) -5. table=85, priority=14000,tcp,tp_dst=80 actions=conjunction(4,3/3) -6. table=85, priority=0 actions=resubmit(,90) +1. table=IngressSecurityClassifier, priority=210,pkt_mark=0x80000000/0x80000000,ct_state=-rpl+trk,ip actions=goto_table:ConntrackCommit +2. table=IngressSecurityClassifier, priority=201,reg4=0x80000/0x80000 actions=goto_table:AntreaPolicyIngressRule +3. table=IngressSecurityClassifier, priority=200,reg0=0x20/0xf0 actions=goto_table:IngressMetric +4. table=IngressSecurityClassifier, priority=200,reg0=0x10/0xf0 actions=goto_table:IngressMetric +5. table=IngressSecurityClassifier, priority=200,reg0=0x40/0xf0 actions=goto_table:IngressMetric +6. table=IngressSecurityClassifier, priority=200,ct_mark=0x40/0x40 actions=goto_table:ConntrackCommit +7. table=IngressSecurityClassifier, priority=0 actions=goto_table:AntreaPolicyIngressRule ``` -As for [AntreaPolicyEgressRuleTable], flow 1 (highest priority) ensures that for -established connections packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. +Flow 1 matches locally generated request packets for liveness/readiness probes from kubelet, identified by `pkt_mark` +which is set by iptables in the host network namespace. It forwards the packets to table [ConntrackCommit] directly to +bypass all tables for ingress security. 
-The rest of the flows read as follows: if the source IP address is in set -{10.10.1.7}, and the destination OF port is in the set {412} (which -correspond to IP addresses {10.10.1.6}), and the destination TCP port -is in the set {80}, then use `conjunction` action with id 4, which loads -the `conj_id` 4 into NXM_NX_REG3, a register used by Antrea internally to -indicate the disposition of the packet is Drop, and forward the packet to -IngressMetricsTable for it to be dropped. +Flow 2 matches packets destined for NodePort Services and forwards them to table [AntreaPolicyIngressRule] to enforce +Antrea-native NetworkPolicies applied to NodePort Services. Without this flow, if the selected Endpoint is not a local +Pod, the packets might be matched by one of the flows 3-5, skipping table [AntreaPolicyIngressRule]. -Otherwise, go to [IngressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy ingress rules in any tier (except for the "baseline" tier). -One notable difference is how we use OF ports to identify the destination of -the traffic, while we use IP addresses in [AntreaPolicyEgressRuleTable] to -identify the source of the traffic. More details regarding this can be found -in the following [IngressRuleTable] section. +Flows 3-5 match packets destined for the local Antrea gateway, tunnel, or uplink port with `ToGatewayRegMark`, +`ToTunnelRegMark` or `ToUplinkRegMark`, respectively, and forward them to table [IngressMetric] directly to bypass +all tables for ingress security. -As seen in [AntreaPolicyEgressRuleTable], the default action is to evaluate K8s -Network Policy [IngressRuleTable] and a AntreaPolicyIngressDefaultTable does not exist. +Flow 6 matches packets from hairpin connections with `HairpinCTMark` and forwards them to table [ConntrackCommit] +directly to bypass all tables for ingress security. 
Refer to this PR +[#5687](https://github.com/antrea-io/antrea/pull/5687) for more information. -### IngressRuleTable (90) +Flow 7 is the table-miss flow. -This table is very similar to [EgressRuleTable], but implements ingress rules -for Network Policies. Once again, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. +### AntreaPolicyIngressRule -If you dump the flows for this table, you should see something like this: +This table is very similar to table [AntreaPolicyEgressRule] but implements the ingress rules of Antrea-native +NetworkPolicies. Depending on the tier to which the policy belongs, the rules will be installed in a table corresponding +to that tier. The ingress table to tier mapping is as follows: ```text -1. table=90, priority=210,ct_state=-new+est,ip actions=goto_table:101 -2. table=90, priority=210,pkt_mark=0x1/0x1 actions=goto_table:105 -3. table=90, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(3,1/3) -4. table=90, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(3,1/3) -5. table=90, priority=200,ip,reg1=0x3 actions=conjunction(3,2/3) -6. table=90, priority=200,ip,reg1=0x8002 actions=conjunction(3,2/3) -7. table=90, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3) -8. table=90, priority=190,conj_id=3,ip actions=load:0x3->NXM_NX_REG6[],ct(commit,table=101,zone=65520,exec(load:0x3->NXM_NX_CT_LABEL[0..31])) -9. 
table=90, priority=0 actions=goto_table:100 +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyIngressRule +K8s NetworkPolicy -> IngressRule +Antrea-native NetworkPolicy Baseline Tier -> IngressDefaultRule ``` -As for [EgressRuleTable], flow 1 (highest priority) ensures that for established -connections - as a reminder all connections are committed in -[ConntrackCommitTable] - packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. - -Flow 2 ensures that the traffic initiated from the host network namespace cannot -be dropped because of Network Policies. This ensures that K8s [liveness -probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) -can go through. An iptables rule in the mangle table of the host network -namespace is responsible for marking the locally-generated packets with the -`0x1/0x1` mark. Note that the flow will be different for Windows worker Node or -when OVS userspace (netdev) datapath is used. This is because either there is no -way to add mark for particular traffic (i.e. Windows) or matching the mark in -OVS is not properly supported (i.e. netdev datapath). As a result, the flow will -match source IP instead, however, NodePort Service access by external clients -will be masqueraded as a local gateway IP to bypass Network Policies. This may -be fixed after AntreaProxy can serve NodePort traffic. - -The rest of the flows read as follows: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination OF port is in the set {3, 4} (which -correspond to IP addresses {10.10.1.2, 10.10.1.3}, and the destination TCP port -is in the set {80}, then use `conjunction` action with id 3, which stores the -`conj_id` 3 in `ct_label[0..31]` for egress metrics collection purposes, and forwards -the packet to IngressMetricsTable, then [L2ForwardingOutTable]. Otherwise, go to -[IngressDefaultTable]. 
One notable difference is how we use OF ports to identify -the destination of the traffic, while we use IP addresses in [EgressRuleTable] -to identify the source of the traffic. We do this as an increased security measure -in case a local Pod is misbehaving and trying to access another local Pod using -the correct destination MAC address but a different destination IP address to bypass -an egress Network Policy rule. This is also why the Network Policy ingress rules -are enforced after the egress port has been determined. - -### IngressDefaultTable (100) - -This table is similar in its purpose to [EgressDefaultTable], and it complements -[IngressRuleTable] for Network Policy ingress rule implementation. In K8s, when -a Network Policy is applied to a set of Pods, the default behavior for these -Pods become "deny" (it becomes an [isolated -Pod](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This -table is in charge of dropping traffic destined to Pods to which a Network -Policy (with an ingress rule) is applied, and which did not match any of the -allowlist rules. - -Accordingly, based on our Network Policy example, we would expect to see flows -to drop traffic destined to our 2 Pods (3 and 4), which is confirmed by dumping -the flows: +Again for this table, you will need to keep in mind the Antrea-native NetworkPolicy +[specification](#antrea-native-networkpolicy-implementation) and Antrea-native L7 NetworkPolicy +[specification](#antrea-native-l7-networkpolicy-implementation) that we are using. Since these sample +ingress policies reside in the Application Tier, if you dump the flows for this table, you may see the following: ```text -1. table=100, priority=200,ip,reg1=0x3 actions=drop -2. table=100, priority=200,ip,reg1=0x8002 actions=drop -3. table=100, priority=0 actions=goto_table:105 +1. 
table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:IngressMetric +2. table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:IngressMetric +3. table=AntreaPolicyIngressRule, priority=14500,reg1=0x7 actions=conjunction(14,2/3) +4. table=AntreaPolicyIngressRule, priority=14500,ip,nw_src=10.10.0.26 actions=conjunction(14,1/3) +5. table=AntreaPolicyIngressRule, priority=14500,tcp,tp_dst=8080 actions=conjunction(14,3/3) +6. table=AntreaPolicyIngressRule, priority=14500,conj_id=14,ip actions=set_field:0xd->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0xd/0xffffffff->ct_label,set_field:0x80/0x80->ct_mark,set_field:0x20000000000000000/0xfff0000000000000000->ct_label)) +7. table=AntreaPolicyIngressRule, priority=14600,ip,nw_src=10.10.0.26 actions=conjunction(6,1/3) +8. table=AntreaPolicyIngressRule, priority=14600,reg1=0x25 actions=conjunction(6,2/3) +9. table=AntreaPolicyIngressRule, priority=14600,tcp,tp_dst=80 actions=conjunction(6,3/3) +10. table=AntreaPolicyIngressRule, priority=14600,conj_id=6,ip actions=set_field:0x6->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x6/0xffffffff->ct_label)) +11. table=AntreaPolicyIngressRule, priority=14600,ip actions=conjunction(4,1/2) +12. table=AntreaPolicyIngressRule, priority=14599,reg1=0x25 actions=conjunction(4,2/2) +13. table=AntreaPolicyIngressRule, priority=14599,conj_id=4 actions=set_field:0x4->reg3,set_field:0x400/0x400->reg0,goto_table:IngressMetric +14. table=AntreaPolicyIngressRule, priority=0 actions=goto_table:IngressRule ``` -Similar to the [EgressDefaultTable], this table is also used to implement -Antrea-native policy ingress rules that are created in the Baseline Tier. -Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the -corresponding flows will be created at a lower priority than K8s default drop flows. 
-For example, a baseline rule to isolate ingress traffic for a Namespace will look -like the following: +Flows 1-2, which are installed by default with the highest priority, match non-new and "tracked" packets and +forward them to table [IngressMetric] to bypass the check from ingress rules. This means that if a connection is +established, its packets go straight to table [IngressMetric], with no other match required. In particular, this ensures +that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this +also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is +updated. + +Similar to table [AntreaPolicyEgressRule], the priorities of flows 3-13 installed for the ingress rules are decided by +the following: + +- The `spec.tier` value in an Antrea-native NetworkPolicy determines the primary level for flow priority. +- The `spec.priority` value in an Antrea-native NetworkPolicy determines the secondary level for flow priority within + the same `spec.tier`. A lower value in this field corresponds to a higher priority for the flow. +- The rule's position within an Antrea-native NetworkPolicy also influences flow priority. Rules positioned closer to + the beginning have higher priority for the flow. + +Flows 3-6, whose priorities are all 14500, are installed for the ingress rule `AllowFromClientL7` in the sample policy. +These flows are described as follows: + +- Flow 3 is used to match packets with the source IP address in set {10.10.0.26}, which has all IP addresses of the + Pods selected by the label `app: client`, constituting the first dimension for `conjunction` with `conj_id` 14. +- Flow 4 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods selected + by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 14. 
+- Flow 5 is used to match packets with the destination TCP port in set {8080} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 14. +- Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 14 and forward them + to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel` consumed in table [IngressMetric]. + Additionally, for the L7 protocol: + - `L7NPRedirectCTMark` is persisted, indicating the packets should be redirected to an application-aware engine to + be filtered according to L7 rules, such as method `GET` and path `/api/v2/*` in the sample policy. + - A VLAN ID allocated for the Antrea-native L7 NetworkPolicy is persisted in `L7NPRuleVlanIDCTLabel`, which will be + consumed in table [Output]. + +Flows 7-10, whose priorities are all 14600, are installed for the ingress rule `AllowFromClient` in the sample policy. +These flows are described as follows: + +- Flow 7 is used to match packets with the source IP address in set {10.10.0.26}, which has all IP addresses of the Pods + selected by the label `app: client`, constituting the first dimension for `conjunction` with `conj_id` 6. +- Flow 8 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods selected + by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 6. +- Flow 9 is used to match packets with the destination TCP port in set {80} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 6. +- Flow 10 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 6 and forward + them to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel` consumed in table [IngressMetric].
+ +Flows 11-13, whose priorities are all 14599, are installed for the ingress rule with a `Drop` action defined after the +rule `AllowFromClient` in the sample policy, which serves as a default rule. Unlike the default of K8s NetworkPolicy, +Antrea-native NetworkPolicy has no default rule, and all rules should be explicitly defined. Hence, they are evaluated +as-is, and there is no need for a table [AntreaPolicyIngressDefaultRule]. These flows are described as follows: + +- Flow 11 is used to match any IP packets, constituting the first dimension for `conjunction` with `conj_id` 4. This + flow, which matches all IP packets, exists because we need at least 2 dimensions for a conjunctive match. +- Flow 12 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods + selected by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 4. +- Flow 13 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 4. It loads `APDenyRegMark`, + which is consumed in table [IngressMetric], and forwards the packets to that table. + +Flow 14 is the table-miss flow to forward packets not matched by other flows to table [IngressRule]. + +### IngressRule + +This table is very similar to table [EgressRule] but implements ingress rules for K8s NetworkPolicies. Once again, you +will need to keep in mind the K8s NetworkPolicy [specification](#kubernetes-networkpolicy-implementation) that we are +using.
+ +If you dump the flows of this table, you should see something like this: ```text -table=100, priority=80,ip,reg1=0xb actions=conjunction(6,2/3) -table=100, priority=80,ip,reg1=0xc actions=conjunction(6,2/3) -table=100, priority=80,ip,nw_src=10.10.1.9 actions=conjunction(6,1/3) -table=100, priority=80,ip,nw_src=10.10.1.7 actions=conjunction(6,1/3) -table=100, priority=80,tcp,tp_dst=8080 actions=conjunction(6,3/3) -table=100, priority=80,conj_id=6,ip actions=load:0x6->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) +1. table=IngressRule, priority=200,ip,nw_src=10.10.0.26 actions=conjunction(3,1/3) +2. table=IngressRule, priority=200,reg1=0x25 actions=conjunction(3,2/3) +3. table=IngressRule, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3) +4. table=IngressRule, priority=190,conj_id=3,ip actions=set_field:0x3->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x3/0xffffffff->ct_label)) +5. table=IngressRule, priority=0 actions=goto_table:IngressDefaultRule ``` -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table ([ConntrackCommitTable]). +Flows 1-4 are installed for the ingress rule in the sample K8s NetworkPolicy. These flows are described as follows: -### ConntrackCommitTable (105) +- Flow 1 is used to match packets with the source IP address in set {10.10.0.26}, which is from the Pods selected + by the label `app: client` in the `default` Namespace, constituting the first dimension for `conjunction` with `conj_id` 3. +- Flow 2 is used to match packets with the output OVS port in set {0x25}, which has all ports of the Pods selected + by the label `app: web` in the `default` Namespace, constituting the second dimension for `conjunction` with `conj_id` 3. +- Flow 3 is used to match packets with the destination TCP port in set {80} specified in the rule, constituting + the third dimension for `conjunction` with `conj_id` 3.
+- Flow 4 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 3 and forward + them to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel`. -As mentioned before, this table is in charge of committing all new connections -which are not dropped because of Network Policies. If you dump the flows for this -table, you should see something like this: +Flow 5 is the table-miss flow to forward packets not matched by other flows to table [IngressDefaultRule]. + +### IngressDefaultRule + +This table is similar in its purpose to table [EgressDefaultRule], and it complements table [IngressRule] for K8s +NetworkPolicy ingress rule implementation. In Kubernetes, when a NetworkPolicy is applied to a set of Pods, then the default +behavior for ingress connections for these Pods becomes "deny" (they become [isolated +Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This +table is in charge of dropping traffic destined for Pods to which a NetworkPolicy (with an ingress rule) is applied, +and which did not match any of the "allow" list rules. + +If you dump the flows of this table, you may see the following: ```text -1. table=105, priority=200,ct_state=+new+trk,ip,reg0=0x1/0xf actions=ct(commit,table=108,zone=65520,exec(load:0x20->NXM_NX_CT_MARK[])) -2. table=105, priority=190,ct_state=+new+trk,ip actions=ct(commit,table=108,zone=65520) -3. table=105, priority=0 actions=goto_table:108 +1. table=IngressDefaultRule, priority=200,reg1=0x25 actions=drop +2. table=IngressDefaultRule, priority=0 actions=goto_table:IngressMetric ``` -Flow 1 ensures that we commit connections initiated through the gateway -interface and mark them with a `ct_mark` of `0x20`. This ensures that -[ConntrackStateTable] can perform its functions correctly and rewrite the -destination MAC address to the gateway's MAC address for connections which -require it. 
Such connections include Pod-to-ClusterIP traffic. Note that the -`0x20` mark is applied to *all* connections initiated through the gateway -(i.e. for which the first packet of the connection was received through the -gateway) and that [ConntrackStateTable] will perform the destination MAC address -for the reply traffic of *all* such connections. In some cases (the ones -described for [ConntrackStateTable]), this rewrite is necessary. For others -(e.g. a connection from the host to a local Pod), this rewrite is not necessary -but is also harmless, as the destination MAC is already correct. +Flow 1, based on our sample K8s NetworkPolicy, is to drop traffic destined for OVS port 0x25, the port number associated +with a Pod selected by the label `app: web`. -Flow 2 commits all other new connections. +Flow 2 is the table-miss flow to forward packets to table [IngressMetric]. -All traffic then goes to [HairpinSNATTable]. +This table is also used to implement Antrea-native NetworkPolicy ingress rules created in the Baseline Tier. +Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows will be created at a +lower priority than K8s NetworkPolicy default drop flows. These flows are similar to flows 3-9 in table +[AntreaPolicyIngressRule]. -### HairpinSNATTable (108) +### IngressMetric -The table is used to handle Service hairpin case, which indicates that the -packet should be output to the port on which it was received. +This table is very similar to table [EgressMetric], but used to collect ingress metrics for Antrea-native NetworkPolicies. -If you dump the flows for this table, you should see the flows: +If you dump the flows of this table, you may see the following: ```text -1. table=108, priority=200,ip,nw_src=10.10.0.4,nw_dst=10.10.0.4 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -2. 
table=108, priority=200,ip,nw_src=10.10.0.2,nw_dst=10.10.0.2 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -3. table=108, priority=200,ip,nw_src=10.10.0.3,nw_dst=10.10.0.3 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -4. table=108, priority=0 actions=resubmit(,110) +1. table=IngressMetric, priority=200,ct_state=+new,ct_label=0x3/0xffffffff,ip actions=goto_table:ConntrackCommit +2. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x3/0xffffffff,ip actions=goto_table:ConntrackCommit +3. table=IngressMetric, priority=200,ct_state=+new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit +4. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit +5. table=IngressMetric, priority=200,reg0=0x400/0x400,reg3=0x4 actions=drop +6. table=IngressMetric, priority=0 actions=goto_table:ConntrackCommit ``` -Flow 1-3 are used to match Service packets from Pods. The source IP of the matched -packets by flow 1-3 should be SNAT'd with a virtual hairpin IP since the source and -destination IP addresses should not be the same. Without SNAT, response packets from -a Pod will not be forwarded back to OVS pipeline as the destination IP is the Pod's -own IP, then the connection is interrupted because the conntrack state is only stored -in OVS ct zone, not in the Pod. With SNAT, the destination IP will be the virtual -hairpin IP and forwarded back to OVS pipeline. Note that, bit 18 in NXM_NX_REG0 is -set to 0x1, and it is consumed in [L2ForwardingOutTable] to output the packet -to the port on which it was received with action `IN_PORT`. +Flows 1-2, matching packets with `IngressRuleCTLabel` set to 3 (the `conj_id` allocated for the sample K8s NetworkPolicy +ingress rule and loaded in table [IngressRule] flow 4), are used to collect metrics for the ingress rule. 
+ +Flows 3-4, matching packets with `IngressRuleCTLabel` set to 6 (the `conj_id` allocated for the sample Antrea-native +NetworkPolicy ingress rule and loaded in table [AntreaPolicyIngressRule] flow 10), are used to collect metrics for the +ingress rule. -### L2ForwardingOutTable (110) +Flow 5 is the drop rule for the sample Antrea-native NetworkPolicy ingress rule. It drops the packets by matching +`APDenyRegMark` loaded in table [AntreaPolicyIngressRule] flow 13 and `APConjIDField` set to 4 which is the `conj_id` +allocated for the ingress rule and loaded in table [AntreaPolicyIngressRule] flow 13. -It is a simple table and if you dump the flows for this table, you should only -see 2 flows: +Flow 6 is the table-miss flow. + +### ConntrackCommit + +This table is in charge of committing non-Service connections in `CtZone`. + +If you dump the flows of this table, you may see the following: ```text -1. table=110, priority=200,ip,reg0=0x10000/0x10000 actions=output:NXM_NX_REG1[] -2. table=110, priority=0, actions=drop +1. table=ConntrackCommit, priority=200,ct_state=+new+trk-snat,ct_mark=0/0x10,ip actions=ct(commit,table=Output,zone=65520,exec(move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +2. table=ConntrackCommit, priority=0 actions=goto_table:Output ``` -The first flow outputs all unicast packets to the correct port (the port was -resolved by the "dmac" table, [L2ForwardingCalcTable]). IP packets for which -[L2ForwardingCalcTable] did not set bit 16 of NXM_NX_REG0 will be dropped. +Flow 1 is designed to match the first packet of non-Service connections with the "tracked" state and `NotServiceCTMark`. +Then it commits the relevant connections in `CtZone`, persisting the value of `PktSourceField` to +`ConnSourceCTMarkField`, and forwards the packets to table [Output]. -## Tables (AntreaProxy is disabled) +Flow 2 is the table-miss flow. 
-![OVS pipeline](../assets/ovs-pipeline.svg) +### Output -### DNATTable (40) +This is the final table in the pipeline, responsible for handling the output of packets from OVS. It addresses the +following cases: -This table is created only when AntreaProxy is disabled. Its only job is to -send traffic destined to Services through the local gateway interface, without any -modifications. kube-proxy will then take care of load-balancing the connections -across the different backends for each Service. +1. Output packets to an application-aware engine for further L7 protocol processing. +2. Output packets to a target port and a mirroring port defined in a TrafficControl CR with `Mirror` action. +3. Output packets to a port defined in a TrafficControl CR with `Redirect` action. +4. Output packets from hairpin connections to the ingress port where they were received. +5. Output packets to a target port. +6. Output packets to the OpenFlow controller (Antrea Agent). +7. Drop packets. -If you dump the flows for this table, you should see something like this: +If you dump the flows of this table, you may see the following: ```text -1. table=40, priority=200,ip,nw_dst=10.96.0.0/12 actions=set_field:0x8001->reg1,load:0x1->NXM_NX_REG0[16],goto_table:105 -2. table=40, priority=0 actions=goto_table:45 +1. table=Output, priority=212,ct_mark=0x80/0x80,reg0=0x200000/0x600000 actions=push_vlan:0x8100,move:NXM_NX_CT_LABEL[64..75]->OXM_OF_VLAN_VID[],output:"antrea-l7-tap0" +2. table=Output, priority=211,reg0=0x200000/0x600000,reg4=0x400000/0xc00000 actions=output:NXM_NX_REG1[],output:NXM_NX_REG9[] +3. table=Output, priority=211,reg0=0x200000/0x600000,reg4=0x800000/0xc00000 actions=output:NXM_NX_REG9[] +4. table=Output, priority=210,ct_mark=0x40/0x40 actions=IN_PORT +5. table=Output, priority=200,reg0=0x200000/0x600000 actions=output:NXM_NX_REG1[] +6. table=Output, priority=200,reg0=0x2400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.01) +7. 
table=Output, priority=200,reg0=0x4400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.02) +8. table=Output, priority=0 actions=drop ``` -In the example above, 10.96.0.0/12 is the Service CIDR (this is the default -value used by `kubeadm init`). This flow is not actually required for -forwarding, but to bypass [EgressRuleTable] and [EgressDefaultTable] for Service -traffic on its way to kube-proxy through the gateway. If we omitted this flow, -such traffic would be unconditionally dropped if a Network Policy is applied on -the originating Pod. For such traffic, we instead enforce Network Policy egress -rules when packets come back through the gateway and the destination IP has been -rewritten by kube-proxy (DNAT to a backend for the Service). We cannot output -the Service traffic to the gateway port directly as we haven't committed the -connection yet; instead we store the port in NXM_NX_REG1 - similarly to how we -process non-Service traffic in [L2ForwardingCalcTable] - and forward it to -[ConntrackCommitTable]. By committing the connection we ensure that reply -traffic (traffic from the Service backend which has already gone through -kube-proxy for source IP rewrite) will not be dropped because of Network -Policies. - -The table-miss flow entry (flow 2) for this table forwards all non-Service -traffic to the next table, [AntreaPolicyEgressRuleTable]. 
- -[ClassifierTable]: #classifiertable-0 -[SpoofGuardTable]: #spoofguardtable-10 -[ARPResponderTable]: #arprespondertable-20 -[ServiceHairpinTable]: #servicehairpintable-23 -[ConntrackTable]: #conntracktable-30 -[ConntrackStateTable]: #conntrackstatetable-31 -[DNATTable]: #dnattable-40 -[SessionAffinityTable]: #sessionaffinitytable-40 -[ServiceLBTable]: #servicelbtable-41 -[EndpointDNATTable]: #endpointdnattable-42 -[AntreaPolicyEgressRuleTable]: #antreapolicyegressruletable-45 -[EgressRuleTable]: #egressruletable-50 -[EgressDefaultTable]: #egressdefaulttable-60 -[L3ForwardingTable]: #l3forwardingtable-70 -[SNATTable]: #snattable-71 -[L3DecTTLTable]: #l3decttltable-72 -[L2ForwardingCalcTable]: #l2forwardingcalctable-80 -[AntreaPolicyIngressRuleTable]: #antreapolicyingressruletable-85 -[IngressRuleTable]: #ingressruletable-90 -[IngressDefaultTable]: #ingressdefaulttable-100 -[ConntrackCommitTable]: #conntrackcommittable-105 -[HairpinSNATTable]: #hairpinsnattable-108 -[L2ForwardingOutTable]: #l2forwardingouttable-110 +Flow 1 is for case 1. It matches packets with `L7NPRedirectCTMark` and `OutputToOFPortRegMark`, and then outputs them to +the port `antrea-l7-tap0` specifically created for connecting to an application-aware engine. Notably, these packets are pushed +with an 802.1Q header and loaded with the VLAN ID value persisted in `L7NPRuleVlanIDCTLabel` before being output, due to +the implementation of Antrea-native L7 NetworkPolicy. The application-aware engine enforcing L7 policies (e.g., Suricata) +can leverage the VLAN ID to determine which set of rules to apply to the packet. + +Flow 2 is for case 2. It matches packets with `TrafficControlMirrorRegMark` and `OutputToOFPortRegMark`, and then +outputs them to the port specified in `TargetOFPortField` and the port specified in `TrafficControlTargetOFPortField`. +Unlike the `Redirect` action, the `Mirror` action creates an additional copy of the packet. + +Flow 3 is for case 3. 
It matches packets with `TrafficControlRedirectRegMark` and `OutputToOFPortRegMark`, and then +outputs them to the port specified in `TrafficControlTargetOFPortField`. + +Flow 4 is for case 4. It matches packets from hairpin connections by matching `HairpinCTMark` and outputs them back to the +port where they were received. + +Flow 5 is for case 5. It matches packets by matching `OutputToOFPortRegMark` and outputs them to the OVS port specified by +the value stored in `TargetOFPortField`. + +Flows 6-7 are for case 6. They match packets by matching `OutputToControllerRegMark` and the value stored in +`PacketInOperationField`, then output them to the OpenFlow controller (Antrea Agent) with corresponding user data. + +In practice, you will see additional flows similar to these ones to accommodate different scenarios (different +PacketInOperationField values). Note that packets sent to controller are metered to avoid overrunning the antrea-agent +and using too many resources. + +Flow 8 is the table-miss flow for case 7. It drops packets that do not match any of the flows in this table. 
+ +[ARPSpoofGuard]: #arpspoofguard +[AntreaPolicyEgressRule]: #antreapolicyegressrule +[AntreaPolicyIngressRule]: #antreapolicyingressrule +[Classifier]: #classifier +[ClusterIP without Endpoint]: #clusterip-without-endpoint +[ClusterIP]: #clusterip +[ConntrackCommit]: #conntrackcommit +[ConntrackState]: #conntrackstate +[ConntrackZone]: #conntrackzone +[Ct Labels]: #ovs-ct-label +[Ct Marks]: #ovs-ct-mark +[Ct Zones]: #ovs-ct-zone +[EgressDefaultRule]: #egressdefaultrule +[EgressMark]: #egressmark +[EgressMetric]: #egressmetric +[EgressRule]: #egressrule +[Egress egress-client]: #egress-applied-to-client-pods +[Egress egress-web]: #egress-applied-to-web-pods +[EndpointDNAT]: #endpointdnat +[IngressDefaultRule]: #ingressdefaultrule +[IngressMetric]: #ingressmetric +[IngressRule]: #ingressrule +[L2ForwardingCalc]: #l2forwardingcalc +[L3DecTTL]: #l3decttl +[L3Forwarding]: #l3forwarding +[LoadBalancer]: #loadbalancer +[NodePort]: #nodeport +[NodePortMark]: #nodeportmark +[OVS Registers]: #ovs-registers +[Output]: #output +[PreRoutingClassifier]: #preroutingclassifier +[SNATMark]: #snatmark +[SNAT]: #snat +[Service with ExternalIP]: #service-with-externalip +[Service with ExternalTrafficPolicy Local]: #service-with-externaltrafficpolicy-local +[Service with session affinity]: #service-with-session-affinity +[ServiceLB]: #servicelb +[SessionAffinity]: #sessionaffinity +[SpoofGuard]: #spoofguard +[TrafficControl]: #trafficcontrol +[TrafficControl mirror-db-to-local]: #trafficcontrol-for-packet-mirroring +[TrafficControl redirect-web-to-local]: #trafficcontrol-for-packet-redirecting +[UnSNAT]: #unsnat diff --git a/pkg/agent/openflow/fields.go b/pkg/agent/openflow/fields.go index 47381d5ad0b..78f0845a143 100644 --- a/pkg/agent/openflow/fields.go +++ b/pkg/agent/openflow/fields.go @@ -109,12 +109,12 @@ var ( APConjIDField = binding.NewRegField(3, 0, 31) // reg4(NXM_NX_REG4) - // reg4[0..15]: Field to store the selected Service Endpoint port. 
+ // reg4[0..15]: Field to store the selected Service Endpoint port number. EndpointPortField = binding.NewRegField(4, 0, 15) // reg4[16..18]: Field to store the state of a packet accessing a Service. Marks in this field include: - // - 0b001: packet need to do service selection. - // - 0b010: packet has done service selection. - // - 0b011: packet has done service selection and the selection result needs to be cached. + // - 0b001: packet needs to do Endpoint selection. + // - 0b010: packet has done Endpoint selection. + // - 0b011: packet has done Endpoint selection and the selection result needs to be cached. ServiceEPStateField = binding.NewRegField(4, 16, 18) EpToSelectRegMark = binding.NewRegMark(ServiceEPStateField, 0b001) EpSelectedRegMark = binding.NewRegMark(ServiceEPStateField, 0b010)