Skip to content

Commit

Permalink
northd: Fix logical router load-balancer nat rules when using DGP.
Browse files Browse the repository at this point in the history
This commit fixes the build_distr_lrouter_nat_flows_for_lb function to
include a DNAT flow entry for each DGP in use. Since we have added support
to create multiple gateway ports per logical router, it's necessary to
include in the LR NAT rules pipeline a specific entry for each attached DGP.
Otherwise, the inbound traffic will only be redirected when the incoming LRP
matches the chassis_resident field.

Additionally, this patch includes the ability to use load-balancer with DGPs
attached to multiple chassis. We can have each of the DGPs associated with a
different chassis, and in this case the DNAT rules added by default will not
be enough to guarantee outgoing traffic.

To solve the multiple chassis for DGPs problem, this patch include a new
config options to be configured in the load-balancer. If the use_stateless_nat
is set to true, the logical router that references this load-balancer will use
Stateless NAT rules when the logical router has multiple DGPs. After applying
this patch and setting the use_stateless_nat option, the inbound and/or
outbound traffic can pass through any chassis where the DGP resides without
having problems with CT state.

Reported-at: https://bugs.launchpad.net/ubuntu/+source/ovn/+bug/2054322
Fixes: 15348b7 ("ovn-northd: Multiple distributed gateway port support.")
Signed-off-by: Roberto Bartzen Acosta <[email protected]>
Signed-off-by: Numan Siddique <[email protected]>
  • Loading branch information
r-acosta authored and numansiddique committed Oct 25, 2024
1 parent d741f2f commit 264c831
Show file tree
Hide file tree
Showing 7 changed files with 1,479 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ovn-fake-multinode-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ jobs:
- name: Start basic cluster
run: |
sudo -E ./ovn_cluster.sh start
sudo -E CHASSIS_COUNT=4 GW_COUNT=4 ./ovn_cluster.sh start
sudo podman exec -it ovn-central-az1-1 ovn-nbctl show
sudo podman exec -it ovn-central-az1-1 ovn-appctl -t ovn-northd version
sudo podman exec -it ovn-chassis-1 ovn-appctl -t ovn-controller version
Expand Down
12 changes: 0 additions & 12 deletions northd/en-lr-stateful.c
Original file line number Diff line number Diff line change
Expand Up @@ -516,18 +516,6 @@ lr_stateful_record_create(struct lr_stateful_table *table,

table->array[od->index] = lr_stateful_rec;

/* Load balancers are not supported (yet) if a logical router has multiple
* distributed gateway port. Log a warning. */
if (lr_stateful_rec->has_lb_vip && lr_has_multiple_gw_ports(od)) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
VLOG_WARN_RL(&rl, "Load-balancers are configured on logical "
"router %s, which has %"PRIuSIZE" distributed "
"gateway ports. Load-balancer is not supported "
"yet when there is more than one distributed "
"gateway port on the router.",
od->nbr->name, od->n_l3dgw_ports);
}

return lr_stateful_rec;
}

Expand Down
138 changes: 113 additions & 25 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -11876,31 +11876,42 @@ static void
build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
enum lrouter_nat_lb_flow_type type,
struct ovn_datapath *od,
struct lflow_ref *lflow_ref)
struct lflow_ref *lflow_ref,
struct ovn_port *dgp,
bool stateless_nat)
{
struct ovn_port *dgp = od->l3dgw_ports[0];

const char *undnat_action;

switch (type) {
case LROUTER_NAT_LB_FLOW_FORCE_SNAT:
undnat_action = "flags.force_snat_for_lb = 1; next;";
break;
case LROUTER_NAT_LB_FLOW_SKIP_SNAT:
undnat_action = "flags.skip_snat_for_lb = 1; next;";
break;
case LROUTER_NAT_LB_FLOW_NORMAL:
case LROUTER_NAT_LB_FLOW_MAX:
undnat_action = lrouter_use_common_zone(od)
? "ct_dnat_in_czone;"
: "ct_dnat;";
break;
}
struct ds dnat_action = DS_EMPTY_INITIALIZER;

/* Store the match lengths, so we can reuse the ds buffer. */
size_t new_match_len = ctx->new_match->length;
size_t undnat_match_len = ctx->undnat_match->length;

/* (NOTE) dnat_action: Add the first LB backend IP as a destination
* action of the lr_in_dnat NAT rule. Including the backend IP is useful
* for accepting packets coming from a chassis that does not have
* previously established conntrack entries. This means that the actions
* (ip4.dst + ct_lb_mark) are executed in addition and ip4.dst is not
* useful when traffic passes through the same chassis for ingress/egress
* packets. However, the actions are complementary in cases where traffic
* enters from one chassis, the ack response comes from another chassis,
* and the final ack step of the TCP handshake comes from the first
* chassis used. Without using stateless NAT, the connection will not be
* established because the return packet followed a path through another
* chassis and only ct_lb_mark will not be able to receive the ack and
* forward it to the right backend. With using stateless NAT, the packet
* will be accepted and forwarded to the same backend that corresponds to
* the previous conntrack entry that is in the SYN_SENT state
* (created by ct_lb_mark for the first rcv packet in this flow).
*/
if (stateless_nat) {
if (ctx->lb_vip->n_backends) {
struct ovn_lb_backend *backend = &ctx->lb_vip->backends[0];
bool ipv6 = !IN6_IS_ADDR_V4MAPPED(&backend->ip);
ds_put_format(&dnat_action, "%s.dst = %s; ", ipv6 ? "ip6" : "ip4",
backend->ip_str);
}
}
ds_put_format(&dnat_action, "%s", ctx->new_action[type]);

const char *meter = NULL;

Expand All @@ -11910,20 +11921,46 @@ build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,

if (ctx->lb_vip->n_backends || !ctx->lb_vip->empty_backend_rej) {
ds_put_format(ctx->new_match, " && is_chassis_resident(%s)",
od->l3dgw_ports[0]->cr_port->json_key);
dgp->cr_port->json_key);
}

ovn_lflow_add_with_hint__(ctx->lflows, od, S_ROUTER_IN_DNAT, ctx->prio,
ds_cstr(ctx->new_match), ctx->new_action[type],
ds_cstr(ctx->new_match), ds_cstr(&dnat_action),
NULL, meter, &ctx->lb->nlb->header_,
lflow_ref);

ds_truncate(ctx->new_match, new_match_len);

ds_destroy(&dnat_action);
if (!ctx->lb_vip->n_backends) {
return;
}

struct ds undnat_action = DS_EMPTY_INITIALIZER;
struct ds snat_action = DS_EMPTY_INITIALIZER;

switch (type) {
case LROUTER_NAT_LB_FLOW_FORCE_SNAT:
ds_put_format(&undnat_action, "flags.force_snat_for_lb = 1; next;");
break;
case LROUTER_NAT_LB_FLOW_SKIP_SNAT:
ds_put_format(&undnat_action, "flags.skip_snat_for_lb = 1; next;");
break;
case LROUTER_NAT_LB_FLOW_NORMAL:
case LROUTER_NAT_LB_FLOW_MAX:
ds_put_format(&undnat_action, "%s",
lrouter_use_common_zone(od) ? "ct_dnat_in_czone;"
: "ct_dnat;");
break;
}

/* undnat_action: Remove the ct action from the lr_out_undenat NAT rule.
*/
if (stateless_nat) {
ds_clear(&undnat_action);
ds_put_format(&undnat_action, "next;");
}

/* We need to centralize the LB traffic to properly perform
* the undnat stage.
*/
Expand All @@ -11942,11 +11979,51 @@ build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
ds_put_format(ctx->undnat_match, ") && (inport == %s || outport == %s)"
" && is_chassis_resident(%s)", dgp->json_key, dgp->json_key,
dgp->cr_port->json_key);
/* Use the LB protocol as matching criteria for out undnat and snat when
* creating LBs with stateless NAT. */
if (stateless_nat) {
ds_put_format(ctx->undnat_match, " && %s", ctx->lb->proto);
}
ovn_lflow_add_with_hint(ctx->lflows, od, S_ROUTER_OUT_UNDNAT, 120,
ds_cstr(ctx->undnat_match), undnat_action,
&ctx->lb->nlb->header_,
ds_cstr(ctx->undnat_match),
ds_cstr(&undnat_action), &ctx->lb->nlb->header_,
lflow_ref);

/* (NOTE) snat_action: Add a new rule lr_out_snat with LB VIP as source
* IP action to perform stateless NAT pipeline completely when the
* outgoing packet is redirected to a chassis that does not have an
* active conntrack entry. Otherwise, it will not be SNATed by the
* ct_lb action because it does not refer to a valid created flow. The
* use case for responding to a packet in different chassis is multipath
* via ECMP. So, the LB lr_out_snat is created with a lower priority than
* the other router pipeline entries, in this case, if the packet is not
* SNATed by ct_lb (conntrack lost), it will be SNATed by the LB
* stateless NAT rule. Also, SNAT is performed only when the packet
* matches the configured LB backend IPs, ports and protocols. Otherwise,
* the packet will be forwarded without SNAted interference.
*/
if (stateless_nat) {
if (ctx->lb_vip->port_str) {
ds_put_format(&snat_action, "%s.src = %s; %s.src = %s; next;",
ctx->lb_vip->address_family == AF_INET6 ?
"ip6" : "ip4",
ctx->lb_vip->vip_str, ctx->lb->proto,
ctx->lb_vip->port_str);
} else {
ds_put_format(&snat_action, "%s.src = %s; next;",
ctx->lb_vip->address_family == AF_INET6 ?
"ip6" : "ip4",
ctx->lb_vip->vip_str);
}
ovn_lflow_add_with_hint(ctx->lflows, od, S_ROUTER_OUT_SNAT, 160,
ds_cstr(ctx->undnat_match),
ds_cstr(&snat_action), &ctx->lb->nlb->header_,
lflow_ref);
}

ds_truncate(ctx->undnat_match, undnat_match_len);
ds_destroy(&undnat_action);
ds_destroy(&snat_action);
}

static void
Expand Down Expand Up @@ -12091,6 +12168,8 @@ build_lrouter_nat_flows_for_lb(
* lflow generation for them.
*/
size_t index;
bool use_stateless_nat = smap_get_bool(&lb->nlb->options,
"use_stateless_nat", false);
BITMAP_FOR_EACH_1 (index, bitmap_len, lb_dps->nb_lr_map) {
struct ovn_datapath *od = lr_datapaths->array[index];
enum lrouter_nat_lb_flow_type type;
Expand All @@ -12112,8 +12191,17 @@ build_lrouter_nat_flows_for_lb(
if (!od->n_l3dgw_ports) {
bitmap_set1(gw_dp_bitmap[type], index);
} else {
build_distr_lrouter_nat_flows_for_lb(&ctx, type, od,
lb_dps->lflow_ref);
/* Create stateless LB NAT rules when using multiple DGPs and
* use_stateless_nat is true.
*/
bool stateless_nat = (od->n_l3dgw_ports > 1)
? use_stateless_nat : false;
for (size_t i = 0; i < od->n_l3dgw_ports; i++) {
struct ovn_port *dgp = od->l3dgw_ports[i];
build_distr_lrouter_nat_flows_for_lb(&ctx, type, od,
lb_dps->lflow_ref, dgp,
stateless_nat);
}
}

if (lb->affinity_timeout) {
Expand Down
20 changes: 18 additions & 2 deletions ovn-nb.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2302,6 +2302,16 @@ or
local anymore by the ovn-controller. This option is set to
<code>false</code> by default.
</column>

<column name="options" key="use_stateless_nat"
type='{"type": "boolean"}'>
If the load balancer is configured with <code>use_stateless_nat</code>
option to <code>true</code>, the logical router that references this
load balancer will use Stateless NAT rules when the logical router
has multiple distributed gateway ports(DGP). Otherwise, the outbound
traffic may be dropped in scenarios where we have different chassis
for each DGP. This option is set to <code>false</code> by default.
</column>
</group>
</table>

Expand Down Expand Up @@ -2688,8 +2698,14 @@ or

<column name="load_balancer">
Set of load balancers associated to this logical router. Load balancer
Load balancer rules only work on the Gateway routers or routers with one
and only one distributed gateway port.
rules only work without limitations on the Gateway routers or routers
with one and only one distributed gateway port (DGP). Load balancers
will only work in scenarios that use more than one DGP when the multiple
DGPs are associated with the same gateway chassis, this way this chassis
can apply/maintain the conntrack state without problems. To use a load
balancer in scenarios with DGPs associated with different gateway chassis
(e.g. ECMP routes), consider using the <code>use_stateless_nat</code>
option to <code>true</code> in the load balancer options column.
</column>

<column name="load_balancer_group">
Expand Down
36 changes: 36 additions & 0 deletions tests/multinode-macros.at
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,23 @@ m4_define([M_START_TCPDUMP],
)


# M_FORMAT_CT([ip-addr])
#
# Strip content from the piped input which would differ from test to test
# and limit the output to the rows containing 'ip-addr'.
#
m4_define([M_FORMAT_CT],
[[grep -F "dst=$1," | sed -e 's/id=[0-9]*/id=<cleared>/g' -e 's/state=[0-9_A-Z]*/state=<cleared>/g' | sort | uniq | sed -e 's/zone=[[0-9]]*/zone=<cleared>/' -e 's/mark=[[0-9]]*/mark=<cleared>/' ]])

# M_FORMAT_CURL([ip-addr], [port])
#
# Strip content from the piped input which would differ from test to test
# and limit the output to the rows containing 'ip-addr' and 'port'.
#
m4_define([M_FORMAT_CURL],
[[sed 's/\(.*\)Connected to $1 ($1) port $2/Connected to $1 ($1) port $2\n/' | sed 's/\(.*\)200 OK/200 OK\n/' | grep -i -e connected -e "200 OK" | uniq ]])


OVS_START_SHELL_HELPERS

m_as() {
Expand Down Expand Up @@ -76,6 +93,25 @@ multinode_nbctl () {
m_as ovn-central-az1-1 ovn-nbctl "$@"
}

check_fake_multinode_setup_by_nodes() {
check m_as ovn-central-az1-1 ovn-nbctl --wait=sb sync
for c in $1
do
AT_CHECK([m_as $c ovn-appctl -t ovn-controller version], [0], [ignore])
done
}

cleanup_multinode_resources_by_nodes() {
m_as ovn-central-az1-1 rm -f /etc/ovn/ovnnb_db.db
m_as ovn-central-az1-1 /usr/share/ovn/scripts/ovn-ctl restart_northd
check m_as ovn-central-az1-1 ovn-nbctl --wait=sb sync
for c in $1
do
m_as $c ovs-vsctl del-br br-int
m_as $c ip --all netns delete
done
}

# m_count_rows TABLE [CONDITION...]
#
# Prints the number of rows in TABLE (that satisfy CONDITION).
Expand Down
Loading

0 comments on commit 264c831

Please sign in to comment.