Commit 5bd73431 authored by Ondrej Zajicek (work)'s avatar Ondrej Zajicek (work)
Browse files

BGP: Long-lived graceful restart

The patch implements long-lived graceful restart for BGP, namely
draft-uttaro-idr-bgp-persistence-03.
parent 318acb0f
Loading
Loading
Loading
Loading
+38 −4
Original line number Diff line number Diff line
@@ -2220,13 +2220,16 @@ using the following configuration parameters:
	immediately shut down. Note that this option cannot be used with
	multihop BGP. Default: enabled for direct BGP, disabled otherwise.

	<tag><label id="bgp-bfd">bfd <M>switch</M></tag>
	<tag><label id="bgp-bfd">bfd <M>switch</M>|graceful</tag>
	BGP could use BFD protocol as an advisory mechanism for neighbor
	liveness and failure detection. If enabled, BIRD sets up a BFD session
	for the BGP neighbor and tracks its liveness by it. This has an
	advantage of an order of magnitude lower detection times in case of
	failure. Note that BFD protocol also has to be configured, see
	<ref id="bfd" name="BFD"> section for details. Default: disabled.
	failure. When a neighbor failure is detected, the BGP session is
	restarted. Optionally, it can be configured (by <cf/graceful/ argument)
	to trigger graceful restart instead of regular restart. Note that BFD
	protocol also has to be configured, see <ref id="bfd" name="BFD">
	section for details. Default: disabled.

	<tag><label id="bgp-ttl-security">ttl security <m/switch/</tag>
	Use GTSM (<rfc id="5082"> - the generalized TTL security mechanism). GTSM
@@ -2348,6 +2351,25 @@ using the following configuration parameters:
	re-establish after a restart before deleting stale routes. Default:
	120 seconds.

	<tag><label id="bgp-long-lived-graceful-restart">long lived graceful restart <m/switch/|aware</tag>
	The long-lived graceful restart is an extension of the traditional
	<ref id="bgp-graceful-restart" name="BGP graceful restart">, where stale
	routes are kept even after the <ref id="bgp-graceful-restart-time"
	name="restart time"> expires for additional long-lived stale time, but
	they are marked with the LLGR_STALE community, depreferenced, and
	withdrawn from routers not supporting LLGR. Like traditional BGP
	graceful restart, it has three states: disabled, aware (receiving-only),
	and enabled. Note that long-lived graceful restart requires at least
	aware level of traditional BGP graceful restart. Default: aware, unless
	graceful restart is disabled.

	<tag><label id="bgp-long-lived-stale-time">long lived stale time <m/number/</tag>
	The long-lived stale time is announced in the BGP long-lived graceful
	restart capability and specifies how long the neighbor would keep stale
	routes depreferenced during long-lived graceful restart until either the
	session is re-established and synchronized or the stale time expires and
	routes are removed. Default: 3600 seconds.

	<tag><label id="bgp-interpret-communities">interpret communities <m/switch/</tag>
	<rfc id="1997"> demands that BGP speaker should process well-known
	communities like no-export (65535, 65281) or no-advertise (65535,
@@ -2607,6 +2629,19 @@ be used in explicit configuration.
	configure restarting role per AFI/SAFI pair by this channel option.
	The option is ignored if graceful restart is disabled by protocol-wide
	option. Default: off in aware mode, on in full mode.

	<tag><label id="bgp-long-lived-graceful-restart-c">long lived graceful restart <m/switch/</tag>
	BGP long-lived graceful restart is configured mainly by protocol-wide
	<ref id="bgp-long-lived-graceful-restart" name="options">, but the
	restarting role can be set per AFI/SAFI pair by this channel option.
	The option is ignored if long-lived graceful restart is disabled by
	protocol-wide option. Default: off in aware mode, on in full mode.

	<tag><label id="bgp-long-lived-stale-time-c">long lived stale time <m/number/</tag>
	Like the previous graceful restart channel options, this option allows one to
	set <ref id="bgp-long-lived-stale-time" name="long lived stale time">
	per AFI/SAFI pair instead of per protocol. Default: set by protocol-wide
	option.
</descrip>

<sect1>Attributes
@@ -2761,7 +2796,6 @@ interfaces to be defined for them to work with.
	so the default time is set to a large value.

	<tag><label id="device-iface">interface <m/pattern/ [, <m/.../]</tag>

	By default, the Device protocol handles all interfaces without any
	configuration. Interface definitions allow to specify optional
	parameters for specific interfaces. See <ref id="proto-iface"
+1 −0
Original line number Diff line number Diff line
@@ -229,6 +229,7 @@ struct proto {
  int (*rte_better)(struct rte *, struct rte *);
  int (*rte_same)(struct rte *, struct rte *);
  int (*rte_mergable)(struct rte *, struct rte *);
  struct rte * (*rte_modify)(struct rte *, struct linpool *);
  void (*rte_insert)(struct network *, struct rte *);
  void (*rte_remove)(struct network *, struct rte *);

+3 −0
Original line number Diff line number Diff line
@@ -231,6 +231,7 @@ typedef struct rte {
#ifdef CONFIG_BGP
    struct {
      u8 suppressed;			/* Used for deterministic MED comparison */
      s8 stale;				/* Route is LLGR_STALE, -1 if unknown */
    } bgp;
#endif
#ifdef CONFIG_BABEL
@@ -254,6 +255,7 @@ typedef struct rte {
#define REF_FILTERED	2		/* Route is rejected by import filter */
#define REF_STALE	4		/* Route is stale in a refresh cycle */
#define REF_DISCARD	8		/* Route is scheduled for discard */
#define REF_MODIFY	16		/* Route is scheduled for modify */

/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); }
@@ -297,6 +299,7 @@ int rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter);
rte *rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent);
void rt_refresh_begin(rtable *t, struct channel *c);
void rt_refresh_end(rtable *t, struct channel *c);
void rt_modify_stale(rtable *t, struct channel *c);
void rt_schedule_prune(rtable *t);
void rte_dump(rte *);
void rte_free(rte *);
+59 −0
Original line number Diff line number Diff line
@@ -1437,6 +1437,28 @@ rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collec
  rte_update_unlock();
}

/*
 * Modify existing route by protocol hook, used for long-lived graceful restart.
 *
 * The rte_modify hook of the protocol owning the sender channel of @old is
 * called with @old and rte_update_pool. Its result is handled as follows:
 *
 *  - the same pointer as @old: the route stays in the table untouched, only
 *    its REF_MODIFY mark is dropped;
 *  - a different route: its attributes are looked up if they are not cached,
 *    its flags are taken from @old (without REF_MODIFY, with REF_COW) and it
 *    is handed to rte_recalculate();
 *  - NULL: rte_recalculate() is called with no new route.
 *
 * REF_MODIFY has to be cleared in the first case as well. The table prune
 * loop rescans the route list after each rte_modify() call and would pick
 * the very same route again and again if the mark stayed set.
 */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new == old)
  {
    /* Nothing to change, but the request has been served */
    old->flags &= ~REF_MODIFY;
  }
  else
  {
    if (new)
    {
      /* Make sure the replacement route uses looked-up (cached) attributes */
      if (!rta_is_cached(new->attrs))
	new->attrs = rta_lookup(new->attrs);

      /* Inherit flags of the old route, minus the pending-modify mark */
      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->attrs->src);
  }

  rte_update_unlock();
}

/* Check rtable for best route to given net whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter)
@@ -1521,6 +1543,26 @@ rt_refresh_end(rtable *t, struct channel *c)
    rt_schedule_prune(t);
}

/*
 * Walk all networks of table @t and mark every route that was sent by
 * channel @c, has REF_STALE set and is not REF_FILTERED with REF_MODIFY.
 * When at least one route got marked, a prune of the table is scheduled.
 */
void
rt_modify_stale(rtable *t, struct channel *c)
{
  int marked = 0;

  FIB_WALK(&t->fib, net, nn)
    {
      for (rte *r = nn->routes; r; r = r->next)
	{
	  /* Routes of other channels are not our business */
	  if (r->sender != c)
	    continue;

	  /* Only stale routes that are not filtered are of interest */
	  if ((r->flags & (REF_STALE | REF_FILTERED)) != REF_STALE)
	    continue;

	  r->flags |= REF_MODIFY;
	  marked = 1;
	}
    }
  FIB_WALK_END;

  if (marked)
    rt_schedule_prune(t);
}

/**
 * rte_dump - dump a route
@@ -1712,6 +1754,7 @@ again:

    rescan:
      for (e=n->routes; e; e=e->next)
      {
	if (e->sender->flush_active || (e->flags & REF_DISCARD))
	  {
	    if (limit <= 0)
@@ -1727,6 +1770,22 @@ again:
	    goto rescan;
	  }

	if (e->flags & REF_MODIFY)
	  {
	    if (limit <= 0)
	      {
		FIB_ITERATE_PUT(fit);
		ev_schedule(tab->rt_event);
		return;
	      }

	    rte_modify(e);
	    limit--;

	    goto rescan;
	  }
      }

      if (!n->routes)		/* Orphaned FIB entry */
	{
	  FIB_ITERATE_PUT(fit);
+54 −1
Original line number Diff line number Diff line
@@ -1413,6 +1413,10 @@ bgp_import_control(struct proto *P, rte **new, struct linpool *pool UNUSED)
    /* Do not export outside of AS (or confederation) */
    if (!p->is_interior && int_set_contains(d, BGP_COMM_NO_EXPORT))
      return -1;

    /* Do not export LLGR_STALE routes to LLGR-ignorant peers */
    if (!p->conn->remote_caps->llgr_aware && int_set_contains(d, BGP_COMM_LLGR_STALE))
      return -1;
  }

  return 0;
@@ -1580,6 +1584,19 @@ rte_resolvable(rte *rt)
  return rt->attrs->dest == RTD_UNICAST;
}

/*
 * Tell whether route @r carries the LLGR_STALE community.
 *
 * The answer is cached in r->u.bgp.stale; a negative value there means
 * "not known yet", in which case the BGP community attribute is examined
 * and the result (0 or 1) is stored for subsequent calls.
 */
static inline int
rte_stale(rte *r)
{
  /* Already computed */
  if (r->u.bgp.stale >= 0)
    return r->u.bgp.stale;

  eattr *comm = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY));

  /* No community attribute at all means the route is not stale */
  if (comm && int_set_contains(comm->u.ptr, BGP_COMM_LLGR_STALE))
    r->u.bgp.stale = 1;
  else
    r->u.bgp.stale = 0;

  return r->u.bgp.stale;
}

int
bgp_rte_better(rte *new, rte *old)
{
@@ -1604,6 +1621,14 @@ bgp_rte_better(rte *new, rte *old)
  if (n < o)
    return 0;

  /* LLGR draft - depreference stale routes */
  n = rte_stale(new);
  o = rte_stale(old);
  if (n > o)
    return 0;
  if (n < o)
    return 1;

 /* Start with local preferences */
  x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
  y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
@@ -1725,6 +1750,10 @@ bgp_rte_mergable(rte *pri, rte *sec)
  if (!rte_resolvable(sec))
    return 0;

  /* LLGR draft - depreference stale routes */
  if (rte_stale(pri) != rte_stale(sec))
    return 0;

  /* Start with local preferences */
  x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
  y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
@@ -1926,6 +1955,27 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
    return old_is_group_best;
}

/*
 * Turn route @r into a long-lived stale one.
 *
 * Returns NULL if the route carries the NO_LLGR community, @r itself if it
 * already carries LLGR_STALE. Otherwise the route is passed through
 * rte_cow_rta() (presumably yielding a writable copy allocated from @pool),
 * LLGR_STALE is added to its community attribute, the cached staleness is
 * set to 1 and the resulting route is returned. If the route had no
 * community attribute, the new one is created with the BAF_PARTIAL flag.
 */
struct rte *
bgp_rte_modify_stale(struct rte *r, struct linpool *pool)
{
  eattr *comm = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY));
  struct adata *set = NULL;
  uint attr_flags = BAF_PARTIAL;

  if (comm)
  {
    set = comm->u.ptr;
    attr_flags = comm->flags;
  }

  if (set)
  {
    /* Route must not be kept during LLGR */
    if (int_set_contains(set, BGP_COMM_NO_LLGR))
      return NULL;

    /* Route is marked already, nothing to do */
    if (int_set_contains(set, BGP_COMM_LLGR_STALE))
      return r;
  }

  struct rte *stale = rte_cow_rta(r, pool);
  struct adata *marked = int_set_add(pool, set, BGP_COMM_LLGR_STALE);

  bgp_set_attr_ptr(&(stale->attrs->eattrs), pool, BA_COMMUNITY, attr_flags, marked);
  stale->u.bgp.stale = 1;

  return stale;
}


/*
 * Reconstruct AS_PATH and AGGREGATOR according to RFC 6793 4.2.3
@@ -2011,6 +2061,9 @@ bgp_get_route_info(rte *e, byte *buf)
  if (e->u.bgp.suppressed)
    buf += bsprintf(buf, "-");

  if (rte_stale(e))
    buf += bsprintf(buf, "s");

  if (e->attrs->hostentry)
  {
    if (!rte_resolvable(e))
Loading