aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
author Kip Macy <kmacy@FreeBSD.org> 2009-04-14 23:05:36 +0000
committer Kip Macy <kmacy@FreeBSD.org> 2009-04-14 23:05:36 +0000
commit 427ac07f05d7ed561978c93c42896f123ec51d9a (patch)
tree 2c468e4e81b86a0a66401b94c6626251cbbccf6a /sys
parent ecbd3f3a291f2704b4ff4599d8bb69c0a10015a7 (diff)
download src-427ac07f05d7ed561978c93c42896f123ec51d9a.tar.gz
src-427ac07f05d7ed561978c93c42896f123ec51d9a.zip
Extend route command:
- add show as alias for get
- add weights to allow mpath to do more than equal cost
- add sticky / nostick to disable / re-enable per-connection load balancing

This adds a field to rt_metrics_lite so network bits of world will need to be re-built.

Reviewed by: jeli & qingli
Notes
Notes: svn path=/head/; revision=191080
Diffstat (limited to 'sys')
-rw-r--r-- sys/net/radix_mpath.c 43
-rw-r--r-- sys/net/route.c 161
-rw-r--r-- sys/net/route.h 16
-rw-r--r-- sys/net/rtsock.c 14
-rw-r--r-- sys/sys/param.h 2
5 files changed, 149 insertions, 87 deletions
diff --git a/sys/net/radix_mpath.c b/sys/net/radix_mpath.c
index 8d94d010b462..9be01d2fe3c3 100644
--- a/sys/net/radix_mpath.c
+++ b/sys/net/radix_mpath.c
@@ -77,15 +77,18 @@ rn_mpath_next(struct radix_node *rn)
return NULL;
}
-u_int32_t
+uint32_t
rn_mpath_count(struct radix_node *rn)
{
- u_int32_t i;
-
- i = 1;
- while ((rn = rn_mpath_next(rn)) != NULL)
- i++;
- return i;
+ uint32_t i = 0;
+ struct rtentry *rt;
+
+ while (rn != NULL) {
+ rt = (struct rtentry *)rn;
+ i += rt->rt_rmx.rmx_weight;
+ rn = rn_mpath_next(rn);
+ }
+ return (i);
}
struct rtentry *
@@ -256,10 +259,12 @@ different:
}
void
-rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
+rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
{
struct radix_node *rn0, *rn;
u_int32_t n;
+ struct rtentry *rt;
+ int64_t weight;
/*
* XXX we don't attempt to lookup cached route again; what should
@@ -284,25 +289,31 @@ rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum)
/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
hash += hashjitter;
hash %= n;
- while (hash-- > 0 && rn) {
+ for (weight = abs((int32_t)hash), rt = ro->ro_rt;
+ weight >= rt->rt_rmx.rmx_weight && rn;
+ weight -= rt->rt_rmx.rmx_weight) {
+
/* stay within the multipath routes */
if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
break;
rn = rn->rn_dupedkey;
+ rt = (struct rtentry *)rn;
}
-
/* XXX try filling rt_gwroute and avoid unreachable gw */
- /* if gw selection fails, use the first match (default) */
+ /* gw selection has failed - there must be only zero weight routes */
if (!rn) {
RT_UNLOCK(ro->ro_rt);
+ ro->ro_rt = NULL;
return;
}
-
- RTFREE_LOCKED(ro->ro_rt);
- ro->ro_rt = (struct rtentry *)rn;
- RT_LOCK(ro->ro_rt);
- RT_ADDREF(ro->ro_rt);
+ if (ro->ro_rt != rt) {
+ RTFREE_LOCKED(ro->ro_rt);
+ ro->ro_rt = (struct rtentry *)rn;
+ RT_LOCK(ro->ro_rt);
+ RT_ADDREF(ro->ro_rt);
+
+ }
RT_UNLOCK(ro->ro_rt);
}
diff --git a/sys/net/route.c b/sys/net/route.c
index f1e13adb066e..5294975033eb 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -826,6 +826,103 @@ bad:
return (error);
}
+#ifdef RADIX_MPATH
+static int
+rn_mpath_update(int req, struct rt_addrinfo *info,
+ struct radix_node_head *rnh, struct rtentry **ret_nrt)
+{
+ /*
+ * if we got multipath routes, we require users to specify
+ * a matching RTAX_GATEWAY.
+ */
+ struct rtentry *rt, *rto = NULL;
+ register struct radix_node *rn;
+ int error = 0;
+
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn == NULL)
+ return (ESRCH);
+ rto = rt = RNTORT(rn);
+ rt = rt_mpath_matchgate(rt, gateway);
+ if (rt == NULL)
+ return (ESRCH);
+ /*
+ * this is the first entry in the chain
+ */
+ if (rto == rt) {
+ rn = rn_mpath_next((struct radix_node *)rt);
+ /*
+ * there is another entry, now it's active
+ */
+ if (rn) {
+ rto = RNTORT(rn);
+ RT_LOCK(rto);
+ rto->rt_flags |= RTF_UP;
+ RT_UNLOCK(rto);
+ } else if (rt->rt_flags & RTF_GATEWAY) {
+ /*
+ * For gateway routes, we need to
+ * make sure that we we are deleting
+ * the correct gateway.
+ * rt_mpath_matchgate() does not
+ * check the case when there is only
+ * one route in the chain.
+ */
+ if (gateway &&
+ (rt->rt_gateway->sa_len != gateway->sa_len ||
+ memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
+ error = ESRCH;
+ goto done;
+ }
+ /*
+ * use the normal delete code to remove
+ * the first entry
+ */
+ if (req != RTM_DELETE)
+ goto nondelete;
+
+ error = ENOENT;
+ goto done;
+ }
+
+ /*
+ * if the entry is 2nd and on up
+ */
+ if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
+ panic ("rtrequest1: rt_mpath_deldup");
+ RT_LOCK(rt);
+ RT_ADDREF(rt);
+ if (req == RTM_DELETE) {
+ rt->rt_flags &= ~RTF_UP;
+ /*
+ * One more rtentry floating around that is not
+ * linked to the routing table. rttrash will be decremented
+ * when RTFREE(rt) is eventually called.
+ */
+ V_rttrash++;
+
+ }
+
+nondelete:
+ if (req != RTM_DELETE)
+ panic("unrecognized request %d", req);
+
+
+ /*
+ * If the caller wants it, then it can have it,
+ * but it's up to it to free the rtentry as we won't be
+ * doing it.
+ */
+ if (ret_nrt) {
+ *ret_nrt = rt;
+ RT_UNLOCK(rt);
+ } else
+ RTFREE_LOCKED(rt);
+done:
+ return (error);
+}
+#endif
+
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
@@ -864,65 +961,15 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
- /*
- * if we got multipath routes, we require users to specify
- * a matching RTAX_GATEWAY.
- */
if (rn_mpath_capable(rnh)) {
- struct rtentry *rto = NULL;
-
- rn = rnh->rnh_matchaddr(dst, rnh);
- if (rn == NULL)
- senderr(ESRCH);
- rto = rt = RNTORT(rn);
- rt = rt_mpath_matchgate(rt, gateway);
- if (!rt)
- senderr(ESRCH);
- /*
- * this is the first entry in the chain
- */
- if (rto == rt) {
- rn = rn_mpath_next((struct radix_node *)rt);
- /*
- * there is another entry, now it's active
- */
- if (rn) {
- rto = RNTORT(rn);
- RT_LOCK(rto);
- rto->rt_flags |= RTF_UP;
- RT_UNLOCK(rto);
- } else if (rt->rt_flags & RTF_GATEWAY) {
- /*
- * For gateway routes, we need to
- * make sure that we we are deleting
- * the correct gateway.
- * rt_mpath_matchgate() does not
- * check the case when there is only
- * one route in the chain.
- */
- if (gateway &&
- (rt->rt_gateway->sa_len != gateway->sa_len ||
- memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
- senderr(ESRCH);
- }
- /*
- * use the normal delete code to remove
- * the first entry
- */
- goto normal_rtdel;
- }
+ error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
- * if the entry is 2nd and on up
+ * "bad" holds true for the success case
+ * as well
*/
- if (!rt_mpath_deldup(rto, rt))
- panic ("rtrequest1: rt_mpath_deldup");
- RT_LOCK(rt);
- RT_ADDREF(rt);
- rt->rt_flags &= ~RTF_UP;
- goto deldone; /* done with the RTM_DELETE command */
+ if (error != ENOENT)
+ goto bad;
}
-
-normal_rtdel:
#endif
/*
* Remove the item from the tree and return it.
@@ -944,9 +991,6 @@ normal_rtdel:
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
-#ifdef RADIX_MPATH
-deldone:
-#endif
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
@@ -1019,6 +1063,7 @@ deldone:
IFAREF(ifa);
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
+ rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
diff --git a/sys/net/route.h b/sys/net/route.h
index 44b04ac14d60..26247882b8e5 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -58,6 +58,7 @@ struct rt_metrics_lite {
u_long rmx_mtu; /* MTU for this path */
u_long rmx_expire; /* lifetime for route, e.g. redirect */
u_long rmx_pksent; /* packets sent using this route */
+ u_long rmx_weight; /* absolute weight */
};
struct rt_metrics {
@@ -71,7 +72,8 @@ struct rt_metrics {
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_pksent; /* packets sent using this route */
- u_long rmx_filler[4]; /* will be used for T/TCP later */
+ u_long rmx_weight; /* route weight */
+ u_long rmx_filler[3]; /* will be used for T/TCP later */
};
/*
@@ -193,13 +195,15 @@ struct ortentry {
#define RTF_LOCAL 0x200000 /* route represents a local address */
#define RTF_BROADCAST 0x400000 /* route represents a bcast address */
#define RTF_MULTICAST 0x800000 /* route represents a mcast address */
- /* 0x1000000 and up unassigned */
-#define RTF_RNH_LOCKED 0x40000000 /* radix node head locked by caller */
+ /* 0x8000000 and up unassigned */
+#define RTF_STICKY 0x10000000 /* always route dst->src */
+
+#define RTF_RNH_LOCKED 0x40000000 /* radix node head is locked */
/* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
#define RTF_FMASK \
(RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
- RTF_REJECT | RTF_STATIC)
+ RTF_REJECT | RTF_STATIC | RTF_STICKY)
/*
* Routing statistics.
@@ -225,12 +229,11 @@ struct rt_msghdr {
int rtm_seq; /* for sender to identify action */
int rtm_errno; /* why failed */
int rtm_fmask; /* bitmask used in RTM_CHANGE message */
-#define rtm_use rtm_fmask /* deprecated, use rtm_rmx->rmx_pksent */
u_long rtm_inits; /* which metrics we are initializing */
struct rt_metrics rtm_rmx; /* metrics themselves */
};
-#define RTM_VERSION 5 /* Up the ante and ignore older versions */
+#define RTM_VERSION 6 /* Up the ante and ignore older versions */
/*
* Message types.
@@ -265,6 +268,7 @@ struct rt_msghdr {
#define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */
#define RTV_RTT 0x40 /* init or lock _rtt */
#define RTV_RTTVAR 0x80 /* init or lock _rttvar */
+#define RTV_WEIGHT 0x100 /* init or lock _weight */
/*
* Bitmask values for rtm_addrs.
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 91aec203ad36..6b7c29b47e74 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -637,7 +637,6 @@ route_output(struct mbuf *m, struct socket *so)
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = 0;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@@ -691,10 +690,8 @@ route_output(struct mbuf *m, struct socket *so)
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
- if (rtm->rtm_fmask & RTF_FMASK)
- rt->rt_flags = (rt->rt_flags &
- ~rtm->rtm_fmask) |
- (rtm->rtm_flags & rtm->rtm_fmask);
+ rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) |
+ (rtm->rtm_flags & RTF_FMASK);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
@@ -773,6 +770,7 @@ rt_setmetrics(u_long which, const struct rt_metrics *in,
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
+ metric(RTV_WEIGHT, rmx_weight);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
@@ -786,6 +784,7 @@ rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
+ metric(rmx_weight);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
@@ -1257,7 +1256,10 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = rt->rt_rmx.rmx_pksent;
+ /*
+ * let's be honest about this being a retarded hack
+ */
+ rtm->rtm_fmask = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
diff --git a/sys/sys/param.h b/sys/sys/param.h
index f02853c8d892..8703c30dbc08 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -57,7 +57,7 @@
* is created, otherwise 1.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 800077 /* Master, propagated to newvers */
+#define __FreeBSD_version 800078 /* Master, propagated to newvers */
#ifndef LOCORE
#include <sys/types.h>