4 files changed, 8383 insertions, 0 deletions
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
new file mode 100644
index 000000000000..2f11ae08a579
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dummynet.c
@@ -0,0 +1,2371 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define	DUMMYNET_DEBUG
+
+#include "opt_inet6.h"
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ *  + variable declarations;
+ *  + heap management functions;
+ *  + scheduler and dummynet functions;
+ *  + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by the "dummynet lock".
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513:	initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/limits.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>		/* ip_len, ip_off */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * It is advanced, and corrected for tick drift, in dummynet_task().
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ;	/* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static long searches, search_steps ;
+static int pipe_expire = 1 ;   /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
+static long pipe_byte_limit = 1024 * 1024; /* Same limit, in bytes. */
+
+static int red_lookup_depth = 256;	/* RED - default lookup table depth */
+static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
+static int red_max_pkt_size = 1500;     /* RED - default max packet size */
+
+static struct timeval prev_t, t;	/* Uptime at previous/current task run. */
+static long tick_last;			/* Last tick duration (usec). */
+static long tick_delta;			/* Last vs standard tick diff (usec). */
+static long tick_delta_sum;		/* Accumulated tick difference (usec).*/
+static long tick_adjustment;		/* Tick adjustments done. */
+static long tick_lost;			/* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static int		io_fast;	/* Enable fast dummynet io. */
+static unsigned long	io_pkt;		/* Packets passed to dummynet. */
+static unsigned long	io_pkt_fast;	/* Packets that bypassed the scheduler. */
+static unsigned long	io_pkt_drop;	/* Packets dropped by dummynet. */
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ * dummynet_task() drains them in that order on every run.
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+static int	heap_init(struct dn_heap *h, int size);
+static int	heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void	heap_extract(struct dn_heap *h, void *obj);
+static void	transmit_event(struct dn_pipe *pipe, struct mbuf **head,
+		    struct mbuf **tail);
+static void	ready_event(struct dn_flow_queue *q, struct mbuf **head,
+		    struct mbuf **tail);
+static void	ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
+		    struct mbuf **tail);
+
+#define	HASHSIZE	16	/* number of buckets in the two tables below */
+#define	HASH(num)	((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) /* 0..15 */
+static struct dn_pipe_head	pipehash[HASHSIZE];	/* all pipes */
+static struct dn_flow_set_head	flowsethash[HASHSIZE];	/* all flowsets */
+
+static struct callout dn_timeout;	/* re-armed by dummynet_task() */
+
+extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* NULL-checked in dummynet_send() */
+
+#ifdef SYSCTL_NODE	/* export tunables and counters under net.inet.ip.dummynet */
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+#if 0	/* curr_time is 64 bit */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+    CTLFLAG_RD, &curr_time, 0, "Current tick");
+#endif
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
+    CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+    CTLFLAG_RW, &dn_max_ratio, 0,
+    "Max ratio between dynamic queues and buckets"); /* NB: OID name differs from the variable name */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
+#endif	/* SYSCTL_NODE */
+
+#ifdef DUMMYNET_DEBUG
+int	dummynet_debug = 0;
+#ifdef SYSCTL_NODE
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
+	    0, "control debugging printfs");
+#endif
+#define	DPRINTF(X)	if (dummynet_debug) printf X
+#else
+#define	DPRINTF(X)
+#endif
+
+static struct task	dn_task;	/* enqueued on dn_tq by dummynet() */
+static struct taskqueue	*dn_tq = NULL;
+static void dummynet_task(void *, int);
+
+static struct mtx dummynet_mtx;	/* the "dummynet lock" */
+#define	DUMMYNET_LOCK_INIT() \
+	mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
+#define	DUMMYNET_LOCK_DESTROY()	mtx_destroy(&dummynet_mtx)
+#define	DUMMYNET_LOCK()		mtx_lock(&dummynet_mtx)
+#define	DUMMYNET_UNLOCK()	mtx_unlock(&dummynet_mtx)
+#define	DUMMYNET_LOCK_ASSERT()	mtx_assert(&dummynet_mtx, MA_OWNED) /* caller must hold the lock */
+
+static int	config_pipe(struct dn_pipe *p);
+static int	ip_dn_ctl(struct sockopt *sopt);
+
+static void	dummynet(void *);	/* callout handler, see dn_timeout */
+static void	dummynet_flush(void);
+static void	dummynet_send(struct mbuf *);
+void		dummynet_drain(void);
+static int	dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+static void	dn_rule_delete(void *);
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT	15
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+    struct dn_heap_entry *p;
+
+    if (h->size >= new_size ) {
+	printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
+		h->size, new_size);
+	return 0 ;
+    }
+    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+    p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
+    if (p == NULL) {
+	printf("dummynet: %s, resize %d failed\n", __func__, new_size );
+	return 1 ; /* error */
+    }
+    if (h->size > 0) {
+	bcopy(h->p, p, h->size * sizeof(*p) );
+	free(h->p, M_DUMMYNET);
+    }
+    h->p = p ;
+    h->size = new_size ;
+    return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+    int son = h->elements ;
+
+    if (p == NULL)	/* data already there, set starting point */
+	son = key1 ;
+    else {		/* insert new element at the end, possibly resize */
+	son = h->elements ;
+	if (son == h->size) /* need resize... */
+	    if (heap_init(h, h->elements+1) )
+		return 1 ; /* failure... */
+	h->p[son].object = p ;
+	h->p[son].key = key1 ;
+	h->elements++ ;
+    }
+    while (son > 0) {				/* bubble up */
+	int father = HEAP_FATHER(son) ;
+	struct dn_heap_entry tmp  ;
+
+	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+	    break ; /* found right position */
+	/* son smaller than father, swap and repeat */
+	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+	SET_OFFSET(h, son);
+	son = father ;
+    }
+    SET_OFFSET(h, son);
+    return 0 ;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ *
+ * Removing a specific obj requires h->offset > 0, because the object's
+ * current index is read back from inside the object at that byte
+ * offset; otherwise the function panics. The hole left by the removed
+ * entry is pushed down by repeatedly promoting the smaller child, then
+ * filled with the last entry, which is bubbled up via heap_insert().
+ * Extracting from an empty heap only prints a warning.
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+    int child, father, max = h->elements - 1 ; /* max: index of last entry */
+
+    if (max < 0) {
+	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
+	return ;
+    }
+    father = 0 ; /* default: move up smallest child */
+    if (obj != NULL) { /* extract specific element, index is at offset */
+	if (h->offset <= 0)
+	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
+	father = *((int *)((char *)obj + h->offset)) ;
+	if (father < 0 || father >= h->elements) {
+	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+		father, h->elements);
+	    panic("dummynet: heap_extract");
+	}
+    }
+    RESET_OFFSET(h, father);	/* removed object no longer has a valid index */
+    child = HEAP_LEFT(father) ;		/* left child */
+    while (child <= max) {		/* valid entry */
+	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+	    child = child+1 ;		/* take right child, otherwise left */
+	h->p[father] = h->p[child] ;
+	SET_OFFSET(h, father);
+	father = child ;
+	child = HEAP_LEFT(child) ;   /* left child for next loop */
+    }
+    h->elements-- ;
+    if (father != max) {
+	/*
+	 * Fill hole with last entry and bubble up, reusing the insert code
+	 */
+	h->p[father] = h->p[max] ;
+	heap_insert(h, father, NULL); /* this one cannot fail */
+    }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ *
+ * Compiled out. Re-keys 'object' to new_key and sifts it up when the
+ * new key is smaller than the current one, down otherwise. Requires
+ * h->offset > 0, since the object's index is read from the object.
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+    int temp;
+    int i ;
+    int max = h->elements-1 ;
+    struct dn_heap_entry buf ;
+
+    if (h->offset <= 0)
+	panic("cannot move items on this heap");
+
+    i = *((int *)((char *)object + h->offset)); /* current index of object */
+    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+	h->p[i].key = new_key ;
+	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+		 i = temp ) { /* bubble up */
+	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+	    SET_OFFSET(h, i);
+	}
+    } else {		/* must move down */
+	h->p[i].key = new_key ;
+	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+		temp++ ; /* select child with min key */
+	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+		SET_OFFSET(h, i);
+	    } else
+		break ;
+	    i = temp ;
+	}
+    }
+    SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * Rebuild the heap ordering over the entries currently stored in h.
+ * Used after a bunch of entries have been deleted in place: every
+ * slot, from the first to the last, is bubbled up with heap_insert()
+ * in its "element already in place" mode (p == NULL, key == index).
+ */
+static void
+heapify(struct dn_heap *h)
+{
+    int pos = 0 ;
+
+    while (pos < h->elements) {
+	heap_insert(h, pos, NULL) ;
+	pos++ ;
+    }
+}
+
+/*
+ * Release the entry array of a heap (if one was ever allocated) and
+ * reset the whole descriptor to zero so it can be reused.
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+    if (h->size > 0) {
+	free(h->p, M_DUMMYNET);
+    }
+    bzero(h, sizeof(*h));
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Fetch the dummynet per-packet state attached to mbuf m.
+ * The state lives right after the m_tag header of the packet's first
+ * tag; we rely on the dummynet tag being first rather than searching
+ * the tag list (the KASSERT checks that assumption).
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+    struct m_tag *first_tag;
+
+    first_tag = m_tag_first(m);
+    KASSERT(first_tag != NULL &&
+	    first_tag->m_tag_cookie == MTAG_ABI_COMPAT &&
+	    first_tag->m_tag_id == PACKET_TAG_DUMMYNET,
+	    ("packet on dummynet queue w/o dummynet tag!"));
+    return ((struct dn_pkt_tag *)&first_tag[1]);
+}
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() services the delay line of a pipe, either because
+ * queued packets have become due or because new packets entered it.
+ * The event handled is the delivery time of the packet.
+ *
+ * ready_event() plays the same role for fixed-rate queues; the event
+ * handled is the finish time of the head packet.
+ *
+ * ready_event_wfq() plays the same role for WF2Q queues; the event
+ * handled is the start time of the head packet.
+ *
+ * In every case the data structures are made consistent before
+ * packets are handed out, because that might trigger recursive
+ * invocations of these procedures.
+ *
+ * transmit_event() appends all due packets of 'pipe' to the list
+ * described by *head / *tail and, if packets remain, schedules the
+ * pipe in extract_heap keyed by the next packet's output time.
+ * Must be called with the dummynet lock held.
+ */
+static void
+transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+{
+	struct mbuf *cur;
+	struct dn_pkt_tag *tag;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	/* Unlink every packet whose output time has been reached. */
+	for (;;) {
+		cur = pipe->head;
+		if (cur == NULL)
+			break;
+		tag = dn_tag_get(cur);
+		if (!DN_KEY_LEQ(tag->output_time, curr_time))
+			break;
+
+		pipe->head = cur->m_nextpkt;
+		if (*tail == NULL)
+			*head = cur;
+		else
+			(*tail)->m_nextpkt = cur;
+		*tail = cur;
+	}
+	/* Terminate the output list. */
+	if (*tail != NULL)
+		(*tail)->m_nextpkt = NULL;
+
+	/* Nothing left in the delay line: no further event needed. */
+	cur = pipe->head;
+	if (cur == NULL)
+		return;
+
+	/*
+	 * Leftover packets: put the pipe into the heap for the next event.
+	 * XXX Should check errors on heap_insert, by draining the
+	 * whole pipe and hoping in the future we are more successful.
+	 */
+	tag = dn_tag_get(cur);
+	heap_insert(&extract_heap, tag->output_time, pipe);
+}
+
+#define div64(a, b)	((int64_t)(a) / (int64_t)(b))
+#define DN_TO_DROP	0xffff
+/*
+ * Ticks to wait before packet m can be sent from queue q on pipe p.
+ * This is the packet's "wire time" (length plus extra bits) less the
+ * credit already accumulated in q->numbytes, scaled to ticks and
+ * rounded up. Surplus credit would make the result negative, so it
+ * is clamped at zero.
+ */
+static inline dn_key
+set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+{
+	int64_t wait;
+
+	wait = div64((m->m_pkthdr.len * 8 + q->extra_bits) * hz
+		- q->numbytes + p->bandwidth - 1, p->bandwidth);
+	return ((wait < 0) ? 0 : wait);
+}
+
+/*
+ * Turn the additional MAC overhead/delay of pipe p into an equivalent
+ * number of bits at the pipe's data rate. One sample is picked at
+ * random; samples are in milliseconds, hence the division by 1000.
+ * If the picked index is at or above p->loss_level the packet is
+ * marked DN_TO_DROP. Returns 0 when the pipe has no samples.
+ */
+static dn_key
+compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+{
+	struct dn_pkt_tag *tag;
+	dn_key bits;
+	int pick;
+
+	if (p->samples == NULL || p->samples_no == 0)
+		return 0;
+
+	pick = random() % p->samples_no;
+	bits = ((dn_key)p->samples[pick] * p->bandwidth) / 1000;
+	if (pick < p->loss_level)
+		return bits;
+
+	tag = dn_tag_get(pkt);
+	if (tag != NULL)
+		tag->dn_dir = DN_TO_DROP;
+	return bits;
+}
+
+/*
+ * Release a pipe descriptor together with its sample array, if any.
+ */
+static void
+free_pipe(struct dn_pipe *p)
+{
+	if (p->samples != NULL) {
+		free(p->samples, M_DUMMYNET);
+	}
+	free(p, M_DUMMYNET);
+}
+
+/*
+ * Take pkt (the head of flow queue q, 'len' bytes long) off the queue,
+ * stamp it with its output time (now plus the pipe delay, which may
+ * be zero) and append it to the delay line of pipe p.
+ */
+static void
+move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
+    int len)
+{
+    struct dn_pkt_tag *tag;
+
+    /* Unlink from the flow queue and update its accounting. */
+    q->head = pkt->m_nextpkt ;
+    q->len_bytes -= len ;
+    q->len-- ;
+
+    tag = dn_tag_get(pkt);
+    tag->output_time = curr_time + p->delay ;
+
+    /* Append as the new, last element of the delay line. */
+    pkt->m_nextpkt = NULL;
+    if (p->head != NULL)
+	p->tail->m_nextpkt = pkt;
+    else
+	p->head = pkt;
+    p->tail = pkt;
+}
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the pkt in the scheduler.
+ *
+ * q is the fixed-rate flow queue to service. Packets that become
+ * deliverable right away are appended to the *head / *tail list, but
+ * only when the pipe's delay line was empty on entry.
+ * Must be called with the dummynet lock held.
+ */
+static void
+ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+{
+	struct mbuf *pkt;
+	struct dn_pipe *p = q->fs->pipe;
+	int p_was_empty;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	if (p == NULL) {
+		printf("dummynet: ready_event- pipe is gone\n");
+		return;
+	}
+	p_was_empty = (p->head == NULL);	/* sampled before we add packets */
+
+	/*
+	 * Schedule fixed-rate queues linked to this pipe:
+	 * account for the bw accumulated since last scheduling, then
+	 * drain as many pkts as allowed by q->numbytes and move to
+	 * the delay line (in p) computing output time.
+	 * bandwidth==0 (no limit) means we can drain the whole queue,
+	 * setting len_scaled = 0 does the job.
+	 */
+	q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
+	while ((pkt = q->head) != NULL) {
+		int len = pkt->m_pkthdr.len;
+		dn_key len_scaled = p->bandwidth ? len*8*hz
+			+ q->extra_bits*hz
+			: 0;
+
+		if (DN_KEY_GT(len_scaled, q->numbytes))
+			break;	/* not enough credit for this packet */
+		q->numbytes -= len_scaled;
+		move_pkt(pkt, q, p, len);
+		if (q->head)	/* extra bits are drawn for the new head packet */
+			q->extra_bits = compute_extra_bits(q->head, p);
+	}
+	/*
+	 * If we have more packets queued, schedule next ready event
+	 * (can only occur when bandwidth != 0, otherwise we would have
+	 * flushed the whole queue in the previous loop).
+	 * To this purpose we record the current time and compute how many
+	 * ticks to go for the finish time of the packet.
+	 */
+	if ((pkt = q->head) != NULL) {	/* this implies bandwidth != 0 */
+		dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+
+		q->sched_time = curr_time;
+		heap_insert(&ready_heap, curr_time + t, (void *)q);
+		/*
+		 * XXX Should check errors on heap_insert, and drain the whole
+		 * queue on error hoping next time we are luckier.
+		 */
+	} else		/* RED needs to know when the queue becomes empty. */
+		q->q_time = curr_time;
+
+	/*
+	 * If the delay line was empty call transmit_event() now.
+	 * Otherwise, the scheduler will take care of it.
+	 */
+	if (p_was_empty)
+		transmit_event(p, head, tail);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes < 0,
+ * there is an additional delay.
+ *
+ * p is the pipe to service. Packets that become deliverable right
+ * away are appended to the *head / *tail list, but only when the
+ * delay line was empty on entry. Credit is tracked in a 64 bit local
+ * and clamped back into p->numbytes at the end.
+ * Must be called with the dummynet lock held.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+{
+	int p_was_empty = (p->head == NULL);
+	struct dn_heap *sch = &(p->scheduler_heap);
+	struct dn_heap *neh = &(p->not_eligible_heap);
+	int64_t p_numbytes = p->numbytes;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	if (p->if_name[0] == 0)		/* tx clock is simulated */
+		/*
+		 * Since result may not fit into p->numbytes (32bit) we
+		 * are using 64bit var here.
+		 */
+		p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
+	else {	/*
+		 * tx clock is for real,
+		 * the ifq must be empty or this is a NOP.
+		 */
+		if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+			return;
+		else {
+			DPRINTF(("dummynet: pipe %d ready from %s --\n",
+			    p->pipe_nr, p->if_name));
+		}
+	}
+
+	/*
+	 * While we have backlogged traffic AND credit, we need to do
+	 * something on the queue.
+	 */
+	while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
+		if (sch->elements > 0) {
+			/* Have some eligible pkts to send out. */
+			struct dn_flow_queue *q = sch->p[0].object;
+			struct mbuf *pkt = q->head;
+			struct dn_flow_set *fs = q->fs;
+			uint64_t len = pkt->m_pkthdr.len;
+			/*
+			 * 64 bit, like the credit it is subtracted from:
+			 * len * 8 * hz is computed in 64 bits and must
+			 * not be truncated for large packets at high hz.
+			 */
+			int64_t len_scaled = p->bandwidth ? len * 8 * hz : 0;
+
+			heap_extract(sch, NULL); /* Remove queue from heap. */
+			p_numbytes -= len_scaled;
+			move_pkt(pkt, q, p, len);
+
+			p->V += (len << MY_M) / p->sum;	/* Update V. */
+			q->S = q->F;			/* Update start time. */
+			if (q->len == 0) {
+				/* Flow not backlogged any more. */
+				fs->backlogged--;
+				heap_insert(&(p->idle_heap), q->F, q);
+			} else {
+				/* Still backlogged. */
+
+				/*
+				 * Update F and position in backlogged queue,
+				 * then put flow in not_eligible_heap
+				 * (we will fix this later).
+				 */
+				len = (q->head)->m_pkthdr.len;
+				q->F += (len << MY_M) / (uint64_t)fs->weight;
+				if (DN_KEY_LEQ(q->S, p->V))
+					heap_insert(neh, q->S, q);
+				else
+					heap_insert(sch, q->F, q);
+			}
+		}
+		/*
+		 * Now compute V = max(V, min(S_i)). Remember that all elements
+		 * in sch have by definition S_i <= V so if sch is not empty,
+		 * V is surely the max and we must not update it. Conversely,
+		 * if sch is empty we only need to look at neh.
+		 */
+		if (sch->elements == 0 && neh->elements > 0)
+			p->V = MAX64(p->V, neh->p[0].key);
+		/* Move from neh to sch any packets that have become eligible */
+		while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
+			struct dn_flow_queue *q = neh->p[0].object;
+			heap_extract(neh, NULL);
+			heap_insert(sch, q->F, q);
+		}
+
+		if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
+			p_numbytes = -1;	/* Mark not ready for I/O. */
+			break;
+		}
+	}
+	if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0 &&
+	    p->idle_heap.elements > 0) {
+		/*
+		 * No traffic and no events scheduled.
+		 * We can get rid of idle-heap.
+		 */
+		int i;
+
+		for (i = 0; i < p->idle_heap.elements; i++) {
+			struct dn_flow_queue *q = p->idle_heap.p[i].object;
+
+			/* S = F + 1 marks the timestamps as invalid. */
+			q->F = 0;
+			q->S = q->F + 1;
+		}
+		p->sum = 0;
+		p->V = 0;
+		p->idle_heap.elements = 0;
+	}
+	/*
+	 * If we are getting clocks from dummynet (not a real interface) and
+	 * If we are under credit, schedule the next ready event.
+	 * Also fix the delivery time of the last packet.
+	 */
+	if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
+		dn_key t = 0;		/* Number of ticks i have to wait. */
+
+		if (p->bandwidth > 0)
+			t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
+		dn_tag_get(p->tail)->output_time += t;
+		p->sched_time = curr_time;
+		heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+		/*
+		 * XXX Should check errors on heap_insert, and drain the whole
+		 * queue on error hoping next time we are luckier.
+		 */
+	}
+
+	/* Fit (adjust if necessary) 64bit result into 32bit variable. */
+	if (p_numbytes > INT_MAX)
+		p->numbytes = INT_MAX;
+	else if (p_numbytes < INT_MIN)
+		p->numbytes = INT_MIN;
+	else
+		p->numbytes = p_numbytes;
+
+	/*
+	 * If the delay line was empty call transmit_event() now.
+	 * Otherwise, the scheduler will take care of it.
+	 */
+	if (p_was_empty)
+		transmit_event(p, head, tail);
+}
+
+/*
+ * Callout handler, armed by dummynet_task() to fire one tick after the
+ * previous run. It does no work itself: it only schedules the next
+ * run by enqueueing dn_task on the dn_tq taskqueue.
+ */
+static void
+dummynet(void * __unused unused)
+{
+
+	(void)taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * The main dummynet processing function.
+ *
+ * Under the dummynet lock: updates the tick statistics, advances
+ * curr_time (compensating for accumulated drift), fires every due
+ * event in the three scheduler heaps and expires at most one idle
+ * flow_queue per pipe. The packets collected by the event handlers
+ * are sent after the lock is dropped, then the callout is re-armed.
+ * 'pending' is used to count coalesced (lost) ticks; 'context' is
+ * unused here.
+ */
+static void
+dummynet_task(void *context, int pending)
+{
+	struct mbuf *head = NULL, *tail = NULL;
+	struct dn_pipe *pipe;
+	struct dn_heap *heaps[3];
+	struct dn_heap *h;
+	void *p;	/* generic parameter to handler */
+	int i;
+
+	DUMMYNET_LOCK();
+
+	heaps[0] = &ready_heap;			/* fixed-rate queues */
+	heaps[1] = &wfq_ready_heap;		/* wfq queues */
+	heaps[2] = &extract_heap;		/* delay line */
+
+ 	/* Update number of lost(coalesced) ticks. */
+ 	tick_lost += pending - 1;
+ 
+ 	getmicrouptime(&t);
+ 	/* Last tick duration (usec). */
+ 	tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
+ 	    (t.tv_usec - prev_t.tv_usec);
+ 	/* Last tick vs standard tick difference (usec). */
+ 	tick_delta = (tick_last * hz - 1000000) / hz;
+ 	/* Accumulated tick difference (usec). */
+ 	tick_delta_sum += tick_delta;
+ 
+ 	prev_t = t;
+ 
+ 	/*
+ 	 * Adjust curr_time if accumulated tick difference greater than
+ 	 * 'standard' tick. Since curr_time should be monotonically increasing,
+ 	 * we do positive adjustment as required and throttle curr_time in
+ 	 * case of negative adjustment.
+ 	 * NOTE(review): 'tick' is declared outside this file; presumably
+ 	 * the standard tick length in usec - confirm.
+ 	 */
+  	curr_time++;
+ 	if (tick_delta_sum - tick >= 0) {
+ 		int diff = tick_delta_sum / tick;	/* whole ticks we are behind */
+ 
+ 		curr_time += diff;
+ 		tick_diff += diff;
+ 		tick_delta_sum %= tick;
+ 		tick_adjustment++;
+ 	} else if (tick_delta_sum + tick <= 0) {
+ 		curr_time--;	/* cancels the increment above */
+ 		tick_diff--;
+ 		tick_delta_sum += tick;
+ 		tick_adjustment++;
+ 	}
+
+	for (i = 0; i < 3; i++) {
+		h = heaps[i];
+		while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
+			if (h->p[0].key > curr_time)
+				printf("dummynet: warning, "
+				    "heap %d is %d ticks late\n",
+				    i, (int)(curr_time - h->p[0].key));
+			/* store a copy before heap_extract */
+			p = h->p[0].object;
+			/* need to extract before processing */
+			heap_extract(h, NULL);
+			if (i == 0)
+				ready_event(p, &head, &tail);
+			else if (i == 1) {
+				struct dn_pipe *pipe = p;	/* NB: shadows the outer 'pipe' */
+				if (pipe->if_name[0] != '\0')
+					printf("dummynet: bad ready_event_wfq "
+					    "for pipe %s\n", pipe->if_name);
+				else
+					ready_event_wfq(p, &head, &tail);
+			} else
+				transmit_event(p, &head, &tail);
+		}
+	}
+
+	/* Sweep pipes trying to expire idle flow_queues. */
+	for (i = 0; i < HASHSIZE; i++)
+		SLIST_FOREACH(pipe, &pipehash[i], next)
+			if (pipe->idle_heap.elements > 0 &&
+			    DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
+				struct dn_flow_queue *q =
+				    pipe->idle_heap.p[0].object;
+
+				heap_extract(&(pipe->idle_heap), NULL);
+				/* Mark timestamp as invalid. */
+				q->S = q->F + 1;
+				pipe->sum -= q->fs->weight;
+			}
+
+	DUMMYNET_UNLOCK();
+
+	if (head != NULL)
+		dummynet_send(head);	/* called after the lock is released */
+
+	callout_reset(&dn_timeout, 1, dummynet, NULL);	/* run dummynet() in 1 tick */
+}
+
+/*
+ * Hand a chain of packets (linked through m_nextpkt) back to the
+ * network stack. Each packet is detached from the chain and then
+ * dispatched according to the direction stored in its dummynet tag;
+ * packets marked DN_TO_DROP or carrying an unknown direction are freed.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+	struct dn_pkt_tag *tag;
+	struct mbuf *next;
+	struct ip *ip;
+
+	while (m != NULL) {
+		/* Detach the packet before passing it on. */
+		next = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		tag = dn_tag_get(m);
+
+		switch (tag->dn_dir) {
+		case DN_TO_IP_OUT:
+			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+			break;
+
+		case DN_TO_IP_IN:
+			ip = mtod(m, struct ip *);
+			ip->ip_len = htons(ip->ip_len);
+			ip->ip_off = htons(ip->ip_off);
+			netisr_dispatch(NETISR_IP, m);
+			break;
+
+#ifdef INET6
+		case DN_TO_IP6_IN:
+			netisr_dispatch(NETISR_IPV6, m);
+			break;
+
+		case DN_TO_IP6_OUT:
+			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+			break;
+#endif
+
+		case DN_TO_IFB_FWD:
+			if (bridge_dn_p == NULL)
+				printf("dummynet: if_bridge not loaded\n");
+			else
+				((*bridge_dn_p)(m, tag->ifp));
+			break;
+
+		case DN_TO_ETH_DEMUX:
+			/*
+			 * The Ethernet code expects the Ethernet header to
+			 * be contiguous in the first mbuf; pull it up when
+			 * it is not. On failure m is NULL and the packet
+			 * is gone.
+			 */
+			if (m->m_len < ETHER_HDR_LEN) {
+				m = m_pullup(m, ETHER_HDR_LEN);
+				if (m == NULL) {
+					printf("dummynet/ether: pullup failed, "
+					    "dropping packet\n");
+					break;
+				}
+			}
+			ether_demux(m->m_pkthdr.rcvif, m);
+			break;
+
+		case DN_TO_ETH_OUT:
+			ether_output_frame(tag->ifp, m);
+			break;
+
+		case DN_TO_DROP:
+			/* drop the packet after some time */
+			m_freem(m);
+			break;
+
+		default:
+			printf("dummynet: bad switch %d!\n", tag->dn_dir);
+			m_freem(m);
+			break;
+		}
+
+		m = next;
+	}
+}
+
+/*
+ * Unconditionally free the idle queues of flow set fs; used when we
+ * are short of queues. A queue is idle when it holds no packets and
+ * its timestamps are marked invalid (S == F + 1). The scan covers
+ * every hash slot plus the overflow slot at index rq_size, and runs
+ * at most once per time_uptime value.
+ * Returns the number of queues freed.
+ */
+static int
+expire_queues(struct dn_flow_set *fs)
+{
+    struct dn_flow_queue *q, **link ;
+    int slot, freed = 0 ;
+
+    if (fs->last_expired == time_uptime)
+	return 0 ;
+    fs->last_expired = time_uptime ;
+
+    for (slot = 0 ; slot <= fs->rq_size ; slot++) { /* last one is overflow */
+	link = &fs->rq[slot] ;
+	while ((q = *link) != NULL) {
+	    if (q->head != NULL || q->S != q->F+1) {
+		/* still in use, keep it and move on */
+		link = &q->next ;
+		continue ;
+	    }
+	    /* entry is idle, unlink and expire it */
+	    *link = q->next ;
+	    fs->rq_elements-- ;
+	    freed++ ;
+	    free(q, M_DUMMYNET);
+	}
+    }
+    return freed ;
+}
+
+/*
+ * Allocate a new flow queue and link it at the head of slot i.
+ * When the flow set already holds more than rq_size * dn_max_ratio
+ * queues and expire_queues() cannot free any, the overflow slot
+ * (index rq_size) is used instead: its existing queue is returned if
+ * there is one, otherwise the new queue is created there.
+ * Returns NULL when the allocation fails.
+ */
+static struct dn_flow_queue *
+create_queue(struct dn_flow_set *fs, int i)
+{
+	struct dn_flow_queue *nq;
+	int crowded;
+
+	crowded = fs->rq_elements > fs->rq_size * dn_max_ratio;
+	if (crowded && expire_queues(fs) == 0) {
+		/* No way to get room, fall back to the overflow slot. */
+		i = fs->rq_size;
+		if (fs->rq[i] != NULL)
+			return fs->rq[i];
+	}
+
+	nq = malloc(sizeof(*nq), M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (nq == NULL) {
+		printf("dummynet: sorry, cannot allocate queue for new flow\n");
+		return (NULL);
+	}
+
+	nq->fs = fs;
+	nq->hash_slot = i;
+	/* S = F + 1 is the marker for "timestamps are invalid". */
+	nq->S = nq->F + 1;
+	if (io_fast)
+		nq->numbytes = fs->pipe->bandwidth;
+	else
+		nq->numbytes = 0;
+
+	/* Push the queue on the front of its slot and count it. */
+	nq->next = fs->rq[i];
+	fs->rq[i] = nq;
+	fs->rq_elements++;
+	return (nq);
+}
+
+/*
+ * Given a flow_set and a flow id, find the matching queue after applying
+ * the flow mask.  A queue that is found is moved to the front of its
+ * slot so that further searches take less time.  While scanning, idle
+ * queues are expired when pipe_expire is set.  If no queue matches, a
+ * new one is requested from create_queue().
+ *
+ * Note that 'id' is modified in place: it is masked with fs->flow_mask
+ * and its flags field is cleared.
+ *
+ * The hash is computed in an unsigned 32-bit variable so the slot index
+ * can never become negative, whatever bits flow_id6 carries.  In the
+ * IPv6 branch the upper 16 bits of each source address word are folded
+ * in with a right shift, mirroring the IPv4 branch; the former
+ * "(x << 16) & 0xffff" terms always evaluated to zero.
+ *
+ * Returns the queue, or NULL if a new one could not be allocated.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
+{
+	int i = 0;	/* slot index, also needed for new allocations */
+	struct dn_flow_queue *q, *prev;
+	int is_v6 = IS_IP6_FLOW_ID(id);
+	uint32_t h;
+
+	if (!(fs->flags_fs & DN_HAVE_FLOW_MASK))
+		q = fs->rq[0];
+	else {
+		/* First, do the masking, then hash. */
+		id->dst_port &= fs->flow_mask.dst_port;
+		id->src_port &= fs->flow_mask.src_port;
+		id->proto &= fs->flow_mask.proto;
+		id->flags = 0;	/* we don't care about this one */
+		if (is_v6) {
+			int k;
+
+			APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
+			APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
+			id->flow_id6 &= fs->flow_mask.flow_id6;
+
+			h = (uint32_t)(id->dst_port << 1) ^ id->src_port ^
+			    id->proto ^ id->flow_id6;
+			/* Fold in the four 32-bit words of each address. */
+			for (k = 0; k < 4; k++) {
+				uint32_t d =
+				    id->dst_ip6.__u6_addr.__u6_addr32[k];
+				uint32_t s =
+				    id->src_ip6.__u6_addr.__u6_addr32[k];
+
+				h ^= (d & 0xffff) ^
+				    ((d >> 15) & 0xffff) ^
+				    ((s << 1) & 0xfffff) ^
+				    ((s >> 16) & 0xffff);
+			}
+		} else {
+			id->dst_ip &= fs->flow_mask.dst_ip;
+			id->src_ip &= fs->flow_mask.src_ip;
+
+			h = (id->dst_ip & 0xffff) ^
+			    ((id->dst_ip >> 15) & 0xffff) ^
+			    ((id->src_ip << 1) & 0xffff) ^
+			    ((id->src_ip >> 16) & 0xffff) ^
+			    (uint32_t)(id->dst_port << 1) ^ id->src_port ^
+			    id->proto;
+		}
+		/* Unsigned modulo: the result is always in [0, rq_size). */
+		i = h % (uint32_t)fs->rq_size;
+
+		/* Finally, scan the current list for a match. */
+		searches++;
+		for (prev = NULL, q = fs->rq[i]; q; ) {
+			search_steps++;
+			if (is_v6 &&
+			    IN6_ARE_ADDR_EQUAL(&id->dst_ip6, &q->id.dst_ip6) &&
+			    IN6_ARE_ADDR_EQUAL(&id->src_ip6, &q->id.src_ip6) &&
+			    id->dst_port == q->id.dst_port &&
+			    id->src_port == q->id.src_port &&
+			    id->proto == q->id.proto &&
+			    id->flags == q->id.flags &&
+			    id->flow_id6 == q->id.flow_id6)
+				break;	/* found */
+
+			if (!is_v6 && id->dst_ip == q->id.dst_ip &&
+			    id->src_ip == q->id.src_ip &&
+			    id->dst_port == q->id.dst_port &&
+			    id->src_port == q->id.src_port &&
+			    id->proto == q->id.proto &&
+			    id->flags == q->id.flags)
+				break;	/* found */
+
+			/* No match. Check if we can expire the entry. */
+			if (pipe_expire && q->head == NULL &&
+			    q->S == q->F + 1) {
+				/* Entry is idle and not in any heap. */
+				struct dn_flow_queue *old_q = q;
+
+				if (prev != NULL)
+					prev->next = q = q->next;
+				else
+					fs->rq[i] = q = q->next;
+				fs->rq_elements--;
+				free(old_q, M_DUMMYNET);
+				continue;
+			}
+			prev = q;
+			q = q->next;
+		}
+		if (q && prev != NULL) {	/* found and not in front */
+			prev->next = q->next;
+			q->next = fs->rq[i];
+			fs->rq[i] = q;
+		}
+	}
+	if (q == NULL) {	/* no match, need to allocate a new entry */
+		q = create_queue(fs, i);
+		if (q != NULL)
+			q->id = *id;
+	}
+	return q;
+}
+
+/*
+ * Decide whether RED (or Gentle-RED) wants to drop an arriving packet.
+ *
+ * fs	flow set holding the RED parameters (already in scaled form).
+ * q	queue the packet is destined for; its avg, count and random
+ *	fields are updated here.
+ * len	length of the arriving packet, used to weight the drop
+ *	probability when the queue size is measured in bytes.
+ *
+ * Returns 1 if the packet must be dropped, 0 if it is accepted.
+ *
+ * NOTE(review): fs->lookup_step and fs->max_pkt_size are used as
+ * divisors below and nothing in this function guards against zero.
+ */
+static int
+red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+{
+	/*
+	 * RED algorithm
+	 *
+	 * RED calculates the average queue size (avg) using a low-pass filter
+	 * with an exponential weighted (w_q) moving average:
+	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
+	 * where q_size is the queue length (measured in bytes or packets).
+	 *
+	 * If q_size == 0, we compute the idle time for the link, and set
+	 *	avg = (1 - w_q)^(idle/s)
+	 * where s is the time needed for transmitting a medium-sized packet.
+	 *
+	 * Now, if avg < min_th the packet is enqueued.
+	 * If avg > max_th the packet is dropped. Otherwise, the packet is
+	 * dropped with probability P function of avg.
+	 */
+
+	int64_t p_b = 0;
+
+	/* Queue in bytes or packets? */
+	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
+	    q->len_bytes : q->len;
+
+	DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
+
+	/* Average queue size estimation. */
+	if (q_size != 0) {
+		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+		int diff = SCALE(q_size) - q->avg;
+		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+		q->avg += (int)v;
+	} else {
+		/*
+		 * Queue is empty, find for how long the queue has been
+		 * empty and use a lookup table for computing
+		 * (1 - w_q)^(idle_time/s) where s is the time to send a
+		 * (small) packet.
+		 * XXX check wraps...
+		 */
+		if (q->avg) {
+			u_int t = (curr_time - q->q_time) / fs->lookup_step;
+
+			/* Beyond the table depth the average decays to 0. */
+			q->avg = (t < fs->lookup_depth) ?
+			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+		}
+	}
+	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
+
+	/* Should i drop? */
+	if (q->avg < fs->min_th) {
+		q->count = -1;
+		return (0);	/* accept packet */
+	}
+	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
+		if (fs->flags_fs & DN_IS_GENTLE_RED) {
+			/*
+			 * According to Gentle-RED, if avg is greater than
+			 * max_th the packet is dropped with a probability
+			 *	 p_b = c_3 * avg - c_4
+			 * where c_3 = (1 - max_p) / max_th
+			 *       c_4 = 1 - 2 * max_p
+			 */
+			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+			    fs->c_4;
+		} else {
+			q->count = -1;
+			DPRINTF(("dummynet: - drop"));
+			return (1);
+		}
+	} else if (q->avg > fs->min_th) {
+		/*
+		 * We compute p_b using the linear dropping function
+		 *	 p_b = c_1 * avg - c_2
+		 * where c_1 = max_p / (max_th - min_th)
+		 * 	 c_2 = max_p * min_th / (max_th - min_th)
+		 */
+		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+	}
+
+	/* In byte mode, weight the probability by the packet length. */
+	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
+		p_b = (p_b * len) / fs->max_pkt_size;
+	/* count was -1: first packet above min_th, just draw a threshold. */
+	if (++q->count == 0)
+		q->random = random() & 0xffff;
+	else {
+		/*
+		 * q->count counts packets arrived since last drop, so a greater
+		 * value of q->count means a greater packet drop probability.
+		 */
+		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+			q->count = 0;
+			DPRINTF(("dummynet: - red drop"));
+			/* After a drop we calculate a new random value. */
+			q->random = random() & 0xffff;
+			return (1);	/* drop */
+		}
+	}
+	/* End of RED algorithm. */
+
+	return (0);	/* accept */
+}
+
+/*
+ * Look up the flow set numbered fs_nr in its flowsethash bucket.
+ * Returns NULL when no such flow set is configured.
+ */
+static __inline struct dn_flow_set *
+locate_flowset(int fs_nr)
+{
+	struct dn_flow_set *cur;
+
+	SLIST_FOREACH(cur, &flowsethash[HASH(fs_nr)], next) {
+		if (cur->fs_nr != fs_nr)
+			continue;
+		return (cur);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Look up the pipe numbered pipe_nr in its pipehash bucket.
+ * Returns NULL when no such pipe is configured.
+ */
+static __inline struct dn_pipe *
+locate_pipe(int pipe_nr)
+{
+	struct dn_pipe *cur;
+
+	SLIST_FOREACH(cur, &pipehash[HASH(pipe_nr)], next) {
+		if (cur->pipe_nr != pipe_nr)
+			continue;
+		return (cur);
+	}
+
+	return (NULL);
+}
+
+/*
+ * dummynet hook for packets. Below 'pipe' is a pipe or a queue
+ * depending on whether WF2Q or fixed bw is used.
+ *
+ * m0		pointer to the mbuf pointer holding the packet. On return
+ *		*m0 is NULL when the packet was queued or dropped; it is
+ *		left untouched on the "fast io" path, where the packet
+ *		could be released immediately and is handed back.
+ * dir		where shall we send the packet after dummynet (DN_TO_*).
+ * fwa		ipfw arguments: the matching rule (fwa->rule), the pipe or
+ *		queue number (fwa->cookie), the flow id (fwa->f_id) and
+ *		the interface stored in the packet tag (fwa->oif).
+ *
+ * Returns 0 when the packet was accepted, and also when it was dropped
+ * on a flow set that has DN_NOERROR set; ENOBUFS for any other drop.
+ */
+static int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+	struct mbuf *m = *m0, *head = NULL, *tail = NULL;
+	struct dn_pkt_tag *pkt;
+	struct m_tag *mtag;
+	struct dn_flow_set *fs = NULL;
+	struct dn_pipe *pipe;
+	uint64_t len = m->m_pkthdr.len;
+	struct dn_flow_queue *q = NULL;
+	int is_pipe;
+	ipfw_insn *cmd = ACTION_PTR(fwa->rule);
+
+	KASSERT(m->m_nextpkt == NULL,
+	    ("dummynet_io: mbuf queue passed to dummynet"));
+
+	/* Skip optional log/altq/tag instructions preceding the action. */
+	if (cmd->opcode == O_LOG)
+		cmd += F_LEN(cmd);
+	if (cmd->opcode == O_ALTQ)
+		cmd += F_LEN(cmd);
+	if (cmd->opcode == O_TAG)
+		cmd += F_LEN(cmd);
+	is_pipe = (cmd->opcode == O_PIPE);
+
+	DUMMYNET_LOCK();
+	io_pkt++;
+	/*
+	 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
+	 *
+	 * XXXGL: probably the pipe->fs and fs->pipe logic here
+	 * below can be simplified.
+	 */
+	if (is_pipe) {
+		pipe = locate_pipe(fwa->cookie);
+		if (pipe != NULL)
+			fs = &(pipe->fs);
+	} else
+		fs = locate_flowset(fwa->cookie);
+
+	if (fs == NULL)
+		goto dropit;	/* This queue/pipe does not exist! */
+	pipe = fs->pipe;
+	if (pipe == NULL) {	/* Must be a queue, try find a matching pipe. */
+		pipe = locate_pipe(fs->parent_nr);
+		if (pipe != NULL)
+			fs->pipe = pipe;
+		else {
+			printf("dummynet: no pipe %d for queue %d, drop pkt\n",
+			    fs->parent_nr, fs->fs_nr);
+			goto dropit;
+		}
+	}
+	q = find_queue(fs, &(fwa->f_id));
+	if (q == NULL)
+		goto dropit;		/* Cannot allocate queue. */
+
+	/* Update statistics, then check reasons to drop pkt. */
+	q->tot_bytes += len;
+	q->tot_pkts++;
+	if (fs->plr && random() < fs->plr)
+		goto dropit;		/* Random pkt drop. */
+	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+		if (q->len_bytes > fs->qsize)
+			goto dropit;	/* Queue size overflow. */
+	} else {
+		if (q->len >= fs->qsize)
+			goto dropit;	/* Queue count overflow. */
+	}
+	if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
+		goto dropit;
+
+	/* XXX expensive to zero, see if we can remove it. */
+	mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+	    sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
+	if (mtag == NULL)
+		goto dropit;		/* Cannot allocate packet header. */
+	m_tag_prepend(m, mtag);		/* Attach to mbuf chain. */
+
+	/* The dn_pkt_tag payload follows the m_tag header. */
+	pkt = (struct dn_pkt_tag *)(mtag + 1);
+	/*
+	 * Ok, i can handle the pkt now...
+	 * Build and enqueue packet + parameters.
+	 */
+	pkt->rule = fwa->rule;
+	pkt->dn_dir = dir;
+
+	pkt->ifp = fwa->oif;
+
+	/* Append the packet at the tail of the flow queue. */
+	if (q->head == NULL)
+		q->head = m;
+	else
+		q->tail->m_nextpkt = m;
+	q->tail = m;
+	q->len++;
+	q->len_bytes += len;
+
+	if (q->head != m)		/* Flow was not idle, we are done. */
+		goto done;
+
+	/* Reset the credit if the queue last ran before the current tick. */
+	if (q->q_time < curr_time)
+		q->numbytes = io_fast ? fs->pipe->bandwidth : 0;
+	q->q_time = curr_time;
+
+	/*
+	 * If we reach this point the flow was previously idle, so we need
+	 * to schedule it. This involves different actions for fixed-rate or
+	 * WF2Q queues.
+	 */
+	if (is_pipe) {
+		/* Fixed-rate queue: just insert into the ready_heap. */
+		dn_key t = 0;
+
+		if (pipe->bandwidth) {
+			q->extra_bits = compute_extra_bits(m, pipe);
+			t = set_ticks(m, q, pipe);
+		}
+		q->sched_time = curr_time;
+		if (t == 0)		/* Must process it now. */
+			ready_event(q, &head, &tail);
+		else
+			heap_insert(&ready_heap, curr_time + t , q);
+	} else {
+		/*
+		 * WF2Q. First, compute start time S: if the flow was
+		 * idle (S = F + 1) set S to the virtual time V for the
+		 * controlling pipe, and update the sum of weights for the pipe;
+		 * otherwise, remove flow from idle_heap and set S to max(F,V).
+		 * Second, compute finish time F = S + len / weight.
+		 * Third, if pipe was idle, update V = max(S, V).
+		 * Fourth, count one more backlogged flow.
+		 */
+		if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
+			q->S = pipe->V;
+			pipe->sum += fs->weight; /* Add weight of new queue. */
+		} else {
+			heap_extract(&(pipe->idle_heap), q);
+			q->S = MAX64(q->F, pipe->V);
+		}
+		q->F = q->S + (len << MY_M) / (uint64_t)fs->weight;
+
+		if (pipe->not_eligible_heap.elements == 0 &&
+		    pipe->scheduler_heap.elements == 0)
+			pipe->V = MAX64(q->S, pipe->V);
+		fs->backlogged++;
+		/*
+		 * Look at eligibility. A flow is not eligible if S>V (when
+		 * this happens, it means that there is some other flow already
+		 * scheduled for the same pipe, so the scheduler_heap cannot be
+		 * empty). If the flow is not eligible we just store it in the
+		 * not_eligible_heap. Otherwise, we store in the scheduler_heap
+		 * and possibly invoke ready_event_wfq() right now if there is
+		 * leftover credit.
+		 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
+		 * and for all flows in not_eligible_heap (NEH), S_i > V.
+		 * So when we need to compute max(V, min(S_i)) forall i in
+		 * SCH+NEH, we only need to look into NEH.
+		 */
+		if (DN_KEY_GT(q->S, pipe->V)) {		/* Not eligible. */
+			if (pipe->scheduler_heap.elements == 0)
+				printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
+			heap_insert(&(pipe->not_eligible_heap), q->S, q);
+		} else {
+			heap_insert(&(pipe->scheduler_heap), q->F, q);
+			if (pipe->numbytes >= 0) {	 /* Pipe is idle. */
+				if (pipe->scheduler_heap.elements != 1)
+					printf("dummynet: OUCH! pipe should have been idle!\n");
+				DPRINTF(("dummynet: waking up pipe %d at %d\n",
+				    pipe->pipe_nr, (int)(q->F >> MY_M)));
+				pipe->sched_time = curr_time;
+				ready_event_wfq(pipe, &head, &tail);
+			}
+		}
+	}
+done:
+	/*
+	 * If the only thing released for transmission is the packet we were
+	 * just given, and it is not a bridge/Ethernet direction, hand it
+	 * back to the caller through *m0 instead of sending it ourselves.
+	 */
+	if (head == m && dir != DN_TO_IFB_FWD && dir != DN_TO_ETH_DEMUX &&
+	    dir != DN_TO_ETH_OUT) {	/* Fast io. */
+		io_pkt_fast++;
+		if (m->m_nextpkt != NULL)
+			printf("dummynet: fast io: pkt chain detected!\n");
+		head = m->m_nextpkt = NULL;
+	} else
+		*m0 = NULL;		/* Normal io. */
+
+	DUMMYNET_UNLOCK();
+	/* Packets released by the scheduler are sent without the lock. */
+	if (head != NULL)
+		dummynet_send(head);
+	return (0);
+
+dropit:
+	io_pkt_drop++;
+	if (q)
+		q->drops++;
+	DUMMYNET_UNLOCK();
+	m_freem(m);
+	*m0 = NULL;
+	/* DN_NOERROR flow sets hide the drop from the caller. */
+	return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+}
+
+/*
+ * Release a packet held by dummynet. As written here the macro only
+ * calls m_freem() on the mbuf chain.
+ *
+ * Historical note kept from the original comment: "the rt_unref is only
+ * needed when (pkt->dn_dir == DN_TO_IP_OUT). Doing this would probably
+ * save us the initial bzero of dn_pkt". No rt_unref call appears in the
+ * macro body below.
+ */
+#define	DN_FREE_PKT(_m) do {				\
+	m_freem(_m);					\
+} while (0)
+
+/*
+ * Dispose of all packets and flow queues on a flow_set.
+ * If all != 0, also release the RED lookup table and the slot array,
+ * and free the descriptor itself unless it is the flow_set embedded in
+ * its own pipe.
+ * For the one in dn_pipe the caller MUST also clean up ready_heap.
+ * Must be called with the dummynet lock held.
+ */
+static void
+purge_flow_set(struct dn_flow_set *fs, int all)
+{
+	struct dn_flow_queue *cur, *following;
+	struct mbuf *pkt, *pkt_next;
+	int slot;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	/* Slots 0..rq_size inclusive: the last one is the overflow slot. */
+	for (slot = 0; slot <= fs->rq_size; slot++) {
+		cur = fs->rq[slot];
+		fs->rq[slot] = NULL;
+		while (cur != NULL) {
+			following = cur->next;
+			/* Free every packet still waiting on this queue. */
+			for (pkt = cur->head; pkt != NULL; pkt = pkt_next) {
+				pkt_next = pkt->m_nextpkt;
+				DN_FREE_PKT(pkt);
+			}
+			free(cur, M_DUMMYNET);
+			cur = following;
+		}
+	}
+	fs->rq_elements = 0;
+
+	if (!all)
+		return;
+
+	/* RED - free lookup table. */
+	if (fs->w_q_lookup != NULL)
+		free(fs->w_q_lookup, M_DUMMYNET);
+	if (fs->rq != NULL)
+		free(fs->rq, M_DUMMYNET);
+	/* If this fs is not part of a pipe, free it. */
+	if (fs->pipe == NULL || fs != &(fs->pipe->fs))
+		free(fs, M_DUMMYNET);
+}
+
+/*
+ * Dispose of everything queued on a pipe (not a flow_set) that is about
+ * to be deleted: its embedded flow_set, the packets on the pipe's own
+ * list and its three per-pipe heaps.  The pipe descriptor itself is
+ * not freed here.
+ */
+static void
+purge_pipe(struct dn_pipe *pipe)
+{
+	struct mbuf *pkt, *pkt_next;
+
+	purge_flow_set(&(pipe->fs), 1);
+
+	for (pkt = pipe->head; pkt != NULL; pkt = pkt_next) {
+		pkt_next = pkt->m_nextpkt;
+		DN_FREE_PKT(pkt);
+	}
+
+	heap_free(&(pipe->scheduler_heap));
+	heap_free(&(pipe->not_eligible_heap));
+	heap_free(&(pipe->idle_heap));
+}
+
+/*
+ * Delete all flow sets, pipes and heaps, returning their memory.
+ * The original comment adds that references from ipfw rules to the
+ * pipes must also be removed; that is not done in this function.
+ */
+static void
+dummynet_flush(void)
+{
+	struct dn_flow_set *fs;
+	struct dn_pipe *pipe;
+	int bucket;
+
+	DUMMYNET_LOCK();
+	/* Free heaps so we don't have unwanted events. */
+	heap_free(&ready_heap);
+	heap_free(&wfq_ready_heap);
+	heap_free(&extract_heap);
+
+	/*
+	 * Now purge all queued pkts and delete all pipes, always taking
+	 * the element currently at the head of each bucket.
+	 *
+	 * XXXGL: can we merge the for(;;) cycles into one or not?
+	 */
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		while ((fs = SLIST_FIRST(&flowsethash[bucket])) != NULL) {
+			SLIST_REMOVE_HEAD(&flowsethash[bucket], next);
+			purge_flow_set(fs, 1);
+		}
+	}
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		while ((pipe = SLIST_FIRST(&pipehash[bucket])) != NULL) {
+			SLIST_REMOVE_HEAD(&pipehash[bucket], next);
+			purge_pipe(pipe);
+			free_pipe(pipe);
+		}
+	}
+	DUMMYNET_UNLOCK();
+}
+
+extern struct ip_fw *ip_fw_default_rule ;
+/*
+ * For every packet queued on flow set 'fs' whose tag points at rule 'r',
+ * redirect the tag to the default rule.
+ */
+static void
+dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
+{
+	struct dn_flow_queue *queue;
+	struct dn_pkt_tag *tag;
+	struct mbuf *pkt;
+	int slot;
+
+	/* Slots 0..rq_size inclusive: the last one is the overflow slot. */
+	for (slot = 0; slot <= fs->rq_size; slot++) {
+		for (queue = fs->rq[slot]; queue; queue = queue->next) {
+			for (pkt = queue->head; pkt; pkt = pkt->m_nextpkt) {
+				tag = dn_tag_get(pkt);
+				if (tag->rule != r)
+					continue;
+				tag->rule = ip_fw_default_rule;
+			}
+		}
+	}
+}
+
+/*
+ * When a firewall rule is deleted, scan all queues and replace the
+ * pointer to that rule in matching packets with the default rule.
+ * The pointer is used to reinject packets in case one_pass = 0.
+ */
+void
+dn_rule_delete(void *r)
+{
+	struct dn_flow_set *fs;
+	struct dn_pkt_tag *tag;
+	struct dn_pipe *pipe;
+	struct mbuf *pkt;
+	int bucket;
+
+	DUMMYNET_LOCK();
+	/*
+	 * If the rule references a queue (dn_flow_set), then scanning
+	 * the flow sets would be enough, otherwise the pipes. Doing
+	 * both does no harm.
+	 */
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		SLIST_FOREACH(fs, &flowsethash[bucket], next) {
+			dn_rule_delete_fs(fs, r);
+		}
+	}
+
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		SLIST_FOREACH(pipe, &pipehash[bucket], next) {
+			/* The flow set embedded in the pipe ... */
+			dn_rule_delete_fs(&(pipe->fs), r);
+			/* ... then the packets on the pipe's own list. */
+			for (pkt = pipe->head; pkt; pkt = pkt->m_nextpkt) {
+				tag = dn_tag_get(pkt);
+				if (tag->rule != r)
+					continue;
+				tag->rule = ip_fw_default_rule;
+			}
+		}
+	}
+	DUMMYNET_UNLOCK();
+}
+
+/*
+ * Setup RED parameters.
+ *
+ * p	flow set holding the requested (unscaled) RED parameters.
+ * x	flow set being configured; x->flags_fs must already be set.
+ *
+ * Returns 0 on success, EINVAL for unusable parameters and ENOSPC when
+ * the lookup table cannot be allocated.
+ *
+ * On failure 'x' is NOT freed: it may be the flow set embedded in a
+ * pipe or one that is already linked in the hash table, and callers
+ * keep using it.  Instead the RED flags are cleared and the lookup
+ * table is left empty, so the flow set falls back to plain tail-drop
+ * and red_drops() is never run against half-configured state.
+ */
+static int
+config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+{
+	int error, i;
+
+	/*
+	 * If the lookup table already exists, free it; it is created
+	 * again below.  Doing this first leaves every exit path with a
+	 * consistent (empty) table.
+	 */
+	if (x->w_q_lookup != NULL) {
+		free(x->w_q_lookup, M_DUMMYNET);
+		x->w_q_lookup = NULL;
+	}
+	x->lookup_depth = 0;
+
+	/* The thresholds are used as divisors below: reject bad ones. */
+	if (p->max_th <= p->min_th ||
+	    ((x->flags_fs & DN_IS_GENTLE_RED) && p->max_th == 0)) {
+		printf("dummynet: RED needs min_th < max_th "
+		    "and a non-zero max_th\n");
+		error = EINVAL;
+		goto fail;
+	}
+	if (red_lookup_depth == 0) {
+		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth "
+		    "must be > 0\n");
+		error = EINVAL;
+		goto fail;
+	}
+	x->w_q_lookup = (u_int *)malloc(red_lookup_depth * sizeof(int),
+	    M_DUMMYNET, M_NOWAIT);
+	if (x->w_q_lookup == NULL) {
+		printf("dummynet: sorry, cannot allocate red lookup table\n");
+		error = ENOSPC;
+		goto fail;
+	}
+	x->lookup_depth = red_lookup_depth;
+
+	x->w_q = p->w_q;
+	x->min_th = SCALE(p->min_th);
+	x->max_th = SCALE(p->max_th);
+	x->max_p = p->max_p;
+
+	/* Coefficients of the linear drop function p_b = c_1 * avg - c_2. */
+	x->c_1 = p->max_p / (p->max_th - p->min_th);
+	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
+
+	if (x->flags_fs & DN_IS_GENTLE_RED) {
+		x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
+		x->c_4 = SCALE(1) - 2 * p->max_p;
+	}
+
+	/*
+	 * Fill the lookup table with (1 - w_q)^x.
+	 * lookup_step is a divisor in red_drops(), so never store 0.
+	 */
+	x->lookup_step = (p->lookup_step != 0) ? p->lookup_step : 1;
+	x->lookup_weight = p->lookup_weight;
+	x->w_q_lookup[0] = SCALE(1) - x->w_q;
+
+	for (i = 1; i < x->lookup_depth; i++)
+		x->w_q_lookup[i] =
+		    SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+
+	if (red_avg_pkt_size < 1)
+		red_avg_pkt_size = 512;
+	x->avg_pkt_size = red_avg_pkt_size;
+	if (red_max_pkt_size < 1)
+		red_max_pkt_size = 1500;
+	x->max_pkt_size = red_max_pkt_size;
+	return (0);
+
+fail:
+	/* Disable RED on this flow set rather than freeing it. */
+	x->flags_fs &= ~(DN_IS_RED | DN_IS_GENTLE_RED);
+	return (error);
+}
+
+/*
+ * Allocate the array of queue slots for flow set x.
+ * With a flow mask the slot count is taken from pfs->rq_size (or
+ * dn_hash_size when that is 0) and clamped to [4, DN_MAX_HASH_SIZE];
+ * without a mask a single slot is enough.  One extra slot is always
+ * allocated on top of rq_size.
+ * Returns 0 on success, ENOMEM if the allocation fails.
+ */
+static int
+alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
+{
+	int slots;
+
+	if (x->flags_fs & DN_HAVE_FLOW_MASK) {
+		/* Allocate some slots. */
+		slots = pfs->rq_size;
+		if (slots == 0)
+			slots = dn_hash_size;
+		if (slots < 4)
+			slots = 4;
+		else if (slots > DN_MAX_HASH_SIZE)
+			slots = DN_MAX_HASH_SIZE;
+	} else {
+		/* One is enough for null mask. */
+		slots = 1;
+	}
+	x->rq_size = slots;
+
+	x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
+	    M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (x->rq == NULL) {
+		printf("dummynet: sorry, cannot allocate queue\n");
+		return (ENOMEM);
+	}
+	x->rq_elements = 0;
+	return 0;
+}
+
+/*
+ * Copy the configurable flow set parameters from src into x, clamping
+ * the queue size, and set up RED when the DN_IS_RED flag is present.
+ */
+static void
+set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
+{
+	x->flags_fs = src->flags_fs;
+	x->qsize = src->qsize;
+	x->plr = src->plr;
+	x->flow_mask = src->flow_mask;
+
+	if (x->flags_fs & DN_QSIZE_IS_BYTES) {
+		/* Byte-sized queue: fall back to 1 MB when over the limit. */
+		if (x->qsize > pipe_byte_limit)
+			x->qsize = 1024 * 1024;
+	} else if (x->qsize == 0 || x->qsize > pipe_slot_limit) {
+		/* Slot-sized queue: 50 when unset or over the limit. */
+		x->qsize = 50;
+	}
+
+	/* Configuring RED. */
+	if (x->flags_fs & DN_IS_RED)
+		config_red(src, x);	/* XXX should check errors */
+}
+
+/*
+ * Setup pipe or queue parameters.
+ *
+ * 'p' carries the request: exactly one of p->pipe_nr (configure a pipe)
+ * and p->fs.fs_nr (configure a queue, i.e. a flow set) must be non-zero.
+ * An existing object is updated in place, otherwise a new one is
+ * allocated and linked into the corresponding hash table.
+ *
+ * Returns 0 on success, EINVAL for inconsistent requests, ENOMEM when
+ * an allocation fails (or whatever alloc_hash() reports).
+ *
+ * When the delay-profile sample array cannot be allocated, samples_no
+ * is reset together with the samples pointer so a later reconfiguration
+ * never pairs a stale count with a NULL array, and a pipe that was not
+ * yet linked into the hash table is released instead of being leaked.
+ * The profile name is always NUL-terminated.
+ */
+static int
+config_pipe(struct dn_pipe *p)
+{
+	struct dn_flow_set *pfs = &(p->fs);
+	struct dn_flow_queue *q;
+	int i, error;
+
+	/*
+	 * The config program passes parameters as follows:
+	 * bw = bits/second (0 means no limits),
+	 * delay = ms, must be translated into ticks.
+	 * qsize = slots/bytes
+	 */
+	p->delay = (p->delay * hz) / 1000;
+	/* We need either a pipe number or a flow_set number. */
+	if (p->pipe_nr == 0 && pfs->fs_nr == 0)
+		return (EINVAL);
+	if (p->pipe_nr != 0 && pfs->fs_nr != 0)
+		return (EINVAL);
+	if (p->pipe_nr != 0) {			/* this is a pipe */
+		struct dn_pipe *pipe;
+		int is_new;
+
+		DUMMYNET_LOCK();
+		pipe = locate_pipe(p->pipe_nr);	/* locate pipe */
+
+		if (pipe == NULL) {		/* new pipe */
+			pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
+			    M_NOWAIT | M_ZERO);
+			if (pipe == NULL) {
+				DUMMYNET_UNLOCK();
+				printf("dummynet: no memory for new pipe\n");
+				return (ENOMEM);
+			}
+			pipe->pipe_nr = p->pipe_nr;
+			pipe->fs.pipe = pipe;
+			/*
+			 * idle_heap is the only one from which
+			 * we extract from the middle.
+			 */
+			pipe->idle_heap.size = pipe->idle_heap.elements = 0;
+			pipe->idle_heap.offset =
+			    offsetof(struct dn_flow_queue, heap_pos);
+		} else
+			/* Flush accumulated credit for all queues. */
+			for (i = 0; i <= pipe->fs.rq_size; i++)
+				for (q = pipe->fs.rq[i]; q; q = q->next)
+					q->numbytes = io_fast ? p->bandwidth : 0;
+
+		pipe->bandwidth = p->bandwidth;
+		pipe->numbytes = 0;		/* just in case... */
+		bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
+		pipe->ifp = NULL;		/* reset interface ptr */
+		pipe->delay = p->delay;
+		set_fs_parms(&(pipe->fs), pfs);
+
+		/* Handle changes in the delay profile. */
+		if (p->samples_no > 0) {
+			if (pipe->samples_no != p->samples_no) {
+				if (pipe->samples != NULL)
+					free(pipe->samples, M_DUMMYNET);
+				pipe->samples =
+				    malloc(p->samples_no*sizeof(dn_key),
+					M_DUMMYNET, M_NOWAIT | M_ZERO);
+				if (pipe->samples == NULL) {
+					/*
+					 * Keep count and pointer in sync,
+					 * and do not leak a pipe that has
+					 * no slot array yet (i.e. was
+					 * never linked into pipehash).
+					 */
+					pipe->samples_no = 0;
+					is_new = (pipe->fs.rq == NULL);
+					DUMMYNET_UNLOCK();
+					printf("dummynet: no memory "
+						"for new samples\n");
+					if (is_new)
+						free_pipe(pipe);
+					return (ENOMEM);
+				}
+				pipe->samples_no = p->samples_no;
+			}
+
+			strncpy(pipe->name,p->name,sizeof(pipe->name));
+			/* strncpy() does not terminate on truncation. */
+			pipe->name[sizeof(pipe->name) - 1] = '\0';
+			pipe->loss_level = p->loss_level;
+			for (i = 0; i<pipe->samples_no; ++i)
+				pipe->samples[i] = p->samples[i];
+		} else if (pipe->samples != NULL) {
+			free(pipe->samples, M_DUMMYNET);
+			pipe->samples = NULL;
+			pipe->samples_no = 0;
+		}
+
+		if (pipe->fs.rq == NULL) {	/* a new pipe */
+			error = alloc_hash(&(pipe->fs), pfs);
+			if (error) {
+				DUMMYNET_UNLOCK();
+				free_pipe(pipe);
+				return (error);
+			}
+			SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
+			    pipe, next);
+		}
+		DUMMYNET_UNLOCK();
+	} else {				/* config queue */
+		struct dn_flow_set *fs;
+
+		DUMMYNET_LOCK();
+		fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+
+		if (fs == NULL) {		/* new */
+			if (pfs->parent_nr == 0) { /* need link to a pipe */
+				DUMMYNET_UNLOCK();
+				return (EINVAL);
+			}
+			fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
+			    M_NOWAIT | M_ZERO);
+			if (fs == NULL) {
+				DUMMYNET_UNLOCK();
+				printf(
+				    "dummynet: no memory for new flow_set\n");
+				return (ENOMEM);
+			}
+			fs->fs_nr = pfs->fs_nr;
+			fs->parent_nr = pfs->parent_nr;
+			/* The weight is clamped to the range [1, 100]. */
+			fs->weight = pfs->weight;
+			if (fs->weight == 0)
+				fs->weight = 1;
+			else if (fs->weight > 100)
+				fs->weight = 100;
+		} else {
+			/*
+			 * Change parent pipe not allowed;
+			 * must delete and recreate.
+			 */
+			if (pfs->parent_nr != 0 &&
+			    fs->parent_nr != pfs->parent_nr) {
+				DUMMYNET_UNLOCK();
+				return (EINVAL);
+			}
+		}
+
+		set_fs_parms(fs, pfs);
+
+		if (fs->rq == NULL) {		/* a new flow_set */
+			error = alloc_hash(fs, pfs);
+			if (error) {
+				DUMMYNET_UNLOCK();
+				free(fs, M_DUMMYNET);
+				return (error);
+			}
+			SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
+			    fs, next);
+		}
+		DUMMYNET_UNLOCK();
+	}
+	return (0);
+}
+
+/*
+ * Helper function to remove from a heap all queues which are linked to
+ * a flow_set about to be deleted.  Each matching entry is overwritten
+ * with the last heap entry; the heap is re-ordered once at the end if
+ * anything was removed.
+ */
+static void
+fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+{
+	int idx = 0;
+	int removed = 0;
+
+	while (idx < h->elements) {
+		if (((struct dn_flow_queue *)h->p[idx].object)->fs != fs) {
+			idx++;
+			continue;
+		}
+		/* Re-examine the same index: it now holds a moved entry. */
+		h->elements--;
+		h->p[idx] = h->p[h->elements];
+		removed++;
+	}
+	if (removed != 0)
+		heapify(h);
+}
+
+/*
+ * Helper function to remove a pipe from a heap (it can be there at most
+ * once): the first matching entry is replaced by the last heap entry
+ * and the heap is re-ordered.
+ */
+static void
+pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+{
+	int idx;
+
+	/* An empty heap simply never enters the loop. */
+	for (idx = 0; idx < h->elements; idx++) {
+		if (h->p[idx].object != p)
+			continue;
+		/* Found it. */
+		h->elements--;
+		h->p[idx] = h->p[h->elements];
+		heapify(h);
+		return;
+	}
+}
+
+/*
+ * Drain all queues. Called in case of severe mbuf shortage.
+ * The global heaps are emptied, every flow set (standalone or embedded
+ * in a pipe) loses its queues and packets, and the packets on each
+ * pipe's own list are freed.  The descriptors themselves are kept.
+ * Must be called with the dummynet lock held.
+ */
+void
+dummynet_drain(void)
+{
+	struct dn_flow_set *fs;
+	struct dn_pipe *pipe;
+	struct mbuf *pkt, *pkt_next;
+	int bucket;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	heap_free(&ready_heap);
+	heap_free(&wfq_ready_heap);
+	heap_free(&extract_heap);
+
+	/* Empty the standalone flow sets. */
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		SLIST_FOREACH(fs, &flowsethash[bucket], next) {
+			purge_flow_set(fs, 0);
+		}
+	}
+
+	/* Empty each pipe's embedded flow set and its own packet list. */
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		SLIST_FOREACH(pipe, &pipehash[bucket], next) {
+			purge_flow_set(&(pipe->fs), 0);
+
+			for (pkt = pipe->head; pkt != NULL; pkt = pkt_next) {
+				pkt_next = pkt->m_nextpkt;
+				DN_FREE_PKT(pkt);
+			}
+			pipe->head = pipe->tail = NULL;
+		}
+	}
+}
+
+/*
+ * Fully delete a pipe or a queue, cleaning up associated info.
+ * Exactly one of p->pipe_nr and p->fs.fs_nr must be non-zero.
+ * Returns 0 on success, EINVAL for an inconsistent request and ENOENT
+ * when the pipe or flow set does not exist.
+ */
+static int
+delete_pipe(struct dn_pipe *p)
+{
+
+	/* Reject "neither" and "both". */
+	if ((p->pipe_nr == 0) == (p->fs.fs_nr == 0))
+		return EINVAL;
+
+	if (p->pipe_nr != 0) {	/* this is an old-style pipe */
+		struct dn_flow_set *fs;
+		struct dn_pipe *pipe;
+		int bucket;
+
+		DUMMYNET_LOCK();
+		pipe = locate_pipe(p->pipe_nr);
+		if (pipe == NULL) {
+			DUMMYNET_UNLOCK();
+			return (ENOENT);	/* not found */
+		}
+
+		/* Unlink from list of pipes. */
+		SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe,
+		    next);
+
+		/* Remove all references to this pipe from flow_sets. */
+		for (bucket = 0; bucket < HASHSIZE; bucket++) {
+			SLIST_FOREACH(fs, &flowsethash[bucket], next) {
+				if (fs->pipe != pipe)
+					continue;
+				printf("dummynet: ++ ref to pipe %d from fs %d\n",
+				    p->pipe_nr, fs->fs_nr);
+				fs->pipe = NULL;
+				purge_flow_set(fs, 0);
+			}
+		}
+		fs_remove_from_heap(&ready_heap, &(pipe->fs));
+		/* Remove all data associated to this pipe. */
+		purge_pipe(pipe);
+		/* Drop references from extract_heap and wfq_ready_heap. */
+		pipe_remove_from_heap(&extract_heap, pipe);
+		pipe_remove_from_heap(&wfq_ready_heap, pipe);
+		DUMMYNET_UNLOCK();
+
+		free_pipe(pipe);
+	} else {		/* this is a WF2Q queue (dn_flow_set) */
+		struct dn_flow_set *fs;
+
+		DUMMYNET_LOCK();
+		fs = locate_flowset(p->fs.fs_nr);
+		if (fs == NULL) {
+			DUMMYNET_UNLOCK();
+			return (ENOENT);	/* not found */
+		}
+
+		/* Unlink from list of flowsets. */
+		SLIST_REMOVE(&flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set,
+		    next);
+
+		if (fs->pipe != NULL) {
+			/*
+			 * Update total weight on parent pipe and cleanup
+			 * parent heaps.
+			 */
+			fs->pipe->sum -= fs->weight * fs->backlogged;
+			fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
+			fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
+#if 1	/* XXX should i remove from idle_heap as well ? */
+			fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
+#endif
+		}
+		purge_flow_set(fs, 1);
+		DUMMYNET_UNLOCK();
+	}
+	return 0;
+}
+
+/*
+ * Helper function used to copy data from kernel in DUMMYNET_GET.
+ * Copies every queue descriptor of 'set' to consecutive positions
+ * starting at bp, clears the pointers in each copy, and reports
+ * inconsistencies (wrong slot, wrong owner, wrong count) via printf.
+ * Returns the position just past the last copied descriptor.
+ * Must be called with the dummynet lock held.
+ */
+static char *
+dn_copy_set(struct dn_flow_set *set, char *bp)
+{
+	struct dn_flow_queue *src;
+	struct dn_flow_queue *dst = (struct dn_flow_queue *)bp;
+	int slot;
+	int copied = 0;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	for (slot = 0; slot <= set->rq_size; slot++) {
+		src = set->rq[slot];
+		while (src != NULL) {
+			if (src->hash_slot != slot)
+				printf("dummynet: ++ at %d: wrong slot (have %d, "
+				    "should be %d)\n", copied, src->hash_slot, slot);
+			if (src->fs != set)
+				printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
+					slot, src->fs, set);
+			copied++;
+			bcopy(src, dst, sizeof(*src));
+			/* Cleanup pointers in the exported copy. */
+			dst->next = NULL;
+			dst->head = dst->tail = NULL;
+			dst->fs = NULL;
+
+			src = src->next;
+			dst++;
+		}
+	}
+	if (copied != set->rq_elements)
+		printf("dummynet: ++ wrong count, have %d should be %d\n",
+		    copied, set->rq_elements);
+	return (char *)dst;
+}
+
+/*
+ * Compute the number of bytes needed to export all pipes and flow sets:
+ * one descriptor per pipe / flow set plus one dn_flow_queue for each
+ * queue it currently owns.
+ * Must be called with the dummynet lock held.
+ */
+static size_t
+dn_calc_size(void)
+{
+	struct dn_flow_set *fs;
+	struct dn_pipe *pipe;
+	size_t total = 0;
+	int bucket;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	for (bucket = 0; bucket < HASHSIZE; bucket++) {
+		SLIST_FOREACH(pipe, &pipehash[bucket], next) {
+			total += sizeof(*pipe) +
+			    pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
+		}
+		SLIST_FOREACH(fs, &flowsethash[bucket], next) {
+			total += sizeof (*fs) +
+			    fs->rq_elements * sizeof(struct dn_flow_queue);
+		}
+	}
+	return total;
+}
+
+/*
+ * Export all pipes and flow sets, each followed by its queue
+ * descriptors, to the caller of the socket option.
+ * Returns ENOBUFS when no stable buffer size could be obtained,
+ * otherwise the result of sooptcopyout().
+ */
+static int
+dummynet_get(struct sockopt *sopt)
+{
+	char *buf, *bp;		/* bp is the "copy-pointer" */
+	size_t size;
+	struct dn_flow_set *fs, *fs_bp;
+	struct dn_pipe *pipe, *pipe_bp;
+	int error = 0, i;
+
+	/* XXX lock held too long */
+	DUMMYNET_LOCK();
+	/*
+	 * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and
+	 *      we cannot use this flag while holding a mutex.  So size the
+	 *      buffer, allocate it unlocked, and accept it only if the
+	 *      required size is unchanged once the lock is held again;
+	 *      give up after 10 attempts.
+	 */
+	for (i = 0; i < 10; i++) {
+		size = dn_calc_size();
+		DUMMYNET_UNLOCK();
+		buf = malloc(size, M_TEMP, M_WAITOK);
+		DUMMYNET_LOCK();
+		if (dn_calc_size() == size)
+			break;
+		free(buf, M_TEMP);
+		buf = NULL;
+	}
+	if (buf == NULL) {
+		DUMMYNET_UNLOCK();
+		return ENOBUFS;
+	}
+
+	bp = buf;
+	for (i = 0; i < HASHSIZE; i++) {
+		SLIST_FOREACH(pipe, &pipehash[i], next) {
+			pipe_bp = (struct dn_pipe *)bp;
+
+			/*
+			 * Copy pipe descriptor into *bp, convert delay back
+			 * to ms, then copy the queue descriptors owned by
+			 * its embedded flow_set right after it.
+			 */
+			bcopy(pipe, bp, sizeof(*pipe));
+			pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
+			/*
+			 * XXX the following is a hack based on ->next being
+			 * the first field in dn_pipe and dn_flow_set. The
+			 * correct solution would be to move the dn_flow_set
+			 * to the beginning of struct dn_pipe.
+			 */
+			pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
+			/* Clean pointers. */
+			pipe_bp->head = pipe_bp->tail = NULL;
+			pipe_bp->fs.next.sle_next = NULL;
+			pipe_bp->fs.pipe = NULL;
+			pipe_bp->fs.rq = NULL;
+			pipe_bp->samples = NULL;
+
+			bp = dn_copy_set(&(pipe->fs), bp + sizeof(*pipe));
+		}
+	}
+
+	for (i = 0; i < HASHSIZE; i++) {
+		SLIST_FOREACH(fs, &flowsethash[i], next) {
+			fs_bp = (struct dn_flow_set *)bp;
+
+			bcopy(fs, bp, sizeof(*fs));
+			/* XXX same hack as above */
+			fs_bp->next.sle_next =
+			    (struct dn_flow_set *)DN_IS_QUEUE;
+			fs_bp->pipe = NULL;
+			fs_bp->rq = NULL;
+
+			bp = dn_copy_set(fs, bp + sizeof(*fs));
+		}
+	}
+
+	DUMMYNET_UNLOCK();
+
+	error = sooptcopyout(sopt, buf, size);
+	free(buf, M_TEMP);
+	return error;
+}
+
+/*
+ * Handler for the various dummynet socket options (get, flush, config, del).
+ *
+ * The caller must hold PRIV_NETINET_DUMMYNET; in addition, SOPT_SET
+ * requests are refused when the securelevel is 3 or higher.
+ * Returns 0 on success or an errno value: the privilege/securelevel
+ * error, EINVAL for an unknown option, or whatever the per-option
+ * helper (copyin, config_pipe, delete_pipe, dummynet_get) reports.
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+    int error = 0 ;
+    struct dn_pipe *p;
+    struct dn_pipe_max tmp_pipe;	/* pipe + large buffer */
+
+    error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+    if (error)
+	return (error);
+
+    /* Disallow sets in really-really secure mode. */
+    if (sopt->sopt_dir == SOPT_SET) {
+#if __FreeBSD_version >= 500034
+	error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+	if (error)
+	    return (error);
+#else
+	if (securelevel >= 3)
+	    return (EPERM);
+#endif
+    }
+
+    switch (sopt->sopt_name) {
+    default :
+	/* Terminate the message so later console output starts on a new line. */
+	printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
+	return EINVAL ;
+
+    case IP_DUMMYNET_GET :
+	error = dummynet_get(sopt);
+	break ;
+
+    case IP_DUMMYNET_FLUSH :
+	dummynet_flush() ;
+	break ;
+
+    case IP_DUMMYNET_CONFIGURE :
+	/*
+	 * Accept anything from a bare struct dn_pipe up to the full
+	 * dn_pipe_max; when samples are announced, point the pipe at
+	 * the sample area of the local copy.
+	 */
+	p = (struct dn_pipe *)&tmp_pipe ;
+	error = sooptcopyin(sopt, p, sizeof(tmp_pipe), sizeof *p);
+	if (error)
+	    break ;
+	if (p->samples_no > 0)
+	    p->samples = &tmp_pipe.samples[0];
+
+	error = config_pipe(p);
+	break ;
+
+    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
+	/* Only the struct dn_pipe part of tmp_pipe is filled in here. */
+	p = (struct dn_pipe *)&tmp_pipe ;
+	error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
+	if (error)
+	    break ;
+
+	error = delete_pipe(p);
+	break ;
+    }
+    return error ;
+}
+
+/*
+ * One-time setup performed on MOD_LOAD: create the lock, empty the
+ * pipe/flow_set hash tables and the three heaps, publish the dummynet
+ * entry points through the ip_dn_*_ptr hooks, and start the task queue
+ * and the periodic callout.
+ */
+static void
+ip_dn_init(void)
+{
+	int i;
+
+	if (bootverbose)
+		printf("DUMMYNET with IPv6 initialized (040826)\n");
+
+	DUMMYNET_LOCK_INIT();
+
+	for (i = 0; i < HASHSIZE; i++) {
+		SLIST_INIT(&pipehash[i]);
+		SLIST_INIT(&flowsethash[i]);
+	}
+	/* All heaps start empty. */
+	ready_heap.size = ready_heap.elements = 0;
+	ready_heap.offset = 0;
+
+	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
+	wfq_ready_heap.offset = 0;
+
+	extract_heap.size = extract_heap.elements = 0;
+	extract_heap.offset = 0;
+
+	/* Hook dummynet into its callers (socket option, I/O, rule deletion). */
+	ip_dn_ctl_ptr = ip_dn_ctl;
+	ip_dn_io_ptr = dummynet_io;
+	ip_dn_ruledel_ptr = dn_rule_delete;
+
+	/*
+	 * NOTE(review): the task queue is created with M_NOWAIT and the
+	 * result is used without a NULL check -- confirm this cannot fail
+	 * here.
+	 */
+	TASK_INIT(&dn_task, 0, dummynet_task, NULL);
+	dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT,
+	    taskqueue_thread_enqueue, &dn_tq);
+	taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+	/* Arm the callout so that dummynet() runs after one tick. */
+	callout_init(&dn_timeout, CALLOUT_MPSAFE);
+	callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+	/* Initialize curr_time adjustment mechanics. */
+	getmicrouptime(&prev_t);
+}
+
+#ifdef KLD_MODULE
+/*
+ * Undo ip_dn_init() on MOD_UNLOAD (only built for the loadable module).
+ * The hooks are cleared first, then the callout is stopped, the task
+ * queue is drained and freed, the remaining configuration is flushed
+ * and finally the lock is destroyed.
+ */
+static void
+ip_dn_destroy(void)
+{
+	/* Unpublish the entry points before tearing anything down. */
+	ip_dn_ctl_ptr = NULL;
+	ip_dn_io_ptr = NULL;
+	ip_dn_ruledel_ptr = NULL;
+
+	DUMMYNET_LOCK();
+	callout_stop(&dn_timeout);
+	DUMMYNET_UNLOCK();
+	/* Wait for a pending dn_task, then release the queue itself. */
+	taskqueue_drain(dn_tq, &dn_task);
+	taskqueue_free(dn_tq);
+
+	dummynet_flush();
+
+	DUMMYNET_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+/*
+ * Module event handler.
+ *
+ * MOD_LOAD initialises dummynet unless it is already hooked in
+ * (EEXIST).  MOD_UNLOAD tears it down when built as a KLD and is
+ * refused with EINVAL when compiled statically.  Any other event
+ * yields EOPNOTSUPP.  Returns 0 on success.
+ */
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+	int rc = 0;
+
+	if (type == MOD_LOAD) {
+		if (ip_dn_io_ptr != NULL) {
+			printf("DUMMYNET already loaded\n");
+			rc = EEXIST;
+		} else {
+			ip_dn_init();
+		}
+	} else if (type == MOD_UNLOAD) {
+#if defined(KLD_MODULE)
+		ip_dn_destroy();
+#else
+		printf("dummynet statically compiled, cannot unload\n");
+		rc = EINVAL;
+#endif
+	} else {
+		rc = EOPNOTSUPP;
+	}
+
+	return (rc);
+}
+
+/*
+ * Module glue: register "dummynet" with dummynet_modevent() as its
+ * event handler, declare the dependency on the ipfw module and export
+ * module version 1.
+ */
+static moduledata_t dummynet_mod = {
+	"dummynet",
+	dummynet_modevent,
+	NULL
+};
+DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 1);
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
new file mode 100644
index 000000000000..760150a4ddb9
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -0,0 +1,4747 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define        DEB(x)
+#define        DDB(x) x
+
+/*
+ * Implement IP packet firewall (new version)
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <sys/vimage.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#define	IPFW_INTERNAL	/* Access to protected data structures in ip_fw.h. */
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_divert.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+#include <netinet/vinet.h>
+
+#include <netgraph/ng_ipfw.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/scope6_var.h>
+#endif
+
+#include <machine/in_cksum.h>	/* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+#ifndef VIMAGE
+#ifndef VIMAGE_GLOBALS
+struct vnet_ipfw vnet_ipfw_0;
+#endif
+#endif
+
+/*
+ * set_disable contains one bit per set value (0..31).
+ * If the bit is set, all rules with the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted explicitly.
+ */
+#ifdef VIMAGE_GLOBALS
+static u_int32_t set_disable;
+static int fw_verbose;
+static struct callout ipfw_timeout;
+static int verbose_limit;
+#endif
+
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+static uma_zone_t ipfw_dyn_rule_zone;
+
+/*
+ * Data structure to cache our ucred related
+ * information. This structure only gets used if
+ * the user specified UID/GID based constraints in
+ * a firewall rule.
+ */
+struct ip_fw_ugid {
+	gid_t		fw_groups[NGROUPS];	/* cached group ids */
+	int		fw_ngroups;	/* presumably # of valid fw_groups[] entries -- confirm */
+	uid_t		fw_uid;		/* cached user id */
+	int		fw_prid;	/* presumably the jail (prison) id -- confirm */
+};
+
+/*
+ * list of rules for layer 3
+ */
+#ifdef VIMAGE_GLOBALS
+struct ip_fw_chain layer3_chain;
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+struct table_entry {
+	struct radix_node	rn[2];	/* radix tree linkage */
+	struct sockaddr_in	addr, mask;	/* presumably the lookup key and its netmask -- confirm */
+	u_int32_t		value;	/* 32-bit value stored with the entry */
+};
+
+#ifdef VIMAGE_GLOBALS
+static int autoinc_step;
+#endif
+
+extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+#ifdef SYSCTL_NODE
+/*
+ * Sysctl knobs under net.inet.ip.fw.  default_to_accept is additionally
+ * registered as the tunable net.inet.ip.fw.default_to_accept.
+ */
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, enable,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw_enable, 0,
+    ipfw_chg_hook, "I", "Enable ipfw");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_RW, autoinc_step, 0, "Rule number auto-increment step");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_RW | CTLFLAG_SECURE3, fw_one_pass, 0,
+    "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_SECURE3,
+    fw_verbose, 0, "Log matches to ipfw rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_RW, verbose_limit, 0,
+    "Set upper limit of matches of ipfw rules logged");
+/* Read-only constants exported for userland. */
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+    NULL, IPFW_DEFAULT_RULE, "The default/max possible rule number.");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
+    NULL, IPFW_TABLES_MAX, "The maximum number of tables.");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+    &default_to_accept, 0, "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+#endif /* SYSCTL_NODE */
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ *  + stateful rules;
+ *  + enforcing limits on the number of sessions;
+ *  + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create anymore. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+#ifdef VIMAGE_GLOBALS
+static ipfw_dyn_rule **ipfw_dyn_v;
+static u_int32_t dyn_buckets;
+static u_int32_t curr_dyn_buckets;
+#endif
+
+/*
+ * Lock for the dynamic rules: an MTX_DEF mutex plus wrappers to
+ * initialise, destroy, acquire and release it, and to assert that the
+ * current thread owns it.
+ */
+static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
+#define	IPFW_DYN_LOCK_INIT() \
+	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
+#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+#ifdef VIMAGE_GLOBALS
+static u_int32_t dyn_ack_lifetime;
+static u_int32_t dyn_syn_lifetime;
+static u_int32_t dyn_fin_lifetime;
+static u_int32_t dyn_rst_lifetime;
+static u_int32_t dyn_udp_lifetime;
+static u_int32_t dyn_short_lifetime;
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static u_int32_t dyn_keepalive_interval;
+static u_int32_t dyn_keepalive_period;
+static u_int32_t dyn_keepalive;
+
+static u_int32_t static_count;	/* # of static rules */
+static u_int32_t static_len;	/* size in bytes of static rules */
+static u_int32_t dyn_count;	/* # of dynamic rules */
+static u_int32_t dyn_max;	/* max # of dynamic rules */
+#endif /* VIMAGE_GLOBALS */
+
+#ifdef SYSCTL_NODE
+/*
+ * Sysctl knobs for the dynamic rules: hash table sizing, rule counters
+ * (read-only) and the per-state lifetimes.
+ */
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_RW, dyn_buckets, 0, "Number of dyn. buckets");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_RD, curr_dyn_buckets, 0, "Current Number of dyn. buckets");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_RD, dyn_count, 0, "Number of dyn. rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_RW, dyn_max, 0, "Max number of dyn. rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_RD, static_count, 0, "Number of static rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_RW, dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_RW, dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_RW, dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_RW, dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_RW, dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_RW, dyn_short_lifetime, 0,
+    "Lifetime of dyn. rules for other situations");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. rules");
+#endif /* SYSCTL_NODE */
+
+#ifdef INET6
+/*
+ * IPv6 specific variables
+ */
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet6_ip6);
+#endif /* SYSCTL_NODE */
+
+static struct sysctl_ctx_list ip6_fw_sysctl_ctx;
+static struct sysctl_oid *ip6_fw_sysctl_tree;
+#endif /* INET6 */
+
+#ifdef VIMAGE_GLOBALS
+static int fw_deny_unknown_exthdrs;
+#endif
+
+/*
+ * L3HDR yields a pointer of type T to whatever follows the IPv4 header
+ * at 'ip': the pointer is advanced by ip_hl 32-bit words, which is why
+ * 'ip' is first cast to u_int32_t *.  'ip' is evaluated twice.
+ * The other macros just cast a pointer to the named header type.
+ */
+#define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define	TCP(p)		((struct tcphdr *)(p))
+#define	SCTP(p)		((struct sctphdr *)(p))
+#define	UDP(p)		((struct udphdr *)(p))
+#define	ICMP(p)		((struct icmphdr *)(p))
+#define	ICMP6(p)	((struct icmp6_hdr *)(p))
+
+/*
+ * Return 1 if the ICMP type of 'icmp' is selected in the bitmap held in
+ * cmd->d[0] (bit N set means "type N matches"), 0 otherwise.
+ *
+ * Only types 0..31 can be represented in the single 32-bit word, so any
+ * larger type is rejected up front; shifting by 32 or more would be
+ * undefined and could alias onto a low bit of the mask.  The shift is
+ * done on an unsigned value so that bit 31 is well defined too.
+ */
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+	int type = icmp->icmp_type;
+
+	if (type > ICMP_MAXTYPE || type >= 32)
+		return (0);
+	return ((cmd->d[0] & ((u_int32_t)1 << type)) != 0);
+}
+
+/* Bitmap of the ICMP types that are queries (bit N <=> type N). */
+#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+/*
+ * Return 1 if 'icmp' carries one of the query types collected in TT,
+ * 0 otherwise.  Types that do not fit in the 32-bit bitmap are never
+ * queries; they are rejected before shifting, because a shift count of
+ * 32 or more is undefined.
+ */
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+	int type = icmp->icmp_type;
+
+	if (type > ICMP_MAXTYPE || type >= 32)
+		return (0);
+	return (((u_int32_t)TT & ((u_int32_t)1 << type)) != 0);
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
+/*
+ * Check the flags found in a packet ('bits') against an instruction.
+ * The low byte of cmd->arg1 lists flags that must be present, the high
+ * byte lists flags that must be absent.  Returns 1 when both conditions
+ * hold, 0 otherwise.
+ */
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+	const u_int8_t absent = (u_int8_t)~bits;	/* flags not in the packet */
+	const u_char must_set = cmd->arg1 & 0xff;
+	const u_char must_clear = (cmd->arg1 >> 8) & 0xff;
+
+	/* A required flag that is absent fails the match. */
+	if ((must_set & absent) != 0)
+		return (0);
+	/* Every forbidden flag has to be among the absent ones. */
+	return ((must_clear & absent) == must_clear);
+}
+
+/*
+ * Walk the IPv4 options of 'ip', collect an IP_FW_IPOPT_* bit for each
+ * LSRR, SSRR, RR and TS option seen, and test the result against 'cmd'
+ * with flags_match().  Returns 0 right away if an option has a length
+ * that is not positive or that exceeds the remaining option space.
+ */
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+	u_char *optp = (u_char *)(ip + 1);
+	int remaining = (ip->ip_hl << 2) - sizeof (struct ip);
+	int found = 0;
+	int olen;
+
+	while (remaining > 0) {
+		int opt = optp[IPOPT_OPTVAL];
+
+		if (opt == IPOPT_EOL)
+			break;
+
+		if (opt == IPOPT_NOP) {
+			/* single-byte option, no length field */
+			olen = 1;
+		} else {
+			olen = optp[IPOPT_OLEN];
+			if (olen <= 0 || olen > remaining)
+				return (0);	/* invalid or truncated */
+		}
+
+		if (opt == IPOPT_LSRR)
+			found |= IP_FW_IPOPT_LSRR;
+		else if (opt == IPOPT_SSRR)
+			found |= IP_FW_IPOPT_SSRR;
+		else if (opt == IPOPT_RR)
+			found |= IP_FW_IPOPT_RR;
+		else if (opt == IPOPT_TS)
+			found |= IP_FW_IPOPT_TS;
+
+		remaining -= olen;
+		optp += olen;
+	}
+	return (flags_match(cmd, found));
+}
+
+/*
+ * Walk the TCP options that follow 'tcp', collect an IP_FW_TCPOPT_* bit
+ * for each MSS, window scale, SACK (permitted or block) and timestamp
+ * option seen, and test the result against 'cmd' with flags_match().
+ *
+ * Parsing stops at EOL or at the first malformed option: one whose
+ * length byte would lie outside the option area, or whose length is not
+ * positive or runs past the end of the options.  Such an option does
+ * not contribute a bit; options seen before it still count.
+ */
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+	int optlen, bits = 0;
+	u_char *cp = (u_char *)(tcp + 1);
+	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+	for (; x > 0; x -= optlen, cp += optlen) {
+		int opt = cp[0];
+		if (opt == TCPOPT_EOL)
+			break;
+		if (opt == TCPOPT_NOP)
+			optlen = 1;
+		else {
+			/* Do not read a length byte beyond the options. */
+			if (x < 2)
+				break;
+			optlen = cp[1];
+			if (optlen <= 0 || optlen > x)
+				break;	/* invalid or truncated */
+		}
+
+		switch (opt) {
+
+		default:
+			break;
+
+		case TCPOPT_MAXSEG:
+			bits |= IP_FW_TCPOPT_MSS;
+			break;
+
+		case TCPOPT_WINDOW:
+			bits |= IP_FW_TCPOPT_WINDOW;
+			break;
+
+		case TCPOPT_SACK_PERMITTED:
+		case TCPOPT_SACK:
+			bits |= IP_FW_TCPOPT_SACK;
+			break;
+
+		case TCPOPT_TIMESTAMP:
+			bits |= IP_FW_TCPOPT_TS;
+			break;
+
+		}
+	}
+	return (flags_match(cmd, bits));
+}
+
+/*
+ * Match interface 'ifp' against instruction 'cmd'.
+ * When cmd->name is non-empty the interface name is compared, as a
+ * fnmatch() pattern if cmd->p.glob is set and with strncmp() otherwise.
+ * When cmd->name is empty, the AF_INET addresses of the interface are
+ * searched for cmd->p.ip.  Returns 1 on match, 0 otherwise (including
+ * when ifp is NULL).
+ */
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
+{
+	struct ifaddr *ifa;
+	int matched = 0;
+
+	/* no iface with this packet, match fails */
+	if (ifp == NULL)
+		return (0);
+
+	if (cmd->name[0] != '\0') {
+		/* match by name, either as a pattern or literally */
+		if (cmd->p.glob)
+			return (fnmatch(cmd->name, ifp->if_xname, 0) == 0);
+		return (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0);
+	}
+
+	/* match by IPv4 address; the address list is walked under its lock */
+	IF_ADDR_LOCK(ifp);
+	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+		if (ifa->ifa_addr->sa_family == AF_INET &&
+		    ((struct sockaddr_in *)(ifa->ifa_addr))->sin_addr.s_addr ==
+		    cmd->p.ip.s_addr) {
+			matched = 1;
+			break;
+		}
+	}
+	IF_ADDR_UNLOCK(ifp);
+
+	return (matched);
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ * 
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.  The 'versrcreach'
+ * option just checks that the source address is reachable via any route
+ * (except default) in the routing table.  These two are a measure to block
+ * forged packets.  This is also commonly known as "anti-spoofing" or Unicast
+ * Reverse Path Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
+ * is purposely reminiscent of the Cisco IOS command,
+ *
+ *   ip verify unicast reverse-path
+ *   ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that syntax is
+ * misleading. The check may be performed on all IP packets whether unicast,
+ * multicast, or broadcast.
+ */
+/*
+ * Look up a route to 'src' in routing table 'fib' and decide whether
+ * the source is acceptable.
+ *
+ * With an interface given, the route must go through that interface.
+ * Without one, any route is fine except the default route and routes
+ * flagged RTF_REJECT or RTF_BLACKHOLE.  Returns 1 if acceptable, 0 if
+ * not or if no route exists.
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+	struct route ro;
+	struct sockaddr_in *sin;
+	int ok;
+
+	bzero(&ro, sizeof(ro));
+	sin = (struct sockaddr_in *)&(ro.ro_dst);
+	sin->sin_len = sizeof(*sin);
+	sin->sin_family = AF_INET;
+	sin->sin_addr = src;
+	in_rtalloc_ign(&ro, 0, fib);
+
+	if (ro.ro_rt == NULL)
+		return (0);
+
+	if (ifp != NULL) {
+		/*
+		 * Compare against rt->rt_ifa->ifa_ifp rather than
+		 * rt->rt_ifp so that packets injected back by if_simloop()
+		 * pass: with useloopback == 1 a routing entry (via lo0)
+		 * for our own address may exist, so we need to handle
+		 * routing asymmetry.
+		 */
+		ok = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+	} else {
+		/* reject the default route and blackhole/reject routes */
+		ok = (satosin(rt_key(ro.ro_rt))->sin_addr.s_addr !=
+		    INADDR_ANY) &&
+		    (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) == 0;
+	}
+
+	RTFREE(ro.ro_rt);
+	return (ok);
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+/*
+ * Return 1 if ICMPv6 'type' is selected in the bitmap spread over the
+ * 32-bit words of cmd->d[] (word type/32, bit type%32), 0 otherwise.
+ * Negative types and types above ICMP6_MAXTYPE never match; the bit is
+ * built from an unsigned value so that bit 31 is well defined.
+ */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+	if (type < 0 || type > ICMP6_MAXTYPE)
+		return (0);
+	return ((cmd->d[type / 32] & ((u_int32_t)1 << (type % 32))) != 0);
+}
+
+/*
+ * Return 1 if 'curr_flow' equals one of the values stored in cmd->d[],
+ * 0 otherwise.
+ * NOTE(review): the bound is inclusive, so entries 0 .. cmd->o.arg1
+ * (arg1 + 1 words) are examined -- confirm this matches the way the
+ * instruction is built.
+ */
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+	int idx = 0;
+
+	while (idx <= cmd->o.arg1) {
+		if (cmd->d[idx] == curr_flow)
+			return (1);
+		idx++;
+	}
+	return (0);
+}
+
+/*
+ * Support for IP6_*_ME opcodes: return 1 if 'ip6_addr' equals an IPv6
+ * address configured on any interface in V_ifnet, 0 otherwise.
+ */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+	INIT_VNET_NET(curvnet);
+	struct ifnet *ifp;
+	struct ifaddr *ifa;
+	struct in6_addr unscoped;
+	int found = 0;
+
+	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		IF_ADDR_LOCK(ifp);
+		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			if (ifa->ifa_addr->sa_family != AF_INET6)
+				continue;
+			/*
+			 * Compare against a copy with the scope cleared, so
+			 * the address stored in the interface is left alone.
+			 */
+			unscoped =
+			    ((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr;
+			in6_clearscope(&unscoped);
+			if (IN6_ARE_ADDR_EQUAL(ip6_addr, &unscoped)) {
+				found = 1;
+				break;
+			}
+		}
+		IF_ADDR_UNLOCK(ifp);
+		if (found)
+			break;
+	}
+	return (found);
+}
+
+/*
+ * IPv6 counterpart of the reverse-path check: look up a route to 'src'
+ * and decide whether the source is acceptable.
+ *
+ * With an interface given, the route must go through that interface.
+ * Without one, any route is fine except the default route and routes
+ * flagged RTF_REJECT or RTF_BLACKHOLE.  Returns 1 if acceptable, 0 if
+ * not or if no route exists.
+ */
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp)
+{
+	struct route_in6 ro;
+	struct sockaddr_in6 *sin6;
+	int ok;
+
+	bzero(&ro, sizeof(ro));
+	sin6 = (struct sockaddr_in6 * )&(ro.ro_dst);
+	sin6->sin6_len = sizeof(*sin6);
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_addr = *src;
+	/* XXX MRT 0 for ipv6 at this time */
+	rtalloc_ign((struct route *)&ro, 0);
+
+	if (ro.ro_rt == NULL)
+		return (0);
+
+	if (ifp != NULL) {
+		/*
+		 * Compare against rt->rt_ifa->ifa_ifp rather than
+		 * rt->rt_ifp to support packets sent to one of our own
+		 * addresses (the former is the first argument of
+		 * if_simloop(), i.e. ifp, while the latter is lo0).
+		 */
+		ok = (ro.ro_rt->rt_ifa->ifa_ifp == ifp);
+	} else {
+		/* reject the default route and blackhole/reject routes */
+		ok = !IN6_IS_ADDR_UNSPECIFIED(
+		    &satosin6(rt_key(ro.ro_rt))->sin6_addr) &&
+		    (ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) == 0;
+	}
+
+	RTFREE(ro.ro_rt);
+	return (ok);
+}
+/*
+ * Hash an IPv6 flow id: XOR of the two low 32-bit words of the
+ * destination and source addresses with both ports.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+	u_int32_t h;
+
+	h = id->dst_ip6.__u6_addr.__u6_addr32[2];
+	h ^= id->dst_ip6.__u6_addr.__u6_addr32[3];
+	h ^= id->src_ip6.__u6_addr.__u6_addr32[2];
+	h ^= id->src_ip6.__u6_addr.__u6_addr32[3];
+	h ^= id->dst_port;
+	h ^= id->src_port;
+	return (h);
+}
+
+/*
+ * Return 1 if 'icmp6_type' is one of the ICMPv6 query types listed
+ * below, 0 otherwise (including any type above ICMP6_MAXTYPE).
+ */
+static int
+is_icmp6_query(int icmp6_type)
+{
+	if (icmp6_type > ICMP6_MAXTYPE)
+		return (0);
+
+	return (icmp6_type == ICMP6_ECHO_REQUEST ||
+	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+	    icmp6_type == ICMP6_WRUREQUEST ||
+	    icmp6_type == ICMP6_FQDN_QUERY ||
+	    icmp6_type == ICMP6_NI_QUERY);
+}
+
+/*
+ * Reject an IPv6 packet.
+ *
+ * args  - firewall arguments; args->m is the packet and args->f_id its
+ *         flow id.
+ * code  - ICMP6_UNREACH_RST asks for a TCP reset (honoured only for TCP
+ *         packets), any other value is sent as the code of an ICMPv6
+ *         destination-unreachable error.
+ * hlen  - offset from 'ip6' to the TCP header.
+ * ip6   - the IPv6 header of the packet.
+ *
+ * The mbuf is consumed on every path (freed or handed to
+ * tcp_respond()/icmp6_error()), so args->m is always NULL on return.
+ */
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+	struct mbuf *m;
+
+	m = args->m;
+	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+		struct tcphdr *tcp;
+		tcp_seq ack, seq;
+		int flags;
+		struct {
+			struct ip6_hdr ip6;
+			struct tcphdr th;
+		} ti;
+		tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+		/* Never answer a reset with a reset. */
+		if ((tcp->th_flags & TH_RST) != 0) {
+			m_freem(m);
+			args->m = NULL;
+			return;
+		}
+
+		ti.ip6 = *ip6;
+		ti.th = *tcp;
+		ti.th.th_seq = ntohl(ti.th.th_seq);
+		ti.th.th_ack = ntohl(ti.th.th_ack);
+		ti.ip6.ip6_nxt = IPPROTO_TCP;
+
+		if (ti.th.th_flags & TH_ACK) {
+			ack = 0;
+			seq = ti.th.th_ack;
+			flags = TH_RST;
+		} else {
+			ack = ti.th.th_seq;
+			if ((m->m_flags & M_PKTHDR) != 0) {
+				/*
+				 * total new data to ACK is:
+				 * total packet length,
+				 * minus the header length,
+				 * minus the tcp header length.
+				 */
+				ack += m->m_pkthdr.len - hlen
+					- (ti.th.th_off << 2);
+			} else if (ip6->ip6_plen) {
+				ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) -
+				    hlen - (ti.th.th_off << 2);
+			} else {
+				/*
+				 * Payload length unknown: drop the packet.
+				 * Clear args->m as on every other exit so the
+				 * caller is not left with a pointer to the
+				 * freed mbuf.
+				 */
+				m_freem(m);
+				args->m = NULL;
+				return;
+			}
+			if (tcp->th_flags & TH_SYN)
+				ack++;
+			seq = 0;
+			flags = TH_RST|TH_ACK;
+		}
+		bcopy(&ti, ip6, sizeof(ti));
+		/*
+		 * m is only used to recycle the mbuf
+		 * The data in it is never read so we don't need
+		 * to correct the offsets or anything
+		 */
+		tcp_respond(NULL, ip6, tcp, m, ack, seq, flags);
+	} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+		/*
+		 * Unlike above, the mbufs need to line up with the ip6 hdr,
+		 * as the contents are read. We need to m_adj() the
+		 * needed amount.
+		 * The mbuf will however be thrown away so we can adjust it.
+		 * Remember we did an m_pullup on it already so we
+		 * can make some assumptions about contiguousness.
+		 */
+		if (args->L3offset)
+			m_adj(m, args->L3offset);
+#endif
+		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+	} else
+		m_freem(m);	/* reset requested for a non-TCP packet */
+
+	args->m = NULL;
+}
+
+#endif /* INET6 */
+
+#ifdef VIMAGE_GLOBALS
+static u_int64_t norule_counter;	/* counter for ipfw_log(NULL...) */
+#endif
+
+/*
+ * Helpers that expand to the first two arguments of snprintf().
+ * SNPARGS(buf, len) writes at offset 'len' of 'buf' with the space that
+ * is left (0 once 'len' reaches sizeof(buf)); SNP(buf) writes from the
+ * start.  Both rely on sizeof(buf), so 'buf' must be an array, not a
+ * pointer.
+ */
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+static void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+    struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+    struct ip *ip)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ether_header *eh = args->eh;
+	char *action;
+	int limit_reached = 0;
+	char action2[40], proto[128], fragment[32];
+
+	fragment[0] = '\0';
+	proto[0] = '\0';
+
+	if (f == NULL) {	/* bogus pkt */
+		if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+			return;
+		V_norule_counter++;
+		if (V_norule_counter == V_verbose_limit)
+			limit_reached = V_verbose_limit;
+		action = "Refuse";
+	} else {	/* O_LOG is the first action, find the real one */
+		ipfw_insn *cmd = ACTION_PTR(f);
+		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+		if (l->max_log != 0 && l->log_left == 0)
+			return;
+		l->log_left--;
+		if (l->log_left == 0)
+			limit_reached = l->max_log;
+		cmd += F_LEN(cmd);	/* point to first action */
+		if (cmd->opcode == O_ALTQ) {
+			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+			snprintf(SNPARGS(action2, 0), "Altq %d",
+				altq->qid);
+			cmd += F_LEN(cmd);
+		}
+		if (cmd->opcode == O_PROB)
+			cmd += F_LEN(cmd);
+
+		if (cmd->opcode == O_TAG)
+			cmd += F_LEN(cmd);
+
+		action = action2;
+		switch (cmd->opcode) {
+		case O_DENY:
+			action = "Deny";
+			break;
+
+		case O_REJECT:
+			if (cmd->arg1==ICMP_REJECT_RST)
+				action = "Reset";
+			else if (cmd->arg1==ICMP_UNREACH_HOST)
+				action = "Reject";
+			else
+				snprintf(SNPARGS(action2, 0), "Unreach %d",
+					cmd->arg1);
+			break;
+
+		case O_UNREACH6:
+			if (cmd->arg1==ICMP6_UNREACH_RST)
+				action = "Reset";
+			else
+				snprintf(SNPARGS(action2, 0), "Unreach %d",
+					cmd->arg1);
+			break;
+
+		case O_ACCEPT:
+			action = "Accept";
+			break;
+		case O_COUNT:
+			action = "Count";
+			break;
+		case O_DIVERT:
+			snprintf(SNPARGS(action2, 0), "Divert %d",
+				cmd->arg1);
+			break;
+		case O_TEE:
+			snprintf(SNPARGS(action2, 0), "Tee %d",
+				cmd->arg1);
+			break;
+		case O_SETFIB:
+			snprintf(SNPARGS(action2, 0), "SetFib %d",
+				cmd->arg1);
+			break;
+		case O_SKIPTO:
+			snprintf(SNPARGS(action2, 0), "SkipTo %d",
+				cmd->arg1);
+			break;
+		case O_PIPE:
+			snprintf(SNPARGS(action2, 0), "Pipe %d",
+				cmd->arg1);
+			break;
+		case O_QUEUE:
+			snprintf(SNPARGS(action2, 0), "Queue %d",
+				cmd->arg1);
+			break;
+		case O_FORWARD_IP: {
+			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+			int len;
+			struct in_addr dummyaddr;
+			if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+				dummyaddr.s_addr = htonl(tablearg);
+			else
+				dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+				inet_ntoa(dummyaddr));
+
+			if (sa->sa.sin_port)
+				snprintf(SNPARGS(action2, len), ":%d",
+				    sa->sa.sin_port);
+			}
+			break;
+		case O_NETGRAPH:
+			snprintf(SNPARGS(action2, 0), "Netgraph %d",
+				cmd->arg1);
+			break;
+		case O_NGTEE:
+			snprintf(SNPARGS(action2, 0), "Ngtee %d",
+				cmd->arg1);
+			break;
+		case O_NAT:
+			action = "Nat";
+ 			break;
+		case O_REASS:
+			action = "Reass";
+			break;
+		default:
+			action = "UNKNOWN";
+			break;
+		}
+	}
+
+	if (hlen == 0) {	/* non-ip */
+		snprintf(SNPARGS(proto, 0), "MAC");
+
+	} else {
+		int len;
+		char src[48], dst[48];
+		struct icmphdr *icmp;
+		struct tcphdr *tcp;
+		struct udphdr *udp;
+#ifdef INET6
+		struct ip6_hdr *ip6 = NULL;
+		struct icmp6_hdr *icmp6;
+#endif
+		src[0] = '\0';
+		dst[0] = '\0';
+#ifdef INET6
+		if (IS_IP6_FLOW_ID(&(args->f_id))) {
+			char ip6buf[INET6_ADDRSTRLEN];
+			snprintf(src, sizeof(src), "[%s]",
+			    ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+			snprintf(dst, sizeof(dst), "[%s]",
+			    ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+			ip6 = (struct ip6_hdr *)ip;
+			tcp = (struct tcphdr *)(((char *)ip) + hlen);
+			udp = (struct udphdr *)(((char *)ip) + hlen);
+		} else
+#endif
+		{
+			tcp = L3HDR(struct tcphdr, ip);
+			udp = L3HDR(struct udphdr, ip);
+
+			inet_ntoa_r(ip->ip_src, src);
+			inet_ntoa_r(ip->ip_dst, dst);
+		}
+
+		switch (args->f_id.proto) {
+		case IPPROTO_TCP:
+			len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+			if (offset == 0)
+				snprintf(SNPARGS(proto, len), ":%d %s:%d",
+				    ntohs(tcp->th_sport),
+				    dst,
+				    ntohs(tcp->th_dport));
+			else
+				snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+
+		case IPPROTO_UDP:
+			len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+			if (offset == 0)
+				snprintf(SNPARGS(proto, len), ":%d %s:%d",
+				    ntohs(udp->uh_sport),
+				    dst,
+				    ntohs(udp->uh_dport));
+			else
+				snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+
+		case IPPROTO_ICMP:
+			icmp = L3HDR(struct icmphdr, ip);
+			if (offset == 0)
+				len = snprintf(SNPARGS(proto, 0),
+				    "ICMP:%u.%u ",
+				    icmp->icmp_type, icmp->icmp_code);
+			else
+				len = snprintf(SNPARGS(proto, 0), "ICMP ");
+			len += snprintf(SNPARGS(proto, len), "%s", src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+#ifdef INET6
+		case IPPROTO_ICMPV6:
+			icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+			if (offset == 0)
+				len = snprintf(SNPARGS(proto, 0),
+				    "ICMPv6:%u.%u ",
+				    icmp6->icmp6_type, icmp6->icmp6_code);
+			else
+				len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+			len += snprintf(SNPARGS(proto, len), "%s", src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+#endif
+		default:
+			len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+			    args->f_id.proto, src);
+			snprintf(SNPARGS(proto, len), " %s", dst);
+			break;
+		}
+
+#ifdef INET6
+		if (IS_IP6_FLOW_ID(&(args->f_id))) {
+			if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+				snprintf(SNPARGS(fragment, 0),
+				    " (frag %08x:%d@%d%s)",
+				    args->f_id.frag_id6,
+				    ntohs(ip6->ip6_plen) - hlen,
+				    ntohs(offset & IP6F_OFF_MASK) << 3,
+				    (offset & IP6F_MORE_FRAG) ? "+" : "");
+		} else
+#endif
+		{
+			int ip_off, ip_len;
+			if (eh != NULL) { /* layer 2 packets are as on the wire */
+				ip_off = ntohs(ip->ip_off);
+				ip_len = ntohs(ip->ip_len);
+			} else {
+				ip_off = ip->ip_off;
+				ip_len = ip->ip_len;
+			}
+			if (ip_off & (IP_MF | IP_OFFMASK))
+				snprintf(SNPARGS(fragment, 0),
+				    " (frag %d:%d@%d%s)",
+				    ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
+				    offset << 3,
+				    (ip_off & IP_MF) ? "+" : "");
+		}
+	}
+	if (oif || m->m_pkthdr.rcvif)
+		log(LOG_SECURITY | LOG_INFO,
+		    "ipfw: %d %s %s %s via %s%s\n",
+		    f ? f->rulenum : -1,
+		    action, proto, oif ? "out" : "in",
+		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+		    fragment);
+	else
+		log(LOG_SECURITY | LOG_INFO,
+		    "ipfw: %d %s %s [no if info]%s\n",
+		    f ? f->rulenum : -1,
+		    action, proto, fragment);
+	if (limit_reached)
+		log(LOG_SECURITY | LOG_NOTICE,
+		    "ipfw: limit %d reached on entry %d\n",
+		    limit_reached, f ? f->rulenum : -1);
+}
+
+/*
+ * Map a flow id to a bucket index of the dynamic-rule hash table.
+ *
+ * IMPORTANT: the hash must be symmetric in source and destination
+ * (ip,port): dynamic rules are bidirectional, so both directions of a
+ * flow have to land in the same bucket.  The value is masked with
+ * V_curr_dyn_buckets - 1.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+	INIT_VNET_IPFW(curvnet);
+	u_int32_t bucket;
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id)) {
+		/* IPv6 flows use their own hash helper. */
+		bucket = hash_packet6(id);
+	} else
+#endif /* INET6 */
+	{
+		bucket = id->dst_ip ^ id->src_ip ^ id->dst_port ^ id->src_port;
+	}
+	bucket &= V_curr_dyn_buckets - 1;
+	return bucket;
+}
+
+/**
+ * Unlink a dynamic rule from a bucket chain and free it.
+ *
+ * prev is a pointer to the previous entry in the chain (NULL when q is
+ * the first one), head is the lvalue holding the head of the chain and
+ * q is a pointer to the rule to delete.
+ *
+ * On exit q has been advanced to the entry that followed the deleted
+ * one, head is updated when the deleted entry was the first in the
+ * chain, V_dyn_count has been decremented and, for O_LIMIT entries, so
+ * has the parent's session count.  The entry is handed back to
+ * ipfw_dyn_rule_zone.
+ *
+ * NOTE: the arguments are expanded several times, so pass plain
+ * lvalues without side effects.  The expansion is a bare { } block.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) {				\
+	ipfw_dyn_rule *old_q = q;					\
+									\
+	/* remove a refcount to the parent */				\
+	if (q->dyn_type == O_LIMIT)					\
+		q->parent->count--;					\
+	DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
+		(q->id.src_ip), (q->id.src_port),			\
+		(q->id.dst_ip), (q->id.dst_port), V_dyn_count-1 ); )	\
+	if (prev != NULL)						\
+		prev->next = q = q->next;				\
+	else								\
+		head = q = q->next;					\
+	V_dyn_count--;							\
+	uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/*
+ * True when (a - b), converted to int, is <= 0, i.e. "a is not later
+ * than b".  Testing the sign of the difference instead of comparing the
+ * two values directly presumably keeps the result meaningful across
+ * counter wraparound.
+ */
+#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The second parameter also identifies a rule that must never be
+ * removed (e.g. because we are holding a reference to it -- this is
+ * the case with O_LIMIT_PARENT rules). The pointer is only used for
+ * comparison, so any non-null value will do.
+ *
+ * Non-forced scans are skipped when one already ran at the current
+ * time_uptime value.  The dynamic-rule lock is asserted on entry.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+	INIT_VNET_IPFW(curvnet);
+	/* time_uptime recorded by the most recent scan */
+	static u_int32_t last_remove = 0;
+
+#define FORCE (keep_me == NULL)
+
+	ipfw_dyn_rule *prev, *q;
+	int i, pass = 0, max_pass = 0;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+		return;
+	/* do not expire more than once per second, it is useless */
+	if (!FORCE && last_remove == time_uptime)
+		return;
+	last_remove = time_uptime;
+
+	/*
+	 * because O_LIMIT refer to parent rules, during the first pass only
+	 * remove child and mark any pending LIMIT_PARENT, and remove
+	 * them in a second pass.
+	 */
+next_pass:
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+			/*
+			 * Logic can become complex here, so we split tests.
+			 */
+			if (q == keep_me)
+				goto next;
+			if (rule != NULL && rule != q->rule)
+				goto next; /* not the one we are looking for */
+			if (q->dyn_type == O_LIMIT_PARENT) {
+				/*
+				 * handle parent in the second pass,
+				 * record we need one.
+				 */
+				max_pass = 1;
+				if (pass == 0)
+					goto next;
+				if (FORCE && q->count != 0 ) {
+					/* XXX should not happen! */
+					printf("ipfw: OUCH! cannot remove rule,"
+					     " count %d\n", q->count);
+				}
+			} else {
+				/* non-parent entries: honour the expiry time */
+				if (!FORCE &&
+				    !TIME_LEQ( q->expire, time_uptime ))
+					goto next;
+			}
+             /* a parent is only freed once its session count is zero */
+             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+                     UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+                     /* the macro already advanced q; prev is unchanged */
+                     continue;
+             }
+next:
+			prev=q;
+			q=q->next;
+		}
+	}
+	/* run the second pass only if an O_LIMIT_PARENT entry was seen */
+	if (pass++ < max_pass)
+		goto next_pass;
+}
+
+
+/**
+ * Look up the dynamic rule matching a packet and refresh its lifetime.
+ *
+ * pkt			flow id of the packet.
+ * match_direction	if not NULL, receives MATCH_FORWARD or MATCH_REVERSE
+ *			on success and MATCH_NONE when nothing matched.
+ * tcp			TCP header of the packet, or NULL; when present its
+ *			ack number is used to ignore out-of-sequence
+ *			segments while a session is established.
+ *
+ * Only the bucket selected by hash_packet() is scanned.  Expired entries
+ * met on the way are unlinked, O_LIMIT_PARENT entries never match, and a
+ * matching entry is moved to the front of its bucket.  Returns the
+ * matching rule or NULL.  The dynamic-rule lock is asserted on entry.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	INIT_VNET_IPFW(curvnet);
+	/*
+	 * stateful ipfw extensions.
+	 * Lookup into dynamic session queue
+	 */
+#define MATCH_REVERSE	0
+#define MATCH_FORWARD	1
+#define MATCH_NONE	2
+#define MATCH_UNKNOWN	3
+	int i, dir = MATCH_NONE;
+	ipfw_dyn_rule *prev, *q=NULL;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL)
+		goto done;	/* not found */
+	i = hash_packet( pkt );
+	for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
+		/* parents still referenced by children are never expired here */
+		if (q->dyn_type == O_LIMIT_PARENT && q->count)
+			goto next;
+		if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
+			UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+			/* the macro already advanced q */
+			continue;
+		}
+		if (pkt->proto == q->id.proto &&
+		    q->dyn_type != O_LIMIT_PARENT) {
+			if (IS_IP6_FLOW_ID(pkt)) {
+			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				&(q->id.src_ip6)) &&
+			    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				&(q->id.dst_ip6)) &&
+			    pkt->src_port == q->id.src_port &&
+			    pkt->dst_port == q->id.dst_port ) {
+				dir = MATCH_FORWARD;
+				break;
+			    }
+			    if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				    &(q->id.dst_ip6)) &&
+				IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				    &(q->id.src_ip6)) &&
+				pkt->src_port == q->id.dst_port &&
+				pkt->dst_port == q->id.src_port ) {
+				    dir = MATCH_REVERSE;
+				    break;
+			    }
+			} else {
+			    if (pkt->src_ip == q->id.src_ip &&
+				pkt->dst_ip == q->id.dst_ip &&
+				pkt->src_port == q->id.src_port &&
+				pkt->dst_port == q->id.dst_port ) {
+				    dir = MATCH_FORWARD;
+				    break;
+			    }
+			    if (pkt->src_ip == q->id.dst_ip &&
+				pkt->dst_ip == q->id.src_ip &&
+				pkt->src_port == q->id.dst_port &&
+				pkt->dst_port == q->id.src_port ) {
+				    dir = MATCH_REVERSE;
+				    break;
+			    }
+			}
+		}
+next:
+		prev = q;
+		q = q->next;
+	}
+	if (q == NULL)
+		goto done; /* q = NULL, not found */
+
+	if ( prev != NULL) { /* found and not in front */
+		prev->next = q->next;
+		q->next = V_ipfw_dyn_v[i];
+		V_ipfw_dyn_v[i] = q;
+	}
+	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+		u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
+
+		/*
+		 * q->state keeps the flags seen in the forward direction in
+		 * its low byte and those seen in the reverse direction
+		 * shifted left by 8.
+		 */
+#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
+		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
+		switch (q->state) {
+		case TH_SYN:				/* opening */
+			q->expire = time_uptime + V_dyn_syn_lifetime;
+			break;
+
+		case BOTH_SYN:			/* move to established */
+		case BOTH_SYN | TH_FIN :	/* one side tries to close */
+		case BOTH_SYN | (TH_FIN << 8) :
+			if (tcp) {
+			    /*
+			     * Compare ack numbers through the sign of their
+			     * unsigned difference (the idiom TIME_LEQ uses),
+			     * so that no signed subtraction can overflow.
+			     */
+#define _SEQ_GE(a,b) ((int)((a) - (b)) >= 0)
+			    u_int32_t ack = ntohl(tcp->th_ack);
+			    if (dir == MATCH_FORWARD) {
+				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
+				    q->ack_fwd = ack;
+				else { /* ignore out-of-sequence */
+				    break;
+				}
+			    } else {
+				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
+				    q->ack_rev = ack;
+				else { /* ignore out-of-sequence */
+				    break;
+				}
+			    }
+			}
+			q->expire = time_uptime + V_dyn_ack_lifetime;
+			break;
+
+		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
+			/* clamp the configured lifetime below the keepalive period */
+			if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+				V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_fin_lifetime;
+			break;
+
+		default:
+#if 0
+			/*
+			 * reset or some invalid combination, but can also
+			 * occur if we use keep-state the wrong way.
+			 */
+			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+				printf("invalid state: 0x%x\n", q->state);
+#endif
+			/* clamp the configured lifetime below the keepalive period */
+			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_rst_lifetime;
+			break;
+		}
+	} else if (pkt->proto == IPPROTO_UDP) {
+		q->expire = time_uptime + V_dyn_udp_lifetime;
+	} else {
+		/* other protocols */
+		q->expire = time_uptime + V_dyn_short_lifetime;
+	}
+done:
+	if (match_direction)
+		*match_direction = dir;
+	return q;
+}
+
+/*
+ * Locking front end for lookup_dyn_rule_locked().
+ *
+ * Takes the dynamic-rule lock and performs the lookup.  On a miss the
+ * lock is dropped and NULL is returned; on a hit the rule is returned
+ * with the lock STILL HELD, and it is up to the caller to release it.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	ipfw_dyn_rule *match;
+
+	IPFW_DYN_LOCK();
+	match = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+	if (match != NULL)
+		return match;	/* NB: table is returned locked */
+	IPFW_DYN_UNLOCK();
+	return NULL;
+}
+
+/*
+ * (Re)allocate the dynamic-rule hash table with V_dyn_buckets buckets.
+ *
+ * The requested size must be a non-zero power of 2 no larger than 64k;
+ * a request above 64k falls back to 1024, and any other invalid request
+ * is reset to the current size and leaves the table untouched.  The old
+ * table is freed without looking at its contents.  If memory is short
+ * the size is halved until the allocation succeeds or only 2 buckets
+ * are left, so V_ipfw_dyn_v may be NULL on return and callers must
+ * check it.  The dynamic-rule lock is asserted on entry.
+ */
+static void
+realloc_dynamic_table(void)
+{
+	INIT_VNET_IPFW(curvnet);
+	IPFW_DYN_LOCK_ASSERT();
+
+	/*
+	 * Try reallocation, make sure we have a power of 2 and do
+	 * not allow more than 64k entries. In case of overflow,
+	 * default to 1024.
+	 */
+
+	if (V_dyn_buckets > 65536)
+		V_dyn_buckets = 1024;
+	/*
+	 * Zero passes the "x & (x - 1)" test but is not a usable size:
+	 * hash_packet() masks with V_curr_dyn_buckets - 1, which would
+	 * then select indexes outside the table.
+	 */
+	if (V_dyn_buckets == 0 ||
+	    (V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+		V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+		return;
+	}
+	V_curr_dyn_buckets = V_dyn_buckets;
+	if (V_ipfw_dyn_v != NULL)
+		free(V_ipfw_dyn_v, M_IPFW);
+	for (;;) {
+		V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+		       M_IPFW, M_NOWAIT | M_ZERO);
+		if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+			break;
+		/* allocation failed: retry with half the buckets */
+		V_curr_dyn_buckets /= 2;
+	}
+}
+
+/**
+ * Create a dynamic entry of type 'dyn_type' for flow 'id' and insert it
+ * at the head of its hash bucket.
+ *
+ * The hash table holds three kinds of entries:
+ * - regular ones (O_KEEP_STATE);
+ * - sessions subject to a per-parent limit (O_LIMIT).  For these the
+ *   third argument is really the parent dynamic rule, not a static
+ *   rule: the parent's count is bumped here and dropped again when the
+ *   entry is unlinked;
+ * - the "parent" entries for the above (O_LIMIT_PARENT).
+ *
+ * The table is (re)allocated first when it does not exist yet, or when
+ * it is empty and a different size has been requested.  Returns the new
+ * entry, or NULL if the table or the entry could not be allocated.
+ * The dynamic-rule lock is asserted on entry.
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+	INIT_VNET_IPFW(curvnet);
+	ipfw_dyn_rule *entry;
+	int slot;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL ||
+	    (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+		realloc_dynamic_table();
+		if (V_ipfw_dyn_v == NULL) {
+			/* no table, nowhere to put the entry */
+			return NULL;
+		}
+	}
+	/* must be computed after a possible resize of the table */
+	slot = hash_packet(id);
+
+	entry = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+	if (entry == NULL) {
+		printf ("ipfw: sorry cannot allocate state\n");
+		return NULL;
+	}
+
+	if (dyn_type == O_LIMIT) {
+		/* 'rule' carries the parent: take a reference on it */
+		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+
+		if (parent->dyn_type != O_LIMIT_PARENT)
+			panic("invalid parent");
+		entry->parent = parent;
+		parent->count++;
+		rule = parent->rule;
+	}
+
+	/* fill in the entry */
+	entry->dyn_type = dyn_type;
+	entry->rule = rule;
+	entry->id = *id;
+	entry->expire = time_uptime + V_dyn_syn_lifetime;
+	entry->count = 0;
+	entry->pcnt = entry->bcnt = 0;
+	entry->bucket = slot;
+
+	/* link it in front of its bucket */
+	entry->next = V_ipfw_dyn_v[slot];
+	V_ipfw_dyn_v[slot] = entry;
+	V_dyn_count++;
+	DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
+	   dyn_type,
+	   (entry->id.src_ip), (entry->id.src_port),
+	   (entry->id.dst_ip), (entry->id.dst_port),
+	   V_dyn_count ); )
+	return entry;
+}
+
+/**
+ * Find the O_LIMIT_PARENT entry for (pkt, rule), creating it when it
+ * does not exist yet.
+ *
+ * An existing parent must belong to 'rule' and match protocol, both
+ * ports and both addresses of 'pkt'; its expiry is pushed forward by
+ * V_dyn_short_lifetime.  Otherwise the result of add_dyn_rule() is
+ * returned, which may be NULL.  The dynamic-rule lock is asserted on
+ * entry.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+	INIT_VNET_IPFW(curvnet);
+	ipfw_dyn_rule *q;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v != NULL) {
+		int is_v6 = IS_IP6_FLOW_ID(pkt);
+
+		q = V_ipfw_dyn_v[hash_packet( pkt )];
+		for ( ; q != NULL ; q = q->next) {
+			/* only parents of this very rule are candidates */
+			if (q->dyn_type != O_LIMIT_PARENT || rule != q->rule)
+				continue;
+			if (pkt->proto != q->id.proto ||
+			    pkt->src_port != q->id.src_port ||
+			    pkt->dst_port != q->id.dst_port)
+				continue;
+			/* compare the addresses of the right family */
+			if (is_v6) {
+				if (!IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+					&(q->id.src_ip6)) ||
+				    !IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+					&(q->id.dst_ip6)))
+					continue;
+			} else {
+				if (pkt->src_ip != q->id.src_ip ||
+				    pkt->dst_ip != q->id.dst_ip)
+					continue;
+			}
+			q->expire = time_uptime + V_dyn_short_lifetime;
+			DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
+			return q;
+		}
+	}
+	/* nothing suitable: install a new parent */
+	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * rule		static rule the state is created for.
+ * cmd		the O_KEEP_STATE or O_LIMIT instruction of that rule.
+ * args		packet being processed; args->f_id is the flow to track.
+ * tablearg	replaces cmd->conn_limit when the latter is IP_FW_TABLEARG.
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.  Returns 0 otherwise, including when
+ * a matching entry is already present.  The dynamic-rule lock is taken
+ * and released internally.
+ *
+ * NOTE(review): the result of add_dyn_rule() for the session entry is
+ * not checked, so an allocation failure there is still reported as 0.
+ */
+static int
+install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+	INIT_VNET_IPFW(curvnet);
+	static int last_log;	/* rate limits the messages below */
+	ipfw_dyn_rule *q;
+	struct in_addr da;
+	char src[48], dst[48];
+
+	src[0] = '\0';
+	dst[0] = '\0';
+
+	DEB(
+	printf("ipfw: %s: type %d 0x%08x %u -> 0x%08x %u\n",
+	    __func__, cmd->o.opcode,
+	    (args->f_id.src_ip), (args->f_id.src_port),
+	    (args->f_id.dst_ip), (args->f_id.dst_port));
+	)
+
+	IPFW_DYN_LOCK();
+
+	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	if (q != NULL) {	/* should never occur */
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: entry already present, done\n",
+			    __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (0);
+	}
+
+	if (V_dyn_count >= V_dyn_max)
+		/* Run out of slots, try to remove any expired rule. */
+		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
+
+	if (V_dyn_count >= V_dyn_max) {
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: Too many dynamic rules\n", __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (1);	/* cannot install, notify caller */
+	}
+
+	switch (cmd->o.opcode) {
+	case O_KEEP_STATE:	/* bidir rule */
+		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+		break;
+
+	case O_LIMIT: {		/* limit number of sessions */
+		struct ipfw_flow_id id;
+		ipfw_dyn_rule *parent;
+		uint32_t conn_limit;
+		uint16_t limit_mask = cmd->limit_mask;
+
+		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+		    tablearg : cmd->conn_limit;
+
+		DEB(
+		if (cmd->conn_limit == IP_FW_TABLEARG)
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+			    "(tablearg)\n", __func__, conn_limit);
+		else
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+			    __func__, conn_limit);
+		)
+
+		/*
+		 * Build the key of the parent entry: start from an
+		 * all-zero id and copy only the fields selected by
+		 * limit_mask.  The whole structure must be cleared, not
+		 * just the IPv4 addresses and ports: the id is hashed,
+		 * compared (including the IPv6 addresses) and copied
+		 * into the parent, so no field may be left as stack
+		 * garbage.
+		 */
+		bzero(&id, sizeof(id));
+		id.proto = args->f_id.proto;
+		id.addr_type = args->f_id.addr_type;
+		id.fib = M_GETFIB(args->m);
+
+		if (IS_IP6_FLOW_ID (&(args->f_id))) {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip6 = args->f_id.src_ip6;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip6 = args->f_id.dst_ip6;
+		} else {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip = args->f_id.src_ip;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip = args->f_id.dst_ip;
+		}
+		if (limit_mask & DYN_SRC_PORT)
+			id.src_port = args->f_id.src_port;
+		if (limit_mask & DYN_DST_PORT)
+			id.dst_port = args->f_id.dst_port;
+		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+			printf("ipfw: %s: add parent failed\n", __func__);
+			IPFW_DYN_UNLOCK();
+			return (1);
+		}
+
+		if (parent->count >= conn_limit) {
+			/* See if we can remove some expired rule. */
+			remove_dyn_rule(rule, parent);
+			if (parent->count >= conn_limit) {
+				if (V_fw_verbose && last_log != time_uptime) {
+					last_log = time_uptime;
+#ifdef INET6
+					/*
+					 * XXX IPv6 flows are not
+					 * supported yet.
+					 */
+					if (IS_IP6_FLOW_ID(&(args->f_id))) {
+						char ip6buf[INET6_ADDRSTRLEN];
+						snprintf(src, sizeof(src),
+						    "[%s]", ip6_sprintf(ip6buf,
+							&args->f_id.src_ip6));
+						snprintf(dst, sizeof(dst),
+						    "[%s]", ip6_sprintf(ip6buf,
+							&args->f_id.dst_ip6));
+					} else
+#endif
+					{
+						da.s_addr =
+						    htonl(args->f_id.src_ip);
+						inet_ntoa_r(da, src);
+						da.s_addr =
+						    htonl(args->f_id.dst_ip);
+						inet_ntoa_r(da, dst);
+					}
+					log(LOG_SECURITY | LOG_DEBUG,
+					    "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+					    parent->rule->rulenum,
+					    "drop session",
+					    src, (args->f_id.src_port),
+					    dst, (args->f_id.dst_port),
+					    "too many entries");
+				}
+				IPFW_DYN_UNLOCK();
+				return (1);
+			}
+		}
+		/* for O_LIMIT the third argument carries the parent entry */
+		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+		break;
+	}
+	default:
+		printf("ipfw: %s: unknown dynamic rule type %u\n",
+		    __func__, cmd->o.opcode);
+		IPFW_DYN_UNLOCK();
+		return (1);
+	}
+
+	/* XXX just set lifetime */
+	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	IPFW_DYN_UNLOCK();
+	return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, and flags & TH_SYN selects
+ * the direction (forward if set, reverse if clear).
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ *
+ * Addresses and ports are taken from 'id'.  Returns the new mbuf,
+ * flagged M_SKIP_FIREWALL, or NULL if no mbuf could be allocated; the
+ * packet is only built here, sending it is left to the caller.
+ */
+static struct mbuf *
+send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+    u_int32_t ack, int flags)
+{
+	INIT_VNET_INET(curvnet);
+	struct mbuf *m;
+	struct ip *ip;
+	struct tcphdr *tcp;
+
+	MGETHDR(m, M_DONTWAIT, MT_DATA);
+	if (m == 0)
+		return (NULL);
+	m->m_pkthdr.rcvif = (struct ifnet *)0;
+
+	/* the packet uses the FIB recorded in the flow id */
+	M_SETFIB(m, id->fib);
+#ifdef MAC
+	if (replyto != NULL)
+		mac_netinet_firewall_reply(replyto, m);
+	else
+		mac_netinet_firewall_send(m);
+#else
+	(void)replyto;		/* don't warn about unused arg */
+#endif
+
+	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
+	/* leave max_linkhdr bytes free in front of the IP header */
+	m->m_data += max_linkhdr;
+
+	ip = mtod(m, struct ip *);
+	bzero(ip, m->m_len);
+	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
+	ip->ip_p = IPPROTO_TCP;
+	tcp->th_off = 5;
+	/*
+	 * Assume we are sending a RST (or a keepalive in the reverse
+	 * direction), swap src and destination addresses and ports.
+	 */
+	ip->ip_src.s_addr = htonl(id->dst_ip);
+	ip->ip_dst.s_addr = htonl(id->src_ip);
+	tcp->th_sport = htons(id->dst_port);
+	tcp->th_dport = htons(id->src_port);
+	if (flags & TH_RST) {	/* we are sending a RST */
+		if (flags & TH_ACK) {
+			/* ACK was set: RST with seq = 'ack', no ACK flag */
+			tcp->th_seq = htonl(ack);
+			tcp->th_ack = htonl(0);
+			tcp->th_flags = TH_RST;
+		} else {
+			/* no ACK: RST|ACK acknowledging 'seq' (+1 for a SYN) */
+			if (flags & TH_SYN)
+				seq++;
+			tcp->th_seq = htonl(0);
+			tcp->th_ack = htonl(seq);
+			tcp->th_flags = TH_RST | TH_ACK;
+		}
+	} else {
+		/*
+		 * We are sending a keepalive. flags & TH_SYN determines
+		 * the direction, forward if set, reverse if clear.
+		 * NOTE: seq and ack are always assumed to be correct
+		 * as set by the caller. This may be confusing...
+		 */
+		if (flags & TH_SYN) {
+			/*
+			 * we have to rewrite the correct addresses!
+			 */
+			ip->ip_dst.s_addr = htonl(id->dst_ip);
+			ip->ip_src.s_addr = htonl(id->src_ip);
+			tcp->th_dport = htons(id->dst_port);
+			tcp->th_sport = htons(id->src_port);
+		}
+		tcp->th_seq = htonl(seq);
+		tcp->th_ack = htonl(ack);
+		tcp->th_flags = TH_ACK;
+	}
+	/*
+	 * set ip_len to the payload size so we can compute
+	 * the tcp checksum on the pseudoheader
+	 * XXX check this, could save a couple of words ?
+	 */
+	ip->ip_len = htons(sizeof(struct tcphdr));
+	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
+	/*
+	 * now fill fields left out earlier
+	 */
+	ip->ip_ttl = V_ip_defttl;
+	/* total length, stored as is (no htons()) */
+	ip->ip_len = m->m_pkthdr.len;
+	m->m_flags |= M_SKIP_FIREWALL;
+	return (m);
+}
+
+/*
+ * Reject a packet, consuming the mbuf in args->m (which is set to NULL
+ * before returning).
+ *
+ * For any code other than ICMP_REJECT_RST the mbuf is handed to
+ * icmp_error() as an ICMP_UNREACH with that code.  For
+ * ICMP_REJECT_RST a TCP packet that is not itself a RST is answered
+ * with a RST built by send_pkt(), and the mbuf is freed.
+ * The ip_len argument is not used by the body.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
+{
+
+#if 0
+	/* XXX When ip is not guaranteed to be at mtod() we will
+	 * need to account for this */
+	 * The mbuf will however be thrown away so we can adjust it.
+	 * Remember we did an m_pullup on it already so we
+	 * can make some assumptions about contiguousness.
+	 */
+	if (args->L3offset)
+		m_adj(m, args->L3offset);
+#endif
+	if (code != ICMP_REJECT_RST) {
+		/*
+		 * ICMP unreachable: icmp_error() needs the IP header in
+		 * host order, which layer 2 packets do not have yet.
+		 */
+		if (args->eh != NULL) {
+			ip->ip_len = ntohs(ip->ip_len);
+			ip->ip_off = ntohs(ip->ip_off);
+		}
+		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+	} else {
+		if (args->f_id.proto == IPPROTO_TCP) {
+			struct tcphdr *const th =
+			    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+
+			/* never answer a RST with another RST */
+			if ((th->th_flags & TH_RST) == 0) {
+				struct mbuf *reply;
+
+				reply = send_pkt(args->m, &(args->f_id),
+					ntohl(th->th_seq), ntohl(th->th_ack),
+					th->th_flags | TH_RST);
+				if (reply != NULL)
+					ip_output(reply, NULL, NULL, 0, NULL,
+					    NULL);
+			}
+		}
+		/* TCP or not, the original packet is dropped here */
+		m_freem(args->m);
+	}
+	args->m = NULL;
+}
+
+/**
+ *
+ * Given an ip_fw *, lookup_next_rule returns the rule to be examined
+ * after it and caches the answer in the "next_rule" field of the rule.
+ *
+ * For a skipto action (found after skipping an optional O_LOG, O_ALTQ
+ * and O_TAG, in this order) the target is the first following rule
+ * whose number is >= the requested one; the requested number is
+ * tablearg when that is non-zero, the action's arg1 otherwise.
+ * Backward jumps are not allowed, so the search starts from the next
+ * rule.
+ *
+ * In all other cases, including a missing jump target, the next rule
+ * in the list (me->next) is returned.
+ */
+
+static struct ip_fw *
+lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
+{
+	struct ip_fw *target = NULL;
+	ipfw_insn *cmd = ACTION_PTR(me);
+
+	/* step over the instructions that may precede the real action */
+	if (cmd->opcode == O_LOG)
+		cmd += F_LEN(cmd);
+	if (cmd->opcode == O_ALTQ)
+		cmd += F_LEN(cmd);
+	if (cmd->opcode == O_TAG)
+		cmd += F_LEN(cmd);
+
+	if (cmd->opcode == O_SKIPTO) {
+		u_int16_t wanted;
+
+		wanted = (tablearg != 0) ? (u_int16_t)tablearg : cmd->arg1;
+		target = me->next;
+		while (target != NULL && target->rulenum < wanted)
+			target = target->next;
+	}
+	if (target == NULL) {
+		/* failure or not a skipto */
+		target = me->next;
+	}
+	me->next_rule = target;
+	return target;
+}
+
+/*
+ * Add the prefix addr/mlen with the given value to lookup table 'tbl'.
+ *
+ * addr is used as stored in sin_addr.s_addr and is masked with the
+ * netmask derived from mlen.  The chain write lock is taken around the
+ * radix tree update.
+ *
+ * Returns 0 on success, EINVAL for a table number or prefix length out
+ * of range, ENOMEM if the entry cannot be allocated and EEXIST if the
+ * radix tree refuses the insertion.
+ */
+static int
+add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct radix_node *rn;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	/* a longer prefix would make the shift count below negative */
+	if (mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+	if (ent == NULL)
+		return (ENOMEM);
+	ent->value = value;
+	ent->addr.sin_len = ent->mask.sin_len = 8;
+	/* unsigned shift: with a signed 1, mlen == 1 would overflow int */
+	ent->mask.sin_addr.s_addr =
+	    htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+	if (rn == NULL) {
+		IPFW_WUNLOCK(ch);
+		free(ent, M_IPFW_TBL);
+		return (EEXIST);
+	}
+	IPFW_WUNLOCK(ch);
+	return (0);
+}
+
+/*
+ * Remove the prefix addr/mlen from lookup table 'tbl'.
+ *
+ * addr is masked with the netmask derived from mlen before the lookup.
+ * The chain write lock is taken around the radix tree update and the
+ * removed entry is freed afterwards.
+ *
+ * Returns 0 on success, EINVAL for a table number or prefix length out
+ * of range and ESRCH if the radix tree finds nothing to delete.
+ */
+static int
+del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct sockaddr_in sa, mask;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (EINVAL);
+	/* a longer prefix would make the shift count below negative */
+	if (mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	sa.sin_len = mask.sin_len = 8;
+	/* unsigned shift: with a signed 1, mlen == 1 would overflow int */
+	mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+	if (ent == NULL) {
+		IPFW_WUNLOCK(ch);
+		return (ESRCH);
+	}
+	IPFW_WUNLOCK(ch);
+	free(ent, M_IPFW_TBL);
+	return (0);
+}
+
+/*
+ * Tree-walk callback used by flush_table(): delete the node's key/mask
+ * from the radix head passed in 'arg' and free the entry handed back.
+ * Always returns 0.
+ */
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+	struct radix_node_head * const head = arg;
+	struct table_entry *removed;
+
+	removed = (struct table_entry *)
+	    head->rnh_deladdr(rn->rn_key, rn->rn_mask, head);
+	if (removed == NULL)
+		return (0);
+	free(removed, M_IPFW_TBL);
+	return (0);
+}
+
+/*
+ * Remove every entry of lookup table 'tbl' by walking its radix tree
+ * with flush_table_entry().  The chain write lock is asserted.
+ * Returns 0, or EINVAL for a table number out of range.
+ */
+static int
+flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+	struct radix_node_head *head;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	if (tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl];
+		KASSERT(head != NULL, ("NULL IPFW table"));
+		head->rnh_walktree(head, flush_table_entry, head);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Flush all IPFW_TABLES_MAX lookup tables of the chain, one after the
+ * other.  The chain write lock is asserted.
+ */
+static void
+flush_tables(struct ip_fw_chain *ch)
+{
+	uint16_t n;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	n = 0;
+	while (n < IPFW_TABLES_MAX) {
+		flush_table(ch, n);
+		n++;
+	}
+}
+
+/*
+ * Create the radix tree head of every lookup table of the chain.
+ * If one of them cannot be created, the tables set up so far are
+ * flushed and ENOMEM is returned; otherwise returns 0.
+ */
+static int
+init_tables(struct ip_fw_chain *ch)
+{
+	int idx;
+	uint16_t undo;
+
+	for (idx = 0; idx < IPFW_TABLES_MAX; idx++) {
+		if (rn_inithead((void **)&ch->tables[idx], 32))
+			continue;
+		/* this head failed: flush the ones created before it */
+		for (undo = 0; undo < idx; undo++)
+			(void) flush_table(ch, undo);
+		return (ENOMEM);
+	}
+	return (0);
+}
+
+/*
+ * Look up 'addr' in lookup table 'tbl'.
+ * Returns 1 and stores the value of the entry found in *val, or
+ * returns 0 (leaving *val alone) when there is no match or the table
+ * number is out of range.
+ */
+static int
+lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+	struct radix_node_head *head;
+	struct table_entry *hit;
+	struct sockaddr_in key;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (0);
+	head = ch->tables[tbl];
+	key.sin_len = 8;
+	key.sin_addr.s_addr = addr;
+	hit = (struct table_entry *)(head->rnh_lookup(&key, NULL, head));
+	if (hit == NULL)
+		return (0);
+	*val = hit->value;
+	return (1);
+}
+
+/*
+ * Tree-walk callback used by count_table(): bump the u_int32_t counter
+ * that 'arg' points to.  The node itself is not looked at.
+ * Always returns 0.
+ */
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+	u_int32_t * const counter = arg;
+
+	*counter += 1;
+	return (0);
+}
+
+/*
+ * Store in *cnt the number of entries of lookup table 'tbl', obtained
+ * by walking its radix tree with count_table_entry().
+ * Returns 0, or EINVAL (with *cnt untouched) for a table number out of
+ * range.
+ */
+static int
+count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+	struct radix_node_head *head;
+
+	if (tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl];
+		*cnt = 0;
+		head->rnh_walktree(head, count_table_entry, cnt);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Tree-walk callback used by dump_table(): append the entry behind
+ * 'rn' to the ipfw_table that 'arg' points to.
+ * Returns 1 without storing anything once tbl->cnt has reached
+ * tbl->size, 0 after storing the entry.
+ */
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+	struct table_entry * const node = (struct table_entry *)rn;
+	ipfw_table * const out = arg;
+	ipfw_table_entry *slot;
+
+	if (out->cnt == out->size)
+		return (1);
+	slot = &out->ent[out->cnt];
+	slot->tbl = out->tbl;
+	/* derive the prefix length from the lowest bit set in the mask */
+	slot->masklen = in_nullhost(node->mask.sin_addr) ? 0 :
+	    33 - ffs(ntohl(node->mask.sin_addr.s_addr));
+	slot->addr = node->addr.sin_addr.s_addr;
+	slot->value = node->value;
+	out->cnt++;
+	return (0);
+}
+
+/*
+ * Copy the entries of lookup table tbl->tbl into tbl->ent[] by walking
+ * its radix tree with dump_table_entry(); tbl->cnt is reset first and
+ * ends up as the number of entries stored.
+ * Returns 0, or EINVAL for a table number out of range.
+ */
+static int
+dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+	struct radix_node_head *head;
+
+	if (tbl->tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl->tbl];
+		tbl->cnt = 0;
+		head->rnh_walktree(head, dump_table_entry, tbl);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Copy the credentials attached to 'inp' into the cache 'ugp': uid,
+ * group count and group list, plus the prison id when the credential
+ * is jailed (-1 otherwise).
+ */
+static void
+fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp)
+{
+	struct ucred *cred = inp->inp_cred;
+
+	if (jailed(cred))
+		ugp->fw_prid = cred->cr_prison->pr_id;
+	else
+		ugp->fw_prid = -1;
+	ugp->fw_uid = cred->cr_uid;
+	ugp->fw_ngroups = cred->cr_ngroups;
+	bcopy(cred->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups));
+}
+
+/*
+ * Evaluate an O_UID, O_GID or O_JAIL instruction against the
+ * credentials of the pcb the packet belongs to.
+ *
+ * The credentials are cached in *ugp and *ugid_lookupp tracks the state
+ * of that cache across calls for the same packet:
+ *	 0	no lookup done yet,
+ *	 1	*ugp holds valid credentials,
+ *	-1	no usable pcb, so nothing can match.
+ * When the caller supplies 'inp' it is used directly; otherwise the pcb
+ * is looked up from the addresses and ports, for TCP and UDP only.
+ *
+ * Returns 1 if the instruction matches, 0 otherwise.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
+    struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
+    u_int16_t src_port, struct ip_fw_ugid *ugp, int *ugid_lookupp,
+    struct inpcb *inp)
+{
+	INIT_VNET_INET(curvnet);
+	struct inpcbinfo *pi;
+	int wildcard;
+	struct inpcb *pcb;
+	int match;
+	gid_t *gp;
+
+	/*
+	 * Check to see if the UDP or TCP stack supplied us with
+	 * the PCB. If so, rather then holding a lock and looking
+	 * up the PCB, we can use the one that was supplied.
+	 */
+	if (inp && *ugid_lookupp == 0) {
+		INP_LOCK_ASSERT(inp);
+		if (inp->inp_socket != NULL) {
+			fill_ugid_cache(inp, ugp);
+			*ugid_lookupp = 1;
+		} else
+			*ugid_lookupp = -1;
+	}
+	/*
+	 * If we have already been here and the packet has no
+	 * PCB entry associated with it, then we can safely
+	 * assume that this is a no match.
+	 */
+	if (*ugid_lookupp == -1)
+		return (0);
+	/* pick the pcb table to search; other protocols never match */
+	if (proto == IPPROTO_TCP) {
+		wildcard = 0;
+		pi = &V_tcbinfo;
+	} else if (proto == IPPROTO_UDP) {
+		wildcard = INPLOOKUP_WILDCARD;
+		pi = &V_udbinfo;
+	} else
+		return 0;
+	match = 0;
+	if (*ugid_lookupp == 0) {
+		INP_INFO_RLOCK(pi);
+		/*
+		 * With an output interface the destination address/port
+		 * pair is passed first, without one the source pair is.
+		 */
+		pcb =  (oif) ?
+			in_pcblookup_hash(pi,
+				dst_ip, htons(dst_port),
+				src_ip, htons(src_port),
+				wildcard, oif) :
+			in_pcblookup_hash(pi,
+				src_ip, htons(src_port),
+				dst_ip, htons(dst_port),
+				wildcard, NULL);
+		if (pcb != NULL) {
+			fill_ugid_cache(pcb, ugp);
+			*ugid_lookupp = 1;
+		}
+		INP_INFO_RUNLOCK(pi);
+		if (*ugid_lookupp == 0) {
+			/*
+			 * If the lookup did not yield any results, there
+			 * is no sense in coming back and trying again. So
+			 * we can set lookup to -1 and ensure that we wont
+			 * bother the pcb system again.
+			 */
+			*ugid_lookupp = -1;
+			return (0);
+		}
+	} 
+	/* the cache is valid here: compare it with the instruction operand */
+	if (insn->o.opcode == O_UID)
+		match = (ugp->fw_uid == (uid_t)insn->d[0]);
+	else if (insn->o.opcode == O_GID) {
+		/* match if any of the cached groups equals the operand */
+		for (gp = ugp->fw_groups;
+			gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++)
+			if (*gp == (gid_t)insn->d[0]) {
+				match = 1;
+				break;
+			}
+	} else if (insn->o.opcode == O_JAIL)
+		match = (ugp->fw_prid == (int)insn->d[0]);
+	return match;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
+ *		Starts with the IP header.
+ *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
+ *	args->L3offset	Number of bytes bypassed if we came from L2.
+ *			e.g. often sizeof(eh)  ** NOTYET **
+ *	args->oif	Outgoing interface, or NULL if packet is incoming.
+ *		The incoming interface is in the mbuf. (in)
+ *	args->divert_rule (in/out)
+ *		Skip up to the first rule past this rule number;
+ *		upon return, non-zero port number for divert or tee.
+ *
+ *	args->rule	Pointer to the last matching rule (in/out)
+ *	args->next_hop	Socket we are forwarding to (out).
+ *	args->f_id	Addresses grabbed from the packet (out)
+ * 	args->cookie	a cookie depending on rule action
+ *
+ * Return value:
+ *
+ *	IP_FW_PASS	the packet must be accepted
+ *	IP_FW_DENY	the packet must be dropped
+ *	IP_FW_DIVERT	divert packet, port in m_tag
+ *	IP_FW_TEE	tee packet, port in m_tag
+ *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
+ *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
+ *	IP_FW_NGTEE	as IP_FW_NETGRAPH, returned for the O_NGTEE action
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+	INIT_VNET_INET(curvnet);
+	INIT_VNET_IPFW(curvnet);
+
+	/*
+	 * Local variables holding state during the processing of a packet:
+	 *
+	 * IMPORTANT NOTE: to speed up the processing of rules, there
+	 * are some assumptions on the values of the variables, which
+	 * are documented here. Should you change them, please check
+	 * the implementation of the various instructions to make sure
+	 * that they still work.
+	 *
+	 * args->eh	The MAC header. It is non-null for a layer2
+	 *	packet, it is NULL for a layer-3 packet.
+	 * **notyet**
+	 * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+	 *
+	 * m | args->m	Pointer to the mbuf, as received from the caller.
+	 *	It may change if ipfw_chk() does an m_pullup, or if it
+	 *	consumes the packet because it calls send_reject().
+	 *	XXX This has to change, so that ipfw_chk() never modifies
+	 *	or consumes the buffer.
+	 * ip	is the beginning of the ip(4 or 6) header.
+	 *	Calculated by adding the L3offset to the start of data.
+	 *	(Until we start using L3offset, the packet is
+	 *	supposed to start with the ip header).
+	 */
+	struct mbuf *m = args->m;
+	struct ip *ip = mtod(m, struct ip *);
+
+	/*
+	 * For rules which contain uid/gid or jail constraints, cache
+	 * a copy of the users credentials after the pcb lookup has been
+	 * executed. This will speed up the processing of rules with
+	 * these types of constraints, as well as decrease contention
+	 * on pcb related locks.
+	 */
+	struct ip_fw_ugid fw_ugid_cache;
+	int ugid_lookup = 0;
+
+	/*
+	 * divinput_flags	If non-zero, set to the IP_FW_DIVERT_*_FLAG
+	 *	associated with a packet input on a divert socket.  This
+	 *	will allow to distinguish traffic and its direction when
+	 *	it originates from a divert socket.
+	 */
+	u_int divinput_flags = 0;
+
+	/*
+	 * oif | args->oif	If NULL, ipfw_chk has been called on the
+	 *	inbound path (ether_input, ip_input).
+	 *	If non-NULL, ipfw_chk has been called on the outbound path
+	 *	(ether_output, ip_output).
+	 */
+	struct ifnet *oif = args->oif;
+
+	struct ip_fw *f = NULL;		/* matching rule */
+	int retval = 0;
+
+	/*
+	 * hlen	The length of the IP header.
+	 */
+	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
+
+	/*
+	 * offset	The offset of a fragment. offset != 0 means that
+	 *	we have a fragment at this offset of an IPv4 packet.
+	 *	offset == 0 means that (if this is an IPv4 packet)
+	 *	this is the first or only fragment.
+	 *	For IPv6 offset == 0 means there is no Fragment Header. 
+	 *	If offset != 0 for IPv6 always use correct mask to
+	 *	get the correct offset because we add IP6F_MORE_FRAG
+	 *	to be able to detect the first fragment which would
+	 *	otherwise have offset = 0.
+	 */
+	u_short offset = 0;
+
+	/*
+	 * Local copies of addresses. They are only valid if we have
+	 * an IP packet.
+	 *
+	 * proto	The protocol. Set to 0 for non-ip packets,
+	 *	or to the protocol read from the packet otherwise.
+	 *	proto != 0 means that we have an IP (v4 or v6) packet.
+	 *
+	 * src_port, dst_port	port numbers, in HOST format. Only
+	 *	valid for TCP and UDP packets.
+	 *
+	 * src_ip, dst_ip	ip addresses, in NETWORK format.
+	 *	Only valid for IPv4 packets.
+	 */
+	u_int8_t proto;
+	u_int16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
+	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
+	u_int16_t ip_len=0;
+	int pktlen;
+	u_int16_t	etype = 0;	/* Host order stored ether type */
+
+	/*
+	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+	 * 	MATCH_NONE when checked and not matched (q = NULL),
+	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+	 */
+	int dyn_dir = MATCH_UNKNOWN;
+	ipfw_dyn_rule *q = NULL;
+	struct ip_fw_chain *chain = &V_layer3_chain;
+	struct m_tag *mtag;
+
+	/*
+	 * We store in ulp a pointer to the upper layer protocol header.
+	 * In the ipv4 case this is easy to determine from the header,
+	 * but for ipv6 we might have some additional headers in the middle.
+	 * ulp is NULL if not found.
+	 */
+	void *ulp = NULL;		/* upper layer protocol pointer. */
+	/* XXX ipv6 variables */
+	int is_ipv6 = 0;
+	u_int16_t ext_hd = 0;	/* bits vector for extension header filtering */
+	/* end of ipv6 variables */
+	int is_ipv4 = 0;
+
+	if (m->m_flags & M_SKIP_FIREWALL)
+		return (IP_FW_PASS);	/* accept */
+
+	dst_ip.s_addr = 0;		/* make sure it is initialized */
+	pktlen = m->m_pkthdr.len;
+	args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
+	proto = args->f_id.proto = 0;	/* mark f_id invalid */
+		/* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(len, p, T)						\
+do {									\
+	int x = (len) + sizeof(T);					\
+	if ((m)->m_len < x) {						\
+		args->m = m = m_pullup(m, x);				\
+		if (m == NULL)						\
+			goto pullup_failed;				\
+	}								\
+	p = (mtod(m, char *) + (len));					\
+} while (0)
+
+	/*
+	 * if we have an ether header,
+	 */
+	if (args->eh)
+		etype = ntohs(args->eh->ether_type);
+
+	/* Identify IP packets and fill up variables. */
+	if (pktlen >= sizeof(struct ip6_hdr) &&
+	    (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+		is_ipv6 = 1;
+		args->f_id.addr_type = 6;
+		hlen = sizeof(struct ip6_hdr);
+		proto = ip6->ip6_nxt;
+
+		/* Search extension headers to find upper layer protocols */
+		while (ulp == NULL) {
+			switch (proto) {
+			case IPPROTO_ICMPV6:
+				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+				args->f_id.flags = ICMP6(ulp)->icmp6_type;
+				break;
+
+			case IPPROTO_TCP:
+				PULLUP_TO(hlen, ulp, struct tcphdr);
+				dst_port = TCP(ulp)->th_dport;
+				src_port = TCP(ulp)->th_sport;
+				args->f_id.flags = TCP(ulp)->th_flags;
+				break;
+
+			case IPPROTO_SCTP:
+				PULLUP_TO(hlen, ulp, struct sctphdr);
+				src_port = SCTP(ulp)->src_port;
+				dst_port = SCTP(ulp)->dest_port;
+				break;
+
+			case IPPROTO_UDP:
+				PULLUP_TO(hlen, ulp, struct udphdr);
+				dst_port = UDP(ulp)->uh_dport;
+				src_port = UDP(ulp)->uh_sport;
+				break;
+
+			case IPPROTO_HOPOPTS:	/* RFC 2460 */
+				PULLUP_TO(hlen, ulp, struct ip6_hbh);
+				ext_hd |= EXT_HOPOPTS;
+				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+				ulp = NULL;
+				break;
+
+			case IPPROTO_ROUTING:	/* RFC 2460 */
+				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+				case 0:
+					ext_hd |= EXT_RTHDR0;
+					break;
+				case 2:
+					ext_hd |= EXT_RTHDR2;
+					break;
+				default:
+					printf("IPFW2: IPV6 - Unknown Routing "
+					    "Header type(%d)\n",
+					    ((struct ip6_rthdr *)ulp)->ip6r_type);
+					if (V_fw_deny_unknown_exthdrs)
+					    return (IP_FW_DENY);
+					break;
+				}
+				ext_hd |= EXT_ROUTING;
+				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+				ulp = NULL;
+				break;
+
+			case IPPROTO_FRAGMENT:	/* RFC 2460 */
+				PULLUP_TO(hlen, ulp, struct ip6_frag);
+				ext_hd |= EXT_FRAGMENT;
+				hlen += sizeof (struct ip6_frag);
+				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+					IP6F_OFF_MASK;
+				/* Add IP6F_MORE_FRAG for offset of first
+				 * fragment to be != 0. */
+				offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
+					IP6F_MORE_FRAG;
+				if (offset == 0) {
+					printf("IPFW2: IPV6 - Invalid Fragment "
+					    "Header\n");
+					if (V_fw_deny_unknown_exthdrs)
+					    return (IP_FW_DENY);
+					break;
+				}
+				args->f_id.frag_id6 =
+				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+				ulp = NULL;
+				break;
+
+			case IPPROTO_DSTOPTS:	/* RFC 2460 */
+				PULLUP_TO(hlen, ulp, struct ip6_hbh);
+				ext_hd |= EXT_DSTOPTS;
+				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+				ulp = NULL;
+				break;
+
+			case IPPROTO_AH:	/* RFC 2402 */
+				PULLUP_TO(hlen, ulp, struct ip6_ext);
+				ext_hd |= EXT_AH;
+				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+				ulp = NULL;
+				break;
+
+			case IPPROTO_ESP:	/* RFC 2406 */
+				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
+				/* Anything past Seq# is variable length and
+				 * data past this ext. header is encrypted. */
+				ext_hd |= EXT_ESP;
+				break;
+
+			case IPPROTO_NONE:	/* RFC 2460 */
+				/*
+				 * Packet ends here, and IPv6 header has
+				 * already been pulled up. If ip6e_len!=0
+				 * then octets must be ignored.
+				 */
+				ulp = ip; /* non-NULL to get out of loop. */
+				break;
+
+			case IPPROTO_OSPFIGP:
+				/* XXX OSPF header check? */
+				PULLUP_TO(hlen, ulp, struct ip6_ext);
+				break;
+
+			case IPPROTO_PIM:
+				/* XXX PIM header check? */
+				PULLUP_TO(hlen, ulp, struct pim);
+				break;
+
+			case IPPROTO_CARP:
+				PULLUP_TO(hlen, ulp, struct carp_header);
+				if (((struct carp_header *)ulp)->carp_version !=
+				    CARP_VERSION) 
+					return (IP_FW_DENY);
+				if (((struct carp_header *)ulp)->carp_type !=
+				    CARP_ADVERTISEMENT) 
+					return (IP_FW_DENY);
+				break;
+
+			case IPPROTO_IPV6:	/* RFC 2893 */
+				PULLUP_TO(hlen, ulp, struct ip6_hdr);
+				break;
+
+			case IPPROTO_IPV4:	/* RFC 2893 */
+				PULLUP_TO(hlen, ulp, struct ip);
+				break;
+
+			default:
+				printf("IPFW2: IPV6 - Unknown Extension "
+				    "Header(%d), ext_hd=%x\n", proto, ext_hd);
+				if (V_fw_deny_unknown_exthdrs)
+				    return (IP_FW_DENY);
+				PULLUP_TO(hlen, ulp, struct ip6_ext);
+				break;
+			} /*switch */
+		}
+		ip = mtod(m, struct ip *);
+		ip6 = (struct ip6_hdr *)ip;
+		args->f_id.src_ip6 = ip6->ip6_src;
+		args->f_id.dst_ip6 = ip6->ip6_dst;
+		args->f_id.src_ip = 0;
+		args->f_id.dst_ip = 0;
+		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+	} else if (pktlen >= sizeof(struct ip) &&
+	    (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+	    	is_ipv4 = 1;
+		hlen = ip->ip_hl << 2;
+		args->f_id.addr_type = 4;
+
+		/*
+		 * Collect parameters into local variables for faster matching.
+		 */
+		proto = ip->ip_p;
+		src_ip = ip->ip_src;
+		dst_ip = ip->ip_dst;
+		if (args->eh != NULL) { /* layer 2 packets are as on the wire */
+			offset = ntohs(ip->ip_off) & IP_OFFMASK;
+			ip_len = ntohs(ip->ip_len);
+		} else {
+			offset = ip->ip_off & IP_OFFMASK;
+			ip_len = ip->ip_len;
+		}
+		pktlen = ip_len < pktlen ? ip_len : pktlen;
+
+		if (offset == 0) {
+			switch (proto) {
+			case IPPROTO_TCP:
+				PULLUP_TO(hlen, ulp, struct tcphdr);
+				dst_port = TCP(ulp)->th_dport;
+				src_port = TCP(ulp)->th_sport;
+				args->f_id.flags = TCP(ulp)->th_flags;
+				break;
+
+			case IPPROTO_UDP:
+				PULLUP_TO(hlen, ulp, struct udphdr);
+				dst_port = UDP(ulp)->uh_dport;
+				src_port = UDP(ulp)->uh_sport;
+				break;
+
+			case IPPROTO_ICMP:
+				PULLUP_TO(hlen, ulp, struct icmphdr);
+				args->f_id.flags = ICMP(ulp)->icmp_type;
+				break;
+
+			default:
+				break;
+			}
+		}
+
+		ip = mtod(m, struct ip *);
+		args->f_id.src_ip = ntohl(src_ip.s_addr);
+		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+	}
+#undef PULLUP_TO
+	if (proto) { /* we may have port numbers, store them */
+		args->f_id.proto = proto;
+		args->f_id.src_port = src_port = ntohs(src_port);
+		args->f_id.dst_port = dst_port = ntohs(dst_port);
+	}
+
+	IPFW_RLOCK(chain);
+	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
+	if (args->rule) {
+		/*
+		 * Packet has already been tagged. Look for the next rule
+		 * to restart processing.
+		 */
+		f = args->rule->next_rule;
+		if (f == NULL)
+			f = lookup_next_rule(args->rule, 0);
+	} else {
+		/*
+		 * Find the starting rule. It can be either the first
+		 * one, or the one after divert_rule if asked so.
+		 */
+		int skipto = mtag ? divert_cookie(mtag) : 0;
+
+		f = chain->rules;
+		if (args->eh == NULL && skipto != 0) {
+			if (skipto >= IPFW_DEFAULT_RULE) {
+				IPFW_RUNLOCK(chain);
+				return (IP_FW_DENY); /* invalid */
+			}
+			while (f && f->rulenum <= skipto)
+				f = f->next;
+			if (f == NULL) {	/* drop packet */
+				IPFW_RUNLOCK(chain);
+				return (IP_FW_DENY);
+			}
+		}
+	}
+	/* reset divert rule to avoid confusion later */
+	if (mtag) {
+		divinput_flags = divert_info(mtag) &
+		    (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG);
+		m_tag_delete(m, mtag);
+	}
+
+	/*
+	 * Now scan the rules, and parse microinstructions for each rule.
+	 */
+	for (; f; f = f->next) {
+		ipfw_insn *cmd;
+		uint32_t tablearg = 0;
+		int l, cmdlen, skip_or; /* skip rest of OR block */
+
+again:
+		if (V_set_disable & (1 << f->set) )
+			continue;
+
+		skip_or = 0;
+		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+		    l -= cmdlen, cmd += cmdlen) {
+			int match;
+
+			/*
+			 * check_body is a jump target used when we find a
+			 * CHECK_STATE, and need to jump to the body of
+			 * the target rule.
+			 */
+
+check_body:
+			cmdlen = F_LEN(cmd);
+			/*
+			 * An OR block (insn_1 || .. || insn_n) has the
+			 * F_OR bit set in all but the last instruction.
+			 * The first match will set "skip_or", and cause
+			 * the following instructions to be skipped until
+			 * past the one with the F_OR bit clear.
+			 */
+			if (skip_or) {		/* skip this instruction */
+				if ((cmd->len & F_OR) == 0)
+					skip_or = 0;	/* next one is good */
+				continue;
+			}
+			match = 0; /* set to 1 if we succeed */
+
+			switch (cmd->opcode) {
+			/*
+			 * The first set of opcodes compares the packet's
+			 * fields with some pattern, setting 'match' if a
+			 * match is found. At the end of the loop there is
+			 * logic to deal with F_NOT and F_OR flags associated
+			 * with the opcode.
+			 */
+			case O_NOP:
+				match = 1;
+				break;
+
+			case O_FORWARD_MAC:
+				printf("ipfw: opcode %d unimplemented\n",
+				    cmd->opcode);
+				break;
+
+			case O_GID:
+			case O_UID:
+			case O_JAIL:
+				/*
+				 * We only check offset == 0 && proto != 0,
+				 * as this ensures that we have a
+				 * packet with the ports info.
+				 */
+				if (offset!=0)
+					break;
+				if (is_ipv6) /* XXX to be fixed later */
+					break;
+				if (proto == IPPROTO_TCP ||
+				    proto == IPPROTO_UDP)
+					match = check_uidgid(
+						    (ipfw_insn_u32 *)cmd,
+						    proto, oif,
+						    dst_ip, dst_port,
+						    src_ip, src_port, &fw_ugid_cache,
+						    &ugid_lookup, args->inp);
+				break;
+
+			case O_RECV:
+				match = iface_match(m->m_pkthdr.rcvif,
+				    (ipfw_insn_if *)cmd);
+				break;
+
+			case O_XMIT:
+				match = iface_match(oif, (ipfw_insn_if *)cmd);
+				break;
+
+			case O_VIA:
+				match = iface_match(oif ? oif :
+				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
+				break;
+
+			case O_MACADDR2:
+				if (args->eh != NULL) {	/* have MAC header */
+					u_int32_t *want = (u_int32_t *)
+						((ipfw_insn_mac *)cmd)->addr;
+					u_int32_t *mask = (u_int32_t *)
+						((ipfw_insn_mac *)cmd)->mask;
+					u_int32_t *hdr = (u_int32_t *)args->eh;
+
+					match =
+					    ( want[0] == (hdr[0] & mask[0]) &&
+					      want[1] == (hdr[1] & mask[1]) &&
+					      want[2] == (hdr[2] & mask[2]) );
+				}
+				break;
+
+			case O_MAC_TYPE:
+				if (args->eh != NULL) {
+					u_int16_t *p =
+					    ((ipfw_insn_u16 *)cmd)->ports;
+					int i;
+
+					for (i = cmdlen - 1; !match && i>0;
+					    i--, p += 2)
+						match = (etype >= p[0] &&
+						    etype <= p[1]);
+				}
+				break;
+
+			case O_FRAG:
+				match = (offset != 0);
+				break;
+
+			case O_IN:	/* "out" is "not in" */
+				match = (oif == NULL);
+				break;
+
+			case O_LAYER2:
+				match = (args->eh != NULL);
+				break;
+
+			case O_DIVERTED:
+				match = (cmd->arg1 & 1 && divinput_flags &
+				    IP_FW_DIVERT_LOOPBACK_FLAG) ||
+					(cmd->arg1 & 2 && divinput_flags &
+				    IP_FW_DIVERT_OUTPUT_FLAG);
+				break;
+
+			case O_PROTO:
+				/*
+				 * We do not allow an arg of 0 so the
+				 * check of "proto" only suffices.
+				 */
+				match = (proto == cmd->arg1);
+				break;
+
+			case O_IP_SRC:
+				match = is_ipv4 &&
+				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+				    src_ip.s_addr);
+				break;
+
+			case O_IP_SRC_LOOKUP:
+			case O_IP_DST_LOOKUP:
+				if (is_ipv4) {
+				    uint32_t a =
+					(cmd->opcode == O_IP_DST_LOOKUP) ?
+					    dst_ip.s_addr : src_ip.s_addr;
+				    uint32_t v = 0;
+
+				    match = lookup_table(chain, cmd->arg1, a,
+					&v);
+				    if (!match)
+					break;
+				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+					match =
+					    ((ipfw_insn_u32 *)cmd)->d[0] == v;
+				    else
+					tablearg = v;
+				}
+				break;
+
+			case O_IP_SRC_MASK:
+			case O_IP_DST_MASK:
+				if (is_ipv4) {
+				    uint32_t a =
+					(cmd->opcode == O_IP_DST_MASK) ?
+					    dst_ip.s_addr : src_ip.s_addr;
+				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+				    int i = cmdlen-1;
+
+				    for (; !match && i>0; i-= 2, p+= 2)
+					match = (p[0] == (a & p[1]));
+				}
+				break;
+
+			case O_IP_SRC_ME:
+				if (is_ipv4) {
+					struct ifnet *tif;
+
+					INADDR_TO_IFP(src_ip, tif);
+					match = (tif != NULL);
+				}
+				break;
+
+			case O_IP_DST_SET:
+			case O_IP_SRC_SET:
+				if (is_ipv4) {
+					u_int32_t *d = (u_int32_t *)(cmd+1);
+					u_int32_t addr =
+					    cmd->opcode == O_IP_DST_SET ?
+						args->f_id.dst_ip :
+						args->f_id.src_ip;
+
+					    if (addr < d[0])
+						    break;
+					    addr -= d[0]; /* subtract base */
+					    match = (addr < cmd->arg1) &&
+						( d[ 1 + (addr>>5)] &
+						  (1<<(addr & 0x1f)) );
+				}
+				break;
+
+			case O_IP_DST:
+				match = is_ipv4 &&
+				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+				    dst_ip.s_addr);
+				break;
+
+			case O_IP_DST_ME:
+				if (is_ipv4) {
+					struct ifnet *tif;
+
+					INADDR_TO_IFP(dst_ip, tif);
+					match = (tif != NULL);
+				}
+				break;
+
+			case O_IP_SRCPORT:
+			case O_IP_DSTPORT:
+				/*
+				 * offset == 0 && proto != 0 is enough
+				 * to guarantee that we have a
+				 * packet with port info.
+				 */
+				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+				    && offset == 0) {
+					u_int16_t x =
+					    (cmd->opcode == O_IP_SRCPORT) ?
+						src_port : dst_port ;
+					u_int16_t *p =
+					    ((ipfw_insn_u16 *)cmd)->ports;
+					int i;
+
+					for (i = cmdlen - 1; !match && i>0;
+					    i--, p += 2)
+						match = (x>=p[0] && x<=p[1]);
+				}
+				break;
+
+			case O_ICMPTYPE:
+				match = (offset == 0 && proto==IPPROTO_ICMP &&
+				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+				break;
+
+#ifdef INET6
+			case O_ICMP6TYPE:
+				match = is_ipv6 && offset == 0 &&
+				    proto==IPPROTO_ICMPV6 &&
+				    icmp6type_match(
+					ICMP6(ulp)->icmp6_type,
+					(ipfw_insn_u32 *)cmd);
+				break;
+#endif /* INET6 */
+
+			case O_IPOPT:
+				match = (is_ipv4 &&
+				    ipopts_match(ip, cmd) );
+				break;
+
+			case O_IPVER:
+				match = (is_ipv4 &&
+				    cmd->arg1 == ip->ip_v);
+				break;
+
+			case O_IPID:
+			case O_IPLEN:
+			case O_IPTTL:
+				if (is_ipv4) {	/* only for IP packets */
+				    uint16_t x;
+				    uint16_t *p;
+				    int i;
+
+				    if (cmd->opcode == O_IPLEN)
+					x = ip_len;
+				    else if (cmd->opcode == O_IPTTL)
+					x = ip->ip_ttl;
+				    else /* must be IPID */
+					x = ntohs(ip->ip_id);
+				    if (cmdlen == 1) {
+					match = (cmd->arg1 == x);
+					break;
+				    }
+				    /* otherwise we have ranges */
+				    p = ((ipfw_insn_u16 *)cmd)->ports;
+				    i = cmdlen - 1;
+				    for (; !match && i>0; i--, p += 2)
+					match = (x >= p[0] && x <= p[1]);
+				}
+				break;
+
+			case O_IPPRECEDENCE:
+				match = (is_ipv4 &&
+				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+				break;
+
+			case O_IPTOS:
+				match = (is_ipv4 &&
+				    flags_match(cmd, ip->ip_tos));
+				break;
+
+			case O_TCPDATALEN:
+				if (proto == IPPROTO_TCP && offset == 0) {
+				    struct tcphdr *tcp;
+				    uint16_t x;
+				    uint16_t *p;
+				    int i;
+
+				    tcp = TCP(ulp);
+				    x = ip_len -
+					((ip->ip_hl + tcp->th_off) << 2);
+				    if (cmdlen == 1) {
+					match = (cmd->arg1 == x);
+					break;
+				    }
+				    /* otherwise we have ranges */
+				    p = ((ipfw_insn_u16 *)cmd)->ports;
+				    i = cmdlen - 1;
+				    for (; !match && i>0; i--, p += 2)
+					match = (x >= p[0] && x <= p[1]);
+				}
+				break;
+
+			case O_TCPFLAGS:
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    flags_match(cmd, TCP(ulp)->th_flags));
+				break;
+
+			case O_TCPOPTS:
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    tcpopts_match(TCP(ulp), cmd));
+				break;
+
+			case O_TCPSEQ:
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    ((ipfw_insn_u32 *)cmd)->d[0] ==
+					TCP(ulp)->th_seq);
+				break;
+
+			case O_TCPACK:
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    ((ipfw_insn_u32 *)cmd)->d[0] ==
+					TCP(ulp)->th_ack);
+				break;
+
+			case O_TCPWIN:
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    cmd->arg1 == TCP(ulp)->th_win);
+				break;
+
+			case O_ESTAB:
+				/* reject packets which have SYN only */
+				/* XXX should i also check for TH_ACK ? */
+				match = (proto == IPPROTO_TCP && offset == 0 &&
+				    (TCP(ulp)->th_flags &
+				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+				break;
+
+			case O_ALTQ: {
+				struct pf_mtag *at;
+				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+				match = 1;
+				at = pf_find_mtag(m);
+				if (at != NULL && at->qid != 0)
+					break;
+				at = pf_get_mtag(m);
+				if (at == NULL) {
+					/*
+					 * Let the packet fall back to the
+					 * default ALTQ.
+					 */
+					break;
+				}
+				at->qid = altq->qid;
+				if (is_ipv4)
+					at->af = AF_INET;
+				else
+					at->af = AF_LINK;
+				at->hdr = ip;
+				break;
+			}
+
+			case O_LOG:
+				if (V_fw_verbose)
+					ipfw_log(f, hlen, args, m,
+					    oif, offset, tablearg, ip);
+				match = 1;
+				break;
+
+			case O_PROB:
+				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+				break;
+
+			case O_VERREVPATH:
+				/* Outgoing packets automatically pass/match */
+				match = ((oif != NULL) ||
+				    (m->m_pkthdr.rcvif == NULL) ||
+				    (
+#ifdef INET6
+				    is_ipv6 ?
+					verify_path6(&(args->f_id.src_ip6),
+					    m->m_pkthdr.rcvif) :
+#endif
+				    verify_path(src_ip, m->m_pkthdr.rcvif,
+				        args->f_id.fib)));
+				break;
+
+			case O_VERSRCREACH:
+				/* Outgoing packets automatically pass/match */
+				match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+				    is_ipv6 ?
+				        verify_path6(&(args->f_id.src_ip6),
+				            NULL) :
+#endif
+				    verify_path(src_ip, NULL, args->f_id.fib)));
+				break;
+
+			case O_ANTISPOOF:
+				/* Outgoing packets automatically pass/match */
+				if (oif == NULL && hlen > 0 &&
+				    (  (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+				    || (is_ipv6 &&
+				        in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+				    ))
+					match =
+#ifdef INET6
+					    is_ipv6 ? verify_path6(
+					        &(args->f_id.src_ip6),
+					        m->m_pkthdr.rcvif) :
+#endif
+					    verify_path(src_ip,
+					    	m->m_pkthdr.rcvif,
+					        args->f_id.fib);
+				else
+					match = 1;
+				break;
+
+			case O_IPSEC:
+#ifdef IPSEC
+				match = (m_tag_find(m,
+				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+				/* otherwise no match */
+				break;
+
+#ifdef INET6
+			case O_IP6_SRC:
+				match = is_ipv6 &&
+				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+				    &((ipfw_insn_ip6 *)cmd)->addr6);
+				break;
+
+			case O_IP6_DST:
+				match = is_ipv6 &&
+				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+				    &((ipfw_insn_ip6 *)cmd)->addr6);
+				break;
+			case O_IP6_SRC_MASK:
+			case O_IP6_DST_MASK:
+				if (is_ipv6) {
+					int i = cmdlen - 1;
+					struct in6_addr p;
+					struct in6_addr *d =
+					    &((ipfw_insn_ip6 *)cmd)->addr6;
+
+					for (; !match && i > 0; d += 2,
+					    i -= F_INSN_SIZE(struct in6_addr)
+					    * 2) {
+						p = (cmd->opcode ==
+						    O_IP6_SRC_MASK) ?
+						    args->f_id.src_ip6:
+						    args->f_id.dst_ip6;
+						APPLY_MASK(&p, &d[1]);
+						match =
+						    IN6_ARE_ADDR_EQUAL(&d[0],
+						    &p);
+					}
+				}
+				break;
+
+			case O_IP6_SRC_ME:
+				match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+				break;
+
+			case O_IP6_DST_ME:
+				match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+				break;
+
+			case O_FLOW6ID:
+				match = is_ipv6 &&
+				    flow6id_match(args->f_id.flow_id6,
+				    (ipfw_insn_u32 *) cmd);
+				break;
+
+			case O_EXT_HDR:
+				match = is_ipv6 &&
+				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
+				break;
+
+			case O_IP6:
+				match = is_ipv6;
+				break;
+#endif
+
+			case O_IP4:
+				match = is_ipv4;
+				break;
+
+			case O_TAG: {
+				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+				    tablearg : cmd->arg1;
+
+				/* Packet is already tagged with this tag? */
+				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+				/* We have `untag' action when F_NOT flag is
+				 * present. And we must remove this mtag from
+				 * mbuf and reset `match' to zero (`match' will
+				 * be inverted later).
+				 * Otherwise we should allocate new mtag and
+				 * push it into mbuf.
+				 */
+				if (cmd->len & F_NOT) { /* `untag' action */
+					if (mtag != NULL)
+						m_tag_delete(m, mtag);
+				} else if (mtag == NULL) {
+					if ((mtag = m_tag_alloc(MTAG_IPFW,
+					    tag, 0, M_NOWAIT)) != NULL)
+						m_tag_prepend(m, mtag);
+				}
+				match = (cmd->len & F_NOT) ? 0: 1;
+				break;
+			}
+
+			case O_FIB: /* try match the specified fib */
+				if (args->f_id.fib == cmd->arg1)
+					match = 1;
+				break;
+
+			case O_TAGGED: {
+				uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+				    tablearg : cmd->arg1;
+
+				if (cmdlen == 1) {
+					match = m_tag_locate(m, MTAG_IPFW,
+					    tag, NULL) != NULL;
+					break;
+				}
+
+				/* we have ranges */
+				for (mtag = m_tag_first(m);
+				    mtag != NULL && !match;
+				    mtag = m_tag_next(m, mtag)) {
+					uint16_t *p;
+					int i;
+
+					if (mtag->m_tag_cookie != MTAG_IPFW)
+						continue;
+
+					p = ((ipfw_insn_u16 *)cmd)->ports;
+					i = cmdlen - 1;
+					for(; !match && i > 0; i--, p += 2)
+						match =
+						    mtag->m_tag_id >= p[0] &&
+						    mtag->m_tag_id <= p[1];
+				}
+				break;
+			}
+				
+			/*
+			 * The second set of opcodes represents 'actions',
+			 * i.e. the terminal part of a rule once the packet
+			 * matches all previous patterns.
+			 * Typically there is only one action for each rule,
+			 * and the opcode is stored at the end of the rule
+			 * (but there are exceptions -- see below).
+			 *
+			 * In general, here we set retval and terminate the
+			 * outer loop (would be a 'break 3' in some language,
+			 * but we need to do a 'goto done').
+			 *
+			 * Exceptions:
+			 * O_COUNT and O_SKIPTO actions:
+			 *   instead of terminating, we jump to the next rule
+			 *   ('goto next_rule', equivalent to a 'break 2'),
+			 *   or to the SKIPTO target ('goto again' after
+			 *   having set f, cmd and l), respectively.
+			 *
+			 * O_TAG, O_LOG and O_ALTQ action parameters:
+			 *   perform some action and set match = 1;
+			 *
+			 * O_LIMIT and O_KEEP_STATE: these opcodes are
+			 *   not real 'actions', and are stored right
+			 *   before the 'action' part of the rule.
+			 *   These opcodes try to install an entry in the
+			 *   state tables; if successful, we continue with
+			 *   the next opcode (match=1; break;), otherwise
+			 *   the packet must be dropped
+			 *   ('goto done' after setting retval);
+			 *
+			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+			 *   cause a lookup of the state table, and a jump
+			 *   to the 'action' part of the parent rule
+			 *   ('goto check_body') if an entry is found, or
+			 *   (CHECK_STATE only) a jump to the next rule if
+			 *   the entry is not found ('goto next_rule').
+			 *   The result of the lookup is cached so that
+			 *   further instances of these opcodes are
+			 *   effectively NOPs.
+			 */
+			case O_LIMIT:
+			case O_KEEP_STATE:
+				if (install_state(f,
+				    (ipfw_insn_limit *)cmd, args, tablearg)) {
+					retval = IP_FW_DENY;
+					goto done; /* error/limit violation */
+				}
+				match = 1;
+				break;
+
+			case O_PROBE_STATE:
+			case O_CHECK_STATE:
+				/*
+				 * dynamic rules are checked at the first
+				 * keep-state or check-state occurrence,
+				 * with the result being stored in dyn_dir.
+				 * The compiler introduces a PROBE_STATE
+				 * instruction for us when we have a
+				 * KEEP_STATE (because PROBE_STATE needs
+				 * to be run first).
+				 */
+				if (dyn_dir == MATCH_UNKNOWN &&
+				    (q = lookup_dyn_rule(&args->f_id,
+				     &dyn_dir, proto == IPPROTO_TCP ?
+					TCP(ulp) : NULL))
+					!= NULL) {
+					/*
+					 * Found dynamic entry, update stats
+					 * and jump to the 'action' part of
+					 * the parent rule.
+					 */
+					q->pcnt++;
+					q->bcnt += pktlen;
+					f = q->rule;
+					cmd = ACTION_PTR(f);
+					l = f->cmd_len - f->act_ofs;
+					IPFW_DYN_UNLOCK();
+					goto check_body;
+				}
+				/*
+				 * Dynamic entry not found. If CHECK_STATE,
+				 * skip to next rule, if PROBE_STATE just
+				 * ignore and continue with next opcode.
+				 */
+				if (cmd->opcode == O_CHECK_STATE)
+					goto next_rule;
+				match = 1;
+				break;
+
+			case O_ACCEPT:
+				retval = 0;	/* accept */
+				goto done;
+
+			case O_PIPE:
+			case O_QUEUE:
+				args->rule = f; /* report matching rule */
+				if (cmd->arg1 == IP_FW_TABLEARG)
+					args->cookie = tablearg;
+				else
+					args->cookie = cmd->arg1;
+				retval = IP_FW_DUMMYNET;
+				goto done;
+
+			case O_DIVERT:
+			case O_TEE: {
+				struct divert_tag *dt;
+
+				if (args->eh) /* not on layer 2 */
+					break;
+				mtag = m_tag_get(PACKET_TAG_DIVERT,
+						sizeof(struct divert_tag),
+						M_NOWAIT);
+				if (mtag == NULL) {
+					/* XXX statistic */
+					/* drop packet */
+					IPFW_RUNLOCK(chain);
+					return (IP_FW_DENY);
+				}
+				dt = (struct divert_tag *)(mtag+1);
+				dt->cookie = f->rulenum;
+				if (cmd->arg1 == IP_FW_TABLEARG)
+					dt->info = tablearg;
+				else
+					dt->info = cmd->arg1;
+				m_tag_prepend(m, mtag);
+				retval = (cmd->opcode == O_DIVERT) ?
+				    IP_FW_DIVERT : IP_FW_TEE;
+				goto done;
+			}
+			case O_COUNT:
+			case O_SKIPTO:
+				f->pcnt++;	/* update stats */
+				f->bcnt += pktlen;
+				f->timestamp = time_uptime;
+				if (cmd->opcode == O_COUNT)
+					goto next_rule;
+				/* handle skipto */
+				if (cmd->arg1 == IP_FW_TABLEARG) {
+					f = lookup_next_rule(f, tablearg);
+				} else {
+					if (f->next_rule == NULL)
+						lookup_next_rule(f, 0);
+					f = f->next_rule;
+				}
+				goto again;
+
+			case O_REJECT:
+				/*
+				 * Drop the packet and send a reject notice
+				 * if the packet is not ICMP (or is an ICMP
+				 * query), and it is not multicast/broadcast.
+				 */
+				if (hlen > 0 && is_ipv4 && offset == 0 &&
+				    (proto != IPPROTO_ICMP ||
+				     is_icmp_query(ICMP(ulp))) &&
+				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
+				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+					send_reject(args, cmd->arg1, ip_len, ip);
+					m = args->m;
+				}
+				/* FALLTHROUGH */
+#ifdef INET6
+			case O_UNREACH6:
+				if (hlen > 0 && is_ipv6 &&
+				    ((offset & IP6F_OFF_MASK) == 0) &&
+				    (proto != IPPROTO_ICMPV6 ||
+				     (is_icmp6_query(args->f_id.flags) == 1)) &&
+				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
+				    !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+					send_reject6(
+					    args, cmd->arg1, hlen,
+					    (struct ip6_hdr *)ip);
+					m = args->m;
+				}
+				/* FALLTHROUGH */
+#endif
+			case O_DENY:
+				retval = IP_FW_DENY;
+				goto done;
+
+			case O_FORWARD_IP: {
+				struct sockaddr_in *sa;
+				sa = &(((ipfw_insn_sa *)cmd)->sa);
+				if (args->eh)	/* not valid on layer2 pkts */
+					break;
+				if (!q || dyn_dir == MATCH_FORWARD) {
+					if (sa->sin_addr.s_addr == INADDR_ANY) {
+						bcopy(sa, &args->hopstore,
+							sizeof(*sa));
+						args->hopstore.sin_addr.s_addr =
+						    htonl(tablearg);
+						args->next_hop =
+						    &args->hopstore;
+					} else {
+						args->next_hop = sa;
+					}
+				}
+				retval = IP_FW_PASS;
+			    }
+			    goto done;
+
+			case O_NETGRAPH:
+			case O_NGTEE:
+				args->rule = f;	/* report matching rule */
+				if (cmd->arg1 == IP_FW_TABLEARG)
+					args->cookie = tablearg;
+				else
+					args->cookie = cmd->arg1;
+				retval = (cmd->opcode == O_NETGRAPH) ?
+				    IP_FW_NETGRAPH : IP_FW_NGTEE;
+				goto done;
+
+			case O_SETFIB:
+				f->pcnt++;	/* update stats */
+				f->bcnt += pktlen;
+				f->timestamp = time_uptime;
+				M_SETFIB(m, cmd->arg1);
+				args->f_id.fib = cmd->arg1;
+				goto next_rule;
+
+			case O_NAT: {
+                        	struct cfg_nat *t;
+                        	int nat_id;
+
+ 				if (IPFW_NAT_LOADED) {
+					args->rule = f; /* Report matching rule. */
+					t = ((ipfw_insn_nat *)cmd)->nat;
+					if (t == NULL) {
+						nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+						    tablearg : cmd->arg1;
+						LOOKUP_NAT(V_layer3_chain, nat_id, t);
+						if (t == NULL) {
+							retval = IP_FW_DENY;
+							goto done;
+						}
+						if (cmd->arg1 != IP_FW_TABLEARG)
+							((ipfw_insn_nat *)cmd)->nat = t;
+					}
+					retval = ipfw_nat_ptr(args, t, m);
+				} else
+					retval = IP_FW_DENY;
+				goto done;
+			}
+
+			case O_REASS: {
+				int ip_off;
+
+				f->pcnt++;
+				f->bcnt += pktlen;
+				ip_off = (args->eh != NULL) ? ntohs(ip->ip_off) : ip->ip_off;
+				if (ip_off & (IP_MF | IP_OFFMASK)) {
+					/* 
+					 * ip_reass() expects len & off in host
+					 * byte order: fix them in case we come
+					 * from layer2.
+					 */
+					if (args->eh != NULL) {
+						ip->ip_len = ntohs(ip->ip_len);
+						ip->ip_off = ntohs(ip->ip_off);
+					}
+
+					m = ip_reass(m);
+					args->m = m;
+					
+					/*
+					 * IP header checksum fixup after 
+					 * reassembly and leave header
+					 * in network byte order.
+					 */
+					if (m != NULL) {
+						int hlen;
+					
+						ip = mtod(m, struct ip *);
+						hlen = ip->ip_hl << 2;
+						/* revert len & off for layer2 pkts */
+						if (args->eh != NULL)
+							ip->ip_len = htons(ip->ip_len);
+						ip->ip_sum = 0;
+						if (hlen == sizeof(struct ip))
+							ip->ip_sum = in_cksum_hdr(ip);
+						else
+							ip->ip_sum = in_cksum(m, hlen);
+						retval = IP_FW_REASS;
+						args->rule = f;
+						goto done;
+					} else {
+						retval = IP_FW_DENY;
+						goto done;
+					}
+				}
+				goto next_rule;
+			}
+
+			default:
+				panic("-- unknown opcode %d\n", cmd->opcode);
+			} /* end of switch() on opcodes */
+
+			if (cmd->len & F_NOT)
+				match = !match;
+
+			if (match) {
+				if (cmd->len & F_OR)
+					skip_or = 1;
+			} else {
+				if (!(cmd->len & F_OR)) /* not an OR block, */
+					break;		/* try next rule    */
+			}
+
+		}	/* end of inner for, scan opcodes */
+
+next_rule:;		/* try next rule		*/
+
+	}		/* end of outer for, scan rules */
+	printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+	IPFW_RUNLOCK(chain);
+	return (IP_FW_DENY);
+
+done:
+	/* Update statistics */
+	f->pcnt++;
+	f->bcnt += pktlen;
+	f->timestamp = time_uptime;
+	IPFW_RUNLOCK(chain);
+	return (retval);
+
+pullup_failed:
+	if (V_fw_verbose)
+		printf("ipfw: pullup failed\n");
+	return (IP_FW_DENY);
+}
+
+/*
+ * Forget the cached next_rule pointer of every rule in the chain.
+ * This has to happen whenever a rule is added or deleted; the pointers
+ * are rebuilt lazily while packets are matched.  The chain must be
+ * write-locked by the caller.
+ */
+static void
+flush_rule_ptrs(struct ip_fw_chain *chain)
+{
+	struct ip_fw *r;
+
+	IPFW_WLOCK_ASSERT(chain);
+
+	r = chain->rules;
+	while (r != NULL) {
+		r->next_rule = NULL;
+		r = r->next;
+	}
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ *
+ * Returns 0 on success.  Returns EINVAL if the chain is empty and the rule
+ * is not the default rule, or if no rule already in the chain has a higher
+ * number (so there is no position in the sorted list where the new rule
+ * could be linked).  Returns ENOSPC if the copy cannot be allocated.
+ */
+static int
+add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *rule, *f, *prev;
+	int l = RULESIZE(input_rule);
+
+	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
+		return (EINVAL);
+
+	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
+	if (rule == NULL)
+		return (ENOSPC);
+
+	bcopy(input_rule, rule, l);
+
+	/* The private copy starts unlinked ... */
+	rule->next = NULL;
+	rule->next_rule = NULL;
+
+	/* ... and with clean statistics, whatever the caller passed in. */
+	rule->pcnt = 0;
+	rule->bcnt = 0;
+	rule->timestamp = 0;
+
+	IPFW_WLOCK(chain);
+
+	if (chain->rules == NULL) {	/* default rule */
+		chain->rules = rule;
+		goto done;
+	}
+
+	/*
+	 * If rulenum is 0, find highest numbered rule before the
+	 * default rule, and add autoinc_step
+	 */
+	if (V_autoinc_step < 1)
+		V_autoinc_step = 1;
+	else if (V_autoinc_step > 1000)
+		V_autoinc_step = 1000;
+	if (rule->rulenum == 0) {
+		/*
+		 * locate the highest numbered rule before default
+		 */
+		for (f = chain->rules; f; f = f->next) {
+			if (f->rulenum == IPFW_DEFAULT_RULE)
+				break;
+			rule->rulenum = f->rulenum;
+		}
+		/* Only step forward if that stays below the default rule. */
+		if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+			rule->rulenum += V_autoinc_step;
+		input_rule->rulenum = rule->rulenum;
+	}
+
+	/*
+	 * Now insert the new rule in the right place in the sorted list.
+	 */
+	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
+		if (f->rulenum > rule->rulenum) { /* found the location */
+			if (prev) {
+				rule->next = f;
+				prev->next = rule;
+			} else { /* head insert */
+				rule->next = chain->rules;
+				chain->rules = rule;
+			}
+			break;
+		}
+	}
+	if (f == NULL) {
+		/*
+		 * The scan ran off the end of the list: no existing rule
+		 * has a higher number, so the copy was never linked.
+		 * Fail here rather than leak the allocation and count a
+		 * rule that is not in the chain.
+		 */
+		IPFW_WUNLOCK(chain);
+		free(rule, M_IPFW);
+		return (EINVAL);
+	}
+	flush_rule_ptrs(chain);
+done:
+	V_static_count++;
+	V_static_len += l;
+	IPFW_WUNLOCK(chain);
+	DEB(printf("ipfw: installed rule %d, static count now %d\n",
+		rule->rulenum, V_static_count);)
+	return (0);
+}
+
+/**
+ * Unlink one static rule from the chain, discard the dynamic rules
+ * derived from it, and park it on the chain's ``reap list'' so that its
+ * storage can be reclaimed later.
+ * Clearing cached rule pointers is the caller's job, otherwise they
+ * would dangle.
+ * prev is the rule preceding `rule' in the list, or NULL if `rule' is
+ * the head.  Nothing is validated here, so the arguments must be right.
+ * @return a pointer to the entry that followed the removed rule.
+ */
+static struct ip_fw *
+remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
+    struct ip_fw *prev)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *successor;
+	int rule_bytes;
+
+	IPFW_WLOCK_ASSERT(chain);
+
+	rule_bytes = RULESIZE(rule);
+	successor = rule->next;
+
+	IPFW_DYN_LOCK();
+	remove_dyn_rule(rule, NULL /* force removal */);
+	IPFW_DYN_UNLOCK();
+
+	/* Take the rule out of the active list. */
+	if (prev != NULL)
+		prev->next = successor;
+	else
+		chain->rules = successor;
+
+	V_static_count--;
+	V_static_len -= rule_bytes;
+
+	/* Push it on the head of the reap list. */
+	rule->next = chain->reap;
+	chain->reap = rule;
+
+	return (successor);
+}
+
+/*
+ * Hook for cleaning up dummynet when an ipfw rule is deleted.
+ * Set/cleared when the dummynet module is loaded/unloaded.
+ * While non-NULL, reap_rules() calls it with each rule just before
+ * the rule is freed.
+ */
+void   (*ip_dn_ruledel_ptr)(void *) = NULL;
+
+/**
+ * Release the storage of every rule on a list, typically the reap list
+ * built up by remove_rule.  If the dummynet hook is installed it is
+ * given each rule before the rule is freed.
+ */
+static void
+reap_rules(struct ip_fw *head)
+{
+	struct ip_fw *victim, *rest;
+
+	for (victim = head; victim != NULL; victim = rest) {
+		rest = victim->next;	/* read the link before freeing */
+		if (ip_dn_ruledel_ptr)
+			ip_dn_ruledel_ptr(victim);
+		free(victim, M_IPFW);
+	}
+}
+
+/*
+ * Empty a chain.  Rules belonging to set RESVD_SET survive unless
+ * kill_default is non-zero.  Removed rules end up on chain->reap; the
+ * caller is responsible for reclaiming their storage.
+ */
+static void
+free_chain(struct ip_fw_chain *chain, int kill_default)
+{
+	struct ip_fw *cur, *kept;
+
+	IPFW_WLOCK_ASSERT(chain);
+
+	/* Cheaper to invalidate the cached pointers once, up front. */
+	flush_rule_ptrs(chain);
+
+	kept = NULL;
+	cur = chain->rules;
+	while (cur != NULL) {
+		if (!kill_default && cur->set == RESVD_SET) {
+			/* Reserved rule stays; it becomes the new "prev". */
+			kept = cur;
+			cur = cur->next;
+			continue;
+		}
+		cur = remove_rule(chain, cur, kept);
+	}
+}
+
+/**
+ * Remove all rules with given number, and also do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is an u_int32_t. The low 16 bit are the rule or set number,
+ * the next 8 bits are the new set, the top 8 bits are the command:
+ *
+ *	0	delete rules with given number
+ *	1	delete rules with given set number
+ *	2	move rules with given number to new set
+ *	3	move rules with given set number to new set
+ *	4	swap sets with given numbers
+ *	5	delete rules with given number and with given set number
+ *
+ * Returns 0 on success.  Returns EINVAL if the command is unknown, if a
+ * rule or set number is out of range, or (commands 0 and 5) if no rule
+ * carries the requested number.
+ *
+ * The chain write lock is taken and released here.  Deleted rules are
+ * collected on chain->reap and freed after the lock has been dropped.
+ */
+static int
+del_entry(struct ip_fw_chain *chain, u_int32_t arg)
+{
+	struct ip_fw *prev = NULL, *rule;
+	u_int16_t rulenum;	/* rule or old_set */
+	u_int8_t cmd, new_set;
+
+	rulenum = arg & 0xffff;
+	cmd = (arg >> 24) & 0xff;
+	new_set = (arg >> 16) & 0xff;
+
+	if (cmd > 5 || new_set > RESVD_SET)
+		return EINVAL;
+	if (cmd == 0 || cmd == 2 || cmd == 5) {
+		if (rulenum >= IPFW_DEFAULT_RULE)	/* rule number */
+			return EINVAL;
+	} else {
+		if (rulenum > RESVD_SET)	/* old_set */
+			return EINVAL;
+	}
+
+	IPFW_WLOCK(chain);
+	rule = chain->rules;
+	chain->reap = NULL;	/* start with an empty reap list */
+	switch (cmd) {
+	case 0:	/* delete rules with given number */
+		/*
+		 * locate first rule to delete
+		 * The scan has no NULL check: rulenum was verified to be
+		 * below IPFW_DEFAULT_RULE, and the list is presumably always
+		 * terminated by the default rule, which stops the loop.
+		 */
+		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
+			;
+		if (rule->rulenum != rulenum) {
+			IPFW_WUNLOCK(chain);
+			return EINVAL;
+		}
+
+		/*
+		 * flush pointers outside the loop, then delete all matching
+		 * rules. prev remains the same throughout the cycle.
+		 */
+		flush_rule_ptrs(chain);
+		while (rule->rulenum == rulenum)
+			rule = remove_rule(chain, rule, prev);
+		break;
+
+	case 1:	/* delete all rules with given set number */
+		flush_rule_ptrs(chain);
+		rule = chain->rules;
+		while (rule->rulenum < IPFW_DEFAULT_RULE)
+			if (rule->set == rulenum)	/* rulenum is the set here */
+				rule = remove_rule(chain, rule, prev);
+			else {
+				prev = rule;
+				rule = rule->next;
+			}
+		break;
+
+	case 2:	/* move rules with given number to new set */
+		rule = chain->rules;
+		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+			if (rule->rulenum == rulenum)
+				rule->set = new_set;
+		break;
+
+	case 3: /* move rules with given set number to new set */
+		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+			if (rule->set == rulenum)
+				rule->set = new_set;
+		break;
+
+	case 4: /* swap two sets */
+		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
+			if (rule->set == rulenum)
+				rule->set = new_set;
+			else if (rule->set == new_set)
+				rule->set = rulenum;
+		break;
+	case 5: /* delete rules with given number and with given set number.
+		 * rulenum - given rule number;
+		 * new_set - given set number.
+		 * Unlike command 0, prev advances past rules that are kept.
+		 */
+		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
+			;
+		if (rule->rulenum != rulenum) {
+			IPFW_WUNLOCK(chain);
+			return (EINVAL);
+		}
+		flush_rule_ptrs(chain);
+		while (rule->rulenum == rulenum) {
+			if (rule->set == new_set)
+				rule = remove_rule(chain, rule, prev);
+			else {
+				prev = rule;
+				rule = rule->next;
+			}
+		}
+	}
+	/*
+	 * Look for rules to reclaim.  We grab the list before
+	 * releasing the lock then reclaim them w/o the lock to
+	 * avoid a LOR with dummynet.
+	 */
+	rule = chain->reap;
+	chain->reap = NULL;
+	IPFW_WUNLOCK(chain);
+	if (rule)
+		reap_rules(rule);
+	return 0;
+}
+
+/*
+ * Reset the counters of a single rule.
+ * Unless log_only is set, the packet/byte counters and the timestamp
+ * are zeroed.  In either case, if the instruction at the rule's action
+ * pointer is O_LOG, its remaining-log budget is restored to max_log.
+ * The enclosing "table" is assumed locked.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+	ipfw_insn_log *logcmd;
+
+	if (!log_only) {
+		rule->pcnt = 0;
+		rule->bcnt = 0;
+		rule->timestamp = 0;
+	}
+
+	logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
+	if (logcmd->o.opcode == O_LOG)
+		logcmd->log_left = logcmd->max_log;
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * `arg' packs three fields: bits 0-15 hold the rule number, bits 16-23
+ * the set number and bits 24-31 the command:
+ *	0	act on rules from every set;
+ *	1	act only on rules in the given set.
+ * A rule number of zero means "all rules".
+ * log_only is 1 to reset only the logging counts, zero to reset
+ * everything.
+ * Returns 0 on success, EINVAL for a bad command or set, or when a
+ * specific rule number was requested and no rule carries it.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *r;
+	char *what;
+	uint16_t rulenum;
+	uint8_t set, cmd;
+
+	rulenum = arg & 0xffff;
+	set = (arg >> 16) & 0xff;
+	cmd = (arg >> 24) & 0xff;
+
+	if (cmd > 1 || (cmd == 1 && set > RESVD_SET))
+		return (EINVAL);
+
+	IPFW_WLOCK(chain);
+	if (rulenum == 0) {
+		V_norule_counter = 0;
+		for (r = chain->rules; r != NULL; r = r->next) {
+			/* cmd 1 restricts the sweep to one set. */
+			if (cmd == 0 || r->set == set)
+				clear_counters(r, log_only);
+		}
+		if (log_only)
+			what = "All logging counts reset";
+		else
+			what = "Accounting cleared";
+	} else {
+		/* Find the first rule carrying the requested number. */
+		r = chain->rules;
+		while (r != NULL && r->rulenum != rulenum)
+			r = r->next;
+		if (r == NULL) {	/* no rule has that number */
+			IPFW_WUNLOCK(chain);
+			return (EINVAL);
+		}
+		/*
+		 * Several rules may share one number; they sit next to
+		 * each other in the list, so walk the whole run.
+		 */
+		for (; r != NULL && r->rulenum == rulenum; r = r->next) {
+			if (cmd == 0 || r->set == set)
+				clear_counters(r, log_only);
+		}
+		if (log_only)
+			what = "logging count reset";
+		else
+			what = "cleared";
+	}
+	IPFW_WUNLOCK(chain);
+
+	if (V_fw_verbose) {
+		int lev = LOG_SECURITY | LOG_NOTICE;
+
+		if (rulenum != 0)
+			log(lev, "ipfw: Entry %d %s.\n", rulenum, what);
+		else
+			log(lev, "ipfw: %s.\n", what);
+	}
+	return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Fortunately rules are simple, so this mostly needs to check rule sizes.
+ *
+ * rule is the candidate rule and size is the number of bytes supplied
+ * for it.  The function verifies that size equals RULESIZE(rule), that
+ * act_ofs lies inside the instruction array, that every instruction has
+ * a length acceptable for its opcode, and that exactly one action opcode
+ * is present and is the last instruction of the rule.
+ * As a side effect, each O_LOG instruction has log_left reset to max_log.
+ *
+ * Returns 0 if the rule is acceptable and EINVAL otherwise; IPv6 opcodes
+ * yield EPROTONOSUPPORT when the kernel is built without INET6.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+	int l, cmdlen = 0;
+	int have_action=0;
+	ipfw_insn *cmd;
+
+	if (size < sizeof(*rule)) {
+		printf("ipfw: rule too short\n");
+		return (EINVAL);
+	}
+	/* first, check for valid size */
+	l = RULESIZE(rule);
+	if (l != size) {
+		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+		return (EINVAL);
+	}
+	if (rule->act_ofs >= rule->cmd_len) {
+		printf("ipfw: bogus action offset (%u > %u)\n",
+		    rule->act_ofs, rule->cmd_len - 1);
+		return (EINVAL);
+	}
+	/*
+	 * Now go for the individual checks. Very simple ones, basically only
+	 * instruction sizes.
+	 * l counts the instruction words still to be examined and cmdlen is
+	 * the length of the current instruction, both in the same units.
+	 */
+	for (l = rule->cmd_len, cmd = rule->cmd ;
+			l > 0 ; l -= cmdlen, cmd += cmdlen) {
+		cmdlen = F_LEN(cmd);
+		if (cmdlen > l) {	/* would run past the end of the rule */
+			printf("ipfw: opcode %d size truncated\n",
+			    cmd->opcode);
+			return EINVAL;
+		}
+		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
+		switch (cmd->opcode) {
+		case O_PROBE_STATE:
+		case O_KEEP_STATE:
+		case O_PROTO:
+		case O_IP_SRC_ME:
+		case O_IP_DST_ME:
+		case O_LAYER2:
+		case O_IN:
+		case O_FRAG:
+		case O_DIVERTED:
+		case O_IPOPT:
+		case O_IPTOS:
+		case O_IPPRECEDENCE:
+		case O_IPVER:
+		case O_TCPWIN:
+		case O_TCPFLAGS:
+		case O_TCPOPTS:
+		case O_ESTAB:
+		case O_VERREVPATH:
+		case O_VERSRCREACH:
+		case O_ANTISPOOF:
+		case O_IPSEC:
+#ifdef INET6
+		case O_IP6_SRC_ME:
+		case O_IP6_DST_ME:
+		case O_EXT_HDR:
+		case O_IP6:
+#endif
+		case O_IP4:
+		case O_TAG:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+			break;
+
+		case O_FIB:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+			if (cmd->arg1 >= rt_numfibs) {
+				printf("ipfw: invalid fib number %d\n",
+					cmd->arg1);
+				return EINVAL;
+			}
+			break;
+
+		case O_SETFIB:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+			if (cmd->arg1 >= rt_numfibs) {
+				printf("ipfw: invalid fib number %d\n",
+					cmd->arg1);
+				return EINVAL;
+			}
+			goto check_action;	/* O_SETFIB is an action */
+
+		case O_UID:
+		case O_GID:
+		case O_JAIL:
+		case O_IP_SRC:
+		case O_IP_DST:
+		case O_TCPSEQ:
+		case O_TCPACK:
+		case O_PROB:
+		case O_ICMPTYPE:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+				goto bad_size;
+			break;
+
+		case O_LIMIT:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+				goto bad_size;
+			break;
+
+		case O_LOG:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+				goto bad_size;
+
+			((ipfw_insn_log *)cmd)->log_left =
+			    ((ipfw_insn_log *)cmd)->max_log;
+
+			break;
+
+		case O_IP_SRC_MASK:
+		case O_IP_DST_MASK:
+			/* only odd command lengths */
+			if ( !(cmdlen & 1) || cmdlen > 31)
+				goto bad_size;
+			break;
+
+		case O_IP_SRC_SET:
+		case O_IP_DST_SET:
+			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+				printf("ipfw: invalid set size %d\n",
+					cmd->arg1);
+				return EINVAL;
+			}
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+			    (cmd->arg1+31)/32 )	/* one 32-bit word per 32 entries */
+				goto bad_size;
+			break;
+
+		case O_IP_SRC_LOOKUP:
+		case O_IP_DST_LOOKUP:
+			if (cmd->arg1 >= IPFW_TABLES_MAX) {
+				printf("ipfw: invalid table number %d\n",
+				    cmd->arg1);
+				return (EINVAL);
+			}
+			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+				goto bad_size;
+			break;
+
+		case O_MACADDR2:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+				goto bad_size;
+			break;
+
+		case O_NOP:
+		case O_IPID:
+		case O_IPTTL:
+		case O_IPLEN:
+		case O_TCPDATALEN:
+		case O_TAGGED:
+			if (cmdlen < 1 || cmdlen > 31)
+				goto bad_size;
+			break;
+
+		case O_MAC_TYPE:
+		case O_IP_SRCPORT:
+		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+			if (cmdlen < 2 || cmdlen > 31)
+				goto bad_size;
+			break;
+
+		case O_RECV:
+		case O_XMIT:
+		case O_VIA:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+				goto bad_size;
+			break;
+
+		case O_ALTQ:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+				goto bad_size;
+			break;
+
+		case O_PIPE:
+		case O_QUEUE:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+			goto check_action;
+
+		case O_FORWARD_IP:
+#ifdef	IPFIREWALL_FORWARD
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+				goto bad_size;
+			goto check_action;
+#else
+			return EINVAL;
+#endif
+
+		case O_DIVERT:
+		case O_TEE:
+			if (ip_divert_ptr == NULL)	/* divert hook not set */
+				return EINVAL;
+			else
+				goto check_size;
+		case O_NETGRAPH:
+		case O_NGTEE:
+			if (!NG_IPFW_LOADED)
+				return EINVAL;
+			else
+				goto check_size;
+		case O_NAT:
+			if (!IPFW_NAT_LOADED)
+				return EINVAL;
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+ 				goto bad_size;		
+ 			goto check_action;
+		case O_FORWARD_MAC: /* XXX not implemented yet */
+		case O_CHECK_STATE:
+		case O_COUNT:
+		case O_ACCEPT:
+		case O_DENY:
+		case O_REJECT:
+#ifdef INET6
+		case O_UNREACH6:
+#endif
+		case O_SKIPTO:
+		case O_REASS:
+check_size:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+check_action:
+			if (have_action) {
+				printf("ipfw: opcode %d, multiple actions"
+					" not allowed\n",
+					cmd->opcode);
+				return EINVAL;
+			}
+			have_action = 1;
+			if (l != cmdlen) {	/* something follows the action */
+				printf("ipfw: opcode %d, action must be"
+					" last opcode\n",
+					cmd->opcode);
+				return EINVAL;
+			}
+			break;
+#ifdef INET6
+		case O_IP6_SRC:
+		case O_IP6_DST:
+			if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+			    F_INSN_SIZE(ipfw_insn))
+				goto bad_size;
+			break;
+
+		case O_FLOW6ID:
+			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+			    ((ipfw_insn_u32 *)cmd)->o.arg1)
+				goto bad_size;
+			break;
+
+		case O_IP6_SRC_MASK:
+		case O_IP6_DST_MASK:
+			if ( !(cmdlen & 1) || cmdlen > 127)
+				goto bad_size;
+			break;
+		case O_ICMP6TYPE:
+			if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+				goto bad_size;
+			break;
+#endif
+
+		default:
+			switch (cmd->opcode) {
+#ifndef INET6
+			case O_IP6_SRC_ME:
+			case O_IP6_DST_ME:
+			case O_EXT_HDR:
+			case O_IP6:
+			case O_UNREACH6:
+			case O_IP6_SRC:
+			case O_IP6_DST:
+			case O_FLOW6ID:
+			case O_IP6_SRC_MASK:
+			case O_IP6_DST_MASK:
+			case O_ICMP6TYPE:
+				printf("ipfw: no IPv6 support in kernel\n");
+				return EPROTONOSUPPORT;
+#endif
+			default:
+				printf("ipfw: opcode %d, unknown opcode\n",
+					cmd->opcode);
+				return EINVAL;
+			}
+		}
+	}
+	if (have_action == 0) {
+		printf("ipfw: missing action\n");
+		return EINVAL;
+	}
+	return 0;
+
+bad_size:
+	printf("ipfw: opcode %d size %d wrong\n",
+		cmd->opcode, cmdlen);
+	return EINVAL;
+}
+
+/*
+ * Serialize the static rules, followed by the dynamic rules, into the
+ * caller's buffer.  Entries that no longer fit in `space' bytes are
+ * silently left out.  Returns the number of bytes actually written.
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+	INIT_VNET_IPFW(curvnet);
+	char *out = buf;
+	char *limit = out + space;
+	struct ip_fw *rule, *copy;
+	ipfw_dyn_rule *dyn, *dcopy, *last;
+	int i;
+	time_t	boot_seconds;
+
+	boot_seconds = boottime.tv_sec;
+	/* XXX this can take a long time and locking will block packet flow */
+	IPFW_RLOCK(chain);
+	for (rule = chain->rules; rule != NULL; rule = rule->next) {
+		/*
+		 * The rule set may have changed since the caller sized
+		 * the buffer, so re-check that each entry fits.  A
+		 * generation number would be a better tool for this.
+		 */
+		i = RULESIZE(rule);
+		if (out + i > limit)
+			continue;
+		bcopy(rule, out, i);
+		copy = (struct ip_fw *)out;
+		/*
+		 * XXX HACK. The set-disable mask is smuggled out in the
+		 * "next_rule" pointer of every exported rule in order to
+		 * keep the ABI unchanged.
+		 */
+		bcopy(&V_set_disable, &copy->next_rule,
+		    sizeof(V_set_disable));
+		/* Offset non-zero timestamps by the boot time. */
+		if (copy->timestamp)
+			copy->timestamp += boot_seconds;
+		out += i;
+	}
+	IPFW_RUNLOCK(chain);
+
+	if (!V_ipfw_dyn_v)
+		return (out - (char *)buf);
+
+	last = NULL;
+	IPFW_DYN_LOCK();
+	for (i = 0; i < V_curr_dyn_buckets; i++) {
+		for (dyn = V_ipfw_dyn_v[i]; dyn != NULL; dyn = dyn->next) {
+			if (out + sizeof *dyn > limit)
+				continue;
+			dcopy = (ipfw_dyn_rule *)out;
+			bcopy(dyn, dcopy, sizeof *dyn);
+			/*
+			 * Overwrite the parent-rule pointer with the
+			 * parent's rule number, immediately followed by
+			 * its set number.
+			 */
+			bcopy(&(dyn->rule->rulenum), &(dcopy->rule),
+			    sizeof(dyn->rule->rulenum));
+			bcopy(&(dyn->rule->set),
+			    (char *)&dcopy->rule + sizeof(dyn->rule->rulenum),
+			    sizeof(dyn->rule->set));
+			/*
+			 * Userland treats a NULL "next" as the end-of-list
+			 * marker, so put something non-null there for now.
+			 */
+			bcopy(&dcopy, &dcopy->next, sizeof(dcopy));
+			last = dcopy;
+			/* Export the remaining lifetime, not the deadline. */
+			if (TIME_LEQ(dcopy->expire, time_uptime))
+				dcopy->expire = 0;
+			else
+				dcopy->expire = dcopy->expire - time_uptime;
+			out += sizeof(ipfw_dyn_rule);
+		}
+	}
+	IPFW_DYN_UNLOCK();
+	if (last != NULL) /* mark last dynamic rule */
+		bzero(&last->next, sizeof(last));
+
+	return (out - (char *)buf);
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ *
+ * Dispatches on sopt->sopt_name to list, add, delete, flush or zero
+ * rules, to manipulate lookup tables, or to hand NAT requests to the
+ * ipfw_nat hooks when that module is loaded.
+ *
+ * Every request must pass priv_check(PRIV_NETINET_IPFW).  In addition,
+ * IP_FW_ADD and every SOPT_SET request other than IP_FW_RESETLOG are
+ * subject to the securelevel_ge(..., 3) check.
+ *
+ * Returns 0 on success or an errno value; unknown options give EINVAL.
+ */
+static int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
+	INIT_VNET_IPFW(curvnet);
+	int error;
+	size_t size;
+	struct ip_fw *buf, *rule;
+	u_int32_t rulenum[2];
+
+	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+	if (error)
+		return (error);
+
+	/*
+	 * Disallow modifications in really-really secure mode, but still allow
+	 * the logging counters to be reset.
+	 */
+	if (sopt->sopt_name == IP_FW_ADD ||
+	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+		if (error)
+			return (error);
+	}
+
+	error = 0;
+
+	switch (sopt->sopt_name) {
+	case IP_FW_GET:
+		/*
+		 * pass up a copy of the current rules. Static rules
+		 * come first (the last of which has number IPFW_DEFAULT_RULE),
+		 * followed by a possibly empty list of dynamic rule.
+		 * The last dynamic rule has NULL in the "next" field.
+		 *
+		 * Note that the calculated size is used to bound the
+		 * amount of data returned to the user.  The rule set may
+		 * change between calculating the size and returning the
+		 * data in which case we'll just return what fits.
+		 */
+		size = V_static_len;	/* size of static rules */
+		if (V_ipfw_dyn_v)		/* add size of dyn.rules */
+			size += (V_dyn_count * sizeof(ipfw_dyn_rule));
+
+		/*
+		 * XXX todo: if the user passes a short length just to know
+		 * how much room is needed, do not bother filling up the
+		 * buffer, just jump to the sooptcopyout.
+		 */
+		buf = malloc(size, M_TEMP, M_WAITOK);
+		/* Only the bytes ipfw_getrules() reports as used go out. */
+		error = sooptcopyout(sopt, buf,
+				ipfw_getrules(&V_layer3_chain, buf, size));
+		free(buf, M_TEMP);
+		break;
+
+	case IP_FW_FLUSH:
+		/*
+		 * Normally we cannot release the lock on each iteration.
+		 * We could do it here only because we start from the head all
+		 * the times so there is no risk of missing some entries.
+		 * On the other hand, the risk is that we end up with
+		 * a very inconsistent ruleset, so better keep the lock
+		 * around the whole cycle.
+		 *
+		 * XXX this code can be improved by resetting the head of
+		 * the list to point to the default rule, and then freeing
+		 * the old list without the need for a lock.
+		 */
+
+		IPFW_WLOCK(&V_layer3_chain);
+		V_layer3_chain.reap = NULL;
+		free_chain(&V_layer3_chain, 0 /* keep default rule */);
+		/* Detach the reap list; it is freed after the unlock. */
+		rule = V_layer3_chain.reap;
+		V_layer3_chain.reap = NULL;
+		IPFW_WUNLOCK(&V_layer3_chain);
+		if (rule != NULL)
+			reap_rules(rule);
+		break;
+
+	case IP_FW_ADD:
+		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+		/* Accept between sizeof(struct ip_fw) and RULE_MAXSIZE bytes. */
+		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+			sizeof(struct ip_fw) );
+		if (error == 0)
+			error = check_ipfw_struct(rule, sopt->sopt_valsize);
+		if (error == 0) {
+			error = add_rule(&V_layer3_chain, rule);
+			size = RULESIZE(rule);
+			/* On a get request hand the rule back to the caller. */
+			if (!error && sopt->sopt_dir == SOPT_GET)
+				error = sooptcopyout(sopt, rule, size);
+		}
+		free(rule, M_TEMP);
+		break;
+
+	case IP_FW_DEL:
+		/*
+		 * IP_FW_DEL is used for deleting single rules or sets,
+		 * and (ab)used to atomically manipulate sets. Argument size
+		 * is used to distinguish between the two:
+		 *    sizeof(u_int32_t)
+		 *	delete single rule or set of rules,
+		 *	or reassign rules (or sets) to a different set.
+		 *    2*sizeof(u_int32_t)
+		 *	atomic disable/enable sets.
+		 *	first u_int32_t contains sets to be disabled,
+		 *	second u_int32_t contains sets to be enabled.
+		 */
+		error = sooptcopyin(sopt, rulenum,
+			2*sizeof(u_int32_t), sizeof(u_int32_t));
+		if (error)
+			break;
+		size = sopt->sopt_valsize;
+		if (size == sizeof(u_int32_t))	/* delete or reassign */
+			error = del_entry(&V_layer3_chain, rulenum[0]);
+		else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
+			/*
+			 * The mask is built from an unsigned constant:
+			 * RESVD_SET is presumably the top bit (31), and
+			 * shifting a signed 1 into the sign bit is
+			 * undefined behaviour.
+			 */
+			V_set_disable =
+			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
+			    ~(1U << RESVD_SET); /* set RESVD_SET always enabled */
+		else
+			error = EINVAL;
+		break;
+
+	case IP_FW_ZERO:
+	case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
+		rulenum[0] = 0;		/* no argument means "all rules" */
+		if (sopt->sopt_val != 0) {
+		    error = sooptcopyin(sopt, rulenum,
+			    sizeof(u_int32_t), sizeof(u_int32_t));
+		    if (error)
+			break;
+		}
+		error = zero_entry(&V_layer3_chain, rulenum[0],
+			sopt->sopt_name == IP_FW_RESETLOG);
+		break;
+
+	case IP_FW_TABLE_ADD:
+		{
+			ipfw_table_entry ent;
+
+			error = sooptcopyin(sopt, &ent,
+			    sizeof(ent), sizeof(ent));
+			if (error)
+				break;
+			error = add_table_entry(&V_layer3_chain, ent.tbl,
+			    ent.addr, ent.masklen, ent.value);
+		}
+		break;
+
+	case IP_FW_TABLE_DEL:
+		{
+			ipfw_table_entry ent;
+
+			error = sooptcopyin(sopt, &ent,
+			    sizeof(ent), sizeof(ent));
+			if (error)
+				break;
+			error = del_table_entry(&V_layer3_chain, ent.tbl,
+			    ent.addr, ent.masklen);
+		}
+		break;
+
+	case IP_FW_TABLE_FLUSH:
+		{
+			u_int16_t tbl;
+
+			error = sooptcopyin(sopt, &tbl,
+			    sizeof(tbl), sizeof(tbl));
+			if (error)
+				break;
+			IPFW_WLOCK(&V_layer3_chain);
+			error = flush_table(&V_layer3_chain, tbl);
+			IPFW_WUNLOCK(&V_layer3_chain);
+		}
+		break;
+
+	case IP_FW_TABLE_GETSIZE:
+		{
+			u_int32_t tbl, cnt;
+
+			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+			    sizeof(tbl))))
+				break;
+			IPFW_RLOCK(&V_layer3_chain);
+			error = count_table(&V_layer3_chain, tbl, &cnt);
+			IPFW_RUNLOCK(&V_layer3_chain);
+			if (error)
+				break;
+			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+		}
+		break;
+
+	case IP_FW_TABLE_LIST:
+		{
+			ipfw_table *tbl;
+
+			if (sopt->sopt_valsize < sizeof(*tbl)) {
+				error = EINVAL;
+				break;
+			}
+			/* The caller's buffer size decides how much is dumped. */
+			size = sopt->sopt_valsize;
+			tbl = malloc(size, M_TEMP, M_WAITOK);
+			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+			if (error) {
+				free(tbl, M_TEMP);
+				break;
+			}
+			/* Number of entries that fit after the header. */
+			tbl->size = (size - sizeof(*tbl)) /
+			    sizeof(ipfw_table_entry);
+			IPFW_RLOCK(&V_layer3_chain);
+			error = dump_table(&V_layer3_chain, tbl);
+			IPFW_RUNLOCK(&V_layer3_chain);
+			if (error) {
+				free(tbl, M_TEMP);
+				break;
+			}
+			error = sooptcopyout(sopt, tbl, size);
+			free(tbl, M_TEMP);
+		}
+		break;
+
+	case IP_FW_NAT_CFG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_DEL:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_del_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_DEL: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_CONFIG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_LOG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_log_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_LOG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	default:
+		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+		error = EINVAL;
+	}
+
+	return (error);
+#undef RULE_MAXSIZE
+}
+
+/**
+ * dummynet needs a reference to the default rule, because rules can be
+ * deleted while packets hold a reference to them. When this happens,
+ * dummynet changes the reference to the default rule (it could well be a
+ * NULL pointer, but this way we do not need to check for the special
+ * case, plus here we have info on the default behaviour).
+ */
+struct ip_fw *ip_fw_default_rule;
+
+/*
+ * Periodic callout whose only job is to emit keepalives for dynamic
+ * rules.  It re-arms itself to fire again after dyn_keepalive_period
+ * seconds, whether or not there was anything to send.
+ */
+static void
+ipfw_tick(void * __unused unused)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct mbuf *pending, *pkt, *following, **tailp;
+	int bucket;
+	ipfw_dyn_rule *dyn;
+
+	if (V_dyn_keepalive != 0 && V_ipfw_dyn_v != NULL &&
+	    V_dyn_count != 0) {
+		/*
+		 * The packets are only queued while the dynamic rule lock
+		 * is held and are sent once it has been dropped; sending
+		 * them with the lock held would create a lock order
+		 * reversal with the normal packet input -> ipfw path.
+		 */
+		pending = NULL;
+		tailp = &pending;
+		IPFW_DYN_LOCK();
+		for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
+			for (dyn = V_ipfw_dyn_v[bucket]; dyn != NULL;
+			    dyn = dyn->next) {
+				if (dyn->dyn_type == O_LIMIT_PARENT)
+					continue;
+				if (dyn->id.proto != IPPROTO_TCP)
+					continue;
+				if ((dyn->state & BOTH_SYN) != BOTH_SYN)
+					continue;
+				/* too early */
+				if (TIME_LEQ(time_uptime +
+				    V_dyn_keepalive_interval, dyn->expire))
+					continue;
+				/* too late, rule expired */
+				if (TIME_LEQ(dyn->expire, time_uptime))
+					continue;
+
+				/* One packet per direction, appended in order. */
+				*tailp = send_pkt(NULL, &(dyn->id),
+				    dyn->ack_rev - 1, dyn->ack_fwd, TH_SYN);
+				if (*tailp != NULL)
+					tailp = &(*tailp)->m_nextpkt;
+				*tailp = send_pkt(NULL, &(dyn->id),
+				    dyn->ack_fwd - 1, dyn->ack_rev, 0);
+				if (*tailp != NULL)
+					tailp = &(*tailp)->m_nextpkt;
+			}
+		}
+		IPFW_DYN_UNLOCK();
+
+		/* Drain the queue, unlinking each packet before output. */
+		pkt = pending;
+		while (pkt != NULL) {
+			following = pkt->m_nextpkt;
+			pkt->m_nextpkt = NULL;
+			ip_output(pkt, NULL, NULL, 0, NULL, NULL);
+			pkt = following;
+		}
+	}
+
+	callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+		      ipfw_tick, NULL);
+}
+
+/*
+ * Initialize ipfw: set the tunable defaults, create the (optional) IPv6
+ * sysctl tree, the locks, the dynamic rule zone and the keepalive
+ * callout, install the default rule, initialize the tables and finally
+ * publish the control/check hooks and start the keepalive timer.
+ *
+ * Returns 0 on success, or the error reported by add_rule() or
+ * init_tables().  On failure everything allocated by this function that
+ * it knows how to release (sysctl context, locks, UMA zone) is released
+ * through the single "fail" exit path.
+ */
+int
+ipfw_init(void)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw default_rule;
+	int error;
+
+	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
+
+	V_ipfw_dyn_v = NULL;
+	V_dyn_buckets = 256;	/* must be power of 2 */
+	V_curr_dyn_buckets = 256; /* must be power of 2 */
+
+	V_dyn_ack_lifetime = 300;
+	V_dyn_syn_lifetime = 20;
+	V_dyn_fin_lifetime = 1;
+	V_dyn_rst_lifetime = 1;
+	V_dyn_udp_lifetime = 10;
+	V_dyn_short_lifetime = 5;
+
+	V_dyn_keepalive_interval = 20;
+	V_dyn_keepalive_period = 5;
+	V_dyn_keepalive = 1;	/* do send keepalives */
+
+	V_dyn_max = 4096;	/* max # of dynamic rules */
+
+	V_fw_deny_unknown_exthdrs = 1;
+
+#ifdef INET6
+	/* Setup IPv6 fw sysctl tree. */
+	sysctl_ctx_init(&ip6_fw_sysctl_ctx);
+	ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx,
+	    SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw",
+	    CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall");
+	SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
+	    OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+	    &V_fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6");
+	SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
+	    OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE,
+	    &V_fw_deny_unknown_exthdrs, 0,
+	    "Deny packets with unknown IPv6 Extension Headers");
+#endif
+
+	V_layer3_chain.rules = NULL;
+	IPFW_LOCK_INIT(&V_layer3_chain);
+	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+	    sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	IPFW_DYN_LOCK_INIT();
+	callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+
+	/* Build the single-instruction default rule (accept or deny). */
+	bzero(&default_rule, sizeof default_rule);
+
+	default_rule.act_ofs = 0;
+	default_rule.rulenum = IPFW_DEFAULT_RULE;
+	default_rule.cmd_len = 1;
+	default_rule.set = RESVD_SET;
+
+	default_rule.cmd[0].len = 1;
+	default_rule.cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+
+	error = add_rule(&V_layer3_chain, &default_rule);
+	if (error != 0) {
+		printf("ipfw2: error %u initializing default rule "
+			"(support disabled)\n", error);
+		goto fail;
+	}
+
+	ip_fw_default_rule = V_layer3_chain.rules;
+	printf("ipfw2 "
+#ifdef INET6
+		"(+ipv6) "
+#endif
+		"initialized, divert %s, nat %s, "
+		"rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+		"enabled, "
+#else
+		"disabled, "
+#endif
+		"default to %s, logging ",
+#ifdef IPDIVERT
+		"enabled",
+#else
+		"loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+		"enabled",
+#else
+		"loadable",
+#endif
+
+		default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
+
+#ifdef IPFIREWALL_VERBOSE
+	V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+	/* Completes the "logging " banner line started above. */
+	if (V_fw_verbose == 0)
+		printf("disabled\n");
+	else if (V_verbose_limit == 0)
+		printf("unlimited\n");
+	else
+		printf("limited to %d packets/entry by default\n",
+		    V_verbose_limit);
+
+	error = init_tables(&V_layer3_chain);
+	if (error) {
+		/*
+		 * NOTE(review): the default rule installed above is not
+		 * released on this path -- confirm whether it should be.
+		 */
+		goto fail;
+	}
+	ip_fw_ctl_ptr = ipfw_ctl;
+	ip_fw_chk_ptr = ipfw_chk;
+	callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL);
+	LIST_INIT(&V_layer3_chain.nat);
+	return (0);
+
+fail:
+#ifdef INET6
+	/* Do not leave the dynamically created sysctl nodes behind. */
+	sysctl_ctx_free(&ip6_fw_sysctl_ctx);
+#endif
+	IPFW_DYN_LOCK_DESTROY();
+	IPFW_LOCK_DESTROY(&V_layer3_chain);
+	uma_zdestroy(ipfw_dyn_rule_zone);
+	return (error);
+}
+
+/*
+ * Tear ipfw down: withdraw the check/control hooks, stop the keepalive
+ * callout, flush the tables and the whole rule chain (default rule
+ * included), then release the locks, the dynamic rule zone and table,
+ * and the IPv6 sysctl tree.
+ */
+void
+ipfw_destroy(void)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *doomed;
+
+	/* Unpublish the entry points before touching any state. */
+	ip_fw_chk_ptr = NULL;
+	ip_fw_ctl_ptr = NULL;
+	callout_drain(&V_ipfw_timeout);
+
+	IPFW_WLOCK(&V_layer3_chain);
+	flush_tables(&V_layer3_chain);
+	V_layer3_chain.reap = NULL;
+	free_chain(&V_layer3_chain, 1 /* kill default rule */);
+	/* Take over the reap list so it can be processed unlocked. */
+	doomed = V_layer3_chain.reap;
+	V_layer3_chain.reap = NULL;
+	IPFW_WUNLOCK(&V_layer3_chain);
+
+	if (doomed != NULL)
+		reap_rules(doomed);
+
+	IPFW_DYN_LOCK_DESTROY();
+	uma_zdestroy(ipfw_dyn_rule_zone);
+	if (V_ipfw_dyn_v != NULL)
+		free(V_ipfw_dyn_v, M_IPFW);
+	IPFW_LOCK_DESTROY(&V_layer3_chain);
+
+#ifdef INET6
+	/* Free IPv6 fw sysctl tree. */
+	sysctl_ctx_free(&ip6_fw_sysctl_ctx);
+#endif
+
+	printf("IP firewall unloaded\n");
+}
diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c
new file mode 100644
index 000000000000..cce50863c31d
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_nat.c
@@ -0,0 +1,668 @@
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <sys/vimage.h>
+
+#include <netinet/libalias/alias.h>
+#include <netinet/libalias/alias_local.h>
+
+#define	IPFW_INTERNAL	/* Access to protected data structures in ip_fw.h. */
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#include <machine/in_cksum.h>	/* XXX for in_cksum */
+
+/* Malloc type used for every allocation made in this file. */
+MALLOC_DECLARE(M_IPFW);
+
+#ifdef VIMAGE_GLOBALS
+/* Rule chain whose .nat list holds the configured NAT instances. */
+extern struct ip_fw_chain layer3_chain;
+/* Tag returned when registering ifaddr_change(); used to deregister it. */
+static eventhandler_tag ifaddr_event_tag;
+#endif
+
+/*
+ * Hook pointers defined outside this file; ipfw_nat_init() points them
+ * at the functions implemented below.
+ */
+extern ipfw_nat_t *ipfw_nat_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+/*
+ * ifaddr_event handler.  For every NAT instance configured to follow
+ * interface 'ifp' (matched by name), walk the interface's address list
+ * and push each AF_INET address into the instance as its alias address;
+ * the last AF_INET address on the list is the one that sticks.
+ */
+static void 
+ifaddr_change(void *arg __unused, struct ifnet *ifp)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct cfg_nat *nat;
+	struct ifaddr *addr;
+	struct sockaddr_in *sin;
+
+	IPFW_WLOCK(&V_layer3_chain);
+	LIST_FOREACH(nat, &V_layer3_chain.nat, _next) {
+		/* Skip instances not bound to nic 'ifp->if_xname'. */
+		if (strncmp(nat->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+			continue;
+
+		IF_ADDR_LOCK(ifp);
+		TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) {
+			if (addr->ifa_addr == NULL ||
+			    addr->ifa_addr->sa_family != AF_INET)
+				continue;
+			sin = (struct sockaddr_in *)(addr->ifa_addr);
+			nat->ip = sin->sin_addr;
+			LibAliasSetAddress(nat->lib, nat->ip);
+		}
+		IF_ADDR_UNLOCK(ifp);
+	}
+	IPFW_WUNLOCK(&V_layer3_chain);
+}
+
+/*
+ * Clear the cached NAT instance pointer in every rule whose action is
+ * O_NAT and that currently points at the instance with id 'i'.
+ * The caller must hold the chain write lock.
+ */
+static void
+flush_nat_ptrs(const int i)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *r;
+	ipfw_insn_nat *act;
+
+	IPFW_WLOCK_ASSERT(&V_layer3_chain);
+	r = V_layer3_chain.rules;
+	while (r != NULL) {
+		act = (ipfw_insn_nat *)ACTION_PTR(r);
+		if (act->o.opcode == O_NAT && act->nat != NULL &&
+		    act->nat->id == i)
+			act->nat = NULL;
+		r = r->next;
+	}
+}
+
+/*
+ * Insert NAT instance 'p' at the head of list 'b'.
+ * Asserts that the chain write lock is held.
+ */
+#define HOOK_NAT(b, p) do {				\
+		IPFW_WLOCK_ASSERT(&V_layer3_chain);	\
+		LIST_INSERT_HEAD(b, p, _next);		\
+	} while (0)
+
+/*
+ * Remove NAT instance 'p' from the list it is linked on.
+ * Asserts that the chain write lock is held.
+ */
+#define UNHOOK_NAT(p) do {				\
+		IPFW_WLOCK_ASSERT(&V_layer3_chain);	\
+		LIST_REMOVE(p, _next);			\
+	} while (0)
+
+/* Insert redirect entry 'p' at the head of list 'b' (no lock assertion). */
+#define HOOK_REDIR(b, p) do {			\
+		LIST_INSERT_HEAD(b, p, _next);	\
+	} while (0)
+
+/* Insert spool entry 'p' at the head of list 'b' (no lock assertion). */
+#define HOOK_SPOOL(b, p) do {			\
+		LIST_INSERT_HEAD(b, p, _next);	\
+	} while (0)
+
+/*
+ * Release every redirect entry linked on 'head' for NAT instance 'n':
+ * delete its libalias links, free its spool entries and its link array,
+ * then unlink and free the entry itself.  Entries with an unrecognized
+ * mode are reported and left in place.
+ */
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+	struct cfg_redir *redir, *redir_next;
+	struct cfg_spool *spool, *spool_next;
+	int idx, nlinks;
+
+	LIST_FOREACH_SAFE(redir, head, _next, redir_next) {
+		if (redir->mode != REDIR_PORT && redir->mode != REDIR_ADDR &&
+		    redir->mode != REDIR_PROTO) {
+			printf("unknown redirect mode: %u\n", redir->mode);
+			/* XXX - panic?!?!? */
+			continue;
+		}
+
+		/* Port redirects own one alias_link per port, others one. */
+		nlinks = (redir->mode == REDIR_PORT) ? redir->pport_cnt : 1;
+
+		/* Delete all libalias redirect entry. */
+		for (idx = 0; idx < nlinks; idx++)
+			LibAliasRedirectDelete(n->lib, redir->alink[idx]);
+
+		/* Del spool cfg if any. */
+		LIST_FOREACH_SAFE(spool, &redir->spool_chain, _next,
+		    spool_next) {
+			LIST_REMOVE(spool, _next);
+			free(spool, M_IPFW);
+		}
+
+		free(redir->alink, M_IPFW);
+		LIST_REMOVE(redir, _next);
+		free(redir, M_IPFW);
+	}
+}
+
+/*
+ * Rebuild the redirect/spool configuration of NAT instance 'ptr' from
+ * the serialized buffer 'buf'.
+ *
+ * 'buf' holds ptr->redir_cnt records; each is SOF_REDIR bytes of
+ * struct cfg_redir followed by that record's spool_cnt entries of
+ * SOF_SPOOL bytes each.  For every record a libalias redirect is
+ * created according to its mode, the spool servers are attached to
+ * the first link, and the record is hooked on ptr->redir_chain.
+ *
+ * Returns 1 on success.  If libalias fails to create a redirect (or the
+ * mode is unknown) the function panics.
+ *
+ * NOTE(review): the counts read from 'buf' are trusted as-is and 'buf'
+ * is not bounds-checked here -- confirm the caller validates them.
+ */
+static int
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+	struct cfg_redir *r, *ser_r;
+	struct cfg_spool *s, *ser_s;
+	int cnt, off, i;
+	char *panic_err;
+
+	for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+		ser_r = (struct cfg_redir *)&buf[off];
+		r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+		memcpy(r, ser_r, SOF_REDIR);
+		LIST_INIT(&r->spool_chain);
+		off += SOF_REDIR;
+		/* One alias_link slot per public port; zeroed, so NULL. */
+		r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
+		    M_IPFW, M_WAITOK | M_ZERO);
+		switch (r->mode) {
+		case REDIR_ADDR:
+			r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+			    r->paddr);
+			break;
+		case REDIR_PORT:
+			for (i = 0 ; i < r->pport_cnt; i++) {
+				/* If remotePort is all ports, set it to 0. */
+				u_short remotePortCopy = r->rport + i;
+				if (r->rport_cnt == 1 && r->rport == 0)
+					remotePortCopy = 0;
+				r->alink[i] = LibAliasRedirectPort(ptr->lib,
+				    r->laddr, htons(r->lport + i), r->raddr,
+				    htons(remotePortCopy), r->paddr,
+				    htons(r->pport + i), r->proto);
+				if (r->alink[i] == NULL) {
+					/* Flag the failure for the check below. */
+					r->alink[0] = NULL;
+					break;
+				}
+			}
+			break;
+		case REDIR_PROTO:
+			r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+			    r->raddr, r->paddr, r->proto);
+			break;
+		default:
+			/* alink[0] stays NULL, so this ends in "bad" below. */
+			printf("unknown redirect mode: %u\n", r->mode);
+			break;
+		}
+		if (r->alink[0] == NULL) {
+			panic_err = "LibAliasRedirect* returned NULL";
+			goto bad;
+		}
+		/* LSNAT handling. */
+		for (i = 0; i < r->spool_cnt; i++) {
+			ser_s = (struct cfg_spool *)&buf[off];
+			/* A spool entry is SOF_SPOOL bytes, not SOF_REDIR. */
+			s = malloc(SOF_SPOOL, M_IPFW, M_WAITOK | M_ZERO);
+			memcpy(s, ser_s, SOF_SPOOL);
+			LibAliasAddServer(ptr->lib, r->alink[0],
+			    s->addr, htons(s->port));
+			off += SOF_SPOOL;
+			/* Hook spool entry. */
+			HOOK_SPOOL(&r->spool_chain, s);
+		}
+		/* And finally hook this redir entry. */
+		HOOK_REDIR(&ptr->redir_chain, r);
+	}
+	return (1);
+bad:
+	/* something really bad happened: panic! */
+	panic("%s\n", panic_err);
+}
+
+/*
+ * Run one packet through the libalias instance of NAT config 't'.
+ *
+ * The packet 'm' is first passed to m_megapullup(); all further work is
+ * done on the mbuf it returns (mcl).  LibAliasIn() is used when
+ * args->oif is NULL, LibAliasOut() otherwise.  Afterwards the TCP/UDP
+ * checksum is repaired where needed (see the long comment below).
+ *
+ * Returns IP_FW_NAT with args->m set to the translated packet, or
+ * IP_FW_DENY with args->m set to NULL when the pullup fails or libalias
+ * rejects the packet.
+ */
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+	struct mbuf *mcl;
+	struct ip *ip;
+	/* XXX - libalias duct tape */
+	int ldt, retval;
+	char *c;
+
+	ldt = 0;
+	retval = 0;
+	if ((mcl = m_megapullup(m, m->m_pkthdr.len)) ==
+	    NULL)
+		goto badnat;
+	ip = mtod(mcl, struct ip *);
+	/* libalias works on network byte order; restored before return. */
+	if (args->eh == NULL) {
+		ip->ip_len = htons(ip->ip_len);
+		ip->ip_off = htons(ip->ip_off);
+	}
+
+	/* 
+	 * XXX - Libalias checksum offload 'duct tape':
+	 * 
+	 * locally generated packets have only
+	 * pseudo-header checksum calculated
+	 * and libalias will screw it[1], so
+	 * mark them for later fix.  Moreover
+	 * there are cases when libalias
+	 * modify tcp packet data[2], mark it
+	 * for later fix too.
+	 *
+	 * [1] libalias was never meant to run
+	 * in kernel, so it doesn't have any
+	 * knowledge about checksum
+	 * offloading, and it expects a packet
+	 * with a full internet
+	 * checksum. Unfortunately, packets
+	 * generated locally will have just the
+	 * pseudo header calculated, and when
+	 * libalias tries to adjust the
+	 * checksum it will actually screw it.
+	 *
+	 * [2] when libalias modify tcp's data
+	 * content, full TCP checksum has to
+	 * be recomputed: the problem is that
+	 * libalias doesn't have any idea
+	 * about checksum offloading To
+	 * workaround this, we do not do
+	 * checksumming in LibAlias, but only
+	 * mark the packets in th_x2 field. If
+	 * we receive a marked packet, we
+	 * calculate correct checksum for it
+	 * aware of offloading.  Why such a
+	 * terrible hack instead of
+	 * recalculating checksum for each
+	 * packet?  Because the previous
+	 * checksum was not checked!
+	 * Recalculating checksums for EVERY
+	 * packet will hide ALL transmission
+	 * errors. Yes, marked packets still
+	 * suffer from this problem. But,
+	 * sigh, natd(8) has this problem,
+	 * too.
+	 *
+	 * TODO: -make libalias mbuf aware (so
+	 * it can handle delayed checksum and tso)
+	 */
+
+	if (mcl->m_pkthdr.rcvif == NULL && 
+	    mcl->m_pkthdr.csum_flags & 
+	    CSUM_DELAY_DATA)
+		ldt = 1;
+
+	c = mtod(mcl, char *);
+	if (args->oif == NULL)
+		retval = LibAliasIn(t->lib, c, 
+			mcl->m_len + M_TRAILINGSPACE(mcl));
+	else
+		retval = LibAliasOut(t->lib, c, 
+			mcl->m_len + M_TRAILINGSPACE(mcl));
+	if (retval == PKT_ALIAS_RESPOND) {
+		/*
+		 * Flag the packet we actually hand back.  'm' was given
+		 * to m_megapullup() above and must not be touched any
+		 * more; 'mcl' is the mbuf that goes out via args->m.
+		 */
+		mcl->m_flags |= M_SKIP_FIREWALL;
+		retval = PKT_ALIAS_OK;
+	}
+	if (retval != PKT_ALIAS_OK &&
+	    retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+		/* XXX - should i add some logging? */
+		m_free(mcl);
+	badnat:
+		args->m = NULL;
+		return (IP_FW_DENY);
+	}
+	/* libalias may have changed the packet length. */
+	mcl->m_pkthdr.len = mcl->m_len = 
+	    ntohs(ip->ip_len);
+
+	/* 
+	 * XXX - libalias checksum offload 
+	 * 'duct tape' (see above) 
+	 */
+
+	if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && 
+	    ip->ip_p == IPPROTO_TCP) {
+		struct tcphdr 	*th; 
+
+		th = (struct tcphdr *)(ip + 1);
+		if (th->th_x2) 
+			ldt = 1;
+	}
+
+	if (ldt) {
+		struct tcphdr 	*th;
+		struct udphdr 	*uh;
+		u_short cksum;
+
+		/* Host order is needed for the length arithmetic below. */
+		ip->ip_len = ntohs(ip->ip_len);
+		cksum = in_pseudo(
+		    ip->ip_src.s_addr,
+		    ip->ip_dst.s_addr, 
+		    htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))
+		);
+					
+		switch (ip->ip_p) {
+		case IPPROTO_TCP:
+			th = (struct tcphdr *)(ip + 1);
+			/* 
+			 * Maybe it was set in 
+			 * libalias... 
+			 */
+			th->th_x2 = 0;
+			th->th_sum = cksum;
+			mcl->m_pkthdr.csum_data = 
+			    offsetof(struct tcphdr, th_sum);
+			break;
+		case IPPROTO_UDP:
+			uh = (struct udphdr *)(ip + 1);
+			uh->uh_sum = cksum;
+			mcl->m_pkthdr.csum_data = 
+			    offsetof(struct udphdr, uh_sum);
+			break;						
+		}
+		/* 
+		 * No hw checksum offloading: do it 
+		 * by ourself. 
+		 */
+		if ((mcl->m_pkthdr.csum_flags & 
+		     CSUM_DELAY_DATA) == 0) {
+			in_delayed_cksum(mcl);
+			mcl->m_pkthdr.csum_flags &= 
+			    ~CSUM_DELAY_DATA;
+		}
+		ip->ip_len = htons(ip->ip_len);
+	}
+
+	/* Undo the byte order conversion done on entry. */
+	if (args->eh == NULL) {
+		ip->ip_len = ntohs(ip->ip_len);
+		ip->ip_off = ntohs(ip->ip_off);
+	}
+
+	args->m = mcl;
+	return (IP_FW_NAT);
+}
+
+/*
+ * Create or reconfigure a NAT instance from a serialized configuration
+ * supplied through 'sopt'.
+ *
+ * The buffer starts with a struct cfg_nat, followed by the redirect and
+ * spool records consumed by add_redir_spool_cfg().  An existing instance
+ * with the same id is temporarily unhooked (and rule pointers to it are
+ * flushed) while it is rebuilt; a new one is allocated otherwise.
+ *
+ * Returns 0 on success, the sooptcopyin() error if the configuration
+ * cannot be copied in, ENOSPC if the instance cannot be allocated and
+ * EINVAL if LibAliasInit() fails.
+ */
+static int 
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct cfg_nat *ptr, *ser_n;
+	char *buf;
+	int error;
+
+	buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+	/* Refuse to parse a buffer that was not (fully) copied in. */
+	error = sooptcopyin(sopt, buf, NAT_BUF_LEN,
+	    sizeof(struct cfg_nat));
+	if (error != 0) {
+		free(buf, M_IPFW);
+		return (error);
+	}
+	ser_n = (struct cfg_nat *)buf;
+
+	/* 
+	 * Find/create nat rule.
+	 */
+	IPFW_WLOCK(&V_layer3_chain);
+	LOOKUP_NAT(V_layer3_chain, ser_n->id, ptr);
+	if (ptr == NULL) {
+		/* New rule: allocate and init new instance. */
+		ptr = malloc(sizeof(struct cfg_nat), 
+		    M_IPFW, M_NOWAIT | M_ZERO);
+		if (ptr == NULL) {
+			IPFW_WUNLOCK(&V_layer3_chain);				
+			free(buf, M_IPFW);
+			return (ENOSPC);
+		}
+		ptr->lib = LibAliasInit(NULL);
+		if (ptr->lib == NULL) {
+			IPFW_WUNLOCK(&V_layer3_chain);
+			free(ptr, M_IPFW);
+			free(buf, M_IPFW);
+			return (EINVAL);
+		}
+		LIST_INIT(&ptr->redir_chain);
+	} else {
+		/* Entry already present: temporarly unhook it. */
+		UNHOOK_NAT(ptr);
+		flush_nat_ptrs(ser_n->id);
+	}
+	IPFW_WUNLOCK(&V_layer3_chain);
+
+	/* 
+	 * Basic nat configuration.
+	 */
+	ptr->id = ser_n->id;
+	/* 
+	 * XXX - what if this rule doesn't nat any ip and just 
+	 * redirect? 
+	 * do we set aliasaddress to 0.0.0.0?
+	 */
+	ptr->ip = ser_n->ip;
+	ptr->redir_cnt = ser_n->redir_cnt;
+	ptr->mode = ser_n->mode;
+	LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
+	LibAliasSetAddress(ptr->lib, ptr->ip);
+	memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
+
+	/* 
+	 * Redir and LSNAT configuration.
+	 */
+	/* Delete old cfgs. */
+	del_redir_spool_cfg(ptr, &ptr->redir_chain);
+	/* Add new entries. */
+	add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+	free(buf, M_IPFW);
+	/* The instance is complete: make it visible again. */
+	IPFW_WLOCK(&V_layer3_chain);
+	HOOK_NAT(&V_layer3_chain.nat, ptr);
+	IPFW_WUNLOCK(&V_layer3_chain);
+	return (0);
+}
+
+/*
+ * Delete the NAT instance whose id (an int) is supplied through 'sopt'.
+ *
+ * The instance is unhooked and all rule pointers to it are flushed
+ * under the chain write lock; its redirect configuration and libalias
+ * state are released after the lock is dropped.
+ *
+ * Returns 0 on success, the sooptcopyin() error if the id cannot be
+ * copied in, or EINVAL if no instance with that id exists.
+ */
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct cfg_nat *ptr;
+	int error, i;
+
+	/* Without this check 'i' would be used uninitialized on failure. */
+	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+	if (error != 0)
+		return (error);
+	IPFW_WLOCK(&V_layer3_chain);
+	LOOKUP_NAT(V_layer3_chain, i, ptr);
+	if (ptr == NULL) {
+		IPFW_WUNLOCK(&V_layer3_chain);
+		return (EINVAL);
+	}
+	UNHOOK_NAT(ptr);
+	flush_nat_ptrs(i);
+	IPFW_WUNLOCK(&V_layer3_chain);
+	del_redir_spool_cfg(ptr, &ptr->redir_chain);
+	LibAliasUninit(ptr->lib);
+	free(ptr, M_IPFW);
+	return (0);
+}
+
+/*
+ * Serialize the configuration of every NAT instance and copy it out
+ * through 'sopt'.
+ *
+ * Output layout in a NAT_BUF_LEN byte buffer: an int holding the number
+ * of instances, then for each instance SOF_NAT bytes of its cfg_nat,
+ * each of its redirects (SOF_REDIR bytes) and, after every redirect,
+ * that redirect's spool entries (SOF_SPOOL bytes each).
+ *
+ * Returns 0 on success, the sooptcopyout() error if the copy to the
+ * caller fails, or ENOSPC if the data does not fit in NAT_BUF_LEN.
+ */
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{
+	INIT_VNET_IPFW(curvnet);
+	uint8_t *data;
+	struct cfg_nat *n;
+	struct cfg_redir *r;
+	struct cfg_spool *s;
+	int error, nat_cnt, off;
+
+	nat_cnt = 0;
+	/* The first sizeof(int) bytes are reserved for the counter. */
+	off = sizeof(nat_cnt);
+
+	data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
+	IPFW_RLOCK(&V_layer3_chain);
+	/* Serialize all the data. */
+	LIST_FOREACH(n, &V_layer3_chain.nat, _next) {
+		nat_cnt++;
+		if (off + SOF_NAT >= NAT_BUF_LEN)
+			goto nospace;
+		bcopy(n, &data[off], SOF_NAT);
+		off += SOF_NAT;
+		LIST_FOREACH(r, &n->redir_chain, _next) {
+			if (off + SOF_REDIR >= NAT_BUF_LEN)
+				goto nospace;
+			bcopy(r, &data[off], SOF_REDIR);
+			off += SOF_REDIR;
+			LIST_FOREACH(s, &r->spool_chain, _next) {
+				if (off + SOF_SPOOL >= NAT_BUF_LEN)
+					goto nospace;
+				bcopy(s, &data[off], SOF_SPOOL);
+				off += SOF_SPOOL;
+			}
+		}
+	}
+	bcopy(&nat_cnt, data, sizeof(nat_cnt));
+	IPFW_RUNLOCK(&V_layer3_chain);
+	/* Report a failed copy to the caller instead of claiming success. */
+	error = sooptcopyout(sopt, data, NAT_BUF_LEN);
+	free(data, M_IPFW);
+	return (error);
+nospace:
+	IPFW_RUNLOCK(&V_layer3_chain);
+	printf("serialized data buffer not big enough:"
+	    "please increase NAT_BUF_LEN\n");
+	free(data, M_IPFW);
+	return (ENOSPC);
+}
+
+/*
+ * Copy out, through 'sopt', the libalias log buffer of every NAT
+ * instance that has one.
+ *
+ * Output layout: for each such instance, its id (an int) followed by
+ * LIBALIAS_BUF_SIZE bytes of log data.  Instances whose logDesc is NULL
+ * are skipped.
+ *
+ * Returns 0 on success, the sooptcopyout() error if the copy to the
+ * caller fails, or ENOSPC if the output buffer cannot be grown.
+ */
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+	INIT_VNET_IPFW(curvnet);
+	uint8_t *data, *grown;
+	struct cfg_nat *ptr;
+	int error, i, size, cnt, sof;
+
+	data = NULL;
+	sof = LIBALIAS_BUF_SIZE;
+	cnt = 0;
+
+	IPFW_RLOCK(&V_layer3_chain);
+	size = i = 0;
+	LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) {
+		if (ptr->lib->logDesc == NULL) 
+			continue;
+		cnt++;
+		size = cnt * (sof + sizeof(int));
+		/*
+		 * Grow via a temporary: on failure realloc() leaves the
+		 * old buffer allocated, so it has to be freed here rather
+		 * than being lost by overwriting 'data' with NULL.
+		 */
+		grown = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO);
+		if (grown == NULL) {
+			IPFW_RUNLOCK(&V_layer3_chain);
+			free(data, M_IPFW);
+			return (ENOSPC);
+		}
+		data = grown;
+		bcopy(&ptr->id, &data[i], sizeof(int));
+		i += sizeof(int);
+		bcopy(ptr->lib->logDesc, &data[i], sof);
+		i += sof;
+	}
+	IPFW_RUNLOCK(&V_layer3_chain);
+	/* Report a failed copy to the caller instead of claiming success. */
+	error = sooptcopyout(sopt, data, size);
+	free(data, M_IPFW);
+	return (error);
+}
+
+/*
+ * Module load work: point the ipfw NAT hook pointers at the functions
+ * in this file (under the chain write lock) and register
+ * ifaddr_change() for ifaddr_event notifications.
+ */
+static void
+ipfw_nat_init(void)
+{
+	INIT_VNET_IPFW(curvnet);
+
+	IPFW_WLOCK(&V_layer3_chain);
+	/* init ipfw hooks */
+	ipfw_nat_ptr = ipfw_nat;
+	ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+	ipfw_nat_del_ptr = ipfw_nat_del;
+	ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+	ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+	IPFW_WUNLOCK(&V_layer3_chain);
+	/* The tag is kept so the handler can be deregistered on unload. */
+	V_ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, 
+	    NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Module unload work: stop address-change notifications, destroy every
+ * NAT instance, drop the NAT pointers cached in the rules and withdraw
+ * all the hooks installed by ipfw_nat_init().
+ */
+static void
+ipfw_nat_destroy(void)
+{
+	INIT_VNET_IPFW(curvnet);
+	struct ip_fw *rule;
+	struct cfg_nat *ptr, *ptr_temp;
+
+	/*
+	 * Deregister before taking the chain lock: deregistration can
+	 * have to wait for a handler invocation that is in progress, and
+	 * ifaddr_change() itself acquires the chain write lock, so doing
+	 * this with the lock held risks a deadlock.
+	 */
+	EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+
+	IPFW_WLOCK(&V_layer3_chain);
+	LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) {
+		LIST_REMOVE(ptr, _next);
+		del_redir_spool_cfg(ptr, &ptr->redir_chain);
+		LibAliasUninit(ptr->lib);
+		free(ptr, M_IPFW);
+	}
+	/* flush all nat ptrs */
+	for (rule = V_layer3_chain.rules; rule; rule = rule->next) {
+		ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule);
+		if (cmd->o.opcode == O_NAT)
+			cmd->nat = NULL;
+	}
+	/*
+	 * deregister ipfw_nat: clear every hook set by ipfw_nat_init() so
+	 * that none is left pointing into this module once it is gone.
+	 */
+	ipfw_nat_ptr = NULL;
+	ipfw_nat_cfg_ptr = NULL;
+	ipfw_nat_del_ptr = NULL;
+	ipfw_nat_get_cfg_ptr = NULL;
+	ipfw_nat_get_log_ptr = NULL;
+	IPFW_WUNLOCK(&V_layer3_chain);
+}
+
+/*
+ * Module event handler: initialize on MOD_LOAD, tear down on
+ * MOD_UNLOAD, and reject every other event with EOPNOTSUPP.
+ */
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+	if (type == MOD_LOAD) {
+		ipfw_nat_init();
+		return 0;
+	}
+	if (type == MOD_UNLOAD) {
+		ipfw_nat_destroy();
+		return 0;
+	}
+	return EOPNOTSUPP;
+}
+
+/* Module description: name, event handler, and no private data. */
+static moduledata_t ipfw_nat_mod = {
+	"ipfw_nat",
+	ipfw_nat_modevent,
+	0
+};
+
+/* Register the module and declare its dependencies and version. */
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
new file mode 100644
index 000000000000..0b1ba2daafb4
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_pfil.c
@@ -0,0 +1,597 @@
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif /* KLD_MODULE */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/ucred.h>
+#include <sys/vimage.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pfil.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_divert.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/vinet.h>
+
+#include <netgraph/ng_ipfw.h>
+
+#include <machine/in_cksum.h>
+
+#ifdef VIMAGE_GLOBALS
+/* Enable flags for IPv4 and (with INET6) IPv6 filtering; default on. */
+int fw_enable = 1;
+#ifdef INET6
+int fw6_enable = 1;
+#endif
+#endif
+
+/* Sysctl handler, defined later in this file. */
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Divert hooks. */
+ip_divert_packet_t *ip_divert_ptr = NULL;
+
+/* ng_ipfw hooks. */
+ng_ipfw_input_t *ng_ipfw_input_p = NULL;
+
+/* Forward declarations. */
+static int	ipfw_divert(struct mbuf **, int, int);
+/* Values passed as the 'incoming' argument of ipfw_divert(). */
+#define	DIV_DIR_IN	1
+#define	DIV_DIR_OUT	0
+
+/*
+ * pfil(9) input hook: run an inbound packet through ipfw_chk() and act
+ * on its verdict.
+ *
+ * *m0 is the packet; it may be replaced, or set to NULL when the packet
+ * has been consumed or dropped.  'dir' must be PFIL_IN.
+ *
+ * Returns 0 when the packet passes or has been consumed by dummynet or
+ * divert, EACCES when it is dropped (the mbuf is freed here), or the
+ * value returned by ng_ipfw_input_p() for the netgraph action.
+ */
+int
+ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+	INIT_VNET_INET(curvnet);
+	struct ip_fw_args args;
+	struct ng_ipfw_tag *ng_tag;
+	struct m_tag *dn_tag;
+	int ipfw = 0;
+	int divert;
+	int tee;
+#ifdef IPFIREWALL_FORWARD
+	struct m_tag *fwd_tag;
+#endif
+
+	KASSERT(dir == PFIL_IN, ("ipfw_check_in wrong direction!"));
+
+	bzero(&args, sizeof(args));
+
+	/* A packet coming back from netgraph carries the matching rule. */
+	ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0,
+	    NULL);
+	if (ng_tag != NULL) {
+		KASSERT(ng_tag->dir == NG_IPFW_IN,
+		    ("ng_ipfw tag with wrong direction"));
+		args.rule = ng_tag->rule;
+		m_tag_delete(*m0, (struct m_tag *)ng_tag);
+	}
+
+again:
+	/* Likewise for a packet coming back from dummynet. */
+	dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL);
+	if (dn_tag != NULL){
+		struct dn_pkt_tag *dt;
+
+		dt = (struct dn_pkt_tag *)(dn_tag+1);
+		args.rule = dt->rule;
+
+		m_tag_delete(*m0, dn_tag);
+	}
+
+	args.m = *m0;
+	args.inp = inp;
+	tee = 0;
+
+	/* With one_pass set, a packet that already matched a rule passes. */
+	if (V_fw_one_pass == 0 || args.rule == NULL) {
+		ipfw = ipfw_chk(&args);
+		*m0 = args.m;
+	} else
+		ipfw = IP_FW_PASS;
+		
+	KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+	    __func__));
+
+	switch (ipfw) {
+	case IP_FW_PASS:
+		if (args.next_hop == NULL)
+			goto pass;
+
+#ifdef IPFIREWALL_FORWARD
+		/* Record the forwarding next hop in a tag on the packet. */
+		fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+				sizeof(struct sockaddr_in), M_NOWAIT);
+		if (fwd_tag == NULL)
+			goto drop;
+		bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+		m_tag_prepend(*m0, fwd_tag);
+
+		if (in_localip(args.next_hop->sin_addr))
+			(*m0)->m_flags |= M_FASTFWD_OURS;
+		goto pass;
+#endif
+		/* Without IPFIREWALL_FORWARD a next hop ends in drop below. */
+		break;			/* not reached */
+
+	case IP_FW_DENY:
+		goto drop;
+		break;			/* not reached */
+
+	case IP_FW_DUMMYNET:
+		if (ip_dn_io_ptr == NULL)
+			goto drop;
+		/*
+		 * NOTE(review): when ip_v is neither 4 nor 6 the packet is
+		 * not handed to dummynet and control loops back to "again"
+		 * -- confirm ipfw_chk() cannot return IP_FW_DUMMYNET then.
+		 */
+		if (mtod(*m0, struct ip *)->ip_v == 4)
+			ip_dn_io_ptr(m0, DN_TO_IP_IN, &args);
+		else if (mtod(*m0, struct ip *)->ip_v == 6)
+			ip_dn_io_ptr(m0, DN_TO_IP6_IN, &args);
+		if (*m0 != NULL)
+			goto again;
+		return 0;		/* packet consumed */
+
+	case IP_FW_TEE:
+		tee = 1;
+		/* fall through */
+
+	case IP_FW_DIVERT:
+		divert = ipfw_divert(m0, DIV_DIR_IN, tee);
+		if (divert) {
+			*m0 = NULL;
+			return 0;	/* packet consumed */
+		} else {
+			args.rule = NULL;
+			goto again;	/* continue with packet */
+		}
+
+	case IP_FW_NGTEE:
+		if (!NG_IPFW_LOADED)
+			goto drop;
+		(void)ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 1);
+		goto again;		/* continue with packet */
+
+	case IP_FW_NETGRAPH:
+		if (!NG_IPFW_LOADED)
+			goto drop;
+		return ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 0);
+		
+	case IP_FW_NAT:
+		goto again;		/* continue with packet */
+
+	case IP_FW_REASS:
+		goto again;
+
+	default:
+		KASSERT(0, ("%s: unknown retval", __func__));
+	}
+
+drop:
+	if (*m0)
+		m_freem(*m0);
+	*m0 = NULL;
+	return (EACCES);
+pass:
+	return 0;	/* not filtered */
+}
+
+/*
+ * pfil(9) output hook: run an outbound packet through ipfw_chk() and
+ * act on its verdict.
+ *
+ * *m0 is the packet; it may be replaced, or set to NULL when the packet
+ * has been consumed or dropped.  'ifp' is recorded as the output
+ * interface in the ipfw arguments.  'dir' must be PFIL_OUT.
+ *
+ * Returns 0 when the packet passes or has been consumed by dummynet or
+ * divert, EACCES when it is dropped (the mbuf is freed here), or the
+ * value returned by ng_ipfw_input_p() for the netgraph action.
+ */
+int
+ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+	INIT_VNET_INET(curvnet);
+	struct ip_fw_args args;
+	struct ng_ipfw_tag *ng_tag;
+	struct m_tag *dn_tag;
+	int ipfw = 0;
+	int divert;
+	int tee;
+#ifdef IPFIREWALL_FORWARD
+	struct m_tag *fwd_tag;
+#endif
+
+	KASSERT(dir == PFIL_OUT, ("ipfw_check_out wrong direction!"));
+
+	bzero(&args, sizeof(args));
+
+	/* A packet coming back from netgraph carries the matching rule. */
+	ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0,
+	    NULL);
+	if (ng_tag != NULL) {
+		KASSERT(ng_tag->dir == NG_IPFW_OUT,
+		    ("ng_ipfw tag with wrong direction"));
+		args.rule = ng_tag->rule;
+		m_tag_delete(*m0, (struct m_tag *)ng_tag);
+	}
+
+again:
+	/* Likewise for a packet coming back from dummynet. */
+	dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL);
+	if (dn_tag != NULL) {
+		struct dn_pkt_tag *dt;
+
+		dt = (struct dn_pkt_tag *)(dn_tag+1);
+		args.rule = dt->rule;
+
+		m_tag_delete(*m0, dn_tag);
+	}
+
+	args.m = *m0;
+	args.oif = ifp;
+	args.inp = inp;
+	tee = 0;
+
+	/* With one_pass set, a packet that already matched a rule passes. */
+	if (V_fw_one_pass == 0 || args.rule == NULL) {
+		ipfw = ipfw_chk(&args);
+		*m0 = args.m;
+	} else
+		ipfw = IP_FW_PASS;
+
+	KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+	    __func__));
+
+	switch (ipfw) {
+	case IP_FW_PASS:
+                if (args.next_hop == NULL)
+                        goto pass;
+#ifdef IPFIREWALL_FORWARD
+		/* Overwrite existing tag. */
+		fwd_tag = m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+		if (fwd_tag == NULL) {
+			fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
+				sizeof(struct sockaddr_in), M_NOWAIT);
+			if (fwd_tag == NULL)
+				goto drop;
+		} else
+			m_tag_unlink(*m0, fwd_tag);
+		bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
+		m_tag_prepend(*m0, fwd_tag);
+
+		if (in_localip(args.next_hop->sin_addr))
+			(*m0)->m_flags |= M_FASTFWD_OURS;
+		goto pass;
+#endif
+		/* Without IPFIREWALL_FORWARD a next hop ends in drop below. */
+		break;			/* not reached */
+
+	case IP_FW_DENY:
+		goto drop;
+		break;  		/* not reached */
+
+	case IP_FW_DUMMYNET:
+		/* Leaving the switch here falls into the drop path. */
+		if (ip_dn_io_ptr == NULL)
+			break;
+		/*
+		 * NOTE(review): when ip_v is neither 4 nor 6 the packet is
+		 * not handed to dummynet and control loops back to "again"
+		 * -- confirm ipfw_chk() cannot return IP_FW_DUMMYNET then.
+		 */
+		if (mtod(*m0, struct ip *)->ip_v == 4)
+			ip_dn_io_ptr(m0, DN_TO_IP_OUT, &args);
+		else if (mtod(*m0, struct ip *)->ip_v == 6)
+			ip_dn_io_ptr(m0, DN_TO_IP6_OUT, &args);
+		if (*m0 != NULL)
+			goto again;
+		return 0;		/* packet consumed */
+
+		break;
+
+	case IP_FW_TEE:
+		tee = 1;
+		/* fall through */
+
+	case IP_FW_DIVERT:
+		divert = ipfw_divert(m0, DIV_DIR_OUT, tee);
+		if (divert) {
+			*m0 = NULL;
+			return 0;	/* packet consumed */
+		} else {
+			args.rule = NULL;
+			goto again;	/* continue with packet */
+		}
+
+	case IP_FW_NGTEE:
+		if (!NG_IPFW_LOADED)
+			goto drop;
+		(void)ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 1);
+		goto again;		/* continue with packet */
+
+	case IP_FW_NETGRAPH:
+		if (!NG_IPFW_LOADED)
+			goto drop;
+		return ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 0);
+
+	case IP_FW_NAT:
+		goto again;		/* continue with packet */
+		
+	case IP_FW_REASS:
+		goto again;	
+	
+	default:
+		KASSERT(0, ("%s: unknown retval", __func__));
+	}
+
+drop:
+	if (*m0)
+		m_freem(*m0);
+	*m0 = NULL;
+	return (EACCES);
+pass:
+	return 0;	/* not filtered */
+}
+
+/*
+ * Hand a packet (or, for tee, a copy of it) to the divert hook.
+ *
+ * 'incoming' is passed through to ip_divert_ptr(); 'tee' selects
+ * copy-and-continue instead of consume.
+ *
+ * Returns 1 when the original packet has been consumed (diverted,
+ * queued for reassembly, or freed because no divert hook is installed)
+ * and 0 when 'tee' is set and the original packet continues.
+ */
+static int
+ipfw_divert(struct mbuf **m, int incoming, int tee)
+{
+	/*
+	 * ipfw_chk() has already tagged the packet with the divert tag.
+	 * If tee is set, copy packet and return original.
+	 * If not tee, consume packet and send it to divert socket.
+	 */
+	struct mbuf *clone, *reass;
+	struct ip *ip;
+	int hlen;
+
+	reass = NULL;
+
+	/* Is divert module loaded? */
+	if (ip_divert_ptr == NULL)
+		goto nodivert;
+
+	/* Cloning needed for tee? */
+	if (tee)
+		clone = m_dup(*m, M_DONTWAIT);
+	else
+		clone = *m;
+
+	/* In case m_dup was unable to allocate mbufs. */
+	if (clone == NULL)
+		goto teeout;
+
+	/*
+	 * Divert listeners can only handle non-fragmented packets.
+	 * However when tee is set we will *not* de-fragment the packets;
+	 * Doing so would put the reassembly into double-jeopardy.  On top
+	 * of that someone doing a tee will probably want to get the packet
+	 * in its original form.
+	 */
+	ip = mtod(clone, struct ip *);
+	if (!tee && ip->ip_off & (IP_MF | IP_OFFMASK)) {
+
+		/* Reassemble packet. */
+		reass = ip_reass(clone);
+
+		/*
+		 * IP header checksum fixup after reassembly and leave header
+		 * in network byte order.
+		 */
+		if (reass != NULL) {
+			ip = mtod(reass, struct ip *);
+			hlen = ip->ip_hl << 2;
+			ip->ip_len = htons(ip->ip_len);
+			ip->ip_off = htons(ip->ip_off);
+			ip->ip_sum = 0;
+			if (hlen == sizeof(struct ip))
+				ip->ip_sum = in_cksum_hdr(ip);
+			else
+				ip->ip_sum = in_cksum(reass, hlen);
+			clone = reass;
+		} else
+			/* No complete packet yet: nothing to divert now. */
+			clone = NULL;
+	} else {
+		/* Convert header to network byte order. */
+		ip->ip_len = htons(ip->ip_len);
+		ip->ip_off = htons(ip->ip_off);
+	}
+
+	/* Do the dirty job... */
+	if (clone && ip_divert_ptr != NULL)
+		ip_divert_ptr(clone, incoming);
+
+teeout:
+	/*
+	 * For tee we leave the divert tag attached to original packet.
+	 * It will then continue rule evaluation after the tee rule.
+	 */
+	if (tee)
+		return 0;
+
+	/* Packet diverted and consumed */
+	return 1;
+
+nodivert:
+	/* No divert hook installed: free the packet, report it consumed. */
+	m_freem(*m);
+	return 1;
+}
+
+/*
+ * Register ipfw_check_in and ipfw_check_out on the AF_INET pfil head.
+ * Returns ENOENT when that head cannot be looked up, 0 otherwise; the
+ * status of each pfil_add_hook() call is intentionally discarded.
+ */
+static int
+ipfw_hook(void)
+{
+	struct pfil_head *head = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+
+	if (head == NULL)
+		return (ENOENT);
+
+	/* Inbound filter first, then outbound. */
+	(void)pfil_add_hook(ipfw_check_in, NULL,
+	    PFIL_IN | PFIL_WAITOK, head);
+	(void)pfil_add_hook(ipfw_check_out, NULL,
+	    PFIL_OUT | PFIL_WAITOK, head);
+
+	return (0);
+}
+
+/*
+ * Remove ipfw_check_in and ipfw_check_out from the AF_INET pfil head.
+ * Returns ENOENT when that head cannot be looked up, 0 otherwise; the
+ * status of each pfil_remove_hook() call is intentionally discarded.
+ */
+static int
+ipfw_unhook(void)
+{
+	struct pfil_head *head = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+
+	if (head == NULL)
+		return (ENOENT);
+
+	/* Inbound filter first, then outbound. */
+	(void)pfil_remove_hook(ipfw_check_in, NULL,
+	    PFIL_IN | PFIL_WAITOK, head);
+	(void)pfil_remove_hook(ipfw_check_out, NULL,
+	    PFIL_OUT | PFIL_WAITOK, head);
+
+	return (0);
+}
+
+#ifdef INET6
+/*
+ * Register ipfw_check_in and ipfw_check_out on the AF_INET6 pfil head.
+ * Returns ENOENT when that head cannot be looked up, 0 otherwise; the
+ * status of each pfil_add_hook() call is intentionally discarded.
+ */
+static int
+ipfw6_hook(void)
+{
+	struct pfil_head *head6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+
+	if (head6 == NULL)
+		return (ENOENT);
+
+	/* Inbound filter first, then outbound. */
+	(void)pfil_add_hook(ipfw_check_in, NULL,
+	    PFIL_IN | PFIL_WAITOK, head6);
+	(void)pfil_add_hook(ipfw_check_out, NULL,
+	    PFIL_OUT | PFIL_WAITOK, head6);
+
+	return (0);
+}
+
+/*
+ * Remove ipfw_check_in and ipfw_check_out from the AF_INET6 pfil head.
+ * Returns ENOENT when that head cannot be looked up, 0 otherwise; the
+ * status of each pfil_remove_hook() call is intentionally discarded.
+ */
+static int
+ipfw6_unhook(void)
+{
+	struct pfil_head *head6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+
+	if (head6 == NULL)
+		return (ENOENT);
+
+	/* Inbound filter first, then outbound. */
+	(void)pfil_remove_hook(ipfw_check_in, NULL,
+	    PFIL_IN | PFIL_WAITOK, head6);
+	(void)pfil_remove_hook(ipfw_check_out, NULL,
+	    PFIL_OUT | PFIL_WAITOK, head6);
+
+	return (0);
+}
+#endif /* INET6 */
+
+/*
+ * Sysctl handler for the ipfw enable switches.
+ *
+ * arg1 points at the int being toggled.  The value supplied through the
+ * request is normalised to 0 or 1; if it differs from the current setting
+ * the matching hook/unhook routine is run (IPv4 for V_fw_enable, IPv6 for
+ * V_fw6_enable) and the variable is updated only when that succeeded.
+ * Returns 0 on success or the first error encountered.
+ */
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+	INIT_VNET_IPFW(curvnet);
+	int *flagp = (int *)arg1;
+	int wanted = *flagp;
+	int rc;
+
+	rc = sysctl_handle_int(oidp, &wanted, 0, req);
+	if (rc != 0)
+		return (rc);
+
+	/* Collapse any non-zero request to 1. */
+	wanted = (wanted != 0);
+
+	/* Nothing to do when the setting is unchanged. */
+	if (wanted == *flagp)
+		return (0);
+
+	if (arg1 == &V_fw_enable)
+		rc = wanted ? ipfw_hook() : ipfw_unhook();
+#ifdef INET6
+	if (arg1 == &V_fw6_enable)
+		rc = wanted ? ipfw6_hook() : ipfw6_unhook();
+#endif
+
+	if (rc != 0)
+		return (rc);
+
+	/* Commit the new state only after the hooks were changed. */
+	*flagp = wanted;
+
+	return (0);
+}
+
+/*
+ * Module event handler for ipfw.
+ *
+ *	MOD_LOAD	run ipfw_init(), then attach the IPv4 (and, with
+ *			INET6, the IPv6) pfil hooks; stop at the first
+ *			failure and return its error.
+ *	MOD_UNLOAD	detach the hooks and, if that worked, call
+ *			ipfw_destroy().
+ *	other		EOPNOTSUPP.
+ *
+ * The mod and unused arguments are not referenced.
+ *
+ * NOTE(review): a failed MOD_LOAD does not undo the steps that already
+ * succeeded.  Presumably the module framework follows a failed MOD_LOAD
+ * with MOD_UNLOAD, in which case a rollback here would destroy state
+ * twice -- confirm before adding one.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+	int err = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		if ((err = ipfw_init()) != 0) {
+			printf("ipfw_init() error\n");
+			break;
+		}
+		if ((err = ipfw_hook()) != 0) {
+			printf("ipfw_hook() error\n");
+			break;
+		}
+#ifdef INET6
+		if ((err = ipfw6_hook()) != 0) {
+			/* Name the routine that actually failed. */
+			printf("ipfw6_hook() error\n");
+			break;
+		}
+#endif
+		break;
+
+	case MOD_UNLOAD:
+		/* Keep the module state alive if a hook could not be removed. */
+		if ((err = ipfw_unhook()) != 0)
+			break;
+#ifdef INET6
+		if ((err = ipfw6_unhook()) != 0)
+			break;
+#endif
+		ipfw_destroy();
+		break;
+
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (err);
+}
+
+/*
+ * Module descriptor passed to DECLARE_MODULE() below: the module is
+ * named "ipfw" and its events are dispatched to ipfw_modevent().
+ * NOTE(review): the trailing 0 is presumably the private-data argument,
+ * which ipfw_modevent() ignores -- confirm against moduledata_t.
+ */
+static moduledata_t ipfwmod = {
+	"ipfw",
+	ipfw_modevent,
+	0
+};
+/*
+ * NOTE(review): SI_ORDER_ANY - 256 presumably makes ipfw initialise ahead
+ * of modules registered at plain SI_ORDER_ANY in the same subsystem --
+ * confirm the intended ordering.
+ */
+DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY - 256);
+MODULE_VERSION(ipfw, 2);