aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/pf/netinet/in4_cksum.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/pf/netinet/in4_cksum.c')
-rw-r--r--sys/contrib/pf/netinet/in4_cksum.c168
1 files changed, 168 insertions, 0 deletions
diff --git a/sys/contrib/pf/netinet/in4_cksum.c b/sys/contrib/pf/netinet/in4_cksum.c
index 1c40f2e05b6e..ca2256f2e96f 100644
--- a/sys/contrib/pf/netinet/in4_cksum.c
+++ b/sys/contrib/pf/netinet/in4_cksum.c
@@ -1,3 +1,4 @@
+/* $FreeBSD$ */
/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */
/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */
/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */
@@ -72,6 +73,39 @@
#include <netinet/ip.h>
#include <netinet/ip_var.h>
+#if defined(__FreeBSD__) && defined(__i386__)
+/*
+ * Copied from FreeBSD 5.0 sys/i386/i386/in_cksum.c
+ * XXX
+ * Currently support I386 processor only.
+ * In the long run, we need an optimized cksum routines for each Tier1
+ * architecture. Due to the lack of available hardware except I386 I
+ * can't support other processors now. For those users which use Sparc64,
+ * Alpha processors can use more optimized version in FreeBSD.
+ * See sys/$ARCH/$ARCH/in_cksum.c where $ARCH=`uname -p`
+ */
+
+/*
+ * These asm statements require __volatile because they pass information
+ * via the condition codes. GCC does not currently provide a way to specify
+ * the condition codes as an input or output operand.
+ *
+ * The LOAD macro below is effectively a prefetch into cache. GCC will
+ * load the value into a register but will not use it. Since modern CPUs
+ * reorder operations, this will generally take place in parallel with
+ * other calculations.
+ */
+#define ADD(n) __asm __volatile \
+ ("addl %1, %0" : "+r" (sum) : \
+ "g" (((const u_int32_t *)w)[n / 4]))
+#define ADDC(n) __asm __volatile \
+ ("adcl %1, %0" : "+r" (sum) : \
+ "g" (((const u_int32_t *)w)[n / 4]))
+#define LOAD(n) __asm __volatile \
+ ("" : : "r" (((const u_int32_t *)w)[n / 4]))
+#define MOP __asm __volatile \
+ ("adcl $0, %0" : "+r" (sum))
+#endif
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
* This is only for IPv4 pseudo header checksum.
@@ -86,6 +120,11 @@
#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+#if defined(__FreeBSD__)
+int
+in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
+#endif
+
int
in4_cksum(m, nxt, off, len)
struct mbuf *m;
@@ -158,6 +197,134 @@ in4_cksum(m, nxt, off, len)
if (len < mlen)
mlen = len;
len -= mlen;
+#if defined(__FreeBSD__) && defined(__i386__)
+ /*
+ * Force to long boundary so we do longword aligned
+ * memory operations
+ */
+ if (3 & (int) w) {
+ REDUCE;
+ if ((1 & (int) w) && (mlen > 0)) {
+ sum <<= 8;
+ s_util.c[0] = *(char *)w;
+ w = (u_short *)((char *)w + 1);
+ mlen--;
+ byte_swapped = 1;
+ }
+ if ((2 & (int) w) && (mlen >= 2)) {
+ sum += *w++;
+ mlen -= 2;
+ }
+ }
+ /*
+ * Advance to a 486 cache line boundary.
+ */
+ if (4 & (int) w && mlen >= 4) {
+ ADD(0);
+ MOP;
+ w += 2;
+ mlen -= 4;
+ }
+ if (8 & (int) w && mlen >= 8) {
+ ADD(0);
+ ADDC(4);
+ MOP;
+ w += 4;
+ mlen -= 8;
+ }
+ /*
+ * Do as much of the checksum as possible 32 bits at at time.
+ * In fact, this loop is unrolled to make overhead from
+ * branches &c small.
+ */
+ mlen -= 1;
+ while ((mlen -= 32) >= 0) {
+ /*
+ * Add with carry 16 words and fold in the last
+ * carry by adding a 0 with carry.
+ *
+ * The early ADD(16) and the LOAD(32) are to load
+ * the next 2 cache lines in advance on 486's. The
+ * 486 has a penalty of 2 clock cycles for loading
+ * a cache line, plus whatever time the external
+ * memory takes to load the first word(s) addressed.
+ * These penalties are unavoidable. Subsequent
+ * accesses to a cache line being loaded (and to
+ * other external memory?) are delayed until the
+ * whole load finishes. These penalties are mostly
+ * avoided by not accessing external memory for
+ * 8 cycles after the ADD(16) and 12 cycles after
+ * the LOAD(32). The loop terminates when mlen
+ * is initially 33 (not 32) to guaranteed that
+ * the LOAD(32) is within bounds.
+ */
+ ADD(16);
+ ADDC(0);
+ ADDC(4);
+ ADDC(8);
+ ADDC(12);
+ LOAD(32);
+ ADDC(20);
+ ADDC(24);
+ ADDC(28);
+ MOP;
+ w += 16;
+ }
+ mlen += 32 + 1;
+ if (mlen >= 32) {
+ ADD(16);
+ ADDC(0);
+ ADDC(4);
+ ADDC(8);
+ ADDC(12);
+ ADDC(20);
+ ADDC(24);
+ ADDC(28);
+ MOP;
+ w += 16;
+ mlen -= 32;
+ }
+ if (mlen >= 16) {
+ ADD(0);
+ ADDC(4);
+ ADDC(8);
+ ADDC(12);
+ MOP;
+ w += 8;
+ mlen -= 16;
+ }
+ if (mlen >= 8) {
+ ADD(0);
+ ADDC(4);
+ MOP;
+ w += 4;
+ mlen -= 8;
+ }
+ if (mlen == 0 && byte_swapped == 0)
+ continue; /* worth 1% maybe ?? */
+ REDUCE;
+ while ((mlen -= 2) >= 0) {
+ sum += *w++;
+ }
+ if (byte_swapped) {
+ REDUCE;
+ sum <<= 8;
+ byte_swapped = 0;
+ if (mlen == -1) {
+ s_util.c[1] = *(char *)w;
+ sum += s_util.s;
+ mlen = 0;
+ } else
+ mlen = -1;
+ } else if (mlen == -1)
+ /*
+ * This mbuf has odd number of bytes.
+ * There could be a word split betwen
+ * this mbuf and the next mbuf.
+ * Save the last byte (to prepend to next mbuf).
+ */
+ s_util.c[0] = *(char *)w;
+#else
/*
* Force to even boundary.
*/
@@ -204,6 +371,7 @@ in4_cksum(m, nxt, off, len)
mlen = -1;
} else if (mlen == -1)
s_util.c[0] = *(u_int8_t *)w;
+#endif
}
if (len)
printf("cksum4: out of data\n");