Diffstat (limited to 'contrib/ofed/libmlx4')
-rw-r--r--    contrib/ofed/libmlx4/buf.c          64
-rw-r--r--    contrib/ofed/libmlx4/config.h       13
-rw-r--r--    contrib/ofed/libmlx4/cq.c          819
-rw-r--r--    contrib/ofed/libmlx4/dbrec.c       151
-rw-r--r--    contrib/ofed/libmlx4/doorbell.h     70
-rw-r--r--    contrib/ofed/libmlx4/mlx4-abi.h    159
-rw-r--r--    contrib/ofed/libmlx4/mlx4.c        327
-rw-r--r--    contrib/ofed/libmlx4/mlx4.h        458
-rw-r--r--    contrib/ofed/libmlx4/mmio.h        116
-rw-r--r--    contrib/ofed/libmlx4/qp.c          776
-rw-r--r--    contrib/ofed/libmlx4/srq.c         325
-rw-r--r--    contrib/ofed/libmlx4/verbs.c      1255
-rw-r--r--    contrib/ofed/libmlx4/wqe.h         149
13 files changed, 4682 insertions(+), 0 deletions(-)
diff --git a/contrib/ofed/libmlx4/buf.c b/contrib/ofed/libmlx4/buf.c
new file mode 100644
index 000000000000..9b41e7f62525
--- /dev/null
+++ b/contrib/ofed/libmlx4/buf.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "mlx4.h"
+
+int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
+{
+ int ret;
+
+ buf->length = align(size, page_size);
+ buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (buf->buf == MAP_FAILED)
+ return errno;
+
+ ret = ibv_dontfork_range(buf->buf, size);
+ if (ret)
+ munmap(buf->buf, buf->length);
+
+ return ret;
+}
+
+void mlx4_free_buf(struct mlx4_buf *buf)
+{
+ if (buf->length) {
+ ibv_dofork_range(buf->buf, buf->length);
+ munmap(buf->buf, buf->length);
+ }
+}
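
For orientation: mlx4_alloc_buf() above is a page-aligned anonymous mmap() followed by fork protection via ibv_dontfork_range(). Below is a minimal standalone sketch of the same round-up-and-map logic, leaving out the fork-protection call; round_up() and the requested size are illustrative, not part of the library.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Round val up to the next multiple of align (a power of two), mirroring
 * the align() helper that mlx4_alloc_buf() uses on the requested size. */
static size_t round_up(size_t val, size_t align)
{
	return (val + align - 1) & ~(align - 1);
}

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t want = 1000;			/* requested size in bytes */
	size_t len = round_up(want, page);	/* what mlx4_alloc_buf would map */
	void *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("requested %zu bytes, mapped %zu bytes at %p\n", want, len, buf);
	munmap(buf, len);
	return 0;
}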
diff --git a/contrib/ofed/libmlx4/config.h b/contrib/ofed/libmlx4/config.h
new file mode 100644
index 000000000000..af75292ef03e
--- /dev/null
+++ b/contrib/ofed/libmlx4/config.h
@@ -0,0 +1,13 @@
+/* $FreeBSD$ */
+
+#ifdef __LP64__
+#define SIZEOF_LONG 8
+#else
+#define SIZEOF_LONG 4
+#endif
+
+#define VALGRIND_MAKE_MEM_DEFINED(...) 0
+#define SWITCH_FALLTHROUGH (void)0
+#define ALWAYS_INLINE __attribute__ ((__always_inline__))
+#define likely(x) __predict_true(x)
+#define unlikely(x) __predict_false(x)
diff --git a/contrib/ofed/libmlx4/cq.c b/contrib/ofed/libmlx4/cq.c
new file mode 100644
index 000000000000..aa2ec1e9636a
--- /dev/null
+++ b/contrib/ofed/libmlx4/cq.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include <infiniband/opcode.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+
+enum {
+ MLX4_CQ_DOORBELL = 0x20
+};
+
+enum {
+ CQ_OK = 0,
+ CQ_EMPTY = -1,
+ CQ_POLL_ERR = -2
+};
+
+#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MLX4_CQ_DB_REQ_NOT (2 << 24)
+
+enum {
+ MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
+ MLX4_CQE_OWNER_MASK = 0x80,
+ MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_OPCODE_MASK = 0x1f
+};
+
+enum {
+ MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04,
+ MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05,
+ MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06,
+ MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10,
+ MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14,
+ MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15,
+ MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+};
+
+struct mlx4_err_cqe {
+ uint32_t vlan_my_qpn;
+ uint32_t reserved1[5];
+ uint16_t wqe_index;
+ uint8_t vendor_err;
+ uint8_t syndrome;
+ uint8_t reserved2[3];
+ uint8_t owner_sr_opcode;
+};
+
+static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry)
+{
+ return cq->buf.buf + entry * cq->cqe_size;
+}
+
+static void *get_sw_cqe(struct mlx4_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe);
+ struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe;
+
+ return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq)
+{
+ return get_sw_cqe(cq, cq->cons_index);
+}
+
+static enum ibv_wc_status mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
+ printf(PFX "local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32toh(cqe->vlan_my_qpn), be16toh(cqe->wqe_index),
+ cqe->vendor_err,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ return IBV_WC_LOC_LEN_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ return IBV_WC_LOC_QP_OP_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ return IBV_WC_LOC_PROT_ERR;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ return IBV_WC_WR_FLUSH_ERR;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ return IBV_WC_MW_BIND_ERR;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ return IBV_WC_BAD_RESP_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ return IBV_WC_LOC_ACCESS_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ return IBV_WC_REM_INV_REQ_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ return IBV_WC_REM_ACCESS_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ return IBV_WC_REM_OP_ERR;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ return IBV_WC_RETRY_EXC_ERR;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ return IBV_WC_RNR_RETRY_EXC_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ return IBV_WC_REM_ABORT_ERR;
+ default:
+ return IBV_WC_GENERAL_ERR;
+ }
+}
+
+static inline void handle_good_req(struct ibv_wc *wc, struct mlx4_cqe *cqe)
+{
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IBV_WC_WITH_IMM;
+ SWITCH_FALLTHROUGH;
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IBV_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IBV_WC_WITH_IMM;
+ SWITCH_FALLTHROUGH;
+ case MLX4_OPCODE_SEND:
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->opcode = IBV_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IBV_WC_RDMA_READ;
+ wc->byte_len = be32toh(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IBV_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IBV_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ wc->opcode = IBV_WC_LOCAL_INV;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc->opcode = IBV_WC_BIND_MW;
+ break;
+ default:
+ /* assume it's a send completion */
+ wc->opcode = IBV_WC_SEND;
+ break;
+ }
+}
+
+static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe **pcqe)
+ ALWAYS_INLINE;
+static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe **pcqe)
+{
+ struct mlx4_cqe *cqe;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return CQ_EMPTY;
+
+ if (cq->cqe_size == 64)
+ ++cqe;
+
+ ++cq->cons_index;
+
+ VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ udma_from_device_barrier();
+
+ *pcqe = cqe;
+
+ return CQ_OK;
+}
+
+static inline int mlx4_parse_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc, int lazy)
+ ALWAYS_INLINE;
+static inline int mlx4_parse_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc, int lazy)
+{
+ struct mlx4_wq *wq;
+ struct mlx4_srq *srq;
+ uint32_t qpn;
+ uint32_t g_mlpath_rqpn;
+ uint64_t *pwr_id;
+ uint16_t wqe_index;
+ struct mlx4_err_cqe *ecqe;
+ struct mlx4_context *mctx;
+ int is_error;
+ int is_send;
+ enum ibv_wc_status *pstatus;
+
+ mctx = to_mctx(cq->ibv_cq.context);
+ qpn = be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ if (lazy) {
+ cq->cqe = cqe;
+ cq->flags &= (~MLX4_CQ_FLAGS_RX_CSUM_VALID);
+ } else
+ wc->qp_num = qpn;
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
+ /*
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
+ * from the table.
+ */
+ srq = mlx4_find_xsrq(&mctx->xsrq_table,
+ be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
+ if (!srq)
+ return CQ_POLL_ERR;
+ } else {
+ if (!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num)) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mlx4_find_qp(mctx, qpn);
+ if (!*cur_qp)
+ return CQ_POLL_ERR;
+ }
+ srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL;
+ }
+
+ pwr_id = lazy ? &cq->ibv_cq.wr_id : &wc->wr_id;
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_index = be16toh(cqe->wqe_index);
+ wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
+ *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if (srq) {
+ wqe_index = be16toh(cqe->wqe_index);
+ *pwr_id = srq->wrid[wqe_index];
+ mlx4_free_srq_wqe(srq, wqe_index);
+ } else {
+ wq = &(*cur_qp)->rq;
+ *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+
+ pstatus = lazy ? &cq->ibv_cq.status : &wc->status;
+ if (is_error) {
+ ecqe = (struct mlx4_err_cqe *)cqe;
+ *pstatus = mlx4_handle_error_cqe(ecqe);
+ if (!lazy)
+ wc->vendor_err = ecqe->vendor_err;
+ return CQ_OK;
+ }
+
+ *pstatus = IBV_WC_SUCCESS;
+ if (lazy) {
+ if (!is_send)
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID))
+ cq->flags |= MLX4_CQ_FLAGS_RX_CSUM_VALID;
+ } else if (is_send) {
+ handle_good_req(wc, cqe);
+ } else {
+ wc->byte_len = be32toh(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags |= IBV_WC_WITH_INV;
+ wc->imm_data = be32toh(cqe->immed_rss_invalid);
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ wc->slid = be16toh(cqe->rlid);
+ g_mlpath_rqpn = be32toh(cqe->g_mlpath_rqpn);
+ wc->src_qp = g_mlpath_rqpn & 0xffffff;
+ wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
+ wc->pkey_index = be32toh(cqe->immed_rss_invalid) & 0x7f;
+ /* When working with XRC SRQs we have no QP from which to check the
+ * link layer. Use the IB SL for now; RoCE handling is TBD.
+ */
+ if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
+ wc->sl = be16toh(cqe->sl_vid) >> 13;
+ else
+ wc->sl = be16toh(cqe->sl_vid) >> 12;
+
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) {
+ wc->wc_flags |= ((cqe->status & htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) ==
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) <<
+ IBV_WC_IP_CSUM_OK_SHIFT;
+ }
+ }
+
+ return CQ_OK;
+}
+
+static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe)
+ ALWAYS_INLINE;
+static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe)
+{
+ return mlx4_parse_cqe(cq, cqe, &cq->cur_qp, NULL, 1);
+}
+
+static inline int mlx4_poll_one(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc)
+ ALWAYS_INLINE;
+static inline int mlx4_poll_one(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ int err;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY)
+ return err;
+
+ return mlx4_parse_cqe(cq, cqe, cur_qp, wc, 0);
+}
+
+int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_qp *qp = NULL;
+ int npolled;
+ int err = CQ_OK;
+
+ pthread_spin_lock(&cq->lock);
+
+ for (npolled = 0; npolled < ne; ++npolled) {
+ err = mlx4_poll_one(cq, &qp, wc + npolled);
+ if (err != CQ_OK)
+ break;
+ }
+
+ if (npolled || err == CQ_POLL_ERR)
+ mlx4_update_cons_index(cq);
+
+ pthread_spin_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? err : npolled;
+}
+
+static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock)
+ ALWAYS_INLINE;
+static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ mlx4_update_cons_index(cq);
+
+ if (lock)
+ pthread_spin_unlock(&cq->lock);
+}
+
+static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr,
+ int lock)
+ ALWAYS_INLINE;
+static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr,
+ int lock)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_cqe *cqe;
+ int err;
+
+ if (unlikely(attr->comp_mask))
+ return EINVAL;
+
+ if (lock)
+ pthread_spin_lock(&cq->lock);
+
+ cq->cur_qp = NULL;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY) {
+ if (lock)
+ pthread_spin_unlock(&cq->lock);
+ return ENOENT;
+ }
+
+ err = mlx4_parse_lazy_cqe(cq, cqe);
+ if (lock && err)
+ pthread_spin_unlock(&cq->lock);
+
+ return err;
+}
+
+static int mlx4_next_poll(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_cqe *cqe;
+ int err;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY)
+ return ENOENT;
+
+ return mlx4_parse_lazy_cqe(cq, cqe);
+}
+
+static void mlx4_end_poll(struct ibv_cq_ex *ibcq)
+{
+ _mlx4_end_poll(ibcq, 0);
+}
+
+static void mlx4_end_poll_lock(struct ibv_cq_ex *ibcq)
+{
+ _mlx4_end_poll(ibcq, 1);
+}
+
+static int mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr)
+{
+ return _mlx4_start_poll(ibcq, attr, 0);
+}
+
+static int mlx4_start_poll_lock(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr)
+{
+ return _mlx4_start_poll(ibcq, attr, 1);
+}
+
+static enum ibv_wc_opcode mlx4_cq_read_wc_opcode(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ if (cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK) {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_OPCODE_RDMA_WRITE:
+ return IBV_WC_RDMA_WRITE;
+ case MLX4_OPCODE_SEND_INVAL:
+ case MLX4_OPCODE_SEND_IMM:
+ case MLX4_OPCODE_SEND:
+ return IBV_WC_SEND;
+ case MLX4_OPCODE_RDMA_READ:
+ return IBV_WC_RDMA_READ;
+ case MLX4_OPCODE_ATOMIC_CS:
+ return IBV_WC_COMP_SWAP;
+ case MLX4_OPCODE_ATOMIC_FA:
+ return IBV_WC_FETCH_ADD;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ return IBV_WC_LOCAL_INV;
+ case MLX4_OPCODE_BIND_MW:
+ return IBV_WC_BIND_MW;
+ }
+ } else {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ return IBV_WC_RECV_RDMA_WITH_IMM;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ case MLX4_RECV_OPCODE_SEND:
+ return IBV_WC_RECV;
+ }
+ }
+
+ return 0;
+}
+
+static uint32_t mlx4_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+}
+
+static int mlx4_cq_read_wc_flags(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ int is_send = cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ int wc_flags = 0;
+
+ if (is_send) {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_OPCODE_SEND_IMM:
+ wc_flags |= IBV_WC_WITH_IMM;
+ break;
+ }
+ } else {
+ if (cq->flags & MLX4_CQ_FLAGS_RX_CSUM_VALID)
+ wc_flags |= ((cq->cqe->status &
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) ==
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) <<
+ IBV_WC_IP_CSUM_OK_SHIFT;
+
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc_flags |= IBV_WC_WITH_IMM;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc_flags |= IBV_WC_WITH_INV;
+ break;
+ }
+ wc_flags |= (be32toh(cq->cqe->g_mlpath_rqpn) & 0x80000000) ? IBV_WC_GRH : 0;
+ }
+
+ return wc_flags;
+}
+
+static uint32_t mlx4_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->byte_cnt);
+}
+
+static uint32_t mlx4_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_err_cqe *ecqe = (struct mlx4_err_cqe *)cq->cqe;
+
+ return ecqe->vendor_err;
+}
+
+static uint32_t mlx4_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ return be32toh(cq->cqe->immed_rss_invalid);
+ default:
+ return cq->cqe->immed_rss_invalid;
+ }
+}
+
+static uint32_t mlx4_cq_read_wc_slid(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return (uint32_t)be16toh(cq->cqe->rlid);
+}
+
+static uint8_t mlx4_cq_read_wc_sl(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ if ((cq->cur_qp) && (cq->cur_qp->link_layer == IBV_LINK_LAYER_ETHERNET))
+ return be16toh(cq->cqe->sl_vid) >> 13;
+ else
+ return be16toh(cq->cqe->sl_vid) >> 12;
+}
+
+static uint32_t mlx4_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->g_mlpath_rqpn) & 0xffffff;
+}
+
+static uint8_t mlx4_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return (be32toh(cq->cqe->g_mlpath_rqpn) >> 24) & 0x7f;
+}
+
+static uint64_t mlx4_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return ((uint64_t)be32toh(cq->cqe->ts_47_16) << 16) |
+ (cq->cqe->ts_15_8 << 8) |
+ (cq->cqe->ts_7_0);
+}
+
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr)
+{
+
+ if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) {
+ cq->ibv_cq.start_poll = mlx4_start_poll;
+ cq->ibv_cq.end_poll = mlx4_end_poll;
+ } else {
+ cq->ibv_cq.start_poll = mlx4_start_poll_lock;
+ cq->ibv_cq.end_poll = mlx4_end_poll_lock;
+ }
+ cq->ibv_cq.next_poll = mlx4_next_poll;
+
+ cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode;
+ cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err;
+ cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM)
+ cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM)
+ cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP)
+ cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID)
+ cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL)
+ cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+ cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)
+ cq->ibv_cq.read_completion_ts = mlx4_cq_read_wc_completion_ts;
+}
+
+int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
+{
+ struct mlx4_cq *cq = to_mcq(ibvcq);
+ uint32_t doorbell[2];
+ uint32_t sn;
+ uint32_t ci;
+ uint32_t cmd;
+
+ sn = cq->arm_sn & 3;
+ ci = cq->cons_index & 0xffffff;
+ cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT;
+
+ *cq->arm_db = htobe32(sn << 28 | cmd | ci);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ udma_to_device_barrier();
+
+ doorbell[0] = htobe32(sn << 28 | cmd | cq->cqn);
+ doorbell[1] = htobe32(ci);
+
+ mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL);
+
+ return 0;
+}
+
+void mlx4_cq_event(struct ibv_cq *cq)
+{
+ to_mcq(cq)->arm_sn++;
+}
+
+void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
+{
+ struct mlx4_cqe *cqe, *dest;
+ uint32_t prod_index;
+ uint8_t owner_bit;
+ int nfreed = 0;
+ int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
+ cqe += cqe_inc;
+ if (srq && srq->ext_srq &&
+ (be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num &&
+ !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
+ mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index));
+ ++nfreed;
+ } else if ((be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe);
+ dest += cqe_inc;
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ udma_to_device_barrier();
+ mlx4_update_cons_index(cq);
+ }
+}
+
+void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
+{
+ pthread_spin_lock(&cq->lock);
+ __mlx4_cq_clean(cq, qpn, srq);
+ pthread_spin_unlock(&cq->lock);
+}
+
+int mlx4_get_outstanding_cqes(struct mlx4_cq *cq)
+{
+ uint32_t i;
+
+ for (i = cq->cons_index; get_sw_cqe(cq, i); ++i)
+ ;
+
+ return i - cq->cons_index;
+}
+
+void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe)
+{
+ struct mlx4_cqe *cqe;
+ int i;
+ int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
+
+ i = cq->cons_index;
+ cqe = get_cqe(cq, (i & old_cqe));
+ cqe += cqe_inc;
+
+ while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+ cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+ (((i + 1) & (cq->ibv_cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+ memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size,
+ cqe - cqe_inc, cq->cqe_size);
+ ++i;
+ cqe = get_cqe(cq, (i & old_cqe));
+ cqe += cqe_inc;
+ }
+
+ ++cq->cons_index;
+}
+
+int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+ int entry_size)
+{
+ if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size),
+ dev->page_size))
+ return -1;
+ memset(buf->buf, 0, nent * entry_size);
+
+ return 0;
+}
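
From the application side, the poll path above is reached through the standard libibverbs ibv_poll_cq() wrapper, which dispatches to mlx4_poll_cq() via the context ops installed in mlx4.c. A sketch of a consumer-side drain loop, assuming a CQ created elsewhere; drain_cq() and the batch size of 16 are illustrative.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Drain completions from cq in batches; returns the number of successful
 * completions seen, or -1 on a poll error. */
static int drain_cq(struct ibv_cq *cq)
{
	struct ibv_wc wc[16];
	int total = 0;
	int n;

	do {
		n = ibv_poll_cq(cq, 16, wc);	/* ends up in mlx4_poll_cq() */
		if (n < 0)
			return -1;
		for (int i = 0; i < n; ++i) {
			if (wc[i].status != IBV_WC_SUCCESS) {
				fprintf(stderr, "wr_id %llu failed: %s\n",
					(unsigned long long)wc[i].wr_id,
					ibv_wc_status_str(wc[i].status));
				continue;
			}
			++total;
		}
	} while (n > 0);

	return total;
}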
diff --git a/contrib/ofed/libmlx4/dbrec.c b/contrib/ofed/libmlx4/dbrec.c
new file mode 100644
index 000000000000..3e875738fa61
--- /dev/null
+++ b/contrib/ofed/libmlx4/dbrec.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define _GNU_SOURCE
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+
+struct mlx4_db_page {
+ struct mlx4_db_page *prev, *next;
+ struct mlx4_buf buf;
+ int num_db;
+ int use_cnt;
+ unsigned long free[0];
+};
+
+static const int db_size[] = {
+ [MLX4_DB_TYPE_CQ] = 8,
+ [MLX4_DB_TYPE_RQ] = 4,
+};
+
+static struct mlx4_db_page *__add_page(struct mlx4_context *context,
+ enum mlx4_db_type type)
+{
+ struct mlx4_db_page *page;
+ int ps = to_mdev(context->ibv_ctx.device)->page_size;
+ int pp;
+ int i;
+
+ pp = ps / db_size[type];
+
+ page = malloc(sizeof *page + pp / 8);
+ if (!page)
+ return NULL;
+
+ if (mlx4_alloc_buf(&page->buf, ps, ps)) {
+ free(page);
+ return NULL;
+ }
+
+ page->num_db = pp;
+ page->use_cnt = 0;
+ for (i = 0; i < pp / (sizeof (long) * 8); ++i)
+ page->free[i] = ~0;
+
+ page->prev = NULL;
+ page->next = context->db_list[type];
+ context->db_list[type] = page;
+ if (page->next)
+ page->next->prev = page;
+
+ return page;
+}
+
+uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type)
+{
+ struct mlx4_db_page *page;
+ uint32_t *db = NULL;
+ int i, j;
+
+ pthread_mutex_lock(&context->db_list_mutex);
+
+ for (page = context->db_list[type]; page; page = page->next)
+ if (page->use_cnt < page->num_db)
+ goto found;
+
+ page = __add_page(context, type);
+ if (!page)
+ goto out;
+
+found:
+ ++page->use_cnt;
+
+ for (i = 0; !page->free[i]; ++i)
+ /* nothing */;
+
+ j = ffsl(page->free[i]);
+ page->free[i] &= ~(1UL << (j - 1));
+ db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type];
+
+out:
+ pthread_mutex_unlock(&context->db_list_mutex);
+
+ return db;
+}
+
+void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db)
+{
+ struct mlx4_db_page *page;
+ uintptr_t ps = to_mdev(context->ibv_ctx.device)->page_size;
+ int i;
+
+ pthread_mutex_lock(&context->db_list_mutex);
+
+ for (page = context->db_list[type]; page; page = page->next)
+ if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf)
+ break;
+
+ if (!page)
+ goto out;
+
+ i = ((void *) db - page->buf.buf) / db_size[type];
+ page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long)));
+
+ if (!--page->use_cnt) {
+ if (page->prev)
+ page->prev->next = page->next;
+ else
+ context->db_list[type] = page->next;
+ if (page->next)
+ page->next->prev = page->prev;
+
+ mlx4_free_buf(&page->buf);
+ free(page);
+ }
+
+out:
+ pthread_mutex_unlock(&context->db_list_mutex);
+}
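
The allocator above treats each doorbell page as a bitmap of fixed-size records: free[] holds one bit per record, ffsl() picks a free slot in mlx4_alloc_db(), and mlx4_free_db() recovers the slot index from the pointer offset within the page. A self-contained sketch of that index arithmetic, assuming the 8-byte CQ record size and a two-word bitmap; the names and the simplified word scan are illustrative.

#define _GNU_SOURCE		/* ffsl() lives in string.h on glibc; strings.h on FreeBSD */
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define DB_SIZE		8		/* bytes per doorbell record (CQ case) */
#define BITS_PER_LONG	(8 * sizeof(long))

int main(void)
{
	unsigned long free_map[2] = { ~0UL, ~0UL };	/* all slots free */
	int word, bit, slot;

	/* Allocation: find a word with a free bit, then the lowest set bit. */
	word = free_map[0] ? 0 : 1;
	bit = ffsl(free_map[word]) - 1;
	free_map[word] &= ~(1UL << bit);
	slot = word * (int)BITS_PER_LONG + bit;
	printf("allocated slot %d -> byte offset %d in the page\n",
	       slot, slot * DB_SIZE);

	/* Freeing: recover the slot from the byte offset and set its bit,
	 * exactly as mlx4_free_db() does. */
	int offset = slot * DB_SIZE;
	int i = offset / DB_SIZE;
	free_map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
	printf("freed slot %d\n", i);
	return 0;
}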
diff --git a/contrib/ofed/libmlx4/doorbell.h b/contrib/ofed/libmlx4/doorbell.h
new file mode 100644
index 000000000000..140a6158d7f2
--- /dev/null
+++ b/contrib/ofed/libmlx4/doorbell.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef DOORBELL_H
+#define DOORBELL_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include "mlx4.h"
+#include "mmio.h"
+
+struct mlx4_context;
+
+#if SIZEOF_LONG == 8
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0])
+#elif __BYTE_ORDER == __BIG_ENDIAN
+# define MLX4_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1])
+#else
+# error __BYTE_ORDER not defined
+#endif
+
+static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
+{
+ mmio_writeq((unsigned long)(ctx->uar + offset), MLX4_PAIR_TO_64(val));
+}
+
+#else
+
+static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
+{
+ pthread_spin_lock(&ctx->uar_lock);
+ mmio_writel((unsigned long)(ctx->uar + offset), val[0]);
+ mmio_writel((unsigned long)(ctx->uar + offset + 4), val[1]);
+ pthread_spin_unlock(&ctx->uar_lock);
+}
+
+#endif
+
+#endif /* DOORBELL_H */
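
The 64-bit path of mlx4_write64() packs the two big-endian words into a single MMIO store, while the 32-bit path issues two ordered 32-bit stores under a spinlock; either way the device sees val[0]'s bytes followed by val[1]'s. A host-only sketch demonstrating that equivalence for the little-endian MLX4_PAIR_TO_64 packing (it assumes a little-endian machine and touches no device memory):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* Two 32-bit big-endian words, as prepared by e.g. mlx4_arm_cq(). */
	uint32_t val[2] = { 0x11223344, 0x55667788 };

	/* Little-endian packing from doorbell.h: val[1] in the high half. */
	uint64_t packed = (uint64_t)val[1] << 32 | val[0];

	unsigned char as_one[8], as_two[8];
	memcpy(as_one, &packed, 8);	/* what a single 64-bit store emits */
	memcpy(as_two, &val[0], 4);	/* what two 32-bit stores emit ...  */
	memcpy(as_two + 4, &val[1], 4);	/* ... in order val[0], then val[1] */

	printf("byte streams %s\n",
	       memcmp(as_one, as_two, 8) == 0 ? "match" : "differ");
	return 0;
}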
diff --git a/contrib/ofed/libmlx4/mlx4-abi.h b/contrib/ofed/libmlx4/mlx4-abi.h
new file mode 100644
index 000000000000..7d89505606e2
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4-abi.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ABI_H
+#define MLX4_ABI_H
+
+#include <infiniband/kern-abi.h>
+
+#define MLX4_UVERBS_MIN_ABI_VERSION 2
+#define MLX4_UVERBS_MAX_ABI_VERSION 4
+
+#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3
+
+enum {
+ MLX4_USER_DEV_CAP_64B_CQE = 1L << 0
+};
+
+struct mlx4_alloc_ucontext_resp_v3 {
+ struct ibv_get_context_resp ibv_resp;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+enum mlx4_query_dev_ex_resp_mask {
+ MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
+};
+
+struct mlx4_alloc_ucontext_resp {
+ struct ibv_get_context_resp ibv_resp;
+ __u32 dev_caps;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+ __u32 cqe_size;
+};
+
+struct mlx4_alloc_pd_resp {
+ struct ibv_alloc_pd_resp ibv_resp;
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_create_cq {
+ struct ibv_create_cq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_cq_resp {
+ struct ibv_create_cq_resp ibv_resp;
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_create_cq_ex {
+ struct ibv_create_cq_ex ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_cq_resp_ex {
+ struct ibv_create_cq_resp_ex ibv_resp;
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_resize_cq {
+ struct ibv_resize_cq ibv_cmd;
+ __u64 buf_addr;
+};
+
+struct mlx4_query_device_ex_resp {
+ struct ibv_query_device_resp_ex ibv_resp;
+ __u32 comp_mask;
+ __u32 response_length;
+ __u64 hca_core_clock_offset;
+};
+
+struct mlx4_query_device_ex {
+ struct ibv_query_device_ex ibv_cmd;
+};
+
+struct mlx4_create_srq {
+ struct ibv_create_srq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_xsrq {
+ struct ibv_create_xsrq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_srq_resp {
+ struct ibv_create_srq_resp ibv_resp;
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_create_qp {
+ struct ibv_create_qp ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch; /* was reserved in ABI 2 */
+ __u8 reserved[5];
+};
+
+struct mlx4_create_qp_drv_ex {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch; /* was reserved in ABI 2 */
+ __u8 reserved[5];
+};
+
+struct mlx4_create_qp_ex {
+ struct ibv_create_qp_ex ibv_cmd;
+ struct mlx4_create_qp_drv_ex drv_ex;
+};
+
+struct mlx4_create_qp_resp_ex {
+ struct ibv_create_qp_resp_ex ibv_resp;
+};
+
+#endif /* MLX4_ABI_H */
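
All of these ABI structs follow one embedding pattern: the generic ibv_* command or response is the first member, so the uverbs core parses the common header while the mlx4 kernel driver reads the driver-private tail (buf_addr, db_addr, and so on). A sketch of that layout using simplified, hypothetical stand-in structs rather than the real kern-abi.h definitions:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins: the generic verbs header comes first, the
 * driver-private fields follow it in the same buffer. */
struct generic_create_cq {		/* plays the role of ibv_create_cq */
	uint64_t response;
	uint32_t cqe;
	uint32_t comp_vector;
};

struct drv_create_cq {			/* plays the role of mlx4_create_cq */
	struct generic_create_cq ibv_cmd;	/* must be the first member */
	uint64_t buf_addr;		/* driver-private: userspace CQ buffer */
	uint64_t db_addr;		/* driver-private: doorbell record */
};

int main(void)
{
	printf("generic header: %zu bytes at offset %zu\n",
	       sizeof(struct generic_create_cq),
	       offsetof(struct drv_create_cq, ibv_cmd));
	printf("driver payload starts at offset %zu of %zu total bytes\n",
	       offsetof(struct drv_create_cq, buf_addr),
	       sizeof(struct drv_create_cq));
	return 0;
}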
diff --git a/contrib/ofed/libmlx4/mlx4.c b/contrib/ofed/libmlx4/mlx4.c
new file mode 100644
index 000000000000..229c2670b5ed
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "mlx4-abi.h"
+
+#ifndef PCI_VENDOR_ID_MELLANOX
+#define PCI_VENDOR_ID_MELLANOX 0x15b3
+#endif
+
+#define HCA(v, d) \
+ { .vendor = PCI_VENDOR_ID_##v, \
+ .device = d }
+
+static struct {
+ unsigned vendor;
+ unsigned device;
+} hca_table[] = {
+ HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */
+ HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */
+ HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */
+ HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */
+ HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */
+ HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */
+ HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/
+ HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+ HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */
+ HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */
+ HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */
+ HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */
+ HCA(MELLANOX, 0x1005), /* MT27510 Family */
+ HCA(MELLANOX, 0x1006), /* MT27511 Family */
+ HCA(MELLANOX, 0x1007), /* MT27520 Family */
+ HCA(MELLANOX, 0x1008), /* MT27521 Family */
+ HCA(MELLANOX, 0x1009), /* MT27530 Family */
+ HCA(MELLANOX, 0x100a), /* MT27531 Family */
+ HCA(MELLANOX, 0x100b), /* MT27540 Family */
+ HCA(MELLANOX, 0x100c), /* MT27541 Family */
+ HCA(MELLANOX, 0x100d), /* MT27550 Family */
+ HCA(MELLANOX, 0x100e), /* MT27551 Family */
+ HCA(MELLANOX, 0x100f), /* MT27560 Family */
+ HCA(MELLANOX, 0x1010), /* MT27561 Family */
+};
+
+static struct ibv_context_ops mlx4_ctx_ops = {
+ .query_device = mlx4_query_device,
+ .query_port = mlx4_query_port,
+ .alloc_pd = mlx4_alloc_pd,
+ .dealloc_pd = mlx4_free_pd,
+ .reg_mr = mlx4_reg_mr,
+ .rereg_mr = mlx4_rereg_mr,
+ .dereg_mr = mlx4_dereg_mr,
+ .alloc_mw = mlx4_alloc_mw,
+ .dealloc_mw = mlx4_dealloc_mw,
+ .bind_mw = mlx4_bind_mw,
+ .create_cq = mlx4_create_cq,
+ .poll_cq = mlx4_poll_cq,
+ .req_notify_cq = mlx4_arm_cq,
+ .cq_event = mlx4_cq_event,
+ .resize_cq = mlx4_resize_cq,
+ .destroy_cq = mlx4_destroy_cq,
+ .create_srq = mlx4_create_srq,
+ .modify_srq = mlx4_modify_srq,
+ .query_srq = mlx4_query_srq,
+ .destroy_srq = mlx4_destroy_srq,
+ .post_srq_recv = mlx4_post_srq_recv,
+ .create_qp = mlx4_create_qp,
+ .query_qp = mlx4_query_qp,
+ .modify_qp = mlx4_modify_qp,
+ .destroy_qp = mlx4_destroy_qp,
+ .post_send = mlx4_post_send,
+ .post_recv = mlx4_post_recv,
+ .create_ah = mlx4_create_ah,
+ .destroy_ah = mlx4_destroy_ah,
+ .attach_mcast = ibv_cmd_attach_mcast,
+ .detach_mcast = ibv_cmd_detach_mcast
+};
+
+static int mlx4_map_internal_clock(struct mlx4_device *mdev,
+ struct ibv_context *ibv_ctx)
+{
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+ void *hca_clock_page;
+
+ hca_clock_page = mmap(NULL, mdev->page_size,
+ PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
+ mdev->page_size * 3);
+
+ if (hca_clock_page == MAP_FAILED) {
+ fprintf(stderr, PFX
+ "Warning: Timestamp available,\n"
+ "but failed to mmap() hca core clock page.\n");
+ return -1;
+ }
+
+ context->hca_core_clock = hca_clock_page +
+ (context->core_clock.offset & (mdev->page_size - 1));
+ return 0;
+}
+
+static int mlx4_init_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx, int cmd_fd)
+{
+ struct mlx4_context *context;
+ struct ibv_get_context cmd;
+ struct mlx4_alloc_ucontext_resp resp;
+ int i;
+ struct mlx4_alloc_ucontext_resp_v3 resp_v3;
+ __u16 bf_reg_size;
+ struct mlx4_device *dev = to_mdev(&v_device->device);
+ struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
+ struct ibv_device_attr_ex dev_attrs;
+
+ /* The memory footprints of mlx4_context and verbs_context overlap:
+ * both embed struct ibv_context.
+ */
+ context = to_mctx(ibv_ctx);
+ ibv_ctx->cmd_fd = cmd_fd;
+
+ if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+ if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd,
+ &resp_v3.ibv_resp, sizeof resp_v3))
+ return errno;
+
+ context->num_qps = resp_v3.qp_tab_size;
+ bf_reg_size = resp_v3.bf_reg_size;
+ context->cqe_size = sizeof (struct mlx4_cqe);
+ } else {
+ if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp))
+ return errno;
+
+ context->num_qps = resp.qp_tab_size;
+ bf_reg_size = resp.bf_reg_size;
+ if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE)
+ context->cqe_size = resp.cqe_size;
+ else
+ context->cqe_size = sizeof (struct mlx4_cqe);
+ }
+
+ context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS;
+ context->qp_table_mask = (1 << context->qp_table_shift) - 1;
+ for (i = 0; i < MLX4_PORTS_NUM; ++i)
+ context->port_query_cache[i].valid = 0;
+
+ pthread_mutex_init(&context->qp_table_mutex, NULL);
+ for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
+ context->qp_table[i].refcnt = 0;
+
+ for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
+ context->db_list[i] = NULL;
+
+ mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps);
+ pthread_mutex_init(&context->db_list_mutex, NULL);
+
+ context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
+ MAP_SHARED, cmd_fd, 0);
+ if (context->uar == MAP_FAILED)
+ return errno;
+
+ if (bf_reg_size) {
+ context->bf_page = mmap(NULL, dev->page_size,
+ PROT_WRITE, MAP_SHARED, cmd_fd,
+ dev->page_size);
+ if (context->bf_page == MAP_FAILED) {
+ fprintf(stderr, PFX "Warning: BlueFlame available, "
+ "but failed to mmap() BlueFlame page.\n");
+ context->bf_page = NULL;
+ context->bf_buf_size = 0;
+ } else {
+ context->bf_buf_size = bf_reg_size / 2;
+ context->bf_offset = 0;
+ pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
+ }
+ } else {
+ context->bf_page = NULL;
+ context->bf_buf_size = 0;
+ }
+
+ pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
+ ibv_ctx->ops = mlx4_ctx_ops;
+
+ context->hca_core_clock = NULL;
+ memset(&dev_attrs, 0, sizeof(dev_attrs));
+ if (!mlx4_query_device_ex(ibv_ctx, NULL, &dev_attrs,
+ sizeof(struct ibv_device_attr_ex))) {
+ context->max_qp_wr = dev_attrs.orig_attr.max_qp_wr;
+ context->max_sge = dev_attrs.orig_attr.max_sge;
+ if (context->core_clock.offset_valid)
+ mlx4_map_internal_clock(dev, ibv_ctx);
+ }
+
+ verbs_ctx->has_comp_mask = VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
+ VERBS_CONTEXT_QP;
+ verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
+ verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
+ verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
+ verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
+ verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
+ verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
+ verbs_set_ctx_op(verbs_ctx, ibv_create_flow, ibv_cmd_create_flow);
+ verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
+ verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex);
+ verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex);
+ verbs_set_ctx_op(verbs_ctx, query_rt_values, mlx4_query_rt_values);
+
+ return 0;
+
+}
+
+static void mlx4_uninit_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx)
+{
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+
+ munmap(context->uar, to_mdev(&v_device->device)->page_size);
+ if (context->bf_page)
+ munmap(context->bf_page, to_mdev(&v_device->device)->page_size);
+ if (context->hca_core_clock)
+ munmap(context->hca_core_clock - context->core_clock.offset,
+ to_mdev(&v_device->device)->page_size);
+}
+
+static struct verbs_device_ops mlx4_dev_ops = {
+ .init_context = mlx4_init_context,
+ .uninit_context = mlx4_uninit_context,
+};
+
+static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version)
+{
+ char value[8];
+ struct mlx4_device *dev;
+ unsigned vendor, device;
+ int i;
+
+ if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
+ value, sizeof value) < 0)
+ return NULL;
+ vendor = strtol(value, NULL, 16);
+
+ if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
+ value, sizeof value) < 0)
+ return NULL;
+ device = strtol(value, NULL, 16);
+
+ for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
+ if (vendor == hca_table[i].vendor &&
+ device == hca_table[i].device)
+ goto found;
+
+ return NULL;
+
+found:
+ if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION ||
+ abi_version > MLX4_UVERBS_MAX_ABI_VERSION) {
+ fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
+ "(min supported %d, max supported %d)\n",
+ abi_version, uverbs_sys_path,
+ MLX4_UVERBS_MIN_ABI_VERSION,
+ MLX4_UVERBS_MAX_ABI_VERSION);
+ return NULL;
+ }
+
+ dev = calloc(1, sizeof *dev);
+ if (!dev) {
+ fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
+ uverbs_sys_path);
+ return NULL;
+ }
+
+ dev->page_size = sysconf(_SC_PAGESIZE);
+ dev->abi_version = abi_version;
+
+ dev->verbs_dev.ops = &mlx4_dev_ops;
+ dev->verbs_dev.sz = sizeof(*dev);
+ dev->verbs_dev.size_of_context =
+ sizeof(struct mlx4_context) - sizeof(struct ibv_context);
+
+ return &dev->verbs_dev;
+}
+
+static __attribute__((constructor)) void mlx4_register_driver(void)
+{
+ verbs_register_driver("mlx4", mlx4_driver_init);
+}
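
The constructor above registers the driver with libibverbs at load time; applications never call mlx4_driver_init() themselves. A sketch of the application-side sequence that ends up running mlx4_init_context() when the first matching device is opened (standard libibverbs calls only):

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	int num;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx;

	if (!list || num == 0) {
		fprintf(stderr, "no RDMA devices found\n");
		return 1;
	}

	/* Opening the device triggers the registered driver's init_context
	 * hook (mlx4_init_context() for a ConnectX/mlx4 device). */
	ctx = ibv_open_device(list[0]);
	if (!ctx) {
		fprintf(stderr, "failed to open %s\n",
			ibv_get_device_name(list[0]));
		ibv_free_device_list(list);
		return 1;
	}
	printf("opened %s\n", ibv_get_device_name(ctx->device));

	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}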
diff --git a/contrib/ofed/libmlx4/mlx4.h b/contrib/ofed/libmlx4/mlx4.h
new file mode 100644
index 000000000000..864ef9eccc60
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4.h
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <infiniband/endian.h>
+#include <stddef.h>
+
+#include <infiniband/driver.h>
+#include <infiniband/udma_barrier.h>
+#include <infiniband/verbs.h>
+
+#define MLX4_PORTS_NUM 2
+
+#define PFX "mlx4: "
+
+enum {
+ MLX4_STAT_RATE_OFFSET = 5
+};
+
+enum {
+ MLX4_QP_TABLE_BITS = 8,
+ MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS,
+ MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
+};
+
+#define MLX4_REMOTE_SRQN_FLAGS(wr) htobe32(wr->qp_type.xrc.remote_srqn << 8)
+
+enum {
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+struct mlx4_xsrq_table {
+ struct {
+ struct mlx4_srq **table;
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+
+ pthread_mutex_t mutex;
+ int num_xsrq;
+ int shift;
+ int mask;
+};
+
+enum {
+ MLX4_XRC_QPN_BIT = (1 << 23)
+};
+
+enum mlx4_db_type {
+ MLX4_DB_TYPE_CQ,
+ MLX4_DB_TYPE_RQ,
+ MLX4_NUM_DB_TYPE
+};
+
+enum {
+ MLX4_OPCODE_NOP = 0x00,
+ MLX4_OPCODE_SEND_INVAL = 0x01,
+ MLX4_OPCODE_RDMA_WRITE = 0x08,
+ MLX4_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MLX4_OPCODE_SEND = 0x0a,
+ MLX4_OPCODE_SEND_IMM = 0x0b,
+ MLX4_OPCODE_LSO = 0x0e,
+ MLX4_OPCODE_RDMA_READ = 0x10,
+ MLX4_OPCODE_ATOMIC_CS = 0x11,
+ MLX4_OPCODE_ATOMIC_FA = 0x12,
+ MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14,
+ MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15,
+ MLX4_OPCODE_BIND_MW = 0x18,
+ MLX4_OPCODE_FMR = 0x19,
+ MLX4_OPCODE_LOCAL_INVAL = 0x1b,
+ MLX4_OPCODE_CONFIG_CMD = 0x1f,
+
+ MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00,
+ MLX4_RECV_OPCODE_SEND = 0x01,
+ MLX4_RECV_OPCODE_SEND_IMM = 0x02,
+ MLX4_RECV_OPCODE_SEND_INVAL = 0x03,
+
+ MLX4_CQE_OPCODE_ERROR = 0x1e,
+ MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+struct mlx4_device {
+ struct verbs_device verbs_dev;
+ int page_size;
+ int abi_version;
+};
+
+struct mlx4_db_page;
+
+struct mlx4_context {
+ struct ibv_context ibv_ctx;
+
+ void *uar;
+ pthread_spinlock_t uar_lock;
+
+ void *bf_page;
+ int bf_buf_size;
+ int bf_offset;
+ pthread_spinlock_t bf_lock;
+
+ struct {
+ struct mlx4_qp **table;
+ int refcnt;
+ } qp_table[MLX4_QP_TABLE_SIZE];
+ pthread_mutex_t qp_table_mutex;
+ int num_qps;
+ int qp_table_shift;
+ int qp_table_mask;
+ int max_qp_wr;
+ int max_sge;
+
+ struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
+ pthread_mutex_t db_list_mutex;
+ int cqe_size;
+ struct mlx4_xsrq_table xsrq_table;
+ struct {
+ uint8_t valid;
+ uint8_t link_layer;
+ enum ibv_port_cap_flags caps;
+ } port_query_cache[MLX4_PORTS_NUM];
+ struct {
+ uint64_t offset;
+ uint8_t offset_valid;
+ } core_clock;
+ void *hca_core_clock;
+};
+
+struct mlx4_buf {
+ void *buf;
+ size_t length;
+};
+
+struct mlx4_pd {
+ struct ibv_pd ibv_pd;
+ uint32_t pdn;
+};
+
+enum {
+ MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0,
+ MLX4_CQ_FLAGS_EXTENDED = 1 << 1,
+ MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2,
+};
+
+struct mlx4_cq {
+ struct ibv_cq_ex ibv_cq;
+ struct mlx4_buf buf;
+ struct mlx4_buf resize_buf;
+ pthread_spinlock_t lock;
+ uint32_t cqn;
+ uint32_t cons_index;
+ uint32_t *set_ci_db;
+ uint32_t *arm_db;
+ int arm_sn;
+ int cqe_size;
+ struct mlx4_qp *cur_qp;
+ struct mlx4_cqe *cqe;
+ uint32_t flags;
+};
+
+struct mlx4_srq {
+ struct verbs_srq verbs_srq;
+ struct mlx4_buf buf;
+ pthread_spinlock_t lock;
+ uint64_t *wrid;
+ uint32_t srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+ int head;
+ int tail;
+ uint32_t *db;
+ uint16_t counter;
+ uint8_t ext_srq;
+};
+
+struct mlx4_wq {
+ uint64_t *wrid;
+ pthread_spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ unsigned head;
+ unsigned tail;
+ int max_gs;
+ int wqe_shift;
+ int offset;
+};
+
+struct mlx4_qp {
+ struct verbs_qp verbs_qp;
+ struct mlx4_buf buf;
+ int max_inline_data;
+ int buf_size;
+
+ uint32_t doorbell_qpn;
+ uint32_t sq_signal_bits;
+ int sq_spare_wqes;
+ struct mlx4_wq sq;
+
+ uint32_t *db;
+ struct mlx4_wq rq;
+
+ uint8_t link_layer;
+ uint32_t qp_cap_cache;
+};
+
+struct mlx4_av {
+ uint32_t port_pd;
+ uint8_t reserved1;
+ uint8_t g_slid;
+ uint16_t dlid;
+ uint8_t reserved2;
+ uint8_t gid_index;
+ uint8_t stat_rate;
+ uint8_t hop_limit;
+ uint32_t sl_tclass_flowlabel;
+ uint8_t dgid[16];
+};
+
+struct mlx4_ah {
+ struct ibv_ah ibv_ah;
+ struct mlx4_av av;
+ uint16_t vlan;
+ uint8_t mac[6];
+};
+
+enum {
+ MLX4_CSUM_SUPPORT_UD_OVER_IB = (1 << 0),
+ MLX4_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 1),
+ /* Only report rx checksum when the validation is valid */
+ MLX4_RX_CSUM_VALID = (1 << 16),
+};
+
+enum mlx4_cqe_status {
+ MLX4_CQE_STATUS_TCP_UDP_CSUM_OK = (1 << 2),
+ MLX4_CQE_STATUS_IPV4_PKT = (1 << 22),
+ MLX4_CQE_STATUS_IP_HDR_CSUM_OK = (1 << 28),
+ MLX4_CQE_STATUS_IPV4_CSUM_OK = MLX4_CQE_STATUS_IPV4_PKT |
+ MLX4_CQE_STATUS_IP_HDR_CSUM_OK |
+ MLX4_CQE_STATUS_TCP_UDP_CSUM_OK
+};
+
+struct mlx4_cqe {
+ uint32_t vlan_my_qpn;
+ uint32_t immed_rss_invalid;
+ uint32_t g_mlpath_rqpn;
+ union {
+ struct {
+ uint16_t sl_vid;
+ uint16_t rlid;
+ };
+ uint32_t ts_47_16;
+ };
+ uint32_t status;
+ uint32_t byte_cnt;
+ uint16_t wqe_index;
+ uint16_t checksum;
+ uint8_t reserved3;
+ uint8_t ts_15_8;
+ uint8_t ts_7_0;
+ uint8_t owner_sr_opcode;
+};
+
+static inline unsigned long align(unsigned long val, unsigned long align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
+int align_queue_size(int req);
+
+#define to_mxxx(xxx, type) \
+ ((struct mlx4_##type *) \
+ ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx)))
+
+static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev)
+{
+ /* ibv_device is first field of verbs_device
+ * see try_driver() in libibverbs.
+ */
+ return container_of(ibdev, struct mlx4_device, verbs_dev);
+}
+
+static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
+{
+ return to_mxxx(ctx, context);
+}
+
+static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd)
+{
+ return to_mxxx(pd, pd);
+}
+
+static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
+{
+ return to_mxxx(cq, cq);
+}
+
+static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq)
+{
+ return container_of(container_of(ibsrq, struct verbs_srq, srq),
+ struct mlx4_srq, verbs_srq);
+}
+
+static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
+{
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
+}
+
+static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
+{
+ return to_mxxx(ah, ah);
+}
+
+static inline void mlx4_update_cons_index(struct mlx4_cq *cq)
+{
+ *cq->set_ci_db = htobe32(cq->cons_index & 0xffffff);
+}
+
+int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
+void mlx4_free_buf(struct mlx4_buf *buf);
+
+uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
+void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db);
+
+int mlx4_query_device(struct ibv_context *context,
+ struct ibv_device_attr *attr);
+int mlx4_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr,
+ size_t attr_size);
+int mlx4_query_port(struct ibv_context *context, uint8_t port,
+ struct ibv_port_attr *attr);
+int mlx4_query_rt_values(struct ibv_context *context,
+ struct ibv_values_ex *values);
+struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context);
+int mlx4_free_pd(struct ibv_pd *pd);
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr);
+int mlx4_close_xrcd(struct ibv_xrcd *xrcd);
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
+ size_t length, int access);
+int mlx4_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd,
+ void *addr, size_t length, int access);
+int mlx4_dereg_mr(struct ibv_mr *mr);
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type);
+int mlx4_dealloc_mw(struct ibv_mw *mw);
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector);
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr);
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr);
+int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+ int entry_size);
+int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
+int mlx4_destroy_cq(struct ibv_cq *cq);
+int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
+void mlx4_cq_event(struct ibv_cq *cq);
+void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
+void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
+int mlx4_get_outstanding_cqes(struct mlx4_cq *cq);
+void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe);
+
+struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr);
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+int mlx4_modify_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr,
+ int mask);
+int mlx4_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr);
+int mlx4_destroy_srq(struct ibv_srq *srq);
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq);
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+ struct mlx4_srq *srq);
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+
+struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr);
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr);
+int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask,
+ struct ibv_qp_init_attr *init_attr);
+int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask);
+int mlx4_destroy_qp(struct ibv_qp *qp);
+void mlx4_init_qp_indices(struct mlx4_qp *qp);
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp);
+int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr);
+int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+ struct mlx4_qp *qp);
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type, struct mlx4_qp *qp);
+void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type);
+struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn);
+int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp);
+void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn);
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+int mlx4_destroy_ah(struct ibv_ah *ah);
+int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
+ struct mlx4_ah *ah);
+void mlx4_free_av(struct mlx4_ah *ah);
+
+#endif /* MLX4_H */
diff --git a/contrib/ofed/libmlx4/mmio.h b/contrib/ofed/libmlx4/mmio.h
new file mode 100644
index 000000000000..a1a296658fdb
--- /dev/null
+++ b/contrib/ofed/libmlx4/mmio.h
@@ -0,0 +1,116 @@
+/* Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+ */
+#ifndef MMIO_H
+#define MMIO_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifdef __s390x__
+
+static inline long mmio_writeb(const unsigned long mmio_addr,
+ const uint8_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writew(const unsigned long mmio_addr,
+ const uint16_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writel(const unsigned long mmio_addr,
+ const uint32_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writeq(const unsigned long mmio_addr,
+ const uint64_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_write(const unsigned long mmio_addr,
+ const void *val,
+ const size_t length)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, val, length);
+}
+
+static inline long mmio_readb(const unsigned long mmio_addr, uint8_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readw(const unsigned long mmio_addr, uint16_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readl(const unsigned long mmio_addr, uint32_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readq(const unsigned long mmio_addr, uint64_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_read(const unsigned long mmio_addr,
+ void *val,
+ const size_t length)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, length);
+}
+
+static inline void mlx4_bf_copy(unsigned long *dst,
+ unsigned long *src,
+ unsigned bytecnt)
+{
+ mmio_write((unsigned long)dst, src, bytecnt);
+}
+
+#else
+
+#define mmio_writeb(addr, value) \
+	(*((volatile uint8_t *)(addr)) = (value))
+#define mmio_writew(addr, value) \
+	(*((volatile uint16_t *)(addr)) = (value))
+#define mmio_writel(addr, value) \
+	(*((volatile uint32_t *)(addr)) = (value))
+#define mmio_writeq(addr, value) \
+	(*((volatile uint64_t *)(addr)) = (value))
+#define mmio_write(addr, value, length) \
+ memcpy(addr, value, length)
+
+#define mmio_readb(addr, value) \
+	((value) = *((volatile uint8_t *)(addr)))
+#define mmio_readw(addr, value) \
+	((value) = *((volatile uint16_t *)(addr)))
+#define mmio_readl(addr, value) \
+	((value) = *((volatile uint32_t *)(addr)))
+#define mmio_readq(addr, value) \
+	((value) = *((volatile uint64_t *)(addr)))
+#define mmio_read(addr, value, length) \
+ memcpy(value, addr, length)
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static inline void mlx4_bf_copy(unsigned long *dst,
+ unsigned long *src,
+ unsigned bytecnt)
+{
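+	/*
+	 * bytecnt is a multiple of 2 * sizeof(long): the caller passes a
+	 * length that has been aligned up to 64 bytes.
+	 */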
+ while (bytecnt > 0) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ bytecnt -= 2 * sizeof(long);
+ }
+}
+#endif
+
+#endif
diff --git a/contrib/ofed/libmlx4/qp.c b/contrib/ofed/libmlx4/qp.c
new file mode 100644
index 000000000000..577aab5287ab
--- /dev/null
+++ b/contrib/ofed/libmlx4/qp.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+#include "wqe.h"
+
+static const uint32_t mlx4_ib_opcode[] = {
+ [IBV_WR_SEND] = MLX4_OPCODE_SEND,
+ [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
+ [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE,
+ [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM,
+ [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ,
+ [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS,
+ [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
+ [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL,
+ [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW,
+ [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL,
+};
+
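+/*
+ * WQEs are laid out in qp->buf with a power-of-two stride of 1 << wqe_shift
+ * bytes per queue, so entry n starts at the queue offset plus n << wqe_shift.
+ */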
+static void *get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
+}
+
+static void *get_send_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
+}
+
+/*
+ * Stamp an SQ WQE so that it is invalid if prefetched, by marking the
+ * first four bytes of every 64-byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
+ */
+static void stamp_send_wqe(struct mlx4_qp *qp, int n)
+{
+ uint32_t *wqe = get_send_wqe(qp, n);
+ int i;
+ int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
+
+ for (i = 16; i < ds; i += 16)
+ wqe[i] = 0xffffffff;
+}
+
+void mlx4_init_qp_indices(struct mlx4_qp *qp)
+{
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+}
+
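+/*
+ * Give the hardware ownership of every send WQE (owner bit set) and stamp
+ * it, so that a prefetched WQE that has not yet been posted is seen as
+ * invalid.
+ */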
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+		ctrl->owner_opcode = htobe32(1U << 31);
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+ stamp_send_wqe(qp, i);
+ }
+}
+
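+/*
+ * Check for work queue overflow: first without a lock and, only if the
+ * queue looks full, again under the CQ lock, which serializes against the
+ * completion processing that advances the tail.
+ */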
+static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
+{
+ unsigned cur;
+
+ cur = wq->head - wq->tail;
+ if (cur + nreq < wq->max_post)
+ return 0;
+
+ pthread_spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ pthread_spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
+{
+ int acc = wr->bind_mw.bind_info.mw_access_flags;
+ bseg->flags1 = 0;
+ if (acc & IBV_ACCESS_REMOTE_ATOMIC)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
+ if (acc & IBV_ACCESS_REMOTE_WRITE)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
+ if (acc & IBV_ACCESS_REMOTE_READ)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);
+
+ bseg->flags2 = 0;
+ if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
+ bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
+ if (acc & IBV_ACCESS_ZERO_BASED)
+ bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);
+
+ bseg->new_rkey = htobe32(wr->bind_mw.rkey);
+ bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
+ bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
+ bseg->length = htobe64(wr->bind_mw.bind_info.length);
+}
+
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey)
+{
+ iseg->mem_key = htobe32(rkey);
+
+ iseg->reserved1 = 0;
+ iseg->reserved2 = 0;
+ iseg->reserved3[0] = 0;
+ iseg->reserved3[1] = 0;
+}
+
+static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ uint64_t remote_addr, uint32_t rkey)
+{
+ rseg->raddr = htobe64(remote_addr);
+ rseg->rkey = htobe32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
+{
+ if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = htobe64(wr->wr.atomic.swap);
+ aseg->compare = htobe64(wr->wr.atomic.compare_add);
+ } else {
+ aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ibv_send_wr *wr)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
+ dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
+ dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
+}
+
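+/*
+ * Plain scatter/gather entry write, used on the receive path where no
+ * ordering against a previously stamped byte_count is required (compare
+ * set_data_seg() below).
+ */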
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->byte_count = htobe32(sg->length);
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ udma_to_device_barrier();
+
+ if (likely(sg->length))
+ dseg->byte_count = htobe32(sg->length);
+ else
+ dseg->byte_count = htobe32(0x80000000);
+}
+
+int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx4_context *ctx;
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl = NULL;
+ int ind;
+ int nreq;
+ int inl = 0;
+ int ret = 0;
+ int size = 0;
+ int i;
+
+ pthread_spin_lock(&qp->sq.lock);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IBV_SEND_SIGNALED ?
+ htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IBV_SEND_SOLICITED ?
+ htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
+ qp->sq_signal_bits;
+
+ if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ctrl->imm = wr->imm_data;
+ else
+ ctrl->imm = 0;
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (ibqp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ switch (wr->opcode) {
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IBV_WR_RDMA_READ:
+ inl = 1;
+ /* fall through */
+ case IBV_WR_RDMA_WRITE:
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ if (!wr->num_sge)
+ inl = 1;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+ case IBV_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->imm_data);
+ wqe += sizeof
+ (struct mlx4_wqe_local_inval_seg);
+ size += sizeof
+ (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+ case IBV_WR_BIND_MW:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof
+ (struct mlx4_wqe_bind_seg);
+ size += sizeof
+ (struct mlx4_wqe_bind_seg) / 16;
+ break;
+ case IBV_WR_SEND_WITH_INV:
+ ctrl->imm = htobe32(wr->imm_data);
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case IBV_QPT_UD:
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ case IBV_QPT_RAW_PACKET:
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+
+ inl = 0;
+
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ addr = (void *) (uintptr_t) wr->sg_list[i].addr;
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (inl > qp->max_inline_data) {
+ inl = 0;
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ udma_to_device_barrier(); /* see comment below */
+ seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = sizeof *seg;
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (seg_len) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ udma_to_device_barrier();
+ seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
+ }
+
+ size += (inl + num_seg * sizeof * seg + 15) / 16;
+ } else {
+ struct mlx4_wqe_data_seg *seg = wqe;
+
+ for (i = wr->num_sge - 1; i >= 0 ; --i)
+ set_data_seg(seg + i, wr->sg_list + i);
+
+ size += wr->num_sge * (sizeof *seg / 16);
+ }
+
+ ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ udma_to_device_barrier();
+
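+		/*
+		 * The ownership bit toggles each time the producer index
+		 * wraps around the power-of-two send queue.
+		 */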
+ ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
+			(ind & qp->sq.wqe_cnt ? htobe32(1U << 31) : 0);
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (wr->next)
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+
+ ++ind;
+ }
+
+out:
+ ctx = to_mctx(ibqp->context);
+
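+	/*
+	 * If a single, small WQE was posted and any payload is inline, it can
+	 * be written straight through the BlueFlame page, avoiding the
+	 * regular doorbell write; otherwise ring the send doorbell once for
+	 * the whole batch.
+	 */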
+ if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
+ ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
+
+ ctrl->bf_qpn |= qp->doorbell_qpn;
+ ++qp->sq.head;
+ /*
+		 * Make sure that the descriptor is written to memory
+		 * before writing to the BlueFlame page.
+ */
+ mmio_wc_spinlock(&ctx->bf_lock);
+
+ mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
+ align(size * 16, 64));
+ /* Flush before toggling bf_offset to be latency oriented */
+ mmio_flush_writes();
+
+ ctx->bf_offset ^= ctx->bf_buf_size;
+
+ pthread_spin_unlock(&ctx->bf_lock);
+ } else if (nreq) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
+ qp->doorbell_qpn);
+ }
+
+ if (nreq)
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+ (qp->sq.wqe_cnt - 1));
+
+ pthread_spin_unlock(&qp->sq.lock);
+
+ return ret;
+}
+
+int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ int ret = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ pthread_spin_lock(&qp->rq.lock);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->num_sge > qp->rq.max_gs) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (nreq) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ *qp->db = htobe32(qp->rq.head & 0xffff);
+ }
+
+ pthread_spin_unlock(&qp->rq.lock);
+
+ return ret;
+}
+
+static int num_inline_segs(int data, enum ibv_qp_type type)
+{
+ /*
+ * Inline data segments are not allowed to cross 64 byte
+ * boundaries. For UD QPs, the data segments always start
+ * aligned to 64 bytes (16 byte control segment + 48 byte
+ * datagram segment); for other QPs, there will be a 16 byte
+ * control segment and possibly a 16 byte remote address
+ * segment, so in the worst case there will be only 32 bytes
+ * available for the first data segment.
+ */
+ if (type == IBV_QPT_UD)
+ data += (sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg)) %
+ MLX4_INLINE_ALIGN;
+ else
+ data += (sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg)) %
+ MLX4_INLINE_ALIGN;
+
+ return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
+ (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
+}
+
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+ struct mlx4_qp *qp)
+{
+ int size;
+ int max_sq_sge;
+
+ max_sq_sge = align(cap->max_inline_data +
+ num_inline_segs(cap->max_inline_data, type) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) /
+ sizeof (struct mlx4_wqe_data_seg);
+ if (max_sq_sge < cap->max_send_sge)
+ max_sq_sge = cap->max_send_sge;
+
+ size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
+ switch (type) {
+ case IBV_QPT_UD:
+ size += sizeof (struct mlx4_wqe_datagram_seg);
+ break;
+
+ case IBV_QPT_UC:
+ size += sizeof (struct mlx4_wqe_raddr_seg);
+ break;
+
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_RC:
+ size += sizeof (struct mlx4_wqe_raddr_seg);
+ /*
+ * An atomic op will require an atomic segment, a
+ * remote address segment and one scatter entry.
+ */
+ if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_data_seg)))
+ size = (sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_data_seg));
+ break;
+
+ default:
+ break;
+ }
+
+ /* Make sure that we have enough space for a bind request */
+ if (size < sizeof (struct mlx4_wqe_bind_seg))
+ size = sizeof (struct mlx4_wqe_bind_seg);
+
+ size += sizeof (struct mlx4_wqe_ctrl_seg);
+
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+ qp->sq.wqe_shift++)
+ ; /* nothing */
+}
+
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type, struct mlx4_qp *qp)
+{
+ qp->rq.max_gs = cap->max_recv_sge;
+
+ if (qp->sq.wqe_cnt) {
+ qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
+ if (!qp->sq.wrid)
+ return -1;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
+ if (!qp->rq.wrid) {
+ free(qp->sq.wrid);
+ return -1;
+ }
+ }
+
+ for (qp->rq.wqe_shift = 4;
+ 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
+ qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ if (qp->buf_size) {
+ if (mlx4_alloc_buf(&qp->buf,
+ align(qp->buf_size, to_mdev(context->device)->page_size),
+ to_mdev(context->device)->page_size)) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ return -1;
+ }
+
+ memset(qp->buf.buf, 0, qp->buf_size);
+ } else {
+ qp->buf.buf = NULL;
+ }
+
+ return 0;
+}
+
+void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type)
+{
+ int wqe_size;
+
+ wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
+ switch (type) {
+ case IBV_QPT_UD:
+ wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
+ break;
+
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_UC:
+ case IBV_QPT_RC:
+ wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
+ break;
+
+ default:
+ break;
+ }
+
+ qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg);
+ cap->max_send_sge = qp->sq.max_gs;
+ qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_wr = qp->sq.max_post;
+
+ /*
+ * Inline data segments can't cross a 64 byte boundary. So
+ * subtract off one segment header for each 64-byte chunk,
+ * taking into account the fact that wqe_size will be 32 mod
+ * 64 for non-UD QPs.
+ */
+ qp->max_inline_data = wqe_size -
+ sizeof (struct mlx4_wqe_inline_seg) *
+ (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
+ cap->max_inline_data = qp->max_inline_data;
+}
+
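+/*
+ * The QP table is two-level: the top-level index is derived from the QP
+ * number, and each second-level array is allocated on first use and
+ * reference counted.
+ */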
+struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (ctx->qp_table[tind].refcnt)
+ return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
+ else
+ return NULL;
+}
+
+int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (!ctx->qp_table[tind].refcnt) {
+ ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
+ sizeof (struct mlx4_qp *));
+ if (!ctx->qp_table[tind].table)
+ return -1;
+ }
+
+ ++ctx->qp_table[tind].refcnt;
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
+ return 0;
+}
+
+void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (!--ctx->qp_table[tind].refcnt)
+ free(ctx->qp_table[tind].table);
+ else
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
+}
diff --git a/contrib/ofed/libmlx4/srq.c b/contrib/ofed/libmlx4/srq.c
new file mode 100644
index 000000000000..b8d25bb343da
--- /dev/null
+++ b/contrib/ofed/libmlx4/srq.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+#include "wqe.h"
+#include "mlx4-abi.h"
+
+static void *get_wqe(struct mlx4_srq *srq, int n)
+{
+ return srq->buf.buf + (n << srq->wqe_shift);
+}
+
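+/*
+ * Free SRQ WQEs are chained through next_wqe_index; returning a WQE links
+ * it in after the current tail of the free list.
+ */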
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ pthread_spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = htobe16(ind);
+ srq->tail = ind;
+
+ pthread_spin_unlock(&srq->lock);
+}
+
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct mlx4_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ int err = 0;
+ int nreq;
+ int i;
+
+ pthread_spin_lock(&srq->lock);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wr->num_sge > srq->max_gs) {
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (srq->head == srq->tail) {
+			/* SRQ is full */
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16toh(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = htobe32(wr->sg_list[i].length);
+ scat[i].lkey = htobe32(wr->sg_list[i].lkey);
+ scat[i].addr = htobe64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (nreq) {
+ srq->counter += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * we write the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ *srq->db = htobe32(srq->counter);
+ }
+
+ pthread_spin_unlock(&srq->lock);
+
+ return err;
+}
+
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+ struct mlx4_srq *srq)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scatter;
+ int size;
+ int buf_size;
+ int i;
+
+ srq->wrid = malloc(srq->max * sizeof (uint64_t));
+ if (!srq->wrid)
+ return -1;
+
+ size = sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->max_gs * sizeof (struct mlx4_wqe_data_seg);
+
+ for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift)
+ ; /* nothing */
+
+ buf_size = srq->max << srq->wqe_shift;
+
+ if (mlx4_alloc_buf(&srq->buf, buf_size,
+ to_mdev(pd->context->device)->page_size)) {
+ free(srq->wrid);
+ return -1;
+ }
+
+ memset(srq->buf.buf, 0, buf_size);
+
+ /*
+ * Now initialize the SRQ buffer so that all of the WQEs are
+ * linked into the list of free WQEs.
+ */
+
+ for (i = 0; i < srq->max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index = htobe16((i + 1) & (srq->max - 1));
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + (1 << srq->wqe_shift);
+ ++scatter)
+ scatter->lkey = htobe32(MLX4_INVALID_LKEY);
+ }
+
+ srq->head = 0;
+ srq->tail = srq->max - 1;
+
+ return 0;
+}
+
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size)
+{
+ memset(xsrq_table, 0, sizeof *xsrq_table);
+ xsrq_table->num_xsrq = size;
+ xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS;
+ xsrq_table->mask = (1 << xsrq_table->shift) - 1;
+
+ pthread_mutex_init(&xsrq_table->mutex, NULL);
+}
+
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+ int index;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ if (xsrq_table->xsrq_table[index].refcnt)
+ return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask];
+
+ return NULL;
+}
+
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq)
+{
+ int index, ret = 0;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+ if (!xsrq_table->xsrq_table[index].refcnt) {
+ xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1,
+ sizeof(struct mlx4_srq *));
+ if (!xsrq_table->xsrq_table[index].table) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ xsrq_table->xsrq_table[index].refcnt++;
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq;
+
+out:
+ pthread_mutex_unlock(&xsrq_table->mutex);
+ return ret;
+}
+
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+ int index;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+
+ if (--xsrq_table->xsrq_table[index].refcnt)
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL;
+ else
+ free(xsrq_table->xsrq_table[index].table);
+
+ pthread_mutex_unlock(&xsrq_table->mutex);
+}
+
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ struct mlx4_create_xsrq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
+ return NULL;
+
+ srq = calloc(1, sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->max = align_queue_size(attr_ex->attr.max_wr + 1);
+ srq->max_gs = attr_ex->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 1;
+
+ if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq,
+ sizeof(srq->verbs_srq),
+ attr_ex,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
+ srq->verbs_srq.srq_num, srq);
+ if (ret)
+ goto err_destroy;
+
+ return &srq->verbs_srq.srq;
+
+err_destroy:
+ ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+err:
+ free(srq);
+ return NULL;
+}
+
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
+{
+ struct mlx4_context *mctx = to_mctx(srq->context);
+ struct mlx4_srq *msrq = to_msrq(srq);
+ struct mlx4_cq *mcq;
+ int ret;
+
+ mcq = to_mcq(msrq->verbs_srq.cq);
+ mlx4_cq_clean(mcq, 0, msrq);
+ pthread_spin_lock(&mcq->lock);
+ mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
+ pthread_spin_unlock(&mcq->lock);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret) {
+ pthread_spin_lock(&mcq->lock);
+ mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
+ pthread_spin_unlock(&mcq->lock);
+ return ret;
+ }
+
+ mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
+ mlx4_free_buf(&msrq->buf);
+ free(msrq->wrid);
+ free(msrq);
+
+ return 0;
+}
diff --git a/contrib/ofed/libmlx4/verbs.c b/contrib/ofed/libmlx4/verbs.c
new file mode 100644
index 000000000000..f6f43f9bef76
--- /dev/null
+++ b/contrib/ofed/libmlx4/verbs.c
@@ -0,0 +1,1255 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <infiniband/endian.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "mlx4.h"
+#include "mlx4-abi.h"
+#include "wqe.h"
+
+int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
+{
+ struct ibv_query_device cmd;
+ uint64_t raw_fw_ver;
+ unsigned major, minor, sub_minor;
+ int ret;
+
+ ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
+ if (ret)
+ return ret;
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
+
+ snprintf(attr->fw_ver, sizeof attr->fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
+int mlx4_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr,
+ size_t attr_size)
+{
+ struct mlx4_context *mctx = to_mctx(context);
+ struct mlx4_query_device_ex_resp resp = {};
+ struct mlx4_query_device_ex cmd = {};
+ uint64_t raw_fw_ver;
+ unsigned sub_minor;
+ unsigned major;
+ unsigned minor;
+ int err;
+
+ err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
+ &raw_fw_ver,
+ &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp.ibv_resp),
+ sizeof(resp));
+ if (err)
+ return err;
+
+ if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) {
+ mctx->core_clock.offset = resp.hca_core_clock_offset;
+ mctx->core_clock.offset_valid = 1;
+ }
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
+
+ snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
+#define READL(ptr) (*((uint32_t *)(ptr)))
+static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
+{
+ unsigned int clockhi, clocklo, clockhi1;
+ int i;
+ struct mlx4_context *ctx = to_mctx(context);
+
+ if (!ctx->hca_core_clock)
+ return -EOPNOTSUPP;
+
+ /* Handle wraparound */
+ for (i = 0; i < 2; i++) {
+ clockhi = be32toh(READL(ctx->hca_core_clock));
+ clocklo = be32toh(READL(ctx->hca_core_clock + 4));
+ clockhi1 = be32toh(READL(ctx->hca_core_clock));
+ if (clockhi == clockhi1)
+ break;
+ }
+
+ *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;
+
+ return 0;
+}
+
+int mlx4_query_rt_values(struct ibv_context *context,
+ struct ibv_values_ex *values)
+{
+ uint32_t comp_mask = 0;
+ int err = 0;
+
+ if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
+ uint64_t cycles;
+
+ err = mlx4_read_clock(context, &cycles);
+ if (!err) {
+ values->raw_clock.tv_sec = 0;
+ values->raw_clock.tv_nsec = cycles;
+ comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
+ }
+ }
+
+ values->comp_mask = comp_mask;
+
+ return err;
+}
+
+int mlx4_query_port(struct ibv_context *context, uint8_t port,
+ struct ibv_port_attr *attr)
+{
+ struct ibv_query_port cmd;
+ int err;
+
+ err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
+ if (!err && port <= MLX4_PORTS_NUM && port > 0) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (!mctx->port_query_cache[port - 1].valid) {
+ mctx->port_query_cache[port - 1].link_layer =
+ attr->link_layer;
+ mctx->port_query_cache[port - 1].caps =
+ attr->port_cap_flags;
+ mctx->port_query_cache[port - 1].valid = 1;
+ }
+ }
+
+ return err;
+}
+
+/* Only the fields in the port cache will be valid */
+static int query_port_cache(struct ibv_context *context, uint8_t port_num,
+ struct ibv_port_attr *port_attr)
+{
+ struct mlx4_context *mctx = to_mctx(context);
+ if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
+ return -EINVAL;
+ if (mctx->port_query_cache[port_num - 1].valid) {
+		port_attr->link_layer =
+			mctx->port_query_cache[port_num - 1].link_layer;
+		port_attr->port_cap_flags =
+			mctx->port_query_cache[port_num - 1].caps;
+ return 0;
+ }
+ return mlx4_query_port(context, port_num,
+ (struct ibv_port_attr *)port_attr);
+
+}
+
+struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
+{
+ struct ibv_alloc_pd cmd;
+ struct mlx4_alloc_pd_resp resp;
+ struct mlx4_pd *pd;
+
+ pd = malloc(sizeof *pd);
+ if (!pd)
+ return NULL;
+
+ if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp)) {
+ free(pd);
+ return NULL;
+ }
+
+ pd->pdn = resp.pdn;
+
+ return &pd->ibv_pd;
+}
+
+int mlx4_free_pd(struct ibv_pd *pd)
+{
+ int ret;
+
+ ret = ibv_cmd_dealloc_pd(pd);
+ if (ret)
+ return ret;
+
+ free(to_mpd(pd));
+ return 0;
+}
+
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr)
+{
+ struct ibv_open_xrcd cmd;
+ struct ibv_open_xrcd_resp resp;
+ struct verbs_xrcd *xrcd;
+ int ret;
+
+ xrcd = calloc(1, sizeof *xrcd);
+ if (!xrcd)
+ return NULL;
+
+ ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &xrcd->xrcd;
+
+err:
+ free(xrcd);
+ return NULL;
+}
+
+int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
+{
+ struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
+ int ret;
+
+ ret = ibv_cmd_close_xrcd(xrcd);
+ if (!ret)
+ free(xrcd);
+
+ return ret;
+}
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
+ int access)
+{
+ struct ibv_mr *mr;
+ struct ibv_reg_mr cmd;
+ struct ibv_reg_mr_resp resp;
+ int ret;
+
+ mr = malloc(sizeof *mr);
+ if (!mr)
+ return NULL;
+
+ ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
+ access, mr, &cmd, sizeof cmd,
+ &resp, sizeof resp);
+ if (ret) {
+ free(mr);
+ return NULL;
+ }
+
+ return mr;
+}
+
+int mlx4_rereg_mr(struct ibv_mr *mr,
+ int flags,
+ struct ibv_pd *pd, void *addr,
+ size_t length, int access)
+{
+ struct ibv_rereg_mr cmd;
+ struct ibv_rereg_mr_resp resp;
+
+ if (flags & IBV_REREG_MR_KEEP_VALID)
+ return ENOTSUP;
+
+ return ibv_cmd_rereg_mr(mr, flags, addr, length,
+ (uintptr_t)addr,
+ access, pd,
+ &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+}
+
+int mlx4_dereg_mr(struct ibv_mr *mr)
+{
+ int ret;
+
+ ret = ibv_cmd_dereg_mr(mr);
+ if (ret)
+ return ret;
+
+ free(mr);
+ return 0;
+}
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
+{
+ struct ibv_mw *mw;
+ struct ibv_alloc_mw cmd;
+ struct ibv_alloc_mw_resp resp;
+ int ret;
+
+ mw = calloc(1, sizeof(*mw));
+ if (!mw)
+ return NULL;
+
+ ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+
+ if (ret) {
+ free(mw);
+ return NULL;
+ }
+
+ return mw;
+}
+
+int mlx4_dealloc_mw(struct ibv_mw *mw)
+{
+ int ret;
+ struct ibv_dealloc_mw cmd;
+
+ ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ free(mw);
+ return 0;
+}
+
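+/*
+ * Memory window bind is implemented by posting an IBV_WR_BIND_MW work
+ * request on the QP's send queue; on success the MW is updated with the
+ * newly generated rkey.
+ */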
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind)
+{
+ struct ibv_send_wr *bad_wr = NULL;
+ struct ibv_send_wr wr = { };
+ int ret;
+
+ wr.opcode = IBV_WR_BIND_MW;
+ wr.next = NULL;
+
+ wr.wr_id = mw_bind->wr_id;
+ wr.send_flags = mw_bind->send_flags;
+
+ wr.bind_mw.mw = mw;
+ wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
+ wr.bind_mw.bind_info = mw_bind->bind_info;
+
+ ret = mlx4_post_send(qp, &wr, &bad_wr);
+
+ if (ret)
+ return ret;
+
+ /* updating the mw with the latest rkey. */
+ mw->rkey = wr.bind_mw.rkey;
+
+ return 0;
+}
+
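+/*
+ * Round a requested queue size up to the next power of two, so that
+ * (size - 1) can be used as an index mask.
+ */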
+int align_queue_size(int req)
+{
+ int nent;
+
+ for (nent = 1; nent < req; nent <<= 1)
+ ; /* nothing */
+
+ return nent;
+}
+
+enum {
+ CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS |
+ IBV_WC_EX_WITH_COMPLETION_TIMESTAMP
+};
+
+enum {
+ CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
+};
+
+enum {
+ CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
+};
+
+static int mlx4_cmd_create_cq(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ struct mlx4_cq *cq)
+{
+ struct mlx4_create_cq cmd = {};
+ struct mlx4_create_cq_resp resp = {};
+ int ret;
+
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+ ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel,
+ cq_attr->comp_vector,
+ ibv_cq_ex_to_cq(&cq->ibv_cq),
+ &cmd.ibv_cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
+ if (!ret)
+ cq->cqn = resp.cqn;
+
+ return ret;
+
+}
+
+static int mlx4_cmd_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ struct mlx4_cq *cq)
+{
+ struct mlx4_create_cq_ex cmd = {};
+ struct mlx4_create_cq_resp_ex resp = {};
+ int ret;
+
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+ ret = ibv_cmd_create_cq_ex(context, cq_attr,
+ &cq->ibv_cq, &cmd.ibv_cmd,
+ sizeof(cmd.ibv_cmd),
+ sizeof(cmd),
+ &resp.ibv_resp,
+ sizeof(resp.ibv_resp),
+ sizeof(resp));
+ if (!ret)
+ cq->cqn = resp.cqn;
+
+ return ret;
+}
+
+static struct ibv_cq_ex *create_cq(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ int cq_alloc_flags)
+{
+ struct mlx4_cq *cq;
+ int ret;
+ struct mlx4_context *mctx = to_mctx(context);
+
+ /* Sanity check CQ size before proceeding */
+ if (cq_attr->cqe > 0x3fffff) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+ cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS)
+ return NULL;
+
+ /* mlx4 devices don't support slid and sl in cqe when completion
+ * timestamp is enabled in the CQ
+ */
+ if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) &&
+ (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ cq = malloc(sizeof *cq);
+ if (!cq)
+ return NULL;
+
+ cq->cons_index = 0;
+
+ if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
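+	/*
+	 * Allocate one spare CQE, rounded up to a power of two; the extra
+	 * entry is subtracted again before the create command is issued.
+	 */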
+ cq_attr->cqe = align_queue_size(cq_attr->cqe + 1);
+
+ if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size))
+ goto err;
+
+ cq->cqe_size = mctx->cqe_size;
+ cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
+ if (!cq->set_ci_db)
+ goto err_buf;
+
+ cq->arm_db = cq->set_ci_db + 1;
+ *cq->arm_db = 0;
+ cq->arm_sn = 1;
+ *cq->set_ci_db = 0;
+ cq->flags = cq_alloc_flags;
+
+ if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+ cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
+ cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED;
+
+ --cq_attr->cqe;
+ if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+ ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq);
+ else
+ ret = mlx4_cmd_create_cq(context, cq_attr, cq);
+
+ if (ret)
+ goto err_db;
+
+ if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+ mlx4_cq_fill_pfns(cq, cq_attr);
+
+ return &cq->ibv_cq;
+
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
+
+err_buf:
+ mlx4_free_buf(&cq->buf);
+
+err:
+ free(cq);
+
+ return NULL;
+}
+
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector)
+{
+ struct ibv_cq_ex *cq;
+ struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
+ .comp_vector = comp_vector,
+ .wc_flags = IBV_WC_STANDARD_FLAGS};
+
+ cq = create_cq(context, &cq_attr, 0);
+ return cq ? ibv_cq_ex_to_cq(cq) : NULL;
+}
+
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr)
+{
+ /*
+ * Make local copy since some attributes might be adjusted
+	 * Make a local copy, since some attributes might be adjusted
+ */
+ struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
+ .channel = cq_attr->channel,
+ .comp_vector = cq_attr->comp_vector,
+ .wc_flags = cq_attr->wc_flags,
+ .comp_mask = cq_attr->comp_mask,
+ .flags = cq_attr->flags};
+
+ return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
+}
+
+int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_resize_cq cmd;
+ struct ibv_resize_cq_resp resp;
+ struct mlx4_buf buf;
+ int old_cqe, outst_cqe, ret;
+
+ /* Sanity check CQ size before proceeding */
+ if (cqe > 0x3fffff)
+ return EINVAL;
+
+ pthread_spin_lock(&cq->lock);
+
+ cqe = align_queue_size(cqe + 1);
+ if (cqe == ibcq->cqe + 1) {
+ ret = 0;
+ goto out;
+ }
+
+	/* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_get_outstanding_cqes(cq);
+ if (cqe < outst_cqe + 1) {
+ ret = EINVAL;
+ goto out;
+ }
+
+ ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size);
+ if (ret)
+ goto out;
+
+ old_cqe = ibcq->cqe;
+ cmd.buf_addr = (uintptr_t) buf.buf;
+
+ ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd,
+ &resp, sizeof resp);
+ if (ret) {
+ mlx4_free_buf(&buf);
+ goto out;
+ }
+
+ mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
+
+ mlx4_free_buf(&cq->buf);
+ cq->buf = buf;
+ mlx4_update_cons_index(cq);
+
+out:
+ pthread_spin_unlock(&cq->lock);
+ return ret;
+}
+
+int mlx4_destroy_cq(struct ibv_cq *cq)
+{
+ int ret;
+
+ ret = ibv_cmd_destroy_cq(cq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
+ mlx4_free_buf(&to_mcq(cq)->buf);
+ free(to_mcq(cq));
+
+ return 0;
+}
+
+struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr)
+{
+ struct mlx4_create_srq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
+ return NULL;
+
+ srq = malloc(sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->max = align_queue_size(attr->attr.max_wr + 1);
+ srq->max_gs = attr->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 0;
+
+ if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ return &srq->verbs_srq.srq;
+
+err_db:
+ mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
+
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+
+err:
+ free(srq);
+
+ return NULL;
+}
+
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
+ (attr_ex->srq_type == IBV_SRQT_BASIC))
+ return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+ else if (attr_ex->srq_type == IBV_SRQT_XRC)
+ return mlx4_create_xrc_srq(context, attr_ex);
+
+ return NULL;
+}
+
+int mlx4_modify_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr,
+ int attr_mask)
+{
+ struct ibv_modify_srq cmd;
+
+ return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
+}
+
+int mlx4_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr)
+{
+ struct ibv_query_srq cmd;
+
+ return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
+}
+
+int mlx4_destroy_srq(struct ibv_srq *srq)
+{
+ int ret;
+
+ if (to_msrq(srq)->ext_srq)
+ return mlx4_destroy_xrc_srq(srq);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
+ mlx4_free_buf(&to_msrq(srq)->buf);
+ free(to_msrq(srq)->wrid);
+ free(to_msrq(srq));
+
+ return 0;
+}
+
+static int mlx4_cmd_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr,
+ struct mlx4_create_qp *cmd,
+ struct mlx4_qp *qp)
+{
+ struct mlx4_create_qp_ex cmd_ex;
+ struct mlx4_create_qp_resp_ex resp;
+ int ret;
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
+ offsetof(typeof(cmd->ibv_cmd), is_srq) +
+ sizeof(cmd->ibv_cmd.is_srq) -
+ offsetof(typeof(cmd->ibv_cmd), user_handle));
+
+ memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
+ offsetof(typeof(*cmd), sq_no_prefetch) +
+ sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd));
+
+ ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
+ sizeof(cmd_ex), &resp.ibv_resp,
+ sizeof(resp.ibv_resp), sizeof(resp));
+ return ret;
+}
+
+enum {
+ MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_XRCD |
+ IBV_QP_INIT_ATTR_CREATE_FLAGS),
+};
+
+enum {
+ MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS),
+};
+
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ struct mlx4_create_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
+
+ /* Sanity check QP size before proceeding */
+ if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */
+ if (attr->cap.max_send_wr > ctx->max_qp_wr ||
+ attr->cap.max_recv_wr > ctx->max_qp_wr ||
+ attr->cap.max_send_sge > ctx->max_sge ||
+ attr->cap.max_recv_sge > ctx->max_sge)
+ return NULL;
+ } else {
+ if (attr->cap.max_send_wr > 65536 ||
+ attr->cap.max_recv_wr > 65536 ||
+ attr->cap.max_send_sge > 64 ||
+ attr->cap.max_recv_sge > 64)
+ return NULL;
+ }
+ if (attr->cap.max_inline_data > 1024)
+ return NULL;
+
+ if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK)
+ return NULL;
+
+ qp = calloc(1, sizeof *qp);
+ if (!qp)
+ return NULL;
+
+ if (attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
+ } else {
+ mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+ qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+ }
+
+ if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
+ attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
+ } else {
+ qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
+ if (attr->cap.max_recv_sge < 1)
+ attr->cap.max_recv_sge = 1;
+ if (attr->cap.max_recv_wr < 1)
+ attr->cap.max_recv_wr = 1;
+ }
+
+ if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp))
+ goto err;
+
+ mlx4_init_qp_indices(qp);
+
+ if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
+ pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
+ goto err_free;
+
+ if (attr->cap.max_recv_sge) {
+ qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!qp->db)
+ goto err_free;
+
+ *qp->db = 0;
+ cmd.db_addr = (uintptr_t) qp->db;
+ } else {
+ cmd.db_addr = 0;
+ }
+
+ cmd.buf_addr = (uintptr_t) qp->buf.buf;
+ cmd.log_sq_stride = qp->sq.wqe_shift;
+ for (cmd.log_sq_bb_count = 0;
+ qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
+ ++cmd.log_sq_bb_count)
+ ; /* nothing */
+ cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
+ memset(cmd.reserved, 0, sizeof cmd.reserved);
+ pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);
+
+ if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK)
+ ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp);
+ else
+ ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ &cmd.ibv_cmd, sizeof(cmd), &resp,
+ sizeof(resp));
+ if (ret)
+ goto err_rq_db;
+
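+ /*
+ * Register the QP in the context's qp_num -> mlx4_qp table so the
+ * CQ poll path can map completions back to this QP.
+ */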
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
+ ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
+ if (ret)
+ goto err_destroy;
+ }
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+
+ qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+ if (attr->qp_type != IBV_QPT_XRC_RECV)
+ mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
+
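+ /*
+ * doorbell_qpn is the QP number pre-shifted and byte-swapped for the
+ * send doorbell; sq_signal_bits caches the CQ_UPDATE flag so every
+ * send WQE generates a completion when sq_sig_all is set.
+ */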
+ qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8);
+ if (attr->sq_sig_all)
+ qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE);
+ else
+ qp->sq_signal_bits = 0;
+
+ return &qp->verbs_qp.qp;
+
+err_destroy:
+ ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
+
+err_rq_db:
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+ if (attr->cap.max_recv_sge)
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);
+
+err_free:
+ free(qp->sq.wrid);
+ if (qp->rq.wqe_cnt)
+ free(qp->rq.wrid);
+ mlx4_free_buf(&qp->buf);
+
+err:
+ free(qp);
+
+ return NULL;
+}
+
+struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
+{
+ struct ibv_qp_init_attr_ex attr_ex;
+ struct ibv_qp *qp;
+
+ memcpy(&attr_ex, attr, sizeof *attr);
+ attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
+ attr_ex.pd = pd;
+ qp = mlx4_create_qp_ex(pd->context, &attr_ex);
+ if (qp)
+ memcpy(attr, &attr_ex, sizeof *attr);
+ return qp;
+}
+
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
+{
+ struct ibv_open_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
+
+ qp = calloc(1, sizeof *qp);
+ if (!qp)
+ return NULL;
+
+ ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &qp->verbs_qp.qp;
+
+err:
+ free(qp);
+ return NULL;
+}
+
+int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
+ int attr_mask,
+ struct ibv_qp_init_attr *init_attr)
+{
+ struct ibv_query_qp cmd;
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int ret;
+
+ ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd);
+ if (ret)
+ return ret;
+
+ init_attr->cap.max_send_wr = qp->sq.max_post;
+ init_attr->cap.max_send_sge = qp->sq.max_gs;
+ init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ attr->cap = init_attr->cap;
+
+ return 0;
+}
+
+int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask)
+{
+ struct ibv_modify_qp cmd = {};
+ struct ibv_port_attr port_attr;
+ struct mlx4_qp *mqp = to_mqp(qp);
+ struct ibv_device_attr device_attr;
+ int ret;
+
+ memset(&device_attr, 0, sizeof(device_attr));
+ if (attr_mask & IBV_QP_PORT) {
+ ret = ibv_query_port(qp->context, attr->port_num,
+ &port_attr);
+ if (ret)
+ return ret;
+ mqp->link_layer = port_attr.link_layer;
+
+ ret = ibv_query_device(qp->context, &device_attr);
+ if (ret)
+ return ret;
+
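+ /*
+ * Cache checksum offload support for this QP type and link layer
+ * so the send and poll paths can test qp_cap_cache cheaply.
+ */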
+ switch (qp->qp_type) {
+ case IBV_QPT_UD:
+ if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) &&
+ (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM))
+ mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB |
+ MLX4_RX_CSUM_VALID;
+ break;
+ case IBV_QPT_RAW_PACKET:
+ if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) &&
+ (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM))
+ mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH |
+ MLX4_RX_CSUM_VALID;
+ break;
+ default:
+ break;
+ }
+ }
+
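+ /* Leaving RESET: re-initialize the SQ WQE ownership bits. */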
+ if (qp->state == IBV_QPS_RESET &&
+ attr_mask & IBV_QP_STATE &&
+ attr->qp_state == IBV_QPS_INIT) {
+ mlx4_qp_init_sq_ownership(to_mqp(qp));
+ }
+
+ ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd);
+
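+ /*
+ * A transition back to RESET discards all outstanding work: purge
+ * this QP's CQEs from both CQs and reset the software indices and
+ * the RQ doorbell record.
+ */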
+ if (!ret &&
+ (attr_mask & IBV_QP_STATE) &&
+ attr->qp_state == IBV_QPS_RESET) {
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
+
+ mlx4_init_qp_indices(to_mqp(qp));
+ if (to_mqp(qp)->rq.wqe_cnt)
+ *to_mqp(qp)->db = 0;
+ }
+
+ return ret;
+}
+
+static void mlx4_lock_cqs(struct ibv_qp *qp)
+{
+ struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+ struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
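+ /*
+ * Take both CQ locks in a fixed order (lower cqn first) so that
+ * concurrent QP teardown on the same CQ pair cannot deadlock.
+ */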
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_lock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_lock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ pthread_spin_lock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ pthread_spin_lock(&send_cq->lock);
+ pthread_spin_lock(&recv_cq->lock);
+ } else {
+ pthread_spin_lock(&recv_cq->lock);
+ pthread_spin_lock(&send_cq->lock);
+ }
+}
+
+static void mlx4_unlock_cqs(struct ibv_qp *qp)
+{
+ struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+ struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_unlock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_unlock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ pthread_spin_unlock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ pthread_spin_unlock(&recv_cq->lock);
+ pthread_spin_unlock(&send_cq->lock);
+ } else {
+ pthread_spin_unlock(&send_cq->lock);
+ pthread_spin_unlock(&recv_cq->lock);
+ }
+}
+
+int mlx4_destroy_qp(struct ibv_qp *ibqp)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int ret;
+
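+ /*
+ * Hold qp_table_mutex across the destroy so qp table updates and
+ * CQE cleanup stay consistent with concurrent QP creation.
+ */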
+ pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex);
+ ret = ibv_cmd_destroy_qp(ibqp);
+ if (ret) {
+ pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+ return ret;
+ }
+
+ mlx4_lock_cqs(ibqp);
+
+ if (ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
+
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+ mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+
+ mlx4_unlock_cqs(ibqp);
+ pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+
+ if (qp->rq.wqe_cnt) {
+ mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
+ free(qp->rq.wrid);
+ }
+ if (qp->sq.wqe_cnt)
+ free(qp->sq.wrid);
+ mlx4_free_buf(&qp->buf);
+ free(qp);
+
+ return 0;
+}
+
+static int link_local_gid(const union ibv_gid *gid)
+{
+ uint32_t *tmp = (uint32_t *)gid->raw;
+ uint32_t hi = tmp[0];
+ uint32_t lo = tmp[1];
+
+ if (hi == htobe32(0xfe800000) && lo == 0)
+ return 1;
+
+ return 0;
+}
+
+static int is_multicast_gid(const union ibv_gid *gid)
+{
+ return gid->raw[0] == 0xff;
+}
+
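+/*
+ * Return the VLAN id encoded in bytes 11-12 of the GID; values of
+ * 0x1000 and above mean no VLAN and are reported as 0xffff.
+ */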
+static uint16_t get_vlan_id(union ibv_gid *gid)
+{
+ uint16_t vid;
+ vid = gid->raw[11] << 8 | gid->raw[12];
+ return vid < 0x1000 ? vid : 0xffff;
+}
+
+static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah,
+ struct ibv_ah_attr *attr)
+{
+ int err, i;
+ uint16_t vid;
+ union ibv_gid sgid;
+
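+ /*
+ * A link-local GID embeds a modified EUI-64: rebuild the MAC from
+ * GID bytes 8-10 and 13-15 and flip the universal/local bit, with
+ * bytes 11-12 optionally carrying a VLAN id.  A multicast GID maps
+ * to the 33:33 Ethernet multicast prefix plus its low 32 bits.
+ */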
+ if (link_local_gid(&attr->grh.dgid)) {
+ memcpy(ah->mac, &attr->grh.dgid.raw[8], 3);
+ memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3);
+ ah->mac[0] ^= 2;
+
+ vid = get_vlan_id(&attr->grh.dgid);
+ } else if (is_multicast_gid(&attr->grh.dgid)) {
+ ah->mac[0] = 0x33;
+ ah->mac[1] = 0x33;
+ for (i = 2; i < 6; ++i)
+ ah->mac[i] = attr->grh.dgid.raw[i + 10];
+
+ err = ibv_query_gid(pd->context, attr->port_num,
+ attr->grh.sgid_index, &sgid);
+ if (err)
+ return err;
+
+ ah->av.dlid = htobe16(0xc000);
+ ah->av.port_pd |= htobe32(1 << 31);
+
+ vid = get_vlan_id(&sgid);
+ } else
+ return 1;
+
+ if (vid != 0xffff) {
+ ah->av.port_pd |= htobe32(1 << 29);
+ ah->vlan = vid | ((attr->sl & 7) << 13);
+ }
+
+ return 0;
+}
+
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+{
+ struct mlx4_ah *ah;
+ struct ibv_port_attr port_attr;
+
+ if (query_port_cache(pd->context, attr->port_num, &port_attr))
+ return NULL;
+
+ ah = malloc(sizeof *ah);
+ if (!ah)
+ return NULL;
+
+ memset(&ah->av, 0, sizeof ah->av);
+
+ ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24));
+
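+ /*
+ * IB ports carry a 4-bit SL at bit 28 of sl_tclass_flowlabel;
+ * Ethernet (RoCE) ports carry a 3-bit SL at bit 29.
+ */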
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ ah->av.g_slid = attr->src_path_bits;
+ ah->av.dlid = htobe16(attr->dlid);
+ ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28);
+ } else
+ ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29);
+
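+ /*
+ * Convert the verbs static rate encoding to the mlx4 hardware
+ * encoding by adding MLX4_STAT_RATE_OFFSET.
+ */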
+ if (attr->static_rate) {
+ ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ /* XXX check rate cap? */
+ }
+ if (attr->is_global) {
+ ah->av.g_slid |= 0x80;
+ ah->av.gid_index = attr->grh.sgid_index;
+ ah->av.hop_limit = attr->grh.hop_limit;
+ ah->av.sl_tclass_flowlabel |=
+ htobe32((attr->grh.traffic_class << 20) |
+ attr->grh.flow_label);
+ memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
+ }
+
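+ /*
+ * On RoCE ports, IP-based GIDs are resolved to a destination MAC
+ * and VLAN by libibverbs; otherwise the L2 address is derived
+ * directly from the GID (link-local or multicast).
+ */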
+ if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+ if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) {
+ uint16_t vid;
+
+ if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
+ ah->mac, &vid)) {
+ free(ah);
+ return NULL;
+ }
+
+ if (vid <= 0xfff) {
+ ah->av.port_pd |= htobe32(1 << 29);
+ ah->vlan = vid |
+ ((attr->sl & 7) << 13);
+ }
+
+ } else {
+ if (mlx4_resolve_grh_to_l2(pd, ah, attr)) {
+ free(ah);
+ return NULL;
+ }
+ }
+ }
+
+ return &ah->ibv_ah;
+}
+
+int mlx4_destroy_ah(struct ibv_ah *ah)
+{
+ free(to_mah(ah));
+
+ return 0;
+}
diff --git a/contrib/ofed/libmlx4/wqe.h b/contrib/ofed/libmlx4/wqe.h
new file mode 100644
index 000000000000..6f833d9bf76b
--- /dev/null
+++ b/contrib/ofed/libmlx4/wqe.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef WQE_H
+#define WQE_H
+
+#include <stdint.h>
+
+enum {
+ MLX4_SEND_DOORBELL = 0x14,
+};
+
+enum {
+ MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
+ MLX4_WQE_CTRL_IP_HDR_CSUM = 1 << 4,
+ MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+ MLX4_WQE_CTRL_FENCE = 1 << 6,
+ MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7
+};
+
+enum {
+ MLX4_WQE_BIND_TYPE_2 = (1<<31),
+ MLX4_WQE_BIND_ZERO_BASED = (1<<30),
+};
+
+enum {
+ MLX4_INLINE_SEG = 1 << 31,
+ MLX4_INLINE_ALIGN = 64,
+};
+
+enum {
+ MLX4_INVALID_LKEY = 0x100,
+};
+
+struct mlx4_wqe_ctrl_seg {
+ uint32_t owner_opcode;
+ union {
+ struct {
+ uint8_t reserved[3];
+ uint8_t fence_size;
+ };
+ uint32_t bf_qpn;
+ };
+ /*
+ * High 24 bits are SRC remote buffer; low 8 bits are flags:
+ * [7] SO (strong ordering)
+ * [5] TCP/UDP checksum
+ * [4] IP checksum
+ * [3:2] C (generate completion queue entry)
+ * [1] SE (solicited event)
+ * [0] FL (force loopback)
+ */
+ uint32_t srcrb_flags;
+ /*
+ * imm is immediate data for send/RDMA write w/ immediate;
+ * also invalidation key for send with invalidate; input
+ * modifier for WQEs on CCQs.
+ */
+ uint32_t imm;
+};
+
+struct mlx4_wqe_datagram_seg {
+ uint32_t av[8];
+ uint32_t dqpn;
+ uint32_t qkey;
+ uint16_t vlan;
+ uint8_t mac[6];
+};
+
+struct mlx4_wqe_data_seg {
+ uint32_t byte_count;
+ uint32_t lkey;
+ uint64_t addr;
+};
+
+struct mlx4_wqe_inline_seg {
+ uint32_t byte_count;
+};
+
+struct mlx4_wqe_srq_next_seg {
+ uint16_t reserved1;
+ uint16_t next_wqe_index;
+ uint32_t reserved2[3];
+};
+
+struct mlx4_wqe_local_inval_seg {
+ uint64_t reserved1;
+ uint32_t mem_key;
+ uint32_t reserved2;
+ uint64_t reserved3[2];
+};
+
+enum {
+ MLX4_WQE_MW_REMOTE_READ = 1 << 29,
+ MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+ MLX4_WQE_MW_ATOMIC = 1 << 31
+};
+
+struct mlx4_wqe_raddr_seg {
+ uint64_t raddr;
+ uint32_t rkey;
+ uint32_t reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+ uint64_t swap_add;
+ uint64_t compare;
+};
+
+struct mlx4_wqe_bind_seg {
+ uint32_t flags1;
+ uint32_t flags2;
+ uint32_t new_rkey;
+ uint32_t lkey;
+ uint64_t addr;
+ uint64_t length;
+};
+
+#endif /* WQE_H */