Diffstat (limited to 'contrib/ofed/libmlx4')
-rw-r--r--    contrib/ofed/libmlx4/buf.c          64
-rw-r--r--    contrib/ofed/libmlx4/config.h       13
-rw-r--r--    contrib/ofed/libmlx4/cq.c          819
-rw-r--r--    contrib/ofed/libmlx4/dbrec.c       151
-rw-r--r--    contrib/ofed/libmlx4/doorbell.h     70
-rw-r--r--    contrib/ofed/libmlx4/mlx4-abi.h    159
-rw-r--r--    contrib/ofed/libmlx4/mlx4.c        327
-rw-r--r--    contrib/ofed/libmlx4/mlx4.h        458
-rw-r--r--    contrib/ofed/libmlx4/mmio.h        116
-rw-r--r--    contrib/ofed/libmlx4/qp.c          776
-rw-r--r--    contrib/ofed/libmlx4/srq.c         325
-rw-r--r--    contrib/ofed/libmlx4/verbs.c      1255
-rw-r--r--    contrib/ofed/libmlx4/wqe.h         149
13 files changed, 4682 insertions(+), 0 deletions(-)
diff --git a/contrib/ofed/libmlx4/buf.c b/contrib/ofed/libmlx4/buf.c
new file mode 100644
index 000000000000..9b41e7f62525
--- /dev/null
+++ b/contrib/ofed/libmlx4/buf.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "mlx4.h"
+
+int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
+{
+ int ret;
+
+ buf->length = align(size, page_size);
+ buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (buf->buf == MAP_FAILED)
+ return errno;
+
+ ret = ibv_dontfork_range(buf->buf, size);
+ if (ret)
+ munmap(buf->buf, buf->length);
+
+ return ret;
+}
+
+void mlx4_free_buf(struct mlx4_buf *buf)
+{
+ if (buf->length) {
+ ibv_dofork_range(buf->buf, buf->length);
+ munmap(buf->buf, buf->length);
+ }
+}
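
For orientation: mlx4_alloc_buf() above is a page-aligned anonymous mmap() followed by fork protection via ibv_dontfork_range(). Below is a minimal standalone sketch of the same round-up-and-map logic, leaving out the fork-protection call; round_up() and the requested size are illustrative, not part of the library.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Round val up to the next multiple of align (a power of two), mirroring
 * the align() helper that mlx4_alloc_buf() uses on the requested size. */
static size_t round_up(size_t val, size_t align)
{
	return (val + align - 1) & ~(align - 1);
}

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t want = 1000;			/* requested size in bytes */
	size_t len = round_up(want, page);	/* what mlx4_alloc_buf would map */
	void *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("requested %zu bytes, mapped %zu bytes at %p\n", want, len, buf);
	munmap(buf, len);
	return 0;
}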
diff --git a/contrib/ofed/libmlx4/config.h b/contrib/ofed/libmlx4/config.h
new file mode 100644
index 000000000000..af75292ef03e
--- /dev/null
+++ b/contrib/ofed/libmlx4/config.h
@@ -0,0 +1,13 @@
+/* $FreeBSD$ */
+
+#ifdef __LP64__
+#define SIZEOF_LONG 8
+#else
+#define SIZEOF_LONG 4
+#endif
+
+#define VALGRIND_MAKE_MEM_DEFINED(...) 0
+#define SWITCH_FALLTHROUGH (void)0
+#define ALWAYS_INLINE __attribute__ ((__always_inline__))
+#define likely(x) __predict_true(x)
+#define unlikely(x) __predict_false(x)
diff --git a/contrib/ofed/libmlx4/cq.c b/contrib/ofed/libmlx4/cq.c
new file mode 100644
index 000000000000..aa2ec1e9636a
--- /dev/null
+++ b/contrib/ofed/libmlx4/cq.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include <infiniband/opcode.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+
+enum {
+ MLX4_CQ_DOORBELL = 0x20
+};
+
+enum {
+ CQ_OK = 0,
+ CQ_EMPTY = -1,
+ CQ_POLL_ERR = -2
+};
+
+#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MLX4_CQ_DB_REQ_NOT (2 << 24)
+
+enum {
+ MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
+ MLX4_CQE_OWNER_MASK = 0x80,
+ MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_OPCODE_MASK = 0x1f
+};
+
+enum {
+ MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04,
+ MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05,
+ MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06,
+ MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10,
+ MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14,
+ MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15,
+ MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+};
+
+struct mlx4_err_cqe {
+ uint32_t vlan_my_qpn;
+ uint32_t reserved1[5];
+ uint16_t wqe_index;
+ uint8_t vendor_err;
+ uint8_t syndrome;
+ uint8_t reserved2[3];
+ uint8_t owner_sr_opcode;
+};
+
+static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry)
+{
+ return cq->buf.buf + entry * cq->cqe_size;
+}
+
+static void *get_sw_cqe(struct mlx4_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe);
+ struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe;
+
+ return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq)
+{
+ return get_sw_cqe(cq, cq->cons_index);
+}
+
+static enum ibv_wc_status mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
+ printf(PFX "local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32toh(cqe->vlan_my_qpn), be16toh(cqe->wqe_index),
+ cqe->vendor_err,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ return IBV_WC_LOC_LEN_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ return IBV_WC_LOC_QP_OP_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ return IBV_WC_LOC_PROT_ERR;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ return IBV_WC_WR_FLUSH_ERR;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ return IBV_WC_MW_BIND_ERR;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ return IBV_WC_BAD_RESP_ERR;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ return IBV_WC_LOC_ACCESS_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ return IBV_WC_REM_INV_REQ_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ return IBV_WC_REM_ACCESS_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ return IBV_WC_REM_OP_ERR;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ return IBV_WC_RETRY_EXC_ERR;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ return IBV_WC_RNR_RETRY_EXC_ERR;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ return IBV_WC_REM_ABORT_ERR;
+ default:
+ return IBV_WC_GENERAL_ERR;
+ }
+}
+
+static inline void handle_good_req(struct ibv_wc *wc, struct mlx4_cqe *cqe)
+{
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IBV_WC_WITH_IMM;
+ SWITCH_FALLTHROUGH;
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IBV_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IBV_WC_WITH_IMM;
+ SWITCH_FALLTHROUGH;
+ case MLX4_OPCODE_SEND:
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->opcode = IBV_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IBV_WC_RDMA_READ;
+ wc->byte_len = be32toh(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IBV_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IBV_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ wc->opcode = IBV_WC_LOCAL_INV;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc->opcode = IBV_WC_BIND_MW;
+ break;
+ default:
+ /* assume it's a send completion */
+ wc->opcode = IBV_WC_SEND;
+ break;
+ }
+}
+
+static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe **pcqe)
+ ALWAYS_INLINE;
+static inline int mlx4_get_next_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe **pcqe)
+{
+ struct mlx4_cqe *cqe;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return CQ_EMPTY;
+
+ if (cq->cqe_size == 64)
+ ++cqe;
+
+ ++cq->cons_index;
+
+ VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ udma_from_device_barrier();
+
+ *pcqe = cqe;
+
+ return CQ_OK;
+}
+
+static inline int mlx4_parse_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc, int lazy)
+ ALWAYS_INLINE;
+static inline int mlx4_parse_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc, int lazy)
+{
+ struct mlx4_wq *wq;
+ struct mlx4_srq *srq;
+ uint32_t qpn;
+ uint32_t g_mlpath_rqpn;
+ uint64_t *pwr_id;
+ uint16_t wqe_index;
+ struct mlx4_err_cqe *ecqe;
+ struct mlx4_context *mctx;
+ int is_error;
+ int is_send;
+ enum ibv_wc_status *pstatus;
+
+ mctx = to_mctx(cq->ibv_cq.context);
+ qpn = be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ if (lazy) {
+ cq->cqe = cqe;
+ cq->flags &= (~MLX4_CQ_FLAGS_RX_CSUM_VALID);
+ } else
+ wc->qp_num = qpn;
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
+ /*
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
+ * from the table.
+ */
+ srq = mlx4_find_xsrq(&mctx->xsrq_table,
+ be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
+ if (!srq)
+ return CQ_POLL_ERR;
+ } else {
+ if (!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num)) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mlx4_find_qp(mctx, qpn);
+ if (!*cur_qp)
+ return CQ_POLL_ERR;
+ }
+ srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL;
+ }
+
+ pwr_id = lazy ? &cq->ibv_cq.wr_id : &wc->wr_id;
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_index = be16toh(cqe->wqe_index);
+ wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
+ *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if (srq) {
+ wqe_index = be16toh(cqe->wqe_index);
+ *pwr_id = srq->wrid[wqe_index];
+ mlx4_free_srq_wqe(srq, wqe_index);
+ } else {
+ wq = &(*cur_qp)->rq;
+ *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+
+ pstatus = lazy ? &cq->ibv_cq.status : &wc->status;
+ if (is_error) {
+ ecqe = (struct mlx4_err_cqe *)cqe;
+ *pstatus = mlx4_handle_error_cqe(ecqe);
+ if (!lazy)
+ wc->vendor_err = ecqe->vendor_err;
+ return CQ_OK;
+ }
+
+ *pstatus = IBV_WC_SUCCESS;
+ if (lazy) {
+ if (!is_send)
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID))
+ cq->flags |= MLX4_CQ_FLAGS_RX_CSUM_VALID;
+ } else if (is_send) {
+ handle_good_req(wc, cqe);
+ } else {
+ wc->byte_len = be32toh(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags |= IBV_WC_WITH_INV;
+ wc->imm_data = be32toh(cqe->immed_rss_invalid);
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IBV_WC_RECV;
+ wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ wc->slid = be16toh(cqe->rlid);
+ g_mlpath_rqpn = be32toh(cqe->g_mlpath_rqpn);
+ wc->src_qp = g_mlpath_rqpn & 0xffffff;
+ wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
+ wc->pkey_index = be32toh(cqe->immed_rss_invalid) & 0x7f;
+ /* When working with XRC SRQs we have no QP from which to check the
+ * link layer. Use the IB SL for now; RoCE handling is TBD.
+ */
+ if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
+ wc->sl = be16toh(cqe->sl_vid) >> 13;
+ else
+ wc->sl = be16toh(cqe->sl_vid) >> 12;
+
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) {
+ wc->wc_flags |= ((cqe->status & htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) ==
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) <<
+ IBV_WC_IP_CSUM_OK_SHIFT;
+ }
+ }
+
+ return CQ_OK;
+}
+
+static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe)
+ ALWAYS_INLINE;
+static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq,
+ struct mlx4_cqe *cqe)
+{
+ return mlx4_parse_cqe(cq, cqe, &cq->cur_qp, NULL, 1);
+}
+
+static inline int mlx4_poll_one(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc)
+ ALWAYS_INLINE;
+static inline int mlx4_poll_one(struct mlx4_cq *cq,
+ struct mlx4_qp **cur_qp,
+ struct ibv_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ int err;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY)
+ return err;
+
+ return mlx4_parse_cqe(cq, cqe, cur_qp, wc, 0);
+}
+
+int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_qp *qp = NULL;
+ int npolled;
+ int err = CQ_OK;
+
+ pthread_spin_lock(&cq->lock);
+
+ for (npolled = 0; npolled < ne; ++npolled) {
+ err = mlx4_poll_one(cq, &qp, wc + npolled);
+ if (err != CQ_OK)
+ break;
+ }
+
+ if (npolled || err == CQ_POLL_ERR)
+ mlx4_update_cons_index(cq);
+
+ pthread_spin_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? err : npolled;
+}
+
+static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock)
+ ALWAYS_INLINE;
+static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ mlx4_update_cons_index(cq);
+
+ if (lock)
+ pthread_spin_unlock(&cq->lock);
+}
+
+static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr,
+ int lock)
+ ALWAYS_INLINE;
+static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr,
+ int lock)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_cqe *cqe;
+ int err;
+
+ if (unlikely(attr->comp_mask))
+ return EINVAL;
+
+ if (lock)
+ pthread_spin_lock(&cq->lock);
+
+ cq->cur_qp = NULL;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY) {
+ if (lock)
+ pthread_spin_unlock(&cq->lock);
+ return ENOENT;
+ }
+
+ err = mlx4_parse_lazy_cqe(cq, cqe);
+ if (lock && err)
+ pthread_spin_unlock(&cq->lock);
+
+ return err;
+}
+
+static int mlx4_next_poll(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_cqe *cqe;
+ int err;
+
+ err = mlx4_get_next_cqe(cq, &cqe);
+ if (err == CQ_EMPTY)
+ return ENOENT;
+
+ return mlx4_parse_lazy_cqe(cq, cqe);
+}
+
+static void mlx4_end_poll(struct ibv_cq_ex *ibcq)
+{
+ _mlx4_end_poll(ibcq, 0);
+}
+
+static void mlx4_end_poll_lock(struct ibv_cq_ex *ibcq)
+{
+ _mlx4_end_poll(ibcq, 1);
+}
+
+static int mlx4_start_poll(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr)
+{
+ return _mlx4_start_poll(ibcq, attr, 0);
+}
+
+static int mlx4_start_poll_lock(struct ibv_cq_ex *ibcq,
+ struct ibv_poll_cq_attr *attr)
+{
+ return _mlx4_start_poll(ibcq, attr, 1);
+}
+
+static enum ibv_wc_opcode mlx4_cq_read_wc_opcode(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ if (cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK) {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_OPCODE_RDMA_WRITE:
+ return IBV_WC_RDMA_WRITE;
+ case MLX4_OPCODE_SEND_INVAL:
+ case MLX4_OPCODE_SEND_IMM:
+ case MLX4_OPCODE_SEND:
+ return IBV_WC_SEND;
+ case MLX4_OPCODE_RDMA_READ:
+ return IBV_WC_RDMA_READ;
+ case MLX4_OPCODE_ATOMIC_CS:
+ return IBV_WC_COMP_SWAP;
+ case MLX4_OPCODE_ATOMIC_FA:
+ return IBV_WC_FETCH_ADD;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ return IBV_WC_LOCAL_INV;
+ case MLX4_OPCODE_BIND_MW:
+ return IBV_WC_BIND_MW;
+ }
+ } else {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ return IBV_WC_RECV_RDMA_WITH_IMM;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ case MLX4_RECV_OPCODE_SEND:
+ return IBV_WC_RECV;
+ }
+ }
+
+ return 0;
+}
+
+static uint32_t mlx4_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+}
+
+static int mlx4_cq_read_wc_flags(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ int is_send = cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ int wc_flags = 0;
+
+ if (is_send) {
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_OPCODE_SEND_IMM:
+ wc_flags |= IBV_WC_WITH_IMM;
+ break;
+ }
+ } else {
+ if (cq->flags & MLX4_CQ_FLAGS_RX_CSUM_VALID)
+ wc_flags |= ((cq->cqe->status &
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) ==
+ htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) <<
+ IBV_WC_IP_CSUM_OK_SHIFT;
+
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc_flags |= IBV_WC_WITH_IMM;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc_flags |= IBV_WC_WITH_INV;
+ break;
+ }
+ wc_flags |= (be32toh(cq->cqe->g_mlpath_rqpn) & 0x80000000) ? IBV_WC_GRH : 0;
+ }
+
+ return wc_flags;
+}
+
+static uint32_t mlx4_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->byte_cnt);
+}
+
+static uint32_t mlx4_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+ struct mlx4_err_cqe *ecqe = (struct mlx4_err_cqe *)cq->cqe;
+
+ return ecqe->vendor_err;
+}
+
+static uint32_t mlx4_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ return be32toh(cq->cqe->immed_rss_invalid);
+ default:
+ return cq->cqe->immed_rss_invalid;
+ }
+}
+
+static uint32_t mlx4_cq_read_wc_slid(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return (uint32_t)be16toh(cq->cqe->rlid);
+}
+
+static uint8_t mlx4_cq_read_wc_sl(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ if ((cq->cur_qp) && (cq->cur_qp->link_layer == IBV_LINK_LAYER_ETHERNET))
+ return be16toh(cq->cqe->sl_vid) >> 13;
+ else
+ return be16toh(cq->cqe->sl_vid) >> 12;
+}
+
+static uint32_t mlx4_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return be32toh(cq->cqe->g_mlpath_rqpn) & 0xffffff;
+}
+
+static uint8_t mlx4_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return (be32toh(cq->cqe->g_mlpath_rqpn) >> 24) & 0x7f;
+}
+
+static uint64_t mlx4_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq)
+{
+ struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+ return ((uint64_t)be32toh(cq->cqe->ts_47_16) << 16) |
+ (cq->cqe->ts_15_8 << 8) |
+ (cq->cqe->ts_7_0);
+}
+
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr)
+{
+
+ if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) {
+ cq->ibv_cq.start_poll = mlx4_start_poll;
+ cq->ibv_cq.end_poll = mlx4_end_poll;
+ } else {
+ cq->ibv_cq.start_poll = mlx4_start_poll_lock;
+ cq->ibv_cq.end_poll = mlx4_end_poll_lock;
+ }
+ cq->ibv_cq.next_poll = mlx4_next_poll;
+
+ cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode;
+ cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err;
+ cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+ cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM)
+ cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM)
+ cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP)
+ cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID)
+ cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL)
+ cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+ cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits;
+ if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)
+ cq->ibv_cq.read_completion_ts = mlx4_cq_read_wc_completion_ts;
+}
+
+int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
+{
+ struct mlx4_cq *cq = to_mcq(ibvcq);
+ uint32_t doorbell[2];
+ uint32_t sn;
+ uint32_t ci;
+ uint32_t cmd;
+
+ sn = cq->arm_sn & 3;
+ ci = cq->cons_index & 0xffffff;
+ cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT;
+
+ *cq->arm_db = htobe32(sn << 28 | cmd | ci);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ udma_to_device_barrier();
+
+ doorbell[0] = htobe32(sn << 28 | cmd | cq->cqn);
+ doorbell[1] = htobe32(ci);
+
+ mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL);
+
+ return 0;
+}
+
+void mlx4_cq_event(struct ibv_cq *cq)
+{
+ to_mcq(cq)->arm_sn++;
+}
+
+void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
+{
+ struct mlx4_cqe *cqe, *dest;
+ uint32_t prod_index;
+ uint8_t owner_bit;
+ int nfreed = 0;
+ int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
+ cqe += cqe_inc;
+ if (srq && srq->ext_srq &&
+ (be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num &&
+ !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
+ mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index));
+ ++nfreed;
+ } else if ((be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe);
+ dest += cqe_inc;
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ udma_to_device_barrier();
+ mlx4_update_cons_index(cq);
+ }
+}
+
+void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
+{
+ pthread_spin_lock(&cq->lock);
+ __mlx4_cq_clean(cq, qpn, srq);
+ pthread_spin_unlock(&cq->lock);
+}
+
+int mlx4_get_outstanding_cqes(struct mlx4_cq *cq)
+{
+ uint32_t i;
+
+ for (i = cq->cons_index; get_sw_cqe(cq, i); ++i)
+ ;
+
+ return i - cq->cons_index;
+}
+
+void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe)
+{
+ struct mlx4_cqe *cqe;
+ int i;
+ int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
+
+ i = cq->cons_index;
+ cqe = get_cqe(cq, (i & old_cqe));
+ cqe += cqe_inc;
+
+ while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+ cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+ (((i + 1) & (cq->ibv_cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+ memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size,
+ cqe - cqe_inc, cq->cqe_size);
+ ++i;
+ cqe = get_cqe(cq, (i & old_cqe));
+ cqe += cqe_inc;
+ }
+
+ ++cq->cons_index;
+}
+
+int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+ int entry_size)
+{
+ if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size),
+ dev->page_size))
+ return -1;
+ memset(buf->buf, 0, nent * entry_size);
+
+ return 0;
+}
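
From the application side, the poll path above is reached through the standard libibverbs ibv_poll_cq() wrapper, which dispatches to mlx4_poll_cq() via the context ops installed in mlx4.c. A sketch of a consumer-side drain loop, assuming a CQ created elsewhere; drain_cq() and the batch size of 16 are illustrative.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Drain completions from cq in batches; returns the number of successful
 * completions seen, or -1 on a poll error. */
static int drain_cq(struct ibv_cq *cq)
{
	struct ibv_wc wc[16];
	int total = 0;
	int n;

	do {
		n = ibv_poll_cq(cq, 16, wc);	/* ends up in mlx4_poll_cq() */
		if (n < 0)
			return -1;
		for (int i = 0; i < n; ++i) {
			if (wc[i].status != IBV_WC_SUCCESS) {
				fprintf(stderr, "wr_id %llu failed: %s\n",
					(unsigned long long)wc[i].wr_id,
					ibv_wc_status_str(wc[i].status));
				continue;
			}
			++total;
		}
	} while (n > 0);

	return total;
}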
diff --git a/contrib/ofed/libmlx4/dbrec.c b/contrib/ofed/libmlx4/dbrec.c
new file mode 100644
index 000000000000..3e875738fa61
--- /dev/null
+++ b/contrib/ofed/libmlx4/dbrec.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define _GNU_SOURCE
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+
+struct mlx4_db_page {
+ struct mlx4_db_page *prev, *next;
+ struct mlx4_buf buf;
+ int num_db;
+ int use_cnt;
+ unsigned long free[0];
+};
+
+static const int db_size[] = {
+ [MLX4_DB_TYPE_CQ] = 8,
+ [MLX4_DB_TYPE_RQ] = 4,
+};
+
+static struct mlx4_db_page *__add_page(struct mlx4_context *context,
+ enum mlx4_db_type type)
+{
+ struct mlx4_db_page *page;
+ int ps = to_mdev(context->ibv_ctx.device)->page_size;
+ int pp;
+ int i;
+
+ pp = ps / db_size[type];
+
+ page = malloc(sizeof *page + pp / 8);
+ if (!page)
+ return NULL;
+
+ if (mlx4_alloc_buf(&page->buf, ps, ps)) {
+ free(page);
+ return NULL;
+ }
+
+ page->num_db = pp;
+ page->use_cnt = 0;
+ for (i = 0; i < pp / (sizeof (long) * 8); ++i)
+ page->free[i] = ~0;
+
+ page->prev = NULL;
+ page->next = context->db_list[type];
+ context->db_list[type] = page;
+ if (page->next)
+ page->next->prev = page;
+
+ return page;
+}
+
+uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type)
+{
+ struct mlx4_db_page *page;
+ uint32_t *db = NULL;
+ int i, j;
+
+ pthread_mutex_lock(&context->db_list_mutex);
+
+ for (page = context->db_list[type]; page; page = page->next)
+ if (page->use_cnt < page->num_db)
+ goto found;
+
+ page = __add_page(context, type);
+ if (!page)
+ goto out;
+
+found:
+ ++page->use_cnt;
+
+ for (i = 0; !page->free[i]; ++i)
+ /* nothing */;
+
+ j = ffsl(page->free[i]);
+ page->free[i] &= ~(1UL << (j - 1));
+ db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type];
+
+out:
+ pthread_mutex_unlock(&context->db_list_mutex);
+
+ return db;
+}
+
+void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db)
+{
+ struct mlx4_db_page *page;
+ uintptr_t ps = to_mdev(context->ibv_ctx.device)->page_size;
+ int i;
+
+ pthread_mutex_lock(&context->db_list_mutex);
+
+ for (page = context->db_list[type]; page; page = page->next)
+ if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf)
+ break;
+
+ if (!page)
+ goto out;
+
+ i = ((void *) db - page->buf.buf) / db_size[type];
+ page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long)));
+
+ if (!--page->use_cnt) {
+ if (page->prev)
+ page->prev->next = page->next;
+ else
+ context->db_list[type] = page->next;
+ if (page->next)
+ page->next->prev = page->prev;
+
+ mlx4_free_buf(&page->buf);
+ free(page);
+ }
+
+out:
+ pthread_mutex_unlock(&context->db_list_mutex);
+}
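
The allocator above treats each doorbell page as a bitmap of fixed-size records: free[] holds one bit per record, ffsl() picks a free slot in mlx4_alloc_db(), and mlx4_free_db() recovers the slot index from the pointer offset within the page. A self-contained sketch of that index arithmetic, assuming the 8-byte CQ record size and a two-word bitmap; the names and the simplified word scan are illustrative.

#define _GNU_SOURCE		/* ffsl() lives in string.h on glibc; strings.h on FreeBSD */
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define DB_SIZE		8		/* bytes per doorbell record (CQ case) */
#define BITS_PER_LONG	(8 * sizeof(long))

int main(void)
{
	unsigned long free_map[2] = { ~0UL, ~0UL };	/* all slots free */
	int word, bit, slot;

	/* Allocation: find a word with a free bit, then the lowest set bit. */
	word = free_map[0] ? 0 : 1;
	bit = ffsl(free_map[word]) - 1;
	free_map[word] &= ~(1UL << bit);
	slot = word * (int)BITS_PER_LONG + bit;
	printf("allocated slot %d -> byte offset %d in the page\n",
	       slot, slot * DB_SIZE);

	/* Freeing: recover the slot from the byte offset and set its bit,
	 * exactly as mlx4_free_db() does. */
	int offset = slot * DB_SIZE;
	int i = offset / DB_SIZE;
	free_map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
	printf("freed slot %d\n", i);
	return 0;
}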
diff --git a/contrib/ofed/libmlx4/doorbell.h b/contrib/ofed/libmlx4/doorbell.h
new file mode 100644
index 000000000000..140a6158d7f2
--- /dev/null
+++ b/contrib/ofed/libmlx4/doorbell.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef DOORBELL_H
+#define DOORBELL_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include "mlx4.h"
+#include "mmio.h"
+
+struct mlx4_context;
+
+#if SIZEOF_LONG == 8
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0])
+#elif __BYTE_ORDER == __BIG_ENDIAN
+# define MLX4_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1])
+#else
+# error __BYTE_ORDER not defined
+#endif
+
+static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
+{
+ mmio_writeq((unsigned long)(ctx->uar + offset), MLX4_PAIR_TO_64(val));
+}
+
+#else
+
+static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
+{
+ pthread_spin_lock(&ctx->uar_lock);
+ mmio_writel((unsigned long)(ctx->uar + offset), val[0]);
+ mmio_writel((unsigned long)(ctx->uar + offset + 4), val[1]);
+ pthread_spin_unlock(&ctx->uar_lock);
+}
+
+#endif
+
+#endif /* DOORBELL_H */
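
The 64-bit path of mlx4_write64() packs the two big-endian words into a single MMIO store, while the 32-bit path issues two ordered 32-bit stores under a spinlock; either way the device sees val[0]'s bytes followed by val[1]'s. A host-only sketch demonstrating that equivalence for the little-endian MLX4_PAIR_TO_64 packing (it assumes a little-endian machine and touches no device memory):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* Two 32-bit big-endian words, as prepared by e.g. mlx4_arm_cq(). */
	uint32_t val[2] = { 0x11223344, 0x55667788 };

	/* Little-endian packing from doorbell.h: val[1] in the high half. */
	uint64_t packed = (uint64_t)val[1] << 32 | val[0];

	unsigned char as_one[8], as_two[8];
	memcpy(as_one, &packed, 8);	/* what a single 64-bit store emits */
	memcpy(as_two, &val[0], 4);	/* what two 32-bit stores emit ...  */
	memcpy(as_two + 4, &val[1], 4);	/* ... in order val[0], then val[1] */

	printf("byte streams %s\n",
	       memcmp(as_one, as_two, 8) == 0 ? "match" : "differ");
	return 0;
}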
diff --git a/contrib/ofed/libmlx4/mlx4-abi.h b/contrib/ofed/libmlx4/mlx4-abi.h
new file mode 100644
index 000000000000..7d89505606e2
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4-abi.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ABI_H
+#define MLX4_ABI_H
+
+#include <infiniband/kern-abi.h>
+
+#define MLX4_UVERBS_MIN_ABI_VERSION 2
+#define MLX4_UVERBS_MAX_ABI_VERSION 4
+
+#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3
+
+enum {
+ MLX4_USER_DEV_CAP_64B_CQE = 1L << 0
+};
+
+struct mlx4_alloc_ucontext_resp_v3 {
+ struct ibv_get_context_resp ibv_resp;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+enum mlx4_query_dev_ex_resp_mask {
+ MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
+};
+
+struct mlx4_alloc_ucontext_resp {
+ struct ibv_get_context_resp ibv_resp;
+ __u32 dev_caps;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+ __u32 cqe_size;
+};
+
+struct mlx4_alloc_pd_resp {
+ struct ibv_alloc_pd_resp ibv_resp;
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_create_cq {
+ struct ibv_create_cq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_cq_resp {
+ struct ibv_create_cq_resp ibv_resp;
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_create_cq_ex {
+ struct ibv_create_cq_ex ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_cq_resp_ex {
+ struct ibv_create_cq_resp_ex ibv_resp;
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_resize_cq {
+ struct ibv_resize_cq ibv_cmd;
+ __u64 buf_addr;
+};
+
+struct mlx4_query_device_ex_resp {
+ struct ibv_query_device_resp_ex ibv_resp;
+ __u32 comp_mask;
+ __u32 response_length;
+ __u64 hca_core_clock_offset;
+};
+
+struct mlx4_query_device_ex {
+ struct ibv_query_device_ex ibv_cmd;
+};
+
+struct mlx4_create_srq {
+ struct ibv_create_srq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_xsrq {
+ struct ibv_create_xsrq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_create_srq_resp {
+ struct ibv_create_srq_resp ibv_resp;
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_create_qp {
+ struct ibv_create_qp ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch; /* was reserved in ABI 2 */
+ __u8 reserved[5];
+};
+
+struct mlx4_create_qp_drv_ex {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch; /* was reserved in ABI 2 */
+ __u8 reserved[5];
+};
+
+struct mlx4_create_qp_ex {
+ struct ibv_create_qp_ex ibv_cmd;
+ struct mlx4_create_qp_drv_ex drv_ex;
+};
+
+struct mlx4_create_qp_resp_ex {
+ struct ibv_create_qp_resp_ex ibv_resp;
+};
+
+#endif /* MLX4_ABI_H */
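
All of these ABI structs follow one embedding pattern: the generic ibv_* command or response is the first member, so the uverbs core parses the common header while the mlx4 kernel driver reads the driver-private tail (buf_addr, db_addr, and so on). A sketch of that layout using simplified, hypothetical stand-in structs rather than the real kern-abi.h definitions:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins: the generic verbs header comes first, the
 * driver-private fields follow it in the same buffer. */
struct generic_create_cq {		/* plays the role of ibv_create_cq */
	uint64_t response;
	uint32_t cqe;
	uint32_t comp_vector;
};

struct drv_create_cq {			/* plays the role of mlx4_create_cq */
	struct generic_create_cq ibv_cmd;	/* must be the first member */
	uint64_t buf_addr;		/* driver-private: userspace CQ buffer */
	uint64_t db_addr;		/* driver-private: doorbell record */
};

int main(void)
{
	printf("generic header: %zu bytes at offset %zu\n",
	       sizeof(struct generic_create_cq),
	       offsetof(struct drv_create_cq, ibv_cmd));
	printf("driver payload starts at offset %zu of %zu total bytes\n",
	       offsetof(struct drv_create_cq, buf_addr),
	       sizeof(struct drv_create_cq));
	return 0;
}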
diff --git a/contrib/ofed/libmlx4/mlx4.c b/contrib/ofed/libmlx4/mlx4.c
new file mode 100644
index 000000000000..229c2670b5ed
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "mlx4-abi.h"
+
+#ifndef PCI_VENDOR_ID_MELLANOX
+#define PCI_VENDOR_ID_MELLANOX 0x15b3
+#endif
+
+#define HCA(v, d) \
+ { .vendor = PCI_VENDOR_ID_##v, \
+ .device = d }
+
+static struct {
+ unsigned vendor;
+ unsigned device;
+} hca_table[] = {
+ HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */
+ HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */
+ HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */
+ HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */
+ HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */
+ HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */
+ HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/
+ HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+ HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */
+ HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */
+ HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */
+ HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */
+ HCA(MELLANOX, 0x1005), /* MT27510 Family */
+ HCA(MELLANOX, 0x1006), /* MT27511 Family */
+ HCA(MELLANOX, 0x1007), /* MT27520 Family */
+ HCA(MELLANOX, 0x1008), /* MT27521 Family */
+ HCA(MELLANOX, 0x1009), /* MT27530 Family */
+ HCA(MELLANOX, 0x100a), /* MT27531 Family */
+ HCA(MELLANOX, 0x100b), /* MT27540 Family */
+ HCA(MELLANOX, 0x100c), /* MT27541 Family */
+ HCA(MELLANOX, 0x100d), /* MT27550 Family */
+ HCA(MELLANOX, 0x100e), /* MT27551 Family */
+ HCA(MELLANOX, 0x100f), /* MT27560 Family */
+ HCA(MELLANOX, 0x1010), /* MT27561 Family */
+};
+
+static struct ibv_context_ops mlx4_ctx_ops = {
+ .query_device = mlx4_query_device,
+ .query_port = mlx4_query_port,
+ .alloc_pd = mlx4_alloc_pd,
+ .dealloc_pd = mlx4_free_pd,
+ .reg_mr = mlx4_reg_mr,
+ .rereg_mr = mlx4_rereg_mr,
+ .dereg_mr = mlx4_dereg_mr,
+ .alloc_mw = mlx4_alloc_mw,
+ .dealloc_mw = mlx4_dealloc_mw,
+ .bind_mw = mlx4_bind_mw,
+ .create_cq = mlx4_create_cq,
+ .poll_cq = mlx4_poll_cq,
+ .req_notify_cq = mlx4_arm_cq,
+ .cq_event = mlx4_cq_event,
+ .resize_cq = mlx4_resize_cq,
+ .destroy_cq = mlx4_destroy_cq,
+ .create_srq = mlx4_create_srq,
+ .modify_srq = mlx4_modify_srq,
+ .query_srq = mlx4_query_srq,
+ .destroy_srq = mlx4_destroy_srq,
+ .post_srq_recv = mlx4_post_srq_recv,
+ .create_qp = mlx4_create_qp,
+ .query_qp = mlx4_query_qp,
+ .modify_qp = mlx4_modify_qp,
+ .destroy_qp = mlx4_destroy_qp,
+ .post_send = mlx4_post_send,
+ .post_recv = mlx4_post_recv,
+ .create_ah = mlx4_create_ah,
+ .destroy_ah = mlx4_destroy_ah,
+ .attach_mcast = ibv_cmd_attach_mcast,
+ .detach_mcast = ibv_cmd_detach_mcast
+};
+
+static int mlx4_map_internal_clock(struct mlx4_device *mdev,
+ struct ibv_context *ibv_ctx)
+{
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+ void *hca_clock_page;
+
+ hca_clock_page = mmap(NULL, mdev->page_size,
+ PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
+ mdev->page_size * 3);
+
+ if (hca_clock_page == MAP_FAILED) {
+ fprintf(stderr, PFX
+ "Warning: Timestamp available,\n"
+ "but failed to mmap() hca core clock page.\n");
+ return -1;
+ }
+
+ context->hca_core_clock = hca_clock_page +
+ (context->core_clock.offset & (mdev->page_size - 1));
+ return 0;
+}
+
+static int mlx4_init_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx, int cmd_fd)
+{
+ struct mlx4_context *context;
+ struct ibv_get_context cmd;
+ struct mlx4_alloc_ucontext_resp resp;
+ int i;
+ struct mlx4_alloc_ucontext_resp_v3 resp_v3;
+ __u16 bf_reg_size;
+ struct mlx4_device *dev = to_mdev(&v_device->device);
+ struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
+ struct ibv_device_attr_ex dev_attrs;
+
+ /* The memory footprints of mlx4_context and verbs_context overlap:
+ * both embed struct ibv_context.
+ */
+ context = to_mctx(ibv_ctx);
+ ibv_ctx->cmd_fd = cmd_fd;
+
+ if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+ if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd,
+ &resp_v3.ibv_resp, sizeof resp_v3))
+ return errno;
+
+ context->num_qps = resp_v3.qp_tab_size;
+ bf_reg_size = resp_v3.bf_reg_size;
+ context->cqe_size = sizeof (struct mlx4_cqe);
+ } else {
+ if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp))
+ return errno;
+
+ context->num_qps = resp.qp_tab_size;
+ bf_reg_size = resp.bf_reg_size;
+ if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE)
+ context->cqe_size = resp.cqe_size;
+ else
+ context->cqe_size = sizeof (struct mlx4_cqe);
+ }
+
+ context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS;
+ context->qp_table_mask = (1 << context->qp_table_shift) - 1;
+ for (i = 0; i < MLX4_PORTS_NUM; ++i)
+ context->port_query_cache[i].valid = 0;
+
+ pthread_mutex_init(&context->qp_table_mutex, NULL);
+ for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
+ context->qp_table[i].refcnt = 0;
+
+ for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
+ context->db_list[i] = NULL;
+
+ mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps);
+ pthread_mutex_init(&context->db_list_mutex, NULL);
+
+ context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
+ MAP_SHARED, cmd_fd, 0);
+ if (context->uar == MAP_FAILED)
+ return errno;
+
+ if (bf_reg_size) {
+ context->bf_page = mmap(NULL, dev->page_size,
+ PROT_WRITE, MAP_SHARED, cmd_fd,
+ dev->page_size);
+ if (context->bf_page == MAP_FAILED) {
+ fprintf(stderr, PFX "Warning: BlueFlame available, "
+ "but failed to mmap() BlueFlame page.\n");
+ context->bf_page = NULL;
+ context->bf_buf_size = 0;
+ } else {
+ context->bf_buf_size = bf_reg_size / 2;
+ context->bf_offset = 0;
+ pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
+ }
+ } else {
+ context->bf_page = NULL;
+ context->bf_buf_size = 0;
+ }
+
+ pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
+ ibv_ctx->ops = mlx4_ctx_ops;
+
+ context->hca_core_clock = NULL;
+ memset(&dev_attrs, 0, sizeof(dev_attrs));
+ if (!mlx4_query_device_ex(ibv_ctx, NULL, &dev_attrs,
+ sizeof(struct ibv_device_attr_ex))) {
+ context->max_qp_wr = dev_attrs.orig_attr.max_qp_wr;
+ context->max_sge = dev_attrs.orig_attr.max_sge;
+ if (context->core_clock.offset_valid)
+ mlx4_map_internal_clock(dev, ibv_ctx);
+ }
+
+ verbs_ctx->has_comp_mask = VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
+ VERBS_CONTEXT_QP;
+ verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
+ verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
+ verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
+ verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
+ verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
+ verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
+ verbs_set_ctx_op(verbs_ctx, ibv_create_flow, ibv_cmd_create_flow);
+ verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
+ verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex);
+ verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex);
+ verbs_set_ctx_op(verbs_ctx, query_rt_values, mlx4_query_rt_values);
+
+ return 0;
+
+}
+
+static void mlx4_uninit_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx)
+{
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+
+ munmap(context->uar, to_mdev(&v_device->device)->page_size);
+ if (context->bf_page)
+ munmap(context->bf_page, to_mdev(&v_device->device)->page_size);
+ if (context->hca_core_clock)
+ munmap(context->hca_core_clock - context->core_clock.offset,
+ to_mdev(&v_device->device)->page_size);
+}
+
+static struct verbs_device_ops mlx4_dev_ops = {
+ .init_context = mlx4_init_context,
+ .uninit_context = mlx4_uninit_context,
+};
+
+static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version)
+{
+ char value[8];
+ struct mlx4_device *dev;
+ unsigned vendor, device;
+ int i;
+
+ if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
+ value, sizeof value) < 0)
+ return NULL;
+ vendor = strtol(value, NULL, 16);
+
+ if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
+ value, sizeof value) < 0)
+ return NULL;
+ device = strtol(value, NULL, 16);
+
+ for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
+ if (vendor == hca_table[i].vendor &&
+ device == hca_table[i].device)
+ goto found;
+
+ return NULL;
+
+found:
+ if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION ||
+ abi_version > MLX4_UVERBS_MAX_ABI_VERSION) {
+ fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
+ "(min supported %d, max supported %d)\n",
+ abi_version, uverbs_sys_path,
+ MLX4_UVERBS_MIN_ABI_VERSION,
+ MLX4_UVERBS_MAX_ABI_VERSION);
+ return NULL;
+ }
+
+ dev = calloc(1, sizeof *dev);
+ if (!dev) {
+ fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
+ uverbs_sys_path);
+ return NULL;
+ }
+
+ dev->page_size = sysconf(_SC_PAGESIZE);
+ dev->abi_version = abi_version;
+
+ dev->verbs_dev.ops = &mlx4_dev_ops;
+ dev->verbs_dev.sz = sizeof(*dev);
+ dev->verbs_dev.size_of_context =
+ sizeof(struct mlx4_context) - sizeof(struct ibv_context);
+
+ return &dev->verbs_dev;
+}
+
+static __attribute__((constructor)) void mlx4_register_driver(void)
+{
+ verbs_register_driver("mlx4", mlx4_driver_init);
+}
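
The constructor above registers the driver with libibverbs at load time; applications never call mlx4_driver_init() themselves. A sketch of the application-side sequence that ends up running mlx4_init_context() when the first matching device is opened (standard libibverbs calls only):

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	int num;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx;

	if (!list || num == 0) {
		fprintf(stderr, "no RDMA devices found\n");
		return 1;
	}

	/* Opening the device triggers the registered driver's init_context
	 * hook (mlx4_init_context() for a ConnectX/mlx4 device). */
	ctx = ibv_open_device(list[0]);
	if (!ctx) {
		fprintf(stderr, "failed to open %s\n",
			ibv_get_device_name(list[0]));
		ibv_free_device_list(list);
		return 1;
	}
	printf("opened %s\n", ibv_get_device_name(ctx->device));

	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}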
diff --git a/contrib/ofed/libmlx4/mlx4.h b/contrib/ofed/libmlx4/mlx4.h
new file mode 100644
index 000000000000..864ef9eccc60
--- /dev/null
+++ b/contrib/ofed/libmlx4/mlx4.h
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <infiniband/endian.h>
+#include <stddef.h>
+
+#include <infiniband/driver.h>
+#include <infiniband/udma_barrier.h>
+#include <infiniband/verbs.h>
+
+#define MLX4_PORTS_NUM 2
+
+#define PFX "mlx4: "
+
+enum {
+ MLX4_STAT_RATE_OFFSET = 5
+};
+
+enum {
+ MLX4_QP_TABLE_BITS = 8,
+ MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS,
+ MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
+};
+
+#define MLX4_REMOTE_SRQN_FLAGS(wr) htobe32(wr->qp_type.xrc.remote_srqn << 8)
+
+enum {
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+struct mlx4_xsrq_table {
+ struct {
+ struct mlx4_srq **table;
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+
+ pthread_mutex_t mutex;
+ int num_xsrq;
+ int shift;
+ int mask;
+};
+
+enum {
+ MLX4_XRC_QPN_BIT = (1 << 23)
+};
+
+enum mlx4_db_type {
+ MLX4_DB_TYPE_CQ,
+ MLX4_DB_TYPE_RQ,
+ MLX4_NUM_DB_TYPE
+};
+
+enum {
+ MLX4_OPCODE_NOP = 0x00,
+ MLX4_OPCODE_SEND_INVAL = 0x01,
+ MLX4_OPCODE_RDMA_WRITE = 0x08,
+ MLX4_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MLX4_OPCODE_SEND = 0x0a,
+ MLX4_OPCODE_SEND_IMM = 0x0b,
+ MLX4_OPCODE_LSO = 0x0e,
+ MLX4_OPCODE_RDMA_READ = 0x10,
+ MLX4_OPCODE_ATOMIC_CS = 0x11,
+ MLX4_OPCODE_ATOMIC_FA = 0x12,
+ MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14,
+ MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15,
+ MLX4_OPCODE_BIND_MW = 0x18,
+ MLX4_OPCODE_FMR = 0x19,
+ MLX4_OPCODE_LOCAL_INVAL = 0x1b,
+ MLX4_OPCODE_CONFIG_CMD = 0x1f,
+
+ MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00,
+ MLX4_RECV_OPCODE_SEND = 0x01,
+ MLX4_RECV_OPCODE_SEND_IMM = 0x02,
+ MLX4_RECV_OPCODE_SEND_INVAL = 0x03,
+
+ MLX4_CQE_OPCODE_ERROR = 0x1e,
+ MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+struct mlx4_device {
+ struct verbs_device verbs_dev;
+ int page_size;
+ int abi_version;
+};
+
+struct mlx4_db_page;
+
+struct mlx4_context {
+ struct ibv_context ibv_ctx;
+
+ void *uar;
+ pthread_spinlock_t uar_lock;
+
+ void *bf_page;
+ int bf_buf_size;
+ int bf_offset;
+ pthread_spinlock_t bf_lock;
+
+ struct {
+ struct mlx4_qp **table;
+ int refcnt;
+ } qp_table[MLX4_QP_TABLE_SIZE];
+ pthread_mutex_t qp_table_mutex;
+ int num_qps;
+ int qp_table_shift;
+ int qp_table_mask;
+ int max_qp_wr;
+ int max_sge;
+
+ struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
+ pthread_mutex_t db_list_mutex;
+ int cqe_size;
+ struct mlx4_xsrq_table xsrq_table;
+ struct {
+ uint8_t valid;
+ uint8_t link_layer;
+ enum ibv_port_cap_flags caps;
+ } port_query_cache[MLX4_PORTS_NUM];
+ struct {
+ uint64_t offset;
+ uint8_t offset_valid;
+ } core_clock;
+ void *hca_core_clock;
+};
+
+struct mlx4_buf {
+ void *buf;
+ size_t length;
+};
+
+struct mlx4_pd {
+ struct ibv_pd ibv_pd;
+ uint32_t pdn;
+};
+
+enum {
+ MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0,
+ MLX4_CQ_FLAGS_EXTENDED = 1 << 1,
+ MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2,
+};
+
+struct mlx4_cq {
+ struct ibv_cq_ex ibv_cq;
+ struct mlx4_buf buf;
+ struct mlx4_buf resize_buf;
+ pthread_spinlock_t lock;
+ uint32_t cqn;
+ uint32_t cons_index;
+ uint32_t *set_ci_db;
+ uint32_t *arm_db;
+ int arm_sn;
+ int cqe_size;
+ struct mlx4_qp *cur_qp;
+ struct mlx4_cqe *cqe;
+ uint32_t flags;
+};
+
+struct mlx4_srq {
+ struct verbs_srq verbs_srq;
+ struct mlx4_buf buf;
+ pthread_spinlock_t lock;
+ uint64_t *wrid;
+ uint32_t srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+ int head;
+ int tail;
+ uint32_t *db;
+ uint16_t counter;
+ uint8_t ext_srq;
+};
+
+struct mlx4_wq {
+ uint64_t *wrid;
+ pthread_spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ unsigned head;
+ unsigned tail;
+ int max_gs;
+ int wqe_shift;
+ int offset;
+};
+
+struct mlx4_qp {
+ struct verbs_qp verbs_qp;
+ struct mlx4_buf buf;
+ int max_inline_data;
+ int buf_size;
+
+ uint32_t doorbell_qpn;
+ uint32_t sq_signal_bits;
+ int sq_spare_wqes;
+ struct mlx4_wq sq;
+
+ uint32_t *db;
+ struct mlx4_wq rq;
+
+ uint8_t link_layer;
+ uint32_t qp_cap_cache;
+};
+
+struct mlx4_av {
+ uint32_t port_pd;
+ uint8_t reserved1;
+ uint8_t g_slid;
+ uint16_t dlid;
+ uint8_t reserved2;
+ uint8_t gid_index;
+ uint8_t stat_rate;
+ uint8_t hop_limit;
+ uint32_t sl_tclass_flowlabel;
+ uint8_t dgid[16];
+};
+
+struct mlx4_ah {
+ struct ibv_ah ibv_ah;
+ struct mlx4_av av;
+ uint16_t vlan;
+ uint8_t mac[6];
+};
+
+enum {
+ MLX4_CSUM_SUPPORT_UD_OVER_IB = (1 << 0),
+ MLX4_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 1),
+ /* Only report rx checksum when the validation is valid */
+ MLX4_RX_CSUM_VALID = (1 << 16),
+};
+
+enum mlx4_cqe_status {
+ MLX4_CQE_STATUS_TCP_UDP_CSUM_OK = (1 << 2),
+ MLX4_CQE_STATUS_IPV4_PKT = (1 << 22),
+ MLX4_CQE_STATUS_IP_HDR_CSUM_OK = (1 << 28),
+ MLX4_CQE_STATUS_IPV4_CSUM_OK = MLX4_CQE_STATUS_IPV4_PKT |
+ MLX4_CQE_STATUS_IP_HDR_CSUM_OK |
+ MLX4_CQE_STATUS_TCP_UDP_CSUM_OK
+};
+
+struct mlx4_cqe {
+ uint32_t vlan_my_qpn;
+ uint32_t immed_rss_invalid;
+ uint32_t g_mlpath_rqpn;
+ union {
+ struct {
+ uint16_t sl_vid;
+ uint16_t rlid;
+ };
+ uint32_t ts_47_16;
+ };
+ uint32_t status;
+ uint32_t byte_cnt;
+ uint16_t wqe_index;
+ uint16_t checksum;
+ uint8_t reserved3;
+ uint8_t ts_15_8;
+ uint8_t ts_7_0;
+ uint8_t owner_sr_opcode;
+};
+
+static inline unsigned long align(unsigned long val, unsigned long align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
+int align_queue_size(int req);
+
+#define to_mxxx(xxx, type) \
+ ((struct mlx4_##type *) \
+ ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx)))
+
+static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev)
+{
+ /* ibv_device is first field of verbs_device
+ * see try_driver() in libibverbs.
+ */
+ return container_of(ibdev, struct mlx4_device, verbs_dev);
+}
+
+static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
+{
+ return to_mxxx(ctx, context);
+}
+
+static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd)
+{
+ return to_mxxx(pd, pd);
+}
+
+static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
+{
+ return to_mxxx(cq, cq);
+}
+
+static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq)
+{
+ return container_of(container_of(ibsrq, struct verbs_srq, srq),
+ struct mlx4_srq, verbs_srq);
+}
+
+static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
+{
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
+}
+
+static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
+{
+ return to_mxxx(ah, ah);
+}
+
+static inline void mlx4_update_cons_index(struct mlx4_cq *cq)
+{
+ *cq->set_ci_db = htobe32(cq->cons_index & 0xffffff);
+}
+
+int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
+void mlx4_free_buf(struct mlx4_buf *buf);
+
+uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
+void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db);
+
+int mlx4_query_device(struct ibv_context *context,
+ struct ibv_device_attr *attr);
+int mlx4_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr,
+ size_t attr_size);
+int mlx4_query_port(struct ibv_context *context, uint8_t port,
+ struct ibv_port_attr *attr);
+int mlx4_query_rt_values(struct ibv_context *context,
+ struct ibv_values_ex *values);
+struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context);
+int mlx4_free_pd(struct ibv_pd *pd);
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr);
+int mlx4_close_xrcd(struct ibv_xrcd *xrcd);
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
+ size_t length, int access);
+int mlx4_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd,
+ void *addr, size_t length, int access);
+int mlx4_dereg_mr(struct ibv_mr *mr);
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type);
+int mlx4_dealloc_mw(struct ibv_mw *mw);
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector);
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr);
+void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr);
+int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+ int entry_size);
+int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
+int mlx4_destroy_cq(struct ibv_cq *cq);
+int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
+void mlx4_cq_event(struct ibv_cq *cq);
+void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
+void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
+int mlx4_get_outstanding_cqes(struct mlx4_cq *cq);
+void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe);
+
+struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr);
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+int mlx4_modify_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr,
+ int mask);
+int mlx4_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr);
+int mlx4_destroy_srq(struct ibv_srq *srq);
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq);
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+ struct mlx4_srq *srq);
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+
+struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr);
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr);
+int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask,
+ struct ibv_qp_init_attr *init_attr);
+int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask);
+int mlx4_destroy_qp(struct ibv_qp *qp);
+void mlx4_init_qp_indices(struct mlx4_qp *qp);
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp);
+int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr);
+int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+ struct mlx4_qp *qp);
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type, struct mlx4_qp *qp);
+void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type);
+struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn);
+int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp);
+void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn);
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+int mlx4_destroy_ah(struct ibv_ah *ah);
+int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
+ struct mlx4_ah *ah);
+void mlx4_free_av(struct mlx4_ah *ah);
+
+#endif /* MLX4_H */
diff --git a/contrib/ofed/libmlx4/mmio.h b/contrib/ofed/libmlx4/mmio.h
new file mode 100644
index 000000000000..a1a296658fdb
--- /dev/null
+++ b/contrib/ofed/libmlx4/mmio.h
@@ -0,0 +1,116 @@
+/* Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+ */
+#ifndef MMIO_H
+#define MMIO_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifdef __s390x__
+
+static inline long mmio_writeb(const unsigned long mmio_addr,
+ const uint8_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writew(const unsigned long mmio_addr,
+ const uint16_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writel(const unsigned long mmio_addr,
+ const uint32_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_writeq(const unsigned long mmio_addr,
+ const uint64_t val)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val));
+}
+
+static inline long mmio_write(const unsigned long mmio_addr,
+ const void *val,
+ const size_t length)
+{
+ return syscall(__NR_s390_pci_mmio_write, mmio_addr, val, length);
+}
+
+static inline long mmio_readb(const unsigned long mmio_addr, uint8_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readw(const unsigned long mmio_addr, uint16_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readl(const unsigned long mmio_addr, uint32_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_readq(const unsigned long mmio_addr, uint64_t *val)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val));
+}
+
+static inline long mmio_read(const unsigned long mmio_addr,
+ void *val,
+ const size_t length)
+{
+ return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, length);
+}
+
+static inline void mlx4_bf_copy(unsigned long *dst,
+ unsigned long *src,
+ unsigned bytecnt)
+{
+ mmio_write((unsigned long)dst, src, bytecnt);
+}
+
+#else
+
+#define mmio_writeb(addr, value) \
+	(*((volatile uint8_t *)(addr)) = (value))
+#define mmio_writew(addr, value) \
+	(*((volatile uint16_t *)(addr)) = (value))
+#define mmio_writel(addr, value) \
+	(*((volatile uint32_t *)(addr)) = (value))
+#define mmio_writeq(addr, value) \
+	(*((volatile uint64_t *)(addr)) = (value))
+#define mmio_write(addr, value, length) \
+ memcpy(addr, value, length)
+
+#define mmio_readb(addr, value) \
+	((value) = *((volatile uint8_t *)(addr)))
+#define mmio_readw(addr, value) \
+	((value) = *((volatile uint16_t *)(addr)))
+#define mmio_readl(addr, value) \
+	((value) = *((volatile uint32_t *)(addr)))
+#define mmio_readq(addr, value) \
+	((value) = *((volatile uint64_t *)(addr)))
+#define mmio_read(addr, value, length) \
+ memcpy(value, addr, length)
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static inline void mlx4_bf_copy(unsigned long *dst,
+ unsigned long *src,
+ unsigned bytecnt)
+{
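+	/*
+	 * bytecnt is a multiple of 2 * sizeof(long): the caller passes a
+	 * length that has been aligned up to 64 bytes.
+	 */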
+ while (bytecnt > 0) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ bytecnt -= 2 * sizeof(long);
+ }
+}
+#endif
+
+#endif
diff --git a/contrib/ofed/libmlx4/qp.c b/contrib/ofed/libmlx4/qp.c
new file mode 100644
index 000000000000..577aab5287ab
--- /dev/null
+++ b/contrib/ofed/libmlx4/qp.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+#include "wqe.h"
+
+static const uint32_t mlx4_ib_opcode[] = {
+ [IBV_WR_SEND] = MLX4_OPCODE_SEND,
+ [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
+ [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE,
+ [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM,
+ [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ,
+ [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS,
+ [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
+ [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL,
+ [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW,
+ [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL,
+};
+
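+/*
+ * WQEs are laid out in qp->buf with a power-of-two stride of 1 << wqe_shift
+ * bytes per queue, so entry n starts at the queue offset plus n << wqe_shift.
+ */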
+static void *get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
+}
+
+static void *get_send_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
+}
+
+/*
+ * Stamp an SQ WQE so that it is invalid if prefetched, by marking the
+ * first four bytes of every 64-byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
+ */
+static void stamp_send_wqe(struct mlx4_qp *qp, int n)
+{
+ uint32_t *wqe = get_send_wqe(qp, n);
+ int i;
+ int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
+
+ for (i = 16; i < ds; i += 16)
+ wqe[i] = 0xffffffff;
+}
+
+void mlx4_init_qp_indices(struct mlx4_qp *qp)
+{
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+}
+
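+/*
+ * Give the hardware ownership of every send WQE (owner bit set) and stamp
+ * it, so that a prefetched WQE that has not yet been posted is seen as
+ * invalid.
+ */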
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+		ctrl->owner_opcode = htobe32(1U << 31);
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+ stamp_send_wqe(qp, i);
+ }
+}
+
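+/*
+ * Check for work queue overflow: first without a lock and, only if the
+ * queue looks full, again under the CQ lock, which serializes against the
+ * completion processing that advances the tail.
+ */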
+static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
+{
+ unsigned cur;
+
+ cur = wq->head - wq->tail;
+ if (cur + nreq < wq->max_post)
+ return 0;
+
+ pthread_spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ pthread_spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
+{
+ int acc = wr->bind_mw.bind_info.mw_access_flags;
+ bseg->flags1 = 0;
+ if (acc & IBV_ACCESS_REMOTE_ATOMIC)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
+ if (acc & IBV_ACCESS_REMOTE_WRITE)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
+ if (acc & IBV_ACCESS_REMOTE_READ)
+ bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);
+
+ bseg->flags2 = 0;
+ if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
+ bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
+ if (acc & IBV_ACCESS_ZERO_BASED)
+ bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);
+
+ bseg->new_rkey = htobe32(wr->bind_mw.rkey);
+ bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
+ bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
+ bseg->length = htobe64(wr->bind_mw.bind_info.length);
+}
+
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey)
+{
+ iseg->mem_key = htobe32(rkey);
+
+ iseg->reserved1 = 0;
+ iseg->reserved2 = 0;
+ iseg->reserved3[0] = 0;
+ iseg->reserved3[1] = 0;
+}
+
+static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ uint64_t remote_addr, uint32_t rkey)
+{
+ rseg->raddr = htobe64(remote_addr);
+ rseg->rkey = htobe32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
+{
+ if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = htobe64(wr->wr.atomic.swap);
+ aseg->compare = htobe64(wr->wr.atomic.compare_add);
+ } else {
+ aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ibv_send_wr *wr)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
+ dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
+ dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
+}
+
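+/*
+ * Plain scatter/gather entry write, used on the receive path where no
+ * ordering against a previously stamped byte_count is required (compare
+ * set_data_seg() below).
+ */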
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->byte_count = htobe32(sg->length);
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ udma_to_device_barrier();
+
+ if (likely(sg->length))
+ dseg->byte_count = htobe32(sg->length);
+ else
+ dseg->byte_count = htobe32(0x80000000);
+}
+
+int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx4_context *ctx;
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl = NULL;
+ int ind;
+ int nreq;
+ int inl = 0;
+ int ret = 0;
+ int size = 0;
+ int i;
+
+ pthread_spin_lock(&qp->sq.lock);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IBV_SEND_SIGNALED ?
+ htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IBV_SEND_SOLICITED ?
+ htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
+ qp->sq_signal_bits;
+
+ if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ctrl->imm = wr->imm_data;
+ else
+ ctrl->imm = 0;
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (ibqp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ switch (wr->opcode) {
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IBV_WR_RDMA_READ:
+ inl = 1;
+ /* fall through */
+ case IBV_WR_RDMA_WRITE:
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ if (!wr->num_sge)
+ inl = 1;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+ case IBV_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->imm_data);
+ wqe += sizeof
+ (struct mlx4_wqe_local_inval_seg);
+ size += sizeof
+ (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+ case IBV_WR_BIND_MW:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof
+ (struct mlx4_wqe_bind_seg);
+ size += sizeof
+ (struct mlx4_wqe_bind_seg) / 16;
+ break;
+ case IBV_WR_SEND_WITH_INV:
+ ctrl->imm = htobe32(wr->imm_data);
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case IBV_QPT_UD:
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ case IBV_QPT_RAW_PACKET:
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+
+ inl = 0;
+
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ addr = (void *) (uintptr_t) wr->sg_list[i].addr;
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (inl > qp->max_inline_data) {
+ inl = 0;
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ udma_to_device_barrier(); /* see comment below */
+ seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = sizeof *seg;
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (seg_len) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ udma_to_device_barrier();
+ seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
+ }
+
+ size += (inl + num_seg * sizeof * seg + 15) / 16;
+ } else {
+ struct mlx4_wqe_data_seg *seg = wqe;
+
+ for (i = wr->num_sge - 1; i >= 0 ; --i)
+ set_data_seg(seg + i, wr->sg_list + i);
+
+ size += wr->num_sge * (sizeof *seg / 16);
+ }
+
+ ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ udma_to_device_barrier();
+
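+		/*
+		 * The ownership bit toggles each time the producer index
+		 * wraps around the power-of-two send queue.
+		 */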
+ ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
+			(ind & qp->sq.wqe_cnt ? htobe32(1U << 31) : 0);
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (wr->next)
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+
+ ++ind;
+ }
+
+out:
+ ctx = to_mctx(ibqp->context);
+
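+	/*
+	 * If a single, small WQE was posted and any payload is inline, it can
+	 * be written straight through the BlueFlame page, avoiding the
+	 * regular doorbell write; otherwise ring the send doorbell once for
+	 * the whole batch.
+	 */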
+ if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
+ ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
+
+ ctrl->bf_qpn |= qp->doorbell_qpn;
+ ++qp->sq.head;
+ /*
+		 * Make sure that the descriptor is written to memory
+		 * before writing to the BlueFlame page.
+ */
+ mmio_wc_spinlock(&ctx->bf_lock);
+
+ mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
+ align(size * 16, 64));
+ /* Flush before toggling bf_offset to be latency oriented */
+ mmio_flush_writes();
+
+ ctx->bf_offset ^= ctx->bf_buf_size;
+
+ pthread_spin_unlock(&ctx->bf_lock);
+ } else if (nreq) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
+ qp->doorbell_qpn);
+ }
+
+ if (nreq)
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+ (qp->sq.wqe_cnt - 1));
+
+ pthread_spin_unlock(&qp->sq.lock);
+
+ return ret;
+}
+
+int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ int ret = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ pthread_spin_lock(&qp->rq.lock);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->num_sge > qp->rq.max_gs) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (nreq) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ *qp->db = htobe32(qp->rq.head & 0xffff);
+ }
+
+ pthread_spin_unlock(&qp->rq.lock);
+
+ return ret;
+}
+
+static int num_inline_segs(int data, enum ibv_qp_type type)
+{
+ /*
+ * Inline data segments are not allowed to cross 64 byte
+ * boundaries. For UD QPs, the data segments always start
+ * aligned to 64 bytes (16 byte control segment + 48 byte
+ * datagram segment); for other QPs, there will be a 16 byte
+ * control segment and possibly a 16 byte remote address
+ * segment, so in the worst case there will be only 32 bytes
+ * available for the first data segment.
+ */
+ if (type == IBV_QPT_UD)
+ data += (sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg)) %
+ MLX4_INLINE_ALIGN;
+ else
+ data += (sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg)) %
+ MLX4_INLINE_ALIGN;
+
+ return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
+ (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
+}
+
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+ struct mlx4_qp *qp)
+{
+ int size;
+ int max_sq_sge;
+
+ max_sq_sge = align(cap->max_inline_data +
+ num_inline_segs(cap->max_inline_data, type) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) /
+ sizeof (struct mlx4_wqe_data_seg);
+ if (max_sq_sge < cap->max_send_sge)
+ max_sq_sge = cap->max_send_sge;
+
+ size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
+ switch (type) {
+ case IBV_QPT_UD:
+ size += sizeof (struct mlx4_wqe_datagram_seg);
+ break;
+
+ case IBV_QPT_UC:
+ size += sizeof (struct mlx4_wqe_raddr_seg);
+ break;
+
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_RC:
+ size += sizeof (struct mlx4_wqe_raddr_seg);
+ /*
+ * An atomic op will require an atomic segment, a
+ * remote address segment and one scatter entry.
+ */
+ if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_data_seg)))
+ size = (sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_data_seg));
+ break;
+
+ default:
+ break;
+ }
+
+ /* Make sure that we have enough space for a bind request */
+ if (size < sizeof (struct mlx4_wqe_bind_seg))
+ size = sizeof (struct mlx4_wqe_bind_seg);
+
+ size += sizeof (struct mlx4_wqe_ctrl_seg);
+
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+ qp->sq.wqe_shift++)
+ ; /* nothing */
+}
+
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type, struct mlx4_qp *qp)
+{
+ qp->rq.max_gs = cap->max_recv_sge;
+
+ if (qp->sq.wqe_cnt) {
+ qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
+ if (!qp->sq.wrid)
+ return -1;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
+ if (!qp->rq.wrid) {
+ free(qp->sq.wrid);
+ return -1;
+ }
+ }
+
+ for (qp->rq.wqe_shift = 4;
+ 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
+ qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ if (qp->buf_size) {
+ if (mlx4_alloc_buf(&qp->buf,
+ align(qp->buf_size, to_mdev(context->device)->page_size),
+ to_mdev(context->device)->page_size)) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ return -1;
+ }
+
+ memset(qp->buf.buf, 0, qp->buf_size);
+ } else {
+ qp->buf.buf = NULL;
+ }
+
+ return 0;
+}
+
+void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
+ enum ibv_qp_type type)
+{
+ int wqe_size;
+
+ wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
+ switch (type) {
+ case IBV_QPT_UD:
+ wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
+ break;
+
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_UC:
+ case IBV_QPT_RC:
+ wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
+ break;
+
+ default:
+ break;
+ }
+
+ qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg);
+ cap->max_send_sge = qp->sq.max_gs;
+ qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_wr = qp->sq.max_post;
+
+ /*
+ * Inline data segments can't cross a 64 byte boundary. So
+ * subtract off one segment header for each 64-byte chunk,
+ * taking into account the fact that wqe_size will be 32 mod
+ * 64 for non-UD QPs.
+ */
+ qp->max_inline_data = wqe_size -
+ sizeof (struct mlx4_wqe_inline_seg) *
+ (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
+ cap->max_inline_data = qp->max_inline_data;
+}
+
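+/*
+ * The QP table is two-level: the top-level index is derived from the QP
+ * number, and each second-level array is allocated on first use and
+ * reference counted.
+ */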
+struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (ctx->qp_table[tind].refcnt)
+ return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
+ else
+ return NULL;
+}
+
+int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (!ctx->qp_table[tind].refcnt) {
+ ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
+ sizeof (struct mlx4_qp *));
+ if (!ctx->qp_table[tind].table)
+ return -1;
+ }
+
+ ++ctx->qp_table[tind].refcnt;
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
+ return 0;
+}
+
+void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
+{
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
+
+ if (!--ctx->qp_table[tind].refcnt)
+ free(ctx->qp_table[tind].table);
+ else
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
+}
diff --git a/contrib/ofed/libmlx4/srq.c b/contrib/ofed/libmlx4/srq.c
new file mode 100644
index 000000000000..b8d25bb343da
--- /dev/null
+++ b/contrib/ofed/libmlx4/srq.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+#include "wqe.h"
+#include "mlx4-abi.h"
+
+static void *get_wqe(struct mlx4_srq *srq, int n)
+{
+ return srq->buf.buf + (n << srq->wqe_shift);
+}
+
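+/*
+ * Free SRQ WQEs are chained through next_wqe_index; returning a WQE links
+ * it in after the current tail of the free list.
+ */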
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ pthread_spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = htobe16(ind);
+ srq->tail = ind;
+
+ pthread_spin_unlock(&srq->lock);
+}
+
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct mlx4_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ int err = 0;
+ int nreq;
+ int i;
+
+ pthread_spin_lock(&srq->lock);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wr->num_sge > srq->max_gs) {
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (srq->head == srq->tail) {
+			/* SRQ is full */
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16toh(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = htobe32(wr->sg_list[i].length);
+ scat[i].lkey = htobe32(wr->sg_list[i].lkey);
+ scat[i].addr = htobe64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (nreq) {
+ srq->counter += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+		 * we write the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ *srq->db = htobe32(srq->counter);
+ }
+
+ pthread_spin_unlock(&srq->lock);
+
+ return err;
+}
+
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+ struct mlx4_srq *srq)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scatter;
+ int size;
+ int buf_size;
+ int i;
+
+ srq->wrid = malloc(srq->max * sizeof (uint64_t));
+ if (!srq->wrid)
+ return -1;
+
+ size = sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->max_gs * sizeof (struct mlx4_wqe_data_seg);
+
+ for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift)
+ ; /* nothing */
+
+ buf_size = srq->max << srq->wqe_shift;
+
+ if (mlx4_alloc_buf(&srq->buf, buf_size,
+ to_mdev(pd->context->device)->page_size)) {
+ free(srq->wrid);
+ return -1;
+ }
+
+ memset(srq->buf.buf, 0, buf_size);
+
+ /*
+ * Now initialize the SRQ buffer so that all of the WQEs are
+ * linked into the list of free WQEs.
+ */
+
+ for (i = 0; i < srq->max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index = htobe16((i + 1) & (srq->max - 1));
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + (1 << srq->wqe_shift);
+ ++scatter)
+ scatter->lkey = htobe32(MLX4_INVALID_LKEY);
+ }
+
+ srq->head = 0;
+ srq->tail = srq->max - 1;
+
+ return 0;
+}
+
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size)
+{
+ memset(xsrq_table, 0, sizeof *xsrq_table);
+ xsrq_table->num_xsrq = size;
+ xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS;
+ xsrq_table->mask = (1 << xsrq_table->shift) - 1;
+
+ pthread_mutex_init(&xsrq_table->mutex, NULL);
+}
+
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+ int index;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ if (xsrq_table->xsrq_table[index].refcnt)
+ return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask];
+
+ return NULL;
+}
+
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq)
+{
+ int index, ret = 0;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+ if (!xsrq_table->xsrq_table[index].refcnt) {
+ xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1,
+ sizeof(struct mlx4_srq *));
+ if (!xsrq_table->xsrq_table[index].table) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ xsrq_table->xsrq_table[index].refcnt++;
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq;
+
+out:
+ pthread_mutex_unlock(&xsrq_table->mutex);
+ return ret;
+}
+
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+ int index;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+
+ if (--xsrq_table->xsrq_table[index].refcnt)
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL;
+ else
+ free(xsrq_table->xsrq_table[index].table);
+
+ pthread_mutex_unlock(&xsrq_table->mutex);
+}
+
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ struct mlx4_create_xsrq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
+ return NULL;
+
+ srq = calloc(1, sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->max = align_queue_size(attr_ex->attr.max_wr + 1);
+ srq->max_gs = attr_ex->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 1;
+
+ if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq,
+ sizeof(srq->verbs_srq),
+ attr_ex,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
+ srq->verbs_srq.srq_num, srq);
+ if (ret)
+ goto err_destroy;
+
+ return &srq->verbs_srq.srq;
+
+err_destroy:
+ ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+err:
+ free(srq);
+ return NULL;
+}
+
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
+{
+ struct mlx4_context *mctx = to_mctx(srq->context);
+ struct mlx4_srq *msrq = to_msrq(srq);
+ struct mlx4_cq *mcq;
+ int ret;
+
+ mcq = to_mcq(msrq->verbs_srq.cq);
+ mlx4_cq_clean(mcq, 0, msrq);
+ pthread_spin_lock(&mcq->lock);
+ mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
+ pthread_spin_unlock(&mcq->lock);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret) {
+ pthread_spin_lock(&mcq->lock);
+ mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
+ pthread_spin_unlock(&mcq->lock);
+ return ret;
+ }
+
+ mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
+ mlx4_free_buf(&msrq->buf);
+ free(msrq->wrid);
+ free(msrq);
+
+ return 0;
+}
diff --git a/contrib/ofed/libmlx4/verbs.c b/contrib/ofed/libmlx4/verbs.c
new file mode 100644
index 000000000000..f6f43f9bef76
--- /dev/null
+++ b/contrib/ofed/libmlx4/verbs.c
@@ -0,0 +1,1255 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <infiniband/endian.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "mlx4.h"
+#include "mlx4-abi.h"
+#include "wqe.h"
+
+int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
+{
+ struct ibv_query_device cmd;
+ uint64_t raw_fw_ver;
+ unsigned major, minor, sub_minor;
+ int ret;
+
+ ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
+ if (ret)
+ return ret;
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
+
+ snprintf(attr->fw_ver, sizeof attr->fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
+int mlx4_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr,
+ size_t attr_size)
+{
+ struct mlx4_context *mctx = to_mctx(context);
+ struct mlx4_query_device_ex_resp resp = {};
+ struct mlx4_query_device_ex cmd = {};
+ uint64_t raw_fw_ver;
+ unsigned sub_minor;
+ unsigned major;
+ unsigned minor;
+ int err;
+
+ err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
+ &raw_fw_ver,
+ &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp.ibv_resp),
+ sizeof(resp));
+ if (err)
+ return err;
+
+ if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) {
+ mctx->core_clock.offset = resp.hca_core_clock_offset;
+ mctx->core_clock.offset_valid = 1;
+ }
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
+
+ snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
+#define READL(ptr) (*((uint32_t *)(ptr)))
+static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
+{
+ unsigned int clockhi, clocklo, clockhi1;
+ int i;
+ struct mlx4_context *ctx = to_mctx(context);
+
+ if (!ctx->hca_core_clock)
+ return -EOPNOTSUPP;
+
+ /* Handle wraparound */
+ for (i = 0; i < 2; i++) {
+ clockhi = be32toh(READL(ctx->hca_core_clock));
+ clocklo = be32toh(READL(ctx->hca_core_clock + 4));
+ clockhi1 = be32toh(READL(ctx->hca_core_clock));
+ if (clockhi == clockhi1)
+ break;
+ }
+
+ *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;
+
+ return 0;
+}
+
+int mlx4_query_rt_values(struct ibv_context *context,
+ struct ibv_values_ex *values)
+{
+ uint32_t comp_mask = 0;
+ int err = 0;
+
+ if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
+ uint64_t cycles;
+
+ err = mlx4_read_clock(context, &cycles);
+ if (!err) {
+ values->raw_clock.tv_sec = 0;
+ values->raw_clock.tv_nsec = cycles;
+ comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
+ }
+ }
+
+ values->comp_mask = comp_mask;
+
+ return err;
+}
+
+int mlx4_query_port(struct ibv_context *context, uint8_t port,
+ struct ibv_port_attr *attr)
+{
+ struct ibv_query_port cmd;
+ int err;
+
+ err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
+ if (!err && port <= MLX4_PORTS_NUM && port > 0) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (!mctx->port_query_cache[port - 1].valid) {
+ mctx->port_query_cache[port - 1].link_layer =
+ attr->link_layer;
+ mctx->port_query_cache[port - 1].caps =
+ attr->port_cap_flags;
+ mctx->port_query_cache[port - 1].valid = 1;
+ }
+ }
+
+ return err;
+}
+
+/* Only the fields in the port cache will be valid */
+static int query_port_cache(struct ibv_context *context, uint8_t port_num,
+ struct ibv_port_attr *port_attr)
+{
+ struct mlx4_context *mctx = to_mctx(context);
+ if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
+ return -EINVAL;
+ if (mctx->port_query_cache[port_num - 1].valid) {
+		port_attr->link_layer =
+			mctx->port_query_cache[port_num - 1].link_layer;
+		port_attr->port_cap_flags =
+			mctx->port_query_cache[port_num - 1].caps;
+ return 0;
+ }
+ return mlx4_query_port(context, port_num,
+ (struct ibv_port_attr *)port_attr);
+
+}
+
+struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
+{
+ struct ibv_alloc_pd cmd;
+ struct mlx4_alloc_pd_resp resp;
+ struct mlx4_pd *pd;
+
+ pd = malloc(sizeof *pd);
+ if (!pd)
+ return NULL;
+
+ if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp)) {
+ free(pd);
+ return NULL;
+ }
+
+ pd->pdn = resp.pdn;
+
+ return &pd->ibv_pd;
+}
+
+int mlx4_free_pd(struct ibv_pd *pd)
+{
+ int ret;
+
+ ret = ibv_cmd_dealloc_pd(pd);
+ if (ret)
+ return ret;
+
+ free(to_mpd(pd));
+ return 0;
+}
+
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr)
+{
+ struct ibv_open_xrcd cmd;
+ struct ibv_open_xrcd_resp resp;
+ struct verbs_xrcd *xrcd;
+ int ret;
+
+ xrcd = calloc(1, sizeof *xrcd);
+ if (!xrcd)
+ return NULL;
+
+ ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &xrcd->xrcd;
+
+err:
+ free(xrcd);
+ return NULL;
+}
+
+int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
+{
+ struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
+ int ret;
+
+ ret = ibv_cmd_close_xrcd(xrcd);
+ if (!ret)
+ free(xrcd);
+
+ return ret;
+}
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
+ int access)
+{
+ struct ibv_mr *mr;
+ struct ibv_reg_mr cmd;
+ struct ibv_reg_mr_resp resp;
+ int ret;
+
+ mr = malloc(sizeof *mr);
+ if (!mr)
+ return NULL;
+
+ ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
+ access, mr, &cmd, sizeof cmd,
+ &resp, sizeof resp);
+ if (ret) {
+ free(mr);
+ return NULL;
+ }
+
+ return mr;
+}
+
+int mlx4_rereg_mr(struct ibv_mr *mr,
+ int flags,
+ struct ibv_pd *pd, void *addr,
+ size_t length, int access)
+{
+ struct ibv_rereg_mr cmd;
+ struct ibv_rereg_mr_resp resp;
+
+ if (flags & IBV_REREG_MR_KEEP_VALID)
+ return ENOTSUP;
+
+ return ibv_cmd_rereg_mr(mr, flags, addr, length,
+ (uintptr_t)addr,
+ access, pd,
+ &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+}
+
+int mlx4_dereg_mr(struct ibv_mr *mr)
+{
+ int ret;
+
+ ret = ibv_cmd_dereg_mr(mr);
+ if (ret)
+ return ret;
+
+ free(mr);
+ return 0;
+}
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
+{
+ struct ibv_mw *mw;
+ struct ibv_alloc_mw cmd;
+ struct ibv_alloc_mw_resp resp;
+ int ret;
+
+ mw = calloc(1, sizeof(*mw));
+ if (!mw)
+ return NULL;
+
+ ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+
+ if (ret) {
+ free(mw);
+ return NULL;
+ }
+
+ return mw;
+}
+
+int mlx4_dealloc_mw(struct ibv_mw *mw)
+{
+ int ret;
+ struct ibv_dealloc_mw cmd;
+
+ ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ free(mw);
+ return 0;
+}
+
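+/*
+ * Memory window bind is implemented by posting an IBV_WR_BIND_MW work
+ * request on the QP's send queue; on success the MW is updated with the
+ * newly generated rkey.
+ */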
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind)
+{
+ struct ibv_send_wr *bad_wr = NULL;
+ struct ibv_send_wr wr = { };
+ int ret;
+
+ wr.opcode = IBV_WR_BIND_MW;
+ wr.next = NULL;
+
+ wr.wr_id = mw_bind->wr_id;
+ wr.send_flags = mw_bind->send_flags;
+
+ wr.bind_mw.mw = mw;
+ wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
+ wr.bind_mw.bind_info = mw_bind->bind_info;
+
+ ret = mlx4_post_send(qp, &wr, &bad_wr);
+
+ if (ret)
+ return ret;
+
+ /* updating the mw with the latest rkey. */
+ mw->rkey = wr.bind_mw.rkey;
+
+ return 0;
+}
+
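+/*
+ * Round a requested queue size up to the next power of two, so that
+ * (size - 1) can be used as an index mask.
+ */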
+int align_queue_size(int req)
+{
+ int nent;
+
+ for (nent = 1; nent < req; nent <<= 1)
+ ; /* nothing */
+
+ return nent;
+}
+
+enum {
+ CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS |
+ IBV_WC_EX_WITH_COMPLETION_TIMESTAMP
+};
+
+enum {
+ CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
+};
+
+enum {
+ CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
+};
+
+static int mlx4_cmd_create_cq(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ struct mlx4_cq *cq)
+{
+ struct mlx4_create_cq cmd = {};
+ struct mlx4_create_cq_resp resp = {};
+ int ret;
+
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+ ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel,
+ cq_attr->comp_vector,
+ ibv_cq_ex_to_cq(&cq->ibv_cq),
+ &cmd.ibv_cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp));
+ if (!ret)
+ cq->cqn = resp.cqn;
+
+ return ret;
+
+}
+
+static int mlx4_cmd_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ struct mlx4_cq *cq)
+{
+ struct mlx4_create_cq_ex cmd = {};
+ struct mlx4_create_cq_resp_ex resp = {};
+ int ret;
+
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+
+ ret = ibv_cmd_create_cq_ex(context, cq_attr,
+ &cq->ibv_cq, &cmd.ibv_cmd,
+ sizeof(cmd.ibv_cmd),
+ sizeof(cmd),
+ &resp.ibv_resp,
+ sizeof(resp.ibv_resp),
+ sizeof(resp));
+ if (!ret)
+ cq->cqn = resp.cqn;
+
+ return ret;
+}
+
+static struct ibv_cq_ex *create_cq(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr,
+ int cq_alloc_flags)
+{
+ struct mlx4_cq *cq;
+ int ret;
+ struct mlx4_context *mctx = to_mctx(context);
+
+ /* Sanity check CQ size before proceeding */
+ if (cq_attr->cqe > 0x3fffff) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+ cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS)
+ return NULL;
+
+ /* mlx4 devices don't support slid and sl in cqe when completion
+ * timestamp is enabled in the CQ
+ */
+ if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) &&
+ (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) {
+ errno = ENOTSUP;
+ return NULL;
+ }
+
+ cq = malloc(sizeof *cq);
+ if (!cq)
+ return NULL;
+
+ cq->cons_index = 0;
+
+ if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
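+	/*
+	 * Allocate one spare CQE, rounded up to a power of two; the extra
+	 * entry is subtracted again before the create command is issued.
+	 */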
+ cq_attr->cqe = align_queue_size(cq_attr->cqe + 1);
+
+ if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size))
+ goto err;
+
+ cq->cqe_size = mctx->cqe_size;
+ cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
+ if (!cq->set_ci_db)
+ goto err_buf;
+
+ cq->arm_db = cq->set_ci_db + 1;
+ *cq->arm_db = 0;
+ cq->arm_sn = 1;
+ *cq->set_ci_db = 0;
+ cq->flags = cq_alloc_flags;
+
+ if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
+ cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
+ cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED;
+
+ --cq_attr->cqe;
+ if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+ ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq);
+ else
+ ret = mlx4_cmd_create_cq(context, cq_attr, cq);
+
+ if (ret)
+ goto err_db;
+
+ if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
+ mlx4_cq_fill_pfns(cq, cq_attr);
+
+ return &cq->ibv_cq;
+
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
+
+err_buf:
+ mlx4_free_buf(&cq->buf);
+
+err:
+ free(cq);
+
+ return NULL;
+}
+
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector)
+{
+ struct ibv_cq_ex *cq;
+ struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
+ .comp_vector = comp_vector,
+ .wc_flags = IBV_WC_STANDARD_FLAGS};
+
+ cq = create_cq(context, &cq_attr, 0);
+ return cq ? ibv_cq_ex_to_cq(cq) : NULL;
+}
+
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr)
+{
+ /*
+ * Make local copy since some attributes might be adjusted
+	 * Make a local copy, since some attributes might be adjusted
+ */
+ struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
+ .channel = cq_attr->channel,
+ .comp_vector = cq_attr->comp_vector,
+ .wc_flags = cq_attr->wc_flags,
+ .comp_mask = cq_attr->comp_mask,
+ .flags = cq_attr->flags};
+
+ return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
+}
+
+int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_resize_cq cmd;
+ struct ibv_resize_cq_resp resp;
+ struct mlx4_buf buf;
+ int old_cqe, outst_cqe, ret;
+
+ /* Sanity check CQ size before proceeding */
+ if (cqe > 0x3fffff)
+ return EINVAL;
+
+ pthread_spin_lock(&cq->lock);
+
+ cqe = align_queue_size(cqe + 1);
+ if (cqe == ibcq->cqe + 1) {
+ ret = 0;
+ goto out;
+ }
+
+	/* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_get_outstanding_cqes(cq);
+ if (cqe < outst_cqe + 1) {
+ ret = EINVAL;
+ goto out;
+ }
+
+ ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size);
+ if (ret)
+ goto out;
+
+ old_cqe = ibcq->cqe;
+ cmd.buf_addr = (uintptr_t) buf.buf;
+
+ ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd,
+ &resp, sizeof resp);
+ if (ret) {
+ mlx4_free_buf(&buf);
+ goto out;
+ }
+
+ mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
+
+ mlx4_free_buf(&cq->buf);
+ cq->buf = buf;
+ mlx4_update_cons_index(cq);
+
+out:
+ pthread_spin_unlock(&cq->lock);
+ return ret;
+}
+
+int mlx4_destroy_cq(struct ibv_cq *cq)
+{
+ int ret;
+
+ ret = ibv_cmd_destroy_cq(cq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
+ mlx4_free_buf(&to_mcq(cq)->buf);
+ free(to_mcq(cq));
+
+ return 0;
+}
+
+struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr)
+{
+ struct mlx4_create_srq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
+ return NULL;
+
+ srq = malloc(sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->max = align_queue_size(attr->attr.max_wr + 1);
+ srq->max_gs = attr->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 0;
+
+ if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ return &srq->verbs_srq.srq;
+
+err_db:
+ mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
+
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+
+err:
+ free(srq);
+
+ return NULL;
+}
+
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
+ (attr_ex->srq_type == IBV_SRQT_BASIC))
+ return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+ else if (attr_ex->srq_type == IBV_SRQT_XRC)
+ return mlx4_create_xrc_srq(context, attr_ex);
+
+ return NULL;
+}
+
+int mlx4_modify_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr,
+ int attr_mask)
+{
+ struct ibv_modify_srq cmd;
+
+ return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
+}
+
+int mlx4_query_srq(struct ibv_srq *srq,
+ struct ibv_srq_attr *attr)
+{
+ struct ibv_query_srq cmd;
+
+ return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
+}
+
+int mlx4_destroy_srq(struct ibv_srq *srq)
+{
+ int ret;
+
+ if (to_msrq(srq)->ext_srq)
+ return mlx4_destroy_xrc_srq(srq);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
+ mlx4_free_buf(&to_msrq(srq)->buf);
+ free(to_msrq(srq)->wrid);
+ free(to_msrq(srq));
+
+ return 0;
+}
+
+static int mlx4_cmd_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr,
+ struct mlx4_create_qp *cmd,
+ struct mlx4_qp *qp)
+{
+ struct mlx4_create_qp_ex cmd_ex;
+ struct mlx4_create_qp_resp_ex resp;
+ int ret;
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
+ offsetof(typeof(cmd->ibv_cmd), is_srq) +
+ sizeof(cmd->ibv_cmd.is_srq) -
+ offsetof(typeof(cmd->ibv_cmd), user_handle));
+
+ memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
+ offsetof(typeof(*cmd), sq_no_prefetch) +
+ sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd));
+
+ ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
+ sizeof(cmd_ex), &resp.ibv_resp,
+ sizeof(resp.ibv_resp), sizeof(resp));
+ return ret;
+}
+
+enum {
+ MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_XRCD |
+ IBV_QP_INIT_ATTR_CREATE_FLAGS),
+};
+
+enum {
+ MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS),
+};
+
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ struct mlx4_create_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
+
+ /* Sanity check QP size before proceeding */
+ if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */
+ if (attr->cap.max_send_wr > ctx->max_qp_wr ||
+ attr->cap.max_recv_wr > ctx->max_qp_wr ||
+ attr->cap.max_send_sge > ctx->max_sge ||
+ attr->cap.max_recv_sge > ctx->max_sge)
+ return NULL;
+ } else {
+ if (attr->cap.max_send_wr > 65536 ||
+ attr->cap.max_recv_wr > 65536 ||
+ attr->cap.max_send_sge > 64 ||
+ attr->cap.max_recv_sge > 64)
+ return NULL;
+ }
+ if (attr->cap.max_inline_data > 1024)
+ return NULL;
+
+ if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK)
+ return NULL;
+
+ qp = calloc(1, sizeof *qp);
+ if (!qp)
+ return NULL;
+
+ if (attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
+ } else {
+ mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+ qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+ }
+
+ if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
+ attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
+ } else {
+ qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
+ if (attr->cap.max_recv_sge < 1)
+ attr->cap.max_recv_sge = 1;
+ if (attr->cap.max_recv_wr < 1)
+ attr->cap.max_recv_wr = 1;
+ }
+
+ if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp))
+ goto err;
+
+ mlx4_init_qp_indices(qp);
+
+ if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
+ pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
+ goto err_free;
+
+ if (attr->cap.max_recv_sge) {
+ qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!qp->db)
+ goto err_free;
+
+ *qp->db = 0;
+ cmd.db_addr = (uintptr_t) qp->db;
+ } else {
+ cmd.db_addr = 0;
+ }
+
+ cmd.buf_addr = (uintptr_t) qp->buf.buf;
+ cmd.log_sq_stride = qp->sq.wqe_shift;
+ for (cmd.log_sq_bb_count = 0;
+ qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
+ ++cmd.log_sq_bb_count)
+ ; /* nothing */
+ cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
+ memset(cmd.reserved, 0, sizeof cmd.reserved);
+ pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);
+
+ if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK)
+ ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp);
+ else
+ ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ &cmd.ibv_cmd, sizeof(cmd), &resp,
+ sizeof(resp));
+ if (ret)
+ goto err_rq_db;
+
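+ /*
+ * Register the QP in the context's qp_num -> mlx4_qp table so the
+ * CQ poll path can map completions back to this QP.
+ */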
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
+ ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
+ if (ret)
+ goto err_destroy;
+ }
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+
+ qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+ if (attr->qp_type != IBV_QPT_XRC_RECV)
+ mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
+
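+ /*
+ * doorbell_qpn is the QP number pre-shifted and byte-swapped for the
+ * send doorbell; sq_signal_bits caches the CQ_UPDATE flag so every
+ * send WQE generates a completion when sq_sig_all is set.
+ */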
+ qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8);
+ if (attr->sq_sig_all)
+ qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE);
+ else
+ qp->sq_signal_bits = 0;
+
+ return &qp->verbs_qp.qp;
+
+err_destroy:
+ ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
+
+err_rq_db:
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+ if (attr->cap.max_recv_sge)
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);
+
+err_free:
+ free(qp->sq.wrid);
+ if (qp->rq.wqe_cnt)
+ free(qp->rq.wrid);
+ mlx4_free_buf(&qp->buf);
+
+err:
+ free(qp);
+
+ return NULL;
+}
+
+struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
+{
+ struct ibv_qp_init_attr_ex attr_ex;
+ struct ibv_qp *qp;
+
+ memcpy(&attr_ex, attr, sizeof *attr);
+ attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
+ attr_ex.pd = pd;
+ qp = mlx4_create_qp_ex(pd->context, &attr_ex);
+ if (qp)
+ memcpy(attr, &attr_ex, sizeof *attr);
+ return qp;
+}
+
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
+{
+ struct ibv_open_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
+
+ qp = calloc(1, sizeof *qp);
+ if (!qp)
+ return NULL;
+
+ ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &qp->verbs_qp.qp;
+
+err:
+ free(qp);
+ return NULL;
+}
+
+int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
+ int attr_mask,
+ struct ibv_qp_init_attr *init_attr)
+{
+ struct ibv_query_qp cmd;
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int ret;
+
+ ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd);
+ if (ret)
+ return ret;
+
+ init_attr->cap.max_send_wr = qp->sq.max_post;
+ init_attr->cap.max_send_sge = qp->sq.max_gs;
+ init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ attr->cap = init_attr->cap;
+
+ return 0;
+}
+
+int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask)
+{
+ struct ibv_modify_qp cmd = {};
+ struct ibv_port_attr port_attr;
+ struct mlx4_qp *mqp = to_mqp(qp);
+ struct ibv_device_attr device_attr;
+ int ret;
+
+ memset(&device_attr, 0, sizeof(device_attr));
+ if (attr_mask & IBV_QP_PORT) {
+ ret = ibv_query_port(qp->context, attr->port_num,
+ &port_attr);
+ if (ret)
+ return ret;
+ mqp->link_layer = port_attr.link_layer;
+
+ ret = ibv_query_device(qp->context, &device_attr);
+ if (ret)
+ return ret;
+
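+ /*
+ * Cache checksum offload support for this QP type and link layer
+ * so the send and poll paths can test qp_cap_cache cheaply.
+ */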
+ switch (qp->qp_type) {
+ case IBV_QPT_UD:
+ if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) &&
+ (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM))
+ mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB |
+ MLX4_RX_CSUM_VALID;
+ break;
+ case IBV_QPT_RAW_PACKET:
+ if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) &&
+ (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM))
+ mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH |
+ MLX4_RX_CSUM_VALID;
+ break;
+ default:
+ break;
+ }
+ }
+
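+ /* Leaving RESET: re-initialize the SQ WQE ownership bits. */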
+ if (qp->state == IBV_QPS_RESET &&
+ attr_mask & IBV_QP_STATE &&
+ attr->qp_state == IBV_QPS_INIT) {
+ mlx4_qp_init_sq_ownership(to_mqp(qp));
+ }
+
+ ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd);
+
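+ /*
+ * A transition back to RESET discards all outstanding work: purge
+ * this QP's CQEs from both CQs and reset the software indices and
+ * the RQ doorbell record.
+ */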
+ if (!ret &&
+ (attr_mask & IBV_QP_STATE) &&
+ attr->qp_state == IBV_QPS_RESET) {
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
+
+ mlx4_init_qp_indices(to_mqp(qp));
+ if (to_mqp(qp)->rq.wqe_cnt)
+ *to_mqp(qp)->db = 0;
+ }
+
+ return ret;
+}
+
+static void mlx4_lock_cqs(struct ibv_qp *qp)
+{
+ struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+ struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
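+ /*
+ * Take both CQ locks in a fixed order (lower cqn first) so that
+ * concurrent QP teardown on the same CQ pair cannot deadlock.
+ */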
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_lock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_lock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ pthread_spin_lock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ pthread_spin_lock(&send_cq->lock);
+ pthread_spin_lock(&recv_cq->lock);
+ } else {
+ pthread_spin_lock(&recv_cq->lock);
+ pthread_spin_lock(&send_cq->lock);
+ }
+}
+
+static void mlx4_unlock_cqs(struct ibv_qp *qp)
+{
+ struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+ struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_unlock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_unlock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ pthread_spin_unlock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ pthread_spin_unlock(&recv_cq->lock);
+ pthread_spin_unlock(&send_cq->lock);
+ } else {
+ pthread_spin_unlock(&send_cq->lock);
+ pthread_spin_unlock(&recv_cq->lock);
+ }
+}
+
+int mlx4_destroy_qp(struct ibv_qp *ibqp)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int ret;
+
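+ /*
+ * Hold qp_table_mutex across the destroy so qp table updates and
+ * CQE cleanup stay consistent with concurrent QP creation.
+ */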
+ pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex);
+ ret = ibv_cmd_destroy_qp(ibqp);
+ if (ret) {
+ pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+ return ret;
+ }
+
+ mlx4_lock_cqs(ibqp);
+
+ if (ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
+
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+ mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+
+ mlx4_unlock_cqs(ibqp);
+ pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+
+ if (qp->rq.wqe_cnt) {
+ mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
+ free(qp->rq.wrid);
+ }
+ if (qp->sq.wqe_cnt)
+ free(qp->sq.wrid);
+ mlx4_free_buf(&qp->buf);
+ free(qp);
+
+ return 0;
+}
+
+static int link_local_gid(const union ibv_gid *gid)
+{
+ uint32_t *tmp = (uint32_t *)gid->raw;
+ uint32_t hi = tmp[0];
+ uint32_t lo = tmp[1];
+
+ if (hi == htobe32(0xfe800000) && lo == 0)
+ return 1;
+
+ return 0;
+}
+
+static int is_multicast_gid(const union ibv_gid *gid)
+{
+ return gid->raw[0] == 0xff;
+}
+
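+/*
+ * Return the VLAN id encoded in bytes 11-12 of the GID; values of
+ * 0x1000 and above mean no VLAN and are reported as 0xffff.
+ */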
+static uint16_t get_vlan_id(union ibv_gid *gid)
+{
+ uint16_t vid;
+ vid = gid->raw[11] << 8 | gid->raw[12];
+ return vid < 0x1000 ? vid : 0xffff;
+}
+
+static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah,
+ struct ibv_ah_attr *attr)
+{
+ int err, i;
+ uint16_t vid;
+ union ibv_gid sgid;
+
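+ /*
+ * A link-local GID embeds a modified EUI-64: rebuild the MAC from
+ * GID bytes 8-10 and 13-15 and flip the universal/local bit, with
+ * bytes 11-12 optionally carrying a VLAN id.  A multicast GID maps
+ * to the 33:33 Ethernet multicast prefix plus its low 32 bits.
+ */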
+ if (link_local_gid(&attr->grh.dgid)) {
+ memcpy(ah->mac, &attr->grh.dgid.raw[8], 3);
+ memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3);
+ ah->mac[0] ^= 2;
+
+ vid = get_vlan_id(&attr->grh.dgid);
+ } else if (is_multicast_gid(&attr->grh.dgid)) {
+ ah->mac[0] = 0x33;
+ ah->mac[1] = 0x33;
+ for (i = 2; i < 6; ++i)
+ ah->mac[i] = attr->grh.dgid.raw[i + 10];
+
+ err = ibv_query_gid(pd->context, attr->port_num,
+ attr->grh.sgid_index, &sgid);
+ if (err)
+ return err;
+
+ ah->av.dlid = htobe16(0xc000);
+ ah->av.port_pd |= htobe32(1 << 31);
+
+ vid = get_vlan_id(&sgid);
+ } else
+ return 1;
+
+ if (vid != 0xffff) {
+ ah->av.port_pd |= htobe32(1 << 29);
+ ah->vlan = vid | ((attr->sl & 7) << 13);
+ }
+
+ return 0;
+}
+
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+{
+ struct mlx4_ah *ah;
+ struct ibv_port_attr port_attr;
+
+ if (query_port_cache(pd->context, attr->port_num, &port_attr))
+ return NULL;
+
+ ah = malloc(sizeof *ah);
+ if (!ah)
+ return NULL;
+
+ memset(&ah->av, 0, sizeof ah->av);
+
+ ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24));
+
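+ /*
+ * IB ports carry a 4-bit SL at bit 28 of sl_tclass_flowlabel;
+ * Ethernet (RoCE) ports carry a 3-bit SL at bit 29.
+ */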
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ ah->av.g_slid = attr->src_path_bits;
+ ah->av.dlid = htobe16(attr->dlid);
+ ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28);
+ } else
+ ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29);
+
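+ /*
+ * Convert the verbs static rate encoding to the mlx4 hardware
+ * encoding by adding MLX4_STAT_RATE_OFFSET.
+ */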
+ if (attr->static_rate) {
+ ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ /* XXX check rate cap? */
+ }
+ if (attr->is_global) {
+ ah->av.g_slid |= 0x80;
+ ah->av.gid_index = attr->grh.sgid_index;
+ ah->av.hop_limit = attr->grh.hop_limit;
+ ah->av.sl_tclass_flowlabel |=
+ htobe32((attr->grh.traffic_class << 20) |
+ attr->grh.flow_label);
+ memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
+ }
+
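+ /*
+ * On RoCE ports, IP-based GIDs are resolved to a destination MAC
+ * and VLAN by libibverbs; otherwise the L2 address is derived
+ * directly from the GID (link-local or multicast).
+ */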
+ if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+ if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) {
+ uint16_t vid;
+
+ if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
+ ah->mac, &vid)) {
+ free(ah);
+ return NULL;
+ }
+
+ if (vid <= 0xfff) {
+ ah->av.port_pd |= htobe32(1 << 29);
+ ah->vlan = vid |
+ ((attr->sl & 7) << 13);
+ }
+
+ } else {
+ if (mlx4_resolve_grh_to_l2(pd, ah, attr)) {
+ free(ah);
+ return NULL;
+ }
+ }
+ }
+
+ return &ah->ibv_ah;
+}
+
+int mlx4_destroy_ah(struct ibv_ah *ah)
+{
+ free(to_mah(ah));
+
+ return 0;
+}
diff --git a/contrib/ofed/libmlx4/wqe.h b/contrib/ofed/libmlx4/wqe.h
new file mode 100644
index 000000000000..6f833d9bf76b
--- /dev/null
+++ b/contrib/ofed/libmlx4/wqe.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef WQE_H
+#define WQE_H
+
+#include <stdint.h>
+
+enum {
+ MLX4_SEND_DOORBELL = 0x14,
+};
+
+enum {
+ MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
+ MLX4_WQE_CTRL_IP_HDR_CSUM = 1 << 4,
+ MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+ MLX4_WQE_CTRL_FENCE = 1 << 6,
+ MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7
+};
+
+enum {
+ MLX4_WQE_BIND_TYPE_2 = (1<<31),
+ MLX4_WQE_BIND_ZERO_BASED = (1<<30),
+};
+
+enum {
+ MLX4_INLINE_SEG = 1 << 31,
+ MLX4_INLINE_ALIGN = 64,
+};
+
+enum {
+ MLX4_INVALID_LKEY = 0x100,
+};
+
+struct mlx4_wqe_ctrl_seg {
+ uint32_t owner_opcode;
+ union {
+ struct {
+ uint8_t reserved[3];
+ uint8_t fence_size;
+ };
+ uint32_t bf_qpn;
+ };
+ /*
+ * High 24 bits are SRC remote buffer; low 8 bits are flags:
+ * [7] SO (strong ordering)
+ * [5] TCP/UDP checksum
+ * [4] IP checksum
+ * [3:2] C (generate completion queue entry)
+ * [1] SE (solicited event)
+ * [0] FL (force loopback)
+ */
+ uint32_t srcrb_flags;
+ /*
+ * imm is immediate data for send/RDMA write w/ immediate;
+ * also invalidation key for send with invalidate; input
+ * modifier for WQEs on CCQs.
+ */
+ uint32_t imm;
+};
+
+struct mlx4_wqe_datagram_seg {
+ uint32_t av[8];
+ uint32_t dqpn;
+ uint32_t qkey;
+ uint16_t vlan;
+ uint8_t mac[6];
+};
+
+struct mlx4_wqe_data_seg {
+ uint32_t byte_count;
+ uint32_t lkey;
+ uint64_t addr;
+};
+
+struct mlx4_wqe_inline_seg {
+ uint32_t byte_count;
+};
+
+struct mlx4_wqe_srq_next_seg {
+ uint16_t reserved1;
+ uint16_t next_wqe_index;
+ uint32_t reserved2[3];
+};
+
+struct mlx4_wqe_local_inval_seg {
+ uint64_t reserved1;
+ uint32_t mem_key;
+ uint32_t reserved2;
+ uint64_t reserved3[2];
+};
+
+enum {
+ MLX4_WQE_MW_REMOTE_READ = 1 << 29,
+ MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+ MLX4_WQE_MW_ATOMIC = 1 << 31
+};
+
+struct mlx4_wqe_raddr_seg {
+ uint64_t raddr;
+ uint32_t rkey;
+ uint32_t reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+ uint64_t swap_add;
+ uint64_t compare;
+};
+
+struct mlx4_wqe_bind_seg {
+ uint32_t flags1;
+ uint32_t flags2;
+ uint32_t new_rkey;
+ uint32_t lkey;
+ uint64_t addr;
+ uint64_t length;
+};
+
+#endif /* WQE_H */