Diffstat (limited to 'contrib/ofed/libmlx4')
-rw-r--r-- | contrib/ofed/libmlx4/buf.c | 64
-rw-r--r-- | contrib/ofed/libmlx4/config.h | 13
-rw-r--r-- | contrib/ofed/libmlx4/cq.c | 819
-rw-r--r-- | contrib/ofed/libmlx4/dbrec.c | 151
-rw-r--r-- | contrib/ofed/libmlx4/doorbell.h | 70
-rw-r--r-- | contrib/ofed/libmlx4/mlx4-abi.h | 159
-rw-r--r-- | contrib/ofed/libmlx4/mlx4.c | 327
-rw-r--r-- | contrib/ofed/libmlx4/mlx4.h | 458
-rw-r--r-- | contrib/ofed/libmlx4/mmio.h | 116
-rw-r--r-- | contrib/ofed/libmlx4/qp.c | 776
-rw-r--r-- | contrib/ofed/libmlx4/srq.c | 325
-rw-r--r-- | contrib/ofed/libmlx4/verbs.c | 1255
-rw-r--r-- | contrib/ofed/libmlx4/wqe.h | 149
13 files changed, 4682 insertions, 0 deletions
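The files below form the mlx4 userspace verbs provider. Applications do not call it directly: libibverbs matches the device at open time (see mlx4_driver_init() in mlx4.c below) and dispatches through the registered ops table. As a rough orientation, a minimal consumer sketch, assuming a standard libibverbs installation; the CQ depth and the omitted error handling are illustrative only:

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **list = ibv_get_device_list(NULL);
	if (!list || !list[0])
		return 1;

	/* Opening an mlx4 HCA lands in mlx4_init_context() below. */
	struct ibv_context *ctx = ibv_open_device(list[0]);

	struct ibv_pd *pd = ibv_alloc_pd(ctx);			/* -> mlx4_alloc_pd() */
	struct ibv_cq *cq = ibv_create_cq(ctx, 256, NULL, NULL, 0);	/* -> mlx4_create_cq() */

	struct ibv_wc wc;
	int n = ibv_poll_cq(cq, 1, &wc);			/* -> mlx4_poll_cq() in cq.c */
	printf("polled %d completion(s)\n", n);

	ibv_destroy_cq(cq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}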
diff --git a/contrib/ofed/libmlx4/buf.c b/contrib/ofed/libmlx4/buf.c new file mode 100644 index 000000000000..9b41e7f62525 --- /dev/null +++ b/contrib/ofed/libmlx4/buf.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <errno.h> +#include <sys/mman.h> + +#include "mlx4.h" + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void mlx4_free_buf(struct mlx4_buf *buf) +{ + if (buf->length) { + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); + } +} diff --git a/contrib/ofed/libmlx4/config.h b/contrib/ofed/libmlx4/config.h new file mode 100644 index 000000000000..af75292ef03e --- /dev/null +++ b/contrib/ofed/libmlx4/config.h @@ -0,0 +1,13 @@ +/* $FreeBSD$ */ + +#ifdef __LP64__ +#define SIZEOF_LONG 8 +#else +#define SIZEOF_LONG 4 +#endif + +#define VALGRIND_MAKE_MEM_DEFINED(...) 0 +#define SWITCH_FALLTHROUGH (void)0 +#define ALWAYS_INLINE __attribute__ ((__always_inline__)) +#define likely(x) __predict_true(x) +#define unlikely(x) __predict_false(x) diff --git a/contrib/ofed/libmlx4/cq.c b/contrib/ofed/libmlx4/cq.c new file mode 100644 index 000000000000..aa2ec1e9636a --- /dev/null +++ b/contrib/ofed/libmlx4/cq.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include <infiniband/opcode.h> + +#include "mlx4.h" +#include "doorbell.h" + +enum { + MLX4_CQ_DOORBELL = 0x20 +}; + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MLX4_CQ_DB_REQ_NOT (2 << 24) + +enum { + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +struct mlx4_err_cqe { + uint32_t vlan_my_qpn; + uint32_t reserved1[5]; + uint16_t wqe_index; + uint8_t vendor_err; + uint8_t syndrome; + uint8_t reserved2[3]; + uint8_t owner_sr_opcode; +}; + +static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry) +{ + return cq->buf.buf + entry * cq->cqe_size; +} + +static void *get_sw_cqe(struct mlx4_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? 
NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static enum ibv_wc_status mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) + printf(PFX "local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + htobe32(cqe->vlan_my_qpn), htobe32(cqe->wqe_index), + cqe->vendor_err, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + return IBV_WC_LOC_LEN_ERR; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + return IBV_WC_LOC_QP_OP_ERR; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + return IBV_WC_LOC_PROT_ERR; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + return IBV_WC_WR_FLUSH_ERR; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + return IBV_WC_MW_BIND_ERR; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + return IBV_WC_BAD_RESP_ERR; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + return IBV_WC_LOC_ACCESS_ERR; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + return IBV_WC_REM_INV_REQ_ERR; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + return IBV_WC_REM_ACCESS_ERR; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + return IBV_WC_REM_OP_ERR; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + return IBV_WC_RETRY_EXC_ERR; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + return IBV_WC_RNR_RETRY_EXC_ERR; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + return IBV_WC_REM_ABORT_ERR; + default: + return IBV_WC_GENERAL_ERR; + } +} + +static inline void handle_good_req(struct ibv_wc *wc, struct mlx4_cqe *cqe) +{ + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IBV_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IBV_WC_LOCAL_INV; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + /* assume it's a send completion */ + wc->opcode = IBV_WC_SEND; + break; + } +} + +static inline int mlx4_get_next_cqe(struct mlx4_cq *cq, + struct mlx4_cqe **pcqe) + ALWAYS_INLINE; +static inline int mlx4_get_next_cqe(struct mlx4_cq *cq, + struct mlx4_cqe **pcqe) +{ + struct mlx4_cqe *cqe; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + if (cq->cqe_size == 64) + ++cqe; + + ++cq->cons_index; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. 
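+ * Otherwise the CPU could speculatively load CQE fields before the
+ * owner bit is observed and hand back stale completion data.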
+ */ + udma_from_device_barrier(); + + *pcqe = cqe; + + return CQ_OK; +} + +static inline int mlx4_parse_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc, int lazy) + ALWAYS_INLINE; +static inline int mlx4_parse_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc, int lazy) +{ + struct mlx4_wq *wq; + struct mlx4_srq *srq; + uint32_t qpn; + uint32_t g_mlpath_rqpn; + uint64_t *pwr_id; + uint16_t wqe_index; + struct mlx4_err_cqe *ecqe; + struct mlx4_context *mctx; + int is_error; + int is_send; + enum ibv_wc_status *pstatus; + + mctx = to_mctx(cq->ibv_cq.context); + qpn = be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + if (lazy) { + cq->cqe = cqe; + cq->flags &= (~MLX4_CQ_FLAGS_RX_CSUM_VALID); + } else + wc->qp_num = qpn; + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) { + /* + * We do not have to take the XSRQ table lock here, + * because CQs will be locked while SRQs are removed + * from the table. + */ + srq = mlx4_find_xsrq(&mctx->xsrq_table, + be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); + if (!srq) + return CQ_POLL_ERR; + } else { + if (!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mlx4_find_qp(mctx, qpn); + if (!*cur_qp) + return CQ_POLL_ERR; + } + srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL; + } + + pwr_id = lazy ? &cq->ibv_cq.wr_id : &wc->wr_id; + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = be16toh(cqe->wqe_index); + wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail); + *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if (srq) { + wqe_index = be16toh(cqe->wqe_index); + *pwr_id = srq->wrid[wqe_index]; + mlx4_free_srq_wqe(srq, wqe_index); + } else { + wq = &(*cur_qp)->rq; + *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + pstatus = lazy ? &cq->ibv_cq.status : &wc->status; + if (is_error) { + ecqe = (struct mlx4_err_cqe *)cqe; + *pstatus = mlx4_handle_error_cqe(ecqe); + if (!lazy) + wc->vendor_err = ecqe->vendor_err; + return CQ_OK; + } + + *pstatus = IBV_WC_SUCCESS; + if (lazy) { + if (!is_send) + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) + cq->flags |= MLX4_CQ_FLAGS_RX_CSUM_VALID; + } else if (is_send) { + handle_good_req(wc, cqe); + } else { + wc->byte_len = be32toh(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IBV_WC_RECV; + wc->wc_flags |= IBV_WC_WITH_INV; + wc->imm_data = be32toh(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + } + + wc->slid = be16toh(cqe->rlid); + g_mlpath_rqpn = be32toh(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? 
IBV_WC_GRH : 0; + wc->pkey_index = be32toh(cqe->immed_rss_invalid) & 0x7f; + /* When working with xrc srqs, don't have qp to check link layer. + * Using IB SL, should consider Roce. (TBD) + */ + if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) + wc->sl = be16toh(cqe->sl_vid) >> 13; + else + wc->sl = be16toh(cqe->sl_vid) >> 12; + + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) { + wc->wc_flags |= ((cqe->status & htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) == + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) << + IBV_WC_IP_CSUM_OK_SHIFT; + } + } + + return CQ_OK; +} + +static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe) + ALWAYS_INLINE; +static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe) +{ + return mlx4_parse_cqe(cq, cqe, &cq->cur_qp, NULL, 1); +} + +static inline int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) + ALWAYS_INLINE; +static inline int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) +{ + struct mlx4_cqe *cqe; + int err; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) + return err; + + return mlx4_parse_cqe(cq, cqe, cur_qp, wc, 0); +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_qp *qp = NULL; + int npolled; + int err = CQ_OK; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = mlx4_poll_one(cq, &qp, wc + npolled); + if (err != CQ_OK) + break; + } + + if (npolled || err == CQ_POLL_ERR) + mlx4_update_cons_index(cq); + + pthread_spin_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? err : npolled; +} + +static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock) + ALWAYS_INLINE; +static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + mlx4_update_cons_index(cq); + + if (lock) + pthread_spin_unlock(&cq->lock); +} + +static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr, + int lock) + ALWAYS_INLINE; +static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr, + int lock) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_cqe *cqe; + int err; + + if (unlikely(attr->comp_mask)) + return EINVAL; + + if (lock) + pthread_spin_lock(&cq->lock); + + cq->cur_qp = NULL; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) { + if (lock) + pthread_spin_unlock(&cq->lock); + return ENOENT; + } + + err = mlx4_parse_lazy_cqe(cq, cqe); + if (lock && err) + pthread_spin_unlock(&cq->lock); + + return err; +} + +static int mlx4_next_poll(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_cqe *cqe; + int err; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) + return ENOENT; + + return mlx4_parse_lazy_cqe(cq, cqe); +} + +static void mlx4_end_poll(struct ibv_cq_ex *ibcq) +{ + _mlx4_end_poll(ibcq, 0); +} + +static void mlx4_end_poll_lock(struct ibv_cq_ex *ibcq) +{ + _mlx4_end_poll(ibcq, 1); +} + +static int mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return _mlx4_start_poll(ibcq, attr, 0); +} + +static int mlx4_start_poll_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return _mlx4_start_poll(ibcq, attr, 1); +} + +static enum ibv_wc_opcode mlx4_cq_read_wc_opcode(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + 
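+ /* cq->cqe was cached by the preceding start_poll()/next_poll() call. */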
+ if (cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK) { + switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + case MLX4_OPCODE_RDMA_WRITE: + return IBV_WC_RDMA_WRITE; + case MLX4_OPCODE_SEND_INVAL: + case MLX4_OPCODE_SEND_IMM: + case MLX4_OPCODE_SEND: + return IBV_WC_SEND; + case MLX4_OPCODE_RDMA_READ: + return IBV_WC_RDMA_READ; + case MLX4_OPCODE_ATOMIC_CS: + return IBV_WC_COMP_SWAP; + case MLX4_OPCODE_ATOMIC_FA: + return IBV_WC_FETCH_ADD; + case MLX4_OPCODE_LOCAL_INVAL: + return IBV_WC_LOCAL_INV; + case MLX4_OPCODE_BIND_MW: + return IBV_WC_BIND_MW; + } + } else { + switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + return IBV_WC_RECV_RDMA_WITH_IMM; + case MLX4_RECV_OPCODE_SEND_INVAL: + case MLX4_RECV_OPCODE_SEND_IMM: + case MLX4_RECV_OPCODE_SEND: + return IBV_WC_RECV; + } + } + + return 0; +} + +static uint32_t mlx4_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; +} + +static int mlx4_cq_read_wc_flags(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + int is_send = cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + int wc_flags = 0; + + if (is_send) { + switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + case MLX4_OPCODE_SEND_IMM: + wc_flags |= IBV_WC_WITH_IMM; + break; + } + } else { + if (cq->flags & MLX4_CQ_FLAGS_RX_CSUM_VALID) + wc_flags |= ((cq->cqe->status & + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) == + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) << + IBV_WC_IP_CSUM_OK_SHIFT; + + switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + case MLX4_RECV_OPCODE_SEND_IMM: + wc_flags |= IBV_WC_WITH_IMM; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc_flags |= IBV_WC_WITH_INV; + break; + } + wc_flags |= (be32toh(cq->cqe->g_mlpath_rqpn) & 0x80000000) ? 
IBV_WC_GRH : 0; + } + + return wc_flags; +} + +static uint32_t mlx4_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->byte_cnt); +} + +static uint32_t mlx4_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_err_cqe *ecqe = (struct mlx4_err_cqe *)cq->cqe; + + return ecqe->vendor_err; +} + +static uint32_t mlx4_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + switch (cq->cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_SEND_INVAL: + return be32toh(cq->cqe->immed_rss_invalid); + default: + return cq->cqe->immed_rss_invalid; + } +} + +static uint32_t mlx4_cq_read_wc_slid(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (uint32_t)be16toh(cq->cqe->rlid); +} + +static uint8_t mlx4_cq_read_wc_sl(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + if ((cq->cur_qp) && (cq->cur_qp->link_layer == IBV_LINK_LAYER_ETHERNET)) + return be16toh(cq->cqe->sl_vid) >> 13; + else + return be16toh(cq->cqe->sl_vid) >> 12; +} + +static uint32_t mlx4_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->g_mlpath_rqpn) & 0xffffff; +} + +static uint8_t mlx4_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (be32toh(cq->cqe->g_mlpath_rqpn) >> 24) & 0x7f; +} + +static uint64_t mlx4_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return ((uint64_t)be32toh(cq->cqe->ts_47_16) << 16) | + (cq->cqe->ts_15_8 << 8) | + (cq->cqe->ts_7_0); +} + +void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr) +{ + + if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) { + cq->ibv_cq.start_poll = mlx4_start_poll; + cq->ibv_cq.end_poll = mlx4_end_poll; + } else { + cq->ibv_cq.start_poll = mlx4_start_poll_lock; + cq->ibv_cq.end_poll = mlx4_end_poll_lock; + } + cq->ibv_cq.next_poll = mlx4_next_poll; + + cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode; + cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err; + cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) + cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM) + cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) + cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) + cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID) + cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL) + cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) + cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) + cq->ibv_cq.read_completion_ts = mlx4_cq_read_wc_completion_ts; +} + +int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mlx4_cq *cq = to_mcq(ibvcq); + uint32_t doorbell[2]; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + cmd = solicited ? 
MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + *cq->arm_db = htobe32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + udma_to_device_barrier(); + + doorbell[0] = htobe32(sn << 28 | cmd | cq->cqn); + doorbell[1] = htobe32(ci); + + mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL); + + return 0; +} + +void mlx4_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + struct mlx4_cqe *cqe, *dest; + uint32_t prod_index; + uint8_t owner_bit; + int nfreed = 0; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe += cqe_inc; + if (srq && srq->ext_srq && + (be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num && + !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) { + mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index)); + ++nfreed; + } else if ((be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + udma_to_device_barrier(); + mlx4_update_cons_index(cq); + } +} + +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __mlx4_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) +{ + uint32_t i; + + for (i = cq->cons_index; get_sw_cqe(cq, i); ++i) + ; + + return i - cq->cons_index; +} + +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe) +{ + struct mlx4_cqe *cqe; + int i; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + i = cq->cons_index; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->ibv_cq.cqe + 1)) ? 
MLX4_CQE_OWNER_MASK : 0); + memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size, + cqe - cqe_inc, cq->cqe_size); + ++i; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + } + + ++cq->cons_index; +} + +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size) +{ + if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size), + dev->page_size)) + return -1; + memset(buf->buf, 0, nent * entry_size); + + return 0; +} diff --git a/contrib/ofed/libmlx4/dbrec.c b/contrib/ofed/libmlx4/dbrec.c new file mode 100644 index 000000000000..3e875738fa61 --- /dev/null +++ b/contrib/ofed/libmlx4/dbrec.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" + +struct mlx4_db_page { + struct mlx4_db_page *prev, *next; + struct mlx4_buf buf; + int num_db; + int use_cnt; + unsigned long free[0]; +}; + +static const int db_size[] = { + [MLX4_DB_TYPE_CQ] = 8, + [MLX4_DB_TYPE_RQ] = 4, +}; + +static struct mlx4_db_page *__add_page(struct mlx4_context *context, + enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + int ps = to_mdev(context->ibv_ctx.device)->page_size; + int pp; + int i; + + pp = ps / db_size[type]; + + page = malloc(sizeof *page + pp / 8); + if (!page) + return NULL; + + if (mlx4_alloc_buf(&page->buf, ps, ps)) { + free(page); + return NULL; + } + + page->num_db = pp; + page->use_cnt = 0; + for (i = 0; i < pp / (sizeof (long) * 8); ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = context->db_list[type]; + context->db_list[type] = page; + if (page->next) + page->next->prev = page; + + return page; +} + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + uint32_t *db = NULL; + int i, j; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (page->use_cnt < page->num_db) + goto found; + + page = __add_page(context, type); + if (!page) + goto out; + +found: + ++page->use_cnt; + + for (i = 0; !page->free[i]; ++i) + /* nothing */; + + j = ffsl(page->free[i]); + page->free[i] &= ~(1UL << (j - 1)); + db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type]; + +out: + pthread_mutex_unlock(&context->db_list_mutex); + + return db; +} + +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db) +{ + struct mlx4_db_page *page; + uintptr_t ps = to_mdev(context->ibv_ctx.device)->page_size; + int i; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf) + break; + + if (!page) + goto out; + + i = ((void *) db - page->buf.buf) / db_size[type]; + page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long))); + + if (!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + context->db_list[type] = page->next; + if (page->next) + page->next->prev = page->prev; + + mlx4_free_buf(&page->buf); + free(page); + } + +out: + pthread_mutex_unlock(&context->db_list_mutex); +} diff --git a/contrib/ofed/libmlx4/doorbell.h b/contrib/ofed/libmlx4/doorbell.h new file mode 100644 index 000000000000..140a6158d7f2 --- /dev/null +++ b/contrib/ofed/libmlx4/doorbell.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef DOORBELL_H +#define DOORBELL_H + +#include <stdint.h> +#include <pthread.h> +#include "mlx4.h" +#include "mmio.h" + +struct mlx4_context; + +#if SIZEOF_LONG == 8 + +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) +#elif __BYTE_ORDER == __BIG_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1]) +#else +# error __BYTE_ORDER not defined +#endif + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + mmio_writeq((unsigned long)(ctx->uar + offset), MLX4_PAIR_TO_64(val)); +} + +#else + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + pthread_spin_lock(&ctx->uar_lock); + mmio_writel((unsigned long)(ctx->uar + offset), val[0]); + mmio_writel((unsigned long)(ctx->uar + offset + 4), val[1]); + pthread_spin_unlock(&ctx->uar_lock); +} + +#endif + +#endif /* DOORBELL_H */ diff --git a/contrib/ofed/libmlx4/mlx4-abi.h b/contrib/ofed/libmlx4/mlx4-abi.h new file mode 100644 index 000000000000..7d89505606e2 --- /dev/null +++ b/contrib/ofed/libmlx4/mlx4-abi.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_ABI_H +#define MLX4_ABI_H + +#include <infiniband/kern-abi.h> + +#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MAX_ABI_VERSION 4 + +#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 + +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 +}; + +struct mlx4_alloc_ucontext_resp_v3 { + struct ibv_get_context_resp ibv_resp; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +enum mlx4_query_dev_ex_resp_mask { + MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + +struct mlx4_alloc_ucontext_resp { + struct ibv_get_context_resp ibv_resp; + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_alloc_pd_resp { + struct ibv_alloc_pd_resp ibv_resp; + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_create_cq { + struct ibv_create_cq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_cq_resp { + struct ibv_create_cq_resp ibv_resp; + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_create_cq_ex { + struct ibv_create_cq_ex ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_cq_resp_ex { + struct ibv_create_cq_resp_ex ibv_resp; + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_resize_cq { + struct ibv_resize_cq ibv_cmd; + __u64 buf_addr; +}; + +struct mlx4_query_device_ex_resp { + struct ibv_query_device_resp_ex ibv_resp; + __u32 comp_mask; + __u32 response_length; + __u64 hca_core_clock_offset; +}; + +struct mlx4_query_device_ex { + struct ibv_query_device_ex ibv_cmd; +}; + +struct mlx4_create_srq { + struct ibv_create_srq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_xsrq { + struct ibv_create_xsrq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_srq_resp { + struct ibv_create_srq_resp ibv_resp; + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_create_qp { + struct ibv_create_qp ibv_cmd; + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; /* was reserved in ABI 2 */ + __u8 reserved[5]; +}; + +struct mlx4_create_qp_drv_ex { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; /* was reserved in ABI 2 */ + __u8 reserved[5]; +}; + +struct mlx4_create_qp_ex { + struct ibv_create_qp_ex ibv_cmd; + struct mlx4_create_qp_drv_ex drv_ex; +}; + +struct mlx4_create_qp_resp_ex { + struct ibv_create_qp_resp_ex ibv_resp; +}; + +#endif /* MLX4_ABI_H */ diff --git a/contrib/ofed/libmlx4/mlx4.c b/contrib/ofed/libmlx4/mlx4.c new file mode 100644 index 000000000000..229c2670b5ed --- /dev/null +++ b/contrib/ofed/libmlx4/mlx4.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "mlx4-abi.h" + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#define HCA(v, d) \ + { .vendor = PCI_VENDOR_ID_##v, \ + .device = d } + +static struct { + unsigned vendor; + unsigned device; +} hca_table[] = { + HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */ + HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */ + HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ + HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ + HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ +}; + +static struct ibv_context_ops mlx4_ctx_ops = { + .query_device = mlx4_query_device, + .query_port = mlx4_query_port, + .alloc_pd = mlx4_alloc_pd, + .dealloc_pd = mlx4_free_pd, + .reg_mr = mlx4_reg_mr, + .rereg_mr = mlx4_rereg_mr, + .dereg_mr = mlx4_dereg_mr, + .alloc_mw = mlx4_alloc_mw, + .dealloc_mw = mlx4_dealloc_mw, + .bind_mw = mlx4_bind_mw, + .create_cq = mlx4_create_cq, + .poll_cq = mlx4_poll_cq, + .req_notify_cq = mlx4_arm_cq, + .cq_event = mlx4_cq_event, + .resize_cq = mlx4_resize_cq, + .destroy_cq = mlx4_destroy_cq, + .create_srq = mlx4_create_srq, + .modify_srq = mlx4_modify_srq, + .query_srq = mlx4_query_srq, + .destroy_srq = mlx4_destroy_srq, + .post_srq_recv = mlx4_post_srq_recv, + .create_qp = mlx4_create_qp, + .query_qp = mlx4_query_qp, + .modify_qp = mlx4_modify_qp, + .destroy_qp = mlx4_destroy_qp, + .post_send = mlx4_post_send, + .post_recv = mlx4_post_recv, + .create_ah = mlx4_create_ah, + .destroy_ah = mlx4_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +static int mlx4_map_internal_clock(struct 
mlx4_device *mdev, + struct ibv_context *ibv_ctx) +{ + struct mlx4_context *context = to_mctx(ibv_ctx); + void *hca_clock_page; + + hca_clock_page = mmap(NULL, mdev->page_size, + PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, + mdev->page_size * 3); + + if (hca_clock_page == MAP_FAILED) { + fprintf(stderr, PFX + "Warning: Timestamp available,\n" + "but failed to mmap() hca core clock page.\n"); + return -1; + } + + context->hca_core_clock = hca_clock_page + + (context->core_clock.offset & (mdev->page_size - 1)); + return 0; +} + +static int mlx4_init_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx, int cmd_fd) +{ + struct mlx4_context *context; + struct ibv_get_context cmd; + struct mlx4_alloc_ucontext_resp resp; + int i; + struct mlx4_alloc_ucontext_resp_v3 resp_v3; + __u16 bf_reg_size; + struct mlx4_device *dev = to_mdev(&v_device->device); + struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); + struct ibv_device_attr_ex dev_attrs; + + /* memory footprint of mlx4_context and verbs_context share + * struct ibv_context. + */ + context = to_mctx(ibv_ctx); + ibv_ctx->cmd_fd = cmd_fd; + + if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd, + &resp_v3.ibv_resp, sizeof resp_v3)) + return errno; + + context->num_qps = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->cqe_size = sizeof (struct mlx4_cqe); + } else { + if (ibv_cmd_get_context(ibv_ctx, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) + return errno; + + context->num_qps = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE) + context->cqe_size = resp.cqe_size; + else + context->cqe_size = sizeof (struct mlx4_cqe); + } + + context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; + context->qp_table_mask = (1 << context->qp_table_shift) - 1; + for (i = 0; i < MLX4_PORTS_NUM; ++i) + context->port_query_cache[i].valid = 0; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) + context->db_list[i] = NULL; + + mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps); + pthread_mutex_init(&context->db_list_mutex, NULL); + + context->uar = mmap(NULL, dev->page_size, PROT_WRITE, + MAP_SHARED, cmd_fd, 0); + if (context->uar == MAP_FAILED) + return errno; + + if (bf_reg_size) { + context->bf_page = mmap(NULL, dev->page_size, + PROT_WRITE, MAP_SHARED, cmd_fd, + dev->page_size); + if (context->bf_page == MAP_FAILED) { + fprintf(stderr, PFX "Warning: BlueFlame available, " + "but failed to mmap() BlueFlame page.\n"); + context->bf_page = NULL; + context->bf_buf_size = 0; + } else { + context->bf_buf_size = bf_reg_size / 2; + context->bf_offset = 0; + pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + } + } else { + context->bf_page = NULL; + context->bf_buf_size = 0; + } + + pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + ibv_ctx->ops = mlx4_ctx_ops; + + context->hca_core_clock = NULL; + memset(&dev_attrs, 0, sizeof(dev_attrs)); + if (!mlx4_query_device_ex(ibv_ctx, NULL, &dev_attrs, + sizeof(struct ibv_device_attr_ex))) { + context->max_qp_wr = dev_attrs.orig_attr.max_qp_wr; + context->max_sge = dev_attrs.orig_attr.max_sge; + if (context->core_clock.offset_valid) + mlx4_map_internal_clock(dev, ibv_ctx); + } + + verbs_ctx->has_comp_mask = VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ | + VERBS_CONTEXT_QP; + 
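+ /* Wire up the extended-verbs entry points this provider implements. */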
verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd); + verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd); + verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex); + verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num); + verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex); + verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp); + verbs_set_ctx_op(verbs_ctx, ibv_create_flow, ibv_cmd_create_flow); + verbs_set_ctx_op(verbs_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow); + verbs_set_ctx_op(verbs_ctx, create_cq_ex, mlx4_create_cq_ex); + verbs_set_ctx_op(verbs_ctx, query_device_ex, mlx4_query_device_ex); + verbs_set_ctx_op(verbs_ctx, query_rt_values, mlx4_query_rt_values); + + return 0; + +} + +static void mlx4_uninit_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx) +{ + struct mlx4_context *context = to_mctx(ibv_ctx); + + munmap(context->uar, to_mdev(&v_device->device)->page_size); + if (context->bf_page) + munmap(context->bf_page, to_mdev(&v_device->device)->page_size); + if (context->hca_core_clock) + munmap(context->hca_core_clock - context->core_clock.offset, + to_mdev(&v_device->device)->page_size); +} + +static struct verbs_device_ops mlx4_dev_ops = { + .init_context = mlx4_init_context, + .uninit_context = mlx4_uninit_context, +}; + +static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version) +{ + char value[8]; + struct mlx4_device *dev; + unsigned vendor, device; + int i; + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", + value, sizeof value) < 0) + return NULL; + vendor = strtol(value, NULL, 16); + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", + value, sizeof value) < 0) + return NULL; + device = strtol(value, NULL, 16); + + for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) + if (vendor == hca_table[i].vendor && + device == hca_table[i].device) + goto found; + + return NULL; + +found: + if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION || + abi_version > MLX4_UVERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " + "(min supported %d, max supported %d)\n", + abi_version, uverbs_sys_path, + MLX4_UVERBS_MIN_ABI_VERSION, + MLX4_UVERBS_MAX_ABI_VERSION); + return NULL; + } + + dev = calloc(1, sizeof *dev); + if (!dev) { + fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", + uverbs_sys_path); + return NULL; + } + + dev->page_size = sysconf(_SC_PAGESIZE); + dev->abi_version = abi_version; + + dev->verbs_dev.ops = &mlx4_dev_ops; + dev->verbs_dev.sz = sizeof(*dev); + dev->verbs_dev.size_of_context = + sizeof(struct mlx4_context) - sizeof(struct ibv_context); + + return &dev->verbs_dev; +} + +static __attribute__((constructor)) void mlx4_register_driver(void) +{ + verbs_register_driver("mlx4", mlx4_driver_init); +} diff --git a/contrib/ofed/libmlx4/mlx4.h b/contrib/ofed/libmlx4/mlx4.h new file mode 100644 index 000000000000..864ef9eccc60 --- /dev/null +++ b/contrib/ofed/libmlx4/mlx4.h @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_H +#define MLX4_H + +#include <infiniband/endian.h> +#include <stddef.h> + +#include <infiniband/driver.h> +#include <infiniband/udma_barrier.h> +#include <infiniband/verbs.h> + +#define MLX4_PORTS_NUM 2 + +#define PFX "mlx4: " + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum { + MLX4_QP_TABLE_BITS = 8, + MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, + MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 +}; + +#define MLX4_REMOTE_SRQN_FLAGS(wr) htobe32(wr->qp_type.xrc.remote_srqn << 8) + +enum { + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +enum { + MLX4_XRC_QPN_BIT = (1 << 23) +}; + +enum mlx4_db_type { + MLX4_DB_TYPE_CQ, + MLX4_DB_TYPE_RQ, + MLX4_NUM_DB_TYPE +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +struct mlx4_device { + struct verbs_device verbs_dev; + int page_size; + int abi_version; +}; + +struct mlx4_db_page; + +struct mlx4_context { + struct ibv_context ibv_ctx; + + void *uar; + pthread_spinlock_t uar_lock; + + void *bf_page; + int bf_buf_size; + int bf_offset; + pthread_spinlock_t bf_lock; + + struct { + struct mlx4_qp **table; + int refcnt; + } qp_table[MLX4_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; + int max_qp_wr; + int max_sge; + + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; + pthread_mutex_t 
db_list_mutex; + int cqe_size; + struct mlx4_xsrq_table xsrq_table; + struct { + uint8_t valid; + uint8_t link_layer; + enum ibv_port_cap_flags caps; + } port_query_cache[MLX4_PORTS_NUM]; + struct { + uint64_t offset; + uint8_t offset_valid; + } core_clock; + void *hca_core_clock; +}; + +struct mlx4_buf { + void *buf; + size_t length; +}; + +struct mlx4_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +enum { + MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0, + MLX4_CQ_FLAGS_EXTENDED = 1 << 1, + MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2, +}; + +struct mlx4_cq { + struct ibv_cq_ex ibv_cq; + struct mlx4_buf buf; + struct mlx4_buf resize_buf; + pthread_spinlock_t lock; + uint32_t cqn; + uint32_t cons_index; + uint32_t *set_ci_db; + uint32_t *arm_db; + int arm_sn; + int cqe_size; + struct mlx4_qp *cur_qp; + struct mlx4_cqe *cqe; + uint32_t flags; +}; + +struct mlx4_srq { + struct verbs_srq verbs_srq; + struct mlx4_buf buf; + pthread_spinlock_t lock; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int head; + int tail; + uint32_t *db; + uint16_t counter; + uint8_t ext_srq; +}; + +struct mlx4_wq { + uint64_t *wrid; + pthread_spinlock_t lock; + int wqe_cnt; + int max_post; + unsigned head; + unsigned tail; + int max_gs; + int wqe_shift; + int offset; +}; + +struct mlx4_qp { + struct verbs_qp verbs_qp; + struct mlx4_buf buf; + int max_inline_data; + int buf_size; + + uint32_t doorbell_qpn; + uint32_t sq_signal_bits; + int sq_spare_wqes; + struct mlx4_wq sq; + + uint32_t *db; + struct mlx4_wq rq; + + uint8_t link_layer; + uint32_t qp_cap_cache; +}; + +struct mlx4_av { + uint32_t port_pd; + uint8_t reserved1; + uint8_t g_slid; + uint16_t dlid; + uint8_t reserved2; + uint8_t gid_index; + uint8_t stat_rate; + uint8_t hop_limit; + uint32_t sl_tclass_flowlabel; + uint8_t dgid[16]; +}; + +struct mlx4_ah { + struct ibv_ah ibv_ah; + struct mlx4_av av; + uint16_t vlan; + uint8_t mac[6]; +}; + +enum { + MLX4_CSUM_SUPPORT_UD_OVER_IB = (1 << 0), + MLX4_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 1), + /* Only report rx checksum when the validation is valid */ + MLX4_RX_CSUM_VALID = (1 << 16), +}; + +enum mlx4_cqe_status { + MLX4_CQE_STATUS_TCP_UDP_CSUM_OK = (1 << 2), + MLX4_CQE_STATUS_IPV4_PKT = (1 << 22), + MLX4_CQE_STATUS_IP_HDR_CSUM_OK = (1 << 28), + MLX4_CQE_STATUS_IPV4_CSUM_OK = MLX4_CQE_STATUS_IPV4_PKT | + MLX4_CQE_STATUS_IP_HDR_CSUM_OK | + MLX4_CQE_STATUS_TCP_UDP_CSUM_OK +}; + +struct mlx4_cqe { + uint32_t vlan_my_qpn; + uint32_t immed_rss_invalid; + uint32_t g_mlpath_rqpn; + union { + struct { + uint16_t sl_vid; + uint16_t rlid; + }; + uint32_t ts_47_16; + }; + uint32_t status; + uint32_t byte_cnt; + uint16_t wqe_index; + uint16_t checksum; + uint8_t reserved3; + uint8_t ts_15_8; + uint8_t ts_7_0; + uint8_t owner_sr_opcode; +}; + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} +int align_queue_size(int req); + +#define to_mxxx(xxx, type) \ + ((struct mlx4_##type *) \ + ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx))) + +static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) +{ + /* ibv_device is first field of verbs_device + * see try_driver() in libibverbs. 
+ */ + return container_of(ibdev, struct mlx4_device, verbs_dev); +} + +static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) +{ + return to_mxxx(ctx, context); +} + +static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) +{ + return to_mxxx(pd, pd); +} + +static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) +{ + return to_mxxx(cq, cq); +} + +static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) +{ + return container_of(container_of(ibsrq, struct verbs_srq, srq), + struct mlx4_srq, verbs_srq); +} + +static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) +{ + return container_of(container_of(ibqp, struct verbs_qp, qp), + struct mlx4_qp, verbs_qp); +} + +static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +static inline void mlx4_update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htobe32(cq->cons_index & 0xffffff); +} + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); +void mlx4_free_buf(struct mlx4_buf *buf); + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); + +int mlx4_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mlx4_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); +int mlx4_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values); +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); +int mlx4_free_pd(struct ibv_pd *pd); +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr); +int mlx4_close_xrcd(struct ibv_xrcd *xrcd); + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access); +int mlx4_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access); +int mlx4_dereg_mr(struct ibv_mr *mr); + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); +int mlx4_dealloc_mw(struct ibv_mw *mw); +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr); +void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr); +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size); +int mlx4_resize_cq(struct ibv_cq *cq, int cqe); +int mlx4_destroy_cq(struct ibv_cq *cq); +int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_arm_cq(struct ibv_cq *cq, int solicited); +void mlx4_cq_event(struct ibv_cq *cq); +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq); +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe); + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex 
*attr_ex); +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_destroy_xrc_srq(struct ibv_srq *srq); +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq); +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr); +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); +int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mlx4_destroy_qp(struct ibv_qp *qp); +void mlx4_init_qp_indices(struct mlx4_qp *qp); +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp); +int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp); +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mlx4_destroy_ah(struct ibv_ah *ah); +int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, + struct mlx4_ah *ah); +void mlx4_free_av(struct mlx4_ah *ah); + +#endif /* MLX4_H */ diff --git a/contrib/ofed/libmlx4/mmio.h b/contrib/ofed/libmlx4/mmio.h new file mode 100644 index 000000000000..a1a296658fdb --- /dev/null +++ b/contrib/ofed/libmlx4/mmio.h @@ -0,0 +1,116 @@ +/* Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md + */ +#ifndef MMIO_H +#define MMIO_H + +#include <unistd.h> +#include <sys/syscall.h> +#ifdef __s390x__ + +static inline long mmio_writeb(const unsigned long mmio_addr, + const uint8_t val) +{ + return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val)); +} + +static inline long mmio_writew(const unsigned long mmio_addr, + const uint16_t val) +{ + return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val)); +} + +static inline long mmio_writel(const unsigned long mmio_addr, + const uint32_t val) +{ + return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val)); +} + +static inline long mmio_writeq(const unsigned long mmio_addr, + const uint64_t val) +{ + return syscall(__NR_s390_pci_mmio_write, mmio_addr, &val, sizeof(val)); +} + +static inline long mmio_write(const unsigned long 
mmio_addr, + const void *val, + const size_t length) +{ + return syscall(__NR_s390_pci_mmio_write, mmio_addr, val, length); +} + +static inline long mmio_readb(const unsigned long mmio_addr, uint8_t *val) +{ + return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val)); +} + +static inline long mmio_readw(const unsigned long mmio_addr, uint16_t *val) +{ + return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val)); +} + +static inline long mmio_readl(const unsigned long mmio_addr, uint32_t *val) +{ + return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val)); +} + +static inline long mmio_readq(const unsigned long mmio_addr, uint64_t *val) +{ + return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, sizeof(*val)); +} + +static inline long mmio_read(const unsigned long mmio_addr, + void *val, + const size_t length) +{ + return syscall(__NR_s390_pci_mmio_read, mmio_addr, val, length); +} + +static inline void mlx4_bf_copy(unsigned long *dst, + unsigned long *src, + unsigned bytecnt) +{ + mmio_write((unsigned long)dst, src, bytecnt); +} + +#else + +#define mmio_writeb(addr, value) \ + (*((volatile uint8_t *)addr) = value) +#define mmio_writew(addr, value) \ + (*((volatile uint16_t *)addr) = value) +#define mmio_writel(addr, value) \ + (*((volatile uint32_t *)addr) = value) +#define mmio_writeq(addr, value) \ + (*((volatile uint64_t *)addr) = value) +#define mmio_write(addr, value, length) \ + memcpy(addr, value, length) + +#define mmio_readb(addr, value) \ + (value = *((volatile uint8_t *)addr)) +#define mmio_readw(addr, value) \ + (value = *((volatile uint16_t *)addr)) +#define mmio_readl(addr, value) \ + (value = *((volatile uint32_t *)addr)) +#define mmio_readq(addr, value) \ + (value = *((volatile uint64_t *)addr)) +#define mmio_read(addr, value, length) \ + memcpy(value, addr, length) + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ +static inline void mlx4_bf_copy(unsigned long *dst, + unsigned long *src, + unsigned bytecnt) +{ + while (bytecnt > 0) { + *dst++ = *src++; + *dst++ = *src++; + bytecnt -= 2 * sizeof(long); + } +} +#endif + +#endif diff --git a/contrib/ofed/libmlx4/qp.c b/contrib/ofed/libmlx4/qp.c new file mode 100644 index 000000000000..577aab5287ab --- /dev/null +++ b/contrib/ofed/libmlx4/qp.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> + +#include "mlx4.h" +#include "doorbell.h" +#include "wqe.h" + +static const uint32_t mlx4_ib_opcode[] = { + [IBV_WR_SEND] = MLX4_OPCODE_SEND, + [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, + [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE, + [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM, + [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ, + [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS, + [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, + [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL, + [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW, + [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL, +}; + +static void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_qp *qp, int n) +{ + uint32_t *wqe = get_send_wqe(qp, n); + int i; + int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2; + + for (i = 16; i < ds; i += 16) + wqe[i] = 0xffffffff; +} + +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = htobe32(1 << 31); + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i); + } +} + +static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr) +{ + int acc = wr->bind_mw.bind_info.mw_access_flags; + bseg->flags1 = 0; + if (acc & IBV_ACCESS_REMOTE_ATOMIC) + bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC); + if (acc & IBV_ACCESS_REMOTE_WRITE) + bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE); + if (acc & IBV_ACCESS_REMOTE_READ) + bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ); + + bseg->flags2 = 0; + if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2) + bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2); + if (acc & IBV_ACCESS_ZERO_BASED) + bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED); + + bseg->new_rkey = htobe32(wr->bind_mw.rkey); + bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey); + bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr); + bseg->length = htobe64(wr->bind_mw.bind_info.length); +} + +static inline void set_local_inv_seg(struct 
mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey)
+{
+ iseg->mem_key = htobe32(rkey);
+
+ iseg->reserved1 = 0;
+ iseg->reserved2 = 0;
+ iseg->reserved3[0] = 0;
+ iseg->reserved3[1] = 0;
+}
+
+static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ uint64_t remote_addr, uint32_t rkey)
+{
+ rseg->raddr = htobe64(remote_addr);
+ rseg->rkey = htobe32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
+{
+ if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = htobe64(wr->wr.atomic.swap);
+ aseg->compare = htobe64(wr->wr.atomic.compare_add);
+ } else {
+ aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ibv_send_wr *wr)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
+ dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
+ dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->byte_count = htobe32(sg->length);
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+{
+ dseg->lkey = htobe32(sg->lkey);
+ dseg->addr = htobe64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ udma_to_device_barrier();
+
+ if (likely(sg->length))
+ dseg->byte_count = htobe32(sg->length);
+ else
+ dseg->byte_count = htobe32(0x80000000);
+}
+
+int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx4_context *ctx;
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl = NULL;
+ int ind;
+ int nreq;
+ int inl = 0;
+ int ret = 0;
+ int size = 0;
+ int i;
+
+ pthread_spin_lock(&qp->sq.lock);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IBV_SEND_SIGNALED ?
+ htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IBV_SEND_SOLICITED ?
+ htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
+ qp->sq_signal_bits;
+
+ if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ctrl->imm = wr->imm_data;
+ else
+ ctrl->imm = 0;
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (ibqp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ switch (wr->opcode) {
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IBV_WR_RDMA_READ:
+ inl = 1;
+ /* fall through */
+ case IBV_WR_RDMA_WRITE:
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ if (!wr->num_sge)
+ inl = 1;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+ case IBV_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->imm_data);
+ wqe += sizeof
+ (struct mlx4_wqe_local_inval_seg);
+ size += sizeof
+ (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+ case IBV_WR_BIND_MW:
+ ctrl->srcrb_flags |=
+ htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof
+ (struct mlx4_wqe_bind_seg);
+ size += sizeof
+ (struct mlx4_wqe_bind_seg) / 16;
+ break;
+ case IBV_WR_SEND_WITH_INV:
+ ctrl->imm = htobe32(wr->imm_data);
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case IBV_QPT_UD:
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ case IBV_QPT_RAW_PACKET:
+ /* For raw Ethernet, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no ICRC should be calculated */
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+
+ inl = 0;
+
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ addr = (void *) (uintptr_t) wr->sg_list[i].addr;
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (inl > qp->max_inline_data) {
+ inl = 0;
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ udma_to_device_barrier(); /* see comment below */
+ seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = sizeof *seg;
+ ++num_seg;
+ }
+
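+ /*
+ * Whatever remains of this SGE now fits within the
+ * current 64-byte chunk, so it can be appended to the
+ * open inline segment without crossing a boundary.
+ */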
+ memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + udma_to_device_barrier(); + seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len); + } + + size += (inl + num_seg * sizeof * seg + 15) / 16; + } else { + struct mlx4_wqe_data_seg *seg = wqe; + + for (i = wr->num_sge - 1; i >= 0 ; --i) + set_data_seg(seg + i, wr->sg_list + i); + + size += wr->num_sge * (sizeof *seg / 16); + } + + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + udma_to_device_barrier(); + + ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) | + (ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); + + ++ind; + } + +out: + ctx = to_mctx(ibqp->context); + + if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) { + ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8); + + ctrl->bf_qpn |= qp->doorbell_qpn; + ++qp->sq.head; + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. + */ + mmio_wc_spinlock(&ctx->bf_lock); + + mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl, + align(size * 16, 64)); + /* Flush before toggling bf_offset to be latency oriented */ + mmio_flush_writes(); + + ctx->bf_offset ^= ctx->bf_buf_size; + + pthread_spin_unlock(&ctx->bf_lock); + } else if (nreq) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + + mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL), + qp->doorbell_qpn); + } + + if (nreq) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); + + pthread_spin_unlock(&qp->sq.lock); + + return ret; +} + +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + int ret = 0; + int nreq; + int ind; + int i; + + pthread_spin_lock(&qp->rq.lock); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->rq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htobe32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (nreq) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. 
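+ * The doorbell record is a counter in host memory that
+ * the HCA fetches, so the barrier must order the WQE
+ * writes before the store to *qp->db below.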
+ */ + udma_to_device_barrier(); + + *qp->db = htobe32(qp->rq.head & 0xffff); + } + + pthread_spin_unlock(&qp->rq.lock); + + return ret; +} + +static int num_inline_segs(int data, enum ibv_qp_type type) +{ + /* + * Inline data segments are not allowed to cross 64 byte + * boundaries. For UD QPs, the data segments always start + * aligned to 64 bytes (16 byte control segment + 48 byte + * datagram segment); for other QPs, there will be a 16 byte + * control segment and possibly a 16 byte remote address + * segment, so in the worst case there will be only 32 bytes + * available for the first data segment. + */ + if (type == IBV_QPT_UD) + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg)) % + MLX4_INLINE_ALIGN; + else + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg)) % + MLX4_INLINE_ALIGN; + + return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) / + (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg)); +} + +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp) +{ + int size; + int max_sq_sge; + + max_sq_sge = align(cap->max_inline_data + + num_inline_segs(cap->max_inline_data, type) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) / + sizeof (struct mlx4_wqe_data_seg); + if (max_sq_sge < cap->max_send_sge) + max_sq_sge = cap->max_send_sge; + + size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg); + switch (type) { + case IBV_QPT_UD: + size += sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + size += sizeof (struct mlx4_wqe_raddr_seg); + break; + + case IBV_QPT_XRC_SEND: + case IBV_QPT_RC: + size += sizeof (struct mlx4_wqe_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. 
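+ * Each of those segments is 16 bytes, giving a 48-byte
+ * floor that the 16-byte control segment added below
+ * brings to one full 64-byte WQE.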
+ */ + if (size < (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg))) + size = (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + if (size < sizeof (struct mlx4_wqe_bind_seg)) + size = sizeof (struct mlx4_wqe_bind_seg); + + size += sizeof (struct mlx4_wqe_ctrl_seg); + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ +} + +int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp) +{ + qp->rq.max_gs = cap->max_recv_sge; + + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); + if (!qp->sq.wrid) + return -1; + } + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + } + + for (qp->rq.wqe_shift = 4; + 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg); + qp->rq.wqe_shift++) + ; /* nothing */ + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + if (qp->buf_size) { + if (mlx4_alloc_buf(&qp->buf, + align(qp->buf_size, to_mdev(context->device)->page_size), + to_mdev(context->device)->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + } else { + qp->buf.buf = NULL; + } + + return 0; +} + +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type) +{ + int wqe_size; + + wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg); + switch (type) { + case IBV_QPT_UD: + wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_XRC_SEND: + case IBV_QPT_UC: + case IBV_QPT_RC: + wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); + break; + + default: + break; + } + + qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg); + cap->max_send_sge = qp->sq.max_gs; + qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_wr = qp->sq.max_post; + + /* + * Inline data segments can't cross a 64 byte boundary. So + * subtract off one segment header for each 64-byte chunk, + * taking into account the fact that wqe_size will be 32 mod + * 64 for non-UD QPs. 
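+ * For example, with a 128-byte stride an RC QP gets
+ * wqe_size = 128 - 16 - 16 = 96, spanning two 64-byte
+ * chunks; subtracting two inline segment headers (4 bytes
+ * each, per wqe.h) leaves max_inline_data at 88 bytes.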
+ */ + qp->max_inline_data = wqe_size - + sizeof (struct mlx4_wqe_inline_seg) * + (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN); + cap->max_inline_data = qp->max_inline_data; +} + +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof (struct mlx4_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + return 0; +} + +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} diff --git a/contrib/ofed/libmlx4/srq.c b/contrib/ofed/libmlx4/srq.c new file mode 100644 index 000000000000..b8d25bb343da --- /dev/null +++ b/contrib/ofed/libmlx4/srq.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "doorbell.h"
+#include "wqe.h"
+#include "mlx4-abi.h"
+
+static void *get_wqe(struct mlx4_srq *srq, int n)
+{
+ return srq->buf.buf + (n << srq->wqe_shift);
+}
+
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ pthread_spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = htobe16(ind);
+ srq->tail = ind;
+
+ pthread_spin_unlock(&srq->lock);
+}
+
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+ struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr)
+{
+ struct mlx4_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ int err = 0;
+ int nreq;
+ int i;
+
+ pthread_spin_lock(&srq->lock);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (wr->num_sge > srq->max_gs) {
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (srq->head == srq->tail) {
+ /* SRQ is full */
+ err = -1;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16toh(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = htobe32(wr->sg_list[i].length);
+ scat[i].lkey = htobe32(wr->sg_list[i].lkey);
+ scat[i].addr = htobe64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (nreq) {
+ srq->counter += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * we write the doorbell record.
+ */
+ udma_to_device_barrier();
+
+ *srq->db = htobe32(srq->counter);
+ }
+
+ pthread_spin_unlock(&srq->lock);
+
+ return err;
+}
+
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+ struct mlx4_srq *srq)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scatter;
+ int size;
+ int buf_size;
+ int i;
+
+ srq->wrid = malloc(srq->max * sizeof (uint64_t));
+ if (!srq->wrid)
+ return -1;
+
+ size = sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->max_gs * sizeof (struct mlx4_wqe_data_seg);
+
+ for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift)
+ ; /* nothing */
+
+ buf_size = srq->max << srq->wqe_shift;
+
+ if (mlx4_alloc_buf(&srq->buf, buf_size,
+ to_mdev(pd->context->device)->page_size)) {
+ free(srq->wrid);
+ return -1;
+ }
+
+ memset(srq->buf.buf, 0, buf_size);
+
+ /*
+ * Now initialize the SRQ buffer so that all of the WQEs are
+ * linked into the list of free WQEs.
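+ * Each next_wqe_index points at the following WQE modulo
+ * srq->max; head is the next WQE handed out by
+ * mlx4_post_srq_recv() and tail is the last entry on the
+ * free list, replenished by mlx4_free_srq_wqe().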
+ */ + + for (i = 0; i < srq->max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htobe16((i + 1) & (srq->max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = htobe32(MLX4_INVALID_LKEY); + } + + srq->head = 0; + srq->tail = srq->max - 1; + + return 0; +} + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) +{ + memset(xsrq_table, 0, sizeof *xsrq_table); + xsrq_table->num_xsrq = size; + xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; + xsrq_table->mask = (1 << xsrq_table->shift) - 1; + + pthread_mutex_init(&xsrq_table->mutex, NULL); +} + +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + if (xsrq_table->xsrq_table[index].refcnt) + return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; + + return NULL; +} + +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq) +{ + int index, ret = 0; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + if (!xsrq_table->xsrq_table[index].refcnt) { + xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, + sizeof(struct mlx4_srq *)); + if (!xsrq_table->xsrq_table[index].table) { + ret = -1; + goto out; + } + } + + xsrq_table->xsrq_table[index].refcnt++; + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; + +out: + pthread_mutex_unlock(&xsrq_table->mutex); + return ret; +} + +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + + if (--xsrq_table->xsrq_table[index].refcnt) + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; + else + free(xsrq_table->xsrq_table[index].table); + + pthread_mutex_unlock(&xsrq_table->mutex); +} + +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + struct mlx4_create_xsrq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) + return NULL; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = align_queue_size(attr_ex->attr.max_wr + 1); + srq->max_gs = attr_ex->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 1; + + if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, + sizeof(srq->verbs_srq), + attr_ex, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, + srq->verbs_srq.srq_num, srq); + if (ret) + goto err_destroy; + + return &srq->verbs_srq.srq; + +err_destroy: + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); +err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); +err: + free(srq); + return NULL; +} + +int mlx4_destroy_xrc_srq(struct ibv_srq *srq) +{ + struct 
mlx4_context *mctx = to_mctx(srq->context); + struct mlx4_srq *msrq = to_msrq(srq); + struct mlx4_cq *mcq; + int ret; + + mcq = to_mcq(msrq->verbs_srq.cq); + mlx4_cq_clean(mcq, 0, msrq); + pthread_spin_lock(&mcq->lock); + mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num); + pthread_spin_unlock(&mcq->lock); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) { + pthread_spin_lock(&mcq->lock); + mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq); + pthread_spin_unlock(&mcq->lock); + return ret; + } + + mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); + mlx4_free_buf(&msrq->buf); + free(msrq->wrid); + free(msrq); + + return 0; +} diff --git a/contrib/ofed/libmlx4/verbs.c b/contrib/ofed/libmlx4/verbs.c new file mode 100644 index 000000000000..f6f43f9bef76 --- /dev/null +++ b/contrib/ofed/libmlx4/verbs.c @@ -0,0 +1,1255 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <infiniband/endian.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> + +#include "mlx4.h" +#include "mlx4-abi.h" +#include "wqe.h" + +int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +int mlx4_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size) +{ + struct mlx4_context *mctx = to_mctx(context); + struct mlx4_query_device_ex_resp resp = {}; + struct mlx4_query_device_ex cmd = {}; + uint64_t raw_fw_ver; + unsigned sub_minor; + unsigned major; + unsigned minor; + int err; + + err = ibv_cmd_query_device_ex(context, input, attr, attr_size, + &raw_fw_ver, + &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), + &resp.ibv_resp, sizeof(resp.ibv_resp), + sizeof(resp)); + if (err) + return err; + + if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) { + mctx->core_clock.offset = resp.hca_core_clock_offset; + mctx->core_clock.offset_valid = 1; + } + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +#define READL(ptr) (*((uint32_t *)(ptr))) +static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles) +{ + unsigned int clockhi, clocklo, clockhi1; + int i; + struct mlx4_context *ctx = to_mctx(context); + + if (!ctx->hca_core_clock) + return -EOPNOTSUPP; + + /* Handle wraparound */ + for (i = 0; i < 2; i++) { + clockhi = be32toh(READL(ctx->hca_core_clock)); + clocklo = be32toh(READL(ctx->hca_core_clock + 4)); + clockhi1 = be32toh(READL(ctx->hca_core_clock)); + if (clockhi == clockhi1) + break; + } + + *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo; + + return 0; +} + +int mlx4_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values) +{ + uint32_t comp_mask = 0; + int err = 0; + + if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) { + uint64_t cycles; + + err = mlx4_read_clock(context, &cycles); + if (!err) { + values->raw_clock.tv_sec = 0; + values->raw_clock.tv_nsec = cycles; + comp_mask |= IBV_VALUES_MASK_RAW_CLOCK; + } + } + + values->comp_mask = comp_mask; + + return err; +} + +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + int err; + + err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); + if (!err && port <= MLX4_PORTS_NUM && port > 0) { + struct mlx4_context *mctx = to_mctx(context); + if (!mctx->port_query_cache[port - 1].valid) { + mctx->port_query_cache[port - 1].link_layer = + attr->link_layer; + mctx->port_query_cache[port - 1].caps = + attr->port_cap_flags; + mctx->port_query_cache[port - 1].valid = 1; + } + } + + return err; +} + +/* Only the fields in the port cache will be valid */ +static int query_port_cache(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + struct mlx4_context 
*mctx = to_mctx(context); + if (port_num <= 0 || port_num > MLX4_PORTS_NUM) + return -EINVAL; + if (mctx->port_query_cache[port_num - 1].valid) { + port_attr->link_layer = + mctx-> + port_query_cache[port_num - 1]. + link_layer; + port_attr->port_cap_flags = + mctx-> + port_query_cache[port_num - 1]. + caps; + return 0; + } + return mlx4_query_port(context, port_num, + (struct ibv_port_attr *)port_attr); + +} + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct mlx4_alloc_pd_resp resp; + struct mlx4_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int mlx4_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_mpd(pd)); + return 0; +} + +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr) +{ + struct ibv_open_xrcd cmd; + struct ibv_open_xrcd_resp resp; + struct verbs_xrcd *xrcd; + int ret; + + xrcd = calloc(1, sizeof *xrcd); + if (!xrcd) + return NULL; + + ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &xrcd->xrcd; + +err: + free(xrcd); + return NULL; +} + +int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) +{ + struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (!ret) + free(xrcd); + + return ret; +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + int access) +{ + struct ibv_mr *mr; + struct ibv_reg_mr cmd; + struct ibv_reg_mr_resp resp; + int ret; + + mr = malloc(sizeof *mr); + if (!mr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, + access, mr, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + free(mr); + return NULL; + } + + return mr; +} + +int mlx4_rereg_mr(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_rereg_mr cmd; + struct ibv_rereg_mr_resp resp; + + if (flags & IBV_REREG_MR_KEEP_VALID) + return ENOTSUP; + + return ibv_cmd_rereg_mr(mr, flags, addr, length, + (uintptr_t)addr, + access, pd, + &cmd, sizeof(cmd), + &resp, sizeof(resp)); +} + +int mlx4_dereg_mr(struct ibv_mr *mr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(mr); + if (ret) + return ret; + + free(mr); + return 0; +} + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct ibv_mw *mw; + struct ibv_alloc_mw cmd; + struct ibv_alloc_mw_resp resp; + int ret; + + mw = calloc(1, sizeof(*mw)); + if (!mw) + return NULL; + + ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), + &resp, sizeof(resp)); + + if (ret) { + free(mw); + return NULL; + } + + return mw; +} + +int mlx4_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + struct ibv_dealloc_mw cmd; + + ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd)); + if (ret) + return ret; + + free(mw); + return 0; +} + +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_send_wr *bad_wr = NULL; + struct ibv_send_wr wr = { }; + int ret; + + + wr.opcode = IBV_WR_BIND_MW; + wr.next = NULL; + + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + + wr.bind_mw.mw = mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); + wr.bind_mw.bind_info = mw_bind->bind_info; + + ret = mlx4_post_send(qp, 
&wr, &bad_wr); + + if (ret) + return ret; + + /* updating the mw with the latest rkey. */ + mw->rkey = wr.bind_mw.rkey; + + return 0; +} + +int align_queue_size(int req) +{ + int nent; + + for (nent = 1; nent < req; nent <<= 1) + ; /* nothing */ + + return nent; +} + +enum { + CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP +}; + +enum { + CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS +}; + +enum { + CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED +}; + + +static int mlx4_cmd_create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx4_cq *cq) +{ + struct mlx4_create_cq cmd = {}; + struct mlx4_create_cq_resp resp = {}; + int ret; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel, + cq_attr->comp_vector, + ibv_cq_ex_to_cq(&cq->ibv_cq), + &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (!ret) + cq->cqn = resp.cqn; + + return ret; + +} + +static int mlx4_cmd_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx4_cq *cq) +{ + struct mlx4_create_cq_ex cmd = {}; + struct mlx4_create_cq_resp_ex resp = {}; + int ret; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq_ex(context, cq_attr, + &cq->ibv_cq, &cmd.ibv_cmd, + sizeof(cmd.ibv_cmd), + sizeof(cmd), + &resp.ibv_resp, + sizeof(resp.ibv_resp), + sizeof(resp)); + if (!ret) + cq->cqn = resp.cqn; + + return ret; +} + +static struct ibv_cq_ex *create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + int cq_alloc_flags) +{ + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + + /* Sanity check CQ size before proceeding */ + if (cq_attr->cqe > 0x3fffff) { + errno = EINVAL; + return NULL; + } + + if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) { + errno = ENOTSUP; + return NULL; + } + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) { + errno = ENOTSUP; + return NULL; + } + + if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) + return NULL; + + /* mlx4 devices don't support slid and sl in cqe when completion + * timestamp is enabled in the CQ + */ + if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) && + (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) { + errno = ENOTSUP; + return NULL; + } + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cq_attr->cqe = align_queue_size(cq_attr->cqe + 1); + + if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size)) + goto err; + + cq->cqe_size = mctx->cqe_size; + cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); + if (!cq->set_ci_db) + goto err_buf; + + cq->arm_db = cq->set_ci_db + 1; + *cq->arm_db = 0; + cq->arm_sn = 1; + *cq->set_ci_db = 0; + cq->flags = cq_alloc_flags; + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED) + cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED; + + --cq_attr->cqe; + if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) + ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq); + else + ret = mlx4_cmd_create_cq(context, cq_attr, cq); + + if (ret) + goto err_db; + + + if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) + 
mlx4_cq_fill_pfns(cq, cq_attr);
+
+ return &cq->ibv_cq;
+
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
+
+err_buf:
+ mlx4_free_buf(&cq->buf);
+
+err:
+ free(cq);
+
+ return NULL;
+}
+
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector)
+{
+ struct ibv_cq_ex *cq;
+ struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
+ .comp_vector = comp_vector,
+ .wc_flags = IBV_WC_STANDARD_FLAGS};
+
+ cq = create_cq(context, &cq_attr, 0);
+ return cq ? ibv_cq_ex_to_cq(cq) : NULL;
+}
+
+struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
+ struct ibv_cq_init_attr_ex *cq_attr)
+{
+ /*
+ * Make local copy since some attributes might be adjusted
+ * for internal use.
+ */
+ struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
+ .channel = cq_attr->channel,
+ .comp_vector = cq_attr->comp_vector,
+ .wc_flags = cq_attr->wc_flags,
+ .comp_mask = cq_attr->comp_mask,
+ .flags = cq_attr->flags};
+
+ return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
+}
+
+int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_resize_cq cmd;
+ struct ibv_resize_cq_resp resp;
+ struct mlx4_buf buf;
+ int old_cqe, outst_cqe, ret;
+
+ /* Sanity check CQ size before proceeding */
+ if (cqe > 0x3fffff)
+ return EINVAL;
+
+ pthread_spin_lock(&cq->lock);
+
+ cqe = align_queue_size(cqe + 1);
+ if (cqe == ibcq->cqe + 1) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_get_outstanding_cqes(cq);
+ if (cqe < outst_cqe + 1) {
+ ret = EINVAL;
+ goto out;
+ }
+
+ ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size);
+ if (ret)
+ goto out;
+
+ old_cqe = ibcq->cqe;
+ cmd.buf_addr = (uintptr_t) buf.buf;
+
+ ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd,
+ &resp, sizeof resp);
+ if (ret) {
+ mlx4_free_buf(&buf);
+ goto out;
+ }
+
+ mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
+
+ mlx4_free_buf(&cq->buf);
+ cq->buf = buf;
+ mlx4_update_cons_index(cq);
+
+out:
+ pthread_spin_unlock(&cq->lock);
+ return ret;
+}
+
+int mlx4_destroy_cq(struct ibv_cq *cq)
+{
+ int ret;
+
+ ret = ibv_cmd_destroy_cq(cq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
+ mlx4_free_buf(&to_mcq(cq)->buf);
+ free(to_mcq(cq));
+
+ return 0;
+}
+
+struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *attr)
+{
+ struct mlx4_create_srq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
+ return NULL;
+
+ srq = malloc(sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ goto err;
+
+ srq->max = align_queue_size(attr->attr.max_wr + 1);
+ srq->max_gs = attr->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 0;
+
+ if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ return &srq->verbs_srq.srq;
+
+err_db:
+ mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ,
srq->db); + +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); + +err: + free(srq); + + return NULL; +} + +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || + (attr_ex->srq_type == IBV_SRQT_BASIC)) + return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex); + else if (attr_ex->srq_type == IBV_SRQT_XRC) + return mlx4_create_xrc_srq(context, attr_ex); + + return NULL; +} + +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); +} + +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int mlx4_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + if (to_msrq(srq)->ext_srq) + return mlx4_destroy_xrc_srq(srq); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); + + return 0; +} + +static int mlx4_cmd_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx4_create_qp *cmd, + struct mlx4_qp *qp) +{ + struct mlx4_create_qp_ex cmd_ex; + struct mlx4_create_qp_resp_ex resp; + int ret; + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle, + offsetof(typeof(cmd->ibv_cmd), is_srq) + + sizeof(cmd->ibv_cmd.is_srq) - + offsetof(typeof(cmd->ibv_cmd), user_handle)); + + memcpy(&cmd_ex.drv_ex, &cmd->buf_addr, + offsetof(typeof(*cmd), sq_no_prefetch) + + sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd)); + + ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd), + sizeof(cmd_ex), &resp.ibv_resp, + sizeof(resp.ibv_resp), sizeof(resp)); + return ret; +} + +enum { + MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_CREATE_FLAGS), +}; + +enum { + MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS), +}; + +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) +{ + struct mlx4_context *ctx = to_mctx(context); + struct mlx4_create_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + /* Sanity check QP size before proceeding */ + if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */ + if (attr->cap.max_send_wr > ctx->max_qp_wr || + attr->cap.max_recv_wr > ctx->max_qp_wr || + attr->cap.max_send_sge > ctx->max_sge || + attr->cap.max_recv_sge > ctx->max_sge) + return NULL; + } else { + if (attr->cap.max_send_wr > 65536 || + attr->cap.max_recv_wr > 65536 || + attr->cap.max_send_sge > 64 || + attr->cap.max_recv_sge > 64) + return NULL; + } + if (attr->cap.max_inline_data > 1024) + return NULL; + + if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK) + return NULL; + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + + if (attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; + } else { + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. 
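+ * With the minimum 64-byte WQE stride this works out to
+ * (2048 >> 6) + 1 = 33 spare WQEs on top of the requested
+ * queue depth.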
+ */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + } + + if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || + attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; + } else { + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp)) + goto err; + + mlx4_init_qp_indices(qp); + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + if (attr->cap.max_recv_sge) { + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto err_free; + + *qp->db = 0; + cmd.db_addr = (uintptr_t) qp->db; + } else { + cmd.db_addr = 0; + } + + cmd.buf_addr = (uintptr_t) qp->buf.buf; + cmd.log_sq_stride = qp->sq.wqe_shift; + for (cmd.log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; + ++cmd.log_sq_bb_count) + ; /* nothing */ + cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + memset(cmd.reserved, 0, sizeof cmd.reserved); + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); + + if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK) + ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp); + else + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + &cmd.ibv_cmd, sizeof(cmd), &resp, + sizeof(resp)); + if (ret) + goto err_rq_db; + + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp); + if (ret) + goto err_destroy; + } + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + + qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + if (attr->qp_type != IBV_QPT_XRC_RECV) + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8); + if (attr->sq_sig_all) + qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + return &qp->verbs_qp.qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->verbs_qp.qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + if (attr->cap.max_recv_sge) + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); + +err_free: + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(&qp->buf); + +err: + free(qp); + + return NULL; +} + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_qp_init_attr_ex attr_ex; + struct ibv_qp *qp; + + memcpy(&attr_ex, attr, sizeof *attr); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = pd; + qp = mlx4_create_qp_ex(pd->context, &attr_ex); + if (qp) + memcpy(attr, &attr_ex, sizeof *attr); + return qp; +} + +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &qp->verbs_qp.qp; + +err: + free(qp); + return NULL; +} + +int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ 
+int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
+		  int attr_mask,
+		  struct ibv_qp_init_attr *init_attr)
+{
+	struct ibv_query_qp cmd;
+	struct mlx4_qp *qp = to_mqp(ibqp);
+	int ret;
+
+	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd);
+	if (ret)
+		return ret;
+
+	init_attr->cap.max_send_wr = qp->sq.max_post;
+	init_attr->cap.max_send_sge = qp->sq.max_gs;
+	init_attr->cap.max_inline_data = qp->max_inline_data;
+
+	attr->cap = init_attr->cap;
+
+	return 0;
+}
+
+int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+		   int attr_mask)
+{
+	struct ibv_modify_qp cmd = {};
+	struct ibv_port_attr port_attr;
+	struct mlx4_qp *mqp = to_mqp(qp);
+	struct ibv_device_attr device_attr;
+	int ret;
+
+	memset(&device_attr, 0, sizeof(device_attr));
+	if (attr_mask & IBV_QP_PORT) {
+		ret = ibv_query_port(qp->context, attr->port_num,
+				     &port_attr);
+		if (ret)
+			return ret;
+		mqp->link_layer = port_attr.link_layer;
+
+		ret = ibv_query_device(qp->context, &device_attr);
+		if (ret)
+			return ret;
+
+		switch(qp->qp_type) {
+		case IBV_QPT_UD:
+			if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) &&
+			    (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM))
+				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB |
+						     MLX4_RX_CSUM_VALID;
+			break;
+		case IBV_QPT_RAW_PACKET:
+			if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) &&
+			    (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM))
+				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH |
+						     MLX4_RX_CSUM_VALID;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (qp->state == IBV_QPS_RESET &&
+	    attr_mask & IBV_QP_STATE &&
+	    attr->qp_state == IBV_QPS_INIT) {
+		mlx4_qp_init_sq_ownership(to_mqp(qp));
+	}
+
+	ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd);
+
+	if (!ret &&
+	    (attr_mask & IBV_QP_STATE) &&
+	    attr->qp_state == IBV_QPS_RESET) {
+		if (qp->recv_cq)
+			mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+				      qp->srq ? to_msrq(qp->srq) : NULL);
+		if (qp->send_cq && qp->send_cq != qp->recv_cq)
+			mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
+
+		mlx4_init_qp_indices(to_mqp(qp));
+		if (to_mqp(qp)->rq.wqe_cnt)
+			*to_mqp(qp)->db = 0;
+	}
+
+	return ret;
+}
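[editor's note] mlx4_modify_qp() mostly defers to the kernel through ibv_cmd_modify_qp(); the provider-side work is caching the checksum-offload capability when a port is assigned, and, on a transition back to RESET, cleaning both CQs and rewinding the queue indices so the QP can be reused. A hedged sketch of the caller's side of the first transition; the port number, pkey index, and access flags are illustrative values:

	struct ibv_qp_attr attr = {
		.qp_state        = IBV_QPS_INIT,
		.pkey_index      = 0,
		.port_num        = 1,		/* exercises the IBV_QP_PORT path above */
		.qp_access_flags = IBV_ACCESS_REMOTE_WRITE,
	};

	/* RESET -> INIT; this is the transition on which
	 * mlx4_qp_init_sq_ownership() runs. */
	int ret = ibv_modify_qp(qp, &attr,
				IBV_QP_STATE | IBV_QP_PKEY_INDEX |
				IBV_QP_PORT | IBV_QP_ACCESS_FLAGS);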
+
+static void mlx4_lock_cqs(struct ibv_qp *qp)
+{
+	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
+	if (!qp->send_cq || !qp->recv_cq) {
+		if (qp->send_cq)
+			pthread_spin_lock(&send_cq->lock);
+		else if (qp->recv_cq)
+			pthread_spin_lock(&recv_cq->lock);
+	} else if (send_cq == recv_cq) {
+		pthread_spin_lock(&send_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		pthread_spin_lock(&send_cq->lock);
+		pthread_spin_lock(&recv_cq->lock);
+	} else {
+		pthread_spin_lock(&recv_cq->lock);
+		pthread_spin_lock(&send_cq->lock);
+	}
+}
+
+static void mlx4_unlock_cqs(struct ibv_qp *qp)
+{
+	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
+	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
+
+	if (!qp->send_cq || !qp->recv_cq) {
+		if (qp->send_cq)
+			pthread_spin_unlock(&send_cq->lock);
+		else if (qp->recv_cq)
+			pthread_spin_unlock(&recv_cq->lock);
+	} else if (send_cq == recv_cq) {
+		pthread_spin_unlock(&send_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		pthread_spin_unlock(&recv_cq->lock);
+		pthread_spin_unlock(&send_cq->lock);
+	} else {
+		pthread_spin_unlock(&send_cq->lock);
+		pthread_spin_unlock(&recv_cq->lock);
+	}
+}
+
+int mlx4_destroy_qp(struct ibv_qp *ibqp)
+{
+	struct mlx4_qp *qp = to_mqp(ibqp);
+	int ret;
+
+	pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex);
+	ret = ibv_cmd_destroy_qp(ibqp);
+	if (ret) {
+		pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+		return ret;
+	}
+
+	mlx4_lock_cqs(ibqp);
+
+	if (ibqp->recv_cq)
+		__mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+				ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+	if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
+		__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
+
+	if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+		mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+
+	mlx4_unlock_cqs(ibqp);
+	pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
+
+	if (qp->rq.wqe_cnt) {
+		mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
+		free(qp->rq.wrid);
+	}
+	if (qp->sq.wqe_cnt)
+		free(qp->sq.wrid);
+	mlx4_free_buf(&qp->buf);
+	free(qp);
+
+	return 0;
+}
+
+static int link_local_gid(const union ibv_gid *gid)
+{
+	uint32_t *tmp = (uint32_t *)gid->raw;
+	uint32_t hi = tmp[0];
+	uint32_t lo = tmp[1];
+
+	if (hi == htobe32(0xfe800000) && lo == 0)
+		return 1;
+
+	return 0;
+}
+
+static int is_multicast_gid(const union ibv_gid *gid)
+{
+	return gid->raw[0] == 0xff;
+}
+
+static uint16_t get_vlan_id(union ibv_gid *gid)
+{
+	uint16_t vid;
+
+	vid = gid->raw[11] << 8 | gid->raw[12];
+	return vid < 0x1000 ? vid : 0xffff;
+}
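[editor's note] mlx4_lock_cqs()/mlx4_unlock_cqs() above avoid deadlock between two threads that each hold one of a QP's two CQ locks by always acquiring the pair in a fixed global order, keyed on the CQ number. The same discipline restated in isolation; struct res and lock_pair() are hypothetical names, not libmlx4 symbols:

	#include <stdint.h>
	#include <pthread.h>

	struct res {
		uint32_t id;			/* stable, unique ordering key, like cqn */
		pthread_spinlock_t lock;
	};

	static void lock_pair(struct res *a, struct res *b)
	{
		if (a == b) {
			pthread_spin_lock(&a->lock);	/* one resource: take its lock once */
		} else if (a->id < b->id) {
			pthread_spin_lock(&a->lock);	/* lower id is always locked first */
			pthread_spin_lock(&b->lock);
		} else {
			pthread_spin_lock(&b->lock);
			pthread_spin_lock(&a->lock);
		}
	}

Because every thread orders any pair the same way, no cycle of lock waits can form; releasing may happen in either order, which is why mlx4_unlock_cqs() only mirrors the shape for symmetry.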
+
+static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah,
+				  struct ibv_ah_attr *attr)
+{
+	int err, i;
+	uint16_t vid;
+	union ibv_gid sgid;
+
+	if (link_local_gid(&attr->grh.dgid)) {
+		memcpy(ah->mac, &attr->grh.dgid.raw[8], 3);
+		memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3);
+		ah->mac[0] ^= 2;
+
+		vid = get_vlan_id(&attr->grh.dgid);
+	} else if (is_multicast_gid(&attr->grh.dgid)) {
+		ah->mac[0] = 0x33;
+		ah->mac[1] = 0x33;
+		for (i = 2; i < 6; ++i)
+			ah->mac[i] = attr->grh.dgid.raw[i + 10];
+
+		err = ibv_query_gid(pd->context, attr->port_num,
+				    attr->grh.sgid_index, &sgid);
+		if (err)
+			return err;
+
+		ah->av.dlid = htobe16(0xc000);
+		ah->av.port_pd |= htobe32(1 << 31);
+
+		vid = get_vlan_id(&sgid);
+	} else
+		return 1;
+
+	if (vid != 0xffff) {
+		ah->av.port_pd |= htobe32(1 << 29);
+		ah->vlan = vid | ((attr->sl & 7) << 13);
+	}
+
+	return 0;
+}
+
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+{
+	struct mlx4_ah *ah;
+	struct ibv_port_attr port_attr;
+
+	if (query_port_cache(pd->context, attr->port_num, &port_attr))
+		return NULL;
+
+	ah = malloc(sizeof *ah);
+	if (!ah)
+		return NULL;
+
+	memset(&ah->av, 0, sizeof ah->av);
+
+	ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24));
+
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		ah->av.g_slid = attr->src_path_bits;
+		ah->av.dlid = htobe16(attr->dlid);
+		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28);
+	} else
+		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29);
+
+	if (attr->static_rate) {
+		ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		/* XXX check rate cap? */
+	}
+	if (attr->is_global) {
+		ah->av.g_slid |= 0x80;
+		ah->av.gid_index = attr->grh.sgid_index;
+		ah->av.hop_limit = attr->grh.hop_limit;
+		ah->av.sl_tclass_flowlabel |=
+			htobe32((attr->grh.traffic_class << 20) |
+				attr->grh.flow_label);
+		memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
+	}
+
+	if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+		if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) {
+			uint16_t vid;
+
+			if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
+							ah->mac, &vid)) {
+				free(ah);
+				return NULL;
+			}
+
+			if (vid <= 0xfff) {
+				ah->av.port_pd |= htobe32(1 << 29);
+				ah->vlan = vid |
+					((attr->sl & 7) << 13);
+			}
+		} else {
+			if (mlx4_resolve_grh_to_l2(pd, ah, attr)) {
+				free(ah);
+				return NULL;
+			}
+		}
+	}
+
+	return &ah->ibv_ah;
+}
+
+int mlx4_destroy_ah(struct ibv_ah *ah)
+{
+	free(to_mah(ah));
+
+	return 0;
+}
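[editor's note] The link-local branch of mlx4_resolve_grh_to_l2() above treats bytes 8..15 of the GID as a modified EUI-64 interface ID and recovers the station MAC from it: the ff:fe filler in bytes 11..12 is skipped and the universal/local bit is flipped back. Restated as a standalone helper; gid_to_mac is a hypothetical name, not a libmlx4 symbol:

	#include <stdint.h>
	#include <string.h>

	static void gid_to_mac(const uint8_t gid[16], uint8_t mac[6])
	{
		memcpy(mac,     gid + 8,  3);	/* upper 3 MAC bytes */
		memcpy(mac + 3, gid + 13, 3);	/* lower 3 bytes; gid[11..12] hold ff:fe */
		mac[0] ^= 2;			/* undo the EUI-64 U/L bit inversion */
	}

For the GID fe80::0211:22ff:fe33:4455 this yields the MAC 00:11:22:33:44:55. The multicast branch instead builds the IPv6 multicast MAC 33:33:xx:xx:xx:xx from the low four GID bytes.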
diff --git a/contrib/ofed/libmlx4/wqe.h b/contrib/ofed/libmlx4/wqe.h
new file mode 100644
index 000000000000..6f833d9bf76b
--- /dev/null
+++ b/contrib/ofed/libmlx4/wqe.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef WQE_H
+#define WQE_H
+
+#include <stdint.h>
+
+enum {
+	MLX4_SEND_DOORBELL	= 0x14,
+};
+
+enum {
+	MLX4_WQE_CTRL_SOLICIT		= 1 << 1,
+	MLX4_WQE_CTRL_CQ_UPDATE		= 3 << 2,
+	MLX4_WQE_CTRL_IP_HDR_CSUM	= 1 << 4,
+	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
+	MLX4_WQE_CTRL_FENCE		= 1 << 6,
+	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7
+};
+
+enum {
+	MLX4_WQE_BIND_TYPE_2		= (1<<31),
+	MLX4_WQE_BIND_ZERO_BASED	= (1<<30),
+};
+
+enum {
+	MLX4_INLINE_SEG		= 1 << 31,
+	MLX4_INLINE_ALIGN	= 64,
+};
+
+enum {
+	MLX4_INVALID_LKEY	= 0x100,
+};
+
+struct mlx4_wqe_ctrl_seg {
+	uint32_t		owner_opcode;
+	union {
+		struct {
+			uint8_t	reserved[3];
+			uint8_t	fence_size;
+		};
+		uint32_t	bf_qpn;
+	};
+	/*
+	 * High 24 bits are SRC remote buffer; low 8 bits are flags:
+	 * [7]   SO (strong ordering)
+	 * [5]   TCP/UDP checksum
+	 * [4]   IP checksum
+	 * [3:2] C (generate completion queue entry)
+	 * [1]   SE (solicited event)
+	 * [0]   FL (force loopback)
+	 */
+	uint32_t		srcrb_flags;
+	/*
+	 * imm is immediate data for send/RDMA write w/ immediate;
+	 * also invalidation key for send with invalidate; input
+	 * modifier for WQEs on CCQs.
+	 */
+	uint32_t		imm;
+};
+
+struct mlx4_wqe_datagram_seg {
+	uint32_t		av[8];
+	uint32_t		dqpn;
+	uint32_t		qkey;
+	uint16_t		vlan;
+	uint8_t			mac[6];
+};
+
+struct mlx4_wqe_data_seg {
+	uint32_t		byte_count;
+	uint32_t		lkey;
+	uint64_t		addr;
+};
+
+struct mlx4_wqe_inline_seg {
+	uint32_t		byte_count;
+};
+
+struct mlx4_wqe_srq_next_seg {
+	uint16_t		reserved1;
+	uint16_t		next_wqe_index;
+	uint32_t		reserved2[3];
+};
+
+struct mlx4_wqe_local_inval_seg {
+	uint64_t		reserved1;
+	uint32_t		mem_key;
+	uint32_t		reserved2;
+	uint64_t		reserved3[2];
+};
+
+enum {
+	MLX4_WQE_MW_REMOTE_READ		= 1 << 29,
+	MLX4_WQE_MW_REMOTE_WRITE	= 1 << 30,
+	MLX4_WQE_MW_ATOMIC		= 1 << 31
+};
+
+struct mlx4_wqe_raddr_seg {
+	uint64_t		raddr;
+	uint32_t		rkey;
+	uint32_t		reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+	uint64_t		swap_add;
+	uint64_t		compare;
+};
+
+struct mlx4_wqe_bind_seg {
+	uint32_t		flags1;
+	uint32_t		flags2;
+	uint32_t		new_rkey;
+	uint32_t		lkey;
+	uint64_t		addr;
+	uint64_t		length;
+};
+
+#endif /* WQE_H */
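[editor's note] The MLX4_WQE_CTRL_* enums compose by bitwise OR into the low byte of srcrb_flags, matching the bit map in the comment on mlx4_wqe_ctrl_seg; the device reads the field big-endian. A hedged sketch, assuming this header is included; the function name is illustrative and htobe32() comes from <sys/endian.h> on FreeBSD (<endian.h> on Linux):

	/* Request a CQE, a solicited event, and IP + TCP/UDP checksum
	 * offload for one send WQE. */
	static uint32_t example_srcrb_flags(void)
	{
		return htobe32(MLX4_WQE_CTRL_CQ_UPDATE |	/* C bits [3:2] */
			       MLX4_WQE_CTRL_SOLICIT |		/* SE bit [1] */
			       MLX4_WQE_CTRL_IP_HDR_CSUM |	/* bit [4] */
			       MLX4_WQE_CTRL_TCP_UDP_CSUM);	/* bit [5] */
	}

The post-send path in this library's qp.c fills the control segment with the same byte order, which is why qp.c's create path stores sq_signal_bits pre-swapped with htobe32(MLX4_WQE_CTRL_CQ_UPDATE).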