Diffstat (limited to 'sys/dev/mlx5/mlx5_en')
-rw-r--r--   sys/dev/mlx5/mlx5_en/en.h                  |  946
-rw-r--r--   sys/dev/mlx5/mlx5_en/en_rl.h               |  174
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c     | 1203
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c  | 1487
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_main.c        | 3901
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_rl.c          | 1542
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_rx.c          |  550
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_tx.c          |  666
-rw-r--r--   sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c        |   53
9 files changed, 10522 insertions, 0 deletions
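
Editor's note (not part of the commit): the en.h header added below builds all of its statistics and tunable structures from X-macro lists that are expanded three different ways by the MLX5E_STATS_COUNT, MLX5E_STATS_VAR and MLX5E_STATS_DESC helpers. A minimal sketch of that pattern, using a made-up two-entry list named EXAMPLE_STATS (the real lists are MLX5E_VPORT_STATS(), MLX5E_PPORT_STATS(), MLX5E_RQ_STATS(), and so on):

/* Illustration only -- EXAMPLE_STATS is hypothetical and not in the diff. */
#define EXAMPLE_STATS(m) \
	m(+1, u64 rx_packets, "rx_packets", "Received packets") \
	m(+1, u64 rx_bytes, "rx_bytes", "Received bytes")

/* The three expansions used throughout en.h: */
#define MLX5E_STATS_COUNT(a,b,c,d) a		/* "+1 +1 ..." -> number of counters */
#define MLX5E_STATS_VAR(a,b,c,d) b;		/* "u64 rx_packets; ..." -> struct members */
#define MLX5E_STATS_DESC(a,b,c,d) c, d,		/* "rx_packets", "Received packets", ... */

#define EXAMPLE_STATS_NUM (0 EXAMPLE_STATS(MLX5E_STATS_COUNT))	/* evaluates to 2 */

struct example_stats {
	u64 arg[0];			/* counters can also be walked as a u64 array */
	EXAMPLE_STATS(MLX5E_STATS_VAR)	/* u64 rx_packets; u64 rx_bytes; */
};

static const char *example_stats_desc[] = {
	EXAMPLE_STATS(MLX5E_STATS_DESC)	/* name/description pairs consumed by mlx5e_create_stats() */
};
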
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h new file mode 100644 index 000000000000..73f0268ca270 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/en.h @@ -0,0 +1,946 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MLX5_EN_H_ +#define _MLX5_EN_H_ + +#include <linux/kmod.h> +#include <linux/page.h> +#include <linux/slab.h> +#include <linux/if_vlan.h> +#include <linux/if_ether.h> +#include <linux/vmalloc.h> +#include <linux/moduleparam.h> +#include <linux/delay.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/tcp_lro.h> +#include <netinet/udp.h> +#include <net/ethernet.h> +#include <sys/buf_ring.h> +#include <sys/kthread.h> + +#include "opt_rss.h" + +#ifdef RSS +#include <net/rss_config.h> +#include <netinet/in_rss.h> +#endif + +#include <machine/bus.h> + +#include <dev/mlx5/driver.h> +#include <dev/mlx5/qp.h> +#include <dev/mlx5/cq.h> +#include <dev/mlx5/port.h> +#include <dev/mlx5/vport.h> +#include <dev/mlx5/diagnostics.h> + +#include <dev/mlx5/mlx5_core/wq.h> +#include <dev/mlx5/mlx5_core/transobj.h> +#include <dev/mlx5/mlx5_core/mlx5_core.h> + +#define IEEE_8021QAZ_MAX_TCS 8 + +#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x7 +#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xe + +#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE 0x7 +#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe + +#define MLX5E_MAX_RX_SEGS 7 + +#ifndef MLX5E_MAX_RX_BYTES +#define MLX5E_MAX_RX_BYTES MCLBYTES +#endif + +#if (MLX5E_MAX_RX_SEGS == 1) +/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */ +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ MJUM16BYTES +#else +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \ + MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES) +#endif +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC 0x10 +#define 
MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES 0x80 +#define MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ 0x7 +#define MLX5E_CACHELINE_SIZE CACHE_LINE_SIZE +#define MLX5E_HW2SW_MTU(hwmtu) \ + ((hwmtu) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN)) +#define MLX5E_SW2HW_MTU(swmtu) \ + ((swmtu) + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN)) +#define MLX5E_SW2MB_MTU(swmtu) \ + (MLX5E_SW2HW_MTU(swmtu) + MLX5E_NET_IP_ALIGN) +#define MLX5E_MTU_MIN 72 /* Min MTU allowed by the kernel */ +#define MLX5E_MTU_MAX MIN(ETHERMTU_JUMBO, MJUM16BYTES) /* Max MTU of Ethernet + * jumbo frames */ + +#define MLX5E_BUDGET_MAX 8192 /* RX and TX */ +#define MLX5E_RX_BUDGET_MAX 256 +#define MLX5E_SQ_BF_BUDGET 16 +#define MLX5E_SQ_TX_QUEUE_SIZE 4096 /* SQ drbr queue size */ + +#define MLX5E_MAX_TX_NUM_TC 8 /* units */ +#define MLX5E_MAX_TX_HEADER 128 /* bytes */ +#define MLX5E_MAX_TX_PAYLOAD_SIZE 65536 /* bytes */ +#define MLX5E_MAX_TX_MBUF_SIZE 65536 /* bytes */ +#define MLX5E_MAX_TX_MBUF_FRAGS \ + ((MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) - \ + (MLX5E_MAX_TX_HEADER / MLX5_SEND_WQE_DS) - \ + 1 /* the maximum value of the DS counter is 0x3F and not 0x40 */) /* units */ +#define MLX5E_MAX_TX_INLINE \ + (MLX5E_MAX_TX_HEADER - sizeof(struct mlx5e_tx_wqe) + \ + sizeof(((struct mlx5e_tx_wqe *)0)->eth.inline_hdr_start)) /* bytes */ + +#define MLX5E_100MB (100000) +#define MLX5E_1GB (1000000) + +MALLOC_DECLARE(M_MLX5EN); + +struct mlx5_core_dev; +struct mlx5e_cq; + +typedef void (mlx5e_cq_comp_t)(struct mlx5_core_cq *); + +#define MLX5E_STATS_COUNT(a,b,c,d) a +#define MLX5E_STATS_VAR(a,b,c,d) b; +#define MLX5E_STATS_DESC(a,b,c,d) c, d, + +#define MLX5E_VPORT_STATS(m) \ + /* HW counters */ \ + m(+1, u64 rx_packets, "rx_packets", "Received packets") \ + m(+1, u64 rx_bytes, "rx_bytes", "Received bytes") \ + m(+1, u64 tx_packets, "tx_packets", "Transmitted packets") \ + m(+1, u64 tx_bytes, "tx_bytes", "Transmitted bytes") \ + m(+1, u64 rx_error_packets, "rx_error_packets", "Received error packets") \ + m(+1, u64 rx_error_bytes, "rx_error_bytes", "Received error bytes") \ + m(+1, u64 tx_error_packets, "tx_error_packets", "Transmitted error packets") \ + m(+1, u64 tx_error_bytes, "tx_error_bytes", "Transmitted error bytes") \ + m(+1, u64 rx_unicast_packets, "rx_unicast_packets", "Received unicast packets") \ + m(+1, u64 rx_unicast_bytes, "rx_unicast_bytes", "Received unicast bytes") \ + m(+1, u64 tx_unicast_packets, "tx_unicast_packets", "Transmitted unicast packets") \ + m(+1, u64 tx_unicast_bytes, "tx_unicast_bytes", "Transmitted unicast bytes") \ + m(+1, u64 rx_multicast_packets, "rx_multicast_packets", "Received multicast packets") \ + m(+1, u64 rx_multicast_bytes, "rx_multicast_bytes", "Received multicast bytes") \ + m(+1, u64 tx_multicast_packets, "tx_multicast_packets", "Transmitted multicast packets") \ + m(+1, u64 tx_multicast_bytes, "tx_multicast_bytes", "Transmitted multicast bytes") \ + m(+1, u64 rx_broadcast_packets, "rx_broadcast_packets", "Received broadcast packets") \ + m(+1, u64 rx_broadcast_bytes, "rx_broadcast_bytes", "Received broadcast bytes") \ + m(+1, u64 tx_broadcast_packets, "tx_broadcast_packets", "Transmitted broadcast packets") \ + m(+1, u64 tx_broadcast_bytes, "tx_broadcast_bytes", "Transmitted broadcast bytes") \ + m(+1, u64 rx_out_of_buffer, "rx_out_of_buffer", "Receive out of buffer, no recv wqes events") \ + /* SW counters */ \ + m(+1, u64 tso_packets, "tso_packets", "Transmitted TSO packets") \ + m(+1, u64 tso_bytes, 
"tso_bytes", "Transmitted TSO bytes") \ + m(+1, u64 lro_packets, "lro_packets", "Received LRO packets") \ + m(+1, u64 lro_bytes, "lro_bytes", "Received LRO bytes") \ + m(+1, u64 sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO") \ + m(+1, u64 sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO") \ + m(+1, u64 rx_csum_good, "rx_csum_good", "Received checksum valid packets") \ + m(+1, u64 rx_csum_none, "rx_csum_none", "Received no checksum packets") \ + m(+1, u64 tx_csum_offload, "tx_csum_offload", "Transmit checksum offload packets") \ + m(+1, u64 tx_queue_dropped, "tx_queue_dropped", "Transmit queue dropped") \ + m(+1, u64 tx_defragged, "tx_defragged", "Transmit queue defragged") \ + m(+1, u64 rx_wqe_err, "rx_wqe_err", "Receive WQE errors") + +#define MLX5E_VPORT_STATS_NUM (0 MLX5E_VPORT_STATS(MLX5E_STATS_COUNT)) + +struct mlx5e_vport_stats { + struct sysctl_ctx_list ctx; + u64 arg [0]; + MLX5E_VPORT_STATS(MLX5E_STATS_VAR) + u32 rx_out_of_buffer_prev; +}; + +#define MLX5E_PPORT_IEEE802_3_STATS(m) \ + m(+1, u64 frames_tx, "frames_tx", "Frames transmitted") \ + m(+1, u64 frames_rx, "frames_rx", "Frames received") \ + m(+1, u64 check_seq_err, "check_seq_err", "Sequence errors") \ + m(+1, u64 alignment_err, "alignment_err", "Alignment errors") \ + m(+1, u64 octets_tx, "octets_tx", "Bytes transmitted") \ + m(+1, u64 octets_received, "octets_received", "Bytes received") \ + m(+1, u64 multicast_xmitted, "multicast_xmitted", "Multicast transmitted") \ + m(+1, u64 broadcast_xmitted, "broadcast_xmitted", "Broadcast transmitted") \ + m(+1, u64 multicast_rx, "multicast_rx", "Multicast received") \ + m(+1, u64 broadcast_rx, "broadcast_rx", "Broadcast received") \ + m(+1, u64 in_range_len_errors, "in_range_len_errors", "In range length errors") \ + m(+1, u64 out_of_range_len, "out_of_range_len", "Out of range length errors") \ + m(+1, u64 too_long_errors, "too_long_errors", "Too long errors") \ + m(+1, u64 symbol_err, "symbol_err", "Symbol errors") \ + m(+1, u64 mac_control_tx, "mac_control_tx", "MAC control transmitted") \ + m(+1, u64 mac_control_rx, "mac_control_rx", "MAC control received") \ + m(+1, u64 unsupported_op_rx, "unsupported_op_rx", "Unsupported operation received") \ + m(+1, u64 pause_ctrl_rx, "pause_ctrl_rx", "Pause control received") \ + m(+1, u64 pause_ctrl_tx, "pause_ctrl_tx", "Pause control transmitted") + +#define MLX5E_PPORT_RFC2819_STATS(m) \ + m(+1, u64 drop_events, "drop_events", "Dropped events") \ + m(+1, u64 octets, "octets", "Octets") \ + m(+1, u64 pkts, "pkts", "Packets") \ + m(+1, u64 broadcast_pkts, "broadcast_pkts", "Broadcast packets") \ + m(+1, u64 multicast_pkts, "multicast_pkts", "Multicast packets") \ + m(+1, u64 crc_align_errors, "crc_align_errors", "CRC alignment errors") \ + m(+1, u64 undersize_pkts, "undersize_pkts", "Undersized packets") \ + m(+1, u64 oversize_pkts, "oversize_pkts", "Oversized packets") \ + m(+1, u64 fragments, "fragments", "Fragments") \ + m(+1, u64 jabbers, "jabbers", "Jabbers") \ + m(+1, u64 collisions, "collisions", "Collisions") + +#define MLX5E_PPORT_RFC2819_STATS_DEBUG(m) \ + m(+1, u64 p64octets, "p64octets", "Bytes") \ + m(+1, u64 p65to127octets, "p65to127octets", "Bytes") \ + m(+1, u64 p128to255octets, "p128to255octets", "Bytes") \ + m(+1, u64 p256to511octets, "p256to511octets", "Bytes") \ + m(+1, u64 p512to1023octets, "p512to1023octets", "Bytes") \ + m(+1, u64 p1024to1518octets, "p1024to1518octets", "Bytes") \ + m(+1, u64 p1519to2047octets, "p1519to2047octets", "Bytes") \ + m(+1, u64 p2048to4095octets, 
"p2048to4095octets", "Bytes") \ + m(+1, u64 p4096to8191octets, "p4096to8191octets", "Bytes") \ + m(+1, u64 p8192to10239octets, "p8192to10239octets", "Bytes") + +#define MLX5E_PPORT_RFC2863_STATS_DEBUG(m) \ + m(+1, u64 in_octets, "in_octets", "In octets") \ + m(+1, u64 in_ucast_pkts, "in_ucast_pkts", "In unicast packets") \ + m(+1, u64 in_discards, "in_discards", "In discards") \ + m(+1, u64 in_errors, "in_errors", "In errors") \ + m(+1, u64 in_unknown_protos, "in_unknown_protos", "In unknown protocols") \ + m(+1, u64 out_octets, "out_octets", "Out octets") \ + m(+1, u64 out_ucast_pkts, "out_ucast_pkts", "Out unicast packets") \ + m(+1, u64 out_discards, "out_discards", "Out discards") \ + m(+1, u64 out_errors, "out_errors", "Out errors") \ + m(+1, u64 in_multicast_pkts, "in_multicast_pkts", "In multicast packets") \ + m(+1, u64 in_broadcast_pkts, "in_broadcast_pkts", "In broadcast packets") \ + m(+1, u64 out_multicast_pkts, "out_multicast_pkts", "Out multicast packets") \ + m(+1, u64 out_broadcast_pkts, "out_broadcast_pkts", "Out broadcast packets") + +#define MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m) \ + m(+1, u64 time_since_last_clear, "time_since_last_clear", \ + "Time since the last counters clear event (msec)") \ + m(+1, u64 symbol_errors, "symbol_errors", "Symbol errors") \ + m(+1, u64 sync_headers_errors, "sync_headers_errors", "Sync header error counter") \ + m(+1, u64 bip_errors_lane0, "edpl_bip_errors_lane0", \ + "Indicates the number of PRBS errors on lane 0") \ + m(+1, u64 bip_errors_lane1, "edpl_bip_errors_lane1", \ + "Indicates the number of PRBS errors on lane 1") \ + m(+1, u64 bip_errors_lane2, "edpl_bip_errors_lane2", \ + "Indicates the number of PRBS errors on lane 2") \ + m(+1, u64 bip_errors_lane3, "edpl_bip_errors_lane3", \ + "Indicates the number of PRBS errors on lane 3") \ + m(+1, u64 fc_corrected_blocks_lane0, "fc_corrected_blocks_lane0", \ + "FEC correctable block counter lane 0") \ + m(+1, u64 fc_corrected_blocks_lane1, "fc_corrected_blocks_lane1", \ + "FEC correctable block counter lane 1") \ + m(+1, u64 fc_corrected_blocks_lane2, "fc_corrected_blocks_lane2", \ + "FEC correctable block counter lane 2") \ + m(+1, u64 fc_corrected_blocks_lane3, "fc_corrected_blocks_lane3", \ + "FEC correctable block counter lane 3") \ + m(+1, u64 rs_corrected_blocks, "rs_corrected_blocks", \ + "FEC correcable block counter") \ + m(+1, u64 rs_uncorrectable_blocks, "rs_uncorrectable_blocks", \ + "FEC uncorrecable block counter") \ + m(+1, u64 rs_no_errors_blocks, "rs_no_errors_blocks", \ + "The number of RS-FEC blocks received that had no errors") \ + m(+1, u64 rs_single_error_blocks, "rs_single_error_blocks", \ + "The number of corrected RS-FEC blocks received that had" \ + "exactly 1 error symbol") \ + m(+1, u64 rs_corrected_symbols_total, "rs_corrected_symbols_total", \ + "Port FEC corrected symbol counter") \ + m(+1, u64 rs_corrected_symbols_lane0, "rs_corrected_symbols_lane0", \ + "FEC corrected symbol counter lane 0") \ + m(+1, u64 rs_corrected_symbols_lane1, "rs_corrected_symbols_lane1", \ + "FEC corrected symbol counter lane 1") \ + m(+1, u64 rs_corrected_symbols_lane2, "rs_corrected_symbols_lane2", \ + "FEC corrected symbol counter lane 2") \ + m(+1, u64 rs_corrected_symbols_lane3, "rs_corrected_symbols_lane3", \ + "FEC corrected symbol counter lane 3") + +/* Per priority statistics for PFC */ +#define MLX5E_PPORT_PER_PRIO_STATS_SUB(m,n,p) \ + m(n, p, +1, u64, rx_octets, "rx_octets", "Received octets") \ + m(n, p, +1, u64, reserved_0, "reserved_0", "Reserved") \ + m(n, p, 
+1, u64, reserved_1, "reserved_1", "Reserved") \ + m(n, p, +1, u64, reserved_2, "reserved_2", "Reserved") \ + m(n, p, +1, u64, rx_frames, "rx_frames", "Received frames") \ + m(n, p, +1, u64, tx_octets, "tx_octets", "Transmitted octets") \ + m(n, p, +1, u64, reserved_3, "reserved_3", "Reserved") \ + m(n, p, +1, u64, reserved_4, "reserved_4", "Reserved") \ + m(n, p, +1, u64, reserved_5, "reserved_5", "Reserved") \ + m(n, p, +1, u64, tx_frames, "tx_frames", "Transmitted frames") \ + m(n, p, +1, u64, rx_pause, "rx_pause", "Received pause frames") \ + m(n, p, +1, u64, rx_pause_duration, "rx_pause_duration", \ + "Received pause duration") \ + m(n, p, +1, u64, tx_pause, "tx_pause", "Transmitted pause frames") \ + m(n, p, +1, u64, tx_pause_duration, "tx_pause_duration", \ + "Transmitted pause duration") \ + m(n, p, +1, u64, rx_pause_transition, "rx_pause_transition", \ + "Received pause transitions") \ + m(n, p, +1, u64, rx_discards, "rx_discards", "Discarded received frames") \ + m(n, p, +1, u64, device_stall_minor_watermark, \ + "device_stall_minor_watermark", "Device stall minor watermark") \ + m(n, p, +1, u64, device_stall_critical_watermark, \ + "device_stall_critical_watermark", "Device stall critical watermark") + +#define MLX5E_PPORT_PER_PRIO_STATS_PREFIX(m,p,c,t,f,s,d) \ + m(c, t pri_##p##_##f, "prio" #p "_" s, "Priority " #p " - " d) + +#define MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO 8 + +#define MLX5E_PPORT_PER_PRIO_STATS(m) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,0) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,1) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,2) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,3) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,4) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,5) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,6) \ + MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,7) + +/* + * Make sure to update mlx5e_update_pport_counters() + * when adding a new MLX5E_PPORT_STATS block + */ +#define MLX5E_PPORT_STATS(m) \ + MLX5E_PPORT_PER_PRIO_STATS(m) \ + MLX5E_PPORT_IEEE802_3_STATS(m) \ + MLX5E_PPORT_RFC2819_STATS(m) + +#define MLX5E_PORT_STATS_DEBUG(m) \ + MLX5E_PPORT_RFC2819_STATS_DEBUG(m) \ + MLX5E_PPORT_RFC2863_STATS_DEBUG(m) \ + MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m) + +#define MLX5E_PPORT_IEEE802_3_STATS_NUM \ + (0 MLX5E_PPORT_IEEE802_3_STATS(MLX5E_STATS_COUNT)) +#define MLX5E_PPORT_RFC2819_STATS_NUM \ + (0 MLX5E_PPORT_RFC2819_STATS(MLX5E_STATS_COUNT)) +#define MLX5E_PPORT_STATS_NUM \ + (0 MLX5E_PPORT_STATS(MLX5E_STATS_COUNT)) + +#define MLX5E_PPORT_PER_PRIO_STATS_NUM \ + (0 MLX5E_PPORT_PER_PRIO_STATS(MLX5E_STATS_COUNT)) +#define MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM \ + (0 MLX5E_PPORT_RFC2819_STATS_DEBUG(MLX5E_STATS_COUNT)) +#define MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM \ + (0 MLX5E_PPORT_RFC2863_STATS_DEBUG(MLX5E_STATS_COUNT)) +#define MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM \ + (0 MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(MLX5E_STATS_COUNT)) +#define MLX5E_PORT_STATS_DEBUG_NUM \ + (0 MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_COUNT)) + +struct mlx5e_pport_stats { + struct sysctl_ctx_list ctx; + u64 arg [0]; + MLX5E_PPORT_STATS(MLX5E_STATS_VAR) +}; + +struct mlx5e_port_stats_debug { + struct sysctl_ctx_list ctx; + u64 arg [0]; + MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_VAR) +}; + +#define MLX5E_RQ_STATS(m) \ + m(+1, u64 packets, "packets", "Received packets") \ + m(+1, 
u64 csum_none, "csum_none", "Received packets") \ + m(+1, u64 lro_packets, "lro_packets", "Received packets") \ + m(+1, u64 lro_bytes, "lro_bytes", "Received packets") \ + m(+1, u64 sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO") \ + m(+1, u64 sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO") \ + m(+1, u64 wqe_err, "wqe_err", "Received packets") + +#define MLX5E_RQ_STATS_NUM (0 MLX5E_RQ_STATS(MLX5E_STATS_COUNT)) + +struct mlx5e_rq_stats { + struct sysctl_ctx_list ctx; + u64 arg [0]; + MLX5E_RQ_STATS(MLX5E_STATS_VAR) +}; + +#define MLX5E_SQ_STATS(m) \ + m(+1, u64 packets, "packets", "Transmitted packets") \ + m(+1, u64 tso_packets, "tso_packets", "Transmitted packets") \ + m(+1, u64 tso_bytes, "tso_bytes", "Transmitted bytes") \ + m(+1, u64 csum_offload_none, "csum_offload_none", "Transmitted packets") \ + m(+1, u64 defragged, "defragged", "Transmitted packets") \ + m(+1, u64 dropped, "dropped", "Transmitted packets") \ + m(+1, u64 nop, "nop", "Transmitted packets") + +#define MLX5E_SQ_STATS_NUM (0 MLX5E_SQ_STATS(MLX5E_STATS_COUNT)) + +struct mlx5e_sq_stats { + struct sysctl_ctx_list ctx; + u64 arg [0]; + MLX5E_SQ_STATS(MLX5E_STATS_VAR) +}; + +struct mlx5e_stats { + struct mlx5e_vport_stats vport; + struct mlx5e_pport_stats pport; + struct mlx5e_port_stats_debug port_stats_debug; +}; + +struct mlx5e_rq_param { + u32 rqc [MLX5_ST_SZ_DW(rqc)]; + struct mlx5_wq_param wq; +}; + +struct mlx5e_sq_param { + u32 sqc [MLX5_ST_SZ_DW(sqc)]; + struct mlx5_wq_param wq; +}; + +struct mlx5e_cq_param { + u32 cqc [MLX5_ST_SZ_DW(cqc)]; + struct mlx5_wq_param wq; +}; + +struct mlx5e_params { + u8 log_sq_size; + u8 log_rq_size; + u16 num_channels; + u8 default_vlan_prio; + u8 num_tc; + u8 rx_cq_moderation_mode; + u8 tx_cq_moderation_mode; + u16 rx_cq_moderation_usec; + u16 rx_cq_moderation_pkts; + u16 tx_cq_moderation_usec; + u16 tx_cq_moderation_pkts; + u16 min_rx_wqes; + bool hw_lro_en; + bool cqe_zipping_en; + u32 lro_wqe_sz; + u16 rx_hash_log_tbl_sz; + u32 tx_pauseframe_control __aligned(4); + u32 rx_pauseframe_control __aligned(4); + u32 tx_priority_flow_control __aligned(4); + u32 rx_priority_flow_control __aligned(4); + u16 tx_max_inline; + u8 tx_min_inline_mode; + u8 channels_rsss; +}; + +#define MLX5E_PARAMS(m) \ + m(+1, u64 tx_queue_size_max, "tx_queue_size_max", "Max send queue size") \ + m(+1, u64 rx_queue_size_max, "rx_queue_size_max", "Max receive queue size") \ + m(+1, u64 tx_queue_size, "tx_queue_size", "Default send queue size") \ + m(+1, u64 rx_queue_size, "rx_queue_size", "Default receive queue size") \ + m(+1, u64 channels, "channels", "Default number of channels") \ + m(+1, u64 channels_rsss, "channels_rsss", "Default channels receive side scaling stride") \ + m(+1, u64 coalesce_usecs_max, "coalesce_usecs_max", "Maximum usecs for joining packets") \ + m(+1, u64 coalesce_pkts_max, "coalesce_pkts_max", "Maximum packets to join") \ + m(+1, u64 rx_coalesce_usecs, "rx_coalesce_usecs", "Limit in usec for joining rx packets") \ + m(+1, u64 rx_coalesce_pkts, "rx_coalesce_pkts", "Maximum number of rx packets to join") \ + m(+1, u64 rx_coalesce_mode, "rx_coalesce_mode", "0: EQE mode 1: CQE mode") \ + m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \ + m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \ + m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \ + m(+1, u64 tx_bufring_disable, "tx_bufring_disable", "0: Enable bufring 1: Disable bufring") \ + m(+1, u64 
tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \ + m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \ + m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \ + m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled") \ + m(+1, u64 modify_tx_dma, "modify_tx_dma", "0: Enable TX 1: Disable TX") \ + m(+1, u64 modify_rx_dma, "modify_rx_dma", "0: Enable RX 1: Disable RX") \ + m(+1, u64 diag_pci_enable, "diag_pci_enable", "0: Disabled 1: Enabled") \ + m(+1, u64 diag_general_enable, "diag_general_enable", "0: Disabled 1: Enabled") \ + m(+1, u64 hw_mtu, "hw_mtu", "Current hardware MTU value") \ + m(+1, u64 mc_local_lb, "mc_local_lb", "0: Local multicast loopback enabled 1: Disabled") \ + m(+1, u64 uc_local_lb, "uc_local_lb", "0: Local unicast loopback enabled 1: Disabled") + + +#define MLX5E_PARAMS_NUM (0 MLX5E_PARAMS(MLX5E_STATS_COUNT)) + +struct mlx5e_params_ethtool { + u64 arg [0]; + MLX5E_PARAMS(MLX5E_STATS_VAR) + u64 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + u8 prio_tc[IEEE_8021QAZ_MAX_TCS]; + u8 dscp2prio[MLX5_MAX_SUPPORTED_DSCP]; + u8 trust_state; +}; + +/* EEPROM Standards for plug in modules */ +#ifndef MLX5E_ETH_MODULE_SFF_8472 +#define MLX5E_ETH_MODULE_SFF_8472 0x1 +#define MLX5E_ETH_MODULE_SFF_8472_LEN 128 +#endif + +#ifndef MLX5E_ETH_MODULE_SFF_8636 +#define MLX5E_ETH_MODULE_SFF_8636 0x2 +#define MLX5E_ETH_MODULE_SFF_8636_LEN 256 +#endif + +#ifndef MLX5E_ETH_MODULE_SFF_8436 +#define MLX5E_ETH_MODULE_SFF_8436 0x3 +#define MLX5E_ETH_MODULE_SFF_8436_LEN 256 +#endif + +/* EEPROM I2C Addresses */ +#define MLX5E_I2C_ADDR_LOW 0x50 +#define MLX5E_I2C_ADDR_HIGH 0x51 + +#define MLX5E_EEPROM_LOW_PAGE 0x0 +#define MLX5E_EEPROM_HIGH_PAGE 0x3 + +#define MLX5E_EEPROM_HIGH_PAGE_OFFSET 128 +#define MLX5E_EEPROM_PAGE_LENGTH 256 + +#define MLX5E_EEPROM_INFO_BYTES 0x3 + +struct mlx5e_cq { + /* data path - accessed per cqe */ + struct mlx5_cqwq wq; + + /* data path - accessed per HW polling */ + struct mlx5_core_cq mcq; + + /* control */ + struct mlx5e_priv *priv; + struct mlx5_wq_ctrl wq_ctrl; +} __aligned(MLX5E_CACHELINE_SIZE); + +struct mlx5e_rq_mbuf { + bus_dmamap_t dma_map; + caddr_t data; + struct mbuf *mbuf; +}; + +struct mlx5e_rq { + /* data path */ + struct mlx5_wq_ll wq; + struct mtx mtx; + bus_dma_tag_t dma_tag; + u32 wqe_sz; + u32 nsegs; + struct mlx5e_rq_mbuf *mbuf; + struct ifnet *ifp; + struct mlx5e_rq_stats stats; + struct mlx5e_cq cq; + struct lro_ctrl lro; + volatile int enabled; + int ix; + + /* control */ + struct mlx5_wq_ctrl wq_ctrl; + u32 rqn; + struct mlx5e_channel *channel; + struct callout watchdog; +} __aligned(MLX5E_CACHELINE_SIZE); + +struct mlx5e_sq_mbuf { + bus_dmamap_t dma_map; + struct mbuf *mbuf; + u32 num_bytes; + u32 num_wqebbs; +}; + +enum { + MLX5E_SQ_READY, + MLX5E_SQ_FULL +}; + +struct mlx5e_sq { + /* data path */ + struct mtx lock; + bus_dma_tag_t dma_tag; + struct mtx comp_lock; + + /* dirtied @completion */ + u16 cc; + + /* dirtied @xmit */ + u16 pc __aligned(MLX5E_CACHELINE_SIZE); + u16 bf_offset; + u16 cev_counter; /* completion event counter */ + u16 cev_factor; /* completion event factor */ + u16 cev_next_state; /* next completion event state */ +#define MLX5E_CEV_STATE_INITIAL 0 /* timer not started */ +#define MLX5E_CEV_STATE_SEND_NOPS 1 /* send NOPs */ +#define MLX5E_CEV_STATE_HOLD_NOPS 2 /* don't send NOPs yet */ + u16 stopped; /* set if SQ is stopped */ + struct callout cev_callout; + union { + u32 d32[2]; + u64 d64; + } doorbell; + struct mlx5e_sq_stats stats; + + struct 
mlx5e_cq cq; + struct task sq_task; + struct taskqueue *sq_tq; + + /* pointers to per packet info: write@xmit, read@completion */ + struct mlx5e_sq_mbuf *mbuf; + struct buf_ring *br; + + /* read only */ + struct mlx5_wq_cyc wq; + struct mlx5_uar uar; + struct ifnet *ifp; + u32 sqn; + u32 bf_buf_size; + u32 mkey_be; + u16 max_inline; + u8 min_inline_mode; + u8 vlan_inline_cap; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_priv *priv; + int tc; + unsigned int queue_state; +} __aligned(MLX5E_CACHELINE_SIZE); + +static inline bool +mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n) +{ + u16 cc = sq->cc; + u16 pc = sq->pc; + + return ((sq->wq.sz_m1 & (cc - pc)) >= n || cc == pc); +} + +struct mlx5e_channel { + /* data path */ + struct mlx5e_rq rq; + struct mlx5e_sq sq[MLX5E_MAX_TX_NUM_TC]; + struct ifnet *ifp; + u32 mkey_be; + u8 num_tc; + + /* control */ + struct mlx5e_priv *priv; + int ix; + int cpu; +} __aligned(MLX5E_CACHELINE_SIZE); + +enum mlx5e_traffic_types { + MLX5E_TT_IPV4_TCP, + MLX5E_TT_IPV6_TCP, + MLX5E_TT_IPV4_UDP, + MLX5E_TT_IPV6_UDP, + MLX5E_TT_IPV4_IPSEC_AH, + MLX5E_TT_IPV6_IPSEC_AH, + MLX5E_TT_IPV4_IPSEC_ESP, + MLX5E_TT_IPV6_IPSEC_ESP, + MLX5E_TT_IPV4, + MLX5E_TT_IPV6, + MLX5E_TT_ANY, + MLX5E_NUM_TT, +}; + +enum { + MLX5E_RQT_SPREADING = 0, + MLX5E_RQT_DEFAULT_RQ = 1, + MLX5E_NUM_RQT = 2, +}; + +struct mlx5_flow_rule; + +struct mlx5e_eth_addr_info { + u8 addr [ETH_ALEN + 2]; + u32 tt_vec; + /* flow table rule per traffic type */ + struct mlx5_flow_rule *ft_rule[MLX5E_NUM_TT]; +}; + +#define MLX5E_ETH_ADDR_HASH_SIZE (1 << BITS_PER_BYTE) + +struct mlx5e_eth_addr_hash_node; + +struct mlx5e_eth_addr_hash_head { + struct mlx5e_eth_addr_hash_node *lh_first; +}; + +struct mlx5e_eth_addr_db { + struct mlx5e_eth_addr_hash_head if_uc[MLX5E_ETH_ADDR_HASH_SIZE]; + struct mlx5e_eth_addr_hash_head if_mc[MLX5E_ETH_ADDR_HASH_SIZE]; + struct mlx5e_eth_addr_info broadcast; + struct mlx5e_eth_addr_info allmulti; + struct mlx5e_eth_addr_info promisc; + bool broadcast_enabled; + bool allmulti_enabled; + bool promisc_enabled; +}; + +enum { + MLX5E_STATE_ASYNC_EVENTS_ENABLE, + MLX5E_STATE_OPENED, +}; + +enum { + MLX5_BW_NO_LIMIT = 0, + MLX5_100_MBPS_UNIT = 3, + MLX5_GBPS_UNIT = 4, +}; + +struct mlx5e_vlan_db { + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + struct mlx5_flow_rule *active_vlans_ft_rule[VLAN_N_VID]; + struct mlx5_flow_rule *untagged_ft_rule; + struct mlx5_flow_rule *any_cvlan_ft_rule; + struct mlx5_flow_rule *any_svlan_ft_rule; + bool filter_disabled; +}; + +struct mlx5e_flow_table { + int num_groups; + struct mlx5_flow_table *t; + struct mlx5_flow_group **g; +}; + +struct mlx5e_flow_tables { + struct mlx5_flow_namespace *ns; + struct mlx5e_flow_table vlan; + struct mlx5e_flow_table main; + struct mlx5e_flow_table inner_rss; +}; + +#ifdef RATELIMIT +#include "en_rl.h" +#endif + +#define MLX5E_TSTMP_PREC 10 + +struct mlx5e_clbr_point { + uint64_t base_curr; + uint64_t base_prev; + uint64_t clbr_hw_prev; + uint64_t clbr_hw_curr; + u_int clbr_gen; +}; + +struct mlx5e_priv { + struct mlx5_core_dev *mdev; /* must be first */ + + /* priv data path fields - start */ + int order_base_2_num_channels; + int queue_mapping_channel_mask; + int num_tc; + int default_vlan_prio; + /* priv data path fields - end */ + + unsigned long state; + int gone; +#define PRIV_LOCK(priv) sx_xlock(&(priv)->state_lock) +#define PRIV_UNLOCK(priv) sx_xunlock(&(priv)->state_lock) +#define PRIV_LOCKED(priv) sx_xlocked(&(priv)->state_lock) + struct sx state_lock; /* Protects Interface 
state */ + struct mlx5_uar cq_uar; + u32 pdn; + u32 tdn; + struct mlx5_core_mr mr; + + struct mlx5e_channel *volatile *channel; + u32 tisn[MLX5E_MAX_TX_NUM_TC]; + u32 rqtn; + u32 tirn[MLX5E_NUM_TT]; + + struct mlx5e_flow_tables fts; + struct mlx5e_eth_addr_db eth_addr; + struct mlx5e_vlan_db vlan; + + struct mlx5e_params params; + struct mlx5e_params_ethtool params_ethtool; + union mlx5_core_pci_diagnostics params_pci; + union mlx5_core_general_diagnostics params_general; + struct mtx async_events_mtx; /* sync hw events */ + struct work_struct update_stats_work; + struct work_struct update_carrier_work; + struct work_struct set_rx_mode_work; + MLX5_DECLARE_DOORBELL_LOCK(doorbell_lock) + + struct ifnet *ifp; + struct sysctl_ctx_list sysctl_ctx; + struct sysctl_oid *sysctl_ifnet; + struct sysctl_oid *sysctl_hw; + int sysctl_debug; + struct mlx5e_stats stats; + struct sysctl_ctx_list sysctl_ctx_channel_debug; + int counter_set_id; + + struct workqueue_struct *wq; + + eventhandler_tag vlan_detach; + eventhandler_tag vlan_attach; + struct ifmedia media; + int media_status_last; + int media_active_last; + + struct callout watchdog; +#ifdef RATELIMIT + struct mlx5e_rl_priv_data rl; +#endif + + struct callout tstmp_clbr; + int clbr_done; + int clbr_curr; + struct mlx5e_clbr_point clbr_points[2]; + u_int clbr_gen; +}; + +#define MLX5E_NET_IP_ALIGN 2 + +struct mlx5e_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_eth_seg eth; +}; + +struct mlx5e_rx_wqe { + struct mlx5_wqe_srq_next_seg next; + struct mlx5_wqe_data_seg data[]; +}; + +/* the size of the structure above must be power of two */ +CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe))); + +struct mlx5e_eeprom { + int lock_bit; + int i2c_addr; + int page_num; + int device_addr; + int module_num; + int len; + int type; + int page_valid; + u32 *data; +}; + +/* + * This structure contains rate limit extension to the IEEE 802.1Qaz ETS + * managed object. + * Values are 64 bits long and specified in Kbps to enable usage over both + * slow and very fast networks. 
+ * + * @tc_maxrate: maximal tc tx bandwidth indexed by traffic class + */ +struct ieee_maxrate { + __u64 tc_maxrate[IEEE_8021QAZ_MAX_TCS]; +}; + + +#define MLX5E_FLD_MAX(typ, fld) ((1ULL << __mlx5_bit_sz(typ, fld)) - 1ULL) + +int mlx5e_xmit(struct ifnet *, struct mbuf *); + +int mlx5e_open_locked(struct ifnet *); +int mlx5e_close_locked(struct ifnet *); + +void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event); +void mlx5e_rx_cq_comp(struct mlx5_core_cq *); +void mlx5e_tx_cq_comp(struct mlx5_core_cq *); +struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq); +void mlx5e_tx_que(void *context, int pending); + +int mlx5e_open_flow_table(struct mlx5e_priv *priv); +void mlx5e_close_flow_table(struct mlx5e_priv *priv); +void mlx5e_set_rx_mode_core(struct mlx5e_priv *priv); +void mlx5e_set_rx_mode_work(struct work_struct *work); + +void mlx5e_vlan_rx_add_vid(void *, struct ifnet *, u16); +void mlx5e_vlan_rx_kill_vid(void *, struct ifnet *, u16); +void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv); +void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv); +int mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv); +void mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv); + +static inline void +mlx5e_tx_notify_hw(struct mlx5e_sq *sq, u32 *wqe, int bf_sz) +{ + u16 ofst = MLX5_BF_OFFSET + sq->bf_offset; + + /* ensure wqe is visible to device before updating doorbell record */ + wmb(); + + *sq->wq.db = cpu_to_be32(sq->pc); + + /* + * Ensure the doorbell record is visible to device before ringing + * the doorbell: + */ + wmb(); + + if (bf_sz) { + __iowrite64_copy(sq->uar.bf_map + ofst, wqe, bf_sz); + + /* flush the write-combining mapped buffer */ + wmb(); + + } else { + mlx5_write64(wqe, sq->uar.map + ofst, + MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock)); + } + + sq->bf_offset ^= sq->bf_buf_size; +} + +static inline void +mlx5e_cq_arm(struct mlx5e_cq *cq, spinlock_t *dblock) +{ + struct mlx5_core_cq *mcq; + + mcq = &cq->mcq; + mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, dblock, cq->wq.cc); +} + +extern const struct ethtool_ops mlx5e_ethtool_ops; +void mlx5e_create_ethtool(struct mlx5e_priv *); +void mlx5e_create_stats(struct sysctl_ctx_list *, + struct sysctl_oid_list *, const char *, + const char **, unsigned, u64 *); +void mlx5e_send_nop(struct mlx5e_sq *, u32); +void mlx5e_sq_cev_timeout(void *); +int mlx5e_refresh_channel_params(struct mlx5e_priv *); +int mlx5e_open_cq(struct mlx5e_priv *, struct mlx5e_cq_param *, + struct mlx5e_cq *, mlx5e_cq_comp_t *, int eq_ix); +void mlx5e_close_cq(struct mlx5e_cq *); +void mlx5e_free_sq_db(struct mlx5e_sq *); +int mlx5e_alloc_sq_db(struct mlx5e_sq *); +int mlx5e_enable_sq(struct mlx5e_sq *, struct mlx5e_sq_param *, int tis_num); +int mlx5e_modify_sq(struct mlx5e_sq *, int curr_state, int next_state); +void mlx5e_disable_sq(struct mlx5e_sq *); +void mlx5e_drain_sq(struct mlx5e_sq *); +void mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value); +void mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value); +void mlx5e_resume_sq(struct mlx5e_sq *sq); +u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev); + +#endif /* _MLX5_EN_H_ */ diff --git a/sys/dev/mlx5/mlx5_en/en_rl.h b/sys/dev/mlx5/mlx5_en/en_rl.h new file mode 100644 index 000000000000..4e2c6c539857 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/en_rl.h @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __MLX5_EN_RL_H__ +#define __MLX5_EN_RL_H__ + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sx.h> +#include <sys/proc.h> +#include <sys/condvar.h> +#include <sys/interrupt.h> +#include <sys/unistd.h> + +#include <sys/queue.h> + +#define MLX5E_RL_MAX_WORKERS 128 /* limited by Toeplitz hash */ +#define MLX5E_RL_MAX_TX_RATES (64 * 1024) /* software limit */ +#define MLX5E_RL_DEF_SQ_PER_WORKER (12 * 1024) /* software limit */ +#define MLX5E_RL_MAX_SQS (120 * 1024) /* software limit */ + +#define MLX5E_RL_TX_COAL_USEC_DEFAULT 32 +#define MLX5E_RL_TX_COAL_PKTS_DEFAULT 4 +#define MLX5E_RL_TX_COAL_MODE_DEFAULT 0 +#define MLX5E_RL_TX_COMP_FACT_DEFAULT 1 + +#define MLX5E_RL_WORKER_LOCK(rlw) mtx_lock(&(rlw)->mtx) +#define MLX5E_RL_WORKER_UNLOCK(rlw) mtx_unlock(&(rlw)->mtx) + +#define MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock) +#define MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock) + +#define MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock) +#define MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock) + +#define MLX5E_RL_PARAMS(m) \ + m(+1, u64 tx_queue_size, "tx_queue_size", "Default send queue size") \ + m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining TX packets") \ + m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \ + m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \ + m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \ + m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \ + m(+1, u64 tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \ + m(+1, u64 tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \ + m(+1, u64 tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \ + m(+1, u64 tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \ + m(+1, u64 tx_rates_max, "tx_rates_max", "Max number of TX rates") \ + m(+1, u64 tx_rates_def, "tx_rates_def", "Default number of TX rates") \ + m(+1, u64 tx_limit_min, "tx_limit_min", "Minimum TX rate 
in bits/s") \ + m(+1, u64 tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \ + m(+1, u64 tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \ + m(+1, u64 tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \ + m(+1, u64 tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets") + +#define MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT)) + +#define MLX5E_RL_STATS(m) \ + m(+1, u64 tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \ + m(+1, u64 tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \ + m(+1, u64 tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \ + m(+1, u64 tx_active_connections, "tx_active_connections", "Number of active connections") \ + m(+1, u64 tx_open_queues, "tx_open_queues", "Number of open TX queues") \ + m(+1, u64 tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available") + +#define MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT)) + +#define MLX5E_RL_TABLE_PARAMS(m) \ + m(+1, u64 tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \ + m(+1, u64 tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \ + m(+1, u64 tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \ + m(+1, u64 tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \ + m(+1, u64 tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000") + +#define MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT)) + +#define MLX5E_RL_PARAMS_INDEX(n) \ + (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t)) + +struct mlx5e_priv; + +/* Indicates channel's state */ +enum { + MLX5E_RL_ST_FREE, + MLX5E_RL_ST_USED, + MLX5E_RL_ST_MODIFY, + MLX5E_RL_ST_DESTROY, +}; + +struct mlx5e_rl_stats { + u64 arg [0]; + MLX5E_RL_STATS(MLX5E_STATS_VAR) +}; + +struct mlx5e_rl_params { + u64 arg [0]; + MLX5E_RL_PARAMS(MLX5E_STATS_VAR) + u64 table_arg [0]; + MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR) +}; + +struct mlx5e_rl_channel_param { + struct mlx5e_sq_param sq; + struct mlx5e_cq_param cq; +}; + +struct mlx5e_rl_channel { + struct m_snd_tag m_snd_tag; + STAILQ_ENTRY(mlx5e_rl_channel) entry; + struct mlx5e_sq * volatile sq; + struct mlx5e_rl_worker *worker; + uint64_t new_rate; + uint64_t init_rate; + uint64_t last_rate; + uint16_t last_burst; + uint16_t state; +}; + +struct mlx5e_rl_worker { + struct mtx mtx; + struct cv cv; + STAILQ_HEAD(, mlx5e_rl_channel) index_list_head; + STAILQ_HEAD(, mlx5e_rl_channel) process_head; + struct mlx5e_priv *priv; + struct mlx5e_rl_channel *channels; + unsigned worker_done; +}; + +struct mlx5e_rl_priv_data { + struct sx rl_sxlock; + struct sysctl_ctx_list ctx; + struct mlx5e_rl_channel_param chan_param; + struct mlx5e_rl_params param; + struct mlx5e_rl_stats stats; + struct mlx5_uar sq_uar; + struct mlx5e_rl_worker *workers; + struct mlx5e_priv *priv; + uint64_t *rate_limit_table; + unsigned opened; + uint32_t tisn; +}; + +int mlx5e_rl_init(struct mlx5e_priv *priv); +void mlx5e_rl_cleanup(struct mlx5e_priv *priv); +if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc; +if_snd_tag_modify_t mlx5e_rl_snd_tag_modify; +if_snd_tag_query_t mlx5e_rl_snd_tag_query; +if_snd_tag_free_t 
mlx5e_rl_snd_tag_free; + +#endif /* __MLX5_EN_RL_H__ */ diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c new file mode 100644 index 000000000000..85b1fe85617f --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c @@ -0,0 +1,1203 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "en.h" +#include <net/sff8472.h> + +void +mlx5e_create_stats(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *parent, const char *buffer, + const char **desc, unsigned num, u64 * arg) +{ + struct sysctl_oid *node; + unsigned x; + + sysctl_ctx_init(ctx); + + node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, + buffer, CTLFLAG_RD, NULL, "Statistics"); + if (node == NULL) + return; + for (x = 0; x != num; x++) { + SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, + desc[2 * x], CTLFLAG_RD, arg + x, desc[2 * x + 1]); + } +} + +static void +mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv) +{ + /* + * Limit the maximum distance between completion events to + * half of the currently set TX queue size. + * + * The maximum number of queue entries a single IP packet can + * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. + * + * The worst case max value is then given as below: + */ + uint64_t max = priv->params_ethtool.tx_queue_size / + (2 * MLX5_SEND_WQE_MAX_WQEBBS); + + /* + * Update the maximum completion factor value in case the + * tx_queue_size field changed. Ensure we don't overflow + * 16-bits. 
+ */ + if (max < 1) + max = 1; + else if (max > 65535) + max = 65535; + priv->params_ethtool.tx_completion_fact_max = max; + + /* + * Verify that the current TX completion factor is within the + * given limits: + */ + if (priv->params_ethtool.tx_completion_fact < 1) + priv->params_ethtool.tx_completion_fact = 1; + else if (priv->params_ethtool.tx_completion_fact > max) + priv->params_ethtool.tx_completion_fact = max; +} + +static int +mlx5e_getmaxrate(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + int err; + int i; + + PRIV_LOCK(priv); + err = -mlx5_query_port_tc_rate_limit(mdev, max_bw_value, max_bw_unit); + if (err) + goto done; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + switch (max_bw_unit[i]) { + case MLX5_100_MBPS_UNIT: + priv->params_ethtool.max_bw_value[i] = max_bw_value[i] * MLX5E_100MB; + break; + case MLX5_GBPS_UNIT: + priv->params_ethtool.max_bw_value[i] = max_bw_value[i] * MLX5E_1GB; + break; + case MLX5_BW_NO_LIMIT: + priv->params_ethtool.max_bw_value[i] = 0; + break; + default: + priv->params_ethtool.max_bw_value[i] = -1; + WARN_ONCE(true, "non-supported BW unit"); + break; + } + } +done: + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_get_dscp(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (MLX5_CAP_GEN(mdev, qcam_reg) == 0 || + MLX5_CAP_QCAM_REG(mdev, qpts) == 0 || + MLX5_CAP_QCAM_REG(mdev, qpdpm) == 0) + return (EOPNOTSUPP); + + PRIV_LOCK(priv); + err = -mlx5_query_dscp2prio(mdev, priv->params_ethtool.dscp2prio); + if (err) + goto done; + + err = -mlx5_query_trust_state(mdev, &priv->params_ethtool.trust_state); + if (err) + goto done; +done: + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_tc_maxrate_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + int prio_index = arg2; + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + int i, err; + u64 bw_val; + u64 result = priv->params_ethtool.max_bw_value[prio_index]; + const u64 upper_limit_mbps = 255 * MLX5E_100MB; + const u64 upper_limit_gbps = 255 * MLX5E_1GB; + + PRIV_LOCK(priv); + err = sysctl_handle_64(oidp, &result, 0, req); + if (err || !req->newptr || + result == priv->params_ethtool.max_bw_value[prio_index]) + goto done; + + if (result % MLX5E_100MB) { + err = ERANGE; + goto done; + } + + memset(max_bw_value, 0, sizeof(max_bw_value)); + memset(max_bw_unit, 0, sizeof(max_bw_unit)); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + bw_val = (i == prio_index) ? 
result : priv->params_ethtool.max_bw_value[i]; + + if (!bw_val) { + max_bw_unit[i] = MLX5_BW_NO_LIMIT; + } else if (bw_val > upper_limit_gbps) { + result = 0; + max_bw_unit[i] = MLX5_BW_NO_LIMIT; + } else if (bw_val <= upper_limit_mbps) { + max_bw_value[i] = howmany(bw_val, MLX5E_100MB); + max_bw_unit[i] = MLX5_100_MBPS_UNIT; + } else { + max_bw_value[i] = howmany(bw_val, MLX5E_1GB); + max_bw_unit[i] = MLX5_GBPS_UNIT; + } + } + + err = -mlx5_modify_port_tc_rate_limit(mdev, max_bw_value, max_bw_unit); + if (err) + goto done; + + priv->params_ethtool.max_bw_value[prio_index] = result; +done: + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_get_prio_tc(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err = 0; + int i; + + PRIV_LOCK(priv); + if (!MLX5_CAP_GEN(priv->mdev, ets)) { + PRIV_UNLOCK(priv); + return (EOPNOTSUPP); + } + + for (i = 0; i <= mlx5_max_tc(priv->mdev); i++) { + err = -mlx5_query_port_prio_tc(mdev, i, &(priv->params_ethtool.prio_tc[i])); + if (err) + break; + } + + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_prio_to_tc_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + int prio_index = arg2; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + uint8_t result = priv->params_ethtool.prio_tc[prio_index]; + + PRIV_LOCK(priv); + err = sysctl_handle_8(oidp, &result, 0, req); + if (err || !req->newptr || + result == priv->params_ethtool.prio_tc[prio_index]) + goto done; + + if (result > mlx5_max_tc(mdev)) { + err = ERANGE; + goto done; + } + + err = -mlx5_set_port_prio_tc(mdev, prio_index, result); + if (err) + goto done; + + priv->params_ethtool.prio_tc[prio_index] = result; + +done: + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_trust_state_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + u8 result; + + PRIV_LOCK(priv); + result = priv->params_ethtool.trust_state; + err = sysctl_handle_8(oidp, &result, 0, req); + if (err || !req->newptr || + result == priv->params_ethtool.trust_state) + goto done; + + switch (result) { + case MLX5_QPTS_TRUST_PCP: + case MLX5_QPTS_TRUST_DSCP: + break; + case MLX5_QPTS_TRUST_BOTH: + if (!MLX5_CAP_QCAM_FEATURE(mdev, qpts_trust_both)) { + err = EOPNOTSUPP; + goto done; + } + break; + default: + err = ERANGE; + goto done; + } + + err = -mlx5_set_trust_state(mdev, result); + if (err) + goto done; + + priv->params_ethtool.trust_state = result; +done: + PRIV_UNLOCK(priv); + return (err); +} + +static int +mlx5e_dscp_prio_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + int prio_index = arg2; + struct mlx5_core_dev *mdev = priv->mdev; + uint8_t dscp2prio[MLX5_MAX_SUPPORTED_DSCP]; + uint8_t x; + int err; + + PRIV_LOCK(priv); + err = SYSCTL_OUT(req, priv->params_ethtool.dscp2prio + prio_index, + sizeof(priv->params_ethtool.dscp2prio) / 8); + if (err || !req->newptr) + goto done; + + memcpy(dscp2prio, priv->params_ethtool.dscp2prio, sizeof(dscp2prio)); + err = SYSCTL_IN(req, dscp2prio + prio_index, sizeof(dscp2prio) / 8); + if (err) + goto done; + for (x = 0; x != MLX5_MAX_SUPPORTED_DSCP; x++) { + if (dscp2prio[x] > 7) { + err = ERANGE; + goto done; + } + } + err = -mlx5_set_dscp2prio(mdev, dscp2prio); + if (err) + goto done; + + /* update local array */ + memcpy(priv->params_ethtool.dscp2prio, dscp2prio, + sizeof(priv->params_ethtool.dscp2prio)); +done: + PRIV_UNLOCK(priv); + return (err); +} + +#define MLX5_PARAM_OFFSET(n) \ + __offsetof(struct mlx5e_priv, params_ethtool.n) + 
+static int +mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + uint64_t value; + int mode_modify; + int was_opened; + int error; + + PRIV_LOCK(priv); + value = priv->params_ethtool.arg[arg2]; + if (req != NULL) { + error = sysctl_handle_64(oidp, &value, 0, req); + if (error || req->newptr == NULL || + value == priv->params_ethtool.arg[arg2]) + goto done; + + /* assign new value */ + priv->params_ethtool.arg[arg2] = value; + } else { + error = 0; + } + /* check if device is gone */ + if (priv->gone) { + error = ENXIO; + goto done; + } + was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); + + switch (MLX5_PARAM_OFFSET(arg[arg2])) { + case MLX5_PARAM_OFFSET(rx_coalesce_usecs): + /* import RX coal time */ + if (priv->params_ethtool.rx_coalesce_usecs < 1) + priv->params_ethtool.rx_coalesce_usecs = 0; + else if (priv->params_ethtool.rx_coalesce_usecs > + MLX5E_FLD_MAX(cqc, cq_period)) { + priv->params_ethtool.rx_coalesce_usecs = + MLX5E_FLD_MAX(cqc, cq_period); + } + priv->params.rx_cq_moderation_usec = + priv->params_ethtool.rx_coalesce_usecs; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_refresh_channel_params(priv); + break; + + case MLX5_PARAM_OFFSET(rx_coalesce_pkts): + /* import RX coal pkts */ + if (priv->params_ethtool.rx_coalesce_pkts < 1) + priv->params_ethtool.rx_coalesce_pkts = 0; + else if (priv->params_ethtool.rx_coalesce_pkts > + MLX5E_FLD_MAX(cqc, cq_max_count)) { + priv->params_ethtool.rx_coalesce_pkts = + MLX5E_FLD_MAX(cqc, cq_max_count); + } + priv->params.rx_cq_moderation_pkts = + priv->params_ethtool.rx_coalesce_pkts; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_refresh_channel_params(priv); + break; + + case MLX5_PARAM_OFFSET(tx_coalesce_usecs): + /* import TX coal time */ + if (priv->params_ethtool.tx_coalesce_usecs < 1) + priv->params_ethtool.tx_coalesce_usecs = 0; + else if (priv->params_ethtool.tx_coalesce_usecs > + MLX5E_FLD_MAX(cqc, cq_period)) { + priv->params_ethtool.tx_coalesce_usecs = + MLX5E_FLD_MAX(cqc, cq_period); + } + priv->params.tx_cq_moderation_usec = + priv->params_ethtool.tx_coalesce_usecs; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_refresh_channel_params(priv); + break; + + case MLX5_PARAM_OFFSET(tx_coalesce_pkts): + /* import TX coal pkts */ + if (priv->params_ethtool.tx_coalesce_pkts < 1) + priv->params_ethtool.tx_coalesce_pkts = 0; + else if (priv->params_ethtool.tx_coalesce_pkts > + MLX5E_FLD_MAX(cqc, cq_max_count)) { + priv->params_ethtool.tx_coalesce_pkts = + MLX5E_FLD_MAX(cqc, cq_max_count); + } + priv->params.tx_cq_moderation_pkts = + priv->params_ethtool.tx_coalesce_pkts; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_refresh_channel_params(priv); + break; + + case MLX5_PARAM_OFFSET(tx_queue_size): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import TX queue size */ + if (priv->params_ethtool.tx_queue_size < + (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) { + priv->params_ethtool.tx_queue_size = + (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + } else if (priv->params_ethtool.tx_queue_size > + priv->params_ethtool.tx_queue_size_max) { + priv->params_ethtool.tx_queue_size = + priv->params_ethtool.tx_queue_size_max; + } + /* store actual TX queue size */ + priv->params.log_sq_size = + 
order_base_2(priv->params_ethtool.tx_queue_size); + priv->params_ethtool.tx_queue_size = + 1 << priv->params.log_sq_size; + + /* verify TX completion factor */ + mlx5e_ethtool_sync_tx_completion_fact(priv); + + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(rx_queue_size): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import RX queue size */ + if (priv->params_ethtool.rx_queue_size < + (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) { + priv->params_ethtool.rx_queue_size = + (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE); + } else if (priv->params_ethtool.rx_queue_size > + priv->params_ethtool.rx_queue_size_max) { + priv->params_ethtool.rx_queue_size = + priv->params_ethtool.rx_queue_size_max; + } + /* store actual RX queue size */ + priv->params.log_rq_size = + order_base_2(priv->params_ethtool.rx_queue_size); + priv->params_ethtool.rx_queue_size = + 1 << priv->params.log_rq_size; + + /* update least number of RX WQEs */ + priv->params.min_rx_wqes = min( + priv->params_ethtool.rx_queue_size - 1, + MLX5E_PARAMS_DEFAULT_MIN_RX_WQES); + + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(channels_rsss): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import number of channels */ + if (priv->params_ethtool.channels_rsss < 1) + priv->params_ethtool.channels_rsss = 1; + else if (priv->params_ethtool.channels_rsss > 128) + priv->params_ethtool.channels_rsss = 128; + + priv->params.channels_rsss = priv->params_ethtool.channels_rsss; + + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(channels): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import number of channels */ + if (priv->params_ethtool.channels < 1) + priv->params_ethtool.channels = 1; + else if (priv->params_ethtool.channels > + (u64) priv->mdev->priv.eq_table.num_comp_vectors) { + priv->params_ethtool.channels = + (u64) priv->mdev->priv.eq_table.num_comp_vectors; + } + priv->params.num_channels = priv->params_ethtool.channels; + + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(rx_coalesce_mode): + /* network interface must be down */ + if (was_opened != 0 && mode_modify == 0) + mlx5e_close_locked(priv->ifp); + + /* import RX coalesce mode */ + if (priv->params_ethtool.rx_coalesce_mode != 0) + priv->params_ethtool.rx_coalesce_mode = 1; + priv->params.rx_cq_moderation_mode = + priv->params_ethtool.rx_coalesce_mode; + + /* restart network interface, if any */ + if (was_opened != 0) { + if (mode_modify == 0) + mlx5e_open_locked(priv->ifp); + else + error = mlx5e_refresh_channel_params(priv); + } + break; + + case MLX5_PARAM_OFFSET(tx_coalesce_mode): + /* network interface must be down */ + if (was_opened != 0 && mode_modify == 0) + mlx5e_close_locked(priv->ifp); + + /* import TX coalesce mode */ + if (priv->params_ethtool.tx_coalesce_mode != 0) + priv->params_ethtool.tx_coalesce_mode = 1; + priv->params.tx_cq_moderation_mode = + priv->params_ethtool.tx_coalesce_mode; + + /* restart network interface, if any */ + if (was_opened != 0) { + if (mode_modify == 0) + mlx5e_open_locked(priv->ifp); + else + error = mlx5e_refresh_channel_params(priv); + } + break; + + case MLX5_PARAM_OFFSET(hw_lro): + /* 
network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import HW LRO mode */ + if (priv->params_ethtool.hw_lro != 0) { + if ((priv->ifp->if_capenable & IFCAP_LRO) && + MLX5_CAP_ETH(priv->mdev, lro_cap)) { + priv->params.hw_lro_en = 1; + priv->params_ethtool.hw_lro = 1; + } else { + priv->params.hw_lro_en = 0; + priv->params_ethtool.hw_lro = 0; + error = EINVAL; + + if_printf(priv->ifp, "Can't enable HW LRO: " + "The HW or SW LRO feature is disabled\n"); + } + } else { + priv->params.hw_lro_en = 0; + } + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(cqe_zipping): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* import CQE zipping mode */ + if (priv->params_ethtool.cqe_zipping && + MLX5_CAP_GEN(priv->mdev, cqe_compression)) { + priv->params.cqe_zipping_en = true; + priv->params_ethtool.cqe_zipping = 1; + } else { + priv->params.cqe_zipping_en = false; + priv->params_ethtool.cqe_zipping = 0; + } + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(tx_bufring_disable): + /* rangecheck input value */ + priv->params_ethtool.tx_bufring_disable = + priv->params_ethtool.tx_bufring_disable ? 1 : 0; + + /* reconfigure the sendqueues, if any */ + if (was_opened) { + mlx5e_close_locked(priv->ifp); + mlx5e_open_locked(priv->ifp); + } + break; + + case MLX5_PARAM_OFFSET(tx_completion_fact): + /* network interface must be down */ + if (was_opened) + mlx5e_close_locked(priv->ifp); + + /* verify parameter */ + mlx5e_ethtool_sync_tx_completion_fact(priv); + + /* restart network interface, if any */ + if (was_opened) + mlx5e_open_locked(priv->ifp); + break; + + case MLX5_PARAM_OFFSET(modify_tx_dma): + /* check if network interface is opened */ + if (was_opened) { + priv->params_ethtool.modify_tx_dma = + priv->params_ethtool.modify_tx_dma ? 1 : 0; + /* modify tx according to value */ + mlx5e_modify_tx_dma(priv, value != 0); + } else { + /* if closed force enable tx */ + priv->params_ethtool.modify_tx_dma = 0; + } + break; + + case MLX5_PARAM_OFFSET(modify_rx_dma): + /* check if network interface is opened */ + if (was_opened) { + priv->params_ethtool.modify_rx_dma = + priv->params_ethtool.modify_rx_dma ? 1 : 0; + /* modify rx according to value */ + mlx5e_modify_rx_dma(priv, value != 0); + } else { + /* if closed force enable rx */ + priv->params_ethtool.modify_rx_dma = 0; + } + break; + + case MLX5_PARAM_OFFSET(diag_pci_enable): + priv->params_ethtool.diag_pci_enable = + priv->params_ethtool.diag_pci_enable ? 1 : 0; + + error = -mlx5_core_set_diagnostics_full(priv->mdev, + priv->params_ethtool.diag_pci_enable, + priv->params_ethtool.diag_general_enable); + break; + + case MLX5_PARAM_OFFSET(diag_general_enable): + priv->params_ethtool.diag_general_enable = + priv->params_ethtool.diag_general_enable ? 1 : 0; + + error = -mlx5_core_set_diagnostics_full(priv->mdev, + priv->params_ethtool.diag_pci_enable, + priv->params_ethtool.diag_general_enable); + break; + + case MLX5_PARAM_OFFSET(mc_local_lb): + priv->params_ethtool.mc_local_lb = + priv->params_ethtool.mc_local_lb ? 
1 : 0; + + if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { + error = mlx5_nic_vport_modify_local_lb(priv->mdev, + MLX5_LOCAL_MC_LB, priv->params_ethtool.mc_local_lb); + } else { + error = EOPNOTSUPP; + } + break; + + case MLX5_PARAM_OFFSET(uc_local_lb): + priv->params_ethtool.uc_local_lb = + priv->params_ethtool.uc_local_lb ? 1 : 0; + + if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { + error = mlx5_nic_vport_modify_local_lb(priv->mdev, + MLX5_LOCAL_UC_LB, priv->params_ethtool.uc_local_lb); + } else { + error = EOPNOTSUPP; + } + break; + + default: + break; + } +done: + PRIV_UNLOCK(priv); + return (error); +} + +/* + * Read the first three bytes of the eeprom in order to get the needed info + * for the whole reading. + * Byte 0 - Identifier byte + * Byte 1 - Revision byte + * Byte 2 - Status byte + */ +static int +mlx5e_get_eeprom_info(struct mlx5e_priv *priv, struct mlx5e_eeprom *eeprom) +{ + struct mlx5_core_dev *dev = priv->mdev; + u32 data = 0; + int size_read = 0; + int ret; + + ret = mlx5_query_module_num(dev, &eeprom->module_num); + if (ret) { + if_printf(priv->ifp, "%s:%d: Failed query module error=%d\n", + __func__, __LINE__, ret); + return (ret); + } + + /* Read the first three bytes to get Identifier, Revision and Status */ + ret = mlx5_query_eeprom(dev, eeprom->i2c_addr, eeprom->page_num, + eeprom->device_addr, MLX5E_EEPROM_INFO_BYTES, eeprom->module_num, &data, + &size_read); + if (ret) { + if_printf(priv->ifp, "%s:%d: Failed query eeprom module error=0x%x\n", + __func__, __LINE__, ret); + return (ret); + } + + switch (data & MLX5_EEPROM_IDENTIFIER_BYTE_MASK) { + case SFF_8024_ID_QSFP: + eeprom->type = MLX5E_ETH_MODULE_SFF_8436; + eeprom->len = MLX5E_ETH_MODULE_SFF_8436_LEN; + break; + case SFF_8024_ID_QSFPPLUS: + case SFF_8024_ID_QSFP28: + if ((data & MLX5_EEPROM_IDENTIFIER_BYTE_MASK) == SFF_8024_ID_QSFP28 || + ((data & MLX5_EEPROM_REVISION_ID_BYTE_MASK) >> 8) >= 0x3) { + eeprom->type = MLX5E_ETH_MODULE_SFF_8636; + eeprom->len = MLX5E_ETH_MODULE_SFF_8636_LEN; + } else { + eeprom->type = MLX5E_ETH_MODULE_SFF_8436; + eeprom->len = MLX5E_ETH_MODULE_SFF_8436_LEN; + } + if ((data & MLX5_EEPROM_PAGE_3_VALID_BIT_MASK) == 0) + eeprom->page_valid = 1; + break; + case SFF_8024_ID_SFP: + eeprom->type = MLX5E_ETH_MODULE_SFF_8472; + eeprom->len = MLX5E_ETH_MODULE_SFF_8472_LEN; + break; + default: + if_printf(priv->ifp, "%s:%d: Not recognized cable type = 0x%x(%s)\n", + __func__, __LINE__, data & MLX5_EEPROM_IDENTIFIER_BYTE_MASK, + sff_8024_id[data & MLX5_EEPROM_IDENTIFIER_BYTE_MASK]); + return (EINVAL); + } + return (0); +} + +/* Read both low and high pages of the eeprom */ +static int +mlx5e_get_eeprom(struct mlx5e_priv *priv, struct mlx5e_eeprom *ee) +{ + struct mlx5_core_dev *dev = priv->mdev; + int size_read = 0; + int ret; + + if (ee->len == 0) + return (EINVAL); + + /* Read low page of the eeprom */ + while (ee->device_addr < ee->len) { + ret = mlx5_query_eeprom(dev, ee->i2c_addr, ee->page_num, ee->device_addr, + ee->len - ee->device_addr, ee->module_num, + ee->data + (ee->device_addr / 4), &size_read); + if (ret) { + if_printf(priv->ifp, "%s:%d: Failed reading eeprom, " + "error = 0x%02x\n", __func__, __LINE__, ret); + return (ret); + } + ee->device_addr += size_read; + } + + /* Read high page of the eeprom */ + if (ee->page_valid) { + ee->device_addr = MLX5E_EEPROM_HIGH_PAGE_OFFSET; + ee->page_num = MLX5E_EEPROM_HIGH_PAGE; + size_read = 0; + while (ee->device_addr < MLX5E_EEPROM_PAGE_LENGTH) { + ret = mlx5_query_eeprom(dev, ee->i2c_addr, ee->page_num, + ee->device_addr, 
MLX5E_EEPROM_PAGE_LENGTH - ee->device_addr, + ee->module_num, ee->data + (ee->len / 4) + + ((ee->device_addr - MLX5E_EEPROM_HIGH_PAGE_OFFSET) / 4), + &size_read); + if (ret) { + if_printf(priv->ifp, "%s:%d: Failed reading eeprom, " + "error = 0x%02x\n", __func__, __LINE__, ret); + return (ret); + } + ee->device_addr += size_read; + } + } + return (0); +} + +static void +mlx5e_print_eeprom(struct mlx5e_eeprom *eeprom) +{ + int row; + int index_in_row; + int byte_to_write = 0; + int line_length = 16; + + printf("\nOffset\t\tValues\n"); + printf("------\t\t------"); + while (byte_to_write < eeprom->len) { + printf("\n0x%04X\t\t", byte_to_write); + for (index_in_row = 0; index_in_row < line_length; index_in_row++) { + printf("%02X ", ((u8 *)eeprom->data)[byte_to_write]); + byte_to_write++; + } + } + + if (eeprom->page_valid) { + row = MLX5E_EEPROM_HIGH_PAGE_OFFSET; + printf("\n\nUpper Page 0x03\n"); + printf("\nOffset\t\tValues\n"); + printf("------\t\t------"); + while (row < MLX5E_EEPROM_PAGE_LENGTH) { + printf("\n0x%04X\t\t", row); + for (index_in_row = 0; index_in_row < line_length; index_in_row++) { + printf("%02X ", ((u8 *)eeprom->data)[byte_to_write]); + byte_to_write++; + row++; + } + } + } +} + +/* + * Read cable EEPROM module information by first inspecting the first + * three bytes to get the initial information for a whole reading. + * Information will be printed to dmesg. + */ +static int +mlx5e_read_eeprom(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + struct mlx5e_eeprom eeprom; + int error; + int result = 0; + + PRIV_LOCK(priv); + error = sysctl_handle_int(oidp, &result, 0, req); + if (error || !req->newptr) + goto done; + + /* Check if device is gone */ + if (priv->gone) { + error = ENXIO; + goto done; + } + + if (result == 1) { + eeprom.i2c_addr = MLX5E_I2C_ADDR_LOW; + eeprom.device_addr = 0; + eeprom.page_num = MLX5E_EEPROM_LOW_PAGE; + eeprom.page_valid = 0; + + /* Read three first bytes to get important info */ + error = mlx5e_get_eeprom_info(priv, &eeprom); + if (error) { + if_printf(priv->ifp, "%s:%d: Failed reading eeprom's " + "initial information\n", __func__, __LINE__); + error = 0; + goto done; + } + /* + * Allocate needed length buffer and additional space for + * page 0x03 + */ + eeprom.data = malloc(eeprom.len + MLX5E_EEPROM_PAGE_LENGTH, + M_MLX5EN, M_WAITOK | M_ZERO); + + /* Read the whole eeprom information */ + error = mlx5e_get_eeprom(priv, &eeprom); + if (error) { + if_printf(priv->ifp, "%s:%d: Failed reading eeprom\n", + __func__, __LINE__); + error = 0; + /* + * Continue printing partial information in case of + * an error + */ + } + mlx5e_print_eeprom(&eeprom); + free(eeprom.data, M_MLX5EN); + } +done: + PRIV_UNLOCK(priv); + return (error); +} + +static const char *mlx5e_params_desc[] = { + MLX5E_PARAMS(MLX5E_STATS_DESC) +}; + +static const char *mlx5e_port_stats_debug_desc[] = { + MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_DESC) +}; + +static int +mlx5e_ethtool_debug_channel_info(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv; + struct sbuf sb; + struct mlx5e_channel *c; + struct mlx5e_sq *sq; + struct mlx5e_rq *rq; + int error, i, tc; + + priv = arg1; + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + if (sbuf_new_for_sysctl(&sb, NULL, 128, req) == NULL) + return (ENOMEM); + sbuf_clear_flags(&sb, SBUF_INCLUDENUL); + + PRIV_LOCK(priv); + if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) + goto out; + for (i = 0; i < priv->params.num_channels; i++) { + c = priv->channel[i]; + rq = &c->rq; + sbuf_printf(&sb, "channel 
%d rq %d cq %d\n", + c->ix, rq->rqn, rq->cq.mcq.cqn); + for (tc = 0; tc < c->num_tc; tc++) { + sq = &c->sq[tc]; + sbuf_printf(&sb, "channel %d tc %d sq %d cq %d\n", + c->ix, tc, sq->sqn, sq->cq.mcq.cqn); + } + } +out: + PRIV_UNLOCK(priv); + error = sbuf_finish(&sb); + sbuf_delete(&sb); + return (error); +} + +static int +mlx5e_ethtool_debug_stats(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + int error, sys_debug; + + sys_debug = priv->sysctl_debug; + error = sysctl_handle_int(oidp, &priv->sysctl_debug, 0, req); + if (error != 0 || !req->newptr) + return (error); + priv->sysctl_debug = priv->sysctl_debug != 0; + if (sys_debug == priv->sysctl_debug) + return (0); + + PRIV_LOCK(priv); + if (priv->sysctl_debug) { + mlx5e_create_stats(&priv->stats.port_stats_debug.ctx, + SYSCTL_CHILDREN(priv->sysctl_ifnet), "debug_stats", + mlx5e_port_stats_debug_desc, MLX5E_PORT_STATS_DEBUG_NUM, + priv->stats.port_stats_debug.arg); + SYSCTL_ADD_PROC(&priv->sysctl_ctx_channel_debug, + SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, + "hw_ctx_debug", + CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_STRING, priv, 0, + mlx5e_ethtool_debug_channel_info, "S", ""); + } else { + sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); + sysctl_ctx_free(&priv->sysctl_ctx_channel_debug); + } + PRIV_UNLOCK(priv); + return (0); +} + +static void +mlx5e_create_diagnostics(struct mlx5e_priv *priv) +{ + struct mlx5_core_diagnostics_entry entry; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *node; + int x; + + /* sysctl context we are using */ + ctx = &priv->sysctl_ctx; + + /* create root node */ + node = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, + "diagnostics", CTLFLAG_RD, NULL, "Diagnostics"); + if (node == NULL) + return; + + /* create PCI diagnostics */ + for (x = 0; x != MLX5_CORE_PCI_DIAGNOSTICS_NUM; x++) { + entry = mlx5_core_pci_diagnostics_table[x]; + if (mlx5_core_supports_diagnostics(priv->mdev, entry.counter_id) == 0) + continue; + SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, + entry.desc, CTLFLAG_RD, priv->params_pci.array + x, + "PCI diagnostics counter"); + } + + /* create general diagnostics */ + for (x = 0; x != MLX5_CORE_GENERAL_DIAGNOSTICS_NUM; x++) { + entry = mlx5_core_general_diagnostics_table[x]; + if (mlx5_core_supports_diagnostics(priv->mdev, entry.counter_id) == 0) + continue; + SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, + entry.desc, CTLFLAG_RD, priv->params_general.array + x, + "General diagnostics counter"); + } +} + +void +mlx5e_create_ethtool(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct sysctl_oid *node, *qos_node; + const char *pnameunit; + unsigned x; + int i; + + /* set some defaults */ + priv->params_ethtool.tx_queue_size_max = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE; + priv->params_ethtool.rx_queue_size_max = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE; + priv->params_ethtool.tx_queue_size = 1 << priv->params.log_sq_size; + priv->params_ethtool.rx_queue_size = 1 << priv->params.log_rq_size; + priv->params_ethtool.channels = priv->params.num_channels; + priv->params_ethtool.channels_rsss = priv->params.channels_rsss; + priv->params_ethtool.coalesce_pkts_max = MLX5E_FLD_MAX(cqc, cq_max_count); + priv->params_ethtool.coalesce_usecs_max = MLX5E_FLD_MAX(cqc, cq_period); + priv->params_ethtool.rx_coalesce_mode = priv->params.rx_cq_moderation_mode; + priv->params_ethtool.rx_coalesce_usecs = priv->params.rx_cq_moderation_usec; + priv->params_ethtool.rx_coalesce_pkts = priv->params.rx_cq_moderation_pkts; + 
priv->params_ethtool.tx_coalesce_mode = priv->params.tx_cq_moderation_mode; + priv->params_ethtool.tx_coalesce_usecs = priv->params.tx_cq_moderation_usec; + priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts; + priv->params_ethtool.hw_lro = priv->params.hw_lro_en; + priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en; + mlx5e_ethtool_sync_tx_completion_fact(priv); + + /* get default values for local loopback, if any */ + if (MLX5_CAP_GEN(priv->mdev, disable_local_lb)) { + int err; + u8 val; + + err = mlx5_nic_vport_query_local_lb(priv->mdev, MLX5_LOCAL_MC_LB, &val); + if (err == 0) + priv->params_ethtool.mc_local_lb = val; + + err = mlx5_nic_vport_query_local_lb(priv->mdev, MLX5_LOCAL_UC_LB, &val); + if (err == 0) + priv->params_ethtool.uc_local_lb = val; + } + + /* create root node */ + node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, + SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, + "conf", CTLFLAG_RW, NULL, "Configuration"); + if (node == NULL) + return; + for (x = 0; x != MLX5E_PARAMS_NUM; x++) { + /* check for read-only parameter */ + if (strstr(mlx5e_params_desc[2 * x], "_max") != NULL || + strstr(mlx5e_params_desc[2 * x], "_mtu") != NULL) { + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, + mlx5e_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RD | + CTLFLAG_MPSAFE, priv, x, &mlx5e_ethtool_handler, "QU", + mlx5e_params_desc[2 * x + 1]); + } else { +#if (__FreeBSD_version < 1100000) + char path[64]; +#endif + /* + * NOTE: In FreeBSD-11 and newer the + * CTLFLAG_RWTUN flag will take care of + * loading default sysctl value from the + * kernel environment, if any: + */ + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, + mlx5e_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RWTUN | + CTLFLAG_MPSAFE, priv, x, &mlx5e_ethtool_handler, "QU", + mlx5e_params_desc[2 * x + 1]); + +#if (__FreeBSD_version < 1100000) + /* compute path for sysctl */ + snprintf(path, sizeof(path), "dev.mce.%d.conf.%s", + device_get_unit(priv->mdev->pdev->dev.bsddev), + mlx5e_params_desc[2 * x]); + + /* try to fetch tunable, if any */ + if (TUNABLE_QUAD_FETCH(path, &priv->params_ethtool.arg[x])) + mlx5e_ethtool_handler(NULL, priv, x, NULL); +#endif + } + } + + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, + "debug_stats", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, + 0, &mlx5e_ethtool_debug_stats, "I", "Extended debug statistics"); + + pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev); + + SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), + OID_AUTO, "device_name", CTLFLAG_RD, + __DECONST(void *, pnameunit), 0, + "PCI device name"); + + /* EEPROM support */ + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "eeprom_info", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + mlx5e_read_eeprom, "I", "EEPROM information"); + + /* Diagnostics support */ + mlx5e_create_diagnostics(priv); + + /* create qos node */ + qos_node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, + SYSCTL_CHILDREN(node), OID_AUTO, + "qos", CTLFLAG_RW, NULL, "Quality Of Service configuration"); + if (qos_node == NULL) + return; + + /* Priority rate limit support */ + if (mlx5e_getmaxrate(priv)) + return; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + char name[32]; + snprintf(name, sizeof(name), "tc_%d_max_rate", i); + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), + OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, + priv, i, mlx5e_tc_maxrate_handler, "QU", + "Max rate for priority, specified in kilobits, 
where kilo=1000, \ + max_rate must be divisible by 100000"); + } + + if (mlx5e_get_prio_tc(priv)) + return; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + char name[32]; + snprintf(name, sizeof(name), "prio_%d_to_tc", i); + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), + OID_AUTO, name, CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE, + priv, i, mlx5e_prio_to_tc_handler, "CU", + "Set priority to traffic class"); + } + + /* DSCP support */ + if (mlx5e_get_dscp(priv) == 0) { + for (i = 0; i != MLX5_MAX_SUPPORTED_DSCP; i += 8) { + char name[32]; + snprintf(name, sizeof(name), "dscp_%d_%d_prio", i, i + 7); + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), + OID_AUTO, name, CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + priv, i, mlx5e_dscp_prio_handler, "CU", + "Set DSCP to priority mapping, 0..7"); + } +#define A "Set trust state, 1:PCP 2:DSCP" +#define B " 3:BOTH" + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), + OID_AUTO, "trust_state", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + priv, 0, mlx5e_trust_state_handler, "CU", + MLX5_CAP_QCAM_FEATURE(mdev, qpts_trust_both) ? + A B : A); +#undef B +#undef A + } +} diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c new file mode 100644 index 000000000000..2d1e456518e8 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c @@ -0,0 +1,1487 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "en.h" + +#include <linux/list.h> +#include <dev/mlx5/fs.h> + +#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) + +enum { + MLX5E_FULLMATCH = 0, + MLX5E_ALLMULTI = 1, + MLX5E_PROMISC = 2, +}; + +enum { + MLX5E_UC = 0, + MLX5E_MC_IPV4 = 1, + MLX5E_MC_IPV6 = 2, + MLX5E_MC_OTHER = 3, +}; + +enum { + MLX5E_ACTION_NONE = 0, + MLX5E_ACTION_ADD = 1, + MLX5E_ACTION_DEL = 2, +}; + +struct mlx5e_eth_addr_hash_node { + LIST_ENTRY(mlx5e_eth_addr_hash_node) hlist; + u8 action; + struct mlx5e_eth_addr_info ai; +}; + +static inline int +mlx5e_hash_eth_addr(const u8 * addr) +{ + return (addr[5]); +} + +static void +mlx5e_add_eth_addr_to_hash(struct mlx5e_eth_addr_hash_head *hash, + const u8 * addr) +{ + struct mlx5e_eth_addr_hash_node *hn; + int ix = mlx5e_hash_eth_addr(addr); + + LIST_FOREACH(hn, &hash[ix], hlist) { + if (bcmp(hn->ai.addr, addr, ETHER_ADDR_LEN) == 0) { + if (hn->action == MLX5E_ACTION_DEL) + hn->action = MLX5E_ACTION_NONE; + return; + } + } + + hn = malloc(sizeof(*hn), M_MLX5EN, M_NOWAIT | M_ZERO); + if (hn == NULL) + return; + + ether_addr_copy(hn->ai.addr, addr); + hn->action = MLX5E_ACTION_ADD; + + LIST_INSERT_HEAD(&hash[ix], hn, hlist); +} + +static void +mlx5e_del_eth_addr_from_hash(struct mlx5e_eth_addr_hash_node *hn) +{ + LIST_REMOVE(hn, hlist); + free(hn, M_MLX5EN); +} + +static void +mlx5e_del_eth_addr_from_flow_table(struct mlx5e_priv *priv, + struct mlx5e_eth_addr_info *ai) +{ + if (ai->tt_vec & (1 << MLX5E_TT_IPV6_IPSEC_ESP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV4_IPSEC_ESP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV6_IPSEC_AH)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV4_IPSEC_AH)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV6_TCP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_TCP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV4_TCP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_TCP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV6_UDP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_UDP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV4_UDP)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_UDP]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV6)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6]); + + if (ai->tt_vec & (1 << MLX5E_TT_IPV4)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4]); + + if (ai->tt_vec & (1 << MLX5E_TT_ANY)) + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_ANY]); +} + +static int +mlx5e_get_eth_addr_type(const u8 * addr) +{ + if (ETHER_IS_MULTICAST(addr) == 0) + return (MLX5E_UC); + + if ((addr[0] == 0x01) && + (addr[1] == 0x00) && + (addr[2] == 0x5e) && + !(addr[3] & 0x80)) + return (MLX5E_MC_IPV4); + + if ((addr[0] == 0x33) && + (addr[1] == 0x33)) + return (MLX5E_MC_IPV6); + + return (MLX5E_MC_OTHER); +} + +static u32 +mlx5e_get_tt_vec(struct mlx5e_eth_addr_info *ai, int type) +{ + int eth_addr_type; + u32 ret; + + switch (type) { + case MLX5E_FULLMATCH: + eth_addr_type = mlx5e_get_eth_addr_type(ai->addr); + switch (eth_addr_type) { + case MLX5E_UC: + ret = + (1 << MLX5E_TT_IPV4_TCP) | + (1 << MLX5E_TT_IPV6_TCP) | + (1 << MLX5E_TT_IPV4_UDP) | + (1 << MLX5E_TT_IPV6_UDP) | + (1 << MLX5E_TT_IPV4) | + (1 << MLX5E_TT_IPV6) | + (1 << MLX5E_TT_ANY) | + 0; + break; + + case MLX5E_MC_IPV4: + ret = + (1 << MLX5E_TT_IPV4_UDP) | + (1 << MLX5E_TT_IPV4) | + 0; + break; + + case MLX5E_MC_IPV6: + ret = + (1 << 
MLX5E_TT_IPV6_UDP) | + (1 << MLX5E_TT_IPV6) | + 0; + break; + + default: + ret = + (1 << MLX5E_TT_ANY) | + 0; + break; + } + break; + + case MLX5E_ALLMULTI: + ret = + (1 << MLX5E_TT_IPV4_UDP) | + (1 << MLX5E_TT_IPV6_UDP) | + (1 << MLX5E_TT_IPV4) | + (1 << MLX5E_TT_IPV6) | + (1 << MLX5E_TT_ANY) | + 0; + break; + + default: /* MLX5E_PROMISC */ + ret = + (1 << MLX5E_TT_IPV4_TCP) | + (1 << MLX5E_TT_IPV6_TCP) | + (1 << MLX5E_TT_IPV4_UDP) | + (1 << MLX5E_TT_IPV6_UDP) | + (1 << MLX5E_TT_IPV4) | + (1 << MLX5E_TT_IPV6) | + (1 << MLX5E_TT_ANY) | + 0; + break; + } + + return (ret); +} + +static int +mlx5e_add_eth_addr_rule_sub(struct mlx5e_priv *priv, + struct mlx5e_eth_addr_info *ai, int type, + u32 *mc, u32 *mv) +{ + struct mlx5_flow_destination dest; + u8 mc_enable = 0; + struct mlx5_flow_rule **rule_p; + struct mlx5_flow_table *ft = priv->fts.main.t; + u8 *mc_dmac = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers.dmac_47_16); + u8 *mv_dmac = MLX5_ADDR_OF(fte_match_param, mv, + outer_headers.dmac_47_16); + u32 *tirn = priv->tirn; + u32 tt_vec; + int err = 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + + switch (type) { + case MLX5E_FULLMATCH: + mc_enable = MLX5_MATCH_OUTER_HEADERS; + memset(mc_dmac, 0xff, ETH_ALEN); + ether_addr_copy(mv_dmac, ai->addr); + break; + + case MLX5E_ALLMULTI: + mc_enable = MLX5_MATCH_OUTER_HEADERS; + mc_dmac[0] = 0x01; + mv_dmac[0] = 0x01; + break; + + case MLX5E_PROMISC: + break; + default: + break; + } + + tt_vec = mlx5e_get_tt_vec(ai, type); + + if (tt_vec & BIT(MLX5E_TT_ANY)) { + rule_p = &ai->ft_rule[MLX5E_TT_ANY]; + dest.tir_num = tirn[MLX5E_TT_ANY]; + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_ANY); + } + + mc_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + + if (tt_vec & BIT(MLX5E_TT_IPV4)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV4]; + dest.tir_num = tirn[MLX5E_TT_IPV4]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IP); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV4); + } + + if (tt_vec & BIT(MLX5E_TT_IPV6)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV6]; + dest.tir_num = tirn[MLX5E_TT_IPV6]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IPV6); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV6); + } + + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_UDP); + + if (tt_vec & BIT(MLX5E_TT_IPV4_UDP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_UDP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_UDP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IP); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV4_UDP); + } + + if (tt_vec & BIT(MLX5E_TT_IPV6_UDP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_UDP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_UDP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IPV6); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, 
mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV6_UDP); + } + + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_TCP); + + if (tt_vec & BIT(MLX5E_TT_IPV4_TCP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_TCP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_TCP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IP); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV4_TCP); + } + + if (tt_vec & BIT(MLX5E_TT_IPV6_TCP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_TCP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_TCP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IPV6); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + + ai->tt_vec |= BIT(MLX5E_TT_IPV6_TCP); + } + + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_AH); + + if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_AH)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]; + dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_AH]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IP); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_AH); + } + + if (tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_AH)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]; + dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_AH]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IPV6); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_AH); + } + + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_ESP); + + if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_ESP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_ESP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IP); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_ESP); + } + + if (tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_ESP)) { + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_ESP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, + ETHERTYPE_IPV6); + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) + goto err_del_ai; + ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_ESP); + } + + return 0; + +err_del_ai: + err = PTR_ERR(*rule_p); + *rule_p = NULL; + mlx5e_del_eth_addr_from_flow_table(priv, ai); + + return err; +} + +static int +mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv, + struct mlx5e_eth_addr_info *ai, int type) +{ + u32 *match_criteria; + u32 *match_value; + int err = 0; + + match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + if (!match_value || !match_criteria) { + if_printf(priv->ifp, "%s: alloc failed\n", __func__); + 
err = -ENOMEM; + goto add_eth_addr_rule_out; + } + err = mlx5e_add_eth_addr_rule_sub(priv, ai, type, match_criteria, + match_value); + +add_eth_addr_rule_out: + kvfree(match_criteria); + kvfree(match_value); + + return (err); +} + +static int mlx5e_vport_context_update_vlans(struct mlx5e_priv *priv) +{ + struct ifnet *ifp = priv->ifp; + int max_list_size; + int list_size; + u16 *vlans; + int vlan; + int err; + int i; + + list_size = 0; + for_each_set_bit(vlan, priv->vlan.active_vlans, VLAN_N_VID) + list_size++; + + max_list_size = 1 << MLX5_CAP_GEN(priv->mdev, log_max_vlan_list); + + if (list_size > max_list_size) { + if_printf(ifp, + "ifnet vlans list size (%d) > (%d) max vport list size, some vlans will be dropped\n", + list_size, max_list_size); + list_size = max_list_size; + } + + vlans = kcalloc(list_size, sizeof(*vlans), GFP_KERNEL); + if (!vlans) + return -ENOMEM; + + i = 0; + for_each_set_bit(vlan, priv->vlan.active_vlans, VLAN_N_VID) { + if (i >= list_size) + break; + vlans[i++] = vlan; + } + + err = mlx5_modify_nic_vport_vlans(priv->mdev, vlans, list_size); + if (err) + if_printf(ifp, "Failed to modify vport vlans list err(%d)\n", + err); + + kfree(vlans); + return err; +} + +enum mlx5e_vlan_rule_type { + MLX5E_VLAN_RULE_TYPE_UNTAGGED, + MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, + MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, + MLX5E_VLAN_RULE_TYPE_MATCH_VID, +}; + +static int +mlx5e_add_vlan_rule_sub(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid, + u32 *mc, u32 *mv) +{ + struct mlx5_flow_table *ft = priv->fts.vlan.t; + struct mlx5_flow_destination dest; + u8 mc_enable = 0; + struct mlx5_flow_rule **rule_p; + int err = 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fts.main.t; + + mc_enable = MLX5_MATCH_OUTER_HEADERS; + + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + rule_p = &priv->vlan.untagged_ft_rule; + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + rule_p = &priv->vlan.any_cvlan_ft_rule; + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, mv, outer_headers.cvlan_tag, 1); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + rule_p = &priv->vlan.any_svlan_ft_rule; + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET(fte_match_param, mv, outer_headers.svlan_tag, 1); + break; + default: /* MLX5E_VLAN_RULE_TYPE_MATCH_VID */ + rule_p = &priv->vlan.active_vlans_ft_rule[vid]; + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, mv, outer_headers.cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET(fte_match_param, mv, outer_headers.first_vid, vid); + mlx5e_vport_context_update_vlans(priv); + break; + } + + *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_ETH_FLOW_TAG, + &dest); + + if (IS_ERR(*rule_p)) { + err = PTR_ERR(*rule_p); + *rule_p = NULL; + if_printf(priv->ifp, "%s: add rule failed\n", __func__); + } + + return (err); +} + +static int +mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + u32 *match_criteria; + u32 *match_value; + int err = 0; + + match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + if (!match_value || !match_criteria) { + if_printf(priv->ifp, "%s: alloc failed\n", __func__); + err = -ENOMEM; + goto add_vlan_rule_out; + } + 
+ err = mlx5e_add_vlan_rule_sub(priv, rule_type, vid, match_criteria, + match_value); + +add_vlan_rule_out: + kvfree(match_criteria); + kvfree(match_value); + + return (err); +} + +static void +mlx5e_del_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + if (priv->vlan.untagged_ft_rule) { + mlx5_del_flow_rule(priv->vlan.untagged_ft_rule); + priv->vlan.untagged_ft_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + if (priv->vlan.any_cvlan_ft_rule) { + mlx5_del_flow_rule(priv->vlan.any_cvlan_ft_rule); + priv->vlan.any_cvlan_ft_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + if (priv->vlan.any_svlan_ft_rule) { + mlx5_del_flow_rule(priv->vlan.any_svlan_ft_rule); + priv->vlan.any_svlan_ft_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_VID: + if (priv->vlan.active_vlans_ft_rule[vid]) { + mlx5_del_flow_rule(priv->vlan.active_vlans_ft_rule[vid]); + priv->vlan.active_vlans_ft_rule[vid] = NULL; + } + mlx5e_vport_context_update_vlans(priv); + break; + default: + break; + } +} + +static void +mlx5e_del_any_vid_rules(struct mlx5e_priv *priv) +{ + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); +} + +static int +mlx5e_add_any_vid_rules(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + if (err) + return (err); + + return (mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0)); +} + +void +mlx5e_enable_vlan_filter(struct mlx5e_priv *priv) +{ + if (priv->vlan.filter_disabled) { + priv->vlan.filter_disabled = false; + if (priv->ifp->if_flags & IFF_PROMISC) + return; + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_del_any_vid_rules(priv); + } +} + +void +mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) +{ + if (!priv->vlan.filter_disabled) { + priv->vlan.filter_disabled = true; + if (priv->ifp->if_flags & IFF_PROMISC) + return; + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_add_any_vid_rules(priv); + } +} + +void +mlx5e_vlan_rx_add_vid(void *arg, struct ifnet *ifp, u16 vid) +{ + struct mlx5e_priv *priv = arg; + + if (ifp != priv->ifp) + return; + + PRIV_LOCK(priv); + if (!test_and_set_bit(vid, priv->vlan.active_vlans) && + test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid); + PRIV_UNLOCK(priv); +} + +void +mlx5e_vlan_rx_kill_vid(void *arg, struct ifnet *ifp, u16 vid) +{ + struct mlx5e_priv *priv = arg; + + if (ifp != priv->ifp) + return; + + PRIV_LOCK(priv); + clear_bit(vid, priv->vlan.active_vlans); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid); + PRIV_UNLOCK(priv); +} + +int +mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv) +{ + int err; + int i; + + set_bit(0, priv->vlan.active_vlans); + for_each_set_bit(i, priv->vlan.active_vlans, VLAN_N_VID) { + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, + i); + if (err) + return (err); + } + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + if (err) + return (err); + + if (priv->vlan.filter_disabled) { + err = mlx5e_add_any_vid_rules(priv); + if (err) + return (err); + } + return (0); +} + +void +mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv) +{ + int i; + + if (priv->vlan.filter_disabled) + mlx5e_del_any_vid_rules(priv); + + mlx5e_del_vlan_rule(priv, 
MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + + for_each_set_bit(i, priv->vlan.active_vlans, VLAN_N_VID) + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, i); + clear_bit(0, priv->vlan.active_vlans); +} + +#define mlx5e_for_each_hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5E_ETH_ADDR_HASH_SIZE; i++) \ + LIST_FOREACH_SAFE(hn, &(hash)[i], hlist, tmp) + +static void +mlx5e_execute_action(struct mlx5e_priv *priv, + struct mlx5e_eth_addr_hash_node *hn) +{ + switch (hn->action) { + case MLX5E_ACTION_ADD: + mlx5e_add_eth_addr_rule(priv, &hn->ai, MLX5E_FULLMATCH); + hn->action = MLX5E_ACTION_NONE; + break; + + case MLX5E_ACTION_DEL: + mlx5e_del_eth_addr_from_flow_table(priv, &hn->ai); + mlx5e_del_eth_addr_from_hash(hn); + break; + + default: + break; + } +} + +static void +mlx5e_sync_ifp_addr(struct mlx5e_priv *priv) +{ + struct ifnet *ifp = priv->ifp; + struct ifaddr *ifa; + struct ifmultiaddr *ifma; + + /* XXX adding this entry might not be needed */ + mlx5e_add_eth_addr_to_hash(priv->eth_addr.if_uc, + LLADDR((struct sockaddr_dl *)(ifp->if_addr->ifa_addr))); + + if_addr_rlock(ifp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_LINK) + continue; + mlx5e_add_eth_addr_to_hash(priv->eth_addr.if_uc, + LLADDR((struct sockaddr_dl *)ifa->ifa_addr)); + } + if_addr_runlock(ifp); + + if_maddr_rlock(ifp); + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + mlx5e_add_eth_addr_to_hash(priv->eth_addr.if_mc, + LLADDR((struct sockaddr_dl *)ifma->ifma_addr)); + } + if_maddr_runlock(ifp); +} + +static void mlx5e_fill_addr_array(struct mlx5e_priv *priv, int list_type, + u8 addr_array[][ETH_ALEN], int size) +{ + bool is_uc = (list_type == MLX5_NIC_VPORT_LIST_TYPE_UC); + struct ifnet *ifp = priv->ifp; + struct mlx5e_eth_addr_hash_node *hn; + struct mlx5e_eth_addr_hash_head *addr_list; + struct mlx5e_eth_addr_hash_node *tmp; + int i = 0; + int hi; + + addr_list = is_uc ? priv->eth_addr.if_uc : priv->eth_addr.if_mc; + + if (is_uc) /* Make sure our own address is pushed first */ + ether_addr_copy(addr_array[i++], IF_LLADDR(ifp)); + else if (priv->eth_addr.broadcast_enabled) + ether_addr_copy(addr_array[i++], ifp->if_broadcastaddr); + + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) { + if (ether_addr_equal(IF_LLADDR(ifp), hn->ai.addr)) + continue; + if (i >= size) + break; + ether_addr_copy(addr_array[i++], hn->ai.addr); + } +} + +static void mlx5e_vport_context_update_addr_list(struct mlx5e_priv *priv, + int list_type) +{ + bool is_uc = (list_type == MLX5_NIC_VPORT_LIST_TYPE_UC); + struct mlx5e_eth_addr_hash_node *hn; + u8 (*addr_array)[ETH_ALEN] = NULL; + struct mlx5e_eth_addr_hash_head *addr_list; + struct mlx5e_eth_addr_hash_node *tmp; + int max_size; + int size; + int err; + int hi; + + size = is_uc ? 0 : (priv->eth_addr.broadcast_enabled ? 1 : 0); + max_size = is_uc ? + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_mc_list); + + addr_list = is_uc ? priv->eth_addr.if_uc : priv->eth_addr.if_mc; + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) + size++; + + if (size > max_size) { + if_printf(priv->ifp, + "ifp %s list size (%d) > (%d) max vport list size, some addresses will be dropped\n", + is_uc ? 
"UC" : "MC", size, max_size); + size = max_size; + } + + if (size) { + addr_array = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!addr_array) { + err = -ENOMEM; + goto out; + } + mlx5e_fill_addr_array(priv, list_type, addr_array, size); + } + + err = mlx5_modify_nic_vport_mac_list(priv->mdev, list_type, addr_array, size); +out: + if (err) + if_printf(priv->ifp, + "Failed to modify vport %s list err(%d)\n", + is_uc ? "UC" : "MC", err); + kfree(addr_array); +} + +static void mlx5e_vport_context_update(struct mlx5e_priv *priv) +{ + struct mlx5e_eth_addr_db *ea = &priv->eth_addr; + + mlx5e_vport_context_update_addr_list(priv, MLX5_NIC_VPORT_LIST_TYPE_UC); + mlx5e_vport_context_update_addr_list(priv, MLX5_NIC_VPORT_LIST_TYPE_MC); + mlx5_modify_nic_vport_promisc(priv->mdev, 0, + ea->allmulti_enabled, + ea->promisc_enabled); +} + +static void +mlx5e_apply_ifp_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_eth_addr_hash_node *hn; + struct mlx5e_eth_addr_hash_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_uc, i) + mlx5e_execute_action(priv, hn); + + mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_mc, i) + mlx5e_execute_action(priv, hn); +} + +static void +mlx5e_handle_ifp_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_eth_addr_hash_node *hn; + struct mlx5e_eth_addr_hash_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_uc, i) + hn->action = MLX5E_ACTION_DEL; + mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_mc, i) + hn->action = MLX5E_ACTION_DEL; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_sync_ifp_addr(priv); + + mlx5e_apply_ifp_addr(priv); +} + +void +mlx5e_set_rx_mode_core(struct mlx5e_priv *priv) +{ + struct mlx5e_eth_addr_db *ea = &priv->eth_addr; + struct ifnet *ndev = priv->ifp; + + bool rx_mode_enable = test_bit(MLX5E_STATE_OPENED, &priv->state); + bool promisc_enabled = rx_mode_enable && (ndev->if_flags & IFF_PROMISC); + bool allmulti_enabled = rx_mode_enable && (ndev->if_flags & IFF_ALLMULTI); + bool broadcast_enabled = rx_mode_enable; + + bool enable_promisc = !ea->promisc_enabled && promisc_enabled; + bool disable_promisc = ea->promisc_enabled && !promisc_enabled; + bool enable_allmulti = !ea->allmulti_enabled && allmulti_enabled; + bool disable_allmulti = ea->allmulti_enabled && !allmulti_enabled; + bool enable_broadcast = !ea->broadcast_enabled && broadcast_enabled; + bool disable_broadcast = ea->broadcast_enabled && !broadcast_enabled; + + /* update broadcast address */ + ether_addr_copy(priv->eth_addr.broadcast.addr, + priv->ifp->if_broadcastaddr); + + if (enable_promisc) { + mlx5e_add_eth_addr_rule(priv, &ea->promisc, MLX5E_PROMISC); + if (!priv->vlan.filter_disabled) + mlx5e_add_any_vid_rules(priv); + } + if (enable_allmulti) + mlx5e_add_eth_addr_rule(priv, &ea->allmulti, MLX5E_ALLMULTI); + if (enable_broadcast) + mlx5e_add_eth_addr_rule(priv, &ea->broadcast, MLX5E_FULLMATCH); + + mlx5e_handle_ifp_addr(priv); + + if (disable_broadcast) + mlx5e_del_eth_addr_from_flow_table(priv, &ea->broadcast); + if (disable_allmulti) + mlx5e_del_eth_addr_from_flow_table(priv, &ea->allmulti); + if (disable_promisc) { + if (!priv->vlan.filter_disabled) + mlx5e_del_any_vid_rules(priv); + mlx5e_del_eth_addr_from_flow_table(priv, &ea->promisc); + } + + ea->promisc_enabled = promisc_enabled; + ea->allmulti_enabled = allmulti_enabled; + ea->broadcast_enabled = broadcast_enabled; + + mlx5e_vport_context_update(priv); +} + +void +mlx5e_set_rx_mode_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = + 
container_of(work, struct mlx5e_priv, set_rx_mode_work); + + PRIV_LOCK(priv); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_set_rx_mode_core(priv); + PRIV_UNLOCK(priv); +} + +static void +mlx5e_destroy_groups(struct mlx5e_flow_table *ft) +{ + int i; + + for (i = ft->num_groups - 1; i >= 0; i--) { + if (!IS_ERR_OR_NULL(ft->g[i])) + mlx5_destroy_flow_group(ft->g[i]); + ft->g[i] = NULL; + } + ft->num_groups = 0; +} + +static void +mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft) +{ + mlx5e_destroy_groups(ft); + kfree(ft->g); + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; +} + +#define MLX5E_NUM_MAIN_GROUPS 10 +#define MLX5E_MAIN_GROUP0_SIZE BIT(4) +#define MLX5E_MAIN_GROUP1_SIZE BIT(3) +#define MLX5E_MAIN_GROUP2_SIZE BIT(1) +#define MLX5E_MAIN_GROUP3_SIZE BIT(0) +#define MLX5E_MAIN_GROUP4_SIZE BIT(14) +#define MLX5E_MAIN_GROUP5_SIZE BIT(13) +#define MLX5E_MAIN_GROUP6_SIZE BIT(11) +#define MLX5E_MAIN_GROUP7_SIZE BIT(2) +#define MLX5E_MAIN_GROUP8_SIZE BIT(1) +#define MLX5E_MAIN_GROUP9_SIZE BIT(0) +#define MLX5E_MAIN_TABLE_SIZE (MLX5E_MAIN_GROUP0_SIZE +\ + MLX5E_MAIN_GROUP1_SIZE +\ + MLX5E_MAIN_GROUP2_SIZE +\ + MLX5E_MAIN_GROUP3_SIZE +\ + MLX5E_MAIN_GROUP4_SIZE +\ + MLX5E_MAIN_GROUP5_SIZE +\ + MLX5E_MAIN_GROUP6_SIZE +\ + MLX5E_MAIN_GROUP7_SIZE +\ + MLX5E_MAIN_GROUP8_SIZE +\ + MLX5E_MAIN_GROUP9_SIZE +\ + 0) + +static int +mlx5e_create_main_groups_sub(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + u8 *dmac = MLX5_ADDR_OF(create_flow_group_in, in, + match_criteria.outer_headers.dmac_47_16); + int err; + int ix = 0; + + /* Tunnel rules need to be first in this list of groups */ + + /* Start tunnel rules */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.udp_dport); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + /* End Tunnel Rules */ + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, 
match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + memset(dmac, 0xff, ETH_ALEN); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP4_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + memset(dmac, 0xff, ETH_ALEN); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP5_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + memset(dmac, 0xff, ETH_ALEN); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP6_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP7_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP8_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP9_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + return (0); + +err_destory_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return (err); +} + +static int +mlx5e_create_main_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) + return (-ENOMEM); + + err = mlx5e_create_main_groups_sub(ft, in, inlen); + + kvfree(in); + return (err); +} + +static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fts.main; + int err; + + ft->num_groups = 0; + ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "main", + MLX5E_MAIN_TABLE_SIZE); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return (err); + } + ft->g = kcalloc(MLX5E_NUM_MAIN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto 
err_destroy_main_flow_table; + } + + err = mlx5e_create_main_groups(ft); + if (err) + goto err_free_g; + return (0); + +err_free_g: + kfree(ft->g); + +err_destroy_main_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return (err); +} + +static void mlx5e_destroy_main_flow_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_flow_table(&priv->fts.main); +} + +#define MLX5E_NUM_VLAN_GROUPS 3 +#define MLX5E_VLAN_GROUP0_SIZE BIT(12) +#define MLX5E_VLAN_GROUP1_SIZE BIT(1) +#define MLX5E_VLAN_GROUP2_SIZE BIT(0) +#define MLX5E_VLAN_TABLE_SIZE (MLX5E_VLAN_GROUP0_SIZE +\ + MLX5E_VLAN_GROUP1_SIZE +\ + MLX5E_VLAN_GROUP2_SIZE +\ + 0) + +static int +mlx5e_create_vlan_groups_sub(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + int err; + int ix = 0; + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + return (0); + +err_destory_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return (err); +} + +static int +mlx5e_create_vlan_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) + return (-ENOMEM); + + err = mlx5e_create_vlan_groups_sub(ft, in, inlen); + + kvfree(in); + return (err); +} + +static int +mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fts.vlan; + int err; + + ft->num_groups = 0; + ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "vlan", + MLX5E_VLAN_TABLE_SIZE); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return (err); + } + ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_vlan_flow_table; + } + + err = mlx5e_create_vlan_groups(ft); + if (err) + goto err_free_g; + + return (0); + +err_free_g: + kfree(ft->g); + +err_destroy_vlan_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return (err); +} + +static void +mlx5e_destroy_vlan_flow_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_flow_table(&priv->fts.vlan); +} + +#define MLX5E_NUM_INNER_RSS_GROUPS 3 +#define MLX5E_INNER_RSS_GROUP0_SIZE BIT(3) +#define 
MLX5E_INNER_RSS_GROUP1_SIZE BIT(1) +#define MLX5E_INNER_RSS_GROUP2_SIZE BIT(0) +#define MLX5E_INNER_RSS_TABLE_SIZE (MLX5E_INNER_RSS_GROUP0_SIZE +\ + MLX5E_INNER_RSS_GROUP1_SIZE +\ + MLX5E_INNER_RSS_GROUP2_SIZE +\ + 0) + +static int +mlx5e_create_inner_rss_groups_sub(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + int err; + int ix = 0; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_RSS_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_RSS_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_RSS_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destory_groups; + ft->num_groups++; + + return (0); + +err_destory_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return (err); +} + +static int +mlx5e_create_inner_rss_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) + return (-ENOMEM); + + err = mlx5e_create_inner_rss_groups_sub(ft, in, inlen); + + kvfree(in); + return (err); +} + +static int +mlx5e_create_inner_rss_flow_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fts.inner_rss; + int err; + + ft->num_groups = 0; + ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "inner_rss", + MLX5E_INNER_RSS_TABLE_SIZE); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return (err); + } + ft->g = kcalloc(MLX5E_NUM_INNER_RSS_GROUPS, sizeof(*ft->g), + GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_inner_rss_flow_table; + } + + err = mlx5e_create_inner_rss_groups(ft); + if (err) + goto err_free_g; + + return (0); + +err_free_g: + kfree(ft->g); + +err_destroy_inner_rss_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return (err); +} + +static void mlx5e_destroy_inner_rss_flow_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_flow_table(&priv->fts.inner_rss); +} + +int +mlx5e_open_flow_table(struct mlx5e_priv *priv) +{ + int err; + + priv->fts.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + err = mlx5e_create_vlan_flow_table(priv); + if (err) + return (err); + + err = mlx5e_create_main_flow_table(priv); + if (err) + goto err_destroy_vlan_flow_table; + + err = mlx5e_create_inner_rss_flow_table(priv); + if (err) + goto err_destroy_main_flow_table; + + return (0); + +err_destroy_main_flow_table: + mlx5e_destroy_main_flow_table(priv); +err_destroy_vlan_flow_table: + mlx5e_destroy_vlan_flow_table(priv); + + return 
(err); +} + +void +mlx5e_close_flow_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_inner_rss_flow_table(priv); + mlx5e_destroy_main_flow_table(priv); + mlx5e_destroy_vlan_flow_table(priv); +} diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c new file mode 100644 index 000000000000..916ebe72c46c --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -0,0 +1,3901 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "en.h" + +#include <sys/sockio.h> +#include <machine/atomic.h> + +#ifndef ETH_DRIVER_VERSION +#define ETH_DRIVER_VERSION "3.4.2" +#endif + +char mlx5e_version[] = "Mellanox Ethernet driver" + " (" ETH_DRIVER_VERSION ")"; + +static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs); + +struct mlx5e_channel_param { + struct mlx5e_rq_param rq; + struct mlx5e_sq_param sq; + struct mlx5e_cq_param rx_cq; + struct mlx5e_cq_param tx_cq; +}; + +static const struct { + u32 subtype; + u64 baudrate; +} mlx5e_mode_table[MLX5E_LINK_MODES_NUMBER] = { + + [MLX5E_1000BASE_CX_SGMII] = { + .subtype = IFM_1000_CX_SGMII, + .baudrate = IF_Mbps(1000ULL), + }, + [MLX5E_1000BASE_KX] = { + .subtype = IFM_1000_KX, + .baudrate = IF_Mbps(1000ULL), + }, + [MLX5E_10GBASE_CX4] = { + .subtype = IFM_10G_CX4, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_10GBASE_KX4] = { + .subtype = IFM_10G_KX4, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_10GBASE_KR] = { + .subtype = IFM_10G_KR, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_20GBASE_KR2] = { + .subtype = IFM_20G_KR2, + .baudrate = IF_Gbps(20ULL), + }, + [MLX5E_40GBASE_CR4] = { + .subtype = IFM_40G_CR4, + .baudrate = IF_Gbps(40ULL), + }, + [MLX5E_40GBASE_KR4] = { + .subtype = IFM_40G_KR4, + .baudrate = IF_Gbps(40ULL), + }, + [MLX5E_56GBASE_R4] = { + .subtype = IFM_56G_R4, + .baudrate = IF_Gbps(56ULL), + }, + [MLX5E_10GBASE_CR] = { + .subtype = IFM_10G_CR1, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_10GBASE_SR] = { + .subtype = IFM_10G_SR, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_10GBASE_ER] = { + .subtype = IFM_10G_ER, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_40GBASE_SR4] = { + .subtype = IFM_40G_SR4, + .baudrate = IF_Gbps(40ULL), + }, + [MLX5E_40GBASE_LR4] = { + .subtype = 
IFM_40G_LR4, + .baudrate = IF_Gbps(40ULL), + }, + [MLX5E_100GBASE_CR4] = { + .subtype = IFM_100G_CR4, + .baudrate = IF_Gbps(100ULL), + }, + [MLX5E_100GBASE_SR4] = { + .subtype = IFM_100G_SR4, + .baudrate = IF_Gbps(100ULL), + }, + [MLX5E_100GBASE_KR4] = { + .subtype = IFM_100G_KR4, + .baudrate = IF_Gbps(100ULL), + }, + [MLX5E_100GBASE_LR4] = { + .subtype = IFM_100G_LR4, + .baudrate = IF_Gbps(100ULL), + }, + [MLX5E_100BASE_TX] = { + .subtype = IFM_100_TX, + .baudrate = IF_Mbps(100ULL), + }, + [MLX5E_1000BASE_T] = { + .subtype = IFM_1000_T, + .baudrate = IF_Mbps(1000ULL), + }, + [MLX5E_10GBASE_T] = { + .subtype = IFM_10G_T, + .baudrate = IF_Gbps(10ULL), + }, + [MLX5E_25GBASE_CR] = { + .subtype = IFM_25G_CR, + .baudrate = IF_Gbps(25ULL), + }, + [MLX5E_25GBASE_KR] = { + .subtype = IFM_25G_KR, + .baudrate = IF_Gbps(25ULL), + }, + [MLX5E_25GBASE_SR] = { + .subtype = IFM_25G_SR, + .baudrate = IF_Gbps(25ULL), + }, + [MLX5E_50GBASE_CR2] = { + .subtype = IFM_50G_CR2, + .baudrate = IF_Gbps(50ULL), + }, + [MLX5E_50GBASE_KR2] = { + .subtype = IFM_50G_KR2, + .baudrate = IF_Gbps(50ULL), + }, +}; + +MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet"); + +static SYSCTL_NODE(_hw, OID_AUTO, mlx5, CTLFLAG_RW, 0, "MLX5 driver parameters"); + +static void +mlx5e_update_carrier(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + u32 eth_proto_oper; + int error; + u8 port_state; + u8 i; + + port_state = mlx5_query_vport_state(mdev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); + + if (port_state == VPORT_STATE_UP) { + priv->media_status_last |= IFM_ACTIVE; + } else { + priv->media_status_last &= ~IFM_ACTIVE; + priv->media_active_last = IFM_ETHER; + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + return; + } + + error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); + if (error) { + priv->media_active_last = IFM_ETHER; + priv->ifp->if_baudrate = 1; + if_printf(priv->ifp, "%s: query port ptys failed: 0x%x\n", + __func__, error); + return; + } + eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper); + + for (i = 0; i != MLX5E_LINK_MODES_NUMBER; i++) { + if (mlx5e_mode_table[i].baudrate == 0) + continue; + if (MLX5E_PROT_MASK(i) & eth_proto_oper) { + priv->ifp->if_baudrate = + mlx5e_mode_table[i].baudrate; + priv->media_active_last = + mlx5e_mode_table[i].subtype | IFM_ETHER | IFM_FDX; + } + } + if_link_state_change(priv->ifp, LINK_STATE_UP); +} + +static void +mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr) +{ + struct mlx5e_priv *priv = dev->if_softc; + + ifmr->ifm_status = priv->media_status_last; + ifmr->ifm_active = priv->media_active_last | + (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) | + (priv->params.tx_pauseframe_control ? 
IFM_ETH_TXPAUSE : 0); + +} + +static u32 +mlx5e_find_link_mode(u32 subtype) +{ + u32 i; + u32 link_mode = 0; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { + if (mlx5e_mode_table[i].baudrate == 0) + continue; + if (mlx5e_mode_table[i].subtype == subtype) + link_mode |= MLX5E_PROT_MASK(i); + } + + return (link_mode); +} + +static int +mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv) +{ + return (mlx5_set_port_pause_and_pfc(priv->mdev, 1, + priv->params.rx_pauseframe_control, + priv->params.tx_pauseframe_control, + priv->params.rx_priority_flow_control, + priv->params.tx_priority_flow_control)); +} + +static int +mlx5e_set_port_pfc(struct mlx5e_priv *priv) +{ + int error; + + if (priv->params.rx_pauseframe_control || + priv->params.tx_pauseframe_control) { + if_printf(priv->ifp, + "Global pauseframes must be disabled before enabling PFC.\n"); + error = -EINVAL; + } else { + error = mlx5e_set_port_pause_and_pfc(priv); + } + return (error); +} + +static int +mlx5e_media_change(struct ifnet *dev) +{ + struct mlx5e_priv *priv = dev->if_softc; + struct mlx5_core_dev *mdev = priv->mdev; + u32 eth_proto_cap; + u32 link_mode; + int was_opened; + int locked; + int error; + + locked = PRIV_LOCKED(priv); + if (!locked) + PRIV_LOCK(priv); + + if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) { + error = EINVAL; + goto done; + } + link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media)); + + /* query supported capabilities */ + error = mlx5_query_port_proto_cap(mdev, ð_proto_cap, MLX5_PTYS_EN); + if (error != 0) { + if_printf(dev, "Query port media capability failed\n"); + goto done; + } + /* check for autoselect */ + if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) { + link_mode = eth_proto_cap; + if (link_mode == 0) { + if_printf(dev, "Port media capability is zero\n"); + error = EINVAL; + goto done; + } + } else { + link_mode = link_mode & eth_proto_cap; + if (link_mode == 0) { + if_printf(dev, "Not supported link mode requested\n"); + error = EINVAL; + goto done; + } + } + if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) { + /* check if PFC is enabled */ + if (priv->params.rx_priority_flow_control || + priv->params.tx_priority_flow_control) { + if_printf(dev, "PFC must be disabled before enabling global pauseframes.\n"); + error = EINVAL; + goto done; + } + } + /* update pauseframe control bits */ + priv->params.rx_pauseframe_control = + (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0; + priv->params.tx_pauseframe_control = + (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0; + + /* check if device is opened */ + was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + + /* reconfigure the hardware */ + mlx5_set_port_status(mdev, MLX5_PORT_DOWN); + mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN); + error = -mlx5e_set_port_pause_and_pfc(priv); + if (was_opened) + mlx5_set_port_status(mdev, MLX5_PORT_UP); + +done: + if (!locked) + PRIV_UNLOCK(priv); + return (error); +} + +static void +mlx5e_update_carrier_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + update_carrier_work); + + PRIV_LOCK(priv); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_update_carrier(priv); + PRIV_UNLOCK(priv); +} + +/* + * This function reads the physical port counters from the firmware + * using a pre-defined layout defined by various MLX5E_PPORT_XXX() + * macros. The output is converted from big-endian 64-bit values into + * host endian ones and stored in the "priv->stats.pport" structure. 
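The conversion described above amounts to walking an array of big-endian 64-bit counters and byte-swapping each word into the host-endian stats arrays. A minimal userspace sketch of that step (not driver code; the buffer layout and function name are assumptions for illustration):

    /* Byte-swap a dump of big-endian 64-bit counters into host order. */
    #include <sys/endian.h>	/* be64toh(); <endian.h> on glibc */
    #include <stdint.h>
    #include <stddef.h>

    static void
    counters_be64_to_host(const uint64_t *be_src, uint64_t *host_dst, size_t num)
    {
        size_t i;

        for (i = 0; i != num; i++)
            host_dst[i] = be64toh(be_src[i]);
    }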
+ */ +static void +mlx5e_update_pport_counters(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_pport_stats *s = &priv->stats.pport; + struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; + u32 *in; + u32 *out; + const u64 *ptr; + unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + unsigned x; + unsigned y; + unsigned z; + + /* allocate firmware request structures */ + in = mlx5_vzalloc(sz); + out = mlx5_vzalloc(sz); + if (in == NULL || out == NULL) + goto free_out; + + /* + * Get pointer to the 64-bit counter set which is located at a + * fixed offset in the output firmware request structure: + */ + ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set); + + MLX5_SET(ppcnt_reg, in, local_port, 1); + + /* read IEEE802_3 counter group using predefined counter layout */ + MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM; + x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++) + s->arg[y] = be64toh(ptr[x]); + + /* read RFC2819 counter group using predefined counter layout */ + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++) + s->arg[y] = be64toh(ptr[x]); + for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM + + MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++) + s_debug->arg[y] = be64toh(ptr[x]); + + /* read RFC2863 counter group using predefined counter layout */ + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++) + s_debug->arg[y] = be64toh(ptr[x]); + + /* read physical layer stats counter group using predefined counter layout */ + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++) + s_debug->arg[y] = be64toh(ptr[x]); + + /* read per-priority counters */ + MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); + + /* iterate all the priorities */ + for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) { + MLX5_SET(ppcnt_reg, in, prio_tc, z); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + + /* read per priority stats counter group using predefined counter layout */ + for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM / + MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++) + s->arg[y] = be64toh(ptr[x]); + } +free_out: + /* free firmware request structures */ + kvfree(in); + kvfree(out); +} + +/* + * This function is called regularly to collect all statistics + * counters from the firmware. The values can be viewed through the + * sysctl interface. Execution is serialized using the priv's global + * configuration lock. 
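One detail of the function below worth calling out is how the 32-bit out-of-buffer drop counter returned by the firmware is folded into a 64-bit software total: the unsigned 32-bit difference from the previous reading is added, which stays correct across counter wrap-around. A standalone sketch of that idiom (hypothetical names, not the driver's types):

    #include <stdint.h>

    struct drop_counter {
        uint64_t total;	/* accumulated 64-bit value */
        uint32_t prev;	/* last raw 32-bit reading from hardware */
    };

    static void
    drop_counter_update(struct drop_counter *c, uint32_t raw)
    {
        /* unsigned 32-bit subtraction handles wrap-around */
        c->total += (uint32_t)(raw - c->prev);
        c->prev = raw;
    }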
+ */ +static void +mlx5e_update_stats_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + update_stats_work); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_vport_stats *s = &priv->stats.vport; + struct mlx5e_rq_stats *rq_stats; + struct mlx5e_sq_stats *sq_stats; + struct buf_ring *sq_br; +#if (__FreeBSD_version < 1100000) + struct ifnet *ifp = priv->ifp; +#endif + + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u64 tso_packets = 0; + u64 tso_bytes = 0; + u64 tx_queue_dropped = 0; + u64 tx_defragged = 0; + u64 tx_offload_none = 0; + u64 lro_packets = 0; + u64 lro_bytes = 0; + u64 sw_lro_queued = 0; + u64 sw_lro_flushed = 0; + u64 rx_csum_none = 0; + u64 rx_wqe_err = 0; + u32 rx_out_of_buffer = 0; + int i; + int j; + + PRIV_LOCK(priv); + out = mlx5_vzalloc(outlen); + if (out == NULL) + goto free_out; + if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) + goto free_out; + + /* Collect firts the SW counters and then HW for consistency */ + for (i = 0; i < priv->params.num_channels; i++) { + struct mlx5e_rq *rq = &priv->channel[i]->rq; + + rq_stats = &priv->channel[i]->rq.stats; + + /* collect stats from LRO */ + rq_stats->sw_lro_queued = rq->lro.lro_queued; + rq_stats->sw_lro_flushed = rq->lro.lro_flushed; + sw_lro_queued += rq_stats->sw_lro_queued; + sw_lro_flushed += rq_stats->sw_lro_flushed; + lro_packets += rq_stats->lro_packets; + lro_bytes += rq_stats->lro_bytes; + rx_csum_none += rq_stats->csum_none; + rx_wqe_err += rq_stats->wqe_err; + + for (j = 0; j < priv->num_tc; j++) { + sq_stats = &priv->channel[i]->sq[j].stats; + sq_br = priv->channel[i]->sq[j].br; + + tso_packets += sq_stats->tso_packets; + tso_bytes += sq_stats->tso_bytes; + tx_queue_dropped += sq_stats->dropped; + if (sq_br != NULL) + tx_queue_dropped += sq_br->br_drops; + tx_defragged += sq_stats->defragged; + tx_offload_none += sq_stats->csum_offload_none; + } + } + + /* update counters */ + s->tso_packets = tso_packets; + s->tso_bytes = tso_bytes; + s->tx_queue_dropped = tx_queue_dropped; + s->tx_defragged = tx_defragged; + s->lro_packets = lro_packets; + s->lro_bytes = lro_bytes; + s->sw_lro_queued = sw_lro_queued; + s->sw_lro_flushed = sw_lro_flushed; + s->rx_csum_none = rx_csum_none; + s->rx_wqe_err = rx_wqe_err; + + /* HW counters */ + memset(in, 0, sizeof(in)); + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + MLX5_SET(query_vport_counter_in, in, op_mod, 0); + MLX5_SET(query_vport_counter_in, in, other_vport, 0); + + memset(out, 0, outlen); + + /* get number of out-of-buffer drops first */ + if (mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id, + &rx_out_of_buffer)) + goto free_out; + + /* accumulate difference into a 64-bit counter */ + s->rx_out_of_buffer += (u64)(u32)(rx_out_of_buffer - s->rx_out_of_buffer_prev); + s->rx_out_of_buffer_prev = rx_out_of_buffer; + + /* get port statistics */ + if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen)) + goto free_out; + +#define MLX5_GET_CTR(out, x) \ + MLX5_GET64(query_vport_counter_out, out, x) + + s->rx_error_packets = + MLX5_GET_CTR(out, received_errors.packets); + s->rx_error_bytes = + MLX5_GET_CTR(out, received_errors.octets); + s->tx_error_packets = + MLX5_GET_CTR(out, transmit_errors.packets); + s->tx_error_bytes = + MLX5_GET_CTR(out, transmit_errors.octets); + + s->rx_unicast_packets = + MLX5_GET_CTR(out, received_eth_unicast.packets); + s->rx_unicast_bytes = + MLX5_GET_CTR(out, 
received_eth_unicast.octets); + s->tx_unicast_packets = + MLX5_GET_CTR(out, transmitted_eth_unicast.packets); + s->tx_unicast_bytes = + MLX5_GET_CTR(out, transmitted_eth_unicast.octets); + + s->rx_multicast_packets = + MLX5_GET_CTR(out, received_eth_multicast.packets); + s->rx_multicast_bytes = + MLX5_GET_CTR(out, received_eth_multicast.octets); + s->tx_multicast_packets = + MLX5_GET_CTR(out, transmitted_eth_multicast.packets); + s->tx_multicast_bytes = + MLX5_GET_CTR(out, transmitted_eth_multicast.octets); + + s->rx_broadcast_packets = + MLX5_GET_CTR(out, received_eth_broadcast.packets); + s->rx_broadcast_bytes = + MLX5_GET_CTR(out, received_eth_broadcast.octets); + s->tx_broadcast_packets = + MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); + s->tx_broadcast_bytes = + MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); + + s->rx_packets = + s->rx_unicast_packets + + s->rx_multicast_packets + + s->rx_broadcast_packets - + s->rx_out_of_buffer; + s->rx_bytes = + s->rx_unicast_bytes + + s->rx_multicast_bytes + + s->rx_broadcast_bytes; + s->tx_packets = + s->tx_unicast_packets + + s->tx_multicast_packets + + s->tx_broadcast_packets; + s->tx_bytes = + s->tx_unicast_bytes + + s->tx_multicast_bytes + + s->tx_broadcast_bytes; + + /* Update calculated offload counters */ + s->tx_csum_offload = s->tx_packets - tx_offload_none; + s->rx_csum_good = s->rx_packets - s->rx_csum_none; + + /* Get physical port counters */ + mlx5e_update_pport_counters(priv); + +#if (__FreeBSD_version < 1100000) + /* no get_counters interface in fbsd 10 */ + ifp->if_ipackets = s->rx_packets; + ifp->if_ierrors = s->rx_error_packets + + priv->stats.pport.alignment_err + + priv->stats.pport.check_seq_err + + priv->stats.pport.crc_align_errors + + priv->stats.pport.in_range_len_errors + + priv->stats.pport.jabbers + + priv->stats.pport.out_of_range_len + + priv->stats.pport.oversize_pkts + + priv->stats.pport.symbol_err + + priv->stats.pport.too_long_errors + + priv->stats.pport.undersize_pkts + + priv->stats.pport.unsupported_op_rx; + ifp->if_iqdrops = s->rx_out_of_buffer + + priv->stats.pport.drop_events; + ifp->if_opackets = s->tx_packets; + ifp->if_oerrors = s->tx_error_packets; + ifp->if_snd.ifq_drops = s->tx_queue_dropped; + ifp->if_ibytes = s->rx_bytes; + ifp->if_obytes = s->tx_bytes; + ifp->if_collisions = + priv->stats.pport.collisions; +#endif + +free_out: + kvfree(out); + + /* Update diagnostics, if any */ + if (priv->params_ethtool.diag_pci_enable || + priv->params_ethtool.diag_general_enable) { + int error = mlx5_core_get_diagnostics_full(mdev, + priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL, + priv->params_ethtool.diag_general_enable ? 
&priv->params_general : NULL); + if (error != 0) + if_printf(priv->ifp, "Failed reading diagnostics: %d\n", error); + } + PRIV_UNLOCK(priv); +} + +static void +mlx5e_update_stats(void *arg) +{ + struct mlx5e_priv *priv = arg; + + queue_work(priv->wq, &priv->update_stats_work); + + callout_reset(&priv->watchdog, hz, &mlx5e_update_stats, priv); +} + +static void +mlx5e_async_event_sub(struct mlx5e_priv *priv, + enum mlx5_dev_event event) +{ + switch (event) { + case MLX5_DEV_EVENT_PORT_UP: + case MLX5_DEV_EVENT_PORT_DOWN: + queue_work(priv->wq, &priv->update_carrier_work); + break; + + default: + break; + } +} + +static void +mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5e_priv *priv = vpriv; + + mtx_lock(&priv->async_events_mtx); + if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state)) + mlx5e_async_event_sub(priv, event); + mtx_unlock(&priv->async_events_mtx); +} + +static void +mlx5e_enable_async_events(struct mlx5e_priv *priv) +{ + set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); +} + +static void +mlx5e_disable_async_events(struct mlx5e_priv *priv) +{ + mtx_lock(&priv->async_events_mtx); + clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); + mtx_unlock(&priv->async_events_mtx); +} + +static void mlx5e_calibration_callout(void *arg); +static int mlx5e_calibration_duration = 20; +static int mlx5e_fast_calibration = 1; +static int mlx5e_normal_calibration = 30; + +static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0, + "MLX5 timestamp calibration parameteres"); + +SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN, + &mlx5e_calibration_duration, 0, + "Duration of initial calibration"); +SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN, + &mlx5e_fast_calibration, 0, + "Recalibration interval during initial calibration"); +SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN, + &mlx5e_normal_calibration, 0, + "Recalibration interval during normal operations"); + +/* + * Ignites the calibration process. + */ +static void +mlx5e_reset_calibration_callout(struct mlx5e_priv *priv) +{ + + if (priv->clbr_done == 0) + mlx5e_calibration_callout(priv); + else + callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done < + mlx5e_calibration_duration ? mlx5e_fast_calibration : + mlx5e_normal_calibration) * hz, mlx5e_calibration_callout, + priv); +} + +static uint64_t +mlx5e_timespec2usec(const struct timespec *ts) +{ + + return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec); +} + +static uint64_t +mlx5e_hw_clock(struct mlx5e_priv *priv) +{ + struct mlx5_init_seg *iseg; + uint32_t hw_h, hw_h1, hw_l; + + iseg = priv->mdev->iseg; + do { + hw_h = ioread32be(&iseg->internal_timer_h); + hw_l = ioread32be(&iseg->internal_timer_l); + hw_h1 = ioread32be(&iseg->internal_timer_h); + } while (hw_h1 != hw_h); + return (((uint64_t)hw_h << 32) | hw_l); +} + +/* + * The calibration callout, it runs either in the context of the + * thread which enables calibration, or in callout. It takes the + * snapshot of system and adapter clocks, then advances the pointers to + * the calibration point to allow rx path to read the consistent data + * lockless. 
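The lockless scheme mentioned above relies on a small generation counter per calibration point: the writer clears the old point's generation, publishes the new data, then stores a new non-zero generation with release semantics, and readers retry until they observe a stable, non-zero generation. A simplified userspace sketch of such a reader (assumed field names, not the driver's exact layout):

    #include <stdint.h>
    #include <stdatomic.h>

    struct clbr_point {
        _Atomic uint32_t gen;	/* 0 means "not valid" */
        uint64_t base;		/* system clock snapshot */
        uint64_t hw;		/* hardware clock snapshot */
    };

    static int
    clbr_point_read(const struct clbr_point *p, uint64_t *base, uint64_t *hw)
    {
        uint32_t g1, g2;

        do {
            g1 = atomic_load_explicit(&p->gen, memory_order_acquire);
            if (g1 == 0)
                return (-1);	/* calibration not ready */
            *base = p->base;
            *hw = p->hw;
            atomic_thread_fence(memory_order_acquire);
            g2 = atomic_load_explicit(&p->gen, memory_order_relaxed);
        } while (g1 != g2);
        return (0);
    }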
+ */ +static void +mlx5e_calibration_callout(void *arg) +{ + struct mlx5e_priv *priv; + struct mlx5e_clbr_point *next, *curr; + struct timespec ts; + int clbr_curr_next; + + priv = arg; + curr = &priv->clbr_points[priv->clbr_curr]; + clbr_curr_next = priv->clbr_curr + 1; + if (clbr_curr_next >= nitems(priv->clbr_points)) + clbr_curr_next = 0; + next = &priv->clbr_points[clbr_curr_next]; + + next->base_prev = curr->base_curr; + next->clbr_hw_prev = curr->clbr_hw_curr; + + next->clbr_hw_curr = mlx5e_hw_clock(priv); + if (((next->clbr_hw_curr - curr->clbr_hw_prev) >> MLX5E_TSTMP_PREC) == + 0) { + if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx," + "disabling\n", next->clbr_hw_curr, curr->clbr_hw_prev); + priv->clbr_done = 0; + return; + } + + nanouptime(&ts); + next->base_curr = mlx5e_timespec2usec(&ts); + + curr->clbr_gen = 0; + atomic_thread_fence_rel(); + priv->clbr_curr = clbr_curr_next; + atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen)); + + if (priv->clbr_done < mlx5e_calibration_duration) + priv->clbr_done++; + mlx5e_reset_calibration_callout(priv); +} + +static const char *mlx5e_rq_stats_desc[] = { + MLX5E_RQ_STATS(MLX5E_STATS_DESC) +}; + +static int +mlx5e_create_rq(struct mlx5e_channel *c, + struct mlx5e_rq_param *param, + struct mlx5e_rq *rq) +{ + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + char buffer[16]; + void *rqc = param->rqc; + void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); + int wq_sz; + int err; + int i; + u32 nsegs, wqe_sz; + + err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); + if (err != 0) + goto done; + + /* Create DMA descriptor TAG */ + if ((err = -bus_dma_tag_create( + bus_get_dma_tag(mdev->pdev->dev.bsddev), + 1, /* any alignment */ + 0, /* no boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */ + nsegs, /* nsegments */ + nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg */ + &rq->dma_tag))) + goto done; + + err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, + &rq->wq_ctrl); + if (err) + goto err_free_dma_tag; + + rq->wq.db = &rq->wq.db[MLX5_RCV_DBR]; + + err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs); + if (err != 0) + goto err_rq_wq_destroy; + + wq_sz = mlx5_wq_ll_get_size(&rq->wq); + + err = -tcp_lro_init_args(&rq->lro, c->ifp, TCP_LRO_ENTRIES, wq_sz); + if (err) + goto err_rq_wq_destroy; + + rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); + for (i = 0; i != wq_sz; i++) { + struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); +#if (MLX5E_MAX_RX_SEGS == 1) + uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN; +#else + int j; +#endif + + err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map); + if (err != 0) { + while (i--) + bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); + goto err_rq_mbuf_free; + } + + /* set value for constant fields */ +#if (MLX5E_MAX_RX_SEGS == 1) + wqe->data[0].lkey = c->mkey_be; + wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING); +#else + for (j = 0; j < rq->nsegs; j++) + wqe->data[j].lkey = c->mkey_be; +#endif + } + + rq->ifp = c->ifp; + rq->channel = c; + rq->ix = c->ix; + + snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix); + mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM, + rq->stats.arg); + return (0); + +err_rq_mbuf_free: + free(rq->mbuf, M_MLX5EN); + 
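The error labels here follow the goto-unwind convention used by all of the create/open routines in this file: resources are released in the reverse order of their acquisition, and each label undoes exactly the steps that succeeded before the failure. A generic, self-contained sketch of the pattern (the resource names are hypothetical):

    struct res { int a, b, c; };

    static int alloc_a(struct res *r) { r->a = 1; return (0); }
    static int alloc_b(struct res *r) { r->b = 1; return (0); }
    static int alloc_c(struct res *r) { r->c = 1; return (0); }
    static void free_a(struct res *r) { r->a = 0; }
    static void free_b(struct res *r) { r->b = 0; }

    static int
    open_all(struct res *r)
    {
        int err;

        err = alloc_a(r);
        if (err)
            return (err);
        err = alloc_b(r);
        if (err)
            goto err_free_a;
        err = alloc_c(r);
        if (err)
            goto err_free_b;
        return (0);

    err_free_b:
        free_b(r);
    err_free_a:
        free_a(r);
        return (err);
    }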
tcp_lro_free(&rq->lro); +err_rq_wq_destroy: + mlx5_wq_destroy(&rq->wq_ctrl); +err_free_dma_tag: + bus_dma_tag_destroy(rq->dma_tag); +done: + return (err); +} + +static void +mlx5e_destroy_rq(struct mlx5e_rq *rq) +{ + int wq_sz; + int i; + + /* destroy all sysctl nodes */ + sysctl_ctx_free(&rq->stats.ctx); + + /* free leftover LRO packets, if any */ + tcp_lro_free(&rq->lro); + + wq_sz = mlx5_wq_ll_get_size(&rq->wq); + for (i = 0; i != wq_sz; i++) { + if (rq->mbuf[i].mbuf != NULL) { + bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map); + m_freem(rq->mbuf[i].mbuf); + } + bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); + } + free(rq->mbuf, M_MLX5EN); + mlx5_wq_destroy(&rq->wq_ctrl); +} + +static int +mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + + sizeof(u64) * rq->wq_ctrl.buf.npages; + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + memcpy(rqc, param->rqc, sizeof(param->rqc)); + + MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + if (priv->counter_set_id >= 0) + MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id); + MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - + PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); + + mlx5_fill_page_array(&rq->wq_ctrl.buf, + (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); + + kvfree(in); + + return (err); +} + +static int +mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rqn, rq->rqn); + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + err = mlx5_core_modify_rq(mdev, in, inlen); + + kvfree(in); + + return (err); +} + +static void +mlx5e_disable_rq(struct mlx5e_rq *rq) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + mlx5_core_destroy_rq(mdev, rq->rqn); +} + +static int +mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_wq_ll *wq = &rq->wq; + int i; + + for (i = 0; i < 1000; i++) { + if (wq->cur_sz >= priv->params.min_rx_wqes) + return (0); + + msleep(4); + } + return (-ETIMEDOUT); +} + +static int +mlx5e_open_rq(struct mlx5e_channel *c, + struct mlx5e_rq_param *param, + struct mlx5e_rq *rq) +{ + int err; + + err = mlx5e_create_rq(c, param, rq); + if (err) + return (err); + + err = mlx5e_enable_rq(rq, param); + if (err) + goto err_destroy_rq; + + err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + goto err_disable_rq; + + c->rq.enabled = 1; + + return (0); + +err_disable_rq: + mlx5e_disable_rq(rq); +err_destroy_rq: + mlx5e_destroy_rq(rq); + + return (err); +} + +static void +mlx5e_close_rq(struct mlx5e_rq *rq) +{ + 
mtx_lock(&rq->mtx); + rq->enabled = 0; + callout_stop(&rq->watchdog); + mtx_unlock(&rq->mtx); + + callout_drain(&rq->watchdog); + + mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); +} + +static void +mlx5e_close_rq_wait(struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = rq->channel->priv->mdev; + + /* wait till RQ is empty */ + while (!mlx5_wq_ll_is_empty(&rq->wq) && + (mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR)) { + msleep(4); + rq->cq.mcq.comp(&rq->cq.mcq); + } + + mlx5e_disable_rq(rq); + mlx5e_destroy_rq(rq); +} + +void +mlx5e_free_sq_db(struct mlx5e_sq *sq) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + int x; + + for (x = 0; x != wq_sz; x++) + bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); + free(sq->mbuf, M_MLX5EN); +} + +int +mlx5e_alloc_sq_db(struct mlx5e_sq *sq) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + int err; + int x; + + sq->mbuf = malloc(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); + + /* Create DMA descriptor MAPs */ + for (x = 0; x != wq_sz; x++) { + err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map); + if (err != 0) { + while (x--) + bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); + free(sq->mbuf, M_MLX5EN); + return (err); + } + } + return (0); +} + +static const char *mlx5e_sq_stats_desc[] = { + MLX5E_SQ_STATS(MLX5E_STATS_DESC) +}; + +static int +mlx5e_create_sq(struct mlx5e_channel *c, + int tc, + struct mlx5e_sq_param *param, + struct mlx5e_sq *sq) +{ + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + char buffer[16]; + + void *sqc = param->sqc; + void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); +#ifdef RSS + cpuset_t cpu_mask; + int cpu_id; +#endif + int err; + + /* Create DMA descriptor TAG */ + if ((err = -bus_dma_tag_create( + bus_get_dma_tag(mdev->pdev->dev.bsddev), + 1, /* any alignment */ + 0, /* no boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ + MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ + MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg */ + &sq->dma_tag))) + goto done; + + err = mlx5_alloc_map_uar(mdev, &sq->uar); + if (err) + goto err_free_dma_tag; + + err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, + &sq->wq_ctrl); + if (err) + goto err_unmap_free_uar; + + sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; + sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; + + err = mlx5e_alloc_sq_db(sq); + if (err) + goto err_sq_wq_destroy; + + sq->mkey_be = c->mkey_be; + sq->ifp = priv->ifp; + sq->priv = priv; + sq->tc = tc; + sq->max_inline = priv->params.tx_max_inline; + sq->min_inline_mode = priv->params.tx_min_inline_mode; + sq->vlan_inline_cap = MLX5_CAP_ETH(mdev, wqe_vlan_insert); + + /* check if we should allocate a second packet buffer */ + if (priv->params_ethtool.tx_bufring_disable == 0) { + sq->br = buf_ring_alloc(MLX5E_SQ_TX_QUEUE_SIZE, M_MLX5EN, + M_WAITOK, &sq->lock); + if (sq->br == NULL) { + if_printf(c->ifp, "%s: Failed allocating sq drbr buffer\n", + __func__); + err = -ENOMEM; + goto err_free_sq_db; + } + + sq->sq_tq = taskqueue_create_fast("mlx5e_que", M_WAITOK, + taskqueue_thread_enqueue, &sq->sq_tq); + if (sq->sq_tq == NULL) { + if_printf(c->ifp, "%s: Failed allocating taskqueue\n", + __func__); + err = -ENOMEM; + goto err_free_drbr; + } + + TASK_INIT(&sq->sq_task, 0, mlx5e_tx_que, sq); +#ifdef RSS + cpu_id = rss_getcpu(c->ix % rss_getnumbuckets()); + CPU_SETOF(cpu_id, &cpu_mask); + 
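When RSS is compiled in, the taskqueue thread for this send queue is pinned to the CPU backing the channel's RSS bucket, using c->ix modulo the bucket count as seen above. A toy sketch of that index mapping (the bucket-to-CPU table here is a made-up stand-in for what rss_getcpu() returns):

    /* Hypothetical bucket-to-CPU table; the kernel derives this from
     * rss_getcpu(bucket). */
    static const int bucket_cpu[] = { 0, 1, 2, 3 };

    static int
    channel_to_cpu(int channel_ix)
    {
        int nbuckets = (int)(sizeof(bucket_cpu) / sizeof(bucket_cpu[0]));

        return (bucket_cpu[channel_ix % nbuckets]);
    }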
taskqueue_start_threads_cpuset(&sq->sq_tq, 1, PI_NET, &cpu_mask, + "%s TX SQ%d.%d CPU%d", c->ifp->if_xname, c->ix, tc, cpu_id); +#else + taskqueue_start_threads(&sq->sq_tq, 1, PI_NET, + "%s TX SQ%d.%d", c->ifp->if_xname, c->ix, tc); +#endif + } + snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc); + mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM, + sq->stats.arg); + + return (0); + +err_free_drbr: + buf_ring_free(sq->br, M_MLX5EN); +err_free_sq_db: + mlx5e_free_sq_db(sq); +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + +err_unmap_free_uar: + mlx5_unmap_free_uar(mdev, &sq->uar); + +err_free_dma_tag: + bus_dma_tag_destroy(sq->dma_tag); +done: + return (err); +} + +static void +mlx5e_destroy_sq(struct mlx5e_sq *sq) +{ + /* destroy all sysctl nodes */ + sysctl_ctx_free(&sq->stats.ctx); + + mlx5e_free_sq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); + mlx5_unmap_free_uar(sq->priv->mdev, &sq->uar); + if (sq->sq_tq != NULL) { + taskqueue_drain(sq->sq_tq, &sq->sq_task); + taskqueue_free(sq->sq_tq); + } + if (sq->br != NULL) + buf_ring_free(sq->br, M_MLX5EN); +} + +int +mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param, + int tis_num) +{ + void *in; + void *sqc; + void *wq; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * sq->wq_ctrl.buf.npages; + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + memcpy(sqc, param->sqc, sizeof(param->sqc)); + + MLX5_SET(sqc, sqc, tis_num_0, tis_num); + MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, uar_page, sq->uar.index); + MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - + PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); + + mlx5_fill_page_array(&sq->wq_ctrl.buf, + (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn); + + kvfree(in); + + return (err); +} + +int +mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + MLX5_SET(modify_sq_in, in, sqn, sq->sqn); + MLX5_SET(modify_sq_in, in, sq_state, curr_state); + MLX5_SET(sqc, sqc, state, next_state); + + err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen); + + kvfree(in); + + return (err); +} + +void +mlx5e_disable_sq(struct mlx5e_sq *sq) +{ + + mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn); +} + +static int +mlx5e_open_sq(struct mlx5e_channel *c, + int tc, + struct mlx5e_sq_param *param, + struct mlx5e_sq *sq) +{ + int err; + + err = mlx5e_create_sq(c, tc, param, sq); + if (err) + return (err); + + err = mlx5e_enable_sq(sq, param, c->priv->tisn[tc]); + if (err) + goto err_destroy_sq; + + err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); + if (err) + goto err_disable_sq; + + WRITE_ONCE(sq->queue_state, MLX5E_SQ_READY); + + return (0); + +err_disable_sq: + mlx5e_disable_sq(sq); +err_destroy_sq: + mlx5e_destroy_sq(sq); + + return (err); +} + +static void +mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep) +{ + /* fill up remainder with NOPs */ + while 
(sq->cev_counter != 0) { + while (!mlx5e_sq_has_room_for(sq, 1)) { + if (can_sleep != 0) { + mtx_unlock(&sq->lock); + msleep(4); + mtx_lock(&sq->lock); + } else { + goto done; + } + } + /* send a single NOP */ + mlx5e_send_nop(sq, 1); + atomic_thread_fence_rel(); + } +done: + /* Check if we need to write the doorbell */ + if (likely(sq->doorbell.d64 != 0)) { + mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); + sq->doorbell.d64 = 0; + } +} + +void +mlx5e_sq_cev_timeout(void *arg) +{ + struct mlx5e_sq *sq = arg; + + mtx_assert(&sq->lock, MA_OWNED); + + /* check next state */ + switch (sq->cev_next_state) { + case MLX5E_CEV_STATE_SEND_NOPS: + /* fill TX ring with NOPs, if any */ + mlx5e_sq_send_nops_locked(sq, 0); + + /* check if completed */ + if (sq->cev_counter == 0) { + sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; + return; + } + break; + default: + /* send NOPs on next timeout */ + sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS; + break; + } + + /* restart timer */ + callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq); +} + +void +mlx5e_drain_sq(struct mlx5e_sq *sq) +{ + int error; + struct mlx5_core_dev *mdev= sq->priv->mdev; + + /* + * Check if already stopped. + * + * NOTE: The "stopped" variable is only written when both the + * priv's configuration lock and the SQ's lock is locked. It + * can therefore safely be read when only one of the two locks + * is locked. This function is always called when the priv's + * configuration lock is locked. + */ + if (sq->stopped != 0) + return; + + mtx_lock(&sq->lock); + + /* don't put more packets into the SQ */ + sq->stopped = 1; + + /* teardown event factor timer, if any */ + sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; + callout_stop(&sq->cev_callout); + + /* send dummy NOPs in order to flush the transmit ring */ + mlx5e_sq_send_nops_locked(sq, 1); + mtx_unlock(&sq->lock); + + /* make sure it is safe to free the callout */ + callout_drain(&sq->cev_callout); + + /* wait till SQ is empty or link is down */ + mtx_lock(&sq->lock); + while (sq->cc != sq->pc && + (sq->priv->media_status_last & IFM_ACTIVE) != 0 && + mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mtx_unlock(&sq->lock); + msleep(1); + sq->cq.mcq.comp(&sq->cq.mcq); + mtx_lock(&sq->lock); + } + mtx_unlock(&sq->lock); + + /* error out remaining requests */ + error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR); + if (error != 0) { + if_printf(sq->ifp, + "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error); + } + + /* wait till SQ is empty */ + mtx_lock(&sq->lock); + while (sq->cc != sq->pc && + mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mtx_unlock(&sq->lock); + msleep(1); + sq->cq.mcq.comp(&sq->cq.mcq); + mtx_lock(&sq->lock); + } + mtx_unlock(&sq->lock); +} + +static void +mlx5e_close_sq_wait(struct mlx5e_sq *sq) +{ + + mlx5e_drain_sq(sq); + mlx5e_disable_sq(sq); + mlx5e_destroy_sq(sq); +} + +static int +mlx5e_create_cq(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq, + mlx5e_cq_comp_t *comp, + int eq_ix) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + int eqn_not_used; + int irqn; + int err; + u32 i; + + param->wq.buf_numa_node = 0; + param->wq.db_numa_node = 0; + + err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, + &cq->wq_ctrl); + if (err) + return (err); + + mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn); + + mcq->cqe_sz = 64; + mcq->set_ci_db = cq->wq_ctrl.db.db; + mcq->arm_db = cq->wq_ctrl.db.db + 1; + *mcq->set_ci_db = 0; + *mcq->arm_db = 0; + 
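A few lines below, every CQE of the freshly created ring has its op_own byte set to 0xf1 so that all entries initially appear hardware-owned and the driver never consumes stale data. The exact op_own encoding is hardware-specific; the following userspace sketch only illustrates the general owner-bit convention, where validity depends on the consumer's wrap parity over the ring:

    #include <stdint.h>

    #define CQE_OWNER_MASK	0x1

    /* "ci" is a free-running consumer counter, "cq_size" a power of two. */
    static int
    cqe_is_sw_owned(uint8_t op_own, uint32_t ci, uint32_t cq_size)
    {
        uint8_t sw_parity = (ci / cq_size) & 1;

        return ((op_own & CQE_OWNER_MASK) == sw_parity);
    }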
mcq->vector = eq_ix; + mcq->comp = comp; + mcq->event = mlx5e_cq_error_event; + mcq->irqn = irqn; + mcq->uar = &priv->cq_uar; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + + cqe->op_own = 0xf1; + } + + cq->priv = priv; + + return (0); +} + +static void +mlx5e_destroy_cq(struct mlx5e_cq *cq) +{ + mlx5_wq_destroy(&cq->wq_ctrl); +} + +static int +mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix) +{ + struct mlx5_core_cq *mcq = &cq->mcq; + void *in; + void *cqc; + int inlen; + int irqn_not_used; + int eqn; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.buf.npages; + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + + memcpy(cqc, param->cqc, sizeof(param->cqc)); + + mlx5_fill_page_array(&cq->wq_ctrl.buf, + (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas)); + + mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used); + + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET(cqc, cqc, uar_page, mcq->uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - + PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen); + + kvfree(in); + + if (err) + return (err); + + mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock)); + + return (0); +} + +static void +mlx5e_disable_cq(struct mlx5e_cq *cq) +{ + + mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq); +} + +int +mlx5e_open_cq(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq, + mlx5e_cq_comp_t *comp, + int eq_ix) +{ + int err; + + err = mlx5e_create_cq(priv, param, cq, comp, eq_ix); + if (err) + return (err); + + err = mlx5e_enable_cq(cq, param, eq_ix); + if (err) + goto err_destroy_cq; + + return (0); + +err_destroy_cq: + mlx5e_destroy_cq(cq); + + return (err); +} + +void +mlx5e_close_cq(struct mlx5e_cq *cq) +{ + mlx5e_disable_cq(cq); + mlx5e_destroy_cq(cq); +} + +static int +mlx5e_open_tx_cqs(struct mlx5e_channel *c, + struct mlx5e_channel_param *cparam) +{ + int err; + int tc; + + for (tc = 0; tc < c->num_tc; tc++) { + /* open completion queue */ + err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq, + &mlx5e_tx_cq_comp, c->ix); + if (err) + goto err_close_tx_cqs; + } + return (0); + +err_close_tx_cqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_cq(&c->sq[tc].cq); + + return (err); +} + +static void +mlx5e_close_tx_cqs(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_cq(&c->sq[tc].cq); +} + +static int +mlx5e_open_sqs(struct mlx5e_channel *c, + struct mlx5e_channel_param *cparam) +{ + int err; + int tc; + + for (tc = 0; tc < c->num_tc; tc++) { + err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]); + if (err) + goto err_close_sqs; + } + + return (0); + +err_close_sqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_sq_wait(&c->sq[tc]); + + return (err); +} + +static void +mlx5e_close_sqs_wait(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_sq_wait(&c->sq[tc]); +} + +static void +mlx5e_chan_mtx_init(struct mlx5e_channel *c) +{ + int tc; + + mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF); + + callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0); + + for (tc = 0; tc < c->num_tc; tc++) { + struct mlx5e_sq *sq = c->sq + tc; + + mtx_init(&sq->lock, "mlx5tx", + MTX_NETWORK_LOCK " TX", MTX_DEF); + mtx_init(&sq->comp_lock, "mlx5comp", + 
MTX_NETWORK_LOCK " TX", MTX_DEF); + + callout_init_mtx(&sq->cev_callout, &sq->lock, 0); + + sq->cev_factor = c->priv->params_ethtool.tx_completion_fact; + + /* ensure the TX completion event factor is not zero */ + if (sq->cev_factor == 0) + sq->cev_factor = 1; + } +} + +static void +mlx5e_chan_mtx_destroy(struct mlx5e_channel *c) +{ + int tc; + + mtx_destroy(&c->rq.mtx); + + for (tc = 0; tc < c->num_tc; tc++) { + mtx_destroy(&c->sq[tc].lock); + mtx_destroy(&c->sq[tc].comp_lock); + } +} + +static int +mlx5e_open_channel(struct mlx5e_priv *priv, int ix, + struct mlx5e_channel_param *cparam, + struct mlx5e_channel *volatile *cp) +{ + struct mlx5e_channel *c; + int err; + + c = malloc(sizeof(*c), M_MLX5EN, M_WAITOK | M_ZERO); + c->priv = priv; + c->ix = ix; + c->cpu = 0; + c->ifp = priv->ifp; + c->mkey_be = cpu_to_be32(priv->mr.key); + c->num_tc = priv->num_tc; + + /* init mutexes */ + mlx5e_chan_mtx_init(c); + + /* open transmit completion queue */ + err = mlx5e_open_tx_cqs(c, cparam); + if (err) + goto err_free; + + /* open receive completion queue */ + err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq, + &mlx5e_rx_cq_comp, c->ix); + if (err) + goto err_close_tx_cqs; + + err = mlx5e_open_sqs(c, cparam); + if (err) + goto err_close_rx_cq; + + err = mlx5e_open_rq(c, &cparam->rq, &c->rq); + if (err) + goto err_close_sqs; + + /* store channel pointer */ + *cp = c; + + /* poll receive queue initially */ + c->rq.cq.mcq.comp(&c->rq.cq.mcq); + + return (0); + +err_close_sqs: + mlx5e_close_sqs_wait(c); + +err_close_rx_cq: + mlx5e_close_cq(&c->rq.cq); + +err_close_tx_cqs: + mlx5e_close_tx_cqs(c); + +err_free: + /* destroy mutexes */ + mlx5e_chan_mtx_destroy(c); + free(c, M_MLX5EN); + return (err); +} + +static void +mlx5e_close_channel(struct mlx5e_channel *volatile *pp) +{ + struct mlx5e_channel *c = *pp; + + /* check if channel is already closed */ + if (c == NULL) + return; + mlx5e_close_rq(&c->rq); +} + +static void +mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp) +{ + struct mlx5e_channel *c = *pp; + + /* check if channel is already closed */ + if (c == NULL) + return; + /* ensure channel pointer is no longer used */ + *pp = NULL; + + mlx5e_close_rq_wait(&c->rq); + mlx5e_close_sqs_wait(c); + mlx5e_close_cq(&c->rq.cq); + mlx5e_close_tx_cqs(c); + /* destroy mutexes */ + mlx5e_chan_mtx_destroy(c); + free(c, M_MLX5EN); +} + +static int +mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs) +{ + u32 r, n; + + r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz : + MLX5E_SW2MB_MTU(priv->ifp->if_mtu); + if (r > MJUM16BYTES) + return (-ENOMEM); + + if (r > MJUM9BYTES) + r = MJUM16BYTES; + else if (r > MJUMPAGESIZE) + r = MJUM9BYTES; + else if (r > MCLBYTES) + r = MJUMPAGESIZE; + else + r = MCLBYTES; + + /* + * n + 1 must be a power of two, because stride size must be. + * Stride size is 16 * (n + 1), as the first segment is + * control. 
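A worked example of the computation below: with MLX5E_MAX_RX_BYTES equal to MCLBYTES (2048) and a 9216-byte target, howmany() yields 5 data segments, and the loop advances to n = 7 so that n + 1 = 8 is a power of two. A self-contained userspace version of the same loop:

    #include <stdint.h>

    #define MAX_RX_BYTES	2048
    #define howmany(x, y)	(((x) + ((y) - 1)) / (y))
    #define powerof2(x)	((((x) - 1) & (x)) == 0)

    static uint32_t
    rx_nsegs(uint32_t size)
    {
        uint32_t n;

        for (n = howmany(size, MAX_RX_BYTES); !powerof2(n + 1); n++)
            ;
        return (n);	/* rx_nsegs(9216) == 7 */
    }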
+ */ + for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++) + ; + + *wqe_sz = r; + *nsegs = n; + return (0); +} + +static void +mlx5e_build_rq_param(struct mlx5e_priv *priv, + struct mlx5e_rq_param *param) +{ + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + u32 wqe_sz, nsegs; + + mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + + nsegs * sizeof(struct mlx5_wqe_data_seg))); + MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size); + MLX5_SET(wq, wq, pd, priv->pdn); + + param->wq.buf_numa_node = 0; + param->wq.db_numa_node = 0; + param->wq.linear = 1; +} + +static void +mlx5e_build_sq_param(struct mlx5e_priv *priv, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, pd, priv->pdn); + + param->wq.buf_numa_node = 0; + param->wq.db_numa_node = 0; + param->wq.linear = 1; +} + +static void +mlx5e_build_common_cq_param(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index); +} + +static void +mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + + /* + * TODO The sysctl to control on/off is a bool value for now, which means + * we only support CSUM, once HASH is implemnted we'll need to address that. + */ + if (priv->params.cqe_zipping_en) { + MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_CSUM); + MLX5_SET(cqc, cqc, cqe_compression_en, 1); + } + + MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size); + MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); + MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); + + switch (priv->params.rx_cq_moderation_mode) { + case 0: + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + break; + default: + if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); + else + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + break; + } + + mlx5e_build_common_cq_param(priv, param); +} + +static void +mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size); + MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec); + MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts); + + switch (priv->params.tx_cq_moderation_mode) { + case 0: + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + break; + default: + if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); + else + MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + break; + } + + mlx5e_build_common_cq_param(priv, param); +} + +static void +mlx5e_build_channel_param(struct mlx5e_priv *priv, + struct mlx5e_channel_param *cparam) +{ + memset(cparam, 0, sizeof(*cparam)); + + mlx5e_build_rq_param(priv, &cparam->rq); + mlx5e_build_sq_param(priv, &cparam->sq); + mlx5e_build_rx_cq_param(priv, &cparam->rx_cq); + mlx5e_build_tx_cq_param(priv, &cparam->tx_cq); +} + +static int 
+mlx5e_open_channels(struct mlx5e_priv *priv) +{ + struct mlx5e_channel_param cparam; + void *ptr; + int err; + int i; + int j; + + priv->channel = malloc(priv->params.num_channels * + sizeof(struct mlx5e_channel *), M_MLX5EN, M_WAITOK | M_ZERO); + + mlx5e_build_channel_param(priv, &cparam); + for (i = 0; i < priv->params.num_channels; i++) { + err = mlx5e_open_channel(priv, i, &cparam, &priv->channel[i]); + if (err) + goto err_close_channels; + } + + for (j = 0; j < priv->params.num_channels; j++) { + err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j]->rq); + if (err) + goto err_close_channels; + } + + return (0); + +err_close_channels: + for (i--; i >= 0; i--) { + mlx5e_close_channel(&priv->channel[i]); + mlx5e_close_channel_wait(&priv->channel[i]); + } + + /* remove "volatile" attribute from "channel" pointer */ + ptr = __DECONST(void *, priv->channel); + priv->channel = NULL; + + free(ptr, M_MLX5EN); + + return (err); +} + +static void +mlx5e_close_channels(struct mlx5e_priv *priv) +{ + void *ptr; + int i; + + if (priv->channel == NULL) + return; + + for (i = 0; i < priv->params.num_channels; i++) + mlx5e_close_channel(&priv->channel[i]); + for (i = 0; i < priv->params.num_channels; i++) + mlx5e_close_channel_wait(&priv->channel[i]); + + /* remove "volatile" attribute from "channel" pointer */ + ptr = __DECONST(void *, priv->channel); + priv->channel = NULL; + + free(ptr, M_MLX5EN); +} + +static int +mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq) +{ + + if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { + uint8_t cq_mode; + + switch (priv->params.tx_cq_moderation_mode) { + case 0: + cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + break; + default: + cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; + break; + } + + return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq, + priv->params.tx_cq_moderation_usec, + priv->params.tx_cq_moderation_pkts, + cq_mode)); + } + + return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq, + priv->params.tx_cq_moderation_usec, + priv->params.tx_cq_moderation_pkts)); +} + +static int +mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq) +{ + + if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { + uint8_t cq_mode; + int retval; + + switch (priv->params.rx_cq_moderation_mode) { + case 0: + cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + break; + default: + cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; + break; + } + + retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, + priv->params.rx_cq_moderation_usec, + priv->params.rx_cq_moderation_pkts, + cq_mode); + + return (retval); + } + + return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq, + priv->params.rx_cq_moderation_usec, + priv->params.rx_cq_moderation_pkts)); +} + +static int +mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) +{ + int err; + int i; + + if (c == NULL) + return (EINVAL); + + err = mlx5e_refresh_rq_params(priv, &c->rq); + if (err) + goto done; + + for (i = 0; i != c->num_tc; i++) { + err = mlx5e_refresh_sq_params(priv, &c->sq[i]); + if (err) + goto done; + } +done: + return (err); +} + +int +mlx5e_refresh_channel_params(struct mlx5e_priv *priv) +{ + int i; + + if (priv->channel == NULL) + return (EINVAL); + + for (i = 0; i < priv->params.num_channels; i++) { + int err; + + err = mlx5e_refresh_channel_params_sub(priv, priv->channel[i]); + if (err) + return (err); + } + return (0); +} + +static int +mlx5e_open_tis(struct mlx5e_priv *priv, int tc) +{ + struct 
mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, prio, tc); + MLX5_SET(tisc, tisc, transport_domain, priv->tdn); + + return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc])); +} + +static void +mlx5e_close_tis(struct mlx5e_priv *priv, int tc) +{ + mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]); +} + +static int +mlx5e_open_tises(struct mlx5e_priv *priv) +{ + int num_tc = priv->num_tc; + int err; + int tc; + + for (tc = 0; tc < num_tc; tc++) { + err = mlx5e_open_tis(priv, tc); + if (err) + goto err_close_tises; + } + + return (0); + +err_close_tises: + for (tc--; tc >= 0; tc--) + mlx5e_close_tis(priv, tc); + + return (err); +} + +static void +mlx5e_close_tises(struct mlx5e_priv *priv) +{ + int num_tc = priv->num_tc; + int tc; + + for (tc = 0; tc < num_tc; tc++) + mlx5e_close_tis(priv, tc); +} + +static int +mlx5e_open_rqt(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 *in; + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; + void *rqtc; + int inlen; + int err; + int sz; + int i; + + sz = 1 << priv->params.rx_hash_log_tbl_sz; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) { + int ix = i; +#ifdef RSS + ix = rss_get_indirection_to_bucket(ix); +#endif + /* ensure we don't overflow */ + ix %= priv->params.num_channels; + + /* apply receive side scaling stride, if any */ + ix -= ix % (int)priv->params.channels_rsss; + + MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix]->rq.rqn); + } + + MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); + + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (!err) + priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn); + + kvfree(in); + + return (err); +} + +static void +mlx5e_close_rqt(struct mlx5e_priv *priv) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, priv->rqtn); + + mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)); +} + +static void +mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt) +{ + void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + __be32 *hkey; + + MLX5_SET(tirc, tirc, transport_domain, priv->tdn); + +#define ROUGH_MAX_L2_L3_HDR_SZ 256 + +#define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP) + +#define MLX5_HASH_ALL (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_L4_SPORT |\ + MLX5_HASH_FIELD_SEL_L4_DPORT) + +#define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_IPSEC_SPI) + + if (priv->params.hw_lro_en) { + MLX5_SET(tirc, tirc, lro_enable_mask, + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); + MLX5_SET(tirc, tirc, lro_max_msg_sz, + (priv->params.lro_wqe_sz - + ROUGH_MAX_L2_L3_HDR_SZ) >> 8); + /* TODO: add the option to choose timer value dynamically */ + MLX5_SET(tirc, tirc, lro_timeout_period_usecs, + MLX5_CAP_ETH(priv->mdev, + lro_timer_supported_periods[2])); + } + + /* setup parameters for hashing TIR type, if any */ + switch (tt) { + case 
MLX5E_TT_ANY: + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, + priv->channel[0]->rq.rqn); + break; + default: + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + priv->rqtn); + MLX5_SET(tirc, tirc, rx_hash_fn, + MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); + hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); +#ifdef RSS + /* + * The FreeBSD RSS implementation does currently not + * support symmetric Toeplitz hashes: + */ + MLX5_SET(tirc, tirc, rx_hash_symmetric, 0); + rss_getkey((uint8_t *)hkey); +#else + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + hkey[0] = cpu_to_be32(0xD181C62C); + hkey[1] = cpu_to_be32(0xF7F4DB5B); + hkey[2] = cpu_to_be32(0x1983A2FC); + hkey[3] = cpu_to_be32(0x943E1ADB); + hkey[4] = cpu_to_be32(0xD9389E6B); + hkey[5] = cpu_to_be32(0xD1039C2C); + hkey[6] = cpu_to_be32(0xA74499AD); + hkey[7] = cpu_to_be32(0x593D56D9); + hkey[8] = cpu_to_be32(0xF3253C06); + hkey[9] = cpu_to_be32(0x2ADC1FFC); +#endif + break; + } + + switch (tt) { + case MLX5E_TT_IPV4_TCP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); +#ifdef RSS + if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) { + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + } else +#endif + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_ALL); + break; + + case MLX5E_TT_IPV6_TCP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); +#ifdef RSS + if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) { + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + } else +#endif + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_ALL); + break; + + case MLX5E_TT_IPV4_UDP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); +#ifdef RSS + if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) { + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + } else +#endif + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_ALL); + break; + + case MLX5E_TT_IPV6_UDP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); +#ifdef RSS + if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) { + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + } else +#endif + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_ALL); + break; + + case MLX5E_TT_IPV4_IPSEC_AH: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV6_IPSEC_AH: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV4_IPSEC_ESP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV6_IPSEC_ESP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + 
MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV4: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + break; + + case MLX5E_TT_IPV6: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + break; + + default: + break; + } +} + +static int +mlx5e_open_tir(struct mlx5e_priv *priv, int tt) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); + + mlx5e_build_tir_ctx(priv, tirc, tt); + + err = mlx5_core_create_tir(mdev, in, inlen, &priv->tirn[tt]); + + kvfree(in); + + return (err); +} + +static void +mlx5e_close_tir(struct mlx5e_priv *priv, int tt) +{ + mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt]); +} + +static int +mlx5e_open_tirs(struct mlx5e_priv *priv) +{ + int err; + int i; + + for (i = 0; i < MLX5E_NUM_TT; i++) { + err = mlx5e_open_tir(priv, i); + if (err) + goto err_close_tirs; + } + + return (0); + +err_close_tirs: + for (i--; i >= 0; i--) + mlx5e_close_tir(priv, i); + + return (err); +} + +static void +mlx5e_close_tirs(struct mlx5e_priv *priv) +{ + int i; + + for (i = 0; i < MLX5E_NUM_TT; i++) + mlx5e_close_tir(priv, i); +} + +/* + * SW MTU does not include headers, + * HW MTU includes all headers and checksums. + */ +static int +mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu) +{ + struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5_core_dev *mdev = priv->mdev; + int hw_mtu; + int err; + + hw_mtu = MLX5E_SW2HW_MTU(sw_mtu); + + err = mlx5_set_port_mtu(mdev, hw_mtu); + if (err) { + if_printf(ifp, "%s: mlx5_set_port_mtu failed setting %d, err=%d\n", + __func__, sw_mtu, err); + return (err); + } + + /* Update vport context MTU */ + err = mlx5_set_vport_mtu(mdev, hw_mtu); + if (err) { + if_printf(ifp, "%s: Failed updating vport context with MTU size, err=%d\n", + __func__, err); + } + + ifp->if_mtu = sw_mtu; + + err = mlx5_query_vport_mtu(mdev, &hw_mtu); + if (err || !hw_mtu) { + /* fallback to port oper mtu */ + err = mlx5_query_port_oper_mtu(mdev, &hw_mtu); + } + if (err) { + if_printf(ifp, "Query port MTU, after setting new " + "MTU value, failed\n"); + return (err); + } else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) { + err = -E2BIG, + if_printf(ifp, "Port MTU %d is smaller than " + "ifp mtu %d\n", hw_mtu, sw_mtu); + } else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) { + err = -EINVAL; + if_printf(ifp, "Port MTU %d is bigger than " + "ifp mtu %d\n", hw_mtu, sw_mtu); + } + priv->params_ethtool.hw_mtu = hw_mtu; + + return (err); +} + +int +mlx5e_open_locked(struct ifnet *ifp) +{ + struct mlx5e_priv *priv = ifp->if_softc; + int err; + u16 set_id; + + /* check if already opened */ + if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) + return (0); + +#ifdef RSS + if (rss_getnumbuckets() > priv->params.num_channels) { + if_printf(ifp, "NOTE: There are more RSS buckets(%u) than " + "channels(%u) available\n", rss_getnumbuckets(), + priv->params.num_channels); + } +#endif + err = mlx5e_open_tises(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_open_tises failed, %d\n", + __func__, err); + return (err); + } + err = mlx5_vport_alloc_q_counter(priv->mdev, + MLX5_INTERFACE_PROTOCOL_ETH, &set_id); + if (err) { + 
if_printf(priv->ifp, + "%s: mlx5_vport_alloc_q_counter failed: %d\n", + __func__, err); + goto err_close_tises; + } + /* store counter set ID */ + priv->counter_set_id = set_id; + + err = mlx5e_open_channels(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_open_channels failed, %d\n", + __func__, err); + goto err_dalloc_q_counter; + } + err = mlx5e_open_rqt(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_open_rqt failed, %d\n", + __func__, err); + goto err_close_channels; + } + err = mlx5e_open_tirs(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_open_tir failed, %d\n", + __func__, err); + goto err_close_rqls; + } + err = mlx5e_open_flow_table(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_open_flow_table failed, %d\n", + __func__, err); + goto err_close_tirs; + } + err = mlx5e_add_all_vlan_rules(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_add_all_vlan_rules failed, %d\n", + __func__, err); + goto err_close_flow_table; + } + set_bit(MLX5E_STATE_OPENED, &priv->state); + + mlx5e_update_carrier(priv); + mlx5e_set_rx_mode_core(priv); + + return (0); + +err_close_flow_table: + mlx5e_close_flow_table(priv); + +err_close_tirs: + mlx5e_close_tirs(priv); + +err_close_rqls: + mlx5e_close_rqt(priv); + +err_close_channels: + mlx5e_close_channels(priv); + +err_dalloc_q_counter: + mlx5_vport_dealloc_q_counter(priv->mdev, + MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); + +err_close_tises: + mlx5e_close_tises(priv); + + return (err); +} + +static void +mlx5e_open(void *arg) +{ + struct mlx5e_priv *priv = arg; + + PRIV_LOCK(priv); + if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP)) + if_printf(priv->ifp, + "%s: Setting port status to up failed\n", + __func__); + + mlx5e_open_locked(priv->ifp); + priv->ifp->if_drv_flags |= IFF_DRV_RUNNING; + PRIV_UNLOCK(priv); +} + +int +mlx5e_close_locked(struct ifnet *ifp) +{ + struct mlx5e_priv *priv = ifp->if_softc; + + /* check if already closed */ + if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) + return (0); + + clear_bit(MLX5E_STATE_OPENED, &priv->state); + + mlx5e_set_rx_mode_core(priv); + mlx5e_del_all_vlan_rules(priv); + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + mlx5e_close_flow_table(priv); + mlx5e_close_tirs(priv); + mlx5e_close_rqt(priv); + mlx5e_close_channels(priv); + mlx5_vport_dealloc_q_counter(priv->mdev, + MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); + mlx5e_close_tises(priv); + + return (0); +} + +#if (__FreeBSD_version >= 1100000) +static uint64_t +mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt) +{ + struct mlx5e_priv *priv = ifp->if_softc; + u64 retval; + + /* PRIV_LOCK(priv); XXX not allowed */ + switch (cnt) { + case IFCOUNTER_IPACKETS: + retval = priv->stats.vport.rx_packets; + break; + case IFCOUNTER_IERRORS: + retval = priv->stats.vport.rx_error_packets + + priv->stats.pport.alignment_err + + priv->stats.pport.check_seq_err + + priv->stats.pport.crc_align_errors + + priv->stats.pport.in_range_len_errors + + priv->stats.pport.jabbers + + priv->stats.pport.out_of_range_len + + priv->stats.pport.oversize_pkts + + priv->stats.pport.symbol_err + + priv->stats.pport.too_long_errors + + priv->stats.pport.undersize_pkts + + priv->stats.pport.unsupported_op_rx; + break; + case IFCOUNTER_IQDROPS: + retval = priv->stats.vport.rx_out_of_buffer + + priv->stats.pport.drop_events; + break; + case IFCOUNTER_OPACKETS: + retval = priv->stats.vport.tx_packets; + break; + case IFCOUNTER_OERRORS: + retval = priv->stats.vport.tx_error_packets; + break; + case IFCOUNTER_IBYTES: + retval = priv->stats.vport.rx_bytes; + break; + 
case IFCOUNTER_OBYTES: + retval = priv->stats.vport.tx_bytes; + break; + case IFCOUNTER_IMCASTS: + retval = priv->stats.vport.rx_multicast_packets; + break; + case IFCOUNTER_OMCASTS: + retval = priv->stats.vport.tx_multicast_packets; + break; + case IFCOUNTER_OQDROPS: + retval = priv->stats.vport.tx_queue_dropped; + break; + case IFCOUNTER_COLLISIONS: + retval = priv->stats.pport.collisions; + break; + default: + retval = if_get_counter_default(ifp, cnt); + break; + } + /* PRIV_UNLOCK(priv); XXX not allowed */ + return (retval); +} +#endif + +static void +mlx5e_set_rx_mode(struct ifnet *ifp) +{ + struct mlx5e_priv *priv = ifp->if_softc; + + queue_work(priv->wq, &priv->set_rx_mode_work); +} + +static int +mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct mlx5e_priv *priv; + struct ifreq *ifr; + struct ifi2creq i2c; + int error = 0; + int mask = 0; + int size_read = 0; + int module_status; + int module_num; + int max_mtu; + uint8_t read_addr; + + priv = ifp->if_softc; + + /* check if detaching */ + if (priv == NULL || priv->gone != 0) + return (ENXIO); + + switch (command) { + case SIOCSIFMTU: + ifr = (struct ifreq *)data; + + PRIV_LOCK(priv); + mlx5_query_port_max_mtu(priv->mdev, &max_mtu); + + if (ifr->ifr_mtu >= MLX5E_MTU_MIN && + ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) { + int was_opened; + + was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + if (was_opened) + mlx5e_close_locked(ifp); + + /* set new MTU */ + mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu); + + if (was_opened) + mlx5e_open_locked(ifp); + } else { + error = EINVAL; + if_printf(ifp, "Invalid MTU value. Min val: %d, Max val: %d\n", + MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu)); + } + PRIV_UNLOCK(priv); + break; + case SIOCSIFFLAGS: + if ((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + mlx5e_set_rx_mode(ifp); + break; + } + PRIV_LOCK(priv); + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) + mlx5e_open_locked(ifp); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + mlx5_set_port_status(priv->mdev, MLX5_PORT_UP); + } + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + mlx5_set_port_status(priv->mdev, + MLX5_PORT_DOWN); + if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) + mlx5e_close_locked(ifp); + mlx5e_update_carrier(priv); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + } + } + PRIV_UNLOCK(priv); + break; + case SIOCADDMULTI: + case SIOCDELMULTI: + mlx5e_set_rx_mode(ifp); + break; + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: + ifr = (struct ifreq *)data; + error = ifmedia_ioctl(ifp, ifr, &priv->media, command); + break; + case SIOCSIFCAP: + ifr = (struct ifreq *)data; + PRIV_LOCK(priv); + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + + if (mask & IFCAP_TXCSUM) { + ifp->if_capenable ^= IFCAP_TXCSUM; + ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); + + if (IFCAP_TSO4 & ifp->if_capenable && + !(IFCAP_TXCSUM & ifp->if_capenable)) { + ifp->if_capenable &= ~IFCAP_TSO4; + ifp->if_hwassist &= ~CSUM_IP_TSO; + if_printf(ifp, + "tso4 disabled due to -txcsum.\n"); + } + } + if (mask & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; + ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + + if (IFCAP_TSO6 & ifp->if_capenable && + !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + ifp->if_capenable &= ~IFCAP_TSO6; + ifp->if_hwassist &= ~CSUM_IP6_TSO; + if_printf(ifp, + "tso6 disabled due to -txcsum6.\n"); + } + } + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= 
IFCAP_RXCSUM; + if (mask & IFCAP_RXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + if (mask & IFCAP_TSO4) { + if (!(IFCAP_TSO4 & ifp->if_capenable) && + !(IFCAP_TXCSUM & ifp->if_capenable)) { + if_printf(ifp, "enable txcsum first.\n"); + error = EAGAIN; + goto out; + } + ifp->if_capenable ^= IFCAP_TSO4; + ifp->if_hwassist ^= CSUM_IP_TSO; + } + if (mask & IFCAP_TSO6) { + if (!(IFCAP_TSO6 & ifp->if_capenable) && + !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if_printf(ifp, "enable txcsum6 first.\n"); + error = EAGAIN; + goto out; + } + ifp->if_capenable ^= IFCAP_TSO6; + ifp->if_hwassist ^= CSUM_IP6_TSO; + } + if (mask & IFCAP_VLAN_HWFILTER) { + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) + mlx5e_disable_vlan_filter(priv); + else + mlx5e_enable_vlan_filter(priv); + + ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; + } + if (mask & IFCAP_VLAN_HWTAGGING) + ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + if (mask & IFCAP_WOL_MAGIC) + ifp->if_capenable ^= IFCAP_WOL_MAGIC; + + VLAN_CAPABILITIES(ifp); + /* turn off LRO means also turn of HW LRO - if it's on */ + if (mask & IFCAP_LRO) { + int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + bool need_restart = false; + + ifp->if_capenable ^= IFCAP_LRO; + if (!(ifp->if_capenable & IFCAP_LRO)) { + if (priv->params.hw_lro_en) { + priv->params.hw_lro_en = false; + need_restart = true; + /* Not sure this is the correct way */ + priv->params_ethtool.hw_lro = priv->params.hw_lro_en; + } + } + if (was_opened && need_restart) { + mlx5e_close_locked(ifp); + mlx5e_open_locked(ifp); + } + } + if (mask & IFCAP_HWRXTSTMP) { + ifp->if_capenable ^= IFCAP_HWRXTSTMP; + if (ifp->if_capenable & IFCAP_HWRXTSTMP) { + if (priv->clbr_done == 0) + mlx5e_reset_calibration_callout(priv); + } else { + callout_drain(&priv->tstmp_clbr); + priv->clbr_done = 0; + } + } +out: + PRIV_UNLOCK(priv); + break; + + case SIOCGI2C: + ifr = (struct ifreq *)data; + + /* + * Copy from the user-space address ifr_data to the + * kernel-space address i2c + */ + error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); + if (error) + break; + + if (i2c.len > sizeof(i2c.data)) { + error = EINVAL; + break; + } + + PRIV_LOCK(priv); + /* Get module_num which is required for the query_eeprom */ + error = mlx5_query_module_num(priv->mdev, &module_num); + if (error) { + if_printf(ifp, "Query module num failed, eeprom " + "reading is not supported\n"); + error = EINVAL; + goto err_i2c; + } + /* Check if module is present before doing an access */ + module_status = mlx5_query_module_status(priv->mdev, module_num); + if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED && + module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) { + error = EINVAL; + goto err_i2c; + } + /* + * Currently 0XA0 and 0xA2 are the only addresses permitted. 
+ * The internal conversion is as follows: + */ + if (i2c.dev_addr == 0xA0) + read_addr = MLX5E_I2C_ADDR_LOW; + else if (i2c.dev_addr == 0xA2) + read_addr = MLX5E_I2C_ADDR_HIGH; + else { + if_printf(ifp, "Query eeprom failed, " + "Invalid Address: %X\n", i2c.dev_addr); + error = EINVAL; + goto err_i2c; + } + error = mlx5_query_eeprom(priv->mdev, + read_addr, MLX5E_EEPROM_LOW_PAGE, + (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num, + (uint32_t *)i2c.data, &size_read); + if (error) { + if_printf(ifp, "Query eeprom failed, eeprom " + "reading is not supported\n"); + error = EINVAL; + goto err_i2c; + } + + if (i2c.len > MLX5_EEPROM_MAX_BYTES) { + error = mlx5_query_eeprom(priv->mdev, + read_addr, MLX5E_EEPROM_LOW_PAGE, + (uint32_t)(i2c.offset + size_read), + (uint32_t)(i2c.len - size_read), module_num, + (uint32_t *)(i2c.data + size_read), &size_read); + } + if (error) { + if_printf(ifp, "Query eeprom failed, eeprom " + "reading is not supported\n"); + error = EINVAL; + goto err_i2c; + } + + error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); +err_i2c: + PRIV_UNLOCK(priv); + break; + + default: + error = ether_ioctl(ifp, command, data); + break; + } + return (error); +} + +static int +mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) +{ + /* + * TODO: uncoment once FW really sets all these bits if + * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap || + * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap || + * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return + * -ENOTSUPP; + */ + + /* TODO: add more must-to-have features */ + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return (-ENODEV); + + return (0); +} + +static u16 +mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev) +{ + int bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; + + return bf_buf_size - + sizeof(struct mlx5e_tx_wqe) + + 2 /*sizeof(mlx5e_tx_wqe.inline_hdr_start)*/; +} + +static void +mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev, + struct mlx5e_priv *priv, + int num_comp_vectors) +{ + /* + * TODO: Consider link speed for setting "log_sq_size", + * "log_rq_size" and "cq_moderation_xxx": + */ + priv->params.log_sq_size = + MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + priv->params.log_rq_size = + MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; + priv->params.rx_cq_moderation_usec = + MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE : + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; + priv->params.rx_cq_moderation_mode = + MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0; + priv->params.rx_cq_moderation_pkts = + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; + priv->params.tx_cq_moderation_usec = + MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; + priv->params.tx_cq_moderation_pkts = + MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; + priv->params.min_rx_wqes = + MLX5E_PARAMS_DEFAULT_MIN_RX_WQES; + priv->params.rx_hash_log_tbl_sz = + (order_base_2(num_comp_vectors) > + MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ? + order_base_2(num_comp_vectors) : + MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ; + priv->params.num_tc = 1; + priv->params.default_vlan_prio = 0; + priv->counter_set_id = -1; + priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev); + mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode); + + /* + * hw lro is currently defaulted to off. 
when it won't anymore we + * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)" + */ + priv->params.hw_lro_en = false; + priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; + + priv->params.cqe_zipping_en = !!MLX5_CAP_GEN(mdev, cqe_compression); + + priv->mdev = mdev; + priv->params.num_channels = num_comp_vectors; + priv->params.channels_rsss = 1; + priv->order_base_2_num_channels = order_base_2(num_comp_vectors); + priv->queue_mapping_channel_mask = + roundup_pow_of_two(num_comp_vectors) - 1; + priv->num_tc = priv->params.num_tc; + priv->default_vlan_prio = priv->params.default_vlan_prio; + + INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work); + INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); + INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); +} + +static int +mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, + struct mlx5_core_mr *mkey) +{ + struct ifnet *ifp = priv->ifp; + struct mlx5_core_dev *mdev = priv->mdev; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = mlx5_vzalloc(inlen); + if (in == NULL) { + if_printf(ifp, "%s: failed to allocate inbox\n", __func__); + return (-ENOMEM); + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + if (err) + if_printf(ifp, "%s: mlx5_core_create_mkey failed, %d\n", + __func__, err); + + kvfree(in); + return (err); +} + +static const char *mlx5e_vport_stats_desc[] = { + MLX5E_VPORT_STATS(MLX5E_STATS_DESC) +}; + +static const char *mlx5e_pport_stats_desc[] = { + MLX5E_PPORT_STATS(MLX5E_STATS_DESC) +}; + +static void +mlx5e_priv_mtx_init(struct mlx5e_priv *priv) +{ + mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF); + sx_init(&priv->state_lock, "mlx5state"); + callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0); + MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock); +} + +static void +mlx5e_priv_mtx_destroy(struct mlx5e_priv *priv) +{ + mtx_destroy(&priv->async_events_mtx); + sx_destroy(&priv->state_lock); +} + +static int +sysctl_firmware(SYSCTL_HANDLER_ARGS) +{ + /* + * %d.%d%.d the string format. + * fw_rev_{maj,min,sub} return u16, 2^16 = 65536. + * We need at most 5 chars to store that. + * It also has: two "." and NULL at the end, which means we need 18 + * (5*3 + 3) chars at most. 
+ */ + char fw[18]; + struct mlx5e_priv *priv = arg1; + int error; + + snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev), + fw_rev_sub(priv->mdev)); + error = sysctl_handle_string(oidp, fw, sizeof(fw), req); + return (error); +} + +static void +mlx5e_disable_tx_dma(struct mlx5e_channel *ch) +{ + int i; + + for (i = 0; i < ch->num_tc; i++) + mlx5e_drain_sq(&ch->sq[i]); +} + +static void +mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq) +{ + + sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP); + sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8); + mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); + sq->doorbell.d64 = 0; +} + +void +mlx5e_resume_sq(struct mlx5e_sq *sq) +{ + int err; + + /* check if already enabled */ + if (sq->stopped == 0) + return; + + err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR, + MLX5_SQC_STATE_RST); + if (err != 0) { + if_printf(sq->ifp, + "mlx5e_modify_sq() from ERR to RST failed: %d\n", err); + } + + sq->cc = 0; + sq->pc = 0; + + /* reset doorbell prior to moving from RST to RDY */ + mlx5e_reset_sq_doorbell_record(sq); + + err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, + MLX5_SQC_STATE_RDY); + if (err != 0) { + if_printf(sq->ifp, + "mlx5e_modify_sq() from RST to RDY failed: %d\n", err); + } + + mtx_lock(&sq->lock); + sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; + sq->stopped = 0; + mtx_unlock(&sq->lock); + +} + +static void +mlx5e_enable_tx_dma(struct mlx5e_channel *ch) +{ + int i; + + for (i = 0; i < ch->num_tc; i++) + mlx5e_resume_sq(&ch->sq[i]); +} + +static void +mlx5e_disable_rx_dma(struct mlx5e_channel *ch) +{ + struct mlx5e_rq *rq = &ch->rq; + int err; + + mtx_lock(&rq->mtx); + rq->enabled = 0; + callout_stop(&rq->watchdog); + mtx_unlock(&rq->mtx); + + callout_drain(&rq->watchdog); + + err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); + if (err != 0) { + if_printf(rq->ifp, + "mlx5e_modify_rq() from RDY to RST failed: %d\n", err); + } + + while (!mlx5_wq_ll_is_empty(&rq->wq)) { + msleep(1); + rq->cq.mcq.comp(&rq->cq.mcq); + } + + /* + * Transitioning into RST state will allow the FW to track less ERR state queues, + * thus reducing the recv queue flushing time + */ + err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST); + if (err != 0) { + if_printf(rq->ifp, + "mlx5e_modify_rq() from ERR to RST failed: %d\n", err); + } +} + +static void +mlx5e_enable_rx_dma(struct mlx5e_channel *ch) +{ + struct mlx5e_rq *rq = &ch->rq; + int err; + + rq->wq.wqe_ctr = 0; + mlx5_wq_ll_update_db_record(&rq->wq); + err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err != 0) { + if_printf(rq->ifp, + "mlx5e_modify_rq() from RST to RDY failed: %d\n", err); + } + + rq->enabled = 1; + + rq->cq.mcq.comp(&rq->cq.mcq); +} + +void +mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value) +{ + int i; + + if (priv->channel == NULL) + return; + + for (i = 0; i < priv->params.num_channels; i++) { + + if (!priv->channel[i]) + continue; + + if (value) + mlx5e_disable_tx_dma(priv->channel[i]); + else + mlx5e_enable_tx_dma(priv->channel[i]); + } +} + +void +mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value) +{ + int i; + + if (priv->channel == NULL) + return; + + for (i = 0; i < priv->params.num_channels; i++) { + + if (!priv->channel[i]) + continue; + + if (value) + mlx5e_disable_rx_dma(priv->channel[i]); + else + mlx5e_enable_rx_dma(priv->channel[i]); + } +} + +u8 +mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev) +{ + u8 min_inline_mode; + + min_inline_mode = MLX5_INLINE_MODE_L2; + 
mlx5_query_min_inline(mdev, &min_inline_mode); + if (min_inline_mode == MLX5_INLINE_MODE_NONE && + !MLX5_CAP_ETH(mdev, wqe_vlan_insert)) + min_inline_mode = MLX5_INLINE_MODE_L2; + + return (min_inline_mode); +} + +static void +mlx5e_add_hw_stats(struct mlx5e_priv *priv) +{ + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), + OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD, priv, 0, + sysctl_firmware, "A", "HCA firmware version"); + + SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), + OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0, + "Board ID"); +} + +static int +mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + uint32_t tx_pfc; + uint32_t value; + int error; + + PRIV_LOCK(priv); + + tx_pfc = priv->params.tx_priority_flow_control; + + /* get current value */ + value = (tx_pfc >> arg2) & 1; + + error = sysctl_handle_32(oidp, &value, 0, req); + + /* range check value */ + if (value != 0) + priv->params.tx_priority_flow_control |= (1 << arg2); + else + priv->params.tx_priority_flow_control &= ~(1 << arg2); + + /* check if update is required */ + if (error == 0 && priv->gone == 0 && + tx_pfc != priv->params.tx_priority_flow_control) { + error = -mlx5e_set_port_pfc(priv); + /* restore previous value */ + if (error != 0) + priv->params.tx_priority_flow_control= tx_pfc; + } + PRIV_UNLOCK(priv); + + return (error); +} + +static int +mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_priv *priv = arg1; + uint32_t rx_pfc; + uint32_t value; + int error; + + PRIV_LOCK(priv); + + rx_pfc = priv->params.rx_priority_flow_control; + + /* get current value */ + value = (rx_pfc >> arg2) & 1; + + error = sysctl_handle_32(oidp, &value, 0, req); + + /* range check value */ + if (value != 0) + priv->params.rx_priority_flow_control |= (1 << arg2); + else + priv->params.rx_priority_flow_control &= ~(1 << arg2); + + /* check if update is required */ + if (error == 0 && priv->gone == 0 && + rx_pfc != priv->params.rx_priority_flow_control) { + error = -mlx5e_set_port_pfc(priv); + /* restore previous value */ + if (error != 0) + priv->params.rx_priority_flow_control= rx_pfc; + } + PRIV_UNLOCK(priv); + + return (error); +} + +static void +mlx5e_setup_pauseframes(struct mlx5e_priv *priv) +{ + unsigned int x; + char path[96]; + int error; + + /* enable pauseframes by default */ + priv->params.tx_pauseframe_control = 1; + priv->params.rx_pauseframe_control = 1; + + /* disable ports flow control, PFC, by default */ + priv->params.tx_priority_flow_control = 0; + priv->params.rx_priority_flow_control = 0; + +#if (__FreeBSD_version < 1100000) + /* compute path for sysctl */ + snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control", + device_get_unit(priv->mdev->pdev->dev.bsddev)); + + /* try to fetch tunable, if any */ + TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control); + + /* compute path for sysctl */ + snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control", + device_get_unit(priv->mdev->pdev->dev.bsddev)); + + /* try to fetch tunable, if any */ + TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control); + + for (x = 0; x != 8; x++) { + + /* compute path for sysctl */ + snprintf(path, sizeof(path), "dev.mce.%d.tx_priority_flow_control_%u", + device_get_unit(priv->mdev->pdev->dev.bsddev), x); + + /* try to fetch tunable, if any */ + if (TUNABLE_INT_FETCH(path, &value) == 0 && value != 0) + priv->params.tx_priority_flow_control |= 1 << x; + + /* compute path for 
sysctl */ + snprintf(path, sizeof(path), "dev.mce.%d.rx_priority_flow_control_%u", + device_get_unit(priv->mdev->pdev->dev.bsddev), x); + + /* try to fetch tunable, if any */ + if (TUNABLE_INT_FETCH(path, &value) == 0 && value != 0) + priv->params.rx_priority_flow_control |= 1 << x; + } +#endif + + /* register pauseframe SYSCTLs */ + SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN, + &priv->params.tx_pauseframe_control, 0, + "Set to enable TX pause frames. Clear to disable."); + + SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN, + &priv->params.rx_pauseframe_control, 0, + "Set to enable RX pause frames. Clear to disable."); + + /* register priority_flow control, PFC, SYSCTLs */ + for (x = 0; x != 8; x++) { + snprintf(path, sizeof(path), "tx_priority_flow_control_%u", x); + + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + OID_AUTO, path, CTLTYPE_UINT | CTLFLAG_RWTUN | + CTLFLAG_MPSAFE, priv, x, &mlx5e_sysctl_tx_priority_flow_control, "IU", + "Set to enable TX ports flow control frames for given priority. Clear to disable."); + + snprintf(path, sizeof(path), "rx_priority_flow_control_%u", x); + + SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + OID_AUTO, path, CTLTYPE_UINT | CTLFLAG_RWTUN | + CTLFLAG_MPSAFE, priv, x, &mlx5e_sysctl_rx_priority_flow_control, "IU", + "Set to enable RX ports flow control frames for given priority. Clear to disable."); + } + + PRIV_LOCK(priv); + + /* range check */ + priv->params.tx_pauseframe_control = + priv->params.tx_pauseframe_control ? 1 : 0; + priv->params.rx_pauseframe_control = + priv->params.rx_pauseframe_control ? 1 : 0; + + /* update firmware */ + error = mlx5e_set_port_pause_and_pfc(priv); + if (error == -EINVAL) { + if_printf(priv->ifp, + "Global pauseframes must be disabled before enabling PFC.\n"); + priv->params.rx_priority_flow_control = 0; + priv->params.tx_priority_flow_control = 0; + + /* update firmware */ + (void) mlx5e_set_port_pause_and_pfc(priv); + } + PRIV_UNLOCK(priv); +} + +static void * +mlx5e_create_ifp(struct mlx5_core_dev *mdev) +{ + struct ifnet *ifp; + struct mlx5e_priv *priv; + u8 dev_addr[ETHER_ADDR_LEN] __aligned(4); + struct sysctl_oid_list *child; + int ncv = mdev->priv.eq_table.num_comp_vectors; + char unit[16]; + int err; + int i; + u32 eth_proto_cap; + + if (mlx5e_check_required_hca_cap(mdev)) { + mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n"); + return (NULL); + } + priv = malloc(sizeof(*priv), M_MLX5EN, M_WAITOK | M_ZERO); + mlx5e_priv_mtx_init(priv); + + ifp = priv->ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + mlx5_core_err(mdev, "if_alloc() failed\n"); + goto err_free_priv; + } + ifp->if_softc = priv; + if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev)); + ifp->if_mtu = ETHERMTU; + ifp->if_init = mlx5e_open; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_ioctl = mlx5e_ioctl; + ifp->if_transmit = mlx5e_xmit; + ifp->if_qflush = if_qflush; +#if (__FreeBSD_version >= 1100000) + ifp->if_get_counter = mlx5e_get_counter; +#endif + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* + * Set driver features + */ + ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; + ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; + ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; + ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; + 
ifp->if_capabilities |= IFCAP_LRO; + ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; + ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP; +#ifdef RATELIMIT + ifp->if_capabilities |= IFCAP_TXRTLMT; + ifp->if_snd_tag_alloc = mlx5e_rl_snd_tag_alloc; + ifp->if_snd_tag_free = mlx5e_rl_snd_tag_free; + ifp->if_snd_tag_modify = mlx5e_rl_snd_tag_modify; + ifp->if_snd_tag_query = mlx5e_rl_snd_tag_query; +#endif + + /* set TSO limits so that we don't have to drop TX packets */ + ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */; + ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE; + + ifp->if_capenable = ifp->if_capabilities; + ifp->if_hwassist = 0; + if (ifp->if_capenable & IFCAP_TSO) + ifp->if_hwassist |= CSUM_TSO; + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + + sysctl_ctx_init(&priv->sysctl_ctx_channel_debug); + + /* ifnet sysctl tree */ + sysctl_ctx_init(&priv->sysctl_ctx); + priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), + OID_AUTO, ifp->if_dname, CTLFLAG_RD, 0, "MLX5 ethernet - interface name"); + if (priv->sysctl_ifnet == NULL) { + mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); + goto err_free_sysctl; + } + snprintf(unit, sizeof(unit), "%d", ifp->if_dunit); + priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), + OID_AUTO, unit, CTLFLAG_RD, 0, "MLX5 ethernet - interface unit"); + if (priv->sysctl_ifnet == NULL) { + mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); + goto err_free_sysctl; + } + + /* HW sysctl tree */ + child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev)); + priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child, + OID_AUTO, "hw", CTLFLAG_RD, 0, "MLX5 ethernet dev hw"); + if (priv->sysctl_hw == NULL) { + mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); + goto err_free_sysctl; + } + mlx5e_build_ifp_priv(mdev, priv, ncv); + + snprintf(unit, sizeof(unit), "mce%u_wq", + device_get_unit(mdev->pdev->dev.bsddev)); + priv->wq = alloc_workqueue(unit, 0, 1); + if (priv->wq == NULL) { + if_printf(ifp, "%s: alloc_workqueue failed\n", __func__); + goto err_free_sysctl; + } + + err = mlx5_alloc_map_uar(mdev, &priv->cq_uar); + if (err) { + if_printf(ifp, "%s: mlx5_alloc_map_uar failed, %d\n", + __func__, err); + goto err_free_wq; + } + err = mlx5_core_alloc_pd(mdev, &priv->pdn); + if (err) { + if_printf(ifp, "%s: mlx5_core_alloc_pd failed, %d\n", + __func__, err); + goto err_unmap_free_uar; + } + err = mlx5_alloc_transport_domain(mdev, &priv->tdn); + if (err) { + if_printf(ifp, "%s: mlx5_alloc_transport_domain failed, %d\n", + __func__, err); + goto err_dealloc_pd; + } + err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); + if (err) { + if_printf(ifp, "%s: mlx5e_create_mkey failed, %d\n", + __func__, err); + goto err_dealloc_transport_domain; + } + mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr); + + /* check if we should generate a random MAC address */ + if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 && + is_zero_ether_addr(dev_addr)) { + random_ether_addr(dev_addr); + if_printf(ifp, "Assigned random MAC address\n"); + } +#ifdef RATELIMIT + err = mlx5e_rl_init(priv); + if (err) { + if_printf(ifp, "%s: mlx5e_rl_init failed, %d\n", + __func__, err); + goto err_create_mkey; + } +#endif + + /* set default 
MTU */
+	mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
+
+	/* Set desc */
+	device_set_desc(mdev->pdev->dev.bsddev, mlx5e_version);
+
+	/* Set default media status */
+	priv->media_status_last = IFM_AVALID;
+	priv->media_active_last = IFM_ETHER | IFM_AUTO |
+	    IFM_ETH_RXPAUSE | IFM_FDX;
+
+	/* setup default pauseframes configuration */
+	mlx5e_setup_pauseframes(priv);
+
+	err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN);
+	if (err) {
+		eth_proto_cap = 0;
+		if_printf(ifp, "%s: Query port media capability failed, %d\n",
+		    __func__, err);
+	}
+
+	/* Setup supported medias */
+	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
+	    mlx5e_media_change, mlx5e_media_status);
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (mlx5e_mode_table[i].baudrate == 0)
+			continue;
+		if (MLX5E_PROT_MASK(i) & eth_proto_cap) {
+			ifmedia_add(&priv->media,
+			    mlx5e_mode_table[i].subtype |
+			    IFM_ETHER, 0, NULL);
+			ifmedia_add(&priv->media,
+			    mlx5e_mode_table[i].subtype |
+			    IFM_ETHER | IFM_FDX |
+			    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
+		}
+	}
+
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
+	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
+
+	/* Set autoselect by default */
+	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
+	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
+	ether_ifattach(ifp, dev_addr);
+
+	/* Register for VLAN events */
+	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+	    mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
+	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+	    mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
+
+	/* Link is down by default */
+	if_link_state_change(ifp, LINK_STATE_DOWN);
+
+	mlx5e_enable_async_events(priv);
+
+	mlx5e_add_hw_stats(priv);
+
+	mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
+	    "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM,
+	    priv->stats.vport.arg);
+
+	mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
+	    "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM,
+	    priv->stats.pport.arg);
+
+	mlx5e_create_ethtool(priv);
+
+	mtx_lock(&priv->async_events_mtx);
+	mlx5e_update_stats(priv);
+	mtx_unlock(&priv->async_events_mtx);
+
+	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
+	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
+	    &priv->clbr_done, 0,
+	    "RX timestamps calibration state");
+	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
+	mlx5e_reset_calibration_callout(priv);
+
+	return (priv);
+
+#ifdef RATELIMIT
+err_create_mkey:
+	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
+#endif
+err_dealloc_transport_domain:
+	mlx5_dealloc_transport_domain(mdev, priv->tdn);
+
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(mdev, priv->pdn);
+
+err_unmap_free_uar:
+	mlx5_unmap_free_uar(mdev, &priv->cq_uar);
+
+err_free_wq:
+	destroy_workqueue(priv->wq);
+
+err_free_sysctl:
+	sysctl_ctx_free(&priv->sysctl_ctx);
+	sysctl_ctx_free(&priv->sysctl_ctx_channel_debug);
+
+	if_free(ifp);
+
+err_free_priv:
+	mlx5e_priv_mtx_destroy(priv);
+	free(priv, M_MLX5EN);
+	return (NULL);
+}
+
+static void
+mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+	struct ifnet *ifp = priv->ifp;
+
+	/* don't allow more IOCTLs */
+	priv->gone = 1;
+
+	/*
+	 * Clear the device description to avoid use after free,
+	 * because the bsddev is not destroyed when this module is
+	 * unloaded:
+	 */
+	device_set_desc(mdev->pdev->dev.bsddev, NULL);
+
+	/* XXX wait a 
bit to allow IOCTL handlers to complete */ + pause("W", hz); + +#ifdef RATELIMIT + /* + * The kernel can have reference(s) via the m_snd_tag's into + * the ratelimit channels, and these must go away before + * detaching: + */ + while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) { + if_printf(priv->ifp, "Waiting for all ratelimit connections " + "to terminate\n"); + pause("W", hz); + } +#endif + /* stop watchdog timer */ + callout_drain(&priv->watchdog); + + callout_drain(&priv->tstmp_clbr); + + if (priv->vlan_attach != NULL) + EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); + if (priv->vlan_detach != NULL) + EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); + + /* make sure device gets closed */ + PRIV_LOCK(priv); + mlx5e_close_locked(ifp); + PRIV_UNLOCK(priv); + + /* unregister device */ + ifmedia_removeall(&priv->media); + ether_ifdetach(ifp); + if_free(ifp); + +#ifdef RATELIMIT + mlx5e_rl_cleanup(priv); +#endif + /* destroy all remaining sysctl nodes */ + if (priv->sysctl_debug) { + sysctl_ctx_free(&priv->sysctl_ctx_channel_debug); + sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); + } + sysctl_ctx_free(&priv->stats.vport.ctx); + sysctl_ctx_free(&priv->stats.pport.ctx); + sysctl_ctx_free(&priv->sysctl_ctx); + + mlx5_core_destroy_mkey(priv->mdev, &priv->mr); + mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); + mlx5_core_dealloc_pd(priv->mdev, priv->pdn); + mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); + mlx5e_disable_async_events(priv); + destroy_workqueue(priv->wq); + mlx5e_priv_mtx_destroy(priv); + free(priv, M_MLX5EN); +} + +static void * +mlx5e_get_ifp(void *vpriv) +{ + struct mlx5e_priv *priv = vpriv; + + return (priv->ifp); +} + +static struct mlx5_interface mlx5e_interface = { + .add = mlx5e_create_ifp, + .remove = mlx5e_destroy_ifp, + .event = mlx5e_async_event, + .protocol = MLX5_INTERFACE_PROTOCOL_ETH, + .get_dev = mlx5e_get_ifp, +}; + +void +mlx5e_init(void) +{ + mlx5_register_interface(&mlx5e_interface); +} + +void +mlx5e_cleanup(void) +{ + mlx5_unregister_interface(&mlx5e_interface); +} + +module_init_order(mlx5e_init, SI_ORDER_THIRD); +module_exit_order(mlx5e_cleanup, SI_ORDER_THIRD); + +#if (__FreeBSD_version >= 1100000) +MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1); +#endif +MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1); +MODULE_VERSION(mlx5en, 1); diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c new file mode 100644 index 000000000000..4dac7377cef1 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c @@ -0,0 +1,1542 @@ +/*- + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "en.h"
+
+#ifdef RATELIMIT
+
+static int mlx5e_rl_open_workers(struct mlx5e_priv *);
+static void mlx5e_rl_close_workers(struct mlx5e_priv *);
+static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
+static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
+    struct sysctl_oid *, const char *name, const char *desc);
+static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
+    struct sysctl_oid *node, const char *name, const char *desc);
+static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
+static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+
+static void
+mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+
+	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
+	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+	MLX5_SET(wq, wq, pd, rl->priv->pdn);
+
+	param->wq.buf_numa_node = 0;
+	param->wq.db_numa_node = 0;
+	param->wq.linear = 1;
+}
+
+static void
+mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
+	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
+	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
+
+	switch (rl->param.tx_coalesce_mode) {
+	case 0:
+		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+		break;
+	default:
+		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
+			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+		else
+			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+		break;
+	}
+}
+
+static void
+mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_rl_channel_param *cparam)
+{
+	memset(cparam, 0, sizeof(*cparam));
+
+	mlx5e_rl_build_sq_param(rl, &cparam->sq);
+	mlx5e_rl_build_cq_param(rl, &cparam->cq);
+}
+
+static int
+mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
+    struct mlx5e_sq_param *param, int ix)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *sqc = param->sqc;
+	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
+	int err;
+
+	/* Create DMA descriptor TAG */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,				/* any alignment */
+	    0,				/* no boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
+	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
+	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
+	    0,				/* flags */
+	    NULL, NULL,			/* lockfunc, lockfuncarg */
+	    &sq->dma_tag)))
+		goto done;
+
+	/* use shared UAR */
+	sq->uar = priv->rl.sq_uar;
+
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
+	    
&sq->wq_ctrl); + if (err) + goto err_free_dma_tag; + + sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; + /* + * The sq->bf_buf_size variable is intentionally left zero so + * that the doorbell writes will occur at the same memory + * location. + */ + + err = mlx5e_alloc_sq_db(sq); + if (err) + goto err_sq_wq_destroy; + + sq->mkey_be = cpu_to_be32(priv->mr.key); + sq->ifp = priv->ifp; + sq->priv = priv; + sq->max_inline = priv->params.tx_max_inline; + sq->min_inline_mode = priv->params.tx_min_inline_mode; + sq->vlan_inline_cap = MLX5_CAP_ETH(mdev, wqe_vlan_insert); + + return (0); + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); +err_free_dma_tag: + bus_dma_tag_destroy(sq->dma_tag); +done: + return (err); +} + +static void +mlx5e_rl_destroy_sq(struct mlx5e_sq *sq) +{ + + mlx5e_free_sq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static int +mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, + struct mlx5e_sq_param *param, int ix) +{ + int err; + + err = mlx5e_rl_create_sq(priv, sq, param, ix); + if (err) + return (err); + + err = mlx5e_enable_sq(sq, param, priv->rl.tisn); + if (err) + goto err_destroy_sq; + + err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); + if (err) + goto err_disable_sq; + + return (0); + +err_disable_sq: + mlx5e_disable_sq(sq); +err_destroy_sq: + mlx5e_rl_destroy_sq(sq); + + return (err); +} + +static void +mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq) +{ + mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF); + mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF); + + callout_init_mtx(&sq->cev_callout, &sq->lock, 0); + + sq->cev_factor = priv->rl.param.tx_completion_fact; + + /* ensure the TX completion event factor is not zero */ + if (sq->cev_factor == 0) + sq->cev_factor = 1; +} + +static int +mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix, + struct mlx5e_rl_channel_param *cparam, + struct mlx5e_sq *volatile *ppsq) +{ + struct mlx5e_priv *priv = rlw->priv; + struct mlx5e_sq *sq; + int err; + + sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO); + + /* init mutexes */ + mlx5e_rl_chan_mtx_init(priv, sq); + + /* open TX completion queue */ + err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq, + &mlx5e_tx_cq_comp, eq_ix); + if (err) + goto err_free; + + err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix); + if (err) + goto err_close_tx_cq; + + /* store TX channel pointer */ + *ppsq = sq; + + /* poll TX queue initially */ + sq->cq.mcq.comp(&sq->cq.mcq); + + return (0); + +err_close_tx_cq: + mlx5e_close_cq(&sq->cq); + +err_free: + /* destroy mutexes */ + mtx_destroy(&sq->lock); + mtx_destroy(&sq->comp_lock); + free(sq, M_MLX5EN); + atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL); + return (err); +} + +static void +mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq) +{ + struct mlx5e_sq *sq = *ppsq; + + /* check if channel is already closed */ + if (sq == NULL) + return; + /* ensure channel pointer is no longer used */ + *ppsq = NULL; + + /* teardown and destroy SQ */ + mlx5e_drain_sq(sq); + mlx5e_disable_sq(sq); + mlx5e_rl_destroy_sq(sq); + + /* close CQ */ + mlx5e_close_cq(&sq->cq); + + /* destroy mutexes */ + mtx_destroy(&sq->lock); + mtx_destroy(&sq->comp_lock); + + free(sq, M_MLX5EN); +} + +static void +mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl) +{ + /* + * Limit the maximum distance between completion events to + * half of the currently set TX queue size. 
+ * + * The maximum number of queue entries a single IP packet can + * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. + * + * The worst case max value is then given as below: + */ + uint64_t max = rl->param.tx_queue_size / + (2 * MLX5_SEND_WQE_MAX_WQEBBS); + + /* + * Update the maximum completion factor value in case the + * tx_queue_size field changed. Ensure we don't overflow + * 16-bits. + */ + if (max < 1) + max = 1; + else if (max > 65535) + max = 65535; + rl->param.tx_completion_fact_max = max; + + /* + * Verify that the current TX completion factor is within the + * given limits: + */ + if (rl->param.tx_completion_fact < 1) + rl->param.tx_completion_fact = 1; + else if (rl->param.tx_completion_fact > max) + rl->param.tx_completion_fact = max; +} + +static int +mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index) +{ + struct mlx5e_priv *priv = sq->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (in == NULL) + return (-ENOMEM); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + MLX5_SET(modify_sq_in, in, sqn, sq->sqn); + MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY); + MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); + + err = mlx5_core_modify_sq(mdev, in, inlen); + + kvfree(in); + + return (err); +} + +/* + * This function will search the configured rate limit table for the + * best match to avoid that a single socket based application can + * allocate all the available hardware rates. If the user selected + * rate deviates too much from the closes rate available in the rate + * limit table, unlimited rate will be selected. + */ +static uint64_t +mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) +{ + uint64_t distance = -1ULL; + uint64_t diff; + uint64_t retval = 0; /* unlimited */ + uint64_t x; + + /* search for closest rate */ + for (x = 0; x != rl->param.tx_rates_def; x++) { + uint64_t rate = rl->rate_limit_table[x]; + if (rate == 0) + continue; + + if (rate > user_rate) + diff = rate - user_rate; + else + diff = user_rate - rate; + + /* check if distance is smaller than previous rate */ + if (diff < distance) { + distance = diff; + retval = rate; + } + } + + /* range check for multiplication below */ + if (user_rate > rl->param.tx_limit_max) + user_rate = rl->param.tx_limit_max; + + /* fallback to unlimited, if rate deviates too much */ + if (distance > howmany(user_rate * + rl->param.tx_allowed_deviation, 1000ULL)) + retval = 0; + + return (retval); +} + +/* + * This function sets the requested rate for a rate limit channel, in + * bits per second. The requested rate will be filtered through the + * find best rate function above. 
+ */ +static int +mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, + struct mlx5e_rl_channel *channel, uint64_t rate) +{ + struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; + struct mlx5e_sq *sq; + uint64_t temp; + uint16_t index; + uint16_t burst; + int error; + + if (rate != 0) { + MLX5E_RL_WORKER_UNLOCK(rlw); + + MLX5E_RL_RLOCK(rl); + + /* get current burst size in bytes */ + temp = rl->param.tx_burst_size * + MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu); + + /* limit burst size to 64K currently */ + if (temp > 65535) + temp = 65535; + burst = temp; + + /* find best rate */ + rate = mlx5e_rl_find_best_rate_locked(rl, rate); + + MLX5E_RL_RUNLOCK(rl); + + if (rate == 0) { + /* rate doesn't exist, fallback to unlimited */ + error = EINVAL; + index = 0; + rate = 0; + atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); + } else { + /* get a reference on the new rate */ + error = -mlx5_rl_add_rate(rlw->priv->mdev, + howmany(rate, 1000), burst, &index); + + if (error != 0) { + /* adding rate failed, fallback to unlimited */ + index = 0; + rate = 0; + atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); + } + } + MLX5E_RL_WORKER_LOCK(rlw); + } else { + index = 0; + burst = 0; /* default */ + } + + /* atomically swap rates */ + temp = channel->last_rate; + channel->last_rate = rate; + rate = temp; + + /* atomically swap burst size */ + temp = channel->last_burst; + channel->last_burst = burst; + burst = temp; + + MLX5E_RL_WORKER_UNLOCK(rlw); + /* put reference on the old rate, if any */ + if (rate != 0) { + mlx5_rl_remove_rate(rlw->priv->mdev, + howmany(rate, 1000), burst); + } + + /* set new rate */ + sq = channel->sq; + if (sq != NULL) { + error = mlx5e_rl_modify_sq(sq, index); + if (error != 0) + atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); + } else + error = 0; + MLX5E_RL_WORKER_LOCK(rlw); + + return (-error); +} + +static void +mlx5e_rl_worker(void *arg) +{ + struct thread *td; + struct mlx5e_rl_worker *rlw = arg; + struct mlx5e_rl_channel *channel; + struct mlx5e_priv *priv; + unsigned ix; + uint64_t x; + int error; + + /* set thread priority */ + td = curthread; + + thread_lock(td); + sched_prio(td, PI_SWI(SWI_NET)); + thread_unlock(td); + + priv = rlw->priv; + + /* compute completion vector */ + ix = (rlw - priv->rl.workers) % + priv->mdev->priv.eq_table.num_comp_vectors; + + /* TODO bind to CPU */ + + /* open all the SQs */ + MLX5E_RL_WORKER_LOCK(rlw); + for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { + struct mlx5e_rl_channel *channel = rlw->channels + x; + +#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS) + if (channel->state == MLX5E_RL_ST_FREE) + continue; +#endif + MLX5E_RL_WORKER_UNLOCK(rlw); + + MLX5E_RL_RLOCK(&priv->rl); + error = mlx5e_rl_open_channel(rlw, ix, + &priv->rl.chan_param, &channel->sq); + MLX5E_RL_RUNLOCK(&priv->rl); + + MLX5E_RL_WORKER_LOCK(rlw); + if (error != 0) { + if_printf(priv->ifp, + "mlx5e_rl_open_channel failed: %d\n", error); + break; + } + mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate); + } + while (1) { + if (STAILQ_FIRST(&rlw->process_head) == NULL) { + /* check if we are tearing down */ + if (rlw->worker_done != 0) + break; + cv_wait(&rlw->cv, &rlw->mtx); + } + /* check if we are tearing down */ + if (rlw->worker_done != 0) + break; + channel = STAILQ_FIRST(&rlw->process_head); + if (channel != NULL) { + STAILQ_REMOVE_HEAD(&rlw->process_head, entry); + + switch (channel->state) { + case MLX5E_RL_ST_MODIFY: + channel->state = MLX5E_RL_ST_USED; + 
MLX5E_RL_WORKER_UNLOCK(rlw); + + /* create channel by demand */ + if (channel->sq == NULL) { + MLX5E_RL_RLOCK(&priv->rl); + error = mlx5e_rl_open_channel(rlw, ix, + &priv->rl.chan_param, &channel->sq); + MLX5E_RL_RUNLOCK(&priv->rl); + + if (error != 0) { + if_printf(priv->ifp, + "mlx5e_rl_open_channel failed: %d\n", error); + } else { + atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL); + } + } else { + mlx5e_resume_sq(channel->sq); + } + + MLX5E_RL_WORKER_LOCK(rlw); + /* convert from bytes/s to bits/s and set new rate */ + error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, + channel->new_rate * 8ULL); + if (error != 0) { + if_printf(priv->ifp, + "mlx5e_rlw_channel_set_rate_locked failed: %d\n", + error); + } + break; + + case MLX5E_RL_ST_DESTROY: + error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); + if (error != 0) { + if_printf(priv->ifp, + "mlx5e_rlw_channel_set_rate_locked failed: %d\n", + error); + } + if (channel->sq != NULL) { + /* + * Make sure all packets are + * transmitted before SQ is + * returned to free list: + */ + MLX5E_RL_WORKER_UNLOCK(rlw); + mlx5e_drain_sq(channel->sq); + MLX5E_RL_WORKER_LOCK(rlw); + } + /* put the channel back into the free list */ + STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry); + channel->state = MLX5E_RL_ST_FREE; + atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL); + break; + default: + /* NOP */ + break; + } + } + } + + /* close all the SQs */ + for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { + struct mlx5e_rl_channel *channel = rlw->channels + x; + + /* update the initial rate */ + channel->init_rate = channel->last_rate; + + /* make sure we free up the rate resource */ + mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); + + if (channel->sq != NULL) { + MLX5E_RL_WORKER_UNLOCK(rlw); + mlx5e_rl_close_channel(&channel->sq); + atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL); + MLX5E_RL_WORKER_LOCK(rlw); + } + } + + rlw->worker_done = 0; + cv_broadcast(&rlw->cv); + MLX5E_RL_WORKER_UNLOCK(rlw); + + kthread_exit(); +} + +static int +mlx5e_rl_open_tis(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, prio, 0); + MLX5_SET(tisc, tisc, transport_domain, priv->tdn); + + return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn)); +} + +static void +mlx5e_rl_close_tis(struct mlx5e_priv *priv) +{ + mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn); +} + +static void +mlx5e_rl_set_default_params(struct mlx5e_rl_params *param, + struct mlx5_core_dev *mdev) +{ + /* ratelimit workers */ + param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors; + param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS; + + /* range check */ + if (param->tx_worker_threads_def == 0 || + param->tx_worker_threads_def > param->tx_worker_threads_max) + param->tx_worker_threads_def = param->tx_worker_threads_max; + + /* ratelimit channels */ + param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS / + param->tx_worker_threads_def; + param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS; + + /* range check */ + if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER) + param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER; + + /* set default burst size */ + param->tx_burst_size = 4; /* MTUs */ + + /* + * Set maximum burst size + * + * The burst size is multiplied by the MTU and clamped to the + * range 0 ... 
65535 bytes inclusivly before fed into the + * firmware. + * + * NOTE: If the burst size or MTU is changed only ratelimit + * connections made after the change will use the new burst + * size. + */ + param->tx_burst_size_max = 255; + + /* get firmware rate limits in 1000bit/s and convert them to bit/s */ + param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL; + param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL; + + /* ratelimit table size */ + param->tx_rates_max = mdev->priv.rl_table.max_size; + + /* range check */ + if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES) + param->tx_rates_max = MLX5E_RL_MAX_TX_RATES; + + /* set default number of rates */ + param->tx_rates_def = param->tx_rates_max; + + /* set maximum allowed rate deviation */ + if (param->tx_limit_max != 0) { + /* + * Make sure the deviation multiplication doesn't + * overflow unsigned 64-bit: + */ + param->tx_allowed_deviation_max = -1ULL / + param->tx_limit_max; + } + /* set default rate deviation */ + param->tx_allowed_deviation = 50; /* 5.0% */ + + /* channel parameters */ + param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT; + param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT; + param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT; + param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT; +} + +static const char *mlx5e_rl_params_desc[] = { + MLX5E_RL_PARAMS(MLX5E_STATS_DESC) +}; + +static const char *mlx5e_rl_table_params_desc[] = { + MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC) +}; + +static const char *mlx5e_rl_stats_desc[] = { + MLX5E_RL_STATS(MLX5E_STATS_DESC) +}; + +int +mlx5e_rl_init(struct mlx5e_priv *priv) +{ + struct mlx5e_rl_priv_data *rl = &priv->rl; + struct sysctl_oid *node; + struct sysctl_oid *stats; + char buf[64]; + uint64_t i; + uint64_t j; + int error; + + /* check if there is support for packet pacing */ + if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) + return (0); + + rl->priv = priv; + + sysctl_ctx_init(&rl->ctx); + + sx_init(&rl->rl_sxlock, "ratelimit-sxlock"); + + /* allocate shared UAR for SQs */ + error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar); + if (error) + goto done; + + /* open own TIS domain for ratelimit SQs */ + error = mlx5e_rl_open_tis(priv); + if (error) + goto err_uar; + + /* setup default value for parameters */ + mlx5e_rl_set_default_params(&rl->param, priv->mdev); + + /* update the completion factor */ + mlx5e_rl_sync_tx_completion_fact(rl); + + /* create root node */ + node = SYSCTL_ADD_NODE(&rl->ctx, + SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, + "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support"); + + if (node != NULL) { + /* create SYSCTLs */ + for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) { + mlx5e_rl_sysctl_add_u64_oid(rl, + MLX5E_RL_PARAMS_INDEX(arg[i]), + node, mlx5e_rl_params_desc[2 * i], + mlx5e_rl_params_desc[2 * i + 1]); + } + + stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node), + OID_AUTO, "stats", CTLFLAG_RD, NULL, + "Rate limiting statistics"); + if (stats != NULL) { + /* create SYSCTLs */ + for (i = 0; i != MLX5E_RL_STATS_NUM; i++) { + mlx5e_rl_sysctl_add_stats_u64_oid(rl, i, + stats, mlx5e_rl_stats_desc[2 * i], + mlx5e_rl_stats_desc[2 * i + 1]); + } + } + } + + /* allocate workers array */ + rl->workers = malloc(sizeof(rl->workers[0]) * + rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO); + + /* allocate rate limit array */ + rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) * + 
rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO); + + if (node != NULL) { + /* create more SYSCTls */ + SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, + "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table, + "A", "Show table of all configured TX rates"); + + /* try to fetch rate table from kernel environment */ + for (i = 0; i != rl->param.tx_rates_def; i++) { + /* compute path for tunable */ + snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d", + device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i); + if (TUNABLE_QUAD_FETCH(buf, &j)) + mlx5e_rl_tx_limit_add(rl, j); + } + + /* setup rate table sysctls */ + for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) { + mlx5e_rl_sysctl_add_u64_oid(rl, + MLX5E_RL_PARAMS_INDEX(table_arg[i]), + node, mlx5e_rl_table_params_desc[2 * i], + mlx5e_rl_table_params_desc[2 * i + 1]); + } + } + + for (j = 0; j < rl->param.tx_worker_threads_def; j++) { + struct mlx5e_rl_worker *rlw = rl->workers + j; + + rlw->priv = priv; + + cv_init(&rlw->cv, "mlx5-worker-cv"); + mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF); + STAILQ_INIT(&rlw->index_list_head); + STAILQ_INIT(&rlw->process_head); + + rlw->channels = malloc(sizeof(rlw->channels[0]) * + rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO); + + MLX5E_RL_WORKER_LOCK(rlw); + for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) { + struct mlx5e_rl_channel *channel = rlw->channels + i; + channel->worker = rlw; + channel->m_snd_tag.ifp = priv->ifp; + STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry); + } + MLX5E_RL_WORKER_UNLOCK(rlw); + } + + PRIV_LOCK(priv); + error = mlx5e_rl_open_workers(priv); + PRIV_UNLOCK(priv); + + if (error != 0) { + if_printf(priv->ifp, + "mlx5e_rl_open_workers failed: %d\n", error); + } + + return (0); + +err_uar: + mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar); +done: + sysctl_ctx_free(&rl->ctx); + sx_destroy(&rl->rl_sxlock); + return (error); +} + +static int +mlx5e_rl_open_workers(struct mlx5e_priv *priv) +{ + struct mlx5e_rl_priv_data *rl = &priv->rl; + struct thread *rl_thread = NULL; + struct proc *rl_proc = NULL; + uint64_t j; + int error; + + if (priv->gone || rl->opened) + return (-EINVAL); + + MLX5E_RL_WLOCK(rl); + /* compute channel parameters once */ + mlx5e_rl_build_channel_param(rl, &rl->chan_param); + MLX5E_RL_WUNLOCK(rl); + + for (j = 0; j < rl->param.tx_worker_threads_def; j++) { + struct mlx5e_rl_worker *rlw = rl->workers + j; + + /* start worker thread */ + error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread, + RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j); + if (error != 0) { + if_printf(rl->priv->ifp, + "kproc_kthread_add failed: %d\n", error); + rlw->worker_done = 1; + } + } + + rl->opened = 1; + + return (0); +} + +static void +mlx5e_rl_close_workers(struct mlx5e_priv *priv) +{ + struct mlx5e_rl_priv_data *rl = &priv->rl; + uint64_t y; + + if (rl->opened == 0) + return; + + /* tear down worker threads simultaneously */ + for (y = 0; y < rl->param.tx_worker_threads_def; y++) { + struct mlx5e_rl_worker *rlw = rl->workers + y; + + /* tear down worker before freeing SQs */ + MLX5E_RL_WORKER_LOCK(rlw); + if (rlw->worker_done == 0) { + rlw->worker_done = 1; + cv_broadcast(&rlw->cv); + } else { + /* XXX thread not started */ + rlw->worker_done = 0; + } + MLX5E_RL_WORKER_UNLOCK(rlw); + } + + /* wait for worker threads to exit */ + for (y = 0; y < rl->param.tx_worker_threads_def; y++) { + struct mlx5e_rl_worker *rlw = 
rl->workers + y; + + /* tear down worker before freeing SQs */ + MLX5E_RL_WORKER_LOCK(rlw); + while (rlw->worker_done != 0) + cv_wait(&rlw->cv, &rlw->mtx); + MLX5E_RL_WORKER_UNLOCK(rlw); + } + + rl->opened = 0; +} + +static void +mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl) +{ + unsigned x; + + MLX5E_RL_WLOCK(rl); + for (x = 0; x != rl->param.tx_rates_def; x++) + rl->rate_limit_table[x] = 0; + MLX5E_RL_WUNLOCK(rl); +} + +void +mlx5e_rl_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_rl_priv_data *rl = &priv->rl; + uint64_t y; + + /* check if there is support for packet pacing */ + if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) + return; + + /* TODO check if there is support for packet pacing */ + + sysctl_ctx_free(&rl->ctx); + + PRIV_LOCK(priv); + mlx5e_rl_close_workers(priv); + PRIV_UNLOCK(priv); + + mlx5e_rl_reset_rates(rl); + + /* free shared UAR for SQs */ + mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar); + + /* close TIS domain */ + mlx5e_rl_close_tis(priv); + + for (y = 0; y < rl->param.tx_worker_threads_def; y++) { + struct mlx5e_rl_worker *rlw = rl->workers + y; + + cv_destroy(&rlw->cv); + mtx_destroy(&rlw->mtx); + free(rlw->channels, M_MLX5EN); + } + free(rl->rate_limit_table, M_MLX5EN); + free(rl->workers, M_MLX5EN); + sx_destroy(&rl->rl_sxlock); +} + +static void +mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw, + struct mlx5e_rl_channel *channel) +{ + STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry); + cv_broadcast(&rlw->cv); +} + +static void +mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) +{ + if (channel == NULL) + return; + + MLX5E_RL_WORKER_LOCK(rlw); + switch (channel->state) { + case MLX5E_RL_ST_MODIFY: + channel->state = MLX5E_RL_ST_DESTROY; + break; + case MLX5E_RL_ST_USED: + channel->state = MLX5E_RL_ST_DESTROY; + mlx5e_rlw_queue_channel_locked(rlw, channel); + break; + default: + break; + } + MLX5E_RL_WORKER_UNLOCK(rlw); +} + +static int +mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) +{ + + MLX5E_RL_WORKER_LOCK(rlw); + channel->new_rate = rate; + switch (channel->state) { + case MLX5E_RL_ST_USED: + channel->state = MLX5E_RL_ST_MODIFY; + mlx5e_rlw_queue_channel_locked(rlw, channel); + break; + default: + break; + } + MLX5E_RL_WORKER_UNLOCK(rlw); + + return (0); +} + +static int +mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t *prate) +{ + int retval; + + MLX5E_RL_WORKER_LOCK(rlw); + switch (channel->state) { + case MLX5E_RL_ST_USED: + *prate = channel->last_rate; + retval = 0; + break; + case MLX5E_RL_ST_MODIFY: + retval = EBUSY; + break; + default: + retval = EINVAL; + break; + } + MLX5E_RL_WORKER_UNLOCK(rlw); + + return (retval); +} + +static int +mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw, + struct mlx5e_rl_channel **pchannel) +{ + struct mlx5e_rl_channel *channel; + int retval = ENOMEM; + + MLX5E_RL_WORKER_LOCK(rlw); + /* Check for available channel in free list */ + if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) { + retval = 0; + /* Remove head index from available list */ + STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry); + channel->state = MLX5E_RL_ST_USED; + atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL); + } else { + atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL); + } + MLX5E_RL_WORKER_UNLOCK(rlw); + + *pchannel = channel; +#ifdef RATELIMIT_DEBUG + if_printf(rlw->priv->ifp, "Channel pointer for rate limit 
connection is %p\n", channel); +#endif + return (retval); +} + +int +mlx5e_rl_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + struct mlx5e_rl_channel *channel; + struct mlx5e_rl_worker *rlw; + struct mlx5e_priv *priv; + int error; + + priv = ifp->if_softc; + + /* check if there is support for packet pacing or if device is going away */ + if (!MLX5_CAP_GEN(priv->mdev, qos) || + !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone || + params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) + return (EOPNOTSUPP); + + /* compute worker thread this TCP connection belongs to */ + rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) % + priv->rl.param.tx_worker_threads_def); + + error = mlx5e_find_available_tx_ring_index(rlw, &channel); + if (error != 0) + goto done; + + error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate); + if (error != 0) { + mlx5e_rl_free(rlw, channel); + goto done; + } + + /* store pointer to mbuf tag */ + *ppmt = &channel->m_snd_tag; +done: + return (error); +} + + +int +mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) +{ + struct mlx5e_rl_channel *channel = + container_of(pmt, struct mlx5e_rl_channel, m_snd_tag); + + return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate)); +} + +int +mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) +{ + struct mlx5e_rl_channel *channel = + container_of(pmt, struct mlx5e_rl_channel, m_snd_tag); + + return (mlx5e_rl_query(channel->worker, channel, ¶ms->rate_limit.max_rate)); +} + +void +mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt) +{ + struct mlx5e_rl_channel *channel = + container_of(pmt, struct mlx5e_rl_channel, m_snd_tag); + + mlx5e_rl_free(channel->worker, channel); +} + +static int +mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_rl_priv_data *rl = arg1; + struct mlx5e_priv *priv = rl->priv; + struct sbuf sbuf; + unsigned x; + int error; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + + PRIV_LOCK(priv); + + sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req); + + sbuf_printf(&sbuf, + "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n" + "\t" "--------------------------------------------\n"); + + MLX5E_RL_RLOCK(rl); + for (x = 0; x != rl->param.tx_rates_def; x++) { + if (rl->rate_limit_table[x] == 0) + continue; + + sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n", + x, (unsigned)rl->param.tx_burst_size, + (long long)rl->rate_limit_table[x]); + } + MLX5E_RL_RUNLOCK(rl); + + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + + PRIV_UNLOCK(priv); + + return (error); +} + +static int +mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl) +{ + uint64_t x; + uint64_t y; + + MLX5E_RL_WLOCK(rl); + /* compute channel parameters once */ + mlx5e_rl_build_channel_param(rl, &rl->chan_param); + MLX5E_RL_WUNLOCK(rl); + + for (y = 0; y != rl->param.tx_worker_threads_def; y++) { + struct mlx5e_rl_worker *rlw = rl->workers + y; + + for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { + struct mlx5e_rl_channel *channel; + struct mlx5e_sq *sq; + + channel = rlw->channels + x; + sq = channel->sq; + + if (sq == NULL) + continue; + + if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) { + mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq, + rl->param.tx_coalesce_usecs, + rl->param.tx_coalesce_pkts, + rl->param.tx_coalesce_mode); + } else { + 
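+ /*
+ * The CQ period mode cannot be changed on the fly;
+ * only the moderation timer and packet count are updated.
+ */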
mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq, + rl->param.tx_coalesce_usecs, + rl->param.tx_coalesce_pkts); + } + } + } + return (0); +} + +static int +mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value) +{ + unsigned x; + int error; + + if (value < 1000 || + mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0) + return (EINVAL); + + MLX5E_RL_WLOCK(rl); + error = ENOMEM; + + /* check if rate already exists */ + for (x = 0; x != rl->param.tx_rates_def; x++) { + if (rl->rate_limit_table[x] != value) + continue; + error = EEXIST; + break; + } + + /* check if there is a free rate entry */ + if (x == rl->param.tx_rates_def) { + for (x = 0; x != rl->param.tx_rates_def; x++) { + if (rl->rate_limit_table[x] != 0) + continue; + rl->rate_limit_table[x] = value; + error = 0; + break; + } + } + MLX5E_RL_WUNLOCK(rl); + + return (error); +} + +static int +mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value) +{ + unsigned x; + int error; + + if (value == 0) + return (EINVAL); + + MLX5E_RL_WLOCK(rl); + + /* check if rate already exists */ + for (x = 0; x != rl->param.tx_rates_def; x++) { + if (rl->rate_limit_table[x] != value) + continue; + /* free up rate */ + rl->rate_limit_table[x] = 0; + break; + } + + /* check if there is a free rate entry */ + if (x == rl->param.tx_rates_def) + error = ENOENT; + else + error = 0; + MLX5E_RL_WUNLOCK(rl); + + return (error); +} + +static int +mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + struct mlx5e_rl_priv_data *rl = arg1; + struct mlx5e_priv *priv = rl->priv; + unsigned mode_modify; + unsigned was_opened; + uint64_t value; + uint64_t old; + int error; + + PRIV_LOCK(priv); + + MLX5E_RL_RLOCK(rl); + value = rl->param.arg[arg2]; + MLX5E_RL_RUNLOCK(rl); + + if (req != NULL) { + old = value; + error = sysctl_handle_64(oidp, &value, 0, req); + if (error || req->newptr == NULL || + value == rl->param.arg[arg2]) + goto done; + } else { + old = 0; + error = 0; + } + + /* check if device is gone */ + if (priv->gone) { + error = ENXIO; + goto done; + } + was_opened = rl->opened; + mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); + + switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) { + case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def): + if (value > rl->param.tx_worker_threads_max) + value = rl->param.tx_worker_threads_max; + else if (value < 1) + value = 1; + + /* store new value */ + rl->param.arg[arg2] = value; + break; + + case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def): + if (value > rl->param.tx_channels_per_worker_max) + value = rl->param.tx_channels_per_worker_max; + else if (value < 1) + value = 1; + + /* store new value */ + rl->param.arg[arg2] = value; + break; + + case MLX5E_RL_PARAMS_INDEX(tx_rates_def): + if (value > rl->param.tx_rates_max) + value = rl->param.tx_rates_max; + else if (value < 1) + value = 1; + + /* store new value */ + rl->param.arg[arg2] = value; + break; + + case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs): + /* range check */ + if (value < 1) + value = 0; + else if (value > MLX5E_FLD_MAX(cqc, cq_period)) + value = MLX5E_FLD_MAX(cqc, cq_period); + + /* store new value */ + rl->param.arg[arg2] = value; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_rl_refresh_channel_params(rl); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts): + /* import TX coal pkts */ + if (value < 1) + value = 0; + else if (value > MLX5E_FLD_MAX(cqc, cq_max_count)) + value = MLX5E_FLD_MAX(cqc, cq_max_count); + + /* store new value */ + 
rl->param.arg[arg2] = value; + + /* check to avoid down and up the network interface */ + if (was_opened) + error = mlx5e_rl_refresh_channel_params(rl); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode): + /* network interface must be down */ + if (was_opened != 0 && mode_modify == 0) + mlx5e_rl_close_workers(priv); + + /* import TX coalesce mode */ + if (value != 0) + value = 1; + + /* store new value */ + rl->param.arg[arg2] = value; + + /* restart network interface, if any */ + if (was_opened != 0) { + if (mode_modify == 0) + mlx5e_rl_open_workers(priv); + else + error = mlx5e_rl_refresh_channel_params(rl); + } + break; + + case MLX5E_RL_PARAMS_INDEX(tx_queue_size): + /* network interface must be down */ + if (was_opened) + mlx5e_rl_close_workers(priv); + + /* import TX queue size */ + if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) + value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + else if (value > priv->params_ethtool.tx_queue_size_max) + value = priv->params_ethtool.tx_queue_size_max; + + /* store actual TX queue size */ + value = 1ULL << order_base_2(value); + + /* store new value */ + rl->param.arg[arg2] = value; + + /* verify TX completion factor */ + mlx5e_rl_sync_tx_completion_fact(rl); + + /* restart network interface, if any */ + if (was_opened) + mlx5e_rl_open_workers(priv); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_completion_fact): + /* network interface must be down */ + if (was_opened) + mlx5e_rl_close_workers(priv); + + /* store new value */ + rl->param.arg[arg2] = value; + + /* verify parameter */ + mlx5e_rl_sync_tx_completion_fact(rl); + + /* restart network interface, if any */ + if (was_opened) + mlx5e_rl_open_workers(priv); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_limit_add): + error = mlx5e_rl_tx_limit_add(rl, value); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_limit_clr): + error = mlx5e_rl_tx_limit_clr(rl, value); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation): + /* range check */ + if (value > rl->param.tx_allowed_deviation_max) + value = rl->param.tx_allowed_deviation_max; + else if (value < rl->param.tx_allowed_deviation_min) + value = rl->param.tx_allowed_deviation_min; + + MLX5E_RL_WLOCK(rl); + rl->param.arg[arg2] = value; + MLX5E_RL_WUNLOCK(rl); + break; + + case MLX5E_RL_PARAMS_INDEX(tx_burst_size): + /* range check */ + if (value > rl->param.tx_burst_size_max) + value = rl->param.tx_burst_size_max; + else if (value < rl->param.tx_burst_size_min) + value = rl->param.tx_burst_size_min; + + MLX5E_RL_WLOCK(rl); + rl->param.arg[arg2] = value; + MLX5E_RL_WUNLOCK(rl); + break; + + default: + break; + } +done: + PRIV_UNLOCK(priv); + return (error); +} + +static void +mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, + struct sysctl_oid *node, const char *name, const char *desc) +{ + /* + * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will + * take care of loading default sysctl value from the kernel + * environment, if any: + */ + if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) { + /* read-only SYSCTLs */ + SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, + name, CTLTYPE_U64 | CTLFLAG_RD | + CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); + } else { + if (strstr(name, "_def") != 0) { +#ifdef RATELIMIT_DEBUG + /* tunable read-only advanced SYSCTLs */ + SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, + name, CTLTYPE_U64 | CTLFLAG_RDTUN | + CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); +#endif + } else { + /* read-write SYSCTLs */ + 
SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, + name, CTLTYPE_U64 | CTLFLAG_RWTUN | + CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); + } + } +} + +static void +mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, + struct sysctl_oid *node, const char *name, const char *desc) +{ + /* read-only SYSCTLs */ + SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, + CTLFLAG_RD, &rl->stats.arg[x], 0, desc); +} + +#endif diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c new file mode 100644 index 000000000000..cbd7e00a35b9 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -0,0 +1,550 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "en.h" +#include <machine/in_cksum.h> + +static inline int +mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, + struct mlx5e_rx_wqe *wqe, u16 ix) +{ + bus_dma_segment_t segs[rq->nsegs]; + struct mbuf *mb; + int nsegs; + int err; +#if (MLX5E_MAX_RX_SEGS != 1) + struct mbuf *mb_head; + int i; +#endif + if (rq->mbuf[ix].mbuf != NULL) + return (0); + +#if (MLX5E_MAX_RX_SEGS == 1) + mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz); + if (unlikely(!mb)) + return (-ENOMEM); + + mb->m_pkthdr.len = mb->m_len = rq->wqe_sz; +#else + mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, + MLX5E_MAX_RX_BYTES); + if (unlikely(mb == NULL)) + return (-ENOMEM); + + mb->m_len = MLX5E_MAX_RX_BYTES; + mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES; + + for (i = 1; i < rq->nsegs; i++) { + if (mb_head->m_pkthdr.len >= rq->wqe_sz) + break; + mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0, + MLX5E_MAX_RX_BYTES); + if (unlikely(mb == NULL)) { + m_freem(mb_head); + return (-ENOMEM); + } + mb->m_len = MLX5E_MAX_RX_BYTES; + mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES; + } + /* rewind to first mbuf in chain */ + mb = mb_head; +#endif + /* get IP header aligned */ + m_adj(mb, MLX5E_NET_IP_ALIGN); + + err = -bus_dmamap_load_mbuf_sg(rq->dma_tag, rq->mbuf[ix].dma_map, + mb, segs, &nsegs, BUS_DMA_NOWAIT); + if (err != 0) + goto err_free_mbuf; + if (unlikely(nsegs == 0)) { + bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map); + err = -ENOMEM; + goto err_free_mbuf; + } +#if (MLX5E_MAX_RX_SEGS == 1) + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr); +#else + wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr); + wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len | + MLX5_HW_START_PADDING); + for (i = 1; i != nsegs; i++) { + wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr); + wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len); + } + for (; i < rq->nsegs; i++) { + wqe->data[i].addr = 0; + wqe->data[i].byte_count = 0; + } +#endif + + rq->mbuf[ix].mbuf = mb; + rq->mbuf[ix].data = mb->m_data; + + bus_dmamap_sync(rq->dma_tag, rq->mbuf[ix].dma_map, + BUS_DMASYNC_PREREAD); + return (0); + +err_free_mbuf: + m_freem(mb); + return (err); +} + +static void +mlx5e_post_rx_wqes(struct mlx5e_rq *rq) +{ + if (unlikely(rq->enabled == 0)) + return; + + while (!mlx5_wq_ll_is_full(&rq->wq)) { + struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, rq->wq.head); + + if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, rq->wq.head))) { + callout_reset_curcpu(&rq->watchdog, 1, (void *)&mlx5e_post_rx_wqes, rq); + break; + } + mlx5_wq_ll_push(&rq->wq, be16_to_cpu(wqe->next.next_wqe_index)); + } + + /* ensure wqes are visible to device before updating doorbell record */ + atomic_thread_fence_rel(); + + mlx5_wq_ll_update_db_record(&rq->wq); +} + +static void +mlx5e_lro_update_hdr(struct mbuf *mb, struct mlx5_cqe64 *cqe) +{ + /* TODO: consider vlans, ip options, ... 
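+ *
+ * This function patches the headers of a hardware LRO aggregated
+ * packet using the values carried in the CQE: total byte count,
+ * minimum TTL, latest ACK sequence number, TCP window and, when
+ * valid, the TCP timestamps.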
*/ + struct ether_header *eh; + uint16_t eh_type; + uint16_t tot_len; + struct ip6_hdr *ip6 = NULL; + struct ip *ip4 = NULL; + struct tcphdr *th; + uint32_t *ts_ptr; + uint8_t l4_hdr_type; + int tcp_ack; + + eh = mtod(mb, struct ether_header *); + eh_type = ntohs(eh->ether_type); + + l4_hdr_type = get_cqe_l4_hdr_type(cqe); + tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA == l4_hdr_type) || + (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type)); + + /* TODO: consider vlan */ + tot_len = be32_to_cpu(cqe->byte_cnt) - ETHER_HDR_LEN; + + switch (eh_type) { + case ETHERTYPE_IP: + ip4 = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip4 + 1); + break; + case ETHERTYPE_IPV6: + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + break; + default: + return; + } + + ts_ptr = (uint32_t *)(th + 1); + + if (get_cqe_lro_tcppsh(cqe)) + th->th_flags |= TH_PUSH; + + if (tcp_ack) { + th->th_flags |= TH_ACK; + th->th_ack = cqe->lro_ack_seq_num; + th->th_win = cqe->lro_tcp_win; + + /* + * FreeBSD handles only 32bit aligned timestamp right after + * the TCP hdr + * +--------+--------+--------+--------+ + * | NOP | NOP | TSopt | 10 | + * +--------+--------+--------+--------+ + * | TSval timestamp | + * +--------+--------+--------+--------+ + * | TSecr timestamp | + * +--------+--------+--------+--------+ + */ + if (get_cqe_lro_timestamp_valid(cqe) && + (__predict_true(*ts_ptr) == ntohl(TCPOPT_NOP << 24 | + TCPOPT_NOP << 16 | TCPOPT_TIMESTAMP << 8 | + TCPOLEN_TIMESTAMP))) { + /* + * cqe->timestamp is 64bit long. + * [0-31] - timestamp. + * [32-64] - timestamp echo replay. + */ + ts_ptr[1] = *(uint32_t *)&cqe->timestamp; + ts_ptr[2] = *((uint32_t *)&cqe->timestamp + 1); + } + } + if (ip4) { + ip4->ip_ttl = cqe->lro_min_ttl; + ip4->ip_len = cpu_to_be16(tot_len); + ip4->ip_sum = 0; + ip4->ip_sum = in_cksum(mb, ip4->ip_hl << 2); + } else { + ip6->ip6_hlim = cqe->lro_min_ttl; + ip6->ip6_plen = cpu_to_be16(tot_len - + sizeof(struct ip6_hdr)); + } + /* TODO: handle tcp checksum */ +} + +static uint64_t +mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp) +{ + struct mlx5e_clbr_point *cp; + uint64_t a1, a2, res; + u_int gen; + + do { + cp = &priv->clbr_points[priv->clbr_curr]; + gen = atomic_load_acq_int(&cp->clbr_gen); + a1 = (hw_tstmp - cp->clbr_hw_prev) >> MLX5E_TSTMP_PREC; + a2 = (cp->base_curr - cp->base_prev) >> MLX5E_TSTMP_PREC; + res = (a1 * a2) << MLX5E_TSTMP_PREC; + + /* + * Divisor cannot be zero because calibration callback + * checks for the condition and disables timestamping + * if clock halted. 
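+ *
+ * The overall conversion is a linear interpolation between
+ * the two most recent calibration points:
+ *
+ * res = base_prev + ((hw_tstmp - clbr_hw_prev) *
+ * (base_curr - base_prev)) / (clbr_hw_curr - clbr_hw_prev)
+ *
+ * where the MLX5E_TSTMP_PREC shifts trade precision for
+ * headroom in the 64-bit multiplication.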
+ */ + res /= (cp->clbr_hw_curr - cp->clbr_hw_prev) >> + MLX5E_TSTMP_PREC; + + res += cp->base_prev; + atomic_thread_fence_acq(); + } while (gen == 0 || gen != cp->clbr_gen); + return (res); +} + +static inline void +mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, + struct mlx5e_rq *rq, struct mbuf *mb, + u32 cqe_bcnt) +{ + struct ifnet *ifp = rq->ifp; + struct mlx5e_channel *c; +#if (MLX5E_MAX_RX_SEGS != 1) + struct mbuf *mb_head; +#endif + int lro_num_seg; /* HW LRO session aggregated packets counter */ + uint64_t tstmp; + + lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; + if (lro_num_seg > 1) { + mlx5e_lro_update_hdr(mb, cqe); + rq->stats.lro_packets++; + rq->stats.lro_bytes += cqe_bcnt; + } + +#if (MLX5E_MAX_RX_SEGS == 1) + mb->m_pkthdr.len = mb->m_len = cqe_bcnt; +#else + mb->m_pkthdr.len = cqe_bcnt; + for (mb_head = mb; mb != NULL; mb = mb->m_next) { + if (mb->m_len > cqe_bcnt) + mb->m_len = cqe_bcnt; + cqe_bcnt -= mb->m_len; + if (likely(cqe_bcnt == 0)) { + if (likely(mb->m_next != NULL)) { + /* trim off empty mbufs */ + m_freem(mb->m_next); + mb->m_next = NULL; + } + break; + } + } + /* rewind to first mbuf in chain */ + mb = mb_head; +#endif + /* check if a Toeplitz hash was computed */ + if (cqe->rss_hash_type != 0) { + mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result); +#ifdef RSS + /* decode the RSS hash type */ + switch (cqe->rss_hash_type & + (CQE_RSS_DST_HTYPE_L4 | CQE_RSS_DST_HTYPE_IP)) { + /* IPv4 */ + case (CQE_RSS_DST_HTYPE_TCP | CQE_RSS_DST_HTYPE_IPV4): + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_TCP_IPV4); + break; + case (CQE_RSS_DST_HTYPE_UDP | CQE_RSS_DST_HTYPE_IPV4): + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_UDP_IPV4); + break; + case CQE_RSS_DST_HTYPE_IPV4: + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_IPV4); + break; + /* IPv6 */ + case (CQE_RSS_DST_HTYPE_TCP | CQE_RSS_DST_HTYPE_IPV6): + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_TCP_IPV6); + break; + case (CQE_RSS_DST_HTYPE_UDP | CQE_RSS_DST_HTYPE_IPV6): + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_UDP_IPV6); + break; + case CQE_RSS_DST_HTYPE_IPV6: + M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_IPV6); + break; + default: /* Other */ + M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE_HASH); + break; + } +#else + M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE_HASH); +#endif + } else { + mb->m_pkthdr.flowid = rq->ix; + M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE); + } + mb->m_pkthdr.rcvif = ifp; + + if (likely(ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) && + ((cqe->hds_ip_ext & (CQE_L2_OK | CQE_L3_OK | CQE_L4_OK)) == + (CQE_L2_OK | CQE_L3_OK | CQE_L4_OK))) { + mb->m_pkthdr.csum_flags = + CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + mb->m_pkthdr.csum_data = htons(0xffff); + } else { + rq->stats.csum_none++; + } + + if (cqe_has_vlan(cqe)) { + mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info); + mb->m_flags |= M_VLANTAG; + } + + c = container_of(rq, struct mlx5e_channel, rq); + if (c->priv->clbr_done >= 2) { + tstmp = mlx5e_mbuf_tstmp(c->priv, be64_to_cpu(cqe->timestamp)); + if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) { + /* + * Timestamp was taken on the packet entrance, + * instead of the cqe generation. 
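+ * The flag bit is cleared from the value before it is
+ * stored and the mbuf is additionally marked with
+ * M_TSTMP_HPREC to indicate the higher precision.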
+ */ + tstmp &= ~MLX5_CQE_TSTMP_PTP; + mb->m_flags |= M_TSTMP_HPREC; + } + mb->m_pkthdr.rcv_tstmp = tstmp; + mb->m_flags |= M_TSTMP; + } +} + +static inline void +mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cc, void *data) +{ + memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, (cc & cq->wq.sz_m1)), + sizeof(struct mlx5_cqe64)); +} + +static inline void +mlx5e_write_cqe_slot(struct mlx5e_cq *cq, u32 cc, void *data) +{ + memcpy(mlx5_cqwq_get_wqe(&cq->wq, cc & cq->wq.sz_m1), + data, sizeof(struct mlx5_cqe64)); +} + +static inline void +mlx5e_decompress_cqe(struct mlx5e_cq *cq, struct mlx5_cqe64 *title, + struct mlx5_mini_cqe8 *mini, + u16 wqe_counter, int i) +{ + /* + * NOTE: The fields which are not set here are copied from the + * initial and common title. See memcpy() in + * mlx5e_write_cqe_slot(). + */ + title->byte_cnt = mini->byte_cnt; + title->wqe_counter = cpu_to_be16((wqe_counter + i) & cq->wq.sz_m1); + title->check_sum = mini->checksum; + title->op_own = (title->op_own & 0xf0) | + (((cq->wq.cc + i) >> cq->wq.log_sz) & 1); +} + +#define MLX5E_MINI_ARRAY_SZ 8 +/* Make sure structs are not packet differently */ +CTASSERT(sizeof(struct mlx5_cqe64) == + sizeof(struct mlx5_mini_cqe8) * MLX5E_MINI_ARRAY_SZ); +static void +mlx5e_decompress_cqes(struct mlx5e_cq *cq) +{ + struct mlx5_mini_cqe8 mini_array[MLX5E_MINI_ARRAY_SZ]; + struct mlx5_cqe64 title; + u32 cqe_count; + u32 i = 0; + u16 title_wqe_counter; + + mlx5e_read_cqe_slot(cq, cq->wq.cc, &title); + title_wqe_counter = be16_to_cpu(title.wqe_counter); + cqe_count = be32_to_cpu(title.byte_cnt); + + /* Make sure we won't overflow */ + KASSERT(cqe_count <= cq->wq.sz_m1, + ("%s: cqe_count %u > cq->wq.sz_m1 %u", __func__, + cqe_count, cq->wq.sz_m1)); + + mlx5e_read_cqe_slot(cq, cq->wq.cc + 1, mini_array); + while (true) { + mlx5e_decompress_cqe(cq, &title, + &mini_array[i % MLX5E_MINI_ARRAY_SZ], + title_wqe_counter, i); + mlx5e_write_cqe_slot(cq, cq->wq.cc + i, &title); + i++; + + if (i == cqe_count) + break; + if (i % MLX5E_MINI_ARRAY_SZ == 0) + mlx5e_read_cqe_slot(cq, cq->wq.cc + i, mini_array); + } +} + +static int +mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) +{ + int i; + + for (i = 0; i < budget; i++) { + struct mlx5e_rx_wqe *wqe; + struct mlx5_cqe64 *cqe; + struct mbuf *mb; + __be16 wqe_counter_be; + u16 wqe_counter; + u32 byte_cnt; + + cqe = mlx5e_get_cqe(&rq->cq); + if (!cqe) + break; + + if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) + mlx5e_decompress_cqes(&rq->cq); + + mlx5_cqwq_pop(&rq->cq.wq); + + wqe_counter_be = cqe->wqe_counter; + wqe_counter = be16_to_cpu(wqe_counter_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); + byte_cnt = be32_to_cpu(cqe->byte_cnt); + + bus_dmamap_sync(rq->dma_tag, + rq->mbuf[wqe_counter].dma_map, + BUS_DMASYNC_POSTREAD); + + if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) { + rq->stats.wqe_err++; + goto wq_ll_pop; + } + if ((MHLEN - MLX5E_NET_IP_ALIGN) >= byte_cnt && + (mb = m_gethdr(M_NOWAIT, MT_DATA)) != NULL) { +#if (MLX5E_MAX_RX_SEGS != 1) + /* set maximum mbuf length */ + mb->m_len = MHLEN - MLX5E_NET_IP_ALIGN; +#endif + /* get IP header aligned */ + mb->m_data += MLX5E_NET_IP_ALIGN; + + bcopy(rq->mbuf[wqe_counter].data, mtod(mb, caddr_t), + byte_cnt); + } else { + mb = rq->mbuf[wqe_counter].mbuf; + rq->mbuf[wqe_counter].mbuf = NULL; /* safety clear */ + + bus_dmamap_unload(rq->dma_tag, + rq->mbuf[wqe_counter].dma_map); + } + + mlx5e_build_rx_mbuf(cqe, rq, mb, byte_cnt); + rq->stats.packets++; + +#if !defined(HAVE_TCP_LRO_RX) + tcp_lro_queue_mbuf(&rq->lro, mb); +#else + if 
(mb->m_pkthdr.csum_flags == 0 || + (rq->ifp->if_capenable & IFCAP_LRO) == 0 || + rq->lro.lro_cnt == 0 || + tcp_lro_rx(&rq->lro, mb, 0) != 0) { + rq->ifp->if_input(rq->ifp, mb); + } +#endif +wq_ll_pop: + mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, + &wqe->next.next_wqe_index); + } + + mlx5_cqwq_update_db_record(&rq->cq.wq); + + /* ensure cq space is freed before enabling more cqes */ + atomic_thread_fence_rel(); + return (i); +} + +void +mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq) +{ + struct mlx5e_rq *rq = container_of(mcq, struct mlx5e_rq, cq.mcq); + int i = 0; + +#ifdef HAVE_PER_CQ_EVENT_PACKET +#if (MHLEN < 15) +#error "MHLEN is too small" +#endif + struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA); + + if (mb != NULL) { + /* this code is used for debugging purpose only */ + mb->m_pkthdr.len = mb->m_len = 15; + memset(mb->m_data, 255, 14); + mb->m_data[14] = rq->ix; + mb->m_pkthdr.rcvif = rq->ifp; + rq->ifp->if_input(rq->ifp, mb); + } +#endif + + mtx_lock(&rq->mtx); + + /* + * Polling the entire CQ without posting new WQEs results in + * lack of receive WQEs during heavy traffic scenarios. + */ + while (1) { + if (mlx5e_poll_rx_cq(rq, MLX5E_RX_BUDGET_MAX) != + MLX5E_RX_BUDGET_MAX) + break; + i += MLX5E_RX_BUDGET_MAX; + if (i >= MLX5E_BUDGET_MAX) + break; + mlx5e_post_rx_wqes(rq); + } + mlx5e_post_rx_wqes(rq); + mlx5e_cq_arm(&rq->cq, MLX5_GET_DOORBELL_LOCK(&rq->channel->priv->doorbell_lock)); + tcp_lro_flush_all(&rq->lro); + mtx_unlock(&rq->mtx); +} diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c new file mode 100644 index 000000000000..40d8157c6771 --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c @@ -0,0 +1,666 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "en.h" +#include <machine/atomic.h> + +static inline bool +mlx5e_do_send_cqe(struct mlx5e_sq *sq) +{ + sq->cev_counter++; + /* interleave the CQEs */ + if (sq->cev_counter >= sq->cev_factor) { + sq->cev_counter = 0; + return (1); + } + return (0); +} + +void +mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt) +{ + u16 pi = sq->pc & sq->wq.sz_m1; + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); + + memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); + + wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP); + wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + if (mlx5e_do_send_cqe(sq)) + wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + else + wqe->ctrl.fm_ce_se = 0; + + /* Copy data for doorbell */ + memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); + + sq->mbuf[pi].mbuf = NULL; + sq->mbuf[pi].num_bytes = 0; + sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); + sq->pc += sq->mbuf[pi].num_wqebbs; +} + +#if (__FreeBSD_version >= 1100000) +static uint32_t mlx5e_hash_value; + +static void +mlx5e_hash_init(void *arg) +{ + mlx5e_hash_value = m_ether_tcpip_hash_init(); +} + +/* Make kernel call mlx5e_hash_init after the random stack finished initializing */ +SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL); +#endif + +static struct mlx5e_sq * +mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb) +{ + struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_channel * volatile *ppch; + struct mlx5e_channel *pch; + u32 ch; + u32 tc; + + ppch = priv->channel; + + /* check if channels are successfully opened */ + if (unlikely(ppch == NULL)) + return (NULL); + + /* obtain VLAN information if present */ + if (mb->m_flags & M_VLANTAG) { + tc = (mb->m_pkthdr.ether_vtag >> 13); + if (tc >= priv->num_tc) + tc = priv->default_vlan_prio; + } else { + tc = priv->default_vlan_prio; + } + + ch = priv->params.num_channels; + +#ifdef RATELIMIT + if (mb->m_pkthdr.snd_tag != NULL) { + struct mlx5e_sq *sq; + + /* check for route change */ + if (mb->m_pkthdr.snd_tag->ifp != ifp) + return (NULL); + + /* get pointer to sendqueue */ + sq = container_of(mb->m_pkthdr.snd_tag, + struct mlx5e_rl_channel, m_snd_tag)->sq; + + /* check if valid */ + if (sq != NULL && sq->stopped == 0) + return (sq); + + /* FALLTHROUGH */ + } +#endif + /* check if flowid is set */ + if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) { +#ifdef RSS + u32 temp; + + if (rss_hash2bucket(mb->m_pkthdr.flowid, + M_HASHTYPE_GET(mb), &temp) == 0) + ch = temp % ch; + else +#endif + ch = (mb->m_pkthdr.flowid % 128) % ch; + } else { +#if (__FreeBSD_version >= 1100000) + ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | + MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch; +#else + /* + * m_ether_tcpip_hash not present in stable, so just + * throw unhashed mbufs on queue 0 + */ + ch = 0; +#endif + } + + /* check if channel is allocated and not stopped */ + pch = ppch[ch]; + if (likely(pch != NULL && pch->sq[tc].stopped == 0)) + return (&pch->sq[tc]); + return (NULL); +} + +static inline u16 +mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, struct mbuf *mb) +{ + + switch(sq->min_inline_mode) { + case MLX5_INLINE_MODE_NONE: + /* + * When inline mode is NONE, we do not need to copy + * headers into WQEs, except when vlan tag framing is + * requested. Hardware might offload vlan tagging on + * transmit. This is a separate capability, which is + * known to be disabled on ConnectX-5 due to a hardware + * bug RM 931383. 
If vlan_inline_cap is not present and + * the packet has vlan tag, fall back to inlining. + */ + if ((mb->m_flags & M_VLANTAG) != 0 && + sq->vlan_inline_cap == 0) + break; + return (0); + case MLX5_INLINE_MODE_L2: + /* + * Due to hardware limitations, when trust mode is + * DSCP, the hardware may request MLX5_INLINE_MODE_L2 + * while it really needs all L2 headers and the 4 first + * bytes of the IP header (which include the + * TOS/traffic-class). + * + * To avoid doing a firmware command for querying the + * trust state and parsing the mbuf for doing + * unnecessary checks (VLAN/eth_type) in the fast path, + * we are going for the worth case (22 Bytes) if + * the mb->m_pkthdr.len allows it. + */ + if (mb->m_pkthdr.len > ETHER_HDR_LEN + + ETHER_VLAN_ENCAP_LEN + 4) + return (MIN(sq->max_inline, ETHER_HDR_LEN + + ETHER_VLAN_ENCAP_LEN + 4)); + break; + } + return (MIN(sq->max_inline, mb->m_pkthdr.len)); +} + +static int +mlx5e_get_header_size(struct mbuf *mb) +{ + struct ether_vlan_header *eh; + struct tcphdr *th; + struct ip *ip; + int ip_hlen, tcp_hlen; + struct ip6_hdr *ip6; + uint16_t eth_type; + int eth_hdr_len; + + eh = mtod(mb, struct ether_vlan_header *); + if (mb->m_len < ETHER_HDR_LEN) + return (0); + if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + eth_type = ntohs(eh->evl_proto); + eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + } else { + eth_type = ntohs(eh->evl_encap_proto); + eth_hdr_len = ETHER_HDR_LEN; + } + if (mb->m_len < eth_hdr_len) + return (0); + switch (eth_type) { + case ETHERTYPE_IP: + ip = (struct ip *)(mb->m_data + eth_hdr_len); + if (mb->m_len < eth_hdr_len + sizeof(*ip)) + return (0); + if (ip->ip_p != IPPROTO_TCP) + return (0); + ip_hlen = ip->ip_hl << 2; + eth_hdr_len += ip_hlen; + break; + case ETHERTYPE_IPV6: + ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); + if (mb->m_len < eth_hdr_len + sizeof(*ip6)) + return (0); + if (ip6->ip6_nxt != IPPROTO_TCP) + return (0); + eth_hdr_len += sizeof(*ip6); + break; + default: + return (0); + } + if (mb->m_len < eth_hdr_len + sizeof(*th)) + return (0); + th = (struct tcphdr *)(mb->m_data + eth_hdr_len); + tcp_hlen = th->th_off << 2; + eth_hdr_len += tcp_hlen; + if (mb->m_len < eth_hdr_len) + return (0); + return (eth_hdr_len); +} + +/* + * The return value is not going back to the stack because of + * the drbr + */ +static int +mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp) +{ + bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS]; + struct mlx5_wqe_data_seg *dseg; + struct mlx5e_tx_wqe *wqe; + struct ifnet *ifp; + int nsegs; + int err; + int x; + struct mbuf *mb = *mbp; + u16 ds_cnt; + u16 ihs; + u16 pi; + u8 opcode; + + /* + * Return ENOBUFS if the queue is full, this may trigger reinsertion + * of the mbuf into the drbr (see mlx5e_xmit_locked) + */ + if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) { + return (ENOBUFS); + } + + /* Align SQ edge with NOPs to avoid WQE wrap around */ + pi = ((~sq->pc) & sq->wq.sz_m1); + if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) { + /* Send one multi NOP message instead of many */ + mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS); + pi = ((~sq->pc) & sq->wq.sz_m1); + if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) + return (ENOMEM); + } + + /* Setup local variables */ + pi = sq->pc & sq->wq.sz_m1; + wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); + ifp = sq->ifp; + + memset(wqe, 0, sizeof(*wqe)); + + /* Send a copy of the frame to the BPF listener, if any */ + if (ifp != NULL && ifp->if_bpf != NULL) + ETHER_BPF_MTAP(ifp, mb); + + if 
(mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) { + wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM; + } + if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) { + wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM; + } + if (wqe->eth.cs_flags == 0) { + sq->stats.csum_offload_none++; + } + if (mb->m_pkthdr.csum_flags & CSUM_TSO) { + u32 payload_len; + u32 mss = mb->m_pkthdr.tso_segsz; + u32 num_pkts; + + wqe->eth.mss = cpu_to_be16(mss); + opcode = MLX5_OPCODE_LSO; + ihs = mlx5e_get_header_size(mb); + payload_len = mb->m_pkthdr.len - ihs; + if (payload_len == 0) + num_pkts = 1; + else + num_pkts = DIV_ROUND_UP(payload_len, mss); + sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs); + + sq->stats.tso_packets++; + sq->stats.tso_bytes += payload_len; + } else { + opcode = MLX5_OPCODE_SEND; + ihs = mlx5e_get_inline_hdr_size(sq, mb); + sq->mbuf[pi].num_bytes = max_t (unsigned int, + mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); + } + if (ihs == 0) { + if ((mb->m_flags & M_VLANTAG) != 0) { + wqe->eth.vlan_cmd = htons(0x8000); /* bit 0 CVLAN */ + wqe->eth.vlan_hdr = htons(mb->m_pkthdr.ether_vtag); + } else { + wqe->eth.inline_hdr_sz = 0; + } + } else { + if ((mb->m_flags & M_VLANTAG) != 0) { + struct ether_vlan_header *eh = (struct ether_vlan_header + *)wqe->eth.inline_hdr_start; + + /* Range checks */ + if (ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN)) + ihs = (MLX5E_MAX_TX_INLINE - + ETHER_VLAN_ENCAP_LEN); + else if (ihs < ETHER_HDR_LEN) { + err = EINVAL; + goto tx_drop; + } + m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh); + m_adj(mb, ETHER_HDR_LEN); + /* Insert 4 bytes VLAN tag into data stream */ + eh->evl_proto = eh->evl_encap_proto; + eh->evl_encap_proto = htons(ETHERTYPE_VLAN); + eh->evl_tag = htons(mb->m_pkthdr.ether_vtag); + /* Copy rest of header data, if any */ + m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + + 1)); + m_adj(mb, ihs - ETHER_HDR_LEN); + /* Extend header by 4 bytes */ + ihs += ETHER_VLAN_ENCAP_LEN; + } else { + m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start); + m_adj(mb, ihs); + } + wqe->eth.inline_hdr_sz = cpu_to_be16(ihs); + } + + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + if (ihs > sizeof(wqe->eth.inline_hdr_start)) { + ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start), + MLX5_SEND_WQE_DS); + } + dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt; + + err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, + mb, segs, &nsegs, BUS_DMA_NOWAIT); + if (err == EFBIG) { + /* Update statistics */ + sq->stats.defragged++; + /* Too many mbuf fragments */ + mb = m_defrag(*mbp, M_NOWAIT); + if (mb == NULL) { + mb = *mbp; + goto tx_drop; + } + /* Try again */ + err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, + mb, segs, &nsegs, BUS_DMA_NOWAIT); + } + /* Catch errors */ + if (err != 0) + goto tx_drop; + + /* Make sure all mbuf data, if any, is written to RAM */ + if (nsegs != 0) { + bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, + BUS_DMASYNC_PREWRITE); + } else { + /* All data was inlined, free the mbuf. 
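+ * The DMA map is unloaded and the mbuf released right
+ * away; nothing is left for the completion handler to
+ * free for this WQE slot.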
*/ + bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map); + m_freem(mb); + mb = NULL; + } + + for (x = 0; x != nsegs; x++) { + if (segs[x].ds_len == 0) + continue; + dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len); + dseg++; + } + + ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl)); + + wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); + wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + if (mlx5e_do_send_cqe(sq)) + wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + else + wqe->ctrl.fm_ce_se = 0; + + /* Copy data for doorbell */ + memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); + + /* Store pointer to mbuf */ + sq->mbuf[pi].mbuf = mb; + sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); + sq->pc += sq->mbuf[pi].num_wqebbs; + + sq->stats.packets++; + *mbp = NULL; /* safety clear */ + return (0); + +tx_drop: + sq->stats.dropped++; + *mbp = NULL; + m_freem(mb); + return err; +} + +static void +mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget) +{ + u16 sqcc; + + /* + * sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + while (budget > 0) { + struct mlx5_cqe64 *cqe; + struct mbuf *mb; + u16 x; + u16 ci; + + cqe = mlx5e_get_cqe(&sq->cq); + if (!cqe) + break; + + mlx5_cqwq_pop(&sq->cq.wq); + + /* update budget according to the event factor */ + budget -= sq->cev_factor; + + for (x = 0; x != sq->cev_factor; x++) { + ci = sqcc & sq->wq.sz_m1; + mb = sq->mbuf[ci].mbuf; + sq->mbuf[ci].mbuf = NULL; /* Safety clear */ + + if (mb == NULL) { + if (sq->mbuf[ci].num_bytes == 0) { + /* NOP */ + sq->stats.nop++; + } + } else { + bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map); + + /* Free transmitted mbuf */ + m_freem(mb); + } + sqcc += sq->mbuf[ci].num_wqebbs; + } + } + + mlx5_cqwq_update_db_record(&sq->cq.wq); + + /* Ensure cq space is freed before enabling more cqes */ + atomic_thread_fence_rel(); + + sq->cc = sqcc; + + if (sq->sq_tq != NULL && + atomic_cmpset_int(&sq->queue_state, MLX5E_SQ_FULL, MLX5E_SQ_READY)) + taskqueue_enqueue(sq->sq_tq, &sq->sq_task); +} + +static int +mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb) +{ + struct mbuf *next; + int err = 0; + + if (likely(mb != NULL)) { + /* + * If we can't insert mbuf into drbr, try to xmit anyway. + * We keep the error we got so we could return that after xmit. 
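+ * The queue is still drained below, so packets that were
+ * enqueued earlier keep making progress even when this
+ * enqueue fails.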
+ */ + err = drbr_enqueue(ifp, sq->br, mb); + } + + /* + * Check if the network interface is closed or if the SQ is + * being stopped: + */ + if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + sq->stopped != 0)) + return (err); + + /* Process the queue */ + while ((next = drbr_peek(ifp, sq->br)) != NULL) { + if (mlx5e_sq_xmit(sq, &next) != 0) { + if (next != NULL) { + drbr_putback(ifp, sq->br, next); + atomic_store_rel_int(&sq->queue_state, MLX5E_SQ_FULL); + break; + } + } + drbr_advance(ifp, sq->br); + } + /* Check if we need to write the doorbell */ + if (likely(sq->doorbell.d64 != 0)) { + mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); + sq->doorbell.d64 = 0; + } + /* + * Check if we need to start the event timer which flushes the + * transmit ring on timeout: + */ + if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL && + sq->cev_factor != 1)) { + /* start the timer */ + mlx5e_sq_cev_timeout(sq); + } else { + /* don't send NOPs yet */ + sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; + } + return (err); +} + +static int +mlx5e_xmit_locked_no_br(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb) +{ + int err = 0; + + if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + sq->stopped != 0)) { + m_freem(mb); + return (ENETDOWN); + } + + /* Do transmit */ + if (mlx5e_sq_xmit(sq, &mb) != 0) { + /* NOTE: m_freem() is NULL safe */ + m_freem(mb); + err = ENOBUFS; + } + + /* Check if we need to write the doorbell */ + if (likely(sq->doorbell.d64 != 0)) { + mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); + sq->doorbell.d64 = 0; + } + + /* + * Check if we need to start the event timer which flushes the + * transmit ring on timeout: + */ + if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL && + sq->cev_factor != 1)) { + /* start the timer */ + mlx5e_sq_cev_timeout(sq); + } else { + /* don't send NOPs yet */ + sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; + } + return (err); +} + +int +mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb) +{ + struct mlx5e_sq *sq; + int ret; + + sq = mlx5e_select_queue(ifp, mb); + if (unlikely(sq == NULL)) { +#ifdef RATELIMIT + /* Check for route change */ + if (mb->m_pkthdr.snd_tag != NULL && + mb->m_pkthdr.snd_tag->ifp != ifp) { + /* Free mbuf */ + m_freem(mb); + + /* + * Tell upper layers about route change and to + * re-transmit this packet: + */ + return (EAGAIN); + } +#endif + /* Free mbuf */ + m_freem(mb); + + /* Invalid send queue */ + return (ENXIO); + } + + if (unlikely(sq->br == NULL)) { + /* rate limited traffic */ + mtx_lock(&sq->lock); + ret = mlx5e_xmit_locked_no_br(ifp, sq, mb); + mtx_unlock(&sq->lock); + } else if (mtx_trylock(&sq->lock)) { + ret = mlx5e_xmit_locked(ifp, sq, mb); + mtx_unlock(&sq->lock); + } else { + ret = drbr_enqueue(ifp, sq->br, mb); + taskqueue_enqueue(sq->sq_tq, &sq->sq_task); + } + + return (ret); +} + +void +mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq) +{ + struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq); + + mtx_lock(&sq->comp_lock); + mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX); + mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock)); + mtx_unlock(&sq->comp_lock); +} + +void +mlx5e_tx_que(void *context, int pending) +{ + struct mlx5e_sq *sq = context; + struct ifnet *ifp = sq->ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + mtx_lock(&sq->lock); + if (!drbr_empty(ifp, sq->br)) + mlx5e_xmit_locked(ifp, sq, NULL); + mtx_unlock(&sq->lock); + } +} diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c new file mode 100644 index 
000000000000..771b4c69ffbc --- /dev/null +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_txrx.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "en.h" + +struct mlx5_cqe64 * +mlx5e_get_cqe(struct mlx5e_cq *cq) +{ + struct mlx5_cqe64 *cqe; + + cqe = mlx5_cqwq_get_wqe(&cq->wq, mlx5_cqwq_get_ci(&cq->wq)); + + if ((cqe->op_own ^ mlx5_cqwq_get_wrap_cnt(&cq->wq)) & MLX5_CQE_OWNER_MASK) + return (NULL); + + /* ensure cqe content is read after cqe ownership bit */ + atomic_thread_fence_acq(); + + return (cqe); +} + +void +mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event) +{ + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + + if_printf(cq->priv->ifp, "%s: cqn=0x%.6x event=0x%.2x\n", + __func__, mcq->cqn, event); +} |