-rw-r--r-- | sys/modules/tcp/rack/Makefile | 2
-rw-r--r-- | sys/netinet/tcp_stacks/rack.c | 6007
-rw-r--r-- | sys/netinet/tcp_stacks/rack_bbr_common.c | 34
-rw-r--r-- | sys/netinet/tcp_stacks/rack_bbr_common.h | 3
-rw-r--r-- | sys/netinet/tcp_stacks/tailq_hash.c | 344
-rw-r--r-- | sys/netinet/tcp_stacks/tailq_hash.h | 73
-rw-r--r-- | sys/netinet/tcp_stacks/tcp_rack.h | 165
-rw-r--r-- | sys/netinet/tcp_subr.c | 5
8 files changed, 5218 insertions, 1415 deletions
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile index cf95faa7fcfd..b80f34ba7ed4 100644 --- a/sys/modules/tcp/rack/Makefile +++ b/sys/modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c +SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_kern_tls.h diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 8b205d12d7f7..514d10098ff6 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -129,6 +129,7 @@ __FBSDID("$FreeBSD$"); #endif #include "sack_filter.h" #include "tcp_rack.h" +#include "tailq_hash.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; @@ -191,21 +192,38 @@ static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 * - 60 seconds */ +static uint32_t rack_clamp_ss_upper = 110; +static uint32_t rack_clamp_ca_upper = 105; +static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */ +static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */ +static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */ +static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */ +static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ +static int32_t rack_rxt_controls = 0; +static int32_t rack_fill_cw_state = 0; static uint8_t rack_req_measurements = 1; /* Attack threshold detections */ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; +static uint32_t rack_merge_out_sacks_on_attack = 0; static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ -static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */ -static int32_t rack_hw_rate_caps = 1; /* 1; */ +static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */ +static int32_t rack_hw_rate_caps = 0; /* 1; */ +static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ static int32_t rack_hw_rate_min = 0; /* 1500000;*/ static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ -static int32_t rack_hw_up_only = 1; +static int32_t rack_hw_up_only = 0; static int32_t rack_stats_gets_ms_rtt = 1; static int32_t rack_prr_addbackmax = 2; static int32_t rack_do_hystart = 0; static int32_t rack_apply_rtt_with_reduced_conf = 0; +static int32_t rack_hibeta_setting = 0; +static int32_t rack_default_pacing_divisor = 250; +static int32_t rack_uses_full_dgp_in_rec = 1; +static uint16_t rack_pacing_min_seg = 0; + +static uint32_t sad_seg_size_per = 800; /* 80.0 % */ static int32_t rack_pkt_delay = 1000; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ @@ -219,11 +237,13 @@ static int32_t rack_use_rsm_rfo = 1; static int32_t rack_max_abc_post_recovery = 2; static int32_t rack_client_low_buf = 0; static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ +static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */ #ifdef TCP_ACCOUNTING static int32_t rack_tcp_accounting = 0; #endif static int32_t rack_limits_scwnd = 1; static int32_t rack_enable_mqueue_for_nonpaced = 0; +static int32_t rack_hybrid_allow_set_maxseg = 0; static int32_t rack_disable_prr = 0; 
static int32_t use_rack_rr = 1; static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ @@ -233,11 +253,12 @@ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to us static int32_t rack_default_init_window = 0; /* Use system default */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ - +static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ +static int32_t rack_full_buffer_discount = 10; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -326,8 +347,6 @@ static int32_t rack_req_segs = 1; static uint64_t rack_bw_rate_cap = 0; -/* Weird delayed ack mode */ -static int32_t rack_use_imac_dack = 0; /* Rack specific counters */ counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enobuf_hw; @@ -336,6 +355,7 @@ counter_u64_t rack_persists_sends; counter_u64_t rack_persists_acks; counter_u64_t rack_persists_loss; counter_u64_t rack_persists_lost_ends; +counter_u64_t rack_total_bytes; #ifdef INVARIANTS counter_u64_t rack_adjust_map_bw; #endif @@ -352,6 +372,8 @@ counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; +counter_u64_t rack_rxt_clamps_cwnd; +counter_u64_t rack_rxt_clamps_cwnd_uniq; counter_u64_t rack_multi_single_eq; counter_u64_t rack_proc_non_comp_ack; @@ -367,6 +389,7 @@ counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_sack_attacks_detected; counter_u64_t rack_sack_attacks_reversed; +counter_u64_t rack_sack_attacks_suspect; counter_u64_t rack_sack_used_next_merge; counter_u64_t rack_sack_splits; counter_u64_t rack_sack_used_prev_merge; @@ -455,18 +478,25 @@ static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt); static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line, uint8_t quality); +static void +rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); + static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp, void **ptr); static void rack_init_sysctls(void); + static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, - struct tcphdr *th, int entered_rec, int dup_ack_struck); + struct tcphdr *th, int entered_rec, int dup_ack_struck, + int *dsack_seen, int *sacks_seen); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, - struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls); + struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); + +static uint64_t rack_get_gp_est(struct tcp_rack *rack); static void rack_log_sack_passed(struct tcpcb 
*tp, struct tcp_rack *rack, @@ -477,7 +507,7 @@ static int32_t rack_output(struct tcpcb *tp); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, - uint32_t cts, int *moved_two); + uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz); static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt); @@ -486,10 +516,10 @@ static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag); + struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag); + struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); @@ -530,6 +560,7 @@ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); +static void rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); @@ -544,6 +575,26 @@ rack_apply_deferred_options(struct tcp_rack *rack); int32_t rack_clear_counter=0; +static uint64_t +rack_get_lt_bw(struct tcp_rack *rack) +{ + struct timeval tv; + uint64_t tim, bytes; + + tim = rack->r_ctl.lt_bw_time; + bytes = rack->r_ctl.lt_bw_bytes; + if (rack->lt_bw_up) { + /* Include all the current bytes too */ + microuptime(&tv); + bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); + tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); + } + if ((bytes != 0) && (tim != 0)) + return ((bytes * (uint64_t)1000000) / tim); + else + return (0); +} + static void rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) { @@ -645,7 +696,7 @@ rack_set_cc_pacing(struct tcp_rack *rack) rack->rc_pacing_cc_set = 1; rack_swap_beta_values(rack, 3); } - + static void rack_undo_cc_pacing(struct tcp_rack *rack) { @@ -659,6 +710,42 @@ rack_undo_cc_pacing(struct tcp_rack *rack) rack_swap_beta_values(rack, 4); } +static void +rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, + uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = seq_end; + log.u_bbr.flex2 = rack->rc_tp->gput_seq; + log.u_bbr.flex3 = ack_end_t; + log.u_bbr.flex4 = rack->rc_tp->gput_ts; + log.u_bbr.flex5 = send_end_t; + log.u_bbr.flex6 = rack->rc_tp->gput_ack; + log.u_bbr.flex7 = mode; + log.u_bbr.flex8 = 69; + log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; + log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; + log.u_bbr.pkts_out = line; + log.u_bbr.cwnd_gain = rack->app_limited_needs_set; + log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; + if (rsm != NULL) { + log.u_bbr.applimited = rsm->r_start; + 
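For reference, the long-term bandwidth helper rack_get_lt_bw() added above reduces to bytes accumulated over active-send time converted to bytes per second, folding in the not-yet-accounted bytes and elapsed time when a measurement is still running. A minimal standalone sketch of that arithmetic follows; the struct and field names are illustrative placeholders, not the kernel's types.

```c
#include <stdint.h>

/*
 * Sketch of the rack_get_lt_bw() arithmetic shown in the diff above:
 * bytes sent while measuring, divided by the active sending time in
 * microseconds, scaled to bytes per second.
 */
struct lt_bw_sample {
	uint64_t bytes;		/* cf. lt_bw_bytes: bytes sent while measuring */
	uint64_t time_us;	/* cf. lt_bw_time: active sending time, usec */
};

static uint64_t
lt_bw_estimate(const struct lt_bw_sample *s)
{
	if (s->bytes == 0 || s->time_us == 0)
		return (0);
	/* bytes per usec, scaled by 1,000,000 usec/sec -> bytes/sec */
	return ((s->bytes * (uint64_t)1000000) / s->time_us);
}
```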
log.u_bbr.delivered = rsm->r_end; + log.u_bbr.epoch = rsm->r_flags; + } + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSI_CALC, 0, + 0, &log, false, &tv); + } +} + #ifdef NETFLIX_PEAKRATE static inline void rack_update_peakrate_thr(struct tcpcb *tp) @@ -697,6 +784,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_saw_enobuf_hw); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_persists_sends); + counter_u64_zero(rack_total_bytes); counter_u64_zero(rack_persists_acks); counter_u64_zero(rack_persists_loss); counter_u64_zero(rack_persists_lost_ends); @@ -719,10 +807,13 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); + counter_u64_zero(rack_rxt_clamps_cwnd); + counter_u64_zero(rack_rxt_clamps_cwnd_uniq); counter_u64_zero(rack_multi_single_eq); counter_u64_zero(rack_proc_non_comp_ack); counter_u64_zero(rack_sack_attacks_detected); counter_u64_zero(rack_sack_attacks_reversed); + counter_u64_zero(rack_sack_attacks_suspect); counter_u64_zero(rack_sack_used_next_merge); counter_u64_zero(rack_sack_used_prev_merge); counter_u64_zero(rack_sack_splits); @@ -737,6 +828,18 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_collapsed_win_rxt); counter_u64_zero(rack_collapsed_win_seen); counter_u64_zero(rack_collapsed_win_rxt_bytes); + } else if (stat == 2) { +#ifdef INVARIANTS + printf("Clearing RACK option array\n"); +#endif + COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); + } else if (stat == 3) { + printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); + } else if (stat == 4) { +#ifdef INVARIANTS + printf("Clearing RACK out size array\n"); +#endif + COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); } rack_clear_counter = 0; return (0); @@ -895,14 +998,44 @@ rack_init_sysctls(void) "Pacing related Controls"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fulldgpinrec", CTLFLAG_RW, + &rack_uses_full_dgp_in_rec, 1, + "Do we use all DGP features in recovery (fillcw, timely et.al.)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fullbufdisc", CTLFLAG_RW, + &rack_full_buffer_discount, 10, + "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fillcw", CTLFLAG_RW, + &rack_fill_cw_state, 0, + "Enable fillcw on new connections (default=0 off)?"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "min_burst", CTLFLAG_RW, + &rack_pacing_min_seg, 0, + "What is the min burst size for pacing (0 disables)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "divisor", CTLFLAG_RW, + &rack_default_pacing_divisor, 4, + "What is the default divisor given to the rl code?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, + &rack_bw_multipler, 2, + "What is the multiplier of the current gp_est that fillcw can increase the b/w too?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "max_pace_over", CTLFLAG_RW, &rack_max_per_above, 30, "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "pace_to_one", CTLFLAG_RW, + OID_AUTO, "allow1mss", CTLFLAG_RW, &rack_pace_one_seg, 0, - "Do we allow low b/w pacing of 1MSS instead of two"); + "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "limit_wsrtt", CTLFLAG_RW, @@ -967,8 +1100,13 @@ rack_init_sysctls(void) "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "precheck", CTLFLAG_RW, + &rack_hw_check_queue, 0, + "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, - &rack_enobuf_hw_boost_mult, 2, + &rack_enobuf_hw_boost_mult, 0, "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), @@ -988,10 +1126,15 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "rate_cap", CTLFLAG_RW, - &rack_hw_rate_caps, 1, + &rack_hw_rate_caps, 0, "Does the highest hardware pacing rate cap the rate we will send at??"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "uncap_per", CTLFLAG_RW, + &rack_hw_rate_cap_per, 0, + "If you go over b/w by this amount you will be uncapped (0 = never)"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "rate_min", CTLFLAG_RW, &rack_hw_rate_min, 0, "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); @@ -1003,12 +1146,12 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "up_only", CTLFLAG_RW, - &rack_hw_up_only, 1, + &rack_hw_up_only, 0, "Do we allow hw pacing to lower the rate selected?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "extra_mss_precise", CTLFLAG_RW, - &rack_hw_pace_extra_slots, 2, + &rack_hw_pace_extra_slots, 0, "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1287,6 +1430,16 @@ rack_init_sysctls(void) "features", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Feature controls"); + SYSCTL_ADD_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_features), + OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW, + &rack_rxt_clamp_thresh, 0, + "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_features), + OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, + &rack_hybrid_allow_set_maxseg, 0, + "Should hybrid pacing allow the setmss command"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_features), OID_AUTO, "cmpack", CTLFLAG_RW, @@ -1333,6 +1486,26 @@ rack_init_sysctls(void) #endif SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "dnd", CTLFLAG_RW, + &rack_dnd_default, 0, + "Do not disturb default for rack_rrr = 3"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "sad_seg_per", CTLFLAG_RW, + &sad_seg_size_per, 800, + "Percentage of segment size needed in a sack 800 = 80.0?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rxt_controls", CTLFLAG_RW, + &rack_rxt_controls, 0, + "Retransmit sending size controls (valid 
values 0, 1, 2 default=1)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rack_hibeta", CTLFLAG_RW, + &rack_hibeta_setting, 0, + "Do we ue a high beta (80 instead of 50)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, &rack_apply_rtt_with_reduced_conf, 0, "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); @@ -1373,11 +1546,6 @@ rack_init_sysctls(void) "Should RACK place low end time limits on the shared cwnd feature"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "iMac_dack", CTLFLAG_RW, - &rack_use_imac_dack, 0, - "Should RACK try to emulate iMac delayed ack"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), OID_AUTO, "no_prr", CTLFLAG_RW, &rack_disable_prr, 0, "Should RACK not use prr and only pace (must have pacing on)"); @@ -1406,9 +1574,40 @@ rack_init_sysctls(void) OID_AUTO, "autoscale", CTLFLAG_RW, &rack_autosndbuf_inc, 20, "What percentage should rack scale up its snd buffer by?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW, + &rack_rxt_min_rnds, 10, + "Number of rounds needed between RTT clamps due to high loss rates"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW, + &rack_unclamp_round_thresh, 100, + "Number of rounds needed with no loss to unclamp"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW, + &rack_unclamp_rxt_thresh, 5, + "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "clamp_ss_upper", CTLFLAG_RW, + &rack_clamp_ss_upper, 110, + "Clamp percentage ceiling in SS?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "clamp_ca_upper", CTLFLAG_RW, + &rack_clamp_ca_upper, 110, + "Clamp percentage ceiling in CA?"); /* Sack Attacker detection stuff */ SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "merge_out", CTLFLAG_RW, + &rack_merge_out_sacks_on_attack, 0, + "Do we merge the sendmap when we decide we are being attacked?"); + + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), OID_AUTO, "detect_highsackratio", CTLFLAG_RW, &rack_highest_sack_thresh_seen, 0, "Highest sack to ack ratio seen"); @@ -1459,6 +1658,13 @@ rack_init_sysctls(void) OID_AUTO, "reversed", CTLFLAG_RD, &rack_sack_attacks_reversed, "Total number of SACK attackers that were later determined false positive"); + rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "suspect", CTLFLAG_RD, + &rack_sack_attacks_suspect, + "Total number of SACKs that triggered early detection"); + rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), @@ -1472,6 +1678,12 @@ rack_init_sysctls(void) &rack_sack_used_prev_merge, "Total number of times we used the prev merge"); /* Counters */ + rack_total_bytes = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "totalbytes", CTLFLAG_RD, + &rack_total_bytes, + "Total number of bytes sent"); rack_fto_send = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_counters), @@ -1599,6 +1811,18 @@ rack_init_sysctls(void) OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); + rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, + &rack_rxt_clamps_cwnd, + "Number of times that excessive rxt clamped the cwnd down"); + rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, + &rack_rxt_clamps_cwnd_uniq, + "Number of connections that have had excessive rxt clamped the cwnd down"); rack_persists_sends = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1726,49 +1950,6 @@ rack_init_sysctls(void) &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } -static __inline int -rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) -{ - if (SEQ_GEQ(b->r_start, a->r_start) && - SEQ_LT(b->r_start, a->r_end)) { - /* - * The entry b is within the - * block a. i.e.: - * a -- |-------------| - * b -- |----| - * <or> - * b -- |------| - * <or> - * b -- |-----------| - */ - return (0); - } else if (SEQ_GEQ(b->r_start, a->r_end)) { - /* - * b falls as either the next - * sequence block after a so a - * is said to be smaller than b. - * i.e: - * a -- |------| - * b -- |--------| - * or - * b -- |-----| - */ - return (1); - } - /* - * Whats left is where a is - * larger than b. i.e: - * a -- |-------| - * b -- |---| - * or even possibly - * b -- |--------------| - */ - return (-1); -} - -RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); -RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); - static uint32_t rc_init_window(struct tcp_rack *rack) { @@ -1796,14 +1977,282 @@ rack_get_fixed_pacing_bw(struct tcp_rack *rack) return (rack->r_ctl.rc_fixed_pacing_rate_ca); } -static uint64_t -rack_get_bw(struct tcp_rack *rack) +static void +rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, + uint64_t data, uint8_t mod, uint16_t aux, + struct http_sendfile_track *cur) { - if (rack->use_fixed_rate) { - /* Return the fixed pacing rate */ - return (rack_get_fixed_pacing_bw(rack)); +#ifdef TCP_REQUEST_TRK + int do_log = 0; + + /* + * The rate cap one is noisy and only should come out when normal BB logging + * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out + * once per chunk and make up the BBpoint that can be turned on by the client. 
+ */ + if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { + if (rack_verbose_logging != 0) + do_log = tcp_bblogging_on(rack->rc_tp); + else + do_log = 0; + } else + do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); + + if (do_log) { + union tcp_log_stackspecific log; + struct timeval tv; + uint64_t lt_bw; + + /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); + + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.rttProp = tim; + log.u_bbr.bw_inuse = cbw; + log.u_bbr.delRate = rack_get_gp_est(rack); + lt_bw = rack_get_lt_bw(rack); + log.u_bbr.flex1 = seq; + log.u_bbr.pacing_gain = aux; + /* lt_bw = < flex3 | flex2 > */ + log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); + log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); + /* Record the last obtained us rtt in inflight */ + if (cur == NULL) { + /* Make sure we are looking at the right log if an overide comes in */ + cur = rack->r_ctl.rc_last_sft; + } + if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) + log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; + else { + /* Use the last known rtt i.e. the rack-rtt */ + log.u_bbr.inflight = rack->rc_rack_rtt; + } + if (cur != NULL) { + uint64_t off; + + log.u_bbr.cur_del_rate = cur->deadline; + if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { + /* start = < lost | pkt_epoch > */ + log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); + log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); + log.u_bbr.flex6 = cur->start_seq; + log.u_bbr.pkts_out = cur->end_seq; + } else { + /* start = < lost | pkt_epoch > */ + log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); + log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); + /* end = < pkts_out | flex6 > */ + log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); + log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); + } + /* first_send = <lt_epoch | epoch> */ + log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); + log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); + /* localtime = <delivered | applimited>*/ + log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); + log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); + off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]); + log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track)); + log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); + log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); + log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; + } else { + log.u_bbr.flex7 = 0xffff; + log.u_bbr.cur_del_rate = 0xffffffffffffffff; + } + /* + * Compose bbr_state to be a bit wise 0000ADHF + * where A is the always_pace flag + * where D is the dgp_on flag + * where H is the hybrid_mode on flag + * where F is the use_fixed_rate flag. 
+ */ + log.u_bbr.bbr_state = rack->rc_always_pace; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->dgp_on; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->rc_hybrid_mode; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->use_fixed_rate; + log.u_bbr.flex8 = mod; + tcp_log_event(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_HYBRID_PACING_LOG, 0, + 0, &log, false, NULL, __func__, __LINE__, &tv); + + } +#endif +} + +static inline uint64_t +rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) +{ + uint64_t ret_bw, ether; + uint64_t u_segsiz; + + ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); + if (rack->r_is_v6){ +#ifdef INET6 + ether += sizeof(struct ip6_hdr); +#endif + ether += 14; /* eheader size 6+6+2 */ + } else { +#ifdef INET + ether += sizeof(struct ip); +#endif + ether += 14; /* eheader size 6+6+2 */ + } + u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); + ret_bw = bw; + ret_bw *= ether; + ret_bw /= u_segsiz; + return (ret_bw); +} + +static void +rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) +{ +#ifdef TCP_REQUEST_TRK + struct timeval tv; + uint64_t timenow, timeleft, lenleft, lengone, calcbw; +#endif + + if (rack->r_ctl.bw_rate_cap == 0) + return; +#ifdef TCP_REQUEST_TRK + if (rack->rc_catch_up && rack->rc_hybrid_mode && + (rack->r_ctl.rc_last_sft != NULL)) { + /* + * We have a dynamic cap. The original target + * is in bw_rate_cap, but we need to look at + * how long it is until we hit the deadline. + */ + struct http_sendfile_track *ent; + + ent = rack->r_ctl.rc_last_sft; + microuptime(&tv); + timenow = tcp_tv_to_lusectick(&tv); + if (timenow >= ent->deadline) { + /* No time left we do DGP only */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + /* We have the time */ + timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; + if (timeleft < HPTS_MSEC_IN_SEC) { + /* If there is less than a ms left just use DGPs rate */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + /* + * Now lets find the amount of data left to send. + * + * Now ideally we want to use the end_seq to figure out how much more + * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. + */ + if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) { + if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) + lenleft = ent->end_seq - rack->rc_tp->snd_una; + else { + /* TSNH, we should catch it at the send */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + } else { + /* + * The hard way, figure out how much is gone and then + * take that away from the total the client asked for + * (thats off by tls overhead if this is tls). 
+ */ + if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) + lengone = rack->rc_tp->snd_una - ent->start_seq; + else + lengone = 0; + if (lengone < (ent->end - ent->start)) + lenleft = (ent->end - ent->start) - lengone; + else { + /* TSNH, we should catch it at the send */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + } + if (lenleft == 0) { + /* We have it all sent */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent); + if (rack->r_ctl.bw_rate_cap) + goto normal_ratecap; + else + return; + } + calcbw = lenleft * HPTS_USEC_IN_SEC; + calcbw /= timeleft; + /* Now we must compensate for IP/TCP overhead */ + calcbw = rack_compensate_for_linerate(rack, calcbw); + /* Update the bit rate cap */ + rack->r_ctl.bw_rate_cap = calcbw; + if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && + (rack_hybrid_allow_set_maxseg == 1) && + ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { + /* Lets set in a smaller mss possibly here to match our rate-cap */ + uint32_t orig_max; + + orig_max = rack->r_ctl.rc_pace_max_segs; + rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; + rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); + rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); + } + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent); + if ((calcbw > 0) && (*bw > calcbw)) { + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent); + *capped = 1; + *bw = calcbw; + } + return; } - if (rack->r_ctl.gp_bw == 0) { +normal_ratecap: +#endif + if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { +#ifdef TCP_REQUEST_TRK + if (rack->rc_hybrid_mode && + rack->rc_catch_up && + (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && + (rack_hybrid_allow_set_maxseg == 1) && + ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { + /* Lets set in a smaller mss possibly here to match our rate-cap */ + uint32_t orig_max; + + orig_max = rack->r_ctl.rc_pace_max_segs; + rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; + rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); + rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); + } +#endif + *capped = 1; + *bw = rack->r_ctl.bw_rate_cap; + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + *bw, 0, 0, + HYBRID_LOG_RATE_CAP, 1, NULL); + } +} + +static uint64_t +rack_get_gp_est(struct tcp_rack *rack) +{ + uint64_t bw, lt_bw, ret_bw; + + if (rack->rc_gp_filled == 0) { /* * We have yet no b/w measurement, * if we have a user set initial bw @@ -1815,15 +2264,20 @@ rack_get_bw(struct tcp_rack *rack) * so if we have like IW=30, we are not * calculating a "huge" b/w. */ - uint64_t bw, srtt; + uint64_t srtt; + + lt_bw = rack_get_lt_bw(rack); + if (lt_bw) { + /* + * No goodput bw but a long-term b/w does exist + * lets use that. + */ + ret_bw = lt_bw; + goto compensate; + } if (rack->r_ctl.init_rate) return (rack->r_ctl.init_rate); - /* Has the user set a max peak rate? 
*/ -#ifdef NETFLIX_PEAKRATE - if (rack->rc_tp->t_maxpeakrate) - return (rack->rc_tp->t_maxpeakrate); -#endif /* Ok lets come up with the IW guess, if we have a srtt */ if (rack->rc_tp->t_srtt == 0) { /* @@ -1837,32 +2291,71 @@ rack_get_bw(struct tcp_rack *rack) srtt = (uint64_t)rack->rc_tp->t_srtt; bw *= (uint64_t)USECS_IN_SECOND; bw /= srtt; - if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) - bw = rack->r_ctl.bw_rate_cap; - return (bw); + ret_bw = bw; + goto compensate; + + } + if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { + /* Averaging is done, we can return the value */ + bw = rack->r_ctl.gp_bw; } else { - uint64_t bw; + /* Still doing initial average must calculate */ + bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); + } + lt_bw = rack_get_lt_bw(rack); + if (lt_bw == 0) { + /* If we don't have one then equate it to the gp_bw */ + lt_bw = rack->r_ctl.gp_bw; + } + if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){ + /* if clamped take the lowest */ + if (lt_bw < bw) + ret_bw = lt_bw; + else + ret_bw = bw; + } else { + /* If not set for clamped to get lowest, take the highest */ + if (lt_bw > bw) + ret_bw = lt_bw; + else + ret_bw = bw; + } + /* + * Now lets compensate based on the TCP/IP overhead. Our + * Goodput estimate does not include this so we must pace out + * a bit faster since our pacing calculations do. The pacing + * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz + * we are using to do this, so we do that here in the opposite + * direction as well. This means that if we are tunneled and the + * segsiz is say 1200 bytes we will get quite a boost, but its + * compensated for in the pacing time the opposite way. + */ +compensate: + ret_bw = rack_compensate_for_linerate(rack, ret_bw); + return(ret_bw); +} - if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { - /* Averaging is done, we can return the value */ - bw = rack->r_ctl.gp_bw; - } else { - /* Still doing initial average must calculate */ - bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; - } + +static uint64_t +rack_get_bw(struct tcp_rack *rack) +{ + uint64_t bw; + + if (rack->use_fixed_rate) { + /* Return the fixed pacing rate */ + return (rack_get_fixed_pacing_bw(rack)); + } + bw = rack_get_gp_est(rack); #ifdef NETFLIX_PEAKRATE - if ((rack->rc_tp->t_maxpeakrate) && - (bw > rack->rc_tp->t_maxpeakrate)) { - /* The user has set a peak rate to pace at - * don't allow us to pace faster than that. - */ - return (rack->rc_tp->t_maxpeakrate); - } -#endif - if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) - bw = rack->r_ctl.bw_rate_cap; - return (bw); + if ((rack->rc_tp->t_maxpeakrate) && + (bw > rack->rc_tp->t_maxpeakrate)) { + /* The user has set a peak rate to pace at + * don't allow us to pace faster than that. + */ + return (rack->rc_tp->t_maxpeakrate); } +#endif + return (bw); } static uint16_t @@ -1996,9 +2489,28 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, uint64_t bw_est, high_rate; uint64_t gain; - gain = (uint64_t)rack_get_output_gain(rack, rsm); - bw_est = bw * gain; - bw_est /= (uint64_t)100; + if ((rack->r_pacing_discount == 0) || + (rack_full_buffer_discount == 0)) { + /* + * No buffer level based discount from client buffer + * level is enabled or the feature is disabled. 
+ */ + gain = (uint64_t)rack_get_output_gain(rack, rsm); + bw_est = bw * gain; + bw_est /= (uint64_t)100; + } else { + /* + * We have a discount in place apply it with + * just a 100% gain (we get no boost if the buffer + * is full). + */ + uint64_t discount; + + discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm); + discount /= 100; + /* What %% of the b/w do we discount */ + bw_est = bw - discount; + } /* Never fall below the minimum (def 64kbps) */ if (bw_est < RACK_MIN_BW) bw_est = RACK_MIN_BW; @@ -2009,6 +2521,11 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); if (bw_est >= high_rate) { /* We are capping bw at the highest rate table entry */ + if (rack_hw_rate_cap_per && + (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { + rack->r_rack_hw_rate_caps = 0; + goto done; + } rack_log_hdwr_pacing(rack, bw_est, high_rate, __LINE__, 0, 3); @@ -2039,6 +2556,7 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, } } } +done: return (bw_est); } @@ -2049,7 +2567,9 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t union tcp_log_stackspecific log; struct timeval tv; - if ((mod != 1) && (rack_verbose_logging == 0)) { + if (rack->sack_attack_disable > 0) + goto log_anyway; + if ((mod != 1) && (rack_verbose_logging == 0)) { /* * We get 3 values currently for mod * 1 - We are retransmitting and this tells the reason. @@ -2061,6 +2581,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t */ return; } +log_anyway: memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = tsused; log.u_bbr.flex2 = thresh; @@ -2109,9 +2630,11 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; - log.u_bbr.cwnd_gain = rack->rc_has_collapsed; + log.u_bbr.cwnd_gain = rack->rack_deferred_inited; + log.u_bbr.pkt_epoch = rack->rc_has_collapsed; log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; log.u_bbr.lost = rack_rto_min; + log.u_bbr.epoch = rack->r_ctl.roundends; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2350,6 +2873,29 @@ rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time } +static void +rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = idx; + log.u_bbr.flex2 = rack_ts_to_msec(tsv); + log.u_bbr.flex3 = tsecho; + log.u_bbr.flex7 = 3; + log.u_bbr.rttProp = tsv; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_LOG_RTT, 0, + 0, &log, false, &tv); + } +} + static inline void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) @@ -2379,7 +2925,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, } static void -rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) +rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval 
*tv, int line) { if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -2391,6 +2937,9 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ log.u_bbr.flex2 = 0; else log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = rack->r_ctl.ack_during_sd; + log.u_bbr.flex6 = line; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.timeStamp = cts; @@ -2459,6 +3008,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = arg1; log.u_bbr.flex5 = arg2; + log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; log.u_bbr.flex6 = arg3; log.u_bbr.flex8 = frm; log.u_bbr.timeStamp = tcp_get_usecs(&tv); @@ -2658,6 +3208,7 @@ rack_log_sad(struct tcp_rack *rack, int event) log.u_bbr.applimited = tcp_map_minimum; log.u_bbr.flex7 = rack->sack_attack_disable; log.u_bbr.flex8 = event; + log.u_bbr.bbr_state = rack->rc_suspicious; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.delivered = tcp_sad_decay_val; @@ -2673,6 +3224,7 @@ rack_log_sad(struct tcp_rack *rack, int event) static void rack_counter_destroy(void) { + counter_u64_free(rack_total_bytes); counter_u64_free(rack_fto_send); counter_u64_free(rack_fto_rsm_send); counter_u64_free(rack_nfto_resend); @@ -2687,6 +3239,7 @@ rack_counter_destroy(void) counter_u64_free(rack_move_some); counter_u64_free(rack_sack_attacks_detected); counter_u64_free(rack_sack_attacks_reversed); + counter_u64_free(rack_sack_attacks_suspect); counter_u64_free(rack_sack_used_next_merge); counter_u64_free(rack_sack_used_prev_merge); counter_u64_free(rack_tlp_tot); @@ -2705,6 +3258,8 @@ rack_counter_destroy(void) counter_u64_free(rack_alloc_limited_conns); counter_u64_free(rack_split_limited); counter_u64_free(rack_multi_single_eq); + counter_u64_free(rack_rxt_clamps_cwnd); + counter_u64_free(rack_rxt_clamps_cwnd_uniq); counter_u64_free(rack_proc_non_comp_ack); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_restart); @@ -2794,15 +3349,26 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) if (limit_type) { /* currently there is only one limit type */ - if (V_tcp_map_split_limit > 0 && + if (rack->r_ctl.rc_split_limit > 0 && (rack->do_detection == 0) && - rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { + rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { + counter_u64_add(rack_split_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + return (NULL); +#ifdef NETFLIX_EXP_DETECTION + } else if ((tcp_sad_limit != 0) && + (rack->do_detection == 1) && + (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) { counter_u64_add(rack_split_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); +#endif } } @@ -2816,6 +3382,24 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) } static void +rack_free_trim(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + /* + * Free up all the tail entries until + * we get our list down to the limit. 
+ */ + while (rack->rc_free_cnt > rack_free_cache) { + rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); + rack->rc_free_cnt--; + rack->r_ctl.rc_num_maps_alloced--; + uma_zfree(rack_zone, rsm); + } +} + +static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { @@ -2830,13 +3414,8 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) if (rsm == rack->r_ctl.rc_first_appl) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_first_appl = NULL; - else { - /* Follow the next one out */ - struct rack_sendmap fe; - - fe.r_start = rsm->r_nseq_appl; - rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); - } + else + rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); } if (rsm == rack->r_ctl.rc_resend) rack->r_ctl.rc_resend = NULL; @@ -2847,28 +3426,14 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; memset(rsm, 0, sizeof(struct rack_sendmap)); + /* Make sure we are not going to overrun our count limit of 0xff */ + if ((rack->rc_free_cnt + 1) > 0xff) { + rack_free_trim(rack); + } TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); rack->rc_free_cnt++; } -static void -rack_free_trim(struct tcp_rack *rack) -{ - struct rack_sendmap *rsm; - - /* - * Free up all the tail entries until - * we get our list down to the limit. - */ - while (rack->rc_free_cnt > rack_free_cache) { - rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); - TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); - rack->rc_free_cnt--; - uma_zfree(rack_zone, rsm); - } -} - - static uint32_t rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) { @@ -2956,15 +3521,34 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ /* * Has enough time passed for the GP measurement to be valid? */ + if (SEQ_LT(th_ack, tp->gput_seq)) { + /* Not enough bytes yet */ + return (0); + } if ((tp->snd_max == tp->snd_una) || (th_ack == tp->snd_max)){ - /* All is acked */ + /* + * All is acked quality of all acked is + * usually low or medium, but we in theory could split + * all acked into two cases, where you got + * a signifigant amount of your window and + * where you did not. For now we leave it + * but it is something to contemplate in the + * future. The danger here is that delayed ack + * is effecting the last byte (which is a 50:50 chance). + */ *quality = RACK_QUALITY_ALLACKED; return (1); } - if (SEQ_LT(th_ack, tp->gput_seq)) { - /* Not enough bytes yet */ - return (0); + if (SEQ_GEQ(th_ack, tp->gput_ack)) { + /* + * We obtained our entire window of data we wanted + * no matter if we are in recovery or not then + * its ok since expanding the window does not + * make things fuzzy (or at least not as much). + */ + *quality = RACK_QUALITY_HIGH; + return (1); } segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); if (SEQ_LT(th_ack, tp->gput_ack) && @@ -2984,7 +3568,13 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ /* Now what about time? */ srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; - if (tim >= srtts) { + if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { + /* + * We do not allow a measurement if we are in recovery + * that would shrink the goodput window we wanted. 
+ * This is to prevent cloudyness of when the last send + * was actually made. + */ *quality = RACK_QUALITY_HIGH; return (1); } @@ -3188,10 +3778,10 @@ extra_boost: calc = 0xffff; logged |= 1; rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; - if (rack_per_upper_bound_ss && + if (rack->r_ctl.rack_per_upper_bound_ca && (rack->rc_dragged_bottom == 0) && - (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) - rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; + (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) + rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; } if (rack->rc_gp_saw_ca && (rack->rc_gp_saw_ss == 0) && @@ -3203,10 +3793,10 @@ extra_boost: calc = 0xffff; logged |= 2; rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; - if (rack_per_upper_bound_ca && + if (rack->r_ctl.rack_per_upper_bound_ca && (rack->rc_dragged_bottom == 0) && - (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) - rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; + (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) + rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; } if (rack->rc_gp_saw_ss && rack_bw_can_be_raised(rack, cur_bw, last_bw_est, @@ -3216,10 +3806,10 @@ extra_boost: if (calc > 0xffff) calc = 0xffff; rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; - if (rack_per_upper_bound_ss && + if (rack->r_ctl.rack_per_upper_bound_ss && (rack->rc_dragged_bottom == 0) && - (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) - rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; + (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) + rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; logged |= 4; } if (logged && @@ -3569,7 +4159,7 @@ rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) * We need to mark these as app-limited so we * don't collapse the b/w. */ - rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; @@ -3836,7 +4426,7 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last if (rack->r_ctl.rc_no_push_at_mrtt > 1) rack_validate_multipliers_at_or_below_100(rack); rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); - } else if ((last_bw_est < low_bnd) && !losses) { + } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { /* * We are decreasing this is a bit complicated this * means we are loosing ground. This could be @@ -3858,8 +4448,7 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last rack->rc_gp_bwred = 1; rack->rc_gp_timely_dec_cnt = 0; } - if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || - (timely_says == 0)) { + if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { /* * Push another time with a faster pacing * to try to gain back (we include override to @@ -3905,8 +4494,8 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last __LINE__, 3); rack->r_ctl.last_gp_comp_bw = cur_bw; if (rack->rc_gp_saw_ss && - rack_per_upper_bound_ss && - (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { + rack->r_ctl.rack_per_upper_bound_ss && + (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { /* * In cases where we can't go higher * we should just use timely. 
@@ -3914,8 +4503,8 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last goto use_timely; } if (rack->rc_gp_saw_ca && - rack_per_upper_bound_ca && - (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { + rack->r_ctl.rack_per_upper_bound_ca && + (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { /* * In cases where we can't go higher * we should just use timely. @@ -4027,11 +4616,134 @@ rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff return (timely_says); } +static __inline int +rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) +{ + if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && + SEQ_LEQ(rsm->r_end, tp->gput_ack)) { + /** + * This covers the case that the + * resent is completely inside + * the gp range or up to it. + * |----------------| + * |-----| <or> + * |----| + * <or> |---| + */ + return (1); + } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && + SEQ_GT(rsm->r_end, tp->gput_seq)){ + /** + * This covers the case of + * |--------------| + * |-------->| + */ + return (1); + } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && + SEQ_LT(rsm->r_start, tp->gput_ack) && + SEQ_GEQ(rsm->r_end, tp->gput_ack)) { + + /** + * This covers the case of + * |--------------| + * |-------->| + */ + return (1); + } + return (0); +} + +static __inline void +rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) +{ + + if ((tp->t_flags & TF_GPUTINPROG) == 0) + return; + /* + * We have a Goodput measurement in progress. Mark + * the send if its within the window. If its not + * in the window make sure it does not have the mark. + */ + if (rack_in_gp_window(tp, rsm)) + rsm->r_flags |= RACK_IN_GP_WIN; + else + rsm->r_flags &= ~RACK_IN_GP_WIN; +} + +static __inline void +rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* A GP measurement is ending, clear all marks on the send map*/ + struct rack_sendmap *rsm = NULL; + + rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); + if (rsm == NULL) { + rsm = tqhash_min(rack->r_ctl.tqh); + } + /* Nothing left? */ + while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ + rsm->r_flags &= ~RACK_IN_GP_WIN; + rsm = tqhash_next(rack->r_ctl.tqh, rsm); + } +} + + +static __inline void +rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) +{ + struct rack_sendmap *rsm = NULL; + + if (tp->snd_una == tp->snd_max) { + /* Nothing outstanding yet, nothing to do here */ + return; + } + if (SEQ_GT(tp->gput_seq, tp->snd_una)) { + /* + * We are measuring ahead of some outstanding + * data. We need to walk through up until we get + * to gp_seq marking so that no rsm is set incorrectly + * with RACK_IN_GP_WIN. + */ + rsm = tqhash_min(rack->r_ctl.tqh); + while (rsm != NULL) { + rack_mark_in_gp_win(tp, rsm); + if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) + break; + rsm = tqhash_next(rack->r_ctl.tqh, rsm); + } + } + if (rsm == NULL) { + /* + * Need to find the GP seq, if rsm is + * set we stopped as we hit it. + */ + rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); + if (rsm == NULL) + return; + rack_mark_in_gp_win(tp, rsm); + } + /* + * Now we may need to mark already sent rsm, ahead of + * gput_seq in the window since they may have been sent + * *before* we started our measurment. The rsm, if non-null + * has been marked (note if rsm would have been NULL we would have + * returned in the previous block). So we go to the next, and continue + * until we run out of entries or we exceed the gp_ack value. 
+ */ + rsm = tqhash_next(rack->r_ctl.tqh, rsm); + while (rsm) { + rack_mark_in_gp_win(tp, rsm); + if (SEQ_GT(rsm->r_end, tp->gput_ack)) + break; + rsm = tqhash_next(rack->r_ctl.tqh, rsm); + } +} + static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line, uint8_t quality) { - uint64_t tim, bytes_ps, ltim, stim, utim; + uint64_t tim, bytes_ps, stim, utim; uint32_t segsiz, bytes, reqbytes, us_cts; int32_t gput, new_rtt_diff, timely_says; uint64_t resid_bw, subpart = 0, addpart = 0, srtt; @@ -4058,10 +4770,8 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, utim = max(tim, 1); else utim = max(stim, 1); - /* Lets get a msec time ltim too for the old stuff */ - ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); - gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); + rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); if ((tim == 0) && (stim == 0)) { /* * Invalid measurement time, maybe @@ -4171,10 +4881,10 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, } } timely_says = rack_make_timely_judgement(rack, - rack->r_ctl.rc_gp_srtt, - rack->r_ctl.rc_rtt_diff, - rack->r_ctl.rc_prev_gp_srtt - ); + rack->r_ctl.rc_gp_srtt, + rack->r_ctl.rc_rtt_diff, + rack->r_ctl.rc_prev_gp_srtt + ); bytes_ps *= HPTS_USEC_IN_SEC; bytes_ps /= utim; if (bytes_ps > rack->r_ctl.last_max_bw) { @@ -4229,6 +4939,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, /* We have collected enough to move forward */ rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; } + rack_set_pace_segments(tp, rack, __LINE__, NULL); did_add = 3; } else { /* @@ -4334,12 +5045,15 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.gp_bw = resid_bw + addpart; } } + rack_set_pace_segments(tp, rack, __LINE__, NULL); } if ((rack->gp_ready == 0) && (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { /* We have enough measurements now */ rack->gp_ready = 1; - rack_set_cc_pacing(rack); + if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) || + rack->rack_hibeta) + rack_set_cc_pacing(rack); if (rack->defer_options) rack_apply_deferred_options(rack); } @@ -4352,10 +5066,18 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rtt_diff); rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, rack_get_bw(rack), 3, line, NULL, quality); + rack_log_pacing_delay_calc(rack, + bytes, /* flex2 */ + tim, /* flex1 */ + bytes_ps, /* bw_inuse */ + rack->r_ctl.gp_bw, /* delRate */ + rack_get_lt_bw(rack), /* rttProp */ + 20, line, NULL, 0); /* reset the gp srtt and setup the new prev */ rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; /* Record the lost count for the next measurement */ rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; +skip_measurement: /* * We restart our diffs based on the gpsrtt in the * measurement window. @@ -4365,24 +5087,30 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, rack->rc_gp_saw_ca = 0; rack->rc_gp_saw_ss = 0; rack->rc_dragged_bottom = 0; -skip_measurement: + if (quality == RACK_QUALITY_HIGH) { + /* + * Gput in the stats world is in kbps where bytes_ps is + * bytes per second so we do ((x * 8)/ 1000). 
+ */ + gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); #ifdef STATS - stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, - gput); - /* - * XXXLAS: This is a temporary hack, and should be - * chained off VOI_TCP_GPUT when stats(9) grows an - * API to deal with chained VOIs. - */ - if (tp->t_stats_gput_prev > 0) - stats_voi_update_abs_s32(tp->t_stats, - VOI_TCP_GPUT_ND, - ((gput - tp->t_stats_gput_prev) * 100) / - tp->t_stats_gput_prev); + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + gput); + /* + * XXXLAS: This is a temporary hack, and should be + * chained off VOI_TCP_GPUT when stats(9) grows an + * API to deal with chained VOIs. + */ + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); #endif + tp->t_stats_gput_prev = gput; + } tp->t_flags &= ~TF_GPUTINPROG; - tp->t_stats_gput_prev = gput; /* * Now are we app limited now and there is space from where we * were to where we want to go? @@ -4399,7 +5127,7 @@ skip_measurement: /* * Yep there is enough outstanding to make a measurement here. */ - struct rack_sendmap *rsm, fe; + struct rack_sendmap *rsm; rack->r_ctl.rc_gp_lowrtt = 0xffffffff; rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; @@ -4443,8 +5171,8 @@ skip_measurement: * Now we need to find the timestamp of the send at tp->gput_seq * for the send based measurement. */ - fe.r_start = tp->gput_seq; - rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + rack->r_ctl.rc_gp_cumack_ts = 0; + rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); if (rsm) { /* Ok send-based limit is set */ if (SEQ_LT(rsm->r_start, tp->gput_seq)) { @@ -4456,11 +5184,21 @@ skip_measurement: */ tp->gput_seq = rsm->r_start; } - if (rsm->r_flags & RACK_ACKED) + if (rsm->r_flags & RACK_ACKED) { + struct rack_sendmap *nrsm; + tp->gput_ts = (uint32_t)rsm->r_ack_arrival; - else + tp->gput_seq = rsm->r_end; + nrsm = tqhash_next(rack->r_ctl.tqh, rsm); + if (nrsm) + rsm = nrsm; + else { + rack->app_limited_needs_set = 1; + } + } else rack->app_limited_needs_set = 1; - rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + /* We always go from the first send */ + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; } else { /* * If we don't find the rsm due to some @@ -4472,14 +5210,22 @@ skip_measurement: microuptime(&tv); rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); } + rack_tend_gp_marks(tp, rack); rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)rsm, tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, - __LINE__, NULL, quality); + __LINE__, rsm, quality); + rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); + } else { + /* + * To make sure proper timestamp merging occurs, we need to clear + * all GP marks if we don't start a measurement. + */ + rack_clear_gp_marks(tp, rack); } } @@ -4509,6 +5255,20 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); #endif + if ((th_ack == tp->snd_max) && rack->lt_bw_up) { + /* We will ack all, time + * to end any lt_bw_up we + * have running until something + * new is sent. 
+ */ + struct timeval tv; + + rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); + rack->r_ctl.lt_seq = tp->snd_max; + (void)tcp_get_usecs(&tv); + rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); + rack->lt_bw_up = 0; + } quality = RACK_QUALITY_NONE; if ((tp->t_flags & TF_GPUTINPROG) && rack_enough_for_measurement(tp, rack, th_ack, &quality)) { @@ -4546,7 +5306,8 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint } prior_cwnd = tp->snd_cwnd; if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || - (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) + (rack_client_low_buf && rack->client_bufferlvl && + (rack->client_bufferlvl < rack_client_low_buf))) labc_to_use = rack->rc_labc; else labc_to_use = rack_max_abc_post_recovery; @@ -4565,7 +5326,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint log.u_bbr.flex7 = V_tcp_do_newsack; log.u_bbr.flex8 = 1; lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); + 0, &log, false, NULL, __func__, __LINE__,&tv); } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ @@ -4634,6 +5395,348 @@ tcp_rack_partialack(struct tcpcb *tp) rack->r_wanted_output = 1; } +static inline void +rack_set_most_aggr(struct tcp_rack *rack) +{ + rack->r_fill_less_agg = 0; + /* Once the cwnd as been clamped we don't do fill_cw */ + if (rack->r_cwnd_was_clamped == 0) + rack->rc_pace_to_cwnd = 1; + rack->r_pacing_discount = 0; +} + +static inline void +rack_limit_fillcw(struct tcp_rack *rack) +{ + rack->r_fill_less_agg = 1; + /* Once the cwnd as been clamped we don't do fill_cw */ + if (rack->r_cwnd_was_clamped == 0) + rack->rc_pace_to_cwnd = 1; + rack->r_pacing_discount = 0; +} + +static inline void +rack_disable_fillcw(struct tcp_rack *rack) +{ + rack->r_fill_less_agg = 1; + rack->rc_pace_to_cwnd = 0; + rack->r_pacing_discount = 0; +} + +static void +rack_client_buffer_level_set(struct tcp_rack *rack) +{ + /* + * Only if DGP is on do we do anything that + * changes stack behavior. If DGP is off all + * we will do is issue a BB log (if BB logging is + * on) and return. + */ + if (rack->dgp_on == 0) { + rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl, + 0, 0, 0, 30, __LINE__, NULL, 0); + return; + } + if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) { + goto set_most_agg; + } + /* + * We are in DGP so what setting should we + * apply based on where the client is? 
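
The long-term bandwidth bookkeeping introduced a little earlier in this hunk (lt_bw_bytes/lt_bw_time, also suspended and resumed around persist later in the patch) amounts to a stopwatch plus a byte counter. A simplified userland sketch; the struct, function names, and the final bytes-per-second division are assumptions about how the estimate is consumed:

    #include <stdint.h>

    struct lt_bw {
            uint64_t bytes;         /* bytes credited so far */
            uint64_t time;          /* usecs the clock has been running */
            uint64_t timemark;      /* usec timestamp of the last resume */
            uint32_t seq;           /* highest sequence already credited */
            int      running;
    };

    /* Credit everything sent up to snd_max and stop the clock. */
    static void
    lt_bw_suspend(struct lt_bw *lt, uint32_t snd_max, uint64_t now_usec)
    {
            if (!lt->running)
                    return;
            lt->bytes += (uint64_t)(snd_max - lt->seq);
            lt->seq = snd_max;
            lt->time += now_usec - lt->timemark;
            lt->running = 0;
    }

    /* Bytes per second; assumes the stack derives its estimate this way. */
    static uint64_t
    lt_bw_estimate(const struct lt_bw *lt)
    {
            if (lt->time == 0)
                    return (0);
            return ((lt->bytes * 1000000ULL) / lt->time);
    }
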
+ */ + switch(rack->r_ctl.rc_dgp_bl_agg) { + default: + case DGP_LEVEL0: +set_most_agg: + rack_set_most_aggr(rack); + break; + case DGP_LEVEL1: + if (rack->client_bufferlvl == 4) + rack_limit_fillcw(rack); + else if (rack->client_bufferlvl == 5) + rack_disable_fillcw(rack); + else + rack_set_most_aggr(rack); + break; + case DGP_LEVEL2: + if (rack->client_bufferlvl == 3) + rack_limit_fillcw(rack); + else if (rack->client_bufferlvl == 4) + rack_disable_fillcw(rack); + else if (rack->client_bufferlvl == 5) { + rack_disable_fillcw(rack); + rack->r_pacing_discount = 1; + rack->r_ctl.pacing_discount_amm = 1; + } else + rack_set_most_aggr(rack); + break; + case DGP_LEVEL3: + if (rack->client_bufferlvl == 2) + rack_limit_fillcw(rack); + else if (rack->client_bufferlvl == 3) + rack_disable_fillcw(rack); + else if (rack->client_bufferlvl == 4) { + rack_disable_fillcw(rack); + rack->r_pacing_discount = 1; + rack->r_ctl.pacing_discount_amm = 1; + } else if (rack->client_bufferlvl == 5) { + rack_disable_fillcw(rack); + rack->r_pacing_discount = 1; + rack->r_ctl.pacing_discount_amm = 2; + } else + rack_set_most_aggr(rack); + break; + } + rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0, + 0, 0, 30, __LINE__, NULL, 0); +} + +static void +do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* + * Can we unclamp. We unclamp if more than + * N rounds have transpired with no loss. + */ + uint64_t snds, rxts, rxt_per; + uint32_t rnds; + + rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; + if ((rack_unclamp_round_thresh > 0) && + (rnds >= rack_unclamp_round_thresh)) { + snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; + KASSERT ((snds > 0), ("rack:%p tp:%p snds:%lu is 0", rack, tp, snds)); + rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; + rxt_per = rxts * 1000; + rxt_per /= snds; + if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) { + /* Unclamp */ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex3 = rnds; + log.u_bbr.flex4 = rack_unclamp_round_thresh; + log.u_bbr.flex5 = (uint32_t)rxt_per; + log.u_bbr.flex8 = 6; + log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; + log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; + log.u_bbr.applimited = rack->r_ctl.max_clamps; + log.u_bbr.epoch = rack->r_ctl.clamp_options; + log.u_bbr.cur_del_rate = rxts; + log.u_bbr.bw_inuse = rack_get_lt_bw(rack); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); + log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); + tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } + rack->r_ctl.num_of_clamps_applied = 0; + rack->r_cwnd_was_clamped = 0; + rack->excess_rxt_on = 1; + if (rack->r_ctl.clamp_options) { + /* + * We only allow fillcw to be toggled + * if you are setting a max seg too. 
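
The unclamp test above works in tenths of a percent: retransmitted bytes per 1000 bytes sent since the clamp was applied. A condensed restatement; the function and parameter names are mine and the BB-logging side effects are omitted:

    #include <stdint.h>

    /*
     * Unclamp when enough rounds have passed and the retransmit
     * ratio (per mille) has stayed at or below the threshold.
     */
    static int
    should_unclamp(uint32_t rounds_since_clamp, uint64_t bytes_sent,
        uint64_t bytes_rxt, uint32_t round_thresh, uint32_t rxt_thresh_permille)
    {
            uint64_t rxt_per;

            if (round_thresh == 0 || rounds_since_clamp < round_thresh)
                    return (0);
            if (bytes_sent == 0)
                    return (0);     /* avoid divide-by-zero in this sketch */
            rxt_per = (bytes_rxt * 1000) / bytes_sent;
            return (rxt_per <= rxt_thresh_permille);
    }

    /* e.g. 120 rounds, 50 MB sent, 100 kB retransmitted -> 2/1000 -> unclamp. */
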
+ */ + if (rack->r_ctl.clamp_options & 0x1) { + if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { + /* turn on fill cw for non-dgp*/ + rack->rc_pace_to_cwnd = 0; + } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { + /* For DGP we want it off */ + rack->rc_pace_to_cwnd = 1; + } + } + } + if (rack->dgp_on) { + /* Reset all multipliers to 100.0 so just the measured bw */ + /* Crash any per boosts down to 100% */ + rack->r_ctl.rack_per_of_gp_rec = 100; + rack->r_ctl.rack_per_of_gp_ss = 100; + rack->r_ctl.rack_per_of_gp_ca = 100; + /* Set in an upper bound for ss/ca % increase */ + rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; + rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; + } + } + } +} + +static void +do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* + * Rack excess rxt accounting is turned on. If we + * are above a threshold of rxt's in at least N + * rounds, then back off the cwnd and ssthresh + * to fit into the long-term b/w. + */ + uint64_t snds, rxts, rxt_per, lt_bw, bdp; + uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0; + + /* Is it shut off by 0 rounds? */ + if (rack_rxt_min_rnds == 0) + return; + if ((rack->r_ctl.max_clamps > 0) && + (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) { + /* + * The idea, if max_clamps is set, is that if clamping it + * N times did not work again, then there is no sense + * clamping it again. The link is just a lossy link and + * our clamps are doing no good. Turn it off so we don't come + * back here again. + */ + rack->excess_rxt_on = 0; + rack->r_cwnd_was_clamped = 0; + rack->r_ctl.num_of_clamps_applied = 0; + return; + } + snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; + rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; + rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; + /* Has enough rounds progressed for us to re-measure? */ + if ((rnds >= rack_rxt_min_rnds) && + (rack->r_ctl.rxt_threshold > 0)){ + rxt_per = rxts * 1000; + rxt_per /= snds; + if (rxt_per >= rack->r_ctl.rxt_threshold) { + /* + * Action required: + * We are above our excess retransmit level, lets + * cut down the cwnd and ssthresh to match the long-term + * b/w we are getting. + */ + /* First disable scwnd if enabled */ +#ifdef NETFLIX_SHARED_CWND + rack->rack_enable_scwnd = 0; + if (rack->r_ctl.rc_scw) { + uint32_t limit; + + shared_cwnd_was_enabled = 1; + if (rack->r_limit_scw) + limit = max(1, rack->r_ctl.rc_lowest_us_rtt); + else + limit = 0; + tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, + rack->r_ctl.rc_scw_index, + limit); + rack->r_ctl.rc_scw = NULL; + } + +#endif + /* Calculate what the cwnd and ssthresh should be */ + tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT); + lt_bw = rack_get_lt_bw(rack); + if (lt_bw == 0) { + /* + * No lt_bw, lets chop things to one MSS + * and the ssthresh to the iwnd. 
+ */ +reset_to_iw: + new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp)); + } else { + rtt = rack->rc_rack_rtt; + if (rtt == 0) { + /* If we have no rack_rtt drop to the IW situation */ + goto reset_to_iw; + } + bdp = lt_bw * (uint64_t)rtt; + bdp /= HPTS_USEC_IN_SEC; + new_cwnd = (uint32_t)bdp; + new_ssthresh = new_cwnd - 1; + if (new_cwnd < ctf_fixed_maxseg(tp)) { + /* Rock bottom, goto IW settings */ + goto reset_to_iw; + } + } + rack->r_cwnd_was_clamped = 1; + rack->r_ctl.num_of_clamps_applied++; + /* Reset the counter fromn now */ + tp->t_bytes_acked = 0; + /* + * Now what about options? + * We look at the bottom 8 bits: + * F = fill cw bit (toggle it if set) + * S = Segment bits + * M = set max segment bit + * + * SSSS SSMF + */ + if (rack->r_ctl.clamp_options) { + if (rack->r_ctl.clamp_options & 0x1) { + if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { + /* turn on fill cw for non-dgp*/ + rack->rc_pace_to_cwnd = 1; + } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { + /* For DGP we want it off */ + rack->rc_pace_to_cwnd = 0; + } + } + } + if (rack->dgp_on) { + /* Reset all multipliers to 100.0 so just the measured bw */ + /* Crash any per boosts down to 100% */ + rack->r_ctl.rack_per_of_gp_rec = 100; + rack->r_ctl.rack_per_of_gp_ss = 100; + rack->r_ctl.rack_per_of_gp_ca = 100; + /* Set in an upper bound for ss/ca % increase */ + rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper; + rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper; + /* Now move to the lt_bw */ + rack->r_ctl.gp_bw = lt_bw; + rack->rc_gp_filled = 1; + rack->r_ctl.num_measurements = RACK_REQ_AVG; + } + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = new_cwnd; + log.u_bbr.flex2 = new_ssthresh; + log.u_bbr.flex3 = rnds; + log.u_bbr.flex4 = rack_rxt_min_rnds; + log.u_bbr.flex5 = rtt; + log.u_bbr.flex6 = shared_cwnd_was_enabled; + log.u_bbr.flex8 = 5; + log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; + log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; + log.u_bbr.applimited = rack->r_ctl.max_clamps; + log.u_bbr.epoch = rack->r_ctl.clamp_options; + log.u_bbr.cur_del_rate = rxts; + log.u_bbr.delRate = snds; + log.u_bbr.rttProp = rack->r_ctl.rxt_threshold; + log.u_bbr.bw_inuse = lt_bw; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); + log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); + tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } + /* Update our point where we did it */ + if (rack->r_ctl.already_had_a_excess == 0) { + rack->r_ctl.already_had_a_excess = 1; + counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1); + } + counter_u64_add(rack_rxt_clamps_cwnd, 1); + rack->r_ctl.last_sndbytes = tp->t_sndbytes; + rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes; + rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round; + if (new_cwnd < tp->snd_cwnd) + tp->snd_cwnd = new_cwnd; + if (new_ssthresh < tp->snd_ssthresh) + tp->snd_ssthresh = new_ssthresh; + } + } +} + static void rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) { @@ -4672,7 +5775,7 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 
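
The clamp targets computed above come from a bandwidth-delay product of the long-term rate and the RACK RTT, with an initial-window fallback when either input is missing or the result drops below one segment. A simplified sketch; the single-maxseg fallback and the caller-supplied initwnd stand in for the min()/tcp_compute_initwnd() details in the real code:

    #include <stdint.h>

    static void
    clamp_targets(uint64_t lt_bw_bps, uint32_t rtt_usec, uint32_t maxseg,
        uint32_t initwnd, uint32_t *new_cwnd, uint32_t *new_ssthresh)
    {
            uint64_t bdp;

            if (lt_bw_bps == 0 || rtt_usec == 0)
                    goto reset_to_iw;
            bdp = (lt_bw_bps * (uint64_t)rtt_usec) / 1000000ULL;
            if (bdp < maxseg)
                    goto reset_to_iw;       /* rock bottom: fall back to IW */
            *new_cwnd = (uint32_t)bdp;
            *new_ssthresh = *new_cwnd - 1;
            return;
    reset_to_iw:
            *new_cwnd = maxseg;
            *new_ssthresh = initwnd;
    }

    /* 12.5 MB/s over a 40 ms RTT gives a 500 kB BDP, so cwnd clamps near 500000. */
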
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex8 = 2; tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); + 0, &log, false, NULL, __func__, __LINE__, &tv); } if ((rack->rack_no_prr == 0) && (rack->no_prr_addback == 0) && @@ -4707,6 +5810,8 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) rack_log_dsack_event(rack, 1, __LINE__, 0, 0); } EXIT_RECOVERY(tp->t_flags); + if (rack->r_ctl.full_dgp_in_rec) + rack_client_buffer_level_set(rack); } static void @@ -4731,8 +5836,15 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) tp->t_flags &= ~TF_WASFRECOVERY; tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { + if (rack->dgp_on && rack->r_cwnd_was_clamped) { + /* Reset the gains so that on exit we will be softer longer */ + rack->r_ctl.rack_per_of_gp_rec = 100; + rack->r_ctl.rack_per_of_gp_ss = 98; + rack->r_ctl.rack_per_of_gp_ca = 98; + } rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; + rack->r_fast_output = 0; if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 2, in_rec_at_entry, line); @@ -4752,6 +5864,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) SEQ_GEQ(ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); + rack->r_fast_output = 0; tp->snd_recover = tp->snd_max + 1; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; @@ -4760,6 +5873,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; + rack->r_fast_output = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); @@ -4794,6 +5908,8 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) } if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { rack_log_to_prr(rack, 15, cwnd_enter, line); + if (rack->r_ctl.full_dgp_in_rec) + rack_client_buffer_level_set(rack); rack->r_ctl.dsack_byte_cnt = 0; rack->r_ctl.retran_during_recovery = 0; rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; @@ -4880,7 +5996,8 @@ rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) * should be the last segment (which it was not). 
*/ prsm = rsm; - RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { + + TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } @@ -5091,7 +6208,7 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) uint32_t srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; - if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { + if (tqhash_empty(rack->r_ctl.tqh)) { return (NULL); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); @@ -5158,6 +6275,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_ } rack->rc_on_min_to = 0; if ((tp->t_state < TCPS_ESTABLISHED) || + (rack->sack_attack_disable > 0) || ((tp->t_flags & TF_SACK_PERMIT) == 0)) { goto activate_rxt; } @@ -5372,8 +6490,10 @@ activate_tlp: } static void -rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) { + struct timeval tv; + if (rack->rc_in_persist == 0) { if (tp->t_flags & TF_GPUTINPROG) { /* @@ -5389,7 +6509,19 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) rack->rack_scwnd_is_idle = 1; } #endif - rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv); + if (rack->lt_bw_up) { + /* Suspend our LT BW measurement */ + uint64_t tmark; + + rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); + rack->r_ctl.lt_seq = snd_una; + tmark = tcp_tv_to_lusectick(&tv); + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + rack->r_ctl.lt_timemark = tmark; + rack->lt_bw_up = 0; + rack->r_persist_lt_bw_off = 1; + } if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack_timer_cancel(tp, rack, cts, __LINE__); @@ -5406,6 +6538,9 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { + struct timeval tv; + uint32_t t_time; + if (tcp_in_hpts(rack->rc_inp)) { tcp_hpts_remove(rack->rc_inp); rack->r_ctl.rc_hpts_flags = 0; @@ -5416,6 +6551,7 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) rack->rack_scwnd_is_idle = 0; } #endif + t_time = tcp_get_usecs(&tv); if (rack->rc_gp_dyn_mul && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { @@ -5425,7 +6561,7 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) */ uint32_t time_idle, idle_min; - time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; + time_idle = t_time - rack->r_ctl.rc_went_idle_time; idle_min = rack_min_probertt_hold; if (rack_probertt_gpsrtt_cnt_div) { uint64_t extra; @@ -5449,6 +6585,12 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } } } + if (rack->r_persist_lt_bw_off) { + /* Continue where we left off */ + rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); + rack->lt_bw_up = 1; + rack->r_persist_lt_bw_off = 0; + } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_rxtshift = 0; @@ -5600,13 +6742,10 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, rack->r_late = 0; } } - if (slot) { - /* We are pacing too */ - rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; - } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); #ifdef NETFLIX_EXP_DETECTION if (rack->sack_attack_disable && + (rack->r_ctl.ack_during_sd > 0) && (slot < tcp_sad_pacing_interval)) { /* * We have a potential attacker on @@ -5619,6 +6758,8 @@ rack_start_hpts_timer(struct 
tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * micro-seconds, so lets convert to msecs. */ slot = tcp_sad_pacing_interval; + rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); + rack->r_ctl.ack_during_sd = 0; } #endif if (tp->t_flags & TF_DELACK) { @@ -5738,6 +6879,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, */ inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); if (slot) { + rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; rack->r_ctl.rc_last_output_to = us_cts + slot; /* * A pacing timer (slot) is being set, in @@ -5753,10 +6895,27 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * even a SACK should not disturb us (with * the exception of r_rr_config 3). */ - if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && - (rack->r_rr_config != 3)) - inp->inp_flags2 |= INP_DONT_SACK_QUEUE; - if (rack->rc_ack_can_sendout_data) { + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { + if (rack->r_rr_config != 3) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + else if (rack->rc_pace_dnd) { + if (IN_RECOVERY(tp->t_flags)) { + /* + * When DND is on, we only let a sack + * interrupt us if we are not in recovery. + * + * If DND is off, then we never hit here + * and let all sacks wake us up. + * + */ + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + } + } + } + /* For sack attackers we want to ignore sack */ + if (rack->sack_attack_disable == 1) { + inp->inp_flags2 |= (INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); + } else if (rack->rc_ack_can_sendout_data) { /* * Ahh but wait, this is that special case * where the pacing timer can be disturbed @@ -5791,6 +6950,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * but it may change the prr stats so letting it in (the set defaults * at the start of this block) are good enough. */ + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); @@ -5806,7 +6966,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, } rack->rc_tmr_stopped = 0; if (slot) - rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); + rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); } /* @@ -5859,32 +7019,62 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) return (0); } + + static void rack_adjust_orig_mlen(struct rack_sendmap *rsm) { - if (rsm->m->m_len > rsm->orig_m_len) { + + if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { /* - * Mbuf grew, caused by sbcompress, our offset does - * not change. + * The trailing space changed, mbufs can grow + * at the tail but they can't shrink from + * it, KASSERT that. Adjust the orig_m_len to + * compensate for this change. */ - rsm->orig_m_len = rsm->m->m_len; - } else if (rsm->m->m_len < rsm->orig_m_len) { + KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), + ("mbuf:%p rsm:%p trailing_space:%lu ots:%u oml:%u mlen:%u\n", + rsm->m, + rsm, + M_TRAILINGROOM(rsm->m), + rsm->orig_t_space, + rsm->orig_m_len, + rsm->m->m_len)); + rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } + if (rsm->m->m_len < rsm->orig_m_len) { /* * Mbuf shrank, trimmed off the top by an ack, our * offset changes. 
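
rack_adjust_orig_mlen() above makes two compensations: data appended at the mbuf tail shrinks the trailing room, so the remembered length grows by the same amount, and data trimmed from the head by an ACK shrinks m_len, so the cached offset moves back (clamped at zero). A reduced sketch with plain integers standing in for the mbuf fields:

    #include <stdint.h>

    struct rsm_view {
            uint32_t orig_m_len;    /* m_len we last saw */
            uint32_t orig_t_space;  /* trailing room we last saw */
            uint32_t soff;          /* offset of our data inside the mbuf */
    };

    static void
    adjust_orig_mlen(struct rsm_view *rv, uint32_t m_len, uint32_t trailing_room)
    {
            if (trailing_room < rv->orig_t_space) {
                    /* Tail grew: trailing room only ever shrinks. */
                    rv->orig_m_len += rv->orig_t_space - trailing_room;
                    rv->orig_t_space = trailing_room;
            }
            if (m_len < rv->orig_m_len) {
                    /* Head trimmed: pull the offset back, never below zero. */
                    uint32_t trimmed = rv->orig_m_len - m_len;

                    rv->soff = (rv->soff >= trimmed) ? rv->soff - trimmed : 0;
                    rv->orig_m_len = m_len;
            }
    }
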
*/ - rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); + KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), + ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", + rsm->m, rsm->m->m_len, + rsm, rsm->orig_m_len, + rsm->soff)); + if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) + rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); + else + rsm->soff = 0; rsm->orig_m_len = rsm->m->m_len; +#ifdef INVARIANTS + } else if (rsm->m->m_len > rsm->orig_m_len) { + panic("rsm:%p m:%p m_len grew outside of t_space compensation", + rsm, rsm->m); +#endif } } static void -rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) +rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) { struct mbuf *m; uint32_t soff; - if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) { + if (src_rsm->m && + ((src_rsm->orig_m_len != src_rsm->m->m_len) || + (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { /* Fix up the orig_m_len and possibly the mbuf offset */ rack_adjust_orig_mlen(src_rsm); } @@ -5897,10 +7087,25 @@ rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm KASSERT((m != NULL), ("rsm:%p nrsm:%p hit at soff:%u null m", src_rsm, rsm, soff)); + if (m == NULL) { + /* This should *not* happen which is why there is a kassert */ + src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, + (src_rsm->r_start - rack->rc_tp->snd_una), + &src_rsm->soff); + src_rsm->orig_m_len = src_rsm->m->m_len; + src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); + rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, + (rsm->r_start - rack->rc_tp->snd_una), + &rsm->soff); + rsm->orig_m_len = rsm->m->m_len; + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + return; + } } rsm->m = m; rsm->soff = soff; rsm->orig_m_len = m->m_len; + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); } static __inline void @@ -5917,6 +7122,7 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; nrsm->r_rtr_bytes = 0; nrsm->r_fas = rsm->r_fas; + nrsm->r_bas = rsm->r_bas; rsm->r_end = nrsm->r_start; nrsm->r_just_ret = rsm->r_just_ret; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { @@ -5944,7 +7150,7 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); if (rsm->m) - rack_setup_offset_for_rsm(rsm, nrsm); + rack_setup_offset_for_rsm(rack, rsm, nrsm); } static struct rack_sendmap * @@ -5962,9 +7168,6 @@ rack_merge_rsm(struct tcp_rack *rack, * is any reason we need to try to find * the oldest (or last oldest retransmitted). */ -#ifdef INVARIANTS - struct rack_sendmap *rm; -#endif rack_log_map_chg(rack->rc_tp, rack, NULL, l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); l_rsm->r_end = r_rsm->r_end; @@ -5997,21 +7200,40 @@ rack_merge_rsm(struct tcp_rack *rack, if (r_rsm == rack->r_ctl.rc_first_appl) rack->r_ctl.rc_first_appl = l_rsm; } -#ifndef INVARIANTS - (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); -#else - rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); - if (rm != r_rsm) { - panic("removing head in rack:%p rsm:%p rm:%p", - rack, r_rsm, rm); + tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); + /* + * We keep the largest value, which is the newest + * send. We do this in case a segment that is + * joined together and not part of a GP estimate + * later gets expanded into the GP estimate. 
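
The merge rule being described here keeps the newest transmit timestamp and the newest ack-arrival time on the surviving entry. Stripped down to bare timestamps (names are illustrative):

    #include <stdint.h>

    struct merge_times {
            uint64_t last_sent;     /* newest transmit time for the range */
            uint64_t ack_arrival;   /* newest ack arrival seen for it */
    };

    /* Fold the absorbed entry's times into the one that survives. */
    static void
    merge_keep_newest(struct merge_times *keep, const struct merge_times *gone)
    {
            if (keep->last_sent < gone->last_sent)
                    keep->last_sent = gone->last_sent;
            if (keep->ack_arrival < gone->ack_arrival)
                    keep->ack_arrival = gone->ack_arrival;
    }
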
+ * + * We prohibit the merging of unlike kinds i.e. + * all pieces that are in the GP estimate can be + * merged and all pieces that are not in a GP estimate + * can be merged, but not disimilar pieces. Combine + * this with taking the highest here and we should + * be ok unless of course the client reneges. Then + * all bets are off. + */ + if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] < + r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) { + l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]; } -#endif + /* + * When merging two RSM's we also need to consider the ack time and keep + * newest. If the ack gets merged into a measurement then that is the + * one we will want to be using. + */ + if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival) + l_rsm->r_ack_arrival = r_rsm->r_ack_arrival; + if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; l_rsm->r_limit_type = 0; } rack_free(rack, r_rsm); + l_rsm->r_flags |= RACK_MERGED; return (l_rsm); } @@ -6030,9 +7252,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t * Tail Loss Probe. */ struct rack_sendmap *rsm = NULL; -#ifdef INVARIANTS - struct rack_sendmap *insret; -#endif + int insret __diagused; struct socket *so = tptosocket(tp); uint32_t amm; uint32_t out, avail; @@ -6136,7 +7356,7 @@ need_retran: if (rack_always_send_oldest) rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); else { - rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { rsm = rack_find_high_nonack(rack, rsm); } @@ -6149,24 +7369,17 @@ need_retran: } } else { /* - * We must find the last segment - * that was acceptable by the client. + * We had a collapsed window, lets find + * the point before the collapse. */ - RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { - if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { - /* Found one */ - break; - } + if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) + rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); + else { + rsm = tqhash_min(rack->r_ctl.tqh); } if (rsm == NULL) { - /* None? 
if so send the first */ - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - if (rsm == NULL) { -#ifdef TCP_BLACKBOX - tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); -#endif - goto out; - } + /* Huh */ + goto out; } } if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { @@ -6187,11 +7400,10 @@ need_retran: (rsm->r_end - ctf_fixed_maxseg(tp))); rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -6303,6 +7515,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) rack->r_ctl.persist_lost_ends++; } counter_u64_add(rack_persists_sends, 1); + counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); @@ -6414,9 +7627,11 @@ rack_remxt_tmr(struct tcpcb *tp) * back in its seq ordered place. */ TAILQ_INIT(&rack->r_ctl.rc_tmap); - RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { + + TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { rsm->r_dupack = 0; - rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + if (rack_verbose_logging) + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); @@ -6439,7 +7654,7 @@ rack_remxt_tmr(struct tcpcb *tp) rack->r_ctl.rc_agg_early = 0; rack->r_late = 0; /* Clear the tlp rtx mark */ - rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); if (rack->r_ctl.rc_resend != NULL) rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; rack->r_ctl.rc_prr_sndcnt = 0; @@ -6465,46 +7680,7 @@ rack_remxt_tmr(struct tcpcb *tp) static void rack_convert_rtts(struct tcpcb *tp) { - if (tp->t_srtt > 1) { - uint32_t val, frac; - - val = tp->t_srtt >> TCP_RTT_SHIFT; - frac = tp->t_srtt & 0x1f; - tp->t_srtt = TICKS_2_USEC(val); - /* - * frac is the fractional part of the srtt (if any) - * but its in ticks and every bit represents - * 1/32nd of a hz. - */ - if (frac) { - if (hz == 1000) { - frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); - } else { - frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); - } - tp->t_srtt += frac; - } - } - if (tp->t_rttvar) { - uint32_t val, frac; - - val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; - frac = tp->t_rttvar & 0x1f; - tp->t_rttvar = TICKS_2_USEC(val); - /* - * frac is the fractional part of the srtt (if any) - * but its in ticks and every bit represents - * 1/32nd of a hz. 
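
The block being removed here converted the classic scaled srtt (ticks with five fractional bits, i.e. 1/32-tick units) into microseconds; the commit now calls tcp_change_time_units() instead. The raw arithmetic, written out for one value and assuming TICKS_2_USEC is a straight ticks-to-microseconds scaling:

    #include <stdint.h>

    #define RTT_SHIFT       5
    #define RTT_SCALE       (1 << RTT_SHIFT)        /* 32 */

    static uint64_t
    scaled_ticks_to_usec(uint32_t t_srtt, uint32_t hz_val)
    {
            uint64_t whole_ticks = t_srtt >> RTT_SHIFT;
            uint64_t frac = t_srtt & (RTT_SCALE - 1);
            uint64_t usec;

            usec = (whole_ticks * 1000000ULL) / hz_val;
            /* each fractional unit is 1/32 of a tick, i.e. 1/(hz*32) sec */
            usec += (frac * 1000000ULL) / ((uint64_t)hz_val * RTT_SCALE);
            return (usec);
    }

    /* With hz=1000, a stored srtt of 352 (11 whole ticks) comes out as 11000 usec. */
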
- */ - if (frac) { - if (hz == 1000) { - frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); - } else { - frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); - } - tp->t_rttvar += frac; - } - } + tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); tp->t_rxtcur = RACK_REXMTVAL(tp); if (TCPS_HAVEESTABLISHED(tp->t_state)) { tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); @@ -6613,7 +7789,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) (TP_KEEPINIT(tp) != 0)) { struct rack_sendmap *rsm; - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_min(rack->r_ctl.tqh); if (rsm) { /* Ok we have something outstanding to test keepinit with */ if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && @@ -6891,8 +8067,11 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 } /* * Ok our timer went off early and we are not paced false - * alarm, go back to sleep. + * alarm, go back to sleep. We make sure we don't have + * no-sack wakeup on since we no longer have a PKT_OUTPUT + * flag in place. */ + rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; ret = -3; left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left)); @@ -6971,23 +8150,20 @@ rack_stopall(struct tcpcb *tp) } static void -rack_stop_all_timers(struct tcpcb *tp) +rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) { - struct tcp_rack *rack; - /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ - rack = (struct tcp_rack *)tp->t_fb_ptr; rack->rc_in_persist = 1; } } static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) + struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz) { int32_t idx; @@ -7019,8 +8195,11 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } + /* Lets make sure it really is in or not the GP window */ + rack_mark_in_gp_win(tp, rsm); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; + rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); /* Take off the must retransmit flag, if its on */ if (rsm->r_flags & RACK_MUST_RXT) { if (rack->r_must_retran) @@ -7035,6 +8214,8 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, } rsm->r_flags &= ~RACK_MUST_RXT; } + /* Remove any collapsed flag */ + rsm->r_flags &= ~RACK_RWND_COLLAPSED; if (rsm->r_flags & RACK_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; @@ -7044,16 +8225,14 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) + struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end. */ struct rack_sendmap *nrsm; -#ifdef INVARIANTS - struct rack_sendmap *insret; -#endif + int insret __diagused; uint32_t c_end; int32_t len; @@ -7064,7 +8243,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, * We retransmitted the whole piece or more than the whole * slopping into the next rsm. 
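
The r_bas value assigned above is a ceiling division: how many segsiz-sized segments the send spans, counting any trailing partial segment as one more. A one-line helper for clarity:

    #include <stdint.h>

    static uint8_t
    blocks_at_send(uint32_t len, uint32_t segsiz)
    {
            return ((uint8_t)((len + segsiz - 1) / segsiz));
    }

    /* 4000 bytes at a 1448-byte segsiz is 3 blocks: two full plus one partial. */
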
*/ - rack_update_rsm(tp, rack, rsm, ts, add_flag); + rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); if (c_end == rsm->r_end) { *lenp = 0; return (0); @@ -7101,11 +8280,10 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, nrsm->r_dupack = 0; rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -7114,7 +8292,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); - rack_update_rsm(tp, rack, rsm, ts, add_flag); + rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); /* Log a split of rsm into rsm and nrsm */ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); *lenp = 0; @@ -7124,13 +8302,13 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, - struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls) + struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, + uint32_t s_moff, int hw_tls, int segsiz) { struct tcp_rack *rack; - struct rack_sendmap *rsm, *nrsm, fe; -#ifdef INVARIANTS - struct rack_sendmap *insret; -#endif + struct rack_sendmap *rsm, *nrsm; + int insret __diagused; + register uint32_t snd_max, snd_una; /* @@ -7216,6 +8394,7 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, /* First question is it a retransmission or new? */ if (seq_out == snd_max) { /* Its new */ + rack_chk_http_and_hybrid_on_out(rack, seq_out, len, cts); again: rsm = rack_alloc(rack); if (rsm == NULL) { @@ -7241,6 +8420,7 @@ again: } rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; + rack_mark_in_gp_win(tp, rsm); rsm->r_dupack = 0; /* * save off the mbuf location that @@ -7280,17 +8460,20 @@ again: rsm->m = lm; } rsm->orig_m_len = rsm->m->m_len; - } else + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } else { rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + } + rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* Log a new rsm */ rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + (void)tqhash_insert(rack->r_ctl.tqh, rsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -7306,7 +8489,7 @@ again: (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { struct rack_sendmap *prsm; - prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + prsm = tqhash_prev(rack->r_ctl.tqh, rsm); if (prsm) prsm->r_one_out_nr = 1; } @@ -7315,7 +8498,6 @@ again: /* * If we reach here its a retransmission and we need to find it. 
*/ - memset(&fe, 0, sizeof(fe)); more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; @@ -7325,7 +8507,7 @@ more: rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { - seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); + seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); if (len == 0) { return; } else { @@ -7334,11 +8516,10 @@ more: } /* Ok it was not the last pointer go through it the hard way. */ refind: - fe.r_start = seq_out; - rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + rsm = tqhash_find(rack->r_ctl.tqh, seq_out); if (rsm) { if (rsm->r_start == seq_out) { - seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); + seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); if (len == 0) { return; } else { @@ -7353,7 +8534,7 @@ refind: */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { - rack_update_rsm(tp, rack, rsm, cts, add_flag); + rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); return; } /* @@ -7363,11 +8544,10 @@ refind: rack_clone_rsm(rack, nrsm, rsm, seq_out); rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -7376,7 +8556,7 @@ refind: nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); - seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); + seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); if (len == 0) { return; } else if (len > 0) @@ -7394,7 +8574,7 @@ refind: printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); - RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { + TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } @@ -7465,7 +8645,7 @@ tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t u rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; rack->r_ctl.rack_rs.confidence = confidence; rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; - } else if (confidence || rack->r_ctl.rack_rs.confidence) { + } else if (confidence != 0) { /* * Once we have a confident number, * we can update it with a smaller @@ -7625,7 +8805,7 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) /* Send in the microsecond rtt has close to the path RTT as we can get */ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); } - + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); #endif /* * the retransmit should happen at rtt + 4 * rttvar. 
Because of the @@ -7795,9 +8975,13 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); } } - if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + if ((rack->r_ctl.rc_rack_tmit_time == 0) || + (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, + (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + if (rack->r_ctl.rc_rack_tmit_time == 0) + rack->r_ctl.rc_rack_tmit_time = 1; rack->rc_rack_rtt = t; } return (1); @@ -7858,10 +9042,13 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rack_min_rtt = 1; } } - if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + if ((rack->r_ctl.rc_rack_tmit_time == 0) || + (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, + (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + if (rack->r_ctl.rc_rack_tmit_time == 0) + rack->r_ctl.rc_rack_tmit_time = 1; rack->rc_rack_rtt = t; } rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); @@ -7870,6 +9057,12 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, return (1); } } + /* If we are logging log out the sendmap */ + if (tcp_bblogging_on(rack->rc_tp)) { + for (i = 0; i < rsm->r_rtr_cnt; i++) { + rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); + } + } goto ts_not_found; } else { /* @@ -7906,9 +9099,13 @@ ts_not_found: rack->r_ctl.rc_rack_min_rtt = 1; } } - if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { + if ((rack->r_ctl.rc_rack_tmit_time == 0) || + (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, + (uint32_t)rsm->r_tim_lastsent[i]))) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; + if (rack->r_ctl.rc_rack_tmit_time == 0) + rack->r_ctl.rc_rack_tmit_time = 1; rack->rc_rack_rtt = t; } return (1); @@ -7969,6 +9166,7 @@ rack_need_set_test(struct tcpcb *tp, int line, int use_which) { + struct rack_sendmap *s_rsm; if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(rsm->r_end, tp->gput_seq)) { @@ -8006,26 +9204,24 @@ rack_need_set_test(struct tcpcb *tp, * up some of the rsm, we set RACK_USE_BEG * since whats at r_start (i.e. th_ack) * is left unacked and thats where the - * measurement not starts. + * measurement now starts. */ tp->gput_seq = rsm->r_start; - rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; } if ((use_which == RACK_USE_END) && SEQ_GEQ(rsm->r_end, tp->gput_seq)) { - /* - * We use the end when the cumack - * is moving forward and completely - * deleting the rsm passed so basically - * r_end holds th_ack. - * - * For SACK's we also want to use the end - * since this piece just got sacked and - * we want to target anything after that - * in our measurement. - */ - tp->gput_seq = rsm->r_end; - rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + /* + * We use the end when the cumack + * is moving forward and completely + * deleting the rsm passed so basically + * r_end holds th_ack. + * + * For SACK's we also want to use the end + * since this piece just got sacked and + * we want to target anything after that + * in our measurement. 
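
The re-anchoring of gput_seq in rack_need_set_test() picks the entry's start on a partial cumack (USE_BEG), its end on a full cumack or a SACK (USE_END), or whichever of th_ack and r_end is further along (the *_OR_THACK case that follows just below). A compact restatement with constant names local to the sketch:

    #include <stdint.h>

    #define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

    enum use_which { USE_BEG, USE_END, USE_END_OR_THACK };

    static uint32_t
    new_gput_seq(enum use_which which, uint32_t r_start, uint32_t r_end,
        uint32_t th_ack)
    {
            switch (which) {
            case USE_BEG:
                    return (r_start);
            case USE_END:
                    return (r_end);
            case USE_END_OR_THACK:
            default:
                    return (SEQ_GEQ(th_ack, r_end) ? th_ack : r_end);
            }
    }
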
+ */ + tp->gput_seq = rsm->r_end; } if (use_which == RACK_USE_END_OR_THACK) { /* @@ -8038,7 +9234,28 @@ rack_need_set_test(struct tcpcb *tp, tp->gput_seq = th_ack; else tp->gput_seq = rsm->r_end; - rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + } + if (SEQ_LT(tp->gput_seq, tp->snd_max)) + s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); + else + s_rsm = NULL; + /* + * Pick up the correct send time if we can the rsm passed in + * may be equal to s_rsm if the RACK_USE_BEG was set. For the other + * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will + * find a different seq i.e. the next send up. + * + * If that has not been sent, s_rsm will be NULL and we must + * arrange it so this function will get called again by setting + * app_limited_needs_set. + */ + if (s_rsm) + rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; + else { + /* If we hit here we have to have *not* sent tp->gput_seq */ + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; + /* Set it up so we will go through here again */ + rack->app_limited_needs_set = 1; } if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { /* @@ -8060,7 +9277,10 @@ rack_need_set_test(struct tcpcb *tp, (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) rack->measure_saw_probe_rtt = 0; rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, - seq, tp->gput_seq, 0, 5, line, NULL, 0); + seq, tp->gput_seq, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | + (uint64_t)rack->r_ctl.rc_gp_output_ts), + 5, line, NULL, 0); if (rack->rc_gp_filled && ((tp->gput_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * @@ -8077,7 +9297,10 @@ rack_need_set_test(struct tcpcb *tp, */ tp->t_flags &= ~TF_GPUTINPROG; rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, - 0, 0, 0, 6, __LINE__, NULL, 0); + 0, 0, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | + (uint64_t)rack->r_ctl.rc_gp_output_ts), + 6, __LINE__, NULL, 0); } else { /* * Reset the window further out. @@ -8085,6 +9308,8 @@ rack_need_set_test(struct tcpcb *tp, tp->gput_ack = tp->gput_seq + ideal_amount; } } + rack_tend_gp_marks(tp, rack); + rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); } } @@ -8104,23 +9329,43 @@ is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm } + static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, - struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) + struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, + int *no_extra, + int *moved_two, uint32_t segsiz) { uint32_t start, end, changed = 0; struct rack_sendmap stack_map; - struct rack_sendmap *rsm, *nrsm, fe, *prev, *next; -#ifdef INVARIANTS - struct rack_sendmap *insret; -#endif + struct rack_sendmap *rsm, *nrsm, *prev, *next; + int insret __diagused; int32_t used_ref = 1; int moved = 0; +#ifdef NETFLIX_EXP_DETECTION + int allow_segsiz; + int first_time_through = 1; +#endif + int noextra = 0; + int can_use_hookery = 0; start = sack->start; end = sack->end; rsm = *prsm; - memset(&fe, 0, sizeof(fe)); +#ifdef NETFLIX_EXP_DETECTION + /* + * There are a strange number of proxys and meddle boxes in the world + * that seem to cut up segments on different boundaries. This gets us + * smaller sacks that are still ok in terms of it being an attacker. + * We use the base segsiz to calculate an allowable smallness but + * also enforce a min on the segsiz in case it is an attacker playing + * games with MSS. 
So basically if the sack arrives and it is + * larger than a worse case 960 bytes, we don't classify the guy + * as supicious. + */ + allow_segsiz = max(segsiz, 1200) * sad_seg_size_per; + allow_segsiz /= 1000; +#endif do_rest_ofb: if ((rsm == NULL) || (SEQ_LT(end, rsm->r_start)) || @@ -8131,14 +9376,106 @@ do_rest_ofb: * find the correct spot in the tree. */ used_ref = 0; - fe.r_start = start; - rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + rsm = tqhash_find(rack->r_ctl.tqh, start); moved++; } if (rsm == NULL) { /* TSNH */ goto out; } +#ifdef NETFLIX_EXP_DETECTION + /* Now we must check for suspicous activity */ + if ((first_time_through == 1) && + ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) && + ((rsm->r_flags & RACK_PMTU_CHG) == 0) && + ((rsm->r_flags & RACK_TLP) == 0)) { + /* + * Its less than a full MSS or the segment being acked + * this should only happen if the rsm in question had the + * r_just_ret flag set <and> the end matches the end of + * the rsm block. + * + * Note we do not look at segments that have had TLP's on + * them since we can get un-reported rwnd collapses that + * basically we TLP on and then we get back a sack block + * that goes from the start to only a small way. + * + */ + int loss, ok; + + ok = 0; + if (SEQ_GEQ(end, rsm->r_end)) { + if (rsm->r_just_ret == 1) { + /* This was at the end of a send which is ok */ + ok = 1; + } else { + /* A bit harder was it the end of our segment */ + int segs, len; + + len = (rsm->r_end - rsm->r_start); + segs = len / segsiz; + segs *= segsiz; + if ((segs + (rsm->r_end - start)) == len) { + /* + * So this last bit was the + * end of our send if we cut it + * up into segsiz pieces so its ok. + */ + ok = 1; + } + } + } + if (ok == 0) { + /* + * This guy is doing something suspicious + * lets start detection. + */ + if (rack->rc_suspicious == 0) { + tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT); + counter_u64_add(rack_sack_attacks_suspect, 1); + rack->rc_suspicious = 1; + rack_log_sad(rack, 4); + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = end; + log.u_bbr.flex2 = start; + log.u_bbr.flex3 = rsm->r_end; + log.u_bbr.flex4 = rsm->r_start; + log.u_bbr.flex5 = segsiz; + log.u_bbr.flex6 = rsm->r_fas; + log.u_bbr.flex7 = rsm->r_bas; + log.u_bbr.flex8 = 5; + log.u_bbr.pkts_out = rsm->r_flags; + log.u_bbr.bbr_state = rack->rc_suspicious; + log.u_bbr.bbr_substate = rsm->r_just_ret; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_SAD_DETECTION, 0, + 0, &log, false, &tv); + } + } + /* You loose some ack count every time you sack + * a small bit that is not butting to the end of + * what we have sent. This is because we never + * send small bits unless its the end of the sb. + * Anyone sending a sack that is not at the end + * is thus very very suspicious. + */ + loss = (segsiz/2) / (end - start); + if (loss < rack->r_ctl.ack_count) + rack->r_ctl.ack_count -= loss; + else + rack->r_ctl.ack_count = 0; + } + } + first_time_through = 0; +#endif /* Ok we have an ACK for some piece of this rsm */ if (rsm->r_start != start) { if ((rsm->r_flags & RACK_ACKED) == 0) { @@ -8198,8 +9535,30 @@ do_rest_ofb: * the next guy and it is already sacked. 
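
The small-SACK screen above boils down to a scaled size threshold plus an ack-count penalty whose size grows as the sacked span shrinks. A condensed restatement; the function and parameter names are mine, and the TLP/PMTU-change and end-of-send exemptions are folded into a single flag:

    #include <stdint.h>

    #define MAX(a, b)       ((a) > (b) ? (a) : (b))
    #define MIN(a, b)       ((a) < (b) ? (a) : (b))

    /*
     * Returns the new ack_count after screening one SACK block of
     * 'sack_len' bytes against an entry of 'rsm_len' bytes.
     */
    static uint32_t
    small_sack_penalty(uint32_t sack_len, uint32_t segsiz, uint32_t rsm_len,
        uint32_t sad_per_mille, int at_end_of_send, uint32_t ack_count)
    {
            uint32_t allow, loss;

            allow = (MAX(segsiz, 1200) * sad_per_mille) / 1000;
            if (sack_len == 0 || at_end_of_send ||
                sack_len >= MIN(rsm_len, allow))
                    return (ack_count);     /* not suspicious */
            /* The smaller the sacked span, the bigger the ack-count drain. */
            loss = (segsiz / 2) / sack_len;
            return (loss < ack_count ? ack_count - loss : 0);
    }
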
* */ - next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - if (next && (next->r_flags & RACK_ACKED) && + /* + * Hookery can only be used if the two entries + * are in the same bucket and neither one of + * them staddle the bucket line. + */ + next = tqhash_next(rack->r_ctl.tqh, rsm); + if (next && + (rsm->bindex == next->bindex) && + ((rsm->r_flags & RACK_STRADDLE) == 0) && + ((next->r_flags & RACK_STRADDLE) == 0) && + (rsm->r_flags & RACK_IN_GP_WIN) && + (next->r_flags & RACK_IN_GP_WIN)) + can_use_hookery = 1; + else if (next && + (rsm->bindex == next->bindex) && + ((rsm->r_flags & RACK_STRADDLE) == 0) && + ((next->r_flags & RACK_STRADDLE) == 0) && + ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && + ((next->r_flags & RACK_IN_GP_WIN) == 0)) + can_use_hookery = 1; + else + can_use_hookery = 0; + if (next && can_use_hookery && + (next->r_flags & RACK_ACKED) && SEQ_GEQ(end, next->r_start)) { /** * So the next one is already acked, and @@ -8218,13 +9577,44 @@ do_rest_ofb: * use to update all the gizmos. */ /* Copy up our fudge block */ + noextra++; nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); /* Now adjust our tree blocks */ rsm->r_end = start; next->r_start = start; + rsm->r_flags |= RACK_SHUFFLED; + next->r_flags |= RACK_SHUFFLED; /* Now we must adjust back where next->m is */ - rack_setup_offset_for_rsm(rsm, next); + rack_setup_offset_for_rsm(rack, rsm, next); + /* + * Which timestamp do we keep? It is rather + * important in GP measurements to have the + * accurate end of the send window. + * + * We keep the largest value, which is the newest + * send. We do this in case a segment that is + * joined together and not part of a GP estimate + * later gets expanded into the GP estimate. + * + * We prohibit the merging of unlike kinds i.e. + * all pieces that are in the GP estimate can be + * merged and all pieces that are not in a GP estimate + * can be merged, but not disimilar pieces. Combine + * this with taking the highest here and we should + * be ok unless of course the client reneges. Then + * all bets are off. + */ + if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < + nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) + next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; + /* + * And we must keep the newest ack arrival time. 
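
The "hookery" guard used here (and again for the prev-merge case further down) only allows two neighbouring entries to be edge-shifted into each other when they sit in the same hash bucket, neither straddles a bucket boundary, and both are on the same side of the goodput-window marking. Expressed as a plain predicate with illustrative parameters:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    can_use_hookery(uint32_t bindex_a, uint32_t bindex_b,
        bool a_straddles, bool b_straddles, bool a_in_gp_win, bool b_in_gp_win)
    {
            if (bindex_a != bindex_b)
                    return (false);
            if (a_straddles || b_straddles)
                    return (false);
            return (a_in_gp_win == b_in_gp_win);
    }
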
+ */ + if (next->r_ack_arrival < + rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) + next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); + /* We don't need to adjust rsm, it did not change */ /* Clear out the dup ack count of the remainder */ @@ -8238,9 +9628,14 @@ do_rest_ofb: if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); + /* You get a count for acking a whole segment or more */ + if ((nrsm->r_end - nrsm->r_start) >= segsiz) + rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } /* * Now we want to go up from rsm (the @@ -8271,7 +9666,7 @@ do_rest_ofb: counter_u64_add(rack_sack_used_next_merge, 1); /* Postion for the next block */ start = next->r_end; - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); + rsm = tqhash_next(rack->r_ctl.tqh, next); if (rsm == NULL) goto out; } else { @@ -8302,13 +9697,13 @@ do_rest_ofb: } counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, start); + moved++; rsm->r_just_ret = 0; #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -8327,12 +9722,12 @@ do_rest_ofb: moved++; if (end == rsm->r_end) { /* Done with block */ - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + rsm = tqhash_next(rack->r_ctl.tqh, rsm); goto out; } else if (SEQ_LT(end, rsm->r_end)) { /* A partial sack to a already sacked block */ moved++; - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + rsm = tqhash_next(rack->r_ctl.tqh, rsm); goto out; } else { /* @@ -8341,7 +9736,7 @@ do_rest_ofb: * next block. */ start = rsm->r_end; - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + rsm = tqhash_next(rack->r_ctl.tqh, rsm); if (rsm == NULL) goto out; } @@ -8397,6 +9792,9 @@ do_rest_ofb: } rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); + /* You get a count for acking a whole segment or more */ + if ((rsm->r_end - rsm->r_start) >= segsiz) + rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ rack_log_sack_passed(tp, rack, rsm); @@ -8404,6 +9802,8 @@ do_rest_ofb: if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); @@ -8426,7 +9826,7 @@ do_rest_ofb: * There is more not coverend by this rsm move on * to the next block in the RB tree. 
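
Alongside the penalty sketched earlier, each newly sacked range that covers at least one full segment credits the ack counter, one count per segsiz-sized chunk, matching the ack_count updates in this function. A one-liner for completeness:

    #include <stdint.h>

    static uint32_t
    ack_credit_for_range(uint32_t range_len, uint32_t segsiz, uint32_t ack_count)
    {
            if (range_len >= segsiz)
                    ack_count += range_len / segsiz;
            return (ack_count);
    }
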
*/ - nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + nrsm = tqhash_next(rack->r_ctl.tqh, rsm); start = rsm->r_end; rsm = nrsm; if (rsm == NULL) @@ -8478,8 +9878,30 @@ do_rest_ofb: rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); } } - prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + /* + * Hookery can only be used if the two entries + * are in the same bucket and neither one of + * them staddle the bucket line. + */ + prev = tqhash_prev(rack->r_ctl.tqh, rsm); if (prev && + (rsm->bindex == prev->bindex) && + ((rsm->r_flags & RACK_STRADDLE) == 0) && + ((prev->r_flags & RACK_STRADDLE) == 0) && + (rsm->r_flags & RACK_IN_GP_WIN) && + (prev->r_flags & RACK_IN_GP_WIN)) + can_use_hookery = 1; + else if (prev && + (rsm->bindex == prev->bindex) && + ((rsm->r_flags & RACK_STRADDLE) == 0) && + ((prev->r_flags & RACK_STRADDLE) == 0) && + ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && + ((prev->r_flags & RACK_IN_GP_WIN) == 0)) + can_use_hookery = 1; + else + can_use_hookery = 0; + + if (prev && can_use_hookery && (prev->r_flags & RACK_ACKED)) { /** * Goal, we want the right remainder of rsm to shrink @@ -8498,22 +9920,55 @@ do_rest_ofb: * Note if either prev/rsm is a TLP we don't * do this. */ + noextra++; nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); prev->r_end = end; rsm->r_start = end; + rsm->r_flags |= RACK_SHUFFLED; + prev->r_flags |= RACK_SHUFFLED; /* Now adjust nrsm (stack copy) to be * the one that is the small * piece that was "sacked". */ nrsm->r_end = end; rsm->r_dupack = 0; + /* + * Which timestamp do we keep? It is rather + * important in GP measurements to have the + * accurate end of the send window. + * + * We keep the largest value, which is the newest + * send. We do this in case a segment that is + * joined together and not part of a GP estimate + * later gets expanded into the GP estimate. + * + * We prohibit the merging of unlike kinds i.e. + * all pieces that are in the GP estimate can be + * merged and all pieces that are not in a GP estimate + * can be merged, but not disimilar pieces. Combine + * this with taking the highest here and we should + * be ok unless of course the client reneges. Then + * all bets are off. + */ + if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < + nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { + prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + } + /* + * And we must keep the newest ack arrival time. + */ + + if(prev->r_ack_arrival < + rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) + prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* * Now that the rsm has had its start moved forward * lets go ahead and get its new place in the world. 
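When an already-acked neighbor absorbs part of an entry, the code above keeps the newest last-send time and the newest ack-arrival time on the surviving entry, so a goodput window that later expands still sees the latest send. A standalone condensation of that retention rule (field names are illustrative, not the rack_sendmap members):

#include <stdint.h>

struct merge_times {            /* illustrative subset of a sendmap entry */
	uint64_t last_sent;     /* newest transmit time of the range */
	uint64_t ack_arrival;   /* time the covering ack arrived     */
};

/* Keep the newest send and the newest ack arrival on the survivor. */
static void
keep_newest_times(struct merge_times *survivor,
    const struct merge_times *absorbed, uint64_t now_usec)
{
	if (survivor->last_sent < absorbed->last_sent)
		survivor->last_sent = absorbed->last_sent;
	if (survivor->ack_arrival < now_usec)
		survivor->ack_arrival = now_usec;
}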
*/ - rack_setup_offset_for_rsm(prev, rsm); + rack_setup_offset_for_rsm(rack, prev, rsm); /* * Now nrsm is our new little piece * that is acked (which was merged @@ -8524,9 +9979,15 @@ do_rest_ofb: if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); + /* You get a count for acking a whole segment or more */ + if ((nrsm->r_end - nrsm->r_start) >= segsiz) + rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); + rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); rsm = prev; @@ -8550,32 +10011,32 @@ do_rest_ofb: */ if (rack->rc_last_tlp_acked_set && (is_rsm_inside_declared_tlp_block(rack, rsm))) { - /* - * We already turned this on since this block is inside - * the previous one was a partially sack now we - * are getting another one (maybe all of it). - */ - rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); - /* - * Lets make sure we have all of it though. - */ - if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { - rack->r_ctl.last_tlp_acked_start = rsm->r_start; - rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, - rack->r_ctl.last_tlp_acked_end); - } - if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { - rack->r_ctl.last_tlp_acked_end = rsm->r_end; - rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, - rack->r_ctl.last_tlp_acked_end); - } - } else { - rack->r_ctl.last_tlp_acked_start = rsm->r_start; - rack->r_ctl.last_tlp_acked_end = rsm->r_end; - rack->rc_last_tlp_acked_set = 1; - rack->rc_last_tlp_past_cumack = 0; - rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); - } + /* + * We already turned this on since this block is inside + * the previous one was a partially sack now we + * are getting another one (maybe all of it). + */ + rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); + /* + * Lets make sure we have all of it though. 
+ */ + if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { + rack->r_ctl.last_tlp_acked_start = rsm->r_start; + rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, + rack->r_ctl.last_tlp_acked_end); + } + if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { + rack->r_ctl.last_tlp_acked_end = rsm->r_end; + rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, + rack->r_ctl.last_tlp_acked_end); + } + } else { + rack->r_ctl.last_tlp_acked_start = rsm->r_start; + rack->r_ctl.last_tlp_acked_end = rsm->r_end; + rack->rc_last_tlp_acked_set = 1; + rack->rc_last_tlp_past_cumack = 0; + rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); + } } /** * In this case nrsm becomes @@ -8597,14 +10058,14 @@ do_rest_ofb: */ counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, end); + moved++; rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_just_ret = 0; #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:% rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -8616,6 +10077,10 @@ do_rest_ofb: rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); + /* You get a count for acking a whole segment or more */ + if ((rsm->r_end - rsm->r_start) >= segsiz) + rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); + rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ rack_log_sack_passed(tp, rack, rsm); @@ -8623,6 +10088,8 @@ do_rest_ofb: if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); @@ -8650,26 +10117,58 @@ out: * with either the previous or * next block? */ - next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + next = tqhash_next(rack->r_ctl.tqh, rsm); while (next) { if (next->r_flags & RACK_TLP) break; + /* Only allow merges between ones in or out of GP window */ + if ((next->r_flags & RACK_IN_GP_WIN) && + ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { + break; + } + if ((rsm->r_flags & RACK_IN_GP_WIN) && + ((next->r_flags & RACK_IN_GP_WIN) == 0)) { + break; + } + if (rsm->bindex != next->bindex) + break; + if (rsm->r_flags & RACK_STRADDLE) + break; + if (next->r_flags & RACK_STRADDLE) + break; if (next->r_flags & RACK_ACKED) { - /* yep this and next can be merged */ + /* yep this and next can be merged */ rsm = rack_merge_rsm(rack, rsm, next); - next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + noextra++; + next = tqhash_next(rack->r_ctl.tqh, rsm); } else break; } /* Now what about the previous? 
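The re-indented block above widens the remembered TLP range whenever another SACK touches it, so the tracked [last_tlp_acked_start, last_tlp_acked_end) interval only ever grows until the cum-ack moves past it. A compact model of that expansion, using the usual sequence-space comparisons:

#include <stdint.h>
#include <stdbool.h>

/* Sequence-space compares mirroring SEQ_LT/SEQ_GT. */
#define SEQ_LT(a, b)  ((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)  ((int32_t)((a) - (b)) > 0)

struct tlp_range {
	uint32_t start;
	uint32_t end;
	bool	 set;
};

/* Grow the tracked TLP block to cover [r_start, r_end), or start tracking. */
static void
tlp_track_acked(struct tlp_range *t, uint32_t r_start, uint32_t r_end)
{
	if (!t->set) {
		t->start = r_start;
		t->end = r_end;
		t->set = true;
		return;
	}
	if (SEQ_LT(r_start, t->start))
		t->start = r_start;
	if (SEQ_GT(r_end, t->end))
		t->end = r_end;
}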
*/ - prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + prev = tqhash_prev(rack->r_ctl.tqh, rsm); while (prev) { if (prev->r_flags & RACK_TLP) break; + /* Only allow merges between ones in or out of GP window */ + if ((prev->r_flags & RACK_IN_GP_WIN) && + ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { + break; + } + if ((rsm->r_flags & RACK_IN_GP_WIN) && + ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { + break; + } + if (rsm->bindex != prev->bindex) + break; + if (rsm->r_flags & RACK_STRADDLE) + break; + if (prev->r_flags & RACK_STRADDLE) + break; if (prev->r_flags & RACK_ACKED) { /* yep the previous and this can be merged */ rsm = rack_merge_rsm(rack, prev, rsm); - prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + noextra++; + prev = tqhash_prev(rack->r_ctl.tqh, rsm); } else break; } @@ -8680,13 +10179,11 @@ out: counter_u64_add(rack_sack_proc_short, 1); } /* Save off the next one for quick reference. */ - if (rsm) - nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - else - nrsm = NULL; + nrsm = tqhash_find(rack->r_ctl.tqh, end); *prsm = rack->r_ctl.rc_sacklast = nrsm; /* Pass back the moved. */ *moved_two = moved; + *no_extra = noextra; return (changed); } @@ -8715,7 +10212,7 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac tmap = rsm; } tmap->r_in_tmap = 1; - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + rsm = tqhash_next(rack->r_ctl.tqh, rsm); } /* * Now lets possibly clear the sack filter so we start @@ -8751,6 +10248,8 @@ rack_do_decay(struct tcp_rack *rack) * Check for decay of non-SAD, * we want all SAD detection metrics to * decay 1/4 per second (or more) passed. + * Current default is 800 so it decays + * 80% every second. */ #ifdef NETFLIX_EXP_DETECTION uint32_t pkt_delta; @@ -8784,20 +10283,110 @@ rack_do_decay(struct tcp_rack *rack) } } +static void inline +rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) +{ + /* + * We look at advancing the end send time for our GP + * measurement tracking only as the cumulative acknowledgment + * moves forward. You might wonder about this, why not + * at every transmission or retransmission within the + * GP window update the rc_gp_cumack_ts? Well its rather + * nuanced but basically the GP window *may* expand (as + * it does below) or worse and harder to track it may shrink. + * + * This last makes it impossible to track at the time of + * the send, since you may set forward your rc_gp_cumack_ts + * when you send, because that send *is* in your currently + * "guessed" window, but then it shrinks. Now which was + * the send time of the last bytes in the window, by the + * time you ask that question that part of the sendmap + * is freed. So you don't know and you will have too + * long of send window. Instead by updating the time + * marker only when the cumack advances this assures us + * that we will have only the sends in the window of our + * GP measurement. + * + * Another complication from this is the + * merging of sendmap entries. During SACK processing this + * can happen to conserve the sendmap size. That breaks + * everything down in tracking the send window of the GP + * estimate. So to prevent that and keep it working with + * a tiny bit more limited merging, we only allow like + * types to be merged. I.e. if two sends are in the GP window + * then its ok to merge them together. If two sends are not + * in the GP window its ok to merge them together too. 
Though + * one send in and one send out cannot be merged. We combine + * this with never allowing the shrinking of the GP window when + * we are in recovery so that we can properly calculate the + * sending times. + * + * This all of course seems complicated, because it is.. :) + * + * The cum-ack is being advanced upon the sendmap. + * If we are not doing a GP estimate don't + * proceed. + */ + uint64_t ts; + + if ((tp->t_flags & TF_GPUTINPROG) == 0) + return; + /* + * If this sendmap entry is going + * beyond the measurement window we had picked, + * expand the measurement window by that much. + */ + if (SEQ_GT(rsm->r_end, tp->gput_ack)) { + tp->gput_ack = rsm->r_end; + } + /* + * If we have not setup a ack, then we + * have no idea if the newly acked pieces + * will be "in our seq measurement range". If + * it is when we clear the app_limited_needs_set + * flag the timestamp will be updated. + */ + if (rack->app_limited_needs_set) + return; + /* + * Finally, we grab out the latest timestamp + * that this packet was sent and then see + * if: + * a) The packet touches are newly defined GP range. + * b) The time is greater than (newer) than the + * one we currently have. If so we update + * our sending end time window. + * + * Note we *do not* do this at send time. The reason + * is that if you do you *may* pick up a newer timestamp + * for a range you are not going to measure. We project + * out how far and then sometimes modify that to be + * smaller. If that occurs then you will have a send + * that does not belong to the range included. + */ + if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= + rack->r_ctl.rc_gp_cumack_ts) + return; + if (rack_in_gp_window(tp, rsm)) { + rack->r_ctl.rc_gp_cumack_ts = ts; + rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, + __LINE__, from, rsm); + } +} + static void -rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) +rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) { struct rack_sendmap *rsm; -#ifdef INVARIANTS - struct rack_sendmap *rm; -#endif - /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligble * RTT's. */ + rack->r_wanted_output = 1; + if (SEQ_GT(th_ack, tp->snd_una)) + rack->r_ctl.last_cumack_advance = acktime; /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ if ((rack->rc_last_tlp_acked_set == 1)&& @@ -8848,7 +10437,7 @@ rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_ rack->rc_last_sent_tlp_past_cumack = 1; } more: - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_min(rack->r_ctl.tqh); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { /* @@ -8932,17 +10521,12 @@ more: rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; - /* Record the time of highest cumack sent */ - rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; -#ifndef INVARIANTS - (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); -#else - rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - if (rm != rsm) { - panic("removing head in rack:%p rsm:%p rm:%p", - rack, rsm, rm); - } -#endif + /* + * Record the time of highest cumack sent if its in our measurement + * window and possibly bump out the end. 
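rack_rsm_sender_update(), added just above, only advances the recorded end-of-send time for the goodput window when the cumulative ack passes over an entry, the measurement is in progress, the entry's newest send time is later than what is already recorded, and the entry actually lies inside the measurement window. A standalone condensation of that decision, with illustrative state names:

#include <stdint.h>
#include <stdbool.h>

struct gp_state {                  /* illustrative goodput bookkeeping  */
	bool	 gput_in_progress; /* TF_GPUTINPROG analogue            */
	bool	 needs_ack_setup;  /* app_limited_needs_set analogue    */
	uint64_t cumack_send_ts;   /* rc_gp_cumack_ts analogue          */
};

/*
 * Take the newest send time of an entry the cum-ack just covered,
 * but only if it belongs to the measurement window and is newer than
 * what we already hold.  Returns true when the timestamp was taken.
 */
static bool
gp_cumack_update(struct gp_state *gp, uint64_t sent_ts, bool rsm_in_gp_window)
{
	if (!gp->gput_in_progress || gp->needs_ack_setup)
		return (false);
	if (sent_ts <= gp->cumack_send_ts)
		return (false);
	if (!rsm_in_gp_window)
		return (false);
	gp->cumack_send_ts = sent_ts;
	return (true);
}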
+ */ + rack_rsm_sender_update(rack, tp, rsm, 4); + tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; @@ -8965,6 +10549,8 @@ more: rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; if (rack->r_ent_rec_ns) { /* * We have sent no more, and we saw an sack @@ -8998,7 +10584,7 @@ more: goto more; } /* Check for reneging */ - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_min(rack->r_ctl.tqh); if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to @@ -9046,17 +10632,21 @@ more: } /* * Update where the piece starts and record - * the time of send of highest cumack sent. + * the time of send of highest cumack sent if + * its in our GP range. */ - rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); /* Now we need to move our offset forward too */ - if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) { + if (rsm->m && + ((rsm->orig_m_len != rsm->m->m_len) || + (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { /* Fix up the orig_m_len and possibly the mbuf offset */ rack_adjust_orig_mlen(rsm); } rsm->soff += (th_ack - rsm->r_start); - rsm->r_start = th_ack; + rack_rsm_sender_update(rack, tp, rsm, 5); + /* The trim will move th_ack into r_start for us */ + tqhash_trim(rack->r_ctl.tqh, th_ack); /* Now do we need to move the mbuf fwd too? */ if (rsm->m) { while (rsm->soff >= rsm->m->m_len) { @@ -9067,8 +10657,10 @@ more: rsm, rsm->soff)); } rsm->orig_m_len = rsm->m->m_len; + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); } - if (rack->app_limited_needs_set) + if (rack->app_limited_needs_set && + SEQ_GEQ(th_ack, tp->gput_seq)) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); } @@ -9114,13 +10706,76 @@ rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) } #ifdef NETFLIX_EXP_DETECTION + +static void +rack_merge_out_sacks(struct tcp_rack *rack) +{ + struct rack_sendmap *cur, *next, *rsm, *trsm = NULL; + + cur = tqhash_min(rack->r_ctl.tqh); + while(cur) { + next = tqhash_next(rack->r_ctl.tqh, cur); + /* + * The idea is to go through all and merge back + * together the pieces sent together, + */ + if ((next != NULL) && + (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) { + rack_merge_rsm(rack, cur, next); + } else { + cur = next; + } + } + /* + * now treat it like a rxt event, everything is outstanding + * and sent nothing acvked and dupacks are all zero. If this + * is not an attacker it will have to dupack its way through + * it all. 
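rack_merge_out_sacks(), begun above, coalesces adjacent sendmap entries whose first transmit happened at the same instant, that is, pieces that were one send before SACK processing split them, and then rebuilds the transmit list as if everything were outstanding again. A sketch of the same-timestamp coalescing pass over a simple linked-list model (the real code walks the tqhash and frees the absorbed entries):

#include <stdint.h>
#include <stddef.h>

struct ent {                    /* illustrative stand-in for rack_sendmap */
	struct ent *next;
	uint32_t    r_start;
	uint32_t    r_end;
	uint64_t    first_send;  /* r_tim_lastsent[0] analogue */
};

/*
 * Coalesce neighbors that share a first-send timestamp; they were
 * emitted by one send and only got split by SACK processing.
 */
static void
merge_same_send(struct ent *head)
{
	struct ent *cur = head;

	while (cur != NULL && cur->next != NULL) {
		struct ent *nxt = cur->next;

		if (cur->first_send == nxt->first_send) {
			cur->r_end = nxt->r_end;   /* absorb the neighbor  */
			cur->next = nxt->next;     /* caller frees 'nxt'   */
		} else {
			cur = nxt;
		}
	}
}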
+ */ + TAILQ_INIT(&rack->r_ctl.rc_tmap); + TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { + rsm->r_dupack = 0; + /* We must re-add it back to the tlist */ + if (trsm == NULL) { + TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); + } else { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); + } + rsm->r_in_tmap = 1; + trsm = rsm; + rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); + } + sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una); +} + static void rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) { - if ((rack->do_detection || tcp_force_detection) && - tcp_sack_to_ack_thresh && - tcp_sack_to_move_thresh && - ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { + int do_detection = 0; + + if (rack->sack_attack_disable || rack->rc_suspicious) { + /* + * If we have been disabled we must detect + * to possibly reverse it. Or if the guy has + * sent in suspicious sacks we want to do detection too. + */ + do_detection = 1; + + } else if ((rack->do_detection || tcp_force_detection) && + (tcp_sack_to_ack_thresh > 0) && + (tcp_sack_to_move_thresh > 0) && + (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) { + /* + * We only detect here if: + * 1) System wide forcing is on <or> do_detection is on + * <and> + * 2) We have thresholds for move and ack (set one to 0 and we are off) + * <and> + * 3) We have maps allocated larger than our min (500). + */ + do_detection = 1; + } + if (do_detection > 0) { /* * We have thresholds set to find * possible attackers and disable sack. @@ -9130,39 +10785,74 @@ rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ /* Log detecting */ rack_log_sad(rack, 1); - ackratio = (uint64_t)(rack->r_ctl.sack_count); - ackratio *= (uint64_t)(1000); - if (rack->r_ctl.ack_count) - ackratio /= (uint64_t)(rack->r_ctl.ack_count); - else { - /* We really should not hit here */ - ackratio = 1000; + /* Do we establish a ack ratio */ + if ((rack->r_ctl.sack_count > tcp_map_minimum) || + (rack->rc_suspicious == 1) || + (rack->sack_attack_disable > 0)) { + ackratio = (uint64_t)(rack->r_ctl.sack_count); + ackratio *= (uint64_t)(1000); + if (rack->r_ctl.ack_count) + ackratio /= (uint64_t)(rack->r_ctl.ack_count); + else { + /* We can hit this due to ack totals degregation (via small sacks) */ + ackratio = 1000; + } + } else { + /* + * No ack ratio needed if we have not + * seen more sacks then the number of map entries. + * The exception to that is if we have disabled sack then + * we need to find a ratio. + */ + ackratio = 0; } + if ((rack->sack_attack_disable == 0) && (ackratio > rack_highest_sack_thresh_seen)) rack_highest_sack_thresh_seen = (uint32_t)ackratio; - movetotal = rack->r_ctl.sack_moved_extra; - movetotal += rack->r_ctl.sack_noextra_move; - moveratio = rack->r_ctl.sack_moved_extra; - moveratio *= (uint64_t)1000; - if (movetotal) - moveratio /= movetotal; - else { - /* No moves, thats pretty good */ + /* Do we establish a move ratio? */ + if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) || + (rack->rc_suspicious == 1) || + (rack->sack_attack_disable > 0)) { + /* + * We need to have more sack moves than maps + * allocated to have a move ratio considered. 
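The detector works in fixed point, parts per thousand: the sack-to-ack ratio shown above, and the extra-move ratio computed the same way just below, are each compared against a sysctl-driven threshold, and only when both exceed their limits is SACK processing disabled. A self-contained model of the two ratios and the trigger test; the thresholds are plain parameters here rather than the tcp_sack_to_ack_thresh / tcp_sack_to_move_thresh sysctls:

#include <stdint.h>
#include <stdbool.h>

/* Ratio in parts per thousand, with a caller-chosen value for an empty denominator. */
static uint64_t
ratio_per_mille(uint64_t part, uint64_t whole, uint64_t if_empty)
{
	return (whole ? (part * 1000) / whole : if_empty);
}

/* Trigger only when both ratios exceed their thresholds. */
static bool
sad_should_disable(uint64_t sack_cnt, uint64_t ack_cnt,
    uint64_t moved_extra, uint64_t moved_none,
    uint64_t ack_thresh, uint64_t move_thresh)
{
	/* No acks counted degrades to the worst ratio ... */
	uint64_t ackratio = ratio_per_mille(sack_cnt, ack_cnt, 1000);
	/* ... while no moves at all is treated as a perfect (zero) ratio. */
	uint64_t moveratio = ratio_per_mille(moved_extra,
	    moved_extra + moved_none, 0);

	return (ackratio > ack_thresh && moveratio > move_thresh);
}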
+ */ + movetotal = rack->r_ctl.sack_moved_extra; + movetotal += rack->r_ctl.sack_noextra_move; + moveratio = rack->r_ctl.sack_moved_extra; + moveratio *= (uint64_t)1000; + if (movetotal) + moveratio /= movetotal; + else { + /* No moves, thats pretty good */ + moveratio = 0; + } + } else { + /* + * Not enough moves have occured to consider + * if we are out of whack in that ratio. + * The exception to that is if we have disabled sack then + * we need to find a ratio. + */ moveratio = 0; } if ((rack->sack_attack_disable == 0) && (moveratio > rack_highest_move_thresh_seen)) rack_highest_move_thresh_seen = (uint32_t)moveratio; + /* Now the tests */ if (rack->sack_attack_disable == 0) { + /* Not disabled, do we need to disable? */ if ((ackratio > tcp_sack_to_ack_thresh) && (moveratio > tcp_sack_to_move_thresh)) { /* Disable sack processing */ + tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); rack->sack_attack_disable = 1; - if (rack->r_rep_attack == 0) { - rack->r_rep_attack = 1; - counter_u64_add(rack_sack_attacks_detected, 1); - } + /* set it so we have the built in delay */ + rack->r_ctl.ack_during_sd = 1; + if (rack_merge_out_sacks_on_attack) + rack_merge_out_sacks(rack); + counter_u64_add(rack_sack_attacks_detected, 1); tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); /* Clamp the cwnd at flight size */ rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; @@ -9172,20 +10862,20 @@ rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ } else { /* We are sack-disabled check for false positives */ if ((ackratio <= tcp_restoral_thresh) || - (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { + ((rack_merge_out_sacks_on_attack == 0) && + (rack->rc_suspicious == 0) && + (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) { rack->sack_attack_disable = 0; rack_log_sad(rack, 3); /* Restart counting */ rack->r_ctl.sack_count = 0; rack->r_ctl.sack_moved_extra = 0; rack->r_ctl.sack_noextra_move = 1; + rack->rc_suspicious = 0; rack->r_ctl.ack_count = max(1, - (bytes_this_ack / segsiz)); + (bytes_this_ack / segsiz)); - if (rack->r_rep_reverse == 0) { - rack->r_rep_reverse = 1; - counter_u64_add(rack_sack_attacks_reversed, 1); - } + counter_u64_add(rack_sack_attacks_reversed, 1); /* Restore the cwnd */ if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; @@ -9339,7 +11029,8 @@ rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_s } static void -rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) +rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, + int *dsack_seen, int *sacks_seen) { uint32_t changed; struct tcp_rack *rack; @@ -9348,8 +11039,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point; - int loop_start = 0, moved_two = 0; + int loop_start = 0, moved_two = 0, no_extra = 0; uint32_t tsused; + uint32_t segsiz, o_cnt; INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -9359,12 +11051,13 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered } rack = (struct tcp_rack *)tp->t_fb_ptr; cts = tcp_get_usecs(NULL); - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_min(rack->r_ctl.tqh); changed = 0; th_ack = th->th_ack; if (rack->sack_attack_disable == 0) rack_do_decay(rack); - if (BYTES_THIS_ACK(tp, th) >= 
ctf_fixed_maxseg(rack->rc_tp)) { + segsiz = ctf_fixed_maxseg(rack->rc_tp); + if (BYTES_THIS_ACK(tp, th) >= segsiz) { /* * You only get credit for * MSS and greater (and you get extra @@ -9391,7 +11084,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered if (rsm && SEQ_GT(th_ack, rsm->r_start)) changed = th_ack - rsm->r_start; if (changed) { - rack_process_to_cumack(tp, rack, th_ack, cts, to); + rack_process_to_cumack(tp, rack, th_ack, cts, to, + tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done nothing left and no sack. */ @@ -9427,6 +11121,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered SEQ_LEQ(sack.end, th_ack)) { int was_tlp; + if (dsack_seen != NULL) + *dsack_seen = 1; was_tlp = rack_note_dsack(rack, sack.start, sack.end); /* * Its a D-SACK block. @@ -9446,13 +11142,52 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ + o_cnt = num_sack_blks; num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); + if (sacks_seen != NULL) + *sacks_seen = num_sack_blks; if (num_sack_blks == 0) { - /* Nothing to sack (DSACKs?) */ + /* Nothing to sack, but we need to update counts */ + if ((o_cnt == 1) && + (*dsack_seen != 1)) + rack->r_ctl.sack_count++; + else if (o_cnt > 1) + rack->r_ctl.sack_count++; goto out_with_totals; } + if (rack->sack_attack_disable) { + /* + * An attacker disablement is in place, for + * every sack block that is not at least a full MSS + * count up sack_count. + */ + for (i = 0; i < num_sack_blks; i++) { + if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) { + rack->r_ctl.sack_count++; + } + if (rack->r_ctl.sack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + } + goto out; + } + /* Its a sack of some sort */ + rack->r_ctl.sack_count += num_sack_blks; + if (rack->r_ctl.sack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } if (num_sack_blks < 2) { /* Only one, we don't need to sort */ goto do_sack_work; @@ -9520,7 +11255,7 @@ do_sack_work: * We probably did the FR and the next * SACK in continues as we would expect. */ - acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz); if (acked) { rack->r_wanted_output = 1; changed += acked; @@ -9547,43 +11282,42 @@ do_sack_work: rack->r_ctl.ack_count /= 2; rack->r_ctl.sack_count /= 2; } + if (moved_two) { + /* + * If we did not get a SACK for at least a MSS and + * had to move at all, or if we moved more than our + * threshold, it counts against the "extra" move. + */ + rack->r_ctl.sack_moved_extra += moved_two; + rack->r_ctl.sack_noextra_move += no_extra; + counter_u64_add(rack_move_some, 1); + } else { + /* + * else we did not have to move + * any more than we would expect. 
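Whenever one of the SACK/ack or move counters climbs past 0xfff00000, the code above halves the related pair together, which keeps the ratios meaningful while staying well inside uint32_t range. A tiny sketch of that saturation guard:

#include <stdint.h>

#define COUNTER_CEILING 0xfff00000u

/* Halve both counters of a ratio pair once either nears uint32_t range. */
static void
decay_ratio_pair(uint32_t *num, uint32_t *den)
{
	if (*num > COUNTER_CEILING || *den > COUNTER_CEILING) {
		*num /= 2;
		*den /= 2;
	}
}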
+ */ + rack->r_ctl.sack_noextra_move += no_extra; + rack->r_ctl.sack_noextra_move++; + counter_u64_add(rack_move_none, 1); + } + if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || + (rack->r_ctl.sack_noextra_move > 0xfff00000)) { + rack->r_ctl.sack_moved_extra /= 2; + rack->r_ctl.sack_noextra_move /= 2; + } goto out_with_totals; } else { /* * Start the loop through the * rest of blocks, past the first block. */ - moved_two = 0; loop_start = 1; } } - /* Its a sack of some sort */ - rack->r_ctl.sack_count++; - if (rack->r_ctl.sack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } counter_u64_add(rack_sack_total, 1); - if (rack->sack_attack_disable) { - /* An attacker disablement is in place */ - if (num_sack_blks > 1) { - rack->r_ctl.sack_count += (num_sack_blks - 1); - rack->r_ctl.sack_moved_extra++; - counter_u64_add(rack_move_some, 1); - if (rack->r_ctl.sack_moved_extra > 0xfff00000) { - rack->r_ctl.sack_moved_extra /= 2; - rack->r_ctl.sack_noextra_move /= 2; - } - } - goto out; - } rsm = rack->r_ctl.rc_sacklast; for (i = loop_start; i < num_sack_blks; i++) { - acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz); if (acked) { rack->r_wanted_output = 1; changed += acked; @@ -9595,15 +11329,22 @@ do_sack_work: * threshold, it counts against the "extra" move. */ rack->r_ctl.sack_moved_extra += moved_two; + rack->r_ctl.sack_noextra_move += no_extra; counter_u64_add(rack_move_some, 1); } else { /* * else we did not have to move * any more than we would expect. */ + rack->r_ctl.sack_noextra_move += no_extra; rack->r_ctl.sack_noextra_move++; counter_u64_add(rack_move_none, 1); } + if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || + (rack->r_ctl.sack_noextra_move > 0xfff00000)) { + rack->r_ctl.sack_moved_extra /= 2; + rack->r_ctl.sack_noextra_move /= 2; + } if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { /* * If the SACK was not a full MSS then @@ -9612,6 +11353,10 @@ do_sack_work: * a MSS if its a TSO send) we had to skip by. */ rack->r_ctl.sack_count += moved_two; + if (rack->r_ctl.sack_count > 0xfff00000) { + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } counter_u64_add(rack_sack_total, moved_two); } /* @@ -9621,16 +11366,8 @@ do_sack_work: * the various counts, and then clear out * moved_two. */ - if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || - (rack->r_ctl.sack_noextra_move > 0xfff00000)) { - rack->r_ctl.sack_moved_extra /= 2; - rack->r_ctl.sack_noextra_move /= 2; - } - if (rack->r_ctl.sack_count > 0xfff00000) { - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } moved_two = 0; + no_extra = 0; } out_with_totals: if (num_sack_blks > 1) { @@ -9707,21 +11444,17 @@ rack_strike_dupack(struct tcp_rack *rack) struct rack_sendmap *rsm; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { - rsm = TAILQ_NEXT(rsm, r_tnext); - if (rsm->r_flags & RACK_MUST_RXT) { - /* Sendmap entries that are marked to - * be retransmitted do not need dupack's - * struck. We get these marks for a number - * of reasons (rxt timeout with no sack, - * mtu change, or rwnd collapses). When - * these events occur, we know we must retransmit - * them and mark the sendmap entries. Dupack counting - * is not needed since we are already set to retransmit - * it as soon as we can. 
- */ + while (rsm) { + /* + * We need to skip anything already set + * to be retransmitted. + */ + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + (rsm->r_flags & RACK_MUST_RXT)) { + rsm = TAILQ_NEXT(rsm, r_tnext); continue; } + break; } if (rsm && (rsm->r_dupack < 0xff)) { rsm->r_dupack++; @@ -9755,13 +11488,12 @@ rack_strike_dupack(struct tcp_rack *rack) static void rack_check_bottom_drag(struct tcpcb *tp, struct tcp_rack *rack, - struct socket *so, int32_t acked) + struct socket *so) { uint32_t segsiz, minseg; segsiz = ctf_fixed_maxseg(tp); minseg = segsiz; - if (tp->snd_max == tp->snd_una) { /* * We are doing dynamic pacing and we are way @@ -9772,43 +11504,19 @@ rack_check_bottom_drag(struct tcpcb *tp, * addition to any earlier boosting of * the multiplier. */ + uint64_t lt_bw; + + lt_bw = rack_get_lt_bw(rack); rack->rc_dragged_bottom = 1; rack_validate_multipliers_at_or_above100(rack); - /* - * Lets use the segment bytes acked plus - * the lowest RTT seen as the basis to - * form a b/w estimate. This will be off - * due to the fact that the true estimate - * should be around 1/2 the time of the RTT - * but we can settle for that. - */ if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && - acked) { - uint64_t bw, calc_bw, rtt; - - rtt = rack->r_ctl.rack_rs.rs_us_rtt; - if (rtt == 0) { - /* no us sample is there a ms one? */ - if (rack->r_ctl.rack_rs.rs_rtt_lowest) { - rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; - } else { - goto no_measurement; - } - } - bw = acked; - calc_bw = bw * 1000000; - calc_bw /= rtt; - if (rack->r_ctl.last_max_bw && - (rack->r_ctl.last_max_bw < calc_bw)) { - /* - * If we have a last calculated max bw - * enforce it. - */ - calc_bw = rack->r_ctl.last_max_bw; - } - /* now plop it in */ + (lt_bw > 0)) { + /* + * Lets use the long-term b/w we have + * been getting as a base. + */ if (rack->rc_gp_filled == 0) { - if (calc_bw > ONE_POINT_TWO_MEG) { + if (lt_bw > ONE_POINT_TWO_MEG) { /* * If we have no measurement * don't let us set in more than @@ -9817,19 +11525,19 @@ rack_check_bottom_drag(struct tcpcb *tp, * will hopefully have a max b/w * available to sanity check things. */ - calc_bw = ONE_POINT_TWO_MEG; + lt_bw = ONE_POINT_TWO_MEG; } rack->r_ctl.rc_rtt_diff = 0; - rack->r_ctl.gp_bw = calc_bw; + rack->r_ctl.gp_bw = lt_bw; rack->rc_gp_filled = 1; if (rack->r_ctl.num_measurements < RACK_REQ_AVG) rack->r_ctl.num_measurements = RACK_REQ_AVG; rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); - } else if (calc_bw > rack->r_ctl.gp_bw) { + } else if (lt_bw > rack->r_ctl.gp_bw) { rack->r_ctl.rc_rtt_diff = 0; if (rack->r_ctl.num_measurements < RACK_REQ_AVG) rack->r_ctl.num_measurements = RACK_REQ_AVG; - rack->r_ctl.gp_bw = calc_bw; + rack->r_ctl.gp_bw = lt_bw; rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); } else rack_increase_bw_mul(rack, -1, 0, 0, 1); @@ -9837,21 +11545,16 @@ rack_check_bottom_drag(struct tcpcb *tp, (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { /* We have enough measurements now */ rack->gp_ready = 1; - rack_set_cc_pacing(rack); + if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) || + rack->rack_hibeta) + rack_set_cc_pacing(rack); if (rack->defer_options) rack_apply_deferred_options(rack); } - /* - * For acks over 1mss we do a extra boost to simulate - * where we would get 2 acks (we want 110 for the mul). - */ - if (acked > segsiz) - rack_increase_bw_mul(rack, -1, 0, 0, 1); } else { /* * zero rtt possibly?, settle for just an old increase. 
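rack_check_bottom_drag() now seeds, or raises, the goodput estimate from the long-term bandwidth returned by rack_get_lt_bw() instead of deriving one from the acked bytes and the lowest RTT; before any measurement exists the seed is clamped at ONE_POINT_TWO_MEG so a bad first guess cannot run away. A condensed model of that seeding rule; the numeric value of the clamp is assumed here, and the units follow whatever gp_bw uses in the real code:

#include <stdint.h>
#include <stdbool.h>

#define ONE_POINT_TWO_MEG 1200000ULL   /* assumed value of the pre-measurement cap */

struct gp_est {
	bool	 filled;      /* rc_gp_filled analogue    */
	uint64_t gp_bw;       /* current goodput estimate */
};

/*
 * Seed or raise the goodput estimate from the long-term b/w when the
 * connection is dragging bottom (everything acked, pacing too slow).
 * Returns true when the estimate changed.
 */
static bool
seed_gp_from_lt_bw(struct gp_est *est, uint64_t lt_bw)
{
	if (lt_bw == 0)
		return (false);
	if (!est->filled) {
		est->gp_bw = (lt_bw > ONE_POINT_TWO_MEG) ?
		    ONE_POINT_TWO_MEG : lt_bw;
		est->filled = true;
		return (true);
	}
	if (lt_bw > est->gp_bw) {
		est->gp_bw = lt_bw;
		return (true);
	}
	return (false);
}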
*/ -no_measurement: rack_increase_bw_mul(rack, -1, 0, 0, 1); } } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && @@ -9875,7 +11578,225 @@ no_measurement: } } +#ifdef TCP_REQUEST_TRK +static void +rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, + struct http_sendfile_track *cur, uint8_t mod, int line, int err) +{ + int do_log; + do_log = tcp_bblogging_on(rack->rc_tp); + if (do_log == 0) { + if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) + return; + /* We only allow the three below with point logging on */ + if ((mod != HYBRID_LOG_RULES_APP) && + (mod != HYBRID_LOG_RULES_SET) && + (mod != HYBRID_LOG_REQ_COMP)) + return; + + } + if (do_log) { + union tcp_log_stackspecific log; + struct timeval tv; + + /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = seq; + log.u_bbr.cwnd_gain = line; + if (cur != NULL) { + uint64_t off; + + log.u_bbr.flex2 = cur->start_seq; + log.u_bbr.flex3 = cur->end_seq; + log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); + log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); + log.u_bbr.flex6 = cur->flags; + log.u_bbr.pkts_out = cur->hybrid_flags; + log.u_bbr.rttProp = cur->timestamp; + log.u_bbr.cur_del_rate = cur->cspr; + log.u_bbr.bw_inuse = cur->start; + log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); + log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; + log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); + log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; + log.u_bbr.bbr_state = 1; + off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]); + log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track)); + } else { + log.u_bbr.flex2 = err; + } + /* + * Fill in flex7 to be CHD (catchup|hybrid|DGP) + */ + log.u_bbr.flex7 = rack->rc_catch_up; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->rc_hybrid_mode; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->dgp_on; + log.u_bbr.flex8 = mod; + log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; + log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; + log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; + log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; + tcp_log_event(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_HYBRID_PACING_LOG, 0, + 0, &log, false, NULL, __func__, __LINE__, &tv); + } +} +#endif + +#ifdef TCP_REQUEST_TRK +static void +rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len) +{ + struct http_sendfile_track *rc_cur; + struct tcpcb *tp; + int err = 0; + + rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, seq); + if (rc_cur == NULL) { + /* If not in the beginning what about the end piece */ + rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); + rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, (seq + len - 1)); + } else { + err = 12345; + } + /* If we find no parameters we are in straight DGP mode */ + if(rc_cur == NULL) { + /* None found for this seq, just DGP for now */ + rack->r_ctl.client_suggested_maxseg = 0; + rack->rc_catch_up = 0; + rack->r_ctl.bw_rate_cap = 0; + rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); + if (rack->r_ctl.rc_last_sft) { + 
rack->r_ctl.rc_last_sft = NULL; + } + return; + } + /* + * Ok if we have a new entry *or* have never + * set up an entry we need to proceed. If + * we have already set it up this entry we + * just continue along with what we already + * setup. + */ + tp = rack->rc_tp; + if ((rack->r_ctl.rc_last_sft != NULL) && + (rack->r_ctl.rc_last_sft == rc_cur)) { + /* Its already in place */ + rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); + return; + } + if (rack->rc_hybrid_mode == 0) { + rack->r_ctl.rc_last_sft = rc_cur; + rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); + return; + } + if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ + /* Compensate for all the header overhead's */ + rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); + } else + rack->r_ctl.bw_rate_cap = 0; + if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) + rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; + else + rack->r_ctl.client_suggested_maxseg = 0; + if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && + (rc_cur->cspr > 0)) { + uint64_t len; + + rack->rc_catch_up = 1; + /* + * Calculate the deadline time, first set the + * time to when the request arrived. + */ + rc_cur->deadline = rc_cur->localtime; + /* + * Next calculate the length and compensate for + * TLS if need be. + */ + len = rc_cur->end - rc_cur->start; + if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { + /* + * This session is doing TLS. Take a swag guess + * at the overhead. + */ + len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); + } + /* + * Now considering the size, and the cspr, what is the time that + * would be required at the cspr rate. Here we use the raw + * cspr value since the client only looks at the raw data. We + * do use len which includes TLS overhead, but not the TCP/IP etc. + * That will get made up for in the CU pacing rate set. + */ + len *= HPTS_USEC_IN_SEC; + len /= rc_cur->cspr; + rc_cur->deadline += len; + } else { + rack->rc_catch_up = 0; + rc_cur->deadline = 0; + } + if (rack->r_ctl.client_suggested_maxseg != 0) { + /* + * We need to reset the max pace segs if we have a + * client_suggested_maxseg. + */ + rack_set_pace_segments(tp, rack, __LINE__, NULL); + } + rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); + /* Remember it for next time and for CU mode */ + rack->r_ctl.rc_last_sft = rc_cur; +} +#endif + +static void +rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) +{ +#ifdef TCP_REQUEST_TRK + struct http_sendfile_track *ent; + + ent = rack->r_ctl.rc_last_sft; + if ((ent == NULL) || + (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) || + (SEQ_GEQ(seq, ent->end_seq))) { + /* Time to update the track. */ + rack_set_dgp_hybrid_mode(rack, seq, len); + ent = rack->r_ctl.rc_last_sft; + } + /* Out of all */ + if (ent == NULL) { + return; + } + if (SEQ_LT(ent->end_seq, (seq + len))) { + /* + * This is the case where our end_seq guess + * was wrong. This is usually due to TLS having + * more bytes then our guess. It could also be the + * case that the client sent in two requests closely + * and the SB is full of both so we are sending part + * of each (end|beg). In such a case lets move this + * guys end to match the end of this send. That + * way it will complete when all of it is acked. 
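In rack_set_dgp_hybrid_mode() above, catch-up mode turns the client-supplied pacing rate (cspr) into an absolute deadline: the request's arrival time plus the time the full response, including the estimated TLS overhead, would take at that raw rate. A standalone version of the arithmetic; cspr is taken here as bytes per second, and the microseconds-per-second constant mirrors HPTS_USEC_IN_SEC:

#include <stdint.h>

#define USEC_IN_SEC 1000000ULL

/*
 * Deadline for catch-up pacing: request arrival time plus the duration
 * the payload would need at the client-specified rate.  'len' should
 * already include any TLS record overhead.
 */
static uint64_t
catchup_deadline(uint64_t request_arrival_usec, uint64_t len, uint64_t cspr)
{
	if (cspr == 0)
		return (request_arrival_usec);   /* no rate, no catch-up */
	return (request_arrival_usec + (len * USEC_IN_SEC) / cspr);
}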
+ */ + ent->end_seq = (seq + len); + rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent); + } + /* Now validate we have set the send time of this one */ + if ((ent->flags & TCP_HTTP_TRACK_FLG_FSND) == 0) { + ent->flags |= TCP_HTTP_TRACK_FLG_FSND; + ent->first_send = cts; + ent->sent_at_fs = rack->rc_tp->t_sndbytes; + ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; + } +#endif +} static void rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) @@ -9918,7 +11839,7 @@ rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket } static void -rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) +rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) { /* * Here any sendmap entry that points to the @@ -9928,7 +11849,7 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) * 2) snd_una adjusted to its new position. * * Note that (2) implies rack_ack_received has also - * been called. + * been called and all the sbcut's have been done. * * We grab the first mbuf in the socket buffer and * then go through the front of the sendmap, recalculating @@ -9939,17 +11860,30 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) * not be a penalty though, since we just referenced the sb * to go in and trim off the mbufs that we freed (of course * there will be a penalty for the sendmap references though). + * + * Note also with INVARIANT on, we validate with a KASSERT + * that the first sendmap entry has a soff of 0. + * */ struct mbuf *m; struct rack_sendmap *rsm; + tcp_seq snd_una; +#ifdef INVARIANTS + int first_processed = 0; +#endif + snd_una = rack->rc_tp->snd_una; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_min(rack->r_ctl.tqh); if ((rsm == NULL) || (m == NULL)) { /* Nothing outstanding */ return; } + /* The very first RSM's mbuf must point to the head mbuf in the sb */ + KASSERT((rsm->m == m), + ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", + rack, sb, rsm)); while (rsm->m && (rsm->m == m)) { /* one to adjust */ #ifdef INVARIANTS @@ -9957,10 +11891,17 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) uint32_t soff; tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); - if (rsm->orig_m_len != m->m_len) { + if ((rsm->orig_m_len != m->m_len) || + (rsm->orig_t_space != M_TRAILINGROOM(m))){ rack_adjust_orig_mlen(rsm); } - if (rsm->soff != soff) { + if (first_processed == 0) { + KASSERT((rsm->soff == 0), + ("Rack:%p rsm:%p -- rsm at head but soff not zero", + rack, rsm)); + first_processed = 1; + } + if ((rsm->soff != soff) || (rsm->m != tm)) { /* * This is not a fatal error, we anticipate it * might happen (the else code), so we count it here @@ -9971,24 +11912,104 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) } rsm->m = tm; rsm->soff = soff; - if (tm) + if (tm) { rsm->orig_m_len = rsm->m->m_len; - else + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } else { rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + } #else rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); - if (rsm->m) + if (rsm->m) { rsm->orig_m_len = rsm->m->m_len; - else + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } else { rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + } #endif - rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, - rsm); + rsm = tqhash_next(rack->r_ctl.tqh, rsm); if (rsm 
== NULL) break; } } +#ifdef TCP_REQUEST_TRK +static inline void +rack_http_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) +{ + struct http_sendfile_track *ent; + int i; + + if ((rack->rc_hybrid_mode == 0) && + (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { + /* + * Just do normal completions hybrid pacing is not on + * and CLDL is off as well. + */ + tcp_http_check_for_comp(rack->rc_tp, th_ack); + return; + } + /* + * Originally I was just going to find the th_ack associated + * with an entry. But then I realized a large strech ack could + * in theory ack two or more requests at once. So instead we + * need to find all entries that are completed by th_ack not + * just a single entry and do our logging. + */ + ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); + while (ent != NULL) { + /* + * We may be doing hybrid pacing or CLDL and need more details possibly + * so we do it manually instead of calling + * tcp_http_check_for_comp() + */ + uint64_t laa, tim, data, cbw, ftim; + + /* Ok this ack frees it */ + rack_log_hybrid(rack, th_ack, + ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); + /* calculate the time based on the ack arrival */ + data = ent->end - ent->start; + laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + if (ent->flags & TCP_HTTP_TRACK_FLG_FSND) { + if (ent->first_send > ent->localtime) + ftim = ent->first_send; + else + ftim = ent->localtime; + } else { + /* TSNH */ + ftim = ent->localtime; + } + if (laa > ent->localtime) + tim = laa - ftim; + else + tim = 0; + cbw = data * HPTS_USEC_IN_SEC; + if (tim > 0) + cbw /= tim; + else + cbw = 0; + rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent); + /* + * Check to see if we are freeing what we are pointing to send wise + * if so be sure to NULL the pointer so we know we are no longer + * set to anything. + */ + if (ent == rack->r_ctl.rc_last_sft) + rack->r_ctl.rc_last_sft = NULL; + /* Generate the log that the tcp_netflix call would have */ + tcp_http_log_req_info(rack->rc_tp, ent, + i, TCP_HTTP_REQ_LOG_FREED, 0, 0); + /* Free it and see if there is another one */ + tcp_http_free_a_slot(rack->rc_tp, ent); + ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); + } +} +#endif + + /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. @@ -10025,6 +12046,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { int in_rec, dup_ack_struck = 0; + int dsack_seen = 0, sacks_seen = 0; in_rec = IN_FASTRECOVERY(tp->t_flags); if (rack->rc_in_persist) { @@ -10032,13 +12054,33 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); } + if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd) && ((to->to_flags & TOF_SACK) == 0)) { rack_strike_dupack(rack); dup_ack_struck = 1; } - rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); + rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), + dup_ack_struck, &dsack_seen, &sacks_seen); + if ((rack->sack_attack_disable > 0) && + (th->th_ack == tp->snd_una) && + (tiwin == tp->snd_wnd) && + (dsack_seen == 0) && + (sacks_seen > 0)) { + /* + * If sacks have been disabled we may + * want to strike a dup-ack "ignoring" the + * sack as long as the sack was not a "dsack". 
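rack_http_check_for_comp(), added earlier in this hunk, reports a per-request bandwidth when an ack completes a tracked request: the bytes of the request divided by the elapsed time from its first send to the ack's arrival, with microsecond timestamps. A minimal restatement of that measurement; the real code additionally falls back to the request arrival time when no first-send time was recorded:

#include <stdint.h>

#define USEC_IN_SEC 1000000ULL

/*
 * Client-observed bandwidth for a completed request, in bytes/sec.
 * A zero or negative elapsed time yields zero rather than a bogus rate.
 */
static uint64_t
request_completion_bw(uint64_t bytes, uint64_t first_send_usec,
    uint64_t ack_arrival_usec)
{
	uint64_t elapsed;

	if (ack_arrival_usec <= first_send_usec)
		return (0);
	elapsed = ack_arrival_usec - first_send_usec;
	return ((bytes * USEC_IN_SEC) / elapsed);
}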
Note + * that if no sack is sent (TOF_SACK is off) then the + * normal dsack code above rack_log_ack() would have + * already struck. So this is just to catch the case + * were we are ignoring sacks from this guy due to + * it being a suspected attacker. + */ + rack_strike_dupack(rack); + } + } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* @@ -10048,6 +12090,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } return (0); } @@ -10110,8 +12154,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); -#ifdef NETFLIX_HTTP_LOGGING - tcp_http_check_for_comp(rack->rc_tp, th->th_ack); +#ifdef TCP_REQUEST_TRK + rack_http_check_for_comp(rack, th->th_ack); #endif } /* @@ -10169,11 +12213,19 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, ourfinisacked = 1; } tp->snd_una = th->th_ack; + /* wakeups? */ if (acked_amount && sbavail(&so->so_snd)) - rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + rack_adjust_sendmap_head(rack, &so->so_snd); rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); + /* now check the rxt clamps */ + if ((recovery == 1) && + (rack->excess_rxt_on) && + (rack->r_cwnd_was_clamped == 0)) { + do_rack_excess_rxt(tp, rack); + } else if (rack->r_cwnd_was_clamped) + do_rack_check_for_unclamp(tp, rack); m_freem(mfree); if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; @@ -10187,7 +12239,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack->rc_gp_dyn_mul && rack->rc_always_pace) { /* Check if we are dragging bottom */ - rack_check_bottom_drag(tp, rack, so, acked); + rack_check_bottom_drag(tp, rack, so); } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ @@ -10201,6 +12253,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&tptosocket(tp)->so_snd) == 0) tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + rack->rc_suspicious = 0; /* Set need output so persist might get set */ rack->r_wanted_output = 1; sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); @@ -10261,7 +12314,7 @@ rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t } static void -rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line) +rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) { /* * Here all we do is mark the collapsed point and set the flag. 
@@ -10271,28 +12324,26 @@ rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line) */ tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); if ((rack->rc_has_collapsed == 0) || - (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd))) + (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) counter_u64_add(rack_collapsed_win_seen, 1); - rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; + rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; rack->rc_has_collapsed = 1; rack->r_collapse_point_valid = 1; - rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); + rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); } static void rack_un_collapse_window(struct tcp_rack *rack, int line) { - struct rack_sendmap *nrsm, *rsm, fe; + struct rack_sendmap *nrsm, *rsm; int cnt = 0, split = 0; -#ifdef INVARIANTS - struct rack_sendmap *insret; -#endif + int insret __diagused; + - memset(&fe, 0, sizeof(fe)); + tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); rack->rc_has_collapsed = 0; - fe.r_start = rack->r_ctl.last_collapse_point; - rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); if (rsm == NULL) { /* Nothing to do maybe the peer ack'ed it all */ rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); @@ -10312,11 +12363,10 @@ rack_un_collapse_window(struct tcp_rack *rack, int line) split = 1; rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); #ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + (void)tqhash_insert(rack->r_ctl.tqh, nrsm); #else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); - if (insret != NULL) { - panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { + panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif @@ -10332,8 +12382,10 @@ rack_un_collapse_window(struct tcp_rack *rack, int line) */ rsm = nrsm; } + no_split: - RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { + TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { + cnt++; nrsm->r_flags |= RACK_RWND_COLLAPSED; rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); cnt++; @@ -10349,24 +12401,12 @@ rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, int32_t tlen, int32_t tfo_syn) { if (DELAY_ACK(tp, tlen) || tfo_syn) { - if (rack->rc_dack_mode && - (tlen > 500) && - (rack->rc_dack_toggle == 1)) { - goto no_delayed_ack; - } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { -no_delayed_ack: rack->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; - if (rack->rc_dack_mode) { - if (tp->t_flags & TF_DELACK) - rack->rc_dack_toggle = 1; - else - rack->rc_dack_toggle = 0; - } } } @@ -10451,7 +12491,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, } if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed the window */ - rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__); + rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); else if (rack->rc_has_collapsed) rack_un_collapse_window(rack, __LINE__); if ((rack->r_collapse_point_valid) && @@ -10479,7 +12519,7 @@ rack_process_data(struct mbuf *m, struct tcphdr 
*th, struct socket *so, * nothing is outstanding, and there is * data to send. Enter persists. */ - rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); @@ -10593,7 +12633,6 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * DSACK actually handled in the fastpath * above. */ - RACK_OPTS_INC(tcp_sack_path_1); tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { @@ -10603,22 +12642,18 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * Partial overlap, recorded at todrop * above. */ - RACK_OPTS_INC(tcp_sack_path_2a); tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { - RACK_OPTS_INC(tcp_sack_path_2b); tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ - RACK_OPTS_INC(tcp_sack_path_3); tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { - RACK_OPTS_INC(tcp_sack_path_4); tcp_update_dsack_list(tp, save_start, save_start + tlen); } @@ -10882,7 +12917,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, under_pacing = 1; } nsegs = max(1, m->m_pkthdr.lro_nsegs); - rack_log_ack(tp, to, th, 0, 0); + rack_log_ack(tp, to, th, 0, 0, NULL, NULL); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; @@ -10910,7 +12945,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing is outstanding, and there is * data to send. Enter persists. */ - rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); } /* * If last ACK falls within this segment's sequence numbers, record @@ -10960,7 +12995,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, mfree = sbcut_locked(&so->so_snd, acked); tp->snd_una = th->th_ack; /* Note we want to hold the sb lock through the sendmap adjust */ - rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + rack_adjust_sendmap_head(rack, &so->so_snd); /* Wake up the socket if we have room to write more */ rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); sowwakeup_locked(so); @@ -10976,8 +13011,9 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); -#ifdef NETFLIX_HTTP_LOGGING - tcp_http_check_for_comp(rack->rc_tp, th->th_ack); + +#ifdef TCP_REQUEST_TRK + rack_http_check_for_comp(rack, th->th_ack); #endif } /* @@ -10987,7 +13023,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if (tp->snd_wnd < ctf_outstanding(tp)) { /* The peer collapsed the window */ - rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__); + rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); } else if (rack->rc_has_collapsed) rack_un_collapse_window(rack, __LINE__); if ((rack->r_collapse_point_valid) && @@ -11014,11 +13050,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack->rc_gp_dyn_mul && rack->rc_always_pace) { /* Check if we are dragging bottom */ - rack_check_bottom_drag(tp, rack, so, acked); + rack_check_bottom_drag(tp, rack, so); } if (tp->snd_una == tp->snd_max) { tp->t_flags &= ~TF_PREVVALID; rack->r_ctl.retran_during_recovery = 0; + rack->rc_suspicious = 0; rack->r_ctl.dsack_byte_cnt 
= 0; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) @@ -11125,7 +13162,6 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { rack->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; - rack->rc_dack_toggle = 0; } tcp_ecn_input_syn_sent(tp, thflags, iptos); @@ -12195,6 +14231,15 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u int chged = 0; uint32_t user_max, orig_min, orig_max; +#ifdef TCP_REQUEST_TRK + if (rack->rc_hybrid_mode && + (rack->r_ctl.rc_pace_max_segs != 0) && + (rack_hybrid_allow_set_maxseg == 1) && + (rack->r_ctl.rc_last_sft != NULL)) { + rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; + return; + } +#endif orig_min = rack->r_ctl.rc_pace_min_segs; orig_max = rack->r_ctl.rc_pace_max_segs; user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; @@ -12214,13 +14259,18 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u rack->r_ctl.rc_pace_max_segs = user_max; } else { /* We are pacing right at the hardware rate */ - uint32_t segsiz; + uint32_t segsiz, pace_one; + if (rack_pace_one_seg || + (rack->r_ctl.rc_user_set_min_segs == 1)) + pace_one = 1; + else + pace_one = 0; segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); - rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( - tp, bw_est, segsiz, 0, - rack->r_ctl.crte, NULL); + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( + tp, bw_est, segsiz, pace_one, + rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); } } else if (rack->rc_always_pace) { if (rack->r_ctl.gp_bw || @@ -12266,7 +14316,7 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u static void -rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) +rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) { #ifdef INET6 struct ip6_hdr *ip6 = NULL; @@ -12296,6 +14346,7 @@ rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) tcpip_fillheaders(rack->rc_inp, tp->t_port, ip6, rack->r_ctl.fsb.th); + rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); } else #endif /* INET6 */ #ifdef INET @@ -12317,8 +14368,11 @@ rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) tcpip_fillheaders(rack->rc_inp, tp->t_port, ip, rack->r_ctl.fsb.th); + rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; } #endif + rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), + (long)TCP_MAXWIN << tp->rcv_scale); rack->r_fsb_inited = 1; } @@ -12343,17 +14397,485 @@ rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) return (0); } +static void +rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) +{ + /* + * Types of logs (mod value) + * 20 - Initial round setup + * 21 - Rack declares a new round. 
+ */ + struct tcpcb *tp; + + tp = rack->rc_tp; + if (tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = rack->r_ctl.current_round; + log.u_bbr.flex2 = rack->r_ctl.roundends; + log.u_bbr.flex3 = high_seq; + log.u_bbr.flex4 = tp->snd_max; + log.u_bbr.flex8 = mod; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; + log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + TCP_HYSTART, 0, + 0, &log, false, &tv); + } +} + +static void +rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) +{ + rack->rack_deferred_inited = 1; + rack->r_ctl.roundends = tp->snd_max; + rack->r_ctl.rc_high_rwnd = tp->snd_wnd; + rack->r_ctl.cwnd_to_use = tp->snd_cwnd; +} + +static void +rack_init_retransmit_value(struct tcp_rack *rack, int ctl) +{ + /* Retransmit bit controls. + * + * The setting of these values control one of + * three settings you can have and dictate + * how rack does retransmissions. Note this + * is in *any* mode i.e. pacing on or off DGP + * fixed rate pacing, or just bursting rack. + * + * 1 - Use full sized retransmits i.e. limit + * the size to whatever the pace_max_segments + * size is. + * + * 2 - Use pacer min granularity as a guide to + * the size combined with the current calculated + * goodput b/w measurement. So for example if + * the goodput is measured at 20Mbps we would + * calculate 8125 (pacer minimum 250usec in + * that b/w) and then round it up to the next + * MSS i.e. for 1448 mss 6 MSS or 8688 bytes. + * + * 0 - The rack default 1 MSS (anything not 0/1/2 + * fall here too if we are setting via rack_init()). + * + */ + if (ctl == 1) { + rack->full_size_rxt = 1; + rack->shape_rxt_to_pacing_min = 0; + } else if (ctl == 2) { + rack->full_size_rxt = 0; + rack->shape_rxt_to_pacing_min = 1; + } else { + rack->full_size_rxt = 0; + rack->shape_rxt_to_pacing_min = 0; + } +} + +static void +rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, + uint32_t flex1, + uint32_t flex2, + uint32_t flex3) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex8 = mod; + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, + 0, &log, false, NULL, __func__, __LINE__, &tv); + } +} + static int -rack_init(struct tcpcb *tp, void **ptr) +rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) +{ + struct tcp_rack *rack; + struct rack_sendmap *rsm; + int i; + + + rack = (struct tcp_rack *)tp->t_fb_ptr; + switch (reqr->req) { + case TCP_QUERY_SENDMAP: + if ((reqr->req_param == tp->snd_max) || + (tp->snd_max == tp->snd_una)){ + /* Unlikely */ + return (0); + } + rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); + if (rsm == NULL) { + /* Can't find that seq -- unlikely */ + return (0); + } + reqr->sendmap_start = rsm->r_start; + reqr->sendmap_end = rsm->r_end; + reqr->sendmap_send_cnt = rsm->r_rtr_cnt; + reqr->sendmap_fas = rsm->r_fas; + if (reqr->sendmap_send_cnt > SNDMAP_NRTX) + reqr->sendmap_send_cnt = SNDMAP_NRTX; + for(i=0; i<reqr->sendmap_send_cnt; i++) + reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; + reqr->sendmap_ack_arrival = rsm->r_ack_arrival; + reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; + reqr->sendmap_r_rtr_bytes = 
rsm->r_rtr_bytes; + reqr->sendmap_dupacks = rsm->r_dupack; + rack_log_chg_info(tp, rack, 1, + rsm->r_start, + rsm->r_end, + rsm->r_flags); + return(1); + break; + case TCP_QUERY_TIMERS_UP: + if (rack->r_ctl.rc_hpts_flags == 0) { + /* no timers up */ + return (0); + } + reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; + } + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; + } + rack_log_chg_info(tp, rack, 2, + rack->r_ctl.rc_hpts_flags, + rack->r_ctl.rc_last_output_to, + rack->r_ctl.rc_timer_exp); + return (1); + break; + case TCP_QUERY_RACK_TIMES: + /* Reordering items */ + reqr->rack_num_dsacks = rack->r_ctl.num_dsack; + reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; + /* Timerstamps and timers */ + reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; + reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; + reqr->rack_rtt = rack->rc_rack_rtt; + reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; + reqr->rack_srtt_measured = rack->rc_srtt_measure_made; + /* PRR data */ + reqr->rack_sacked = rack->r_ctl.rc_sacked; + reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; + reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; + reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs; + reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; + reqr->rack_prr_out = rack->r_ctl.rc_prr_out; + /* TLP and persists info */ + reqr->rack_tlp_out = rack->rc_tlp_in_progress; + reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; + if (rack->rc_in_persist) { + reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; + reqr->rack_in_persist = 1; + } else { + reqr->rack_time_went_idle = 0; + reqr->rack_in_persist = 0; + } + if (rack->r_wanted_output) + reqr->rack_wanted_output = 1; + else + reqr->rack_wanted_output = 0; + return (1); + break; + default: + return (-EINVAL); + } +} + +static void +rack_switch_failed(struct tcpcb *tp) { + /* + * This method gets called if a stack switch was + * attempted and it failed. We are left + * but our hpts timers were stopped and we + * need to validate time units and inp_flags2. 
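
The rack_init_retransmit_value() comment earlier in this hunk describes the ctl == 2 mode as taking the bytes the measured goodput can move in one pacer granule and rounding that up to a whole number of segments. A minimal stand-alone sketch of just that rounding step; the helper name is illustrative and not part of the patch, and the 8125/1448 figures are the ones the comment itself uses:

#include <stdio.h>
#include <stdint.h>

/* Round a byte count up to the next whole multiple of the MSS. */
static uint32_t
round_up_to_mss(uint32_t bytes, uint32_t mss)
{
	return (((bytes + mss - 1) / mss) * mss);
}

int
main(void)
{
	uint32_t bytes = 8125;	/* bytes per pacer granule from the comment's example */
	uint32_t mss = 1448;

	/* 8125 bytes -> 6 segments -> 8688 bytes, matching the comment. */
	printf("%u bytes rounds up to %u segments = %u bytes\n",
	    bytes, round_up_to_mss(bytes, mss) / mss,
	    round_up_to_mss(bytes, mss));
	return (0);
}
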
+ */ struct inpcb *inp = tptoinpcb(tp); - struct tcp_rack *rack = NULL; + struct tcp_rack *rack; + struct timeval tv; + uint32_t cts; + uint32_t toval; + struct hpts_diag diag; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); + if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) + inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + else + inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + if (inp->inp_in_hpts) { + /* Strange */ + return; + } + cts = tcp_get_usecs(&tv); + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { + toval = rack->r_ctl.rc_last_output_to - cts; + } else { + /* one slot please */ + toval = HPTS_TICKS_PER_SLOT; + } + } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { + toval = rack->r_ctl.rc_timer_exp - cts; + } else { + /* one slot please */ + toval = HPTS_TICKS_PER_SLOT; + } + } else + toval = HPTS_TICKS_PER_SLOT; + (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval), + __LINE__, &diag); + rack_log_hpts_diag(rack, cts, &diag, &tv); +} + +static int +rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) +{ + struct rack_sendmap *rsm, *ersm; + int insret __diagused; + /* + * When initing outstanding, we must be quite careful + * to not refer to tp->t_fb_ptr. This has the old rack + * pointer in it, not the "new" one (when we are doing + * a stack switch). + */ + + + if (tp->t_fb->tfb_chg_query == NULL) { + /* Create a send map for the current outstanding data */ + + rsm = rack_alloc(rack); + if (rsm == NULL) { + uma_zfree(rack_pcb_zone, ptr); + return (ENOMEM); + } + rsm->r_no_rtt_allowed = 1; + rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); + rsm->r_rtr_cnt = 1; + rsm->r_rtr_bytes = 0; + if (tp->t_flags & TF_SENTFIN) + rsm->r_flags |= RACK_HAS_FIN; + rsm->r_end = tp->snd_max; + if (tp->snd_una == tp->iss) { + /* The data space is one beyond snd_una */ + rsm->r_flags |= RACK_HAS_SYN; + rsm->r_start = tp->iss; + rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); + } else + rsm->r_start = tp->snd_una; + rsm->r_dupack = 0; + if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { + rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); + if (rsm->m) { + rsm->orig_m_len = rsm->m->m_len; + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } else { + rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + } + } else { + /* + * This can happen if we have a stand-alone FIN or + * SYN. 
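
rack_chg_query() above answers TCP_QUERY_SENDMAP one entry at a time, and the query branch of rack_init_outstanding() just below walks the old stack's map by repeatedly asking for the entry starting at snd_una and advancing to each returned sendmap_end. A rough user-space sketch of that walk shape with mocked-up types and a fake responder; none of these names are the real kernel interface, and sequence wraparound is ignored:

#include <stdio.h>
#include <stdint.h>

/* Mock of the query request/response, loosely modeled on the patch. */
struct mock_query {
	uint32_t req_param;	/* sequence number to look up */
	uint32_t sendmap_start;
	uint32_t sendmap_end;
};

/* Fake responder: pretend the old stack tracked 1448-byte blocks. */
static int
mock_chg_query(struct mock_query *qr, uint32_t snd_max)
{
	if (qr->req_param >= snd_max)
		return (0);	/* nothing at or beyond snd_max */
	qr->sendmap_start = qr->req_param;
	qr->sendmap_end = qr->req_param + 1448;
	if (qr->sendmap_end > snd_max)
		qr->sendmap_end = snd_max;
	return (1);
}

int
main(void)
{
	uint32_t snd_una = 1000, snd_max = 5000, at = snd_una;
	struct mock_query qr;

	/* Same loop shape as the query branch: walk snd_una..snd_max. */
	while (at != snd_max) {
		qr.req_param = at;
		if (mock_chg_query(&qr, snd_max) == 0)
			break;
		printf("entry [%u, %u)\n", qr.sendmap_start, qr.sendmap_end);
		at = qr.sendmap_end;	/* move forward */
	}
	return (0);
}
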
+ */ + rsm->m = NULL; + rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + rsm->soff = 0; + } #ifdef INVARIANTS - struct rack_sendmap *insret; + if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { + panic("Insert in rb tree fails ret:%d rack:%p rsm:%p", + insret, rack, rsm); + } +#else + (void)tqhash_insert(rack->r_ctl.tqh, rsm); #endif + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + } else { + /* We have a query mechanism, lets use it */ + struct tcp_query_resp qr; + int i; + tcp_seq at; + + at = tp->snd_una; + while (at != tp->snd_max) { + memset(&qr, 0, sizeof(qr)); + qr.req = TCP_QUERY_SENDMAP; + qr.req_param = at; + if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0) + break; + /* Move forward */ + at = qr.sendmap_end; + /* Now lets build the entry for this one */ + rsm = rack_alloc(rack); + if (rsm == NULL) { + uma_zfree(rack_pcb_zone, ptr); + return (ENOMEM); + } + memset(rsm, 0, sizeof(struct rack_sendmap)); + /* Now configure the rsm and insert it */ + rsm->r_dupack = qr.sendmap_dupacks; + rsm->r_start = qr.sendmap_start; + rsm->r_end = qr.sendmap_end; + if (qr.sendmap_fas) + rsm->r_fas = qr.sendmap_end; + else + rsm->r_fas = rsm->r_start - tp->snd_una; + /* + * We have carefully aligned the bits + * so that all we have to do is copy over + * the bits with the mask. + */ + rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK; + rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes; + rsm->r_rtr_cnt = qr.sendmap_send_cnt; + rsm->r_ack_arrival = qr.sendmap_ack_arrival; + for (i=0 ; i<rsm->r_rtr_cnt; i++) + rsm->r_tim_lastsent[i] = qr.sendmap_time[i]; + rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, + (rsm->r_start - tp->snd_una), &rsm->soff); + if (rsm->m) { + rsm->orig_m_len = rsm->m->m_len; + rsm->orig_t_space = M_TRAILINGROOM(rsm->m); + } else { + rsm->orig_m_len = 0; + rsm->orig_t_space = 0; + } +#ifdef INVARIANTS + if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { + panic("Insert in rb tree fails ret:%d rack:%p rsm:%p", + insret, rack, rsm); + } +#else + (void)tqhash_insert(rack->r_ctl.tqh, rsm); +#endif + if ((rsm->r_flags & RACK_ACKED) == 0) { + TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) { + if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] > + rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) { + /* + * If the existing ersm was sent at + * a later time than the new one, then + * the new one should appear ahead of this + * ersm. + */ + rsm->r_in_tmap = 1; + TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext); + break; + } + } + if (rsm->r_in_tmap == 0) { + /* + * Not found so shove it on the tail. + */ + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + } + } else { + if ((rack->r_ctl.rc_sacklast == NULL) || + (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) { + rack->r_ctl.rc_sacklast = rsm; + } + } + rack_log_chg_info(tp, rack, 3, + rsm->r_start, + rsm->r_end, + rsm->r_flags); + } + } + return (0); +} + +static void +rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval) +{ + /* + * P = percent bits + * F = fill cw bit -- Toggle fillcw if this bit is set. + * S = Segment bits + * M = set max segment bit + * U = Unclamined + * C = If set to non-zero override the max number of clamps. + * L = Bit to indicate if clamped gets lower. + * + * CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP + * + * The lowest 3 nibbles is the perentage .1 - 6553.5% + * where 10.1 = 101, max 6553.5 + * The upper 16 bits holds some options. + * The F bit will turn on fill-cw on if you are + * not pacing, it will turn it off if dgp is on. 
+ * The L bit will change it so when clamped we get + * the min(gp, lt-bw) for dgp. + */ + uint16_t per; + + rack->r_ctl.saved_rxt_clamp_val = optval; + per = optval & 0x0000ffff; + rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff); + if (optval > 0) { + uint16_t clamp_opt; + + rack->excess_rxt_on = 1; + clamp_opt = ((optval & 0xffff0000) >> 16); + rack->r_ctl.clamp_options = clamp_opt & 0x00ff; + if (clamp_opt & 0xff00) { + /* A max clamps is also present */ + rack->r_ctl.max_clamps = (clamp_opt >> 8); + } else { + /* No specified clamps means no limit */ + rack->r_ctl.max_clamps = 0; + } + if (rack->r_ctl.clamp_options & 0x0002) { + rack->r_clamped_gets_lower = 1; + } else { + rack->r_clamped_gets_lower = 0; + } + } else { + /* Turn it off back to default */ + rack->excess_rxt_on = 0; + rack->r_clamped_gets_lower = 0; + } + +} + + +static int32_t +rack_init(struct tcpcb *tp, void **ptr) +{ + struct inpcb *inp = tptoinpcb(tp); + struct tcp_rack *rack = NULL; uint32_t iwin, snt, us_cts; - int err; + int err, no_query; + /* + * First are we the initial or are we a switched stack? + * If we are initing via tcp_newtcppcb the ptr passed + * will be tp->t_fb_ptr. If its a stack switch that + * has a previous stack we can query it will be a local + * var that will in the end be set into t_fb_ptr. + */ + if (ptr == &tp->t_fb_ptr) + no_query = 1; + else + no_query = 0; *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (*ptr == NULL) { /* @@ -12362,12 +14884,16 @@ rack_init(struct tcpcb *tp, void **ptr) * scheme to drop the locks fails :( * */ - return (ENOMEM); + return(ENOMEM); } - memset(ptr, 0, sizeof(struct tcp_rack)); - - rack = (struct tcp_rack *)ptr; - RB_INIT(&rack->r_ctl.rc_mtree); + memset(*ptr, 0, sizeof(struct tcp_rack)); + rack = (struct tcp_rack *)*ptr; + rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); + if (rack->r_ctl.tqh == NULL) { + uma_zfree(rack_pcb_zone, rack); + return(ENOMEM); + } + tqhash_init(rack->r_ctl.tqh); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; @@ -12387,7 +14913,11 @@ rack_init(struct tcpcb *tp, void **ptr) * rc_pacing_cc_set. That way whenever we turn off pacing * or switch off this stack, we will know to go restore * the saved values. + * + * We specifically put into the beta the ecn value for pacing. 
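
A stand-alone decoder for the rack_translate_clamp_value() layout described above; it extracts the same fields the function does (threshold in tenths of a percent in the low 16 bits, an option byte, and a max-clamp byte in the upper half), with the bit meanings taken from the comment. Purely an illustrative sketch, not part of the patch:

#include <stdio.h>
#include <stdint.h>

static void
decode_clamp_optval(uint32_t optval)
{
	unsigned per = optval & 0x0000ffff;		/* threshold, tenths of a percent */
	unsigned clamp_opt = (optval & 0xffff0000) >> 16;
	unsigned options = clamp_opt & 0x00ff;		/* option bits (F, L, ...) */
	unsigned max_clamps = clamp_opt >> 8;		/* 0 means no limit */

	printf("threshold %u.%u%% options 0x%02x max_clamps %u%s\n",
	    per / 10, per % 10, options, max_clamps,
	    max_clamps ? "" : " (unlimited)");
	if (options & 0x0002)
		printf("  L bit set: clamped cwnd uses min(gp, lt-bw)\n");
}

int
main(void)
{
	/* 10.1% threshold, L bit set, at most 3 clamps. */
	decode_clamp_optval((3u << 24) | (0x02u << 16) | 101u);
	return (0);
}
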
*/ + rack->rc_new_rnd_needed = 1; + rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; /* We want abe like behavior as well */ @@ -12395,9 +14925,21 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; - rack->r_ctl.roundends = tp->snd_max; + if (rack_rxt_clamp_thresh) { + rack_translate_clamp_value(rack, rack_rxt_clamp_thresh); + rack->excess_rxt_on = 1; + } + if (rack_uses_full_dgp_in_rec) + rack->r_ctl.full_dgp_in_rec = 1; + if (rack_fill_cw_state) + rack->rc_pace_to_cwnd = 1; + if (rack_pacing_min_seg) + rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; if (use_rack_rr) rack->use_rack_rr = 1; + if (rack_dnd_default) { + rack->rc_pace_dnd = 1; + } if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else @@ -12407,13 +14949,16 @@ rack_init(struct tcpcb *tp, void **ptr) tp->t_flags2 |= TF2_TCP_ACCOUNTING; } #endif + rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; + rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; if (rack_enable_shared_cwnd) rack->rack_enable_scwnd = 1; + rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; rack->rc_user_set_max_segs = rack_hptsi_segments; rack->rc_force_max_seg = 0; - if (rack_use_imac_dack) - rack->rc_dack_mode = 1; TAILQ_INIT(&rack->r_ctl.opt_list); + if (rack_hibeta_setting) + rack->rack_hibeta = 1; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; @@ -12429,7 +14974,7 @@ rack_init(struct tcpcb *tp, void **ptr) rack->rc_gp_no_rec_chg = 1; if (rack_pace_every_seg && tcp_can_enable_pacing()) { rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) + if ((rack->gp_ready) && (rack->rc_always_pace && (rack->use_fixed_rate == 0))) rack_set_cc_pacing(rack); } else rack->rc_always_pace = 0; @@ -12437,18 +14982,13 @@ rack_init(struct tcpcb *tp, void **ptr) rack->r_mbuf_queue = 1; else rack->r_mbuf_queue = 0; - if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) - inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; - else - inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; rack_set_pace_segments(tp, rack, __LINE__, NULL); if (rack_limits_scwnd) rack->r_limit_scw = 1; else rack->r_limit_scw = 0; + rack_init_retransmit_value(rack, rack_rxt_controls); rack->rc_labc = V_tcp_abc_l_var; - rack->r_ctl.rc_high_rwnd = tp->snd_wnd; - rack->r_ctl.cwnd_to_use = tp->snd_cwnd; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; @@ -12493,70 +15033,103 @@ rack_init(struct tcpcb *tp, void **ptr) rack->rack_hdw_pace_ena = 1; if (rack_hw_rate_caps) rack->r_rack_hw_rate_caps = 1; - /* Do we force on detection? 
*/ #ifdef NETFLIX_EXP_DETECTION - if (tcp_force_detection) - rack->do_detection = 1; - else + rack->do_detection = 1; +#else + rack->do_detection = 0; #endif - rack->do_detection = 0; if (rack_non_rxt_use_cr) rack->rack_rec_nonrxt_use_cr = 1; + /* Lets setup the fsb block */ err = rack_init_fsb(tp, rack); if (err) { - uma_zfree(rack_pcb_zone, tp->t_fb_ptr); - tp->t_fb_ptr = NULL; + uma_zfree(rack_pcb_zone, *ptr); + *ptr = NULL; return (err); } - if (tp->snd_una != tp->snd_max) { - /* Create a send map for the current outstanding data */ - struct rack_sendmap *rsm; - - rsm = rack_alloc(rack); - if (rsm == NULL) { - uma_zfree(rack_pcb_zone, tp->t_fb_ptr); - tp->t_fb_ptr = NULL; - return (ENOMEM); - } - rsm->r_no_rtt_allowed = 1; - rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); - rsm->r_rtr_cnt = 1; - rsm->r_rtr_bytes = 0; - if (tp->t_flags & TF_SENTFIN) - rsm->r_flags |= RACK_HAS_FIN; - if ((tp->snd_una == tp->iss) && - !TCPS_HAVEESTABLISHED(tp->t_state)) - rsm->r_flags |= RACK_HAS_SYN; - rsm->r_start = tp->snd_una; - rsm->r_end = tp->snd_max; - rsm->r_dupack = 0; - if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { - rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); - if (rsm->m) - rsm->orig_m_len = rsm->m->m_len; - else - rsm->orig_m_len = 0; - } else { + if (rack_do_hystart) { + tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; + if (rack_do_hystart > 1) + tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; + if (rack_do_hystart > 2) + tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; + } + /* Log what we will do with queries */ + rack_log_chg_info(tp, rack, 7, + no_query, 0, 0); + if (rack_def_profile) + rack_set_profile(rack, rack_def_profile); + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; + if ((tp->t_state != TCPS_CLOSED) && + (tp->t_state != TCPS_TIME_WAIT)) { + /* + * We are already open, we may + * need to adjust a few things. + */ + if (SEQ_GT(tp->snd_max, tp->iss)) + snt = tp->snd_max - tp->iss; + else + snt = 0; + iwin = rc_init_window(rack); + if ((snt < iwin) && + (no_query == 1)) { + /* We are not past the initial window + * on the first init (i.e. a stack switch + * has not yet occured) so we need to make + * sure cwnd and ssthresh is correct. + */ + if (tp->snd_cwnd < iwin) + tp->snd_cwnd = iwin; /* - * This can happen if we have a stand-alone FIN or - * SYN. + * If we are within the initial window + * we want ssthresh to be unlimited. Setting + * it to the rwnd (which the default stack does + * and older racks) is not really a good idea + * since we want to be in SS and grow both the + * cwnd and the rwnd (via dynamic rwnd growth). If + * we set it to the rwnd then as the peer grows its + * rwnd we will be stuck in CA and never hit SS. + * + * Its far better to raise it up high (this takes the + * risk that there as been a loss already, probably + * we should have an indicator in all stacks of loss + * but we don't), but considering the normal use this + * is a risk worth taking. The consequences of not + * hitting SS are far worse than going one more time + * into it early on (before we have sent even a IW). + * It is highly unlikely that we will have had a loss + * before getting the IW out. 
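
A compact restatement of the initial-window fix-up argued for in the comment above, as a stand-alone helper: if fewer than an initial window of bytes have been sent and this is the first init (not a stack switch), raise cwnd to the IW and leave ssthresh effectively unlimited so the connection starts in slow start. Names here are illustrative; the 0xffffffff sentinel is the one the patch uses:

#include <stdio.h>
#include <stdint.h>

static void
fixup_initial_window(uint32_t snt, uint32_t iwin, int no_query,
    uint32_t *cwnd, uint32_t *ssthresh)
{
	if ((snt < iwin) && (no_query == 1)) {
		if (*cwnd < iwin)
			*cwnd = iwin;
		/* Keep ssthresh out of the way so we stay in slow start. */
		*ssthresh = 0xffffffff;
	}
}

int
main(void)
{
	uint32_t cwnd = 4380, ssthresh = 65535;

	fixup_initial_window(0, 10 * 1448, 1, &cwnd, &ssthresh);
	printf("cwnd %u ssthresh %u\n", cwnd, ssthresh);
	return (0);
}
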
*/ - rsm->m = NULL; - rsm->orig_m_len = 0; - rsm->soff = 0; + tp->snd_ssthresh = 0xffffffff; } -#ifndef INVARIANTS - (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); -#else - insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - if (insret != NULL) { - panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", - insret, rack, rsm); + /* + * Any init based on sequence numbers + * should be done in the deferred init path + * since we can be CLOSED and not have them + * inited when rack_init() is called. We + * are not closed so lets call it. + */ + rack_deferred_init(tp, rack); + } + if ((tp->t_state != TCPS_CLOSED) && + (tp->t_state != TCPS_TIME_WAIT) && + (no_query == 0) && + (tp->snd_una != tp->snd_max)) { + err = rack_init_outstanding(tp, rack, us_cts, *ptr); + if (err) { + *ptr = NULL; + return(err); } -#endif - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); - rsm->r_in_tmap = 1; } + rack_stop_all_timers(tp, rack); + /* Setup all the inp_flags2 */ + if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) + tptoinpcb(tp)->inp_flags2 |= INP_SUPPORTS_MBUFQ; + else + tptoinpcb(tp)->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; /* * Timers in Rack are kept in microseconds so lets * convert any initial incoming variables @@ -12567,57 +15140,110 @@ rack_init(struct tcpcb *tp, void **ptr) * these to get the full precision. */ rack_convert_rtts(tp); - tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); - if (rack_do_hystart) { - tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; - if (rack_do_hystart > 1) - tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; - if (rack_do_hystart > 2) - tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; + rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); + if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { + /* We do not start any timers on DROPPED connections */ + if (tp->t_fb->tfb_chg_query == NULL) { + rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); + } else { + struct tcp_query_resp qr; + int ret; + + memset(&qr, 0, sizeof(qr)); + + /* Get the misc time stamps and such for rack */ + qr.req = TCP_QUERY_RACK_TIMES; + ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); + if (ret == 1) { + rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; + rack->r_ctl.num_dsack = qr.rack_num_dsacks; + rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; + rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; + rack->rc_rack_rtt = qr.rack_rtt; + rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; + rack->r_ctl.rc_sacked = qr.rack_sacked; + rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; + rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; + rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; + rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; + rack->r_ctl.rc_prr_out = qr.rack_prr_out; + if (qr.rack_tlp_out) { + rack->rc_tlp_in_progress = 1; + rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; + } else { + rack->rc_tlp_in_progress = 0; + rack->r_ctl.rc_tlp_cnt_out = 0; + } + if (qr.rack_srtt_measured) + rack->rc_srtt_measure_made = 1; + if (qr.rack_in_persist == 1) { + rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; +#ifdef NETFLIX_SHARED_CWND + if (rack->r_ctl.rc_scw) { + tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 1; + } +#endif + rack->r_ctl.persist_lost_ends = 0; + rack->probe_not_answered = 0; + rack->forced_ack = 0; + tp->t_rxtshift = 0; + rack->rc_in_persist = 1; + 
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); + } + if (qr.rack_wanted_output) + rack->r_wanted_output = 1; + rack_log_chg_info(tp, rack, 6, + qr.rack_min_rtt, + qr.rack_rtt, + qr.rack_reorder_ts); + } + /* Get the old stack timers */ + qr.req_param = 0; + qr.req = TCP_QUERY_TIMERS_UP; + ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); + if (ret) { + /* + * non-zero return means we have a timer('s) + * to start. Zero means no timer (no keepalive + * I suppose). + */ + uint32_t tov = 0; + + rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; + if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { + rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; + if (TSTMP_GT(qr.timer_pacing_to, us_cts)) + tov = qr.timer_pacing_to - us_cts; + else + tov = HPTS_TICKS_PER_SLOT; + } + if (qr.timer_hpts_flags & PACE_TMR_MASK) { + rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; + if (tov == 0) { + if (TSTMP_GT(qr.timer_timer_exp, us_cts)) + tov = qr.timer_timer_exp - us_cts; + else + tov = HPTS_TICKS_PER_SLOT; + } + } + rack_log_chg_info(tp, rack, 4, + rack->r_ctl.rc_hpts_flags, + rack->r_ctl.rc_last_output_to, + rack->r_ctl.rc_timer_exp); + if (tov) { + struct hpts_diag diag; + + (void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov), + __LINE__, &diag); + rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); + } + } + } + rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, + __LINE__, RACK_RTTS_INIT); } - if (rack_def_profile) - rack_set_profile(rack, rack_def_profile); - /* Cancel the GP measurement in progress */ - tp->t_flags &= ~TF_GPUTINPROG; - if (SEQ_GT(tp->snd_max, tp->iss)) - snt = tp->snd_max - tp->iss; - else - snt = 0; - iwin = rc_init_window(rack); - if (snt < iwin) { - /* We are not past the initial window - * so we need to make sure cwnd is - * correct. - */ - if (tp->snd_cwnd < iwin) - tp->snd_cwnd = iwin; - /* - * If we are within the initial window - * we want ssthresh to be unlimited. Setting - * it to the rwnd (which the default stack does - * and older racks) is not really a good idea - * since we want to be in SS and grow both the - * cwnd and the rwnd (via dynamic rwnd growth). If - * we set it to the rwnd then as the peer grows its - * rwnd we will be stuck in CA and never hit SS. - * - * Its far better to raise it up high (this takes the - * risk that there as been a loss already, probably - * we should have an indicator in all stacks of loss - * but we don't), but considering the normal use this - * is a risk worth taking. The consequences of not - * hitting SS are far worse than going one more time - * into it early on (before we have sent even a IW). - * It is highly unlikely that we will have had a loss - * before getting the IW out. 
- */ - tp->snd_ssthresh = 0xffffffff; - } - rack_stop_all_timers(tp); - /* Lets setup the fsb block */ - rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); - rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, - __LINE__, RACK_RTTS_INIT); return (0); } @@ -12660,42 +15286,25 @@ rack_handoff_ok(struct tcpcb *tp) return (EINVAL); } - static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { - struct inpcb *inp = tptoinpcb(tp); if (tp->t_fb_ptr) { + uint32_t cnt_free = 0; struct tcp_rack *rack; - struct rack_sendmap *rsm, *nrsm; -#ifdef INVARIANTS - struct rack_sendmap *rm; -#endif + struct rack_sendmap *rsm; - rack = (struct tcp_rack *)tp->t_fb_ptr; - if (tp->t_in_pkt) { - /* - * It is unsafe to process the packets since a - * reset may be lurking in them (its rare but it - * can occur). If we were to find a RST, then we - * would end up dropping the connection and the - * INP lock, so when we return the caller (tcp_usrreq) - * will blow up when it trys to unlock the inp. - */ - struct mbuf *save, *m; - - m = tp->t_in_pkt; - tp->t_in_pkt = NULL; - tp->t_tail_pkt = NULL; - while (m) { - save = m->m_nextpkt; - m->m_nextpkt = NULL; - m_freem(m); - m = save; - } - } + tcp_handle_orphaned_packets(tp); tp->t_flags &= ~TF_FORCEDATA; + rack = (struct tcp_rack *)tp->t_fb_ptr; + rack_log_pacing_delay_calc(rack, + 0, + 0, + 0, + rack_get_gp_est(rack), /* delRate */ + rack_get_lt_bw(rack), /* rttProp */ + 20, __LINE__, NULL, 0); #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { uint32_t limit; @@ -12715,51 +15324,6 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) rack->r_ctl.fsb.tcp_ip_hdr = NULL; rack->r_ctl.fsb.th = NULL; } - /* Convert back to ticks, with */ - if (tp->t_srtt > 1) { - uint32_t val, frac; - - val = USEC_2_TICKS(tp->t_srtt); - frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); - tp->t_srtt = val << TCP_RTT_SHIFT; - /* - * frac is the fractional part here is left - * over from converting to hz and shifting. - * We need to convert this to the 5 bit - * remainder. - */ - if (frac) { - if (hz == 1000) { - frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); - } else { - frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); - } - tp->t_srtt += frac; - } - } - if (tp->t_rttvar) { - uint32_t val, frac; - - val = USEC_2_TICKS(tp->t_rttvar); - frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); - tp->t_rttvar = val << TCP_RTTVAR_SHIFT; - /* - * frac is the fractional part here is left - * over from converting to hz and shifting. - * We need to convert this to the 5 bit - * remainder. 
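
The block being removed here converted the microsecond srtt back to the classic ticks << TCP_RTT_SHIFT form, folding the sub-tick remainder into the low 5 bits. A small sketch of that arithmetic for the hz = 1000 case; TCP_RTT_SHIFT = 5 and TCP_RTT_SCALE = 32 are assumed to match the stock headers, and this is an illustration rather than the kernel code:

#include <stdio.h>
#include <stdint.h>

#define TCP_RTT_SHIFT	5
#define TCP_RTT_SCALE	32	/* 1 << TCP_RTT_SHIFT */

/* Convert a microsecond srtt into ticks << TCP_RTT_SHIFT, hz = 1000. */
static uint32_t
usec_srtt_to_scaled_ticks(uint32_t srtt_us)
{
	uint32_t ticks = srtt_us / 1000;	/* whole millisecond ticks */
	uint32_t frac = srtt_us % 1000;		/* leftover microseconds */
	uint32_t scaled = ticks << TCP_RTT_SHIFT;

	/* Fold the remainder into the 5 fractional bits. */
	if (frac)
		scaled += (uint32_t)(((uint64_t)frac * TCP_RTT_SCALE) / 1000);
	return (scaled);
}

int
main(void)
{
	/* 123456 us -> 123 ticks + 456 us -> (123 << 5) + 14 = 3950 */
	printf("%u\n", usec_srtt_to_scaled_ticks(123456));
	return (0);
}
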
- */ - if (frac) { - if (hz == 1000) { - frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); - } else { - frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); - } - tp->t_rttvar += frac; - } - } - tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); - tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); if (rack->rc_always_pace) { tcp_decrement_paced_conn(); rack_undo_cc_pacing(rack); @@ -12782,35 +15346,57 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif - RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { -#ifndef INVARIANTS - (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); -#else - rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); - if (rm != rsm) { - panic("At fini, rack:%p rsm:%p rm:%p", - rack, rsm, rm); - } -#endif + /* + * Lets take a different approach to purging just + * get each one and free it like a cum-ack would and + * not use a foreach loop. + */ + rsm = tqhash_min(rack->r_ctl.tqh); + while (rsm) { + tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); + rack->r_ctl.rc_num_maps_alloced--; uma_zfree(rack_zone, rsm); + rsm = tqhash_min(rack->r_ctl.tqh); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); + rack->r_ctl.rc_num_maps_alloced--; + rack->rc_free_cnt--; + cnt_free++; uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } + if ((rack->r_ctl.rc_num_maps_alloced > 0) && + (tcp_bblogging_on(tp))) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex8 = 10; + log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; + log.u_bbr.flex2 = rack->rc_free_cnt; + log.u_bbr.flex3 = cnt_free; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + rsm = tqhash_min(rack->r_ctl.tqh); + log.u_bbr.delRate = (uint64_t)rsm; + rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); + log.u_bbr.cur_del_rate = (uint64_t)rsm; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.pkt_epoch = __LINE__; + (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, + 0, &log, false, NULL, NULL, 0, &tv); + } + KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), + ("rack:%p num_aloc:%u after freeing all?", + rack, + rack->r_ctl.rc_num_maps_alloced)); rack->rc_free_cnt = 0; + free(rack->r_ctl.tqh, M_TCPFSB); + rack->r_ctl.tqh = NULL; uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } - inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; - inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; - inp->inp_flags2 &= ~INP_MBUF_ACKCMP; - /* Cancel the GP measurement in progress */ - tp->t_flags &= ~TF_GPUTINPROG; - inp->inp_flags2 &= ~INP_MBUF_L_ACKS; /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } @@ -12836,7 +15422,6 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) rack->r_substate = rack_do_established; break; case TCPS_CLOSE_WAIT: - rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_CLOSE_WAIT; rack->r_substate = rack_do_close_wait; break; @@ -12856,7 +15441,6 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) rack->r_substate = rack_do_lastack; break; case TCPS_FIN_WAIT_2: - rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_FIN_WAIT_2; rack->r_substate = rack_do_fin_wait_2; break; @@ -12956,7 +15540,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) static 
void -rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) +rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) { if ((SEQ_LT(tp->snd_wl1, seq) || (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || @@ -12980,6 +15564,8 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin /* Not a valid win update */ return; } + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; /* Do we exit persists? */ if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), @@ -12999,7 +15585,7 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin * nothing is outstanding, and there is * data to send. Enter persists. */ - rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); } } @@ -13036,6 +15622,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->r_might_revert; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; + log.u_bbr.bbr_state = rack->rc_free_cnt; log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; @@ -13236,7 +15823,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb int win_up_req = 0; #endif int nsegs = 0; - int under_pacing = 1; + int under_pacing = 0; int recovery = 0; #ifdef TCP_ACCOUNTING sched_pin(); @@ -13244,8 +15831,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->gp_ready && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) - under_pacing = 0; - else under_pacing = 1; if (rack->r_state != tp->t_state) @@ -13302,6 +15887,14 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb #endif rack_clear_rate_sample(rack); ae = ((mtod(m, struct tcp_ackent *)) + i); + if (ae->flags & TH_FIN) + rack_log_pacing_delay_calc(rack, + 0, + 0, + 0, + rack_get_gp_est(rack), /* delRate */ + rack_get_lt_bw(rack), /* rttProp */ + 20, __LINE__, NULL, 0); /* Setup the window */ tiwin = ae->win << tp->snd_scale; if (tiwin > rack->r_ctl.rc_high_rwnd) @@ -13320,6 +15913,10 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb /* Case C */ ae->ack_val_set = ACK_RWND; } + if (rack->sack_attack_disable > 0) { + rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); + rack->r_ctl.ack_during_sd++; + } rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); /* Validate timestamp */ if (ae->flags & HAS_TSTMP) { @@ -13395,6 +15992,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb */ if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + if (rack->r_ctl.rc_reorder_ts == 0) + rack->r_ctl.rc_reorder_ts = 1; } } else if (ae->ack_val_set == ACK_DUPACK) { /* Case D */ @@ -13419,7 +16018,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb win_upd_ack = ae->ack; win_seq = ae->seq; the_win = tiwin; - rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); + rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); } else { /* Case A */ if (SEQ_GT(ae->ack, tp->snd_max)) { @@ -13439,7 
+16038,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb win_upd_ack = ae->ack; win_seq = ae->seq; the_win = tiwin; - rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); + rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); } #ifdef TCP_ACCOUNTING /* Account for the acks */ @@ -13448,35 +16047,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb } #endif high_seq = ae->ack; - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = high_seq; - log.u_bbr.flex2 = rack->r_ctl.roundends; - log.u_bbr.flex3 = rack->r_ctl.current_round; - log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround; - log.u_bbr.flex8 = 8; - tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - /* - * The draft (v3) calls for us to use SEQ_GEQ, but that - * causes issues when we are just going app limited. Lets - * instead use SEQ_GT <or> where its equal but more data - * is outstanding. - */ - if ((SEQ_GT(high_seq, rack->r_ctl.roundends)) || - ((high_seq == rack->r_ctl.roundends) && - SEQ_GT(tp->snd_max, tp->snd_una))) { - rack->r_ctl.current_round++; - rack->r_ctl.roundends = tp->snd_max; - if (CC_ALGO(tp)->newround != NULL) { - CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); - } - } + if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) + rack_log_hystart_event(rack, high_seq, 8); /* Setup our act_rcv_time */ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { ts.tv_sec = ae->timestamp / 1000000000; @@ -13486,7 +16058,11 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb } else { rack->r_ctl.act_rcv_time = *tv; } - rack_process_to_cumack(tp, rack, ae->ack, cts, to); + rack_process_to_cumack(tp, rack, ae->ack, cts, to, + tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); +#ifdef TCP_REQUEST_TRK + rack_http_check_for_comp(rack, high_seq); +#endif if (rack->rc_dsack_round_seen) { /* Is the dsack round over? */ if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { @@ -13516,7 +16092,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb /* Tend to any collapsed window */ if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { /* The peer collapsed the window */ - rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__); + rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__); } else if (rack->rc_has_collapsed) rack_un_collapse_window(rack, __LINE__); if ((rack->r_collapse_point_valid) && @@ -13525,6 +16101,28 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb acked_amount = acked = (high_seq - tp->snd_una); if (acked) { /* + * The draft (v3) calls for us to use SEQ_GEQ, but that + * causes issues when we are just going app limited. Lets + * instead use SEQ_GT <or> where its equal but more data + * is outstanding. + * + * Also make sure we are on the last ack of a series. We + * have to have all the ack's processed in queue to know + * if there is something left outstanding. 
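
The comment above replaces the draft's plain SEQ_GEQ test with "past the recorded round end, not already waiting on a new round, and this is the last ack of the queued burst". Expressed as a predicate on its own; SEQ_GEQ is the usual wrapping sequence comparison and the function name is illustrative:

#include <stdio.h>
#include <stdint.h>

/* Standard wrapping TCP sequence-space comparison. */
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

/*
 * Advance the round counter only when the cumulative ack has reached
 * the recorded round end, a new round is not already pending on the
 * next send, and there are no more queued acks to process.
 */
static int
should_advance_round(uint32_t high_seq, uint32_t roundends,
    int new_rnd_needed, int nxt_pkt)
{
	return (SEQ_GEQ(high_seq, roundends) &&
	    (new_rnd_needed == 0) && (nxt_pkt == 0));
}

int
main(void)
{
	printf("%d\n", should_advance_round(5000, 5000, 0, 0));	/* 1: advance */
	printf("%d\n", should_advance_round(5000, 5000, 0, 1));	/* 0: more acks queued */
	return (0);
}
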
+ * + */ + if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && + (rack->rc_new_rnd_needed == 0) && + (nxt_pkt == 0)) { + rack_log_hystart_event(rack, high_seq, 21); + rack->r_ctl.current_round++; + /* Force the next send to setup the next round */ + rack->rc_new_rnd_needed = 1; + if (CC_ALGO(tp)->newround != NULL) { + CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); + } + } + /* * Clear the probe not answered flag * since cum-ack moved forward. */ @@ -13624,10 +16222,16 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb mfree = sbcut_locked(&so->so_snd, acked_amount); tp->snd_una = high_seq; /* Note we want to hold the sb lock through the sendmap adjust */ - rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + rack_adjust_sendmap_head(rack, &so->so_snd); /* Wake up the socket if we have room to write more */ rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); sowwakeup_locked(so); + if ((recovery == 1) && + (rack->excess_rxt_on) && + (rack->r_cwnd_was_clamped == 0)) { + do_rack_excess_rxt(tp, rack); + } else if (rack->r_cwnd_was_clamped) + do_rack_check_for_unclamp(tp, rack); m_freem(mfree); } /* update progress */ @@ -13651,9 +16255,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); -#ifdef NETFLIX_HTTP_LOGGING - tcp_http_check_for_comp(rack->rc_tp, high_seq); -#endif tp->snd_wl2 = high_seq; tp->t_dupacks = 0; if (under_pacing && @@ -13662,11 +16263,12 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb rack->rc_gp_dyn_mul && rack->rc_always_pace) { /* Check if we are dragging bottom */ - rack_check_bottom_drag(tp, rack, so, acked); + rack_check_bottom_drag(tp, rack, so); } if (tp->snd_una == tp->snd_max) { tp->t_flags &= ~TF_PREVVALID; rack->r_ctl.retran_during_recovery = 0; + rack->rc_suspicious = 0; rack->r_ctl.dsack_byte_cnt = 0; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) @@ -13916,6 +16518,15 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. */ rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->rack_deferred_inited == 0) { + /* + * If we are the connecting socket we will + * hit rack_init() when no sequence numbers + * are setup. This makes it so we must defer + * some initialization. Call that now. + */ + rack_deferred_init(tp, rack); + } if (m->m_flags & M_ACKCMP) { /* * All compressed ack's are ack's by definition so @@ -13924,6 +16535,20 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, rack->rc_ack_required = 0; return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); } + thflags = tcp_get_flags(th); + /* + * If there is a RST or FIN lets dump out the bw + * with a FIN the connection may go on but we + * may not. + */ + if ((thflags & TH_FIN) || (thflags & TH_RST)) + rack_log_pacing_delay_calc(rack, + rack->r_ctl.gp_bw, + 0, + 0, + rack_get_gp_est(rack), /* delRate */ + rack_get_lt_bw(rack), /* rttProp */ + 20, __LINE__, NULL, 0); if (m->m_flags & M_ACKCMP) { panic("Impossible reach m has ackcmp? 
m:%p tp:%p", m, tp); } @@ -13931,7 +16556,6 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, ms_cts = tcp_tv_to_mssectick(tv); nsegs = m->m_pkthdr.lro_nsegs; counter_u64_add(rack_proc_non_comp_ack, 1); - thflags = tcp_get_flags(th); #ifdef TCP_ACCOUNTING sched_pin(); if (thflags & TH_ACK) @@ -14035,6 +16659,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->r_might_revert; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; + log.u_bbr.bbr_state = rack->rc_free_cnt; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; log.u_bbr.flex3 = m->m_flags; @@ -14094,6 +16719,10 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Remove ack required flag if set, we have one */ if (thflags & TH_ACK) rack->rc_ack_required = 0; + if (rack->sack_attack_disable > 0) { + rack->r_ctl.ack_during_sd++; + rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); + } if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; retval = 0; @@ -14274,9 +16903,16 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, if (rack->r_state != tp->t_state) rack_set_state(tp, rack); if (SEQ_GT(th->th_ack, tp->snd_una) && - (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) + (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; + if ((thflags & TH_RST) && + ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { + /* The connection will be killed by a reset check the tracepoint */ + tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); + } retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt, iptos); @@ -14328,38 +16964,37 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, if (nxt_pkt == 0) { if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { do_output_now: - if (tcp_output(tp) < 0) + if (tcp_output(tp) < 0) { +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif return (1); + } did_out = 1; } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); rack_free_trim(rack); } /* Update any rounds needed */ - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = high_seq; - log.u_bbr.flex2 = rack->r_ctl.roundends; - log.u_bbr.flex3 = rack->r_ctl.current_round; - log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround; - log.u_bbr.flex8 = 9; - tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } + if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) + rack_log_hystart_event(rack, high_seq, 8); /* * The draft (v3) calls for us to use SEQ_GEQ, but that * causes issues when we are just going app limited. Lets * instead use SEQ_GT <or> where its equal but more data * is outstanding. + * + * Also make sure we are on the last ack of a series. We + * have to have all the ack's processed in queue to know + * if there is something left outstanding. 
*/ - if ((SEQ_GT(tp->snd_una, rack->r_ctl.roundends)) || - ((tp->snd_una == rack->r_ctl.roundends) && SEQ_GT(tp->snd_max, tp->snd_una))) { + if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && + (rack->rc_new_rnd_needed == 0) && + (nxt_pkt == 0)) { + rack_log_hystart_event(rack, tp->snd_una, 21); rack->r_ctl.current_round++; - rack->r_ctl.roundends = tp->snd_max; + /* Force the next send to setup the next round */ + rack->rc_new_rnd_needed = 1; if (CC_ALGO(tp)->newround != NULL) { CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); } @@ -14449,9 +17084,10 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt = 0, thresh = 0, ts_low = 0; + int no_sack = 0; /* Return the next guy to be re-transmitted */ - if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { + if (tqhash_empty(rack->r_ctl.tqh)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { @@ -14471,13 +17107,16 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) return (NULL); } check_it: - if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && + if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) || + (rack->sack_attack_disable > 0)) { + no_sack = 1; + } + if ((no_sack > 0) && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { /* * No sack so we automatically do the 3 strikes and * retransmit (no rack timer would be started). */ - return (rsm); } if (rsm->r_flags & RACK_ACKED) { @@ -14580,6 +17219,14 @@ rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; log.u_bbr.bbr_substate = quality; + log.u_bbr.bbr_state = rack->dgp_on; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->r_fill_less_agg; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; + log.u_bbr.bbr_state <<= 2; + log.u_bbr.bbr_state |= rack->r_pacing_discount; + log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -14591,7 +17238,7 @@ rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) { - uint32_t new_tso, user_max; + uint32_t new_tso, user_max, pace_one; user_max = rack->rc_user_set_max_segs * mss; if (rack->rc_force_max_seg) { @@ -14603,9 +17250,23 @@ rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) /* Use the user mss since we are not exactly matched */ return (user_max); } - new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); + if (rack_pace_one_seg || + (rack->r_ctl.rc_user_set_min_segs == 1)) + pace_one = 1; + else + pace_one = 0; + + new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss, + pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); if (new_tso > user_max) new_tso = user_max; + if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) { + if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso) + new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss; + } + if (rack->r_ctl.rc_user_set_min_segs && + ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso)) + new_tso = rack->r_ctl.rc_user_set_min_segs * mss; return (new_tso); } @@ -14630,30 +17291,40 @@ pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t se } /* * first lets calculate the b/w based on the last 
us-rtt - * and the sndwnd. + * and the the smallest send window. */ - fill_bw = rack->r_ctl.cwnd_to_use; + fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); /* Take the rwnd if its smaller */ if (fill_bw > rack->rc_tp->snd_wnd) fill_bw = rack->rc_tp->snd_wnd; + /* Now lets make it into a b/w */ + fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; + fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; if (rack->r_fill_less_agg) { /* - * Now take away the inflight (this will reduce our - * aggressiveness and yeah, if we get that much out in 1RTT - * we will have had acks come back and still be behind). + * We want the average of the rate_wanted + * and our fill-cw calculated bw. We also want + * to cap any increase to be no more than + * X times the lt_bw (where X is the rack_bw_multipler). */ - fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + uint64_t lt_bw, rate; + + lt_bw = rack_get_lt_bw(rack); + if (lt_bw > *rate_wanted) + rate = lt_bw; + else + rate = *rate_wanted; + fill_bw += rate; + fill_bw /= 2; + if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) { + fill_bw = rate * rack_bw_multipler; + } } - /* Now lets make it into a b/w */ - fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; - fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; /* We are below the min b/w */ if (non_paced) *rate_wanted = fill_bw; if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) return (slot); - if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) - fill_bw = rack->r_ctl.bw_rate_cap; rack->r_via_fill_cw = 1; if (rack->r_rack_hw_rate_caps && (rack->r_ctl.crte != NULL)) { @@ -14695,9 +17366,15 @@ pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t se } } } + if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL); + fill_bw = rack->r_ctl.bw_rate_cap; + } /* * Ok fill_bw holds our mythical b/w to fill the cwnd - * in a rtt, what does that time wise equate too? + * in an rtt (unless it was capped), what does that + * time wise equate too? 
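
The arithmetic here turns the smaller of cwnd and rwnd into a bandwidth by dividing by the last measured RTT, and (in the less-aggressive mode) averages that with the wanted rate and caps the jump at rack_bw_multipler times it. A stand-alone sketch of just the window-to-bandwidth step and the pacing time it implies for one send; the numbers are made up for illustration:

#include <stdio.h>
#include <stdint.h>

#define USEC_IN_SEC	1000000ULL

/* Bandwidth (bytes/sec) that would drain `window` bytes in one RTT. */
static uint64_t
window_to_bw(uint64_t cwnd, uint64_t rwnd, uint64_t last_rtt_us)
{
	uint64_t win = (cwnd < rwnd) ? cwnd : rwnd;

	return ((win * USEC_IN_SEC) / last_rtt_us);
}

int
main(void)
{
	uint64_t fill_bw, slot_us, len = 4344;	/* 3 x 1448-byte segments */

	/* 100 kB window over a 20 ms RTT -> 5 MB/s fill rate. */
	fill_bw = window_to_bw(100000, 131072, 20000);
	/* Time to send len bytes at that rate, mirroring the lentim step. */
	slot_us = (len * USEC_IN_SEC) / fill_bw;
	printf("fill_bw %llu bytes/s, slot %llu us\n",
	    (unsigned long long)fill_bw, (unsigned long long)slot_us);
	return (0);
}
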
*/ lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; lentim /= fill_bw; @@ -14715,9 +17392,16 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str { uint64_t srtt; int32_t slot = 0; + int32_t minslot = 0; int can_start_hw_pacing = 1; int err; + int pace_one; + if (rack_pace_one_seg || + (rack->r_ctl.rc_user_set_min_segs == 1)) + pace_one = 1; + else + pace_one = 0; if (rack->rc_always_pace == 0) { /* * We use the most optimistic possible cwnd/srtt for @@ -14774,9 +17458,12 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); } else rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); + /*******************************************************/ + /* RRS: We insert non-paced call to stats here for len */ + /*******************************************************/ } else { uint64_t bw_est, res, lentim, rate_wanted; - uint32_t orig_val, segs, oh; + uint32_t segs, oh; int capped = 0; int prev_fill; @@ -14804,6 +17491,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str */ goto old_method; } + rack_rate_cap_bw(rack, &rate_wanted, &capped); /* We need to account for all the overheads */ segs = (len + segsiz - 1) / segsiz; /* @@ -14812,27 +17500,58 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * means we may be off if we are larger than 1500 bytes * or smaller. But this just makes us more conservative. */ - if (rack_hw_rate_min && - (bw_est < rack_hw_rate_min)) - can_start_hw_pacing = 0; - if (ETHERNET_SEGMENT_SIZE > segsiz) - oh = ETHERNET_SEGMENT_SIZE - segsiz; - else - oh = 0; + + oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); + if (rack->r_is_v6) { +#ifdef INET6 + oh += sizeof(struct ip6_hdr); +#endif + } else { +#ifdef INET + oh += sizeof(struct ip); +#endif + } + /* We add a fixed 14 for the ethernet header */ + oh += 14; segs *= oh; lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; res = lentim / rate_wanted; slot = (uint32_t)res; - orig_val = rack->r_ctl.rc_pace_max_segs; - if (rack->r_ctl.crte == NULL) { + if (rack_hw_rate_min && + (rate_wanted < rack_hw_rate_min)) { + can_start_hw_pacing = 0; + if (rack->r_ctl.crte) { + /* + * Ok we need to release it, we + * have fallen too low. + */ + tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); + rack->r_ctl.crte = NULL; + rack->rack_attempt_hdwr_pace = 0; + rack->rack_hdrw_pacing = 0; + } + } + if (rack->r_ctl.crte && + (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { /* - * Only do this if we are not hardware pacing - * since if we are doing hw-pacing below we will - * set make a call after setting up or changing - * the rate. + * We want more than the hardware can give us, + * don't start any hw pacing. */ - rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); - } else if (rack->rc_inp->inp_snd_tag == NULL) { + can_start_hw_pacing = 0; + if (rack->r_rack_hw_rate_caps == 0) { + /* + * Ok we need to release it, we + * want more than the card can give us and + * no rate cap is in place. Set it up so + * when we want less we can retry. + */ + tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); + rack->r_ctl.crte = NULL; + rack->rack_attempt_hdwr_pace = 0; + rack->rack_hdrw_pacing = 0; + } + } + if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { /* * We lost our rate somehow, this can happen * if the interface changed underneath us. 
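
The overhead accounting above charges every segment for TCP options (t_maxseg minus the payload size), the TCP and IP headers, and a fixed 14-byte Ethernet header, then folds that into the pacing time. A sketch of the same calculation for the IPv4 case; the 20-byte TCP and IP header sizes are assumptions of the sketch, not values read out of the patch:

#include <stdio.h>
#include <stdint.h>

#define USEC_IN_SEC	1000000ULL

/* Pacing time in usec for `len` payload bytes at `rate` bytes/sec. */
static uint64_t
paced_slot_usec(uint32_t len, uint32_t segsiz, uint32_t t_maxseg,
    uint64_t rate)
{
	uint32_t nsegs = (len + segsiz - 1) / segsiz;
	/* options + TCP header + IPv4 header + Ethernet header */
	uint32_t oh = (t_maxseg - segsiz) + 20 + 20 + 14;

	return (((uint64_t)len + (uint64_t)nsegs * oh) * USEC_IN_SEC / rate);
}

int
main(void)
{
	/* 10 x 1448-byte segments at 12.5 MB/s (~100 Mbps). */
	printf("%llu us\n", (unsigned long long)
	    paced_slot_usec(14480, 1448, 1460, 12500000));
	return (0);
}
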
@@ -14846,9 +17565,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str rate_wanted, bw_est, __LINE__, 0, 6); } - /* Did we change the TSO size, if so log it */ - if (rack->r_ctl.rc_pace_max_segs != orig_val) - rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0); prev_fill = rack->r_via_fill_cw; if ((rack->rc_pace_to_cwnd) && (capped == 0) && @@ -14860,6 +17576,28 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * fill the cwnd to the max if its not full. */ slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); + /* Re-check to make sure we are not exceeding our max b/w */ + if ((rack->r_ctl.crte != NULL) && + (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { + /* + * We want more than the hardware can give us, + * don't start any hw pacing. + */ + can_start_hw_pacing = 0; + if (rack->r_rack_hw_rate_caps == 0) { + /* + * Ok we need to release it, we + * want more than the card can give us and + * no rate cap is in place. Set it up so + * when we want less we can retry. + */ + tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); + rack->r_ctl.crte = NULL; + rack->rack_attempt_hdwr_pace = 0; + rack->rack_hdrw_pacing = 0; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); + } + } } if ((rack->rc_inp->inp_route.ro_nh != NULL) && (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { @@ -14879,9 +17617,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str &err, &rack->r_ctl.crte_prev_rate); if (rack->r_ctl.crte) { rack->rack_hdrw_pacing = 1; - rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, - 0, rack->r_ctl.crte, - NULL); + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz, + pace_one, rack->r_ctl.crte, + NULL, rack->r_ctl.pace_len_divisor); rack_log_hdwr_pacing(rack, rate_wanted, rack->r_ctl.crte->rate, __LINE__, err, 0); @@ -14933,13 +17671,16 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str goto done_w_hdwr; } nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, - rack->rc_tp, - rack->rc_inp->inp_route.ro_nh->nh_ifp, - rate_wanted, - RS_PACING_GEQ, - &err, &rack->r_ctl.crte_prev_rate); + rack->rc_tp, + rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, + RS_PACING_GEQ, + &err, &rack->r_ctl.crte_prev_rate); if (nrte == NULL) { - /* Lost the rate */ + /* + * Lost the rate, lets drop hardware pacing + * period. + */ rack->rack_hdrw_pacing = 0; rack->r_ctl.crte = NULL; rack_log_hdwr_pacing(rack, @@ -14949,10 +17690,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str counter_u64_add(rack_hw_pace_lost, 1); } else if (nrte != rack->r_ctl.crte) { rack->r_ctl.crte = nrte; - rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, - segsiz, 0, - rack->r_ctl.crte, - NULL); + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, + segsiz, pace_one, rack->r_ctl.crte, + NULL, rack->r_ctl.pace_len_divisor); rack_log_hdwr_pacing(rack, rate_wanted, rack->r_ctl.crte->rate, __LINE__, err, 2); @@ -14968,17 +17708,10 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str } } } - if ((rack->r_ctl.crte != NULL) && - (rack->r_ctl.crte->rate == rate_wanted)) { - /* - * We need to add a extra if the rates - * are exactly matched. 
The idea is - * we want the software to make sure the - * queue is empty before adding more, this - * gives us N MSS extra pace times where - * N is our sysctl - */ - slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); + if (minslot && (minslot > slot)) { + rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, + 98, __LINE__, NULL, 0); + slot = minslot; } done_w_hdwr: if (rack_limit_time_with_srtt && @@ -15006,6 +17739,9 @@ done_w_hdwr: slot = srtt; } } + /*******************************************************************/ + /* RRS: We insert paced call to stats here for len and rate_wanted */ + /*******************************************************************/ rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); } if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { @@ -15033,7 +17769,6 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq startseq, uint32_t sb_offset) { struct rack_sendmap *my_rsm = NULL; - struct rack_sendmap fe; if (tp->t_state < TCPS_ESTABLISHED) { /* @@ -15057,6 +17792,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, } } tp->t_flags |= TF_GPUTINPROG; + rack->r_ctl.rc_gp_cumack_ts = 0; rack->r_ctl.rc_gp_lowrtt = 0xffffffff; rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; tp->gput_seq = startseq; @@ -15067,7 +17803,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) rack->measure_saw_probe_rtt = 0; if (rack->rc_gp_filled) - tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + tp->gput_ts = rack->r_ctl.last_cumack_advance; else { /* Special case initial measurement */ struct timeval tv; @@ -15092,9 +17828,11 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, tp->gput_ack, 0, tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, __LINE__, NULL, 0); + rack_tend_gp_marks(tp, rack); + rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); return; } if (sb_offset) { @@ -15102,6 +17840,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, * We are out somewhere in the sb * can we use the already outstanding data? */ + if (rack->r_ctl.rc_app_limited_cnt == 0) { /* * Yes first one is good and in this case @@ -15109,7 +17848,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, * the last ack that arrived (no need to * set things up when an ack comes in). */ - my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + my_rsm = tqhash_min(rack->r_ctl.tqh); if ((my_rsm == NULL) || (my_rsm->r_rtr_cnt != 1)) { /* retransmission? */ @@ -15137,13 +17876,11 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, * next with space i.e. over 1 MSS or the one * after that (after the app-limited). 
*/ - my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, - rack->r_ctl.rc_first_appl); + my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); if (my_rsm) { if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) /* Have to use the next one */ - my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, - my_rsm); + my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); else { /* Use after the first MSS of it is acked */ tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); @@ -15165,19 +17902,44 @@ start_set: /* * This one has been acked use the arrival ack time */ + struct rack_sendmap *nrsm; + tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; rack->app_limited_needs_set = 0; + /* + * Ok in this path we need to use the r_end now + * since this guy is the starting ack. + */ + tp->gput_seq = my_rsm->r_end; + /* + * We also need to adjust up the sendtime + * to the send of the next data after my_rsm. + */ + nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); + if (nrsm != NULL) + my_rsm = nrsm; + else { + /* + * The next as not been sent, thats the + * case for using the latest. + */ + goto use_latest; + } } - rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; + rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); + rack->r_ctl.rc_gp_cumack_ts = 0; rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)my_rsm, tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, - __LINE__, NULL, 0); + __LINE__, my_rsm, 0); + /* Now lets make sure all are marked as they should be */ + rack_tend_gp_marks(tp, rack); + rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); return; } @@ -15191,11 +17953,11 @@ use_latest: */ rack->app_limited_needs_set = 1; tp->gput_ack = startseq + rack_get_measure_window(tp, rack); + rack->r_ctl.rc_gp_cumack_ts = 0; /* Find this guy so we can pull the send time */ - fe.r_start = startseq; - my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); if (my_rsm) { - rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; + rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; if (my_rsm->r_flags & RACK_ACKED) { /* * Unlikely since its probably what was @@ -15219,13 +17981,15 @@ use_latest: microuptime(&tv); rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); } + rack_tend_gp_marks(tp, rack); rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)my_rsm, tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, + (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 9, __LINE__, NULL, 0); + rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); } static inline uint32_t @@ -15299,7 +18063,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_ log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, - len, &log, false, NULL, NULL, 0, &tv); + len, &log, false, NULL, __func__, __LINE__, &tv); } } @@ -15442,6 +18206,7 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, * come in. */ fsb->o_m_len = smb->m_len; + fsb->o_t_len = M_TRAILINGROOM(smb); } else { /* * This is the case where the next mbuf went to NULL. 
This @@ -15452,6 +18217,7 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, * us send more. */ fsb->o_m_len = 0; + fsb->o_t_len = 0; } } return (top); @@ -15473,24 +18239,48 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, struct mbuf *m, *n; int32_t soff; - soff = rack->r_ctl.fsb.off; m = rack->r_ctl.fsb.m; - if (rack->r_ctl.fsb.o_m_len > m->m_len) { + if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { /* - * The mbuf had the front of it chopped off by an ack - * we need to adjust the soff/off by that difference. + * The trailing space changed, mbufs can grow + * at the tail but they can't shrink from + * it, KASSERT that. Adjust the orig_m_len to + * compensate for this change. */ - uint32_t delta; - - delta = rack->r_ctl.fsb.o_m_len - m->m_len; - soff -= delta; - } else if (rack->r_ctl.fsb.o_m_len < m->m_len) { + KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), + ("mbuf:%p rack:%p trailing_space:%lu ots:%u oml:%u mlen:%u\n", + m, + rack, + M_TRAILINGROOM(m), + rack->r_ctl.fsb.o_t_len, + rack->r_ctl.fsb.o_m_len, + m->m_len)); + rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); + rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); + } + if (m->m_len < rack->r_ctl.fsb.o_m_len) { /* - * The mbuf was expanded probably by - * a m_compress. Just update o_m_len. + * Mbuf shrank, trimmed off the top by an ack, our + * offset changes. */ + KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), + ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", + m, m->m_len, + rack, rack->r_ctl.fsb.o_m_len, + rack->r_ctl.fsb.off)); + + if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) + rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); + else + rack->r_ctl.fsb.off = 0; rack->r_ctl.fsb.o_m_len = m->m_len; +#ifdef INVARIANTS + } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { + panic("rack:%p m:%p m_len grew outside of t_space compensation", + rack, m); +#endif } + soff = rack->r_ctl.fsb.off; KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", @@ -15505,6 +18295,105 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, return (n); } +/* Log the buffer level */ +static void +rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, + int len, struct timeval *tv, + uint32_t cts) +{ + uint32_t p_rate = 0, p_queue = 0, err = 0; + union tcp_log_stackspecific log; + +#ifdef RATELIMIT + err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); + err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); +#endif + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.flex1 = p_rate; + log.u_bbr.flex2 = p_queue; + log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; + log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; + log.u_bbr.flex6 = rack->r_ctl.crte->time_between; + log.u_bbr.flex7 = 99; + log.u_bbr.flex8 = 0; + log.u_bbr.pkts_out = err; + log.u_bbr.delRate = rack->r_ctl.crte->rate; + log.u_bbr.timeStamp = cts; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, + len, &log, false, NULL, __func__, __LINE__, tv); + +} + +static uint32_t +rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp, + struct timeval *tv, uint32_t cts, int len, uint32_t segsiz) +{ + uint64_t lentime = 0; +#ifdef RATELIMIT + uint32_t p_rate = 0, p_queue = 0, err; + 
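[Editor's note] The rack_fo_m_copym() changes above keep the cached fast-send-block state honest when the socket-buffer mbuf either grows at its tail (data appended into trailing space, so o_t_len shrinks) or loses bytes at its head (trimmed by an arriving ACK, so the send offset must be pulled back). A toy model of that bookkeeping, using plain integers instead of a struct mbuf, might look like the following; the struct and names are hypothetical, not the kernel code.

#include <stdint.h>
#include <assert.h>

struct fsb_snap {
	uint32_t off;		/* send offset into the cached mbuf */
	uint32_t o_m_len;	/* m_len when the snapshot was taken */
	uint32_t o_t_len;	/* trailing room when the snapshot was taken */
};

static void
fsb_resync(struct fsb_snap *fsb, uint32_t m_len, uint32_t trailing)
{
	if (trailing != fsb->o_t_len) {
		/* Data was appended at the tail: trailing room only shrinks. */
		assert(fsb->o_t_len > trailing);
		fsb->o_m_len += fsb->o_t_len - trailing;
		fsb->o_t_len = trailing;
	}
	if (m_len < fsb->o_m_len) {
		/* The head was trimmed by an ACK: pull the offset back. */
		uint32_t delta = fsb->o_m_len - m_len;

		fsb->off = (fsb->off >= delta) ? (fsb->off - delta) : 0;
		fsb->o_m_len = m_len;
	}
}

int
main(void)
{
	struct fsb_snap fsb = { .off = 3000, .o_m_len = 4096, .o_t_len = 512 };

	/* 1000 bytes acked off the front, 256 bytes appended at the tail. */
	fsb_resync(&fsb, 4096 - 1000 + 256, 256);
	return (fsb.off == 2000 ? 0 : 1);
}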
union tcp_log_stackspecific log; + uint64_t bw; + + err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); + /* Failed or queue is zero */ + if (err || (p_queue == 0)) { + lentime = 0; + goto out; + } + err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); + if (err) { + lentime = 0; + goto out; + } + /* + * If we reach here we have some bytes in + * the queue. The number returned is a value + * between 0 and 0xffff where ffff is full + * and 0 is empty. So how best to make this into + * something usable? + * + * The "safer" way is lets take the b/w gotten + * from the query (which should be our b/w rate) + * and pretend that a full send (our rc_pace_max_segs) + * is outstanding. We factor it so its as if a full + * number of our MSS segment is terms of full + * ethernet segments are outstanding. + */ + bw = p_rate / 8; + if (bw) { + lentime = (rack->r_ctl.rc_pace_max_segs / segsiz); + lentime *= ETHERNET_SEGMENT_SIZE; + lentime *= (uint64_t)HPTS_USEC_IN_SEC; + lentime /= bw; + } else { + /* TSNH -- KASSERT? */ + lentime = 0; + } +out: + if (tcp_bblogging_on(tp)) { + memset(&log, 0, sizeof(log)); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.flex1 = p_rate; + log.u_bbr.flex2 = p_queue; + log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; + log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; + log.u_bbr.flex6 = rack->r_ctl.crte->time_between; + log.u_bbr.flex7 = 99; + log.u_bbr.flex8 = 0; + log.u_bbr.pkts_out = err; + log.u_bbr.delRate = rack->r_ctl.crte->rate; + log.u_bbr.cur_del_rate = lentime; + log.u_bbr.timeStamp = cts; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, + len, &log, false, NULL, __func__, __LINE__,tv); + } +#endif + return ((uint32_t)lentime); +} + static int rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp) @@ -15535,6 +18424,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma uint16_t flags; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; + int32_t ip_sendflag = IP_NO_SND_TAG_RL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; @@ -15609,7 +18499,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma /* Establish the len to send */ if (len > max_val) len = max_val; - if ((tso) && (len + optlen > tp->t_maxseg)) { + if ((tso) && (len + optlen > segsiz)) { uint32_t if_hw_tsomax; int32_t max_len; @@ -15643,6 +18533,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } if ((tso == 0) && (len > segsiz)) len = segsiz; + (void)tcp_get_usecs(tv); if ((len == 0) || (len <= MHLEN - hdrlen - max_linkhdr)) { goto failed; @@ -15683,7 +18574,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma #endif if (rsm->m == NULL) goto failed; - if (rsm->orig_m_len != rsm->m->m_len) { + if (rsm->m && + ((rsm->orig_m_len != rsm->m->m_len) || + (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { /* Fix up the orig_m_len and possibly the mbuf offset */ rack_adjust_orig_mlen(rsm); } @@ -15726,6 +18619,13 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma ip->ip_tos |= ect; } } + if (rack->r_ctl.crte != NULL) { + /* See if we can send via the hw queue */ + slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); + /* If there is nothing in queue (no pacing time) we can send via 
the hw queue */ + if (slot == 0) + ip_sendflag = 0; + } tcp_set_flags(th, flags); m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 @@ -15770,10 +18670,15 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } #endif if (tso) { - KASSERT(len > tp->t_maxseg - optlen, + /* + * Here we use segsiz since we have no added options besides + * any standard timestamp options (no DSACKs or SACKS are sent + * via either fast-path). + */ + KASSERT(len > segsiz, ("%s: len <= tso_segsz tp:%p", __func__, tp)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + m->m_pkthdr.tso_segsz = segsiz; } #ifdef INET6 if (rack->r_is_v6) { @@ -15802,6 +18707,19 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } } #endif + if (doing_tlp == 0) { + /* Set we retransmitted */ + rack->rc_gp_saw_rec = 1; + } else { + /* Its a TLP set ca or ss */ + if (tp->snd_cwnd > tp->snd_ssthresh) { + /* Set we sent in CA */ + rack->rc_gp_saw_ca = 1; + } else { + /* Set we sent in SS */ + rack->rc_gp_saw_ss = 1; + } + } /* Time to copy in our header */ cpto = mtod(m, uint8_t *); memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); @@ -15829,11 +18747,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = max_val; - log.u_bbr.flex5 = 0; /* Save off the early/late values */ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; if (doing_tlp == 0) log.u_bbr.flex8 = 1; else @@ -15843,17 +18761,43 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + if (rsm && (rsm->r_rtr_cnt > 0)) { + /* + * When we have a retransmit we want to log the + * burst at send and flight at send from before. + */ + log.u_bbr.flex5 = rsm->r_fas; + log.u_bbr.bbr_substate = rsm->r_bas; + } else { + /* + * This is currently unlikely until we do the + * packet pair probes but I will add it for completeness. 
+ */ + log.u_bbr.flex5 = log.u_bbr.inflight; + log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); + } log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; log.u_bbr.delivered = 0; + log.u_bbr.rttProp = (uint64_t)rsm; + log.u_bbr.delRate = rsm->r_flags; + log.u_bbr.delRate <<= 31; + log.u_bbr.delRate |= rack->r_must_retran; + log.u_bbr.delRate <<= 1; + log.u_bbr.delRate |= 1; + log.u_bbr.pkt_epoch = __LINE__; lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, tv); + len, &log, false, NULL, __func__, __LINE__, tv); } else lgb = NULL; + if ((rack->r_ctl.crte != NULL) && + tcp_bblogging_on(tp)) { + rack_log_queue_level(tp, rack, len, tv, cts); + } #ifdef INET6 if (rack->r_is_v6) { error = ip6_output(m, NULL, &inp->inp_route6, - 0, NULL, NULL, inp); + ip_sendflag, NULL, NULL, inp); } else #endif @@ -15861,7 +18805,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma { error = ip_output(m, NULL, &inp->inp_route, - 0, 0, inp); + ip_sendflag, 0, inp); } #endif m = NULL; @@ -15871,14 +18815,22 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma } if (error) { goto failed; + } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { + rack->rc_hw_nobuf = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; } + rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), - rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls); - if (doing_tlp && (rack->fast_rsm_hack == 0)) { + rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); + if (doing_tlp) { rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_tlp_cnt_out++; } if (error == 0) { + counter_u64_add(rack_total_bytes, len); tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); if (doing_tlp) { rack->rc_last_sent_tlp_past_cumack = 0; @@ -15909,6 +18861,8 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma if (error && (error == ENOBUFS)) { if (rack->r_ctl.crte != NULL) { tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); + if (tcp_bblogging_on(rack->rc_tp)) + rack_log_queue_level(tp, rack, len, tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); @@ -15916,6 +18870,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma rack->rc_enobuf++; if (slot < (10 * HPTS_USEC_IN_MSEC)) slot = 10 * HPTS_USEC_IN_MSEC; + if (rack->r_ctl.crte != NULL) { + counter_u64_add(rack_saw_enobuf_hw, 1); + tcp_rl_log_enobuf(rack->r_ctl.crte); + } + counter_u64_add(rack_saw_enobuf, 1); } else slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); if ((slot == 0) || @@ -16101,7 +19060,7 @@ again: /* Establish the len to send */ if (len > max_val) len = max_val; - if ((tso) && (len + optlen > tp->t_maxseg)) { + if ((tso) && (len + optlen > segsiz)) { uint32_t if_hw_tsomax; int32_t max_len; @@ -16135,6 +19094,7 @@ again: } if ((tso == 0) && (len > segsiz)) len = segsiz; + (void)tcp_get_usecs(tv); if ((len == 0) || (len <= MHLEN - hdrlen - max_linkhdr)) { goto failed; @@ -16172,7 +19132,7 @@ again: } if (rack->r_ctl.fsb.rfo_apply_push && (len == rack->r_ctl.fsb.left_to_send)) { - flags |= TH_PUSH; + tcp_set_flags(th, flags | TH_PUSH); add_flag |= RACK_HAD_PUSH; } if ((m->m_next == NULL) || (len <= 0)){ @@ -16250,10 +19210,15 @@ again: } #endif if (tso) { - KASSERT(len > tp->t_maxseg - optlen, + /* + * Here we use segsiz since we have no added options 
besides + * any standard timestamp options (no DSACKs or SACKS are sent + * via either fast-path). + */ + KASSERT(len > segsiz, ("%s: len <= tso_segsz tp:%p", __func__, tp)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + m->m_pkthdr.tso_segsz = segsiz; } #ifdef INET6 if (rack->r_is_v6) { @@ -16282,6 +19247,13 @@ again: } } #endif + if (tp->snd_cwnd > tp->snd_ssthresh) { + /* Set we sent in CA */ + rack->rc_gp_saw_ca = 1; + } else { + /* Set we sent in SS */ + rack->rc_gp_saw_ss = 1; + } /* Time to copy in our header */ cpto = mtod(m, uint8_t *); memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); @@ -16292,6 +19264,10 @@ again: } else { th->th_off = sizeof(struct tcphdr) >> 2; } + if ((rack->r_ctl.crte != NULL) && + tcp_bblogging_on(tp)) { + rack_log_queue_level(tp, rack, len, tv, cts); + } if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -16304,21 +19280,29 @@ again: log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = max_val; - log.u_bbr.flex5 = 0; /* Save off the early/late values */ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; log.u_bbr.flex8 = 0; log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); log.u_bbr.flex7 = 44; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.flex5 = log.u_bbr.inflight; log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; log.u_bbr.delivered = 0; + log.u_bbr.rttProp = 0; + log.u_bbr.delRate = rack->r_must_retran; + log.u_bbr.delRate <<= 1; + log.u_bbr.pkt_epoch = __LINE__; + /* For fast output no retrans so just inflight and how many mss we send */ + log.u_bbr.flex5 = log.u_bbr.inflight; + log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, tv); + len, &log, false, NULL, __func__, __LINE__, tv); } else lgb = NULL; #ifdef INET6 @@ -16346,17 +19330,29 @@ again: *send_err = error; m = NULL; goto failed; + } else if (rack->rc_hw_nobuf) { + rack->rc_hw_nobuf = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; + } + if ((error == 0) && (rack->lt_bw_up == 0)) { + /* Unlikely */ + rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); + rack->r_ctl.lt_seq = tp->snd_una; + rack->lt_bw_up = 1; } rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), - NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls); + NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); m = NULL; if (tp->snd_una == tp->snd_max) { rack->r_ctl.rc_tlp_rxt_last_time = cts; rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } - if (error == 0) - tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); + counter_u64_add(rack_total_bytes, len); + tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); rack->forced_ack = 0; /* If we send something zap the FA flag */ tot_len += len; @@ -16364,6 +19360,15 @@ again: rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); tp->snd_max += len; tp->snd_nxt = tp->snd_max; + if (rack->rc_new_rnd_needed) { + /* + * Update the rnd to start ticking not + * that from a time perspective all of + * the preceding idle time is "in the round" + */ + 
rack->rc_new_rnd_needed = 0; + rack->r_ctl.roundends = tp->snd_max; + } { int idx; @@ -16426,16 +19431,65 @@ failed: return (-1); } +static inline void +rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, + struct sockbuf *sb, + int len, int orig_len, int segsiz, uint32_t pace_max_seg, + bool hw_tls, + uint16_t flags) +{ + rack->r_fast_output = 1; + rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); + rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; + rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); + rack->r_ctl.fsb.tcp_flags = flags; + rack->r_ctl.fsb.left_to_send = orig_len - len; + if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { + /* Less than a full sized pace, lets not */ + rack->r_fast_output = 0; + return; + } else { + /* Round down to the nearest pace_max_seg */ + rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); + } + if (hw_tls) + rack->r_ctl.fsb.hw_tls = 1; + else + rack->r_ctl.fsb.hw_tls = 0; + KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), + ("rack:%p left_to_send:%u sbavail:%u out:%u", + rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), + (tp->snd_max - tp->snd_una))); + if (rack->r_ctl.fsb.left_to_send < segsiz) + rack->r_fast_output = 0; + else { + if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) + rack->r_ctl.fsb.rfo_apply_push = 1; + else + rack->r_ctl.fsb.rfo_apply_push = 0; + } +} + +static uint32_t +rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) +{ + uint64_t min_time; + uint32_t maxlen; + + min_time = (uint64_t)get_hpts_min_sleep_time(); + maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); + maxlen = roundup(maxlen, segsiz); + return (maxlen); +} + static struct rack_sendmap * rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) { struct rack_sendmap *rsm = NULL; - struct rack_sendmap fe; int thresh; restart: - fe.r_start = rack->r_ctl.last_collapse_point; - rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { /* Nothing, strange turn off validity */ rack->r_collapse_point_valid = 0; @@ -16478,6 +19532,35 @@ restart: return (NULL); } +static inline void +rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) +{ + if ((rack->full_size_rxt == 0) && + (rack->shape_rxt_to_pacing_min == 0) && + (*len >= segsiz)) { + *len = segsiz; + } else if (rack->shape_rxt_to_pacing_min && + rack->gp_ready) { + /* We use pacing min as shaping len req */ + uint32_t maxlen; + + maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); + if (*len > maxlen) + *len = maxlen; + } else { + /* + * The else is full_size_rxt is on so send it all + * note we do need to check this for exceeding + * our max segment size due to the fact that + * we do sometimes merge chunks together i.e. 
+ * we cannot just assume that we will never have + * a chunk greater than pace_max_seg + */ + if (*len > pace_max_seg) + *len = pace_max_seg; + } +} + static int rack_output(struct tcpcb *tp) { @@ -16500,6 +19583,7 @@ rack_output(struct tcpcb *tp) struct tcphdr *th; uint8_t pass = 0; uint8_t mark = 0; + uint8_t check_done = 0; uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen=0; @@ -16558,6 +19642,15 @@ rack_output(struct tcpcb *tp) return (tcp_offload_output(tp)); } #endif + if (rack->rack_deferred_inited == 0) { + /* + * If we are the connecting socket we will + * hit rack_init() when no sequence numbers + * are setup. This makes it so we must defer + * some initialization. Call that now. + */ + rack_deferred_init(tp, rack); + } /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. @@ -16603,7 +19696,7 @@ rack_output(struct tcpcb *tp) int retval; retval = rack_process_timers(tp, rack, cts, hpts_calling, - &doing_tlp); + &doing_tlp); if (retval != 0) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); #ifdef TCP_ACCOUNTING @@ -16752,8 +19845,11 @@ rack_output(struct tcpcb *tp) } idle = 0; } - if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) - rack_init_fsb_block(tp, rack); + if (rack_use_fsb && + (rack->r_ctl.fsb.tcp_ip_hdr) && + (rack->r_fsb_inited == 0) && + (rack->r_state != TCPS_CLOSED)) + rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); again: /* * If we've recently taken a timeout, snd_max will be greater than @@ -16811,8 +19907,7 @@ again: __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (len >= segsiz) - len = segsiz; + rack_validate_sizes(rack, &len, segsiz, pace_max_seg); } else if (rack->r_collapse_point_valid && ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { /* @@ -16832,10 +19927,7 @@ again: len = rsm->r_end - rsm->r_start; sb_offset = rsm->r_start - tp->snd_una; sendalot = 0; - if ((rack->full_size_rxt == 0) && - (rack->shape_rxt_to_pacing_min == 0) && - (len >= segsiz)) - len = segsiz; + rack_validate_sizes(rack, &len, segsiz, pace_max_seg); } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { /* We have a retransmit that takes precedence */ if ((!IN_FASTRECOVERY(tp->t_flags)) && @@ -16857,13 +19949,12 @@ again: rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; sendalot = 0; - if (len >= segsiz) - len = segsiz; + rack_validate_sizes(rack, &len, segsiz, pace_max_seg); if (len > 0) { sack_rxmit = 1; KMOD_TCPSTAT_INC(tcps_sack_rexmits); KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, segsiz)); + min(len, segsiz)); } } else if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ @@ -16955,6 +20046,15 @@ again: (rack->shape_rxt_to_pacing_min == 0) && (len >= segsiz)) len = segsiz; + else if (rack->shape_rxt_to_pacing_min && + rack->gp_ready) { + /* We use pacing min as shaping len req */ + uint32_t maxlen; + + maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); + if (len > maxlen) + len = maxlen; + } /* * Delay removing the flag RACK_MUST_RXT so * that the fastpath for retransmit will @@ -16990,7 +20090,8 @@ again: flags &= ~TH_FIN; } } - if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && + if (rsm && rack->r_fsb_inited && + rack_use_rsm_rfo && ((rsm->r_flags & RACK_HAS_FIN) == 0)) { int ret; @@ -17029,8 +20130,8 @@ again: if (rack->r_ctl.rc_scw) { /* First lets update and get the cwnd */ rack->r_ctl.cwnd_to_use = 
cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, - rack->r_ctl.rc_scw_index, - tp->snd_cwnd, tp->snd_wnd, segsiz); + rack->r_ctl.rc_scw_index, + tp->snd_cwnd, tp->snd_wnd, segsiz); } } #endif @@ -17092,7 +20193,11 @@ again: } else { len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); } - if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { + if ((rack->r_ctl.crte == NULL) && + IN_FASTRECOVERY(tp->t_flags) && + (rack->full_size_rxt == 0) && + (rack->shape_rxt_to_pacing_min == 0) && + (len > segsiz)) { /* * For prr=off, we need to send only 1 MSS * at a time. We do this because another sack could @@ -17101,7 +20206,15 @@ again: * that keeps us from sending out the retransmit. */ len = segsiz; - } + } else if (rack->shape_rxt_to_pacing_min && + rack->gp_ready) { + /* We use pacing min as shaping len req */ + uint32_t maxlen; + + maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); + if (len > maxlen) + len = maxlen; + }/* The else is full_size_rxt is on so send it all */ } else { uint32_t outstanding; /* @@ -17259,7 +20372,7 @@ again: (TCPS_HAVEESTABLISHED(tp->t_state)) && (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { - rack_enter_persist(tp, rack, cts); + rack_enter_persist(tp, rack, cts, tp->snd_una); } } else if ((rsm == NULL) && (doing_tlp == 0) && @@ -17287,9 +20400,9 @@ again: * Nothing out we can * go into persists. */ - rack_enter_persist(tp, rack, cts); + rack_enter_persist(tp, rack, cts, tp->snd_una); } - } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && + } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && (len < minseg)) { @@ -17346,7 +20459,6 @@ again: * larger TSO's out). */ len = 0; - } } @@ -17418,7 +20530,7 @@ again: } } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), - (long)TCP_MAXWIN << tp->rcv_scale); + (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. 
We transmit under the following @@ -17519,7 +20631,7 @@ again: if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); if (adv > oldwin) - adv -= oldwin; + adv -= oldwin; else { /* We can't increase the window */ adv = 0; @@ -17591,6 +20703,7 @@ just_return_nolock: (tp->rcv_numsacks == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && + ((IN_RECOVERY(tp->t_flags)) == 0) && (rack->r_must_retran == 0) && ((tp->t_flags & TF_NEEDFIN) == 0) && (len > 0) && (orig_len > 0) && @@ -17599,28 +20712,8 @@ just_return_nolock: ((optlen == 0) || ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { /* We can send at least one more MSS using our fsb */ - - rack->r_fast_output = 1; - rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); - rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; - rack->r_ctl.fsb.tcp_flags = flags; - rack->r_ctl.fsb.left_to_send = orig_len - len; - if (hw_tls) - rack->r_ctl.fsb.hw_tls = 1; - else - rack->r_ctl.fsb.hw_tls = 0; - KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), - ("rack:%p left_to_send:%u sbavail:%u out:%u", - rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), - (tp->snd_max - tp->snd_una))); - if (rack->r_ctl.fsb.left_to_send < segsiz) - rack->r_fast_output = 0; - else { - if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) - rack->r_ctl.fsb.rfo_apply_push = 1; - else - rack->r_ctl.fsb.rfo_apply_push = 0; - } + rack_setup_fast_output(tp, rack, sb, len, orig_len, + segsiz, pace_max_seg, hw_tls, flags); } else rack->r_fast_output = 0; @@ -17634,7 +20727,7 @@ just_return_nolock: int end_window = 0; uint32_t seq = tp->gput_ack; - rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_max(rack->r_ctl.tqh); if (rsm) { /* * Mark the last sent that we just-returned (hinting @@ -17653,12 +20746,12 @@ just_return_nolock: /* We are limited by the rwnd */ app_limited = CTF_JR_RWND_LIMITED; if (IN_FASTRECOVERY(tp->t_flags)) - rack->r_ctl.rc_prr_sndcnt = 0; + rack->r_ctl.rc_prr_sndcnt = 0; } else if (ctf_outstanding(tp) >= sbavail(sb)) { /* We are limited by whats available -- app limited */ app_limited = CTF_JR_APP_LIMITED; if (IN_FASTRECOVERY(tp->t_flags)) - rack->r_ctl.rc_prr_sndcnt = 0; + rack->r_ctl.rc_prr_sndcnt = 0; } else if ((idle == 0) && ((tp->t_flags & TF_NODELAY) == 0) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && @@ -17754,7 +20847,7 @@ just_return_nolock: log = 1; } /* Mark the last packet has app limited */ - rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rsm = tqhash_max(rack->r_ctl.tqh); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; @@ -17784,7 +20877,7 @@ just_return_nolock: (sbavail(sb) > tp->snd_wnd) && (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { /* Yes lets make sure to move to persist before timer-start */ - rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); } rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); @@ -17822,6 +20915,26 @@ just_return_nolock: return (0); send: + if ((rack->r_ctl.crte != NULL) && + (rsm == NULL) && + ((rack->rc_hw_nobuf == 1) || + (rack_hw_check_queue && (check_done == 0)))) { + /* + * We only want to do this once with the 
hw_check_queue, + * for the enobuf case we would only do it once if + * we come around to again, the flag will be clear. + */ + check_done = 1; + slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); + if (slot) { + rack->r_ctl.rc_agg_delayed = 0; + rack->r_ctl.rc_agg_early = 0; + rack->r_early = 0; + rack->r_late = 0; + SOCKBUF_UNLOCK(&so->so_snd); + goto skip_all_send; + } + } if (rsm || sack_rxmit) counter_u64_add(rack_nfto_resend, 1); else @@ -18037,7 +21150,7 @@ send: * In case there are too many small fragments don't * use TSO: */ - if (len <= segsiz) { + if (len <= max_len) { mark = 4; tso = 0; } @@ -18515,6 +21628,10 @@ send: * not the case for IPv6. */ if (tso) { + /* + * Here we must use t_maxseg and the optlen since + * the optlen may include SACK's (or DSACK). + */ KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; @@ -18528,6 +21645,11 @@ send: /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif + if ((rack->r_ctl.crte != NULL) && + (rack->rc_hw_nobuf == 0) && + tcp_bblogging_on(tp)) { + rack_log_queue_level(tp, rack, len, &tv, cts); + } /* We're getting ready to send; log now. */ if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; @@ -18545,6 +21667,7 @@ send: log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; log.u_bbr.flex8 = 0; if (rsm) { if (rsm->r_flags & RACK_RWND_COLLAPSED) { @@ -18559,8 +21682,6 @@ send: } else { if (doing_tlp) log.u_bbr.flex8 = 3; - else - log.u_bbr.flex8 = 0; } log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); log.u_bbr.flex7 = mark; @@ -18569,10 +21690,39 @@ send: log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + if (rsm && (rsm->r_rtr_cnt > 0)) { + /* + * When we have a retransmit we want to log the + * burst at send and flight at send from before. + */ + log.u_bbr.flex5 = rsm->r_fas; + log.u_bbr.bbr_substate = rsm->r_bas; + } else { + /* + * New transmits we log in flex5 the inflight again as + * well as the number of segments in our send in the + * substate field. + */ + log.u_bbr.flex5 = log.u_bbr.inflight; + log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); + } log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.delivered = sendalot; + log.u_bbr.rttProp = (uint64_t)rsm; + log.u_bbr.pkt_epoch = __LINE__; + if (rsm) { + log.u_bbr.delRate = rsm->r_flags; + log.u_bbr.delRate <<= 31; + log.u_bbr.delRate |= rack->r_must_retran; + log.u_bbr.delRate <<= 1; + log.u_bbr.delRate |= (sack_rxmit & 0x00000001); + } else { + log.u_bbr.delRate = rack->r_must_retran; + log.u_bbr.delRate <<= 1; + log.u_bbr.delRate |= (sack_rxmit & 0x00000001); + } lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, &tv); + len, &log, false, NULL, __func__, __LINE__, &tv); } else lgb = NULL; @@ -18684,7 +21834,31 @@ out: * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. 
*/ + rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, + rack_to_usec_ts(&tv), + rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); if (error == 0) { + if (rsm == NULL) { + if (rack->lt_bw_up == 0) { + rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); + rack->r_ctl.lt_seq = tp->snd_una; + rack->lt_bw_up = 1; + } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { + /* + * Need to record what we have since we are + * approaching seq wrap. + */ + uint64_t tmark; + + rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); + rack->r_ctl.lt_seq = tp->snd_una; + tmark = tcp_tv_to_lusectick(&tv); + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + rack->r_ctl.lt_timemark = tmark; + } + } + rack->forced_ack = 0; /* If we send something zap the FA flag */ + counter_u64_add(rack_total_bytes, len); tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); if (rsm && doing_tlp) { rack->rc_last_sent_tlp_past_cumack = 0; @@ -18692,7 +21866,13 @@ out: rack->r_ctl.last_sent_tlp_seq = rsm->r_start; rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; } - rack->forced_ack = 0; /* If we send something zap the FA flag */ + if (rack->rc_hw_nobuf) { + rack->rc_hw_nobuf = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_early = 0; + rack->r_late = 0; + rack->r_ctl.rc_agg_early = 0; + } if (rsm && (doing_tlp == 0)) { /* Set we retransmitted */ rack->rc_gp_saw_rec = 1; @@ -18710,11 +21890,9 @@ out: tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); tot_len_this_send += len; - if (len == 0) + if (len == 0) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); - else if (len == 1) { - counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); - } else if (len > 1) { + } else { int idx; idx = (len / segsiz) + 3; @@ -18740,9 +21918,6 @@ out: /* If its a resend without TLP then it must not have the flag */ rsm->r_flags &= ~RACK_TLP; } - rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, - rack_to_usec_ts(&tv), - rsm, add_flag, s_mb, s_moff, hw_tls); if ((error == 0) && @@ -18803,6 +21978,15 @@ out: tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; + if (rack->rc_new_rnd_needed) { + /* + * Update the rnd to start ticking not + * that from a time perspective all of + * the preceding idle time is "in the round" + */ + rack->rc_new_rnd_needed = 0; + rack->r_ctl.roundends = tp->snd_max; + } /* * Time this transmission if not a retransmission and * not currently timing anything. 
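[Editor's note] The out: path in the hunk above also folds delivered bytes and elapsed time into the long-term bandwidth accumulators before the 32-bit sequence space can wrap, and a long-term rate can then be derived as the ratio of the two sums. A minimal sketch of that accounting shape follows; the names are hypothetical and this is not the patch itself.

#include <stdint.h>

struct lt_acct {
	uint64_t bytes;		/* accumulated delivered bytes */
	uint64_t time_us;	/* accumulated elapsed microseconds */
	uint32_t seq_mark;	/* snd_una at the start of the current chunk */
	uint64_t time_mark;	/* timestamp at the start of the current chunk */
};

/* Fold the current chunk into the accumulators and restart the marks. */
static void
lt_fold(struct lt_acct *lt, uint32_t snd_una, uint64_t now_us)
{
	lt->bytes += (uint32_t)(snd_una - lt->seq_mark);	/* mod-2^32 delta */
	lt->time_us += now_us - lt->time_mark;
	lt->seq_mark = snd_una;
	lt->time_mark = now_us;
}

static uint64_t
lt_bw_bytes_per_sec(const struct lt_acct *lt)
{
	return (lt->time_us ? (lt->bytes * 1000000ULL) / lt->time_us : 0);
}

int
main(void)
{
	struct lt_acct lt = { .seq_mark = 1000, .time_mark = 0 };

	lt_fold(&lt, 1000 + 50000, 10000);	/* 50 kB delivered in 10 ms */
	return (lt_bw_bytes_per_sec(&lt) == 5000000ULL ? 0 : 1);
}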
@@ -18837,6 +22021,7 @@ out: if (rack->r_fast_output) { rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; + rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); } } } @@ -18878,6 +22063,8 @@ nomore: */ if (rack->r_ctl.crte != NULL) { tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); + if (tcp_bblogging_on(rack->rc_tp)) + rack_log_queue_level(tp, rack, len, &tv, cts); } else tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); @@ -18903,8 +22090,13 @@ nomore: if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { + int saved_mtu; + + saved_mtu = tp->t_maxseg; tcp_mss_update(tp, -1, mtu, NULL, NULL); - goto again; + if (saved_mtu > tp->t_maxseg) { + goto again; + } } slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); @@ -19048,6 +22240,7 @@ enobufs: (tp->rcv_numsacks == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && + ((IN_RECOVERY(tp->t_flags)) == 0) && (rack->r_must_retran == 0) && ((tp->t_flags & TF_NEEDFIN) == 0) && (len > 0) && (orig_len > 0) && @@ -19056,28 +22249,8 @@ enobufs: ((optlen == 0) || ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { /* We can send at least one more MSS using our fsb */ - - rack->r_fast_output = 1; - rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); - rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; - rack->r_ctl.fsb.tcp_flags = flags; - rack->r_ctl.fsb.left_to_send = orig_len - len; - if (hw_tls) - rack->r_ctl.fsb.hw_tls = 1; - else - rack->r_ctl.fsb.hw_tls = 0; - KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), - ("rack:%p left_to_send:%u sbavail:%u out:%u", - rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), - (tp->snd_max - tp->snd_una))); - if (rack->r_ctl.fsb.left_to_send < segsiz) - rack->r_fast_output = 0; - else { - if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) - rack->r_ctl.fsb.rfo_apply_push = 1; - else - rack->r_ctl.fsb.rfo_apply_push = 0; - } + rack_setup_fast_output(tp, rack, sb, len, orig_len, + segsiz, pace_max_seg, hw_tls, flags); } else rack->r_fast_output = 0; rack_log_fsb(rack, tp, so, flags, @@ -19097,6 +22270,7 @@ enobufs: (rack->r_must_retran == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && + ((IN_RECOVERY(tp->t_flags)) == 0) && ((tp->t_flags & TF_NEEDFIN) == 0) && (len > 0) && (orig_len > 0) && (orig_len > len) && @@ -19104,31 +22278,9 @@ enobufs: ((optlen == 0) || ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { /* we can use fast_output for more */ - - rack->r_fast_output = 1; - rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); - rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; - rack->r_ctl.fsb.tcp_flags = flags; - rack->r_ctl.fsb.left_to_send = orig_len - len; - if (hw_tls) - rack->r_ctl.fsb.hw_tls = 1; - else - rack->r_ctl.fsb.hw_tls = 0; - KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), - ("rack:%p left_to_send:%u sbavail:%u out:%u", - rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), - (tp->snd_max - tp->snd_una))); - if (rack->r_ctl.fsb.left_to_send < segsiz) { - rack->r_fast_output = 0; - } + rack_setup_fast_output(tp, rack, sb, len, orig_len, + segsiz, pace_max_seg, hw_tls, flags); if (rack->r_fast_output) { - if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) - 
rack->r_ctl.fsb.rfo_apply_push = 1; - else - rack->r_ctl.fsb.rfo_apply_push = 0; - rack_log_fsb(rack, tp, so, flags, - ipoptlen, orig_len, len, error, - (rsm == NULL), optlen, __LINE__, 3); error = 0; ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); if (ret >= 0) @@ -19141,6 +22293,7 @@ enobufs: goto again; } /* Assure when we leave that snd_nxt will point to top */ +skip_all_send: if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); @@ -19208,7 +22361,7 @@ rack_mtu_change(struct tcpcb *tp) rack->r_must_retran = 1; /* Mark all inflight to needing to be rxt'd */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { - rsm->r_flags |= RACK_MUST_RXT; + rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); } } sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); @@ -19217,125 +22370,110 @@ rack_mtu_change(struct tcpcb *tp) } static int -rack_set_profile(struct tcp_rack *rack, int prof) +rack_set_dgp(struct tcp_rack *rack) { - int err = EINVAL; - if (prof == 1) { - /* pace_always=1 */ - if (rack->rc_always_pace == 0) { - if (tcp_can_enable_pacing() == 0) - return (EBUSY); - } - rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) - rack_set_cc_pacing(rack); - rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; - rack->rack_attempt_hdwr_pace = 0; - /* cmpack=1 */ - if (rack_use_cmp_acks) - rack->r_use_cmp_ack = 1; - if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && - rack->r_use_cmp_ack) - rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; - /* scwnd=1 */ - rack->rack_enable_scwnd = 1; - /* dynamic=100 */ - rack->rc_gp_dyn_mul = 1; - /* gp_inc_ca */ - rack->r_ctl.rack_per_of_gp_ca = 100; - /* rrr_conf=3 */ - rack->r_rr_config = 3; - /* npush=2 */ - rack->r_ctl.rc_no_push_at_mrtt = 2; - /* fillcw=1 */ + /* pace_always=1 */ + if (rack->rc_always_pace == 0) { + if (tcp_can_enable_pacing() == 0) + return (EBUSY); + } + rack->dgp_on = 1; + rack->rc_always_pace = 1; + rack->use_fixed_rate = 0; + if (rack->gp_ready) + rack_set_cc_pacing(rack); + rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + rack->rack_attempt_hdwr_pace = 0; + /* rxt settings */ + rack->full_size_rxt = 1; + rack->shape_rxt_to_pacing_min = 0; + /* cmpack=1 */ + rack->r_use_cmp_ack = 1; + if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && + rack->r_use_cmp_ack) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + /* gp_inc_ca */ + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=1 */ + if (rack->r_cwnd_was_clamped == 0) { rack->rc_pace_to_cwnd = 1; - rack->rc_pace_fill_if_rttin_range = 0; - rack->rtt_limit_mul = 0; - /* noprr=1 */ - rack->rack_no_prr = 1; - /* lscwnd=1 */ - rack->r_limit_scw = 1; - /* gp_inc_rec */ - rack->r_ctl.rack_per_of_gp_rec = 90; - err = 0; - - } else if (prof == 3) { - /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ - /* pace_always=1 */ - if (rack->rc_always_pace == 0) { - if (tcp_can_enable_pacing() == 0) - return (EBUSY); - } - rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) - rack_set_cc_pacing(rack); - rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; - rack->rack_attempt_hdwr_pace = 0; - /* cmpack=1 */ - if (rack_use_cmp_acks) - rack->r_use_cmp_ack = 1; - if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && - rack->r_use_cmp_ack) - rack->rc_inp->inp_flags2 |= 
INP_MBUF_ACKCMP; - /* scwnd=1 */ - rack->rack_enable_scwnd = 1; - /* dynamic=100 */ - rack->rc_gp_dyn_mul = 1; - /* gp_inc_ca */ + } else { + rack->rc_pace_to_cwnd = 0; + /* Reset all multipliers to 100.0 so just the measured bw */ + rack->r_ctl.rack_per_of_gp_ss = 100; rack->r_ctl.rack_per_of_gp_ca = 100; - /* rrr_conf=3 */ - rack->r_rr_config = 3; - /* npush=2 */ - rack->r_ctl.rc_no_push_at_mrtt = 2; - /* fillcw=2 */ - rack->rc_pace_to_cwnd = 1; - rack->r_fill_less_agg = 1; - rack->rc_pace_fill_if_rttin_range = 0; - rack->rtt_limit_mul = 0; - /* noprr=1 */ - rack->rack_no_prr = 1; - /* lscwnd=1 */ - rack->r_limit_scw = 1; - /* gp_inc_rec */ - rack->r_ctl.rack_per_of_gp_rec = 90; - err = 0; + } + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=1 */ + rack->r_limit_scw = 1; + /* gp_inc_rec */ + rack->r_ctl.rack_per_of_gp_rec = 90; + rack_client_buffer_level_set(rack); + return (0); +} + +static int +rack_set_profile(struct tcp_rack *rack, int prof) +{ + int err = EINVAL; + if (prof == 1) { + /* + * Profile 1 is "standard" DGP. It ignores + * client buffer level. + */ + rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0; + err = rack_set_dgp(rack); + if (err) + return (err); } else if (prof == 2) { - /* cmpack=1 */ - if (rack->rc_always_pace == 0) { - if (tcp_can_enable_pacing() == 0) - return (EBUSY); - } - rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) - rack_set_cc_pacing(rack); - rack->r_use_cmp_ack = 1; - if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) - rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; - /* pace_always=1 */ - rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; - /* scwnd=1 */ - rack->rack_enable_scwnd = 1; - /* dynamic=100 */ - rack->rc_gp_dyn_mul = 1; - rack->r_ctl.rack_per_of_gp_ca = 100; - /* rrr_conf=3 */ - rack->r_rr_config = 3; - /* npush=2 */ - rack->r_ctl.rc_no_push_at_mrtt = 2; - /* fillcw=1 */ - rack->rc_pace_to_cwnd = 1; - rack->rc_pace_fill_if_rttin_range = 0; - rack->rtt_limit_mul = 0; - /* noprr=1 */ - rack->rack_no_prr = 1; - /* lscwnd=0 */ - rack->r_limit_scw = 0; - err = 0; + /* + * Profile 2 is DGP. Less aggressive with + * respect to client buffer level. + */ + rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1; + err = rack_set_dgp(rack); + if (err) + return (err); + } else if (prof == 3) { + /* + * Profile 3 is DGP. Even Less aggressive with + * respect to client buffer level. + */ + rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2; + err = rack_set_dgp(rack); + if (err) + return (err); + } else if (prof == 4) { + /* + * Profile 4 is DGP with the most responsiveness + * to client buffer level. 
+ */ + rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3; + err = rack_set_dgp(rack); + if (err) + return (err); } else if (prof == 0) { /* This changes things back to the default settings */ + rack->dgp_on = 0; + rack->rc_hybrid_mode = 0; err = 0; + if (rack_fill_cw_state) + rack->rc_pace_to_cwnd = 1; + else + rack->rc_pace_to_cwnd = 0; if (rack->rc_always_pace) { tcp_decrement_paced_conn(); rack_undo_cc_pacing(rack); @@ -19343,7 +22481,7 @@ rack_set_profile(struct tcp_rack *rack, int prof) } if (rack_pace_every_seg && tcp_can_enable_pacing()) { rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) + if ((rack->gp_ready) && (rack->use_fixed_rate == 0)) rack_set_cc_pacing(rack); } else rack->rc_always_pace = 0; @@ -19407,6 +22545,7 @@ rack_set_profile(struct tcp_rack *rack, int prof) rack->r_limit_scw = 1; else rack->r_limit_scw = 0; + rack_init_retransmit_value(rack, rack_rxt_controls); err = 0; } return (err); @@ -19432,8 +22571,90 @@ rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) } static int +process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) +{ +#ifdef TCP_REQUEST_TRK + struct http_sendfile_track *sft; + struct timeval tv; + tcp_seq seq; + int err; + + microuptime(&tv); + + /* + * If BB logging is not on we need to look at the DTL flag. + * If its on already then those reasons override the DTL input. + * We do this with any request, you can turn DTL on, but it does + * not turn off at least from hybrid pacing requests. + */ + if (tcp_bblogging_on(rack->rc_tp) == 0) { + if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) { + /* Turn on BB point logging */ + tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS, + TCP_BBPOINT_REQ_LEVEL_LOGGING); + } + } + /* Make sure no fixed rate is on */ + rack->use_fixed_rate = 0; + rack->r_ctl.rc_fixed_pacing_rate_rec = 0; + rack->r_ctl.rc_fixed_pacing_rate_ca = 0; + rack->r_ctl.rc_fixed_pacing_rate_ss = 0; + /* Now allocate or find our entry that will have these settings */ + sft = tcp_http_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0); + if (sft == NULL) { + rack->rc_tp->tcp_hybrid_error++; + /* no space, where would it have gone? */ + seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc; + rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); + return (ENOSPC); + } + /* The seq will be snd_una + everything in the buffer */ + seq = sft->start_seq; + if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { + /* Disabling hybrid pacing */ + if (rack->rc_hybrid_mode) { + rack_set_profile(rack, 0); + rack->rc_tp->tcp_hybrid_stop++; + } + rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0); + return (0); + } + if (rack->dgp_on == 0) { + /* + * If we have not yet turned DGP on, do so + * now setting pure DGP mode, no buffer level + * response. 
+ */ + if ((err = rack_set_profile(rack, 1)) != 0){ + /* Failed to turn pacing on */ + rack->rc_tp->tcp_hybrid_error++; + rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); + return (err); + } + } + /* Now set in our flags */ + sft->hybrid_flags = hybrid->hybrid_flags; + if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) + sft->cspr = hybrid->cspr; + else + sft->cspr = 0; + if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) + sft->hint_maxseg = hybrid->hint_maxseg; + else + sft->hint_maxseg = 0; + rack->rc_hybrid_mode = 1; + rack->rc_tp->tcp_hybrid_start++; + rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); + return (0); +#else + return (ENOTSUP); +#endif +} + +static int rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, - uint32_t optval, uint64_t loptval) + uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) + { struct epoch_tracker et; struct sockopt sopt; @@ -19444,7 +22665,17 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, uint16_t ca, ss; switch (sopt_name) { - + case TCP_RACK_SET_RXT_OPTIONS: + if ((optval >= 0) && (optval <= 2)) { + rack_init_retransmit_value(rack, optval); + } else { + /* + * You must send in 0, 1 or 2 all else is + * invalid. + */ + error = EINVAL; + } + break; case TCP_RACK_DSACK_OPT: RACK_OPTS_INC(tcp_rack_dsack_opt); if (optval & 0x1) { @@ -19459,6 +22690,24 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, } rack_log_dsack_event(rack, 5, __LINE__, 0, 0); break; + case TCP_RACK_PACING_DIVISOR: + RACK_OPTS_INC(tcp_rack_pacing_divisor); + if (optval == 0) { + rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; + } else { + if (optval < RL_MIN_DIVISOR) + rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; + else + rack->r_ctl.pace_len_divisor = optval; + } + break; + case TCP_RACK_HI_BETA: + RACK_OPTS_INC(tcp_rack_hi_beta); + if (optval) + rack->rack_hibeta = 1; + else + rack->rack_hibeta = 0; + break; case TCP_RACK_PACING_BETA: RACK_OPTS_INC(tcp_rack_beta); if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { @@ -19575,6 +22824,13 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, RACK_OPTS_INC(tcp_pacing_rate_cap); rack->r_ctl.bw_rate_cap = loptval; break; + case TCP_HYBRID_PACING: + if (hybrid == NULL) { + error = EINVAL; + break; + } + error = process_hybrid_pacing(rack, hybrid); + break; case TCP_RACK_PROFILE: RACK_OPTS_INC(tcp_profile); error = rack_set_profile(rack, optval); @@ -19599,6 +22855,17 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, else rack->r_limit_scw = 0; break; + case TCP_RACK_DGP_IN_REC: + RACK_OPTS_INC(tcp_dgp_in_rec); + if (optval) + rack->r_ctl.full_dgp_in_rec = 1; + else + rack->r_ctl.full_dgp_in_rec = 0; + break; + case TCP_RXT_CLAMP: + RACK_OPTS_INC(tcp_rxt_clamp); + rack_translate_clamp_value(rack, optval); + break; case TCP_RACK_PACE_TO_FILL: RACK_OPTS_INC(tcp_fillcw); if (optval == 0) @@ -19699,7 +22966,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; - /* Pacing related ones */ + /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: /* * zero is old rack method, 1 is new @@ -19712,7 +22979,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, break; } else if (tcp_can_enable_pacing()) { rack->rc_always_pace = 1; - if (rack->use_fixed_rate || rack->gp_ready) + if ((rack->gp_ready) && 
(rack->use_fixed_rate == 0)) rack_set_cc_pacing(rack); } else { @@ -19803,6 +23070,11 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, else rack->rc_force_max_seg = 0; break; + case TCP_RACK_PACE_MIN_SEG: + RACK_OPTS_INC(tcp_rack_min_seg); + rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); + rack_set_pace_segments(tp, rack, __LINE__, NULL); + break; case TCP_RACK_PACE_MAX_SEG: /* Max segments size in a pace in bytes */ RACK_OPTS_INC(tcp_rack_max_seg); @@ -19818,7 +23090,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) rack->r_ctl.rc_fixed_pacing_rate_ss = optval; rack->use_fixed_rate = 1; - if (rack->rc_always_pace) + if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, @@ -19836,7 +23108,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; - if (rack->rc_always_pace) + if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, @@ -19854,7 +23126,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; - if (rack->rc_always_pace) + if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, @@ -19914,6 +23186,12 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, else rack->r_rr_config = 0; break; + case TCP_PACING_DND: /* URL:dnd */ + if (optval > 0) + rack->rc_pace_dnd = 1; + else + rack->rc_pace_dnd = 0; + break; case TCP_HDWR_RATE_CAP: RACK_OPTS_INC(tcp_hdwr_rate_cap); if (optval) { @@ -19925,6 +23203,10 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, rack->r_rack_hw_rate_caps = 0; } break; + case TCP_RACK_SPLIT_LIMIT: + RACK_OPTS_INC(tcp_split_limit); + rack->r_ctl.rc_split_limit = optval; + break; case TCP_BBR_HDWR_PACE: RACK_OPTS_INC(tcp_hdwr_pacing); if (optval){ @@ -19945,7 +23227,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, #endif } break; - /* End Pacing related ones */ + /* End Pacing related ones */ case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); @@ -20002,13 +23284,6 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, else rack->use_rack_rr = 0; break; - case TCP_FAST_RSM_HACK: - RACK_OPTS_INC(tcp_rack_fastrsm_hack); - if (optval) - rack->fast_rsm_hack = 1; - else - rack->fast_rsm_hack = 0; - break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); @@ -20065,7 +23340,7 @@ rack_apply_deferred_options(struct tcp_rack *rack) TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); /* Disadvantage of deferal is you loose the error return */ s_optval = (uint32_t)dol->optval; - (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval); + (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); free(dol, M_TCPDO); } } @@ -20091,6 +23366,33 @@ rack_pru_options(struct tcpcb *tp, int flags) return (0); } +static bool +rack_wake_check(struct tcpcb *tp) +{ + struct tcp_rack *rack; + struct timeval tv; + uint32_t cts; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->r_ctl.rc_hpts_flags) { + cts = tcp_get_usecs(&tv); + if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ + /* + * Pacing timer is up, check if we are ready. + */ + if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) + return (true); + } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { + /* + * A timer is up, check if we are ready. + */ + if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) + return (true); + } + } + return (false); +} + static struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, @@ -20106,6 +23408,9 @@ static struct tcp_function_block __tcp_rack = { .tfb_tcp_mtu_chg = rack_mtu_change, .tfb_pru_options = rack_pru_options, .tfb_hwtls_change = rack_hw_tls_change, + .tfb_chg_query = rack_chg_query, + .tfb_switch_failed = rack_switch_failed, + .tfb_early_wake_check = rack_wake_check, .tfb_compute_pipe = rack_compute_pipe, .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, }; @@ -20127,8 +23432,9 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) #endif struct tcpcb *tp; struct tcp_rack *rack; + struct tcp_hybrid_req hybrid; uint64_t loptval; - int32_t error = 0, optval; + int32_t error = 0, mask, optval, tclass; tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -20153,10 +23459,15 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) break; case IPV6_TCLASS: /* - * The DSCP codepoint has changed, update the fsb. + * The DSCP codepoint has changed, update the fsb + * by overwriting any previous traffic class. */ - ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | - (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK); + if (inp->in6p_outputopts) { + mask = 0xfc; + tclass = inp->in6p_outputopts->ip6po_tclass; + ip6->ip6_flow &= htonl((~mask) << 20); + ip6->ip6_flow |= htonl((tclass & mask) << 20); + } break; } INP_WUNLOCK(inp); @@ -20181,67 +23492,95 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) INP_WUNLOCK(inp); return (0); #endif - } +#ifdef SO_PEERPRIO + case SOL_SOCKET: + switch (sopt->sopt_name) { + case SO_PEERPRIO: /* SC-URL:bs */ + /* Already read in and sanity checked in sosetopt(). 
*/ + if (inp->inp_socket) { + rack->client_bufferlvl = inp->inp_socket->so_peerprio; + rack_client_buffer_level_set(rack); + } + break; + } + INP_WUNLOCK(inp); + return (0); +#endif + case IPPROTO_TCP: + switch (sopt->sopt_name) { + case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ + /* Pacing related ones */ + case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ + case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ + case TCP_BBR_IWINTSO: /* URL:tso_iwin */ + case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ + case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ + case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ + case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ + case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ + case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ + case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ + case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ + case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ + case TCP_RACK_RR_CONF: /* URL:rrr_conf */ + case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ + case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ + case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ + case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ + case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ + case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ + case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ + case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */ + /* End pacing related */ + case TCP_RXT_CLAMP: /* URL:rxtclamp */ + case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ + case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ + case TCP_RACK_MIN_TO: /* URL:min_to */ + case TCP_RACK_EARLY_SEG: /* URL:early_seg */ + case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ + case TCP_RACK_REORD_FADE: /* URL:reord_fade */ + case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ + case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ + case TCP_RACK_TLP_USE: /* URL:tlp_use */ + case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ + case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ + case TCP_RACK_DO_DETECTION: /* URL:detect */ + case TCP_NO_PRR: /* URL:noprr */ + case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ + case TCP_DATA_AFTER_CLOSE: /* no URL */ + case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ + case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ + case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ + case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ + case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ + case TCP_RACK_PROFILE: /* URL:profile */ + case TCP_HYBRID_PACING: /* URL:hybrid */ + case TCP_USE_CMP_ACKS: /* URL:cmpack */ + case TCP_RACK_ABC_VAL: /* URL:labc */ + case TCP_REC_ABC_VAL: /* URL:reclabc */ + case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ + case TCP_DEFER_OPTIONS: /* URL:defer */ + case TCP_RACK_DSACK_OPT: /* URL:dsack */ + case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ + case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ + case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ + case TCP_RACK_HI_BETA: /* URL:hibeta */ + case TCP_RACK_SPLIT_LIMIT: /* URL:split */ + case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ + case TCP_PACING_DND: /* URL:dnd */ + goto process_opt; + break; + default: + /* Filter off all unknown options to the base stack */ + return (tcp_default_ctloutput(inp, sopt)); + break; + } - switch (sopt->sopt_name) { - case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ - /* Pacing related ones */ - case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ - case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ - case TCP_BBR_IWINTSO: /* URL:tso_iwin */ - case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ - case 
TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ - case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ - case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ - case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ - case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ - case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ - case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ - case TCP_RACK_RR_CONF: /* URL:rrr_conf */ - case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ - case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ - case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ - case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ - /* End pacing related */ - case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ - case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ - case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ - case TCP_RACK_MIN_TO: /* URL:min_to */ - case TCP_RACK_EARLY_SEG: /* URL:early_seg */ - case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ - case TCP_RACK_REORD_FADE: /* URL:reord_fade */ - case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ - case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ - case TCP_RACK_TLP_USE: /* URL:tlp_use */ - case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ - case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ - case TCP_RACK_DO_DETECTION: /* URL:detect */ - case TCP_NO_PRR: /* URL:noprr */ - case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ - case TCP_DATA_AFTER_CLOSE: /* no URL */ - case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ - case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ - case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ - case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ - case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ - case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ - case TCP_RACK_PROFILE: /* URL:profile */ - case TCP_USE_CMP_ACKS: /* URL:cmpack */ - case TCP_RACK_ABC_VAL: /* URL:labc */ - case TCP_REC_ABC_VAL: /* URL:reclabc */ - case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ - case TCP_DEFER_OPTIONS: /* URL:defer */ - case TCP_RACK_DSACK_OPT: /* URL:dsack */ - case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ - case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ - case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ - case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ - break; default: - /* Filter off all unknown options to the base stack */ - return (tcp_default_ctloutput(inp, sopt)); - break; + INP_WUNLOCK(inp); + return (0); } +process_opt: INP_WUNLOCK(inp); if (sopt->sopt_name == TCP_PACING_RATE_CAP) { error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); @@ -20250,6 +23589,8 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) * means rates > 34Gbps won't show right, but thats probably ok. 
*/ optval = (uint32_t)loptval; + } else if (sopt->sopt_name == TCP_HYBRID_PACING) { + error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid)); } else { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); /* Save it in 64 bit form too */ @@ -20258,17 +23599,15 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); INP_WLOCK(inp); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } if (tp->t_fb != &__tcp_rack) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } if (rack->defer_options && (rack->gp_ready == 0) && (sopt->sopt_name != TCP_DEFER_OPTIONS) && + (sopt->sopt_name != TCP_HYBRID_PACING) && (sopt->sopt_name != TCP_RACK_PACING_BETA) && + (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) && (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { /* Options are beind deferred */ @@ -20281,7 +23620,7 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) return (ENOMEM); } } - error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); + error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid); INP_WUNLOCK(inp); return (error); } @@ -20440,8 +23779,14 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) } } break; - case TCP_FAST_RSM_HACK: - optval = rack->fast_rsm_hack; + case TCP_RACK_DGP_IN_REC: + optval = rack->r_ctl.full_dgp_in_rec; + break; + case TCP_RACK_HI_BETA: + optval = rack->rack_hibeta; + break; + case TCP_RXT_CLAMP: + optval = rack->r_ctl.saved_rxt_clamp_val; break; case TCP_DEFER_OPTIONS: optval = rack->defer_options; @@ -20465,6 +23810,10 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) /* You cannot retrieve a profile, its write only */ error = EINVAL; break; + case TCP_HYBRID_PACING: + /* You cannot retrieve hybrid pacing information, its write only */ + error = EINVAL; + break; case TCP_USE_CMP_ACKS: optval = rack->r_use_cmp_ack; break; @@ -20517,6 +23866,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) case TCP_RACK_FORCE_MSEG: optval = rack->rc_force_max_seg; break; + case TCP_RACK_PACE_MIN_SEG: + optval = rack->r_ctl.rc_user_set_min_segs; + break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = rack->rc_user_set_max_segs; @@ -20533,6 +23885,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) /* Minimum time between rack t-o's in ms */ optval = rack->r_ctl.rc_min_to; break; + case TCP_RACK_SPLIT_LIMIT: + optval = rack->r_ctl.rc_split_limit; + break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ optval = rack->r_ctl.rc_early_recovery_segs; @@ -20569,6 +23924,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; + case TCP_PACING_DND: + optval = rack->rc_pace_dnd; + break; case TCP_RACK_PACE_RATE_CA: optval = rack->r_ctl.rc_fixed_pacing_rate_ca; break; @@ -20584,6 +23942,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) case TCP_RACK_GP_INCREASE_CA: optval = rack->r_ctl.rack_per_of_gp_ss; break; + case TCP_RACK_PACING_DIVISOR: + optval = rack->r_ctl.pace_len_divisor; + break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; break; diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index 7e71e764990d..d4a70c529386 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet6.h" 
#include "opt_ipsec.h" #include "opt_ratelimit.h" -#include "opt_kern_tls.h" #include <sys/param.h> #include <sys/arb.h> #include <sys/module.h> @@ -51,9 +50,6 @@ __FBSDID("$FreeBSD$"); #include <sys/qmath.h> #include <sys/socket.h> #include <sys/socketvar.h> -#ifdef KERN_TLS -#include <sys/ktls.h> -#endif #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/tree.h> @@ -130,36 +126,6 @@ __FBSDID("$FreeBSD$"); * Common TCP Functions - These are shared by borth * rack and BBR. */ -#ifdef KERN_TLS -uint32_t -ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) -{ - struct ktls_session *tls; - uint32_t len; - -again: - tls = so->so_snd.sb_tls_info; - len = tls->params.max_frame_len; /* max tls payload */ - len += tls->params.tls_hlen; /* tls header len */ - len += tls->params.tls_tlen; /* tls trailer len */ - if ((len * 4) > rwnd) { - /* - * Stroke this will suck counter and what - * else should we do Drew? From the - * TCP perspective I am not sure - * what should be done... - */ - if (tls->params.max_frame_len > 4096) { - tls->params.max_frame_len -= 4096; - if (tls->params.max_frame_len < 4096) - tls->params.max_frame_len = 4096; - goto again; - } - } - return (len); -} -#endif - static int ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m) { diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index 688c64dd92c3..e9c38c01c3c8 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -87,9 +87,6 @@ #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); -#ifdef KERN_TLS -uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd); -#endif int ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt); diff --git a/sys/netinet/tcp_stacks/tailq_hash.c b/sys/netinet/tcp_stacks/tailq_hash.c new file mode 100644 index 000000000000..2e3b57be2388 --- /dev/null +++ b/sys/netinet/tcp_stacks/tailq_hash.c @@ -0,0 +1,344 @@ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_ratelimit.h" +#include "opt_kern_tls.h" +#include <sys/param.h> +#include <sys/arb.h> +#include <sys/module.h> +#include <sys/kernel.h> +#ifdef TCP_HHOOK +#include <sys/hhook.h> +#endif +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#ifdef STATS +#include <sys/qmath.h> +#include <sys/tree.h> +#include <sys/stats.h> /* Must come after qmath.h and tree.h */ +#else +#include <sys/tree.h> +#endif +#include <sys/refcount.h> +#include <sys/queue.h> +#include <sys/tim_filter.h> +#include <sys/smp.h> +#include <sys/kthread.h> +#include <sys/kern_prefetch.h> +#include <sys/protosw.h> +#ifdef TCP_ACCOUNTING +#include <sys/sched.h> +#include <machine/cpu.h> +#endif +#include <vm/uma.h> + +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/vnet.h> + +#define TCPSTATES /* for logging */ + +#include <netinet/in.h> +#include <netinet/in_kdtrace.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* required for icmp_var.h */ +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/ip_var.h> +#include <netinet/ip6.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/ip6_var.h> +#include 
<netinet/tcp.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_log_buf.h> +#include <netinet/tcp_syncache.h> +#include <netinet/tcp_hpts.h> +#include <netinet/tcp_ratelimit.h> +#include <netinet/tcp_accounting.h> +#include <netinet/tcpip.h> +#include <netinet/cc/cc.h> +#include <netinet/cc/cc_newreno.h> +#include <netinet/tcp_fastopen.h> +#include <netinet/tcp_lro.h> +#ifdef NETFLIX_SHARED_CWND +#include <netinet/tcp_shared_cwnd.h> +#endif +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcp_ecn.h> + +#include <netipsec/ipsec_support.h> + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include <netipsec/ipsec.h> +#include <netipsec/ipsec6.h> +#endif /* IPSEC */ + +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <machine/in_cksum.h> + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif +#include "sack_filter.h" +#include "tcp_rack.h" +#include "tailq_hash.h" + + +struct rack_sendmap * +tqhash_min(struct tailq_hash *hs) +{ + struct rack_sendmap *rsm; + + rsm = tqhash_find(hs, hs->min); + return(rsm); +} + +struct rack_sendmap * +tqhash_max(struct tailq_hash *hs) +{ + struct rack_sendmap *rsm; + + rsm = tqhash_find(hs, (hs->max - 1)); + return (rsm); +} + +int +tqhash_empty(struct tailq_hash *hs) +{ + if (hs->count == 0) + return(1); + return(0); +} + +struct rack_sendmap * +tqhash_find(struct tailq_hash *hs, uint32_t seq) +{ + struct rack_sendmap *e; + int bindex, pbucket, fc = 1; + + if ((SEQ_LT(seq, hs->min)) || + (hs->count == 0) || + (SEQ_GEQ(seq, hs->max))) { + /* Not here */ + return (NULL); + } + bindex = seq / SEQ_BUCKET_SIZE; + bindex %= MAX_HASH_ENTRIES; + /* Lets look through the bucket it belongs to */ + if (TAILQ_EMPTY(&hs->ht[bindex])) { + goto look_backwards; + } + TAILQ_FOREACH(e, &hs->ht[bindex], next) { + if (fc == 1) { + /* + * Special check for when a cum-ack + * as moved up over a seq and now its + * a bucket behind where it belongs. In + * the case of SACKs which create new rsm's + * this won't occur. 
+ */ + if (SEQ_GT(e->r_start, seq)) { + goto look_backwards; + } + fc = 0; + } + if (SEQ_GEQ(seq, e->r_start) && + (SEQ_LT(seq, e->r_end))) { + /* Its in this block */ + return (e); + } + } + /* Did not find it */ + return (NULL); +look_backwards: + if (bindex == 0) + pbucket = MAX_HASH_ENTRIES - 1; + else + pbucket = bindex - 1; + TAILQ_FOREACH_REVERSE(e, &hs->ht[pbucket], rack_head, next) { + if (SEQ_GEQ(seq, e->r_start) && + (SEQ_LT(seq, e->r_end))) { + /* Its in this block */ + return (e); + } + if (SEQ_GEQ(e->r_end, seq)) + break; + } + return (NULL); +} + +struct rack_sendmap * +tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm) +{ + struct rack_sendmap *e; + + e = TAILQ_NEXT(rsm, next); + if (e == NULL) { + /* Move to next bucket */ + int nxt; + + nxt = rsm->bindex + 1; + if (nxt >= MAX_HASH_ENTRIES) + nxt = 0; + e = TAILQ_FIRST(&hs->ht[nxt]); + } + return(e); +} + +struct rack_sendmap * +tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm) +{ + struct rack_sendmap *e; + + e = TAILQ_PREV(rsm, rack_head, next); + if (e == NULL) { + int prev; + + if (rsm->bindex > 0) + prev = rsm->bindex - 1; + else + prev = MAX_HASH_ENTRIES - 1; + e = TAILQ_LAST(&hs->ht[prev], rack_head); + } + return (e); +} + +void +tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type) +{ + TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next); + hs->count--; + if (hs->count == 0) { + hs->min = hs->max; + } else if (type == REMOVE_TYPE_CUMACK) { + hs->min = rsm->r_end; + } +} + +int +tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm) +{ + struct rack_sendmap *e, *l; + int inserted = 0; + uint32_t ebucket; + + if (hs->count > 0) { + if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) { + return (-1); + } + e = tqhash_find(hs, rsm->r_start); + if (e) { + return (-2); + } + } + rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE; + rsm->bindex %= MAX_HASH_ENTRIES; + ebucket = rsm->r_end / SEQ_BUCKET_SIZE; + ebucket %= MAX_HASH_ENTRIES; + if (ebucket != rsm->bindex) { + /* This RSM straddles the bucket boundary */ + rsm->r_flags |= RACK_STRADDLE; + } else { + rsm->r_flags &= ~RACK_STRADDLE; + } + if (hs->count == 0) { + /* Special case */ + hs->min = rsm->r_start; + hs->max = rsm->r_end; + hs->count = 1; + } else { + hs->count++; + if (SEQ_GT(rsm->r_end, hs->max)) + hs->max = rsm->r_end; + if (SEQ_LT(rsm->r_start, hs->min)) + hs->min = rsm->r_start; + } + /* Check the common case of inserting at the end */ + l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head); + if ((l == NULL) || (SEQ_GT(rsm->r_start, l->r_start))) { + TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next); + return (0); + } + TAILQ_FOREACH(e, &hs->ht[rsm->bindex], next) { + if (SEQ_LEQ(rsm->r_start, e->r_start)) { + inserted = 1; + TAILQ_INSERT_BEFORE(e, rsm, next); + break; + } + } + if (inserted == 0) { + TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next); + } + return (0); +} + +void +tqhash_init(struct tailq_hash *hs) +{ + int i; + + for(i = 0; i < MAX_HASH_ENTRIES; i++) { + TAILQ_INIT(&hs->ht[i]); + } + hs->min = hs->max = 0; + hs->count = 0; +} + +int +tqhash_trim(struct tailq_hash *hs, uint32_t th_ack) +{ + struct rack_sendmap *rsm; + + if (SEQ_LT(th_ack, hs->min)) { + /* It can't be behind our current min */ + return (-1); + } + if (SEQ_GEQ(th_ack, hs->max)) { + /* It can't be beyond or at our current max */ + return (-2); + } + rsm = tqhash_min(hs); + if (rsm == NULL) { + /* nothing to trim */ + return (-3); + } + if (SEQ_GEQ(th_ack, rsm->r_end)) { + /* + * You can't trim all bytes instead + * you need to remove it. 
+ */ + return (-4); + } + if (SEQ_GT(th_ack, hs->min)) + hs->min = th_ack; + /* + * Should we trim it for the caller? + * they may have done so already, which is ok... + */ + if (SEQ_GT(th_ack, rsm->r_start)) { + rsm->r_start = th_ack; + } + return (0); +} + diff --git a/sys/netinet/tcp_stacks/tailq_hash.h b/sys/netinet/tcp_stacks/tailq_hash.h new file mode 100644 index 000000000000..ae8d3e00f558 --- /dev/null +++ b/sys/netinet/tcp_stacks/tailq_hash.h @@ -0,0 +1,73 @@ +#ifndef __tailq_hash__ +#define __tailq_hash__ + +/* Must be powers of 2 */ +#define MAX_HASH_ENTRIES 128 +#define SEQ_BUCKET_SIZE 262144 +/* + * The max seq range that can be stored is + * 127 x 262144, or roughly 32Meg. One bucket is + * held in reserve as fall-over so the hash never + * wraps onto other valid entries. + */ +#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1)) + +struct tailq_hash { + struct rack_head ht[MAX_HASH_ENTRIES]; + uint32_t min; + uint32_t max; + uint32_t count; +}; + +struct rack_sendmap * +tqhash_min(struct tailq_hash *hs); + +struct rack_sendmap * +tqhash_max(struct tailq_hash *hs); + +int +tqhash_empty(struct tailq_hash *hs); + +struct rack_sendmap * +tqhash_find(struct tailq_hash *hs, uint32_t seq); + +struct rack_sendmap * +tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm); + +struct rack_sendmap * +tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm); + +#define REMOVE_TYPE_CUMACK 1 /* Cumack moved */ +#define REMOVE_TYPE_MERGE 2 /* Merging two blocks */ +#define REMOVE_TYPE_FINI 3 /* The connection is over */ + +void +tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type); + +int +tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm); + +void +tqhash_init(struct tailq_hash *hs); + +int +tqhash_trim(struct tailq_hash *hs, uint32_t th_ack); + + +#define TQHASH_FOREACH(var, head) \ + for ((var) = tqhash_min((head)); \ + (var); \ + (var) = tqhash_next((head), (var))) + +#define TQHASH_FOREACH_FROM(var, head, fvar) \ + for ((var) = ((fvar) ? (fvar) : tqhash_min((head))); \ + (var); \ + (var) = tqhash_next((head), (var))) + +#define TQHASH_FOREACH_REVERSE_FROM(var, head) \ + for ((var) = ((var) ?
(var) : tqhash_max((head))); \ + (var); \ + (var) = tqhash_prev((head), (var))) + + +#endif diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index 6f447d5ea470..fbf7b4bddda9 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -45,6 +45,11 @@ #define RACK_SENT_FP 0x004000/* sent in fast path */ #define RACK_HAD_PUSH 0x008000/* Push was sent on original send */ #define RACK_MUST_RXT 0x010000/* We must retransmit this rsm (non-sack/mtu chg)*/ +#define RACK_IN_GP_WIN 0x020000/* Send was in GP window when sent */ +#define RACK_SHUFFLED 0x040000/* The RSM was shuffled some data from one to another */ +#define RACK_MERGED 0x080000/* The RSM was merged */ +#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */ +#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */ @@ -52,7 +57,9 @@ #define RACK_REQ_AVG 3 /* Must be less than 256 */ struct rack_sendmap { + TAILQ_ENTRY(rack_sendmap) next; TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ + uint32_t bindex; uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ @@ -60,7 +67,8 @@ struct rack_sendmap { r_rtr_cnt : 8; /* Retran count, index this -1 to get time */ struct mbuf *m; uint32_t soff; - uint32_t orig_m_len; + uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */ + uint32_t orig_t_space; /* The original trailing space when we sent (can update) */ uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */ uint8_t r_dupack; /* Dup ack count */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ @@ -72,8 +80,8 @@ struct rack_sendmap { r_avail : 4; uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */ - RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */ uint32_t r_fas; /* Flight at send */ + uint8_t r_bas; /* The burst size (burst at send = bas) */ }; struct deferred_opt_list { @@ -201,11 +209,11 @@ struct rack_opts_stats { uint64_t tcp_rack_pace_rate_ss; uint64_t tcp_rack_pace_rate_rec; /* Temp counters for dsack */ - uint64_t tcp_sack_path_1; - uint64_t tcp_sack_path_2a; - uint64_t tcp_sack_path_2b; - uint64_t tcp_sack_path_3; - uint64_t tcp_sack_path_4; + uint64_t tcp_sack_path_1; /* not used */ + uint64_t tcp_sack_path_2a; /* not used */ + uint64_t tcp_sack_path_2b; /* not used */ + uint64_t tcp_sack_path_3; /* not used */ + uint64_t tcp_sack_path_4; /* not used */ /* non temp counters */ uint64_t tcp_rack_scwnd; uint64_t tcp_rack_noprr; @@ -227,11 +235,16 @@ struct rack_opts_stats { uint64_t tcp_rack_rtt_use; uint64_t tcp_data_after_close; uint64_t tcp_defer_opt; - uint64_t tcp_rack_fastrsm_hack; + uint64_t tcp_rxt_clamp; uint64_t tcp_rack_beta; uint64_t tcp_rack_beta_ecn; uint64_t tcp_rack_timer_slop; uint64_t tcp_rack_dsack_opt; + uint64_t tcp_rack_hi_beta; + uint64_t tcp_split_limit; + uint64_t tcp_rack_pacing_divisor; + uint64_t tcp_rack_min_seg; + uint64_t tcp_dgp_in_rec; }; /* RTT shrink reasons */ @@ -261,38 +274,6 @@ struct rack_opts_stats { #define RACK_QUALITY_PROBERTT 4 /* A measurement where we went into or exited probe RTT */ #define RACK_QUALITY_ALLACKED 5 /* All data is now acknowledged */ -/*********************/ -/* Rack Trace points */ -/*********************/ -/* - * Rack 
trace points are interesting points within - * the rack code that the author/debugger may want - * to have BB logging enabled if we hit that point. - * In order to enable a trace point you set the - * sysctl var net.inet.tcp.<stack>.tp.number to - * one of the numbers listed below. You also - * must make sure net.inet.tcp.<stack>.tp.bbmode is - * non-zero, the default is 4 for continuous tracing. - * You also set in the number of connections you want - * have get BB logs in net.inet.tcp.<stack>.tp.count. - * - * Count will decrement every time BB logging is assigned - * to a connection that hit your tracepoint. - * - * You can enable all trace points by setting the number - * to 0xffffffff. You can disable all trace points by - * setting number to zero (or count to 0). - * - * Below are the enumerated list of tracepoints that - * have currently been defined in the code. Add more - * as you add a call to rack_trace_point(rack, <name>); - * where <name> is defined below. - */ -#define RACK_TP_HWENOBUF 0x00000001 /* When we are doing hardware pacing and hit enobufs */ -#define RACK_TP_ENOBUF 0x00000002 /* When we hit enobufs with software pacing */ -#define RACK_TP_COLLAPSED_WND 0x00000003 /* When a peer to collapses its rwnd on us */ -#define RACK_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */ - #define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */ #ifdef _KERNEL #define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t)) @@ -356,14 +337,17 @@ struct rack_fast_send_blk { struct udphdr *udp; struct mbuf *m; uint32_t o_m_len; + uint32_t o_t_len; uint32_t rfo_apply_push : 1, hw_tls : 1, unused : 30; }; +struct tailq_hash; + struct rack_control { /* Second cache line 0x40 from tcp_rack */ - struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */ + struct tailq_hash *tqh; /* Tree of all segments Lock(a) */ struct rack_head rc_tmap; /* List in transmit order Lock(a) */ struct rack_sendmap *rc_tlpsend; /* Remembered place for * tlp_sending Lock(a) */ @@ -371,8 +355,8 @@ struct rack_control { * resend */ struct rack_fast_send_blk fsb; /* The fast-send block */ uint32_t timer_slop; - uint32_t input_pkt; - uint32_t saved_input_pkt; + uint16_t pace_len_divisor; + uint16_t rc_user_set_min_segs; uint32_t rc_hpts_flags; uint32_t rc_fixed_pacing_rate_ca; uint32_t rc_fixed_pacing_rate_rec; @@ -387,6 +371,7 @@ struct rack_control { uint64_t last_hw_bw_req; uint64_t crte_prev_rate; uint64_t bw_rate_cap; + uint64_t last_cumack_advance; /* Last time cumack moved forward */ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP @@ -401,6 +386,7 @@ struct rack_control { uint32_t last_sent_tlp_seq; /* Last tlp sequence that was retransmitted Lock(a) */ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ + uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */ uint16_t last_sent_tlp_len; /* Number of bytes in the last sent tlp */ @@ -418,6 +404,7 @@ struct rack_control { * have allocated */ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ + uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */ uint32_t rc_last_output_to; uint32_t rc_went_idle_time; @@ -462,7 +449,20 @@ struct rack_control { uint64_t last_max_bw; /* Our calculated max b/w last */ struct time_filter_small rc_gp_min_rtt; struct def_opt_head opt_list; + uint64_t 
lt_bw_time; /* Total time with data outstanding (lt_bw = long term bandwidth) */ + uint64_t lt_bw_bytes; /* Total bytes acked */ + uint64_t lt_timemark; /* 64 bit timestamp when we started sending */ + struct http_sendfile_track *rc_last_sft; + uint32_t lt_seq; /* Seq at start of lt_bw gauge */ int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ + uint64_t last_sndbytes; + uint64_t last_snd_rxt_bytes; + uint64_t rxt_threshold; + uint32_t last_rnd_rxt_clamped; + uint32_t num_of_clamps_applied; + uint32_t clamp_options; + uint32_t max_clamps; + uint32_t rc_gp_srtt; /* Current GP srtt */ uint32_t rc_prev_gp_srtt; /* Previous RTT */ uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */ @@ -502,6 +502,10 @@ struct rack_control { uint32_t rc_min_to; /* Socket option value Lock(a) */ uint32_t rc_pkt_delay; /* Socket option value Lock(a) */ uint32_t persist_lost_ends; + uint32_t ack_during_sd; + uint32_t input_pkt; + uint32_t saved_input_pkt; + uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */ struct newreno rc_saved_beta; /* * For newreno cc: * rc_saved_cc are the values we have had @@ -516,6 +520,8 @@ struct rack_control { */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ + uint8_t rack_per_upper_bound_ss; + uint8_t rack_per_upper_bound_ca; uint8_t dsack_persist; uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */ uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */ @@ -523,9 +529,55 @@ struct rack_control { uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_rate_sample_method; + uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */ + uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */ + uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */ + uint8_t pacing_discount_amm; /* + * This is a multipler to the base discount that + * can be used to increase the discount. + */ + uint8_t already_had_a_excess; }; #endif +/* DGP with no buffer level mitigations */ +#define DGP_LEVEL0 0 + +/* + * DGP with buffer level mitigation where BL:4 caps fillcw and BL:5 + * turns off fillcw. 
+ */ +#define DGP_LEVEL1 1 + +/* + * DGP with buffer level mitigation where BL:3 caps fillcw and BL:4 turns off fillcw + * and BL:5 reduces by 10% + */ +#define DGP_LEVEL2 2 + +/* + * DGP with buffer level mitigation where BL:2 caps fillcw and BL:3 turns off + * fillcw BL:4 reduces by 10% and BL:5 reduces by 20% + */ +#define DGP_LEVEL3 3 + +/* Hybrid pacing log defines */ +#define HYBRID_LOG_NO_ROOM 0 /* No room for the clients request */ +#define HYBRID_LOG_TURNED_OFF 1 /* Turned off hybrid pacing */ +#define HYBRID_LOG_NO_PACING 2 /* Failed to set pacing on */ +#define HYBRID_LOG_RULES_SET 3 /* Hybrid pacing for this chunk is set */ +#define HYBRID_LOG_NO_RANGE 4 /* In DGP mode, no range found */ +#define HYBRID_LOG_RULES_APP 5 /* The specified rules were applied */ +#define HYBRID_LOG_REQ_COMP 6 /* The request completed */ +#define HYBRID_LOG_BW_MEASURE 7 /* Follow up b/w measurements to the previous completed log */ +#define HYBRID_LOG_RATE_CAP 8 /* We had a rate cap apply */ +#define HYBRID_LOG_CAP_CALC 9 /* How we calculate the cap */ +#define HYBRID_LOG_ISSAME 10 /* Same as before -- temp */ +#define HYBRID_LOG_ALLSENT 11 /* We sent it all no more rate-cap */ +#define HYBRID_LOG_OUTOFTIME 12 /* We are past the deadline DGP */ +#define HYBRID_LOG_CAPERROR 13 /* Hit one of the TSNH cases */ +#define HYBRID_LOG_EXTEND 14 /* We extended the end */ + #define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */ #define RACK_MINRTT_FILTER_TIM 10 /* Seconds */ @@ -558,11 +610,11 @@ struct tcp_rack { shape_rxt_to_pacing_min : 1, /* ******************************************************************** */ rc_ack_required: 1, - spare : 1; + r_pacing_discount : 1; uint8_t no_prr_addback : 1, gp_ready : 1, defer_options: 1, - fast_rsm_hack: 1, + excess_rxt_on: 1, /* Are actions on for excess retransmissions? */ rc_ack_can_sendout_data: 1, /* * If set it will override pacing restrictions on not sending * data when the pacing timer is running. I.e. you set this @@ -590,7 +642,8 @@ struct tcp_rack { rc_last_sent_tlp_seq_valid: 1, rc_last_sent_tlp_past_cumack: 1, probe_not_answered: 1, - avail_bytes : 2; + rack_hibeta : 1, + lt_bw_up : 1; uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */ rtt_limit_mul : 4, /* muliply this by low rtt */ @@ -616,11 +669,15 @@ struct tcp_rack { r_use_labc_for_rec: 1, rc_highly_buffered: 1, /* The path is highly buffered */ rc_dragged_bottom: 1, - rc_dack_mode : 1, /* Mac O/S emulation of d-ack */ - rc_dack_toggle : 1, /* For Mac O/S emulation of d-ack */ + rc_pace_dnd : 1, /* The pace do not disturb bit */ + rc_avali2 : 1, rc_gp_filled : 1, - rc_is_spare : 1; - uint8_t r_state; /* Current rack state Lock(a) */ + rc_hw_nobuf : 1; + uint8_t r_state : 4, /* Current rack state Lock(a) */ + rc_catch_up : 1, /* catch up mode in dgp */ + rc_hybrid_mode : 1, /* We are in hybrid mode */ + rc_suspicious : 1, /* Suspect sacks have been given */ + rc_new_rnd_needed: 1; uint8_t rc_tmr_stopped : 7, t_timers_stopped : 1; uint8_t rc_enobuf : 7, /* count of enobufs on connection provides */ @@ -636,8 +693,8 @@ struct tcp_rack { uint8_t app_limited_needs_set : 1, use_fixed_rate : 1, rc_has_collapsed : 1, - r_rep_attack : 1, - r_rep_reverse : 1, + r_cwnd_was_clamped : 1, + r_clamped_gets_lower : 1, rack_hdrw_pacing : 1, /* We are doing Hardware pacing */ rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? 
*/ rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */ @@ -660,8 +717,8 @@ struct tcp_rack { r_wanted_output: 1, r_rr_config : 2, r_persist_lt_bw_off : 1, - r_collapse_point_valid : 1, - rc_avail_bit : 2; + r_collapse_point_valid : 1, + dgp_on : 1; uint16_t rc_init_win : 8, rc_gp_rtt_set : 1, rc_gp_dyn_mul : 1, diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 1f2256c6b6f9..fcd430f270f3 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -158,6 +158,11 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, CTLFLAG_RW, &tcp_sad_limit, 10000, "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); +int32_t tcp_sad_limit = 10000; +SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, + CTLFLAG_RW, + &tcp_sad_limit, 10000, + "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, |
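For reference while reading the new tailq_hash.c/tailq_hash.h hunks above, below is a minimal userland sketch of the sequence-to-bucket mapping those routines rely on. MAX_HASH_ENTRIES and SEQ_BUCKET_SIZE are copied from the patch; struct toy_entry, seq_to_bucket(), toy_find() and the main() harness are illustrative only and are not part of the change, and serial-number (SEQ_LT/SEQ_GT) wraparound is deliberately ignored for brevity.

/*
 * Minimal userland sketch of the tqhash_* sequence-to-bucket mapping.
 * Constants mirror tailq_hash.h; the toy entry, toy_find() and main()
 * are illustrative only and are not part of the patch.  Sequence-number
 * wraparound (SEQ_LT/SEQ_GT) is ignored here for brevity.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_HASH_ENTRIES 128
#define SEQ_BUCKET_SIZE  262144

struct toy_entry {
	uint32_t r_start;	/* first sequence covered */
	uint32_t r_end;		/* one past the last covered sequence */
};

/* Bucket index for a sequence, as computed in tqhash_insert()/tqhash_find(). */
static uint32_t
seq_to_bucket(uint32_t seq)
{
	return ((seq / SEQ_BUCKET_SIZE) % MAX_HASH_ENTRIES);
}

/*
 * Locate the bucket holding an entry that covers seq.  An entry is stored
 * in the bucket of its r_start, so when seq hashes one bucket ahead (the
 * entry straddles a bucket boundary) the lookup steps back one bucket,
 * much like the look_backwards path in tqhash_find() and the
 * RACK_STRADDLE handling in tqhash_insert().
 */
static int
toy_find(const struct toy_entry *e, uint32_t seq, uint32_t *bucket)
{
	uint32_t b;

	if (seq < e->r_start || seq >= e->r_end)
		return (0);
	b = seq_to_bucket(seq);
	if (seq_to_bucket(e->r_start) != b)
		b = (b == 0) ? (MAX_HASH_ENTRIES - 1) : (b - 1);
	*bucket = b;
	return (1);
}

int
main(void)
{
	/* An entry that straddles the first bucket boundary (262144). */
	struct toy_entry e = { .r_start = 262000, .r_end = 263000 };
	uint32_t b;

	if (toy_find(&e, 262500, &b))
		printf("seq 262500 lives in bucket %u (hashes to bucket %u)\n",
		    (unsigned)b, (unsigned)seq_to_bucket(262500));
	return (0);
}

As the tailq_hash.h comment notes, keeping the tracked range under MAX_ALLOWED_SEQ_RANGE (one bucket short of the full ring) appears to be what keeps this single backward step unambiguous, since live entries can never wrap around onto each other's buckets.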