-rw-r--r--  sys/modules/tcp/rack/Makefile             |    2
-rw-r--r--  sys/netinet/tcp_stacks/rack.c             | 6007
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.c  |   34
-rw-r--r--  sys/netinet/tcp_stacks/rack_bbr_common.h  |    3
-rw-r--r--  sys/netinet/tcp_stacks/tailq_hash.c       |  344
-rw-r--r--  sys/netinet/tcp_stacks/tailq_hash.h       |   73
-rw-r--r--  sys/netinet/tcp_stacks/tcp_rack.h         |  165
-rw-r--r--  sys/netinet/tcp_subr.c                    |    5
8 files changed, 5218 insertions, 1415 deletions
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
index cf95faa7fcfd..b80f34ba7ed4 100644
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 8b205d12d7f7..514d10098ff6 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -129,6 +129,7 @@ __FBSDID("$FreeBSD$");
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
+#include "tailq_hash.h"
#include "rack_bbr_common.h"
uma_zone_t rack_zone;
@@ -191,21 +192,38 @@ static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
+static uint32_t rack_clamp_ss_upper = 110;
+static uint32_t rack_clamp_ca_upper = 105;
+static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */
+static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */
+static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */
+static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */
+static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */
+static int32_t rack_rxt_controls = 0;
+static int32_t rack_fill_cw_state = 0;
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
+static uint32_t rack_merge_out_sacks_on_attack = 0;
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
-static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */
-static int32_t rack_hw_rate_caps = 1; /* 1; */
+static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */
+static int32_t rack_hw_rate_caps = 0; /* 1; */
+static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
-static int32_t rack_hw_up_only = 1;
+static int32_t rack_hw_up_only = 0;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;
+static int32_t rack_hibeta_setting = 0;
+static int32_t rack_default_pacing_divisor = 250;
+static int32_t rack_uses_full_dgp_in_rec = 1;
+static uint16_t rack_pacing_min_seg = 0;
+
+static uint32_t sad_seg_size_per = 800; /* 80.0 % */
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */
@@ -219,11 +237,13 @@ static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
+static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
+static int32_t rack_hybrid_allow_set_maxseg = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
@@ -233,11 +253,12 @@ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to us
static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
-static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */
+static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
-
+static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */
+static int32_t rack_full_buffer_discount = 10;
/*
* Currently regular tcp has a rto_min of 30ms
* the backoff goes 12 times so that ends up
@@ -326,8 +347,6 @@ static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
-/* Weird delayed ack mode */
-static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
@@ -336,6 +355,7 @@ counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
+counter_u64_t rack_total_bytes;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
@@ -352,6 +372,8 @@ counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
+counter_u64_t rack_rxt_clamps_cwnd;
+counter_u64_t rack_rxt_clamps_cwnd_uniq;
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;
@@ -367,6 +389,7 @@ counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
+counter_u64_t rack_sack_attacks_suspect;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
@@ -455,18 +478,25 @@ static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq th_ack, int line, uint8_t quality);
+static void
+rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm);
+
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp, void **ptr);
static void rack_init_sysctls(void);
+
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
- struct tcphdr *th, int entered_rec, int dup_ack_struck);
+ struct tcphdr *th, int entered_rec, int dup_ack_struck,
+ int *dsack_seen, int *sacks_seen);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
- struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
+ struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
+
+static uint64_t rack_get_gp_est(struct tcp_rack *rack);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
@@ -477,7 +507,7 @@ static int32_t rack_output(struct tcpcb *tp);
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
- uint32_t cts, int *moved_two);
+ uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
@@ -486,10 +516,10 @@ static int32_t rack_stopall(struct tcpcb *tp);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
+ struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
+ struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
@@ -530,6 +560,7 @@ static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
+static void rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
uint32_t tsused);
@@ -544,6 +575,26 @@ rack_apply_deferred_options(struct tcp_rack *rack);
int32_t rack_clear_counter=0;
+static uint64_t
+rack_get_lt_bw(struct tcp_rack *rack)
+{
+ struct timeval tv;
+ uint64_t tim, bytes;
+
+ tim = rack->r_ctl.lt_bw_time;
+ bytes = rack->r_ctl.lt_bw_bytes;
+ if (rack->lt_bw_up) {
+ /* Include all the current bytes too */
+ microuptime(&tv);
+ bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
+ tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
+ }
+ if ((bytes != 0) && (tim != 0))
+ return ((bytes * (uint64_t)1000000) / tim);
+ else
+ return (0);
+}
+
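[Note on the helper above: the long-term bandwidth estimate is plain accumulation arithmetic, total bytes over total active time in microseconds, with the still-open interval folded in when lt_bw_up is set. A minimal standalone sketch of that calculation; the parameter names are illustrative stand-ins for the rack->r_ctl fields, not names from the tree.]

    #include <stdint.h>

    /*
     * Illustrative only: bytes accumulated over active-sending time
     * (microseconds), converted to bytes per second.  Stands in for
     * lt_bw_bytes / lt_bw_time plus the open interval in the function
     * above.
     */
    static uint64_t
    lt_bw_estimate(uint64_t acc_bytes, uint64_t acc_time_us,
        uint64_t unacked_bytes, uint64_t open_interval_us)
    {
        uint64_t bytes = acc_bytes + unacked_bytes;   /* include current run */
        uint64_t tim = acc_time_us + open_interval_us;

        if (bytes == 0 || tim == 0)
            return (0);
        return ((bytes * (uint64_t)1000000) / tim);   /* bytes per second */
    }

    int
    main(void)
    {
        /* 1 MB moved over 2 seconds of active sending -> 500000 bytes/sec */
        return (lt_bw_estimate(1000000, 2000000, 0, 0) == 500000 ? 0 : 1);
    }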
static void
rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
{
@@ -645,7 +696,7 @@ rack_set_cc_pacing(struct tcp_rack *rack)
rack->rc_pacing_cc_set = 1;
rack_swap_beta_values(rack, 3);
}
-
+
static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
@@ -659,6 +710,42 @@ rack_undo_cc_pacing(struct tcp_rack *rack)
rack_swap_beta_values(rack, 4);
}
+static void
+rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
+ uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = seq_end;
+ log.u_bbr.flex2 = rack->rc_tp->gput_seq;
+ log.u_bbr.flex3 = ack_end_t;
+ log.u_bbr.flex4 = rack->rc_tp->gput_ts;
+ log.u_bbr.flex5 = send_end_t;
+ log.u_bbr.flex6 = rack->rc_tp->gput_ack;
+ log.u_bbr.flex7 = mode;
+ log.u_bbr.flex8 = 69;
+ log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts;
+ log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts;
+ log.u_bbr.pkts_out = line;
+ log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
+ log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
+ if (rsm != NULL) {
+ log.u_bbr.applimited = rsm->r_start;
+ log.u_bbr.delivered = rsm->r_end;
+ log.u_bbr.epoch = rsm->r_flags;
+ }
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HPTSI_CALC, 0,
+ 0, &log, false, &tv);
+ }
+}
+
#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
@@ -697,6 +784,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_saw_enobuf_hw);
counter_u64_zero(rack_saw_enetunreach);
counter_u64_zero(rack_persists_sends);
+ counter_u64_zero(rack_total_bytes);
counter_u64_zero(rack_persists_acks);
counter_u64_zero(rack_persists_loss);
counter_u64_zero(rack_persists_lost_ends);
@@ -719,10 +807,13 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_to_alloc_limited);
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
+ counter_u64_zero(rack_rxt_clamps_cwnd);
+ counter_u64_zero(rack_rxt_clamps_cwnd_uniq);
counter_u64_zero(rack_multi_single_eq);
counter_u64_zero(rack_proc_non_comp_ack);
counter_u64_zero(rack_sack_attacks_detected);
counter_u64_zero(rack_sack_attacks_reversed);
+ counter_u64_zero(rack_sack_attacks_suspect);
counter_u64_zero(rack_sack_used_next_merge);
counter_u64_zero(rack_sack_used_prev_merge);
counter_u64_zero(rack_sack_splits);
@@ -737,6 +828,18 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_collapsed_win_rxt);
counter_u64_zero(rack_collapsed_win_seen);
counter_u64_zero(rack_collapsed_win_rxt_bytes);
+ } else if (stat == 2) {
+#ifdef INVARIANTS
+ printf("Clearing RACK option array\n");
+#endif
+ COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE);
+ } else if (stat == 3) {
+ printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n");
+ } else if (stat == 4) {
+#ifdef INVARIANTS
+ printf("Clearing RACK out size array\n");
+#endif
+ COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE);
}
rack_clear_counter = 0;
return (0);
@@ -895,14 +998,44 @@ rack_init_sysctls(void)
"Pacing related Controls");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fulldgpinrec", CTLFLAG_RW,
+ &rack_uses_full_dgp_in_rec, 1,
+ "Do we use all DGP features in recovery (fillcw, timely et.al.)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fullbufdisc", CTLFLAG_RW,
+ &rack_full_buffer_discount, 10,
+ "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fillcw", CTLFLAG_RW,
+ &rack_fill_cw_state, 0,
+ "Enable fillcw on new connections (default=0 off)?");
+ SYSCTL_ADD_U16(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "min_burst", CTLFLAG_RW,
+ &rack_pacing_min_seg, 0,
+ "What is the min burst size for pacing (0 disables)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "divisor", CTLFLAG_RW,
+ &rack_default_pacing_divisor, 4,
+ "What is the default divisor given to the rl code?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
+ &rack_bw_multipler, 2,
+ "What is the multiplier of the current gp_est that fillcw can increase the b/w too?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "max_pace_over", CTLFLAG_RW,
&rack_max_per_above, 30,
"What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "pace_to_one", CTLFLAG_RW,
+ OID_AUTO, "allow1mss", CTLFLAG_RW,
&rack_pace_one_seg, 0,
- "Do we allow low b/w pacing of 1MSS instead of two");
+ "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
@@ -967,8 +1100,13 @@ rack_init_sysctls(void)
"How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "precheck", CTLFLAG_RW,
+ &rack_hw_check_queue, 0,
+ "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
- &rack_enobuf_hw_boost_mult, 2,
+ &rack_enobuf_hw_boost_mult, 0,
"By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
@@ -988,10 +1126,15 @@ rack_init_sysctls(void)
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rate_cap", CTLFLAG_RW,
- &rack_hw_rate_caps, 1,
+ &rack_hw_rate_caps, 0,
"Does the highest hardware pacing rate cap the rate we will send at??");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "uncap_per", CTLFLAG_RW,
+ &rack_hw_rate_cap_per, 0,
+ "If you go over b/w by this amount you will be uncapped (0 = never)");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "rate_min", CTLFLAG_RW,
&rack_hw_rate_min, 0,
"Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
@@ -1003,12 +1146,12 @@ rack_init_sysctls(void)
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "up_only", CTLFLAG_RW,
- &rack_hw_up_only, 1,
+ &rack_hw_up_only, 0,
"Do we allow hw pacing to lower the rate selected?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_hw_pacing),
OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
- &rack_hw_pace_extra_slots, 2,
+ &rack_hw_pace_extra_slots, 0,
"If the rates between software and hardware match precisely how many extra time_betweens do we get?");
rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1287,6 +1430,16 @@ rack_init_sysctls(void)
"features",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Feature controls");
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_features),
+ OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW,
+ &rack_rxt_clamp_thresh, 0,
+ "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_features),
+ OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
+ &rack_hybrid_allow_set_maxseg, 0,
+ "Should hybrid pacing allow the setmss command");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_features),
OID_AUTO, "cmpack", CTLFLAG_RW,
@@ -1333,6 +1486,26 @@ rack_init_sysctls(void)
#endif
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "dnd", CTLFLAG_RW,
+ &rack_dnd_default, 0,
+ "Do not disturb default for rack_rrr = 3");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "sad_seg_per", CTLFLAG_RW,
+ &sad_seg_size_per, 800,
+ "Percentage of segment size needed in a sack 800 = 80.0?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rxt_controls", CTLFLAG_RW,
+ &rack_rxt_controls, 0,
+ "Retransmit sending size controls (valid values 0, 1, 2 default=1)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rack_hibeta", CTLFLAG_RW,
+ &rack_hibeta_setting, 0,
+ "Do we ue a high beta (80 instead of 50)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
&rack_apply_rtt_with_reduced_conf, 0,
"When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
@@ -1373,11 +1546,6 @@ rack_init_sysctls(void)
"Should RACK place low end time limits on the shared cwnd feature");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "iMac_dack", CTLFLAG_RW,
- &rack_use_imac_dack, 0,
- "Should RACK try to emulate iMac delayed ack");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "no_prr", CTLFLAG_RW,
&rack_disable_prr, 0,
"Should RACK not use prr and only pace (must have pacing on)");
@@ -1406,9 +1574,40 @@ rack_init_sysctls(void)
OID_AUTO, "autoscale", CTLFLAG_RW,
&rack_autosndbuf_inc, 20,
"What percentage should rack scale up its snd buffer by?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW,
+ &rack_rxt_min_rnds, 10,
+ "Number of rounds needed between RTT clamps due to high loss rates");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW,
+ &rack_unclamp_round_thresh, 100,
+ "Number of rounds needed with no loss to unclamp");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW,
+ &rack_unclamp_rxt_thresh, 5,
+ "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "clamp_ss_upper", CTLFLAG_RW,
+ &rack_clamp_ss_upper, 110,
+ "Clamp percentage ceiling in SS?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "clamp_ca_upper", CTLFLAG_RW,
+ &rack_clamp_ca_upper, 110,
+ "Clamp percentage ceiling in CA?");
/* Sack Attacker detection stuff */
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "merge_out", CTLFLAG_RW,
+ &rack_merge_out_sacks_on_attack, 0,
+ "Do we merge the sendmap when we decide we are being attacked?");
+
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
&rack_highest_sack_thresh_seen, 0,
"Highest sack to ack ratio seen");
@@ -1459,6 +1658,13 @@ rack_init_sysctls(void)
OID_AUTO, "reversed", CTLFLAG_RD,
&rack_sack_attacks_reversed,
"Total number of SACK attackers that were later determined false positive");
+ rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_attack),
+ OID_AUTO, "suspect", CTLFLAG_RD,
+ &rack_sack_attacks_suspect,
+ "Total number of SACKs that triggered early detection");
+
rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
@@ -1472,6 +1678,12 @@ rack_init_sysctls(void)
&rack_sack_used_prev_merge,
"Total number of times we used the prev merge");
/* Counters */
+ rack_total_bytes = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "totalbytes", CTLFLAG_RD,
+ &rack_total_bytes,
+ "Total number of bytes sent");
rack_fto_send = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1599,6 +1811,18 @@ rack_init_sysctls(void)
OID_AUTO, "split_limited", CTLFLAG_RD,
&rack_split_limited,
"Split allocations dropped due to limit");
+ rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD,
+ &rack_rxt_clamps_cwnd,
+ "Number of times that excessive rxt clamped the cwnd down");
+ rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD,
+ &rack_rxt_clamps_cwnd_uniq,
+ "Number of connections that have had excessive rxt clamped the cwnd down");
rack_persists_sends = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1726,49 +1950,6 @@ rack_init_sysctls(void)
&rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}
-static __inline int
-rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
-{
- if (SEQ_GEQ(b->r_start, a->r_start) &&
- SEQ_LT(b->r_start, a->r_end)) {
- /*
- * The entry b is within the
- * block a. i.e.:
- * a -- |-------------|
- * b -- |----|
- * <or>
- * b -- |------|
- * <or>
- * b -- |-----------|
- */
- return (0);
- } else if (SEQ_GEQ(b->r_start, a->r_end)) {
- /*
- * b falls as either the next
- * sequence block after a so a
- * is said to be smaller than b.
- * i.e:
- * a -- |------|
- * b -- |--------|
- * or
- * b -- |-----|
- */
- return (1);
- }
- /*
- * Whats left is where a is
- * larger than b. i.e:
- * a -- |-------|
- * b -- |---|
- * or even possibly
- * b -- |--------------|
- */
- return (-1);
-}
-
-RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
-RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
-
static uint32_t
rc_init_window(struct tcp_rack *rack)
{
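[Note on the removal above: dropping rb_map_cmp() and the RB_PROTOTYPE/RB_GENERATE pair is the visible edge of this commit's main structural change, the sendmap moving from a red-black tree to the hashed tailq introduced in tailq_hash.c/tailq_hash.h. The call sites later in this diff change in a uniform way; the lines below are excerpted from one of them (the goodput lookup further down), not new code.]

    /* Before: RB-tree lookup via a stack key carrying the sequence number. */
    fe.r_start = tp->gput_seq;
    rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);

    /* After: the hashed tailq is searched directly by sequence number. */
    rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);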
@@ -1796,14 +1977,282 @@ rack_get_fixed_pacing_bw(struct tcp_rack *rack)
return (rack->r_ctl.rc_fixed_pacing_rate_ca);
}
-static uint64_t
-rack_get_bw(struct tcp_rack *rack)
+static void
+rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim,
+ uint64_t data, uint8_t mod, uint16_t aux,
+ struct http_sendfile_track *cur)
{
- if (rack->use_fixed_rate) {
- /* Return the fixed pacing rate */
- return (rack_get_fixed_pacing_bw(rack));
+#ifdef TCP_REQUEST_TRK
+ int do_log = 0;
+
+ /*
+ * The rate cap one is noisy and only should come out when normal BB logging
+ * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out
+ * once per chunk and make up the BBpoint that can be turned on by the client.
+ */
+ if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
+ if (rack_verbose_logging != 0)
+ do_log = tcp_bblogging_on(rack->rc_tp);
+ else
+ do_log = 0;
+ } else
+ do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING);
+
+ if (do_log) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+ uint64_t lt_bw;
+
+ /* Convert our ms to a microsecond */
+ memset(&log, 0, sizeof(log));
+
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.rttProp = tim;
+ log.u_bbr.bw_inuse = cbw;
+ log.u_bbr.delRate = rack_get_gp_est(rack);
+ lt_bw = rack_get_lt_bw(rack);
+ log.u_bbr.flex1 = seq;
+ log.u_bbr.pacing_gain = aux;
+ /* lt_bw = < flex3 | flex2 > */
+ log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff);
+ log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff);
+ /* Record the last obtained us rtt in inflight */
+ if (cur == NULL) {
+ /* Make sure we are looking at the right log if an override comes in */

+ cur = rack->r_ctl.rc_last_sft;
+ }
+ if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY)
+ log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt;
+ else {
+ /* Use the last known rtt i.e. the rack-rtt */
+ log.u_bbr.inflight = rack->rc_rack_rtt;
+ }
+ if (cur != NULL) {
+ uint64_t off;
+
+ log.u_bbr.cur_del_rate = cur->deadline;
+ if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
+ /* start = < lost | pkt_epoch > */
+ log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
+ log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex6 = cur->start_seq;
+ log.u_bbr.pkts_out = cur->end_seq;
+ } else {
+ /* start = < lost | pkt_epoch > */
+ log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
+ log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
+ /* end = < pkts_out | flex6 > */
+ log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff);
+ log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
+ }
+ /* first_send = <lt_epoch | epoch> */
+ log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff);
+ log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff);
+ /* localtime = <delivered | applimited>*/
+ log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
+ log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
+ off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]);
+ log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track));
+ log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
+ log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
+ log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
+ } else {
+ log.u_bbr.flex7 = 0xffff;
+ log.u_bbr.cur_del_rate = 0xffffffffffffffff;
+ }
+ /*
+ * Compose bbr_state to be a bit wise 0000ADHF
+ * where A is the always_pace flag
+ * where D is the dgp_on flag
+ * where H is the hybrid_mode on flag
+ * where F is the use_fixed_rate flag.
+ */
+ log.u_bbr.bbr_state = rack->rc_always_pace;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->use_fixed_rate;
+ log.u_bbr.flex8 = mod;
+ tcp_log_event(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_HYBRID_PACING_LOG, 0,
+ 0, &log, false, NULL, __func__, __LINE__, &tv);
+
+ }
+#endif
+}
+
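[Note on reading the log record built above: several 64-bit quantities (lt_bw, cur->start, cur->end, first_send, localtime) are split across pairs of 32-bit BB-log fields, and four mode flags are folded into bbr_state as 0000ADHF. A small self-contained sketch of the split/reassemble arithmetic and the flag packing; the helper names are made up for the example.]

    #include <assert.h>
    #include <stdint.h>

    /* Split a 64-bit value across two 32-bit log fields (e.g. flex2/flex3). */
    static void
    split64(uint64_t v, uint32_t *lo, uint32_t *hi)
    {
        *lo = (uint32_t)(v & 0xffffffffULL);
        *hi = (uint32_t)((v >> 32) & 0xffffffffULL);
    }

    /* Reassemble on the log-reader side. */
    static uint64_t
    join64(uint32_t lo, uint32_t hi)
    {
        return (((uint64_t)hi << 32) | lo);
    }

    /* Pack the four mode bits as 0000ADHF, matching the comment above. */
    static uint8_t
    pack_adhf(uint8_t always_pace, uint8_t dgp_on, uint8_t hybrid, uint8_t fixed)
    {
        uint8_t st = always_pace & 1;

        st = (st << 1) | (dgp_on & 1);
        st = (st << 1) | (hybrid & 1);
        st = (st << 1) | (fixed & 1);
        return (st);
    }

    int
    main(void)
    {
        uint32_t lo, hi;
        uint64_t lt_bw = 0x1234567890abcdefULL;

        split64(lt_bw, &lo, &hi);               /* e.g. into flex2/flex3 */
        assert(join64(lo, hi) == lt_bw);
        assert(pack_adhf(1, 1, 0, 1) == 0x0d);  /* A=1 D=1 H=0 F=1 -> 1101 */
        return (0);
    }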
+static inline uint64_t
+rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw)
+{
+ uint64_t ret_bw, ether;
+ uint64_t u_segsiz;
+
+ ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr);
+ if (rack->r_is_v6){
+#ifdef INET6
+ ether += sizeof(struct ip6_hdr);
+#endif
+ ether += 14; /* eheader size 6+6+2 */
+ } else {
+#ifdef INET
+ ether += sizeof(struct ip);
+#endif
+ ether += 14; /* eheader size 6+6+2 */
+ }
+ u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs);
+ ret_bw = bw;
+ ret_bw *= ether;
+ ret_bw /= u_segsiz;
+ return (ret_bw);
+}
+
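[rack_compensate_for_linerate() above scales a goodput-style rate up by the ratio of the on-the-wire frame size (segment plus TCP, IP or IPv6, and 14 bytes of Ethernet header) to the pacing segment size, since the pacing math works in wire bytes while the goodput estimate does not. A simplified worked example of that ratio; the numbers are picked for illustration (1448-byte segments over IPv4), whereas the real function builds the wire size from t_maxseg and divides by the pacing segment size.]

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t bw = 12500000;         /* 100 Mbit/s of goodput, in bytes/sec */
        uint64_t segsiz = 1448;         /* pacing segment size */
        /* 1448 payload + 20 TCP + 20 IPv4 + 14 Ethernet = 1502 wire bytes */
        uint64_t ether = segsiz + 20 + 20 + 14;

        uint64_t wire_bw = bw * ether / segsiz; /* same order as the function above */

        /* Roughly a 3.7% boost: 12500000 -> 12966160 bytes/sec. */
        printf("%llu -> %llu bytes/sec\n",
            (unsigned long long)bw, (unsigned long long)wire_bw);
        return (0);
    }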
+static void
+rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped)
+{
+#ifdef TCP_REQUEST_TRK
+ struct timeval tv;
+ uint64_t timenow, timeleft, lenleft, lengone, calcbw;
+#endif
+
+ if (rack->r_ctl.bw_rate_cap == 0)
+ return;
+#ifdef TCP_REQUEST_TRK
+ if (rack->rc_catch_up && rack->rc_hybrid_mode &&
+ (rack->r_ctl.rc_last_sft != NULL)) {
+ /*
+ * We have a dynamic cap. The original target
+ * is in bw_rate_cap, but we need to look at
+ * how long it is until we hit the deadline.
+ */
+ struct http_sendfile_track *ent;
+
+ ent = rack->r_ctl.rc_last_sft;
+ microuptime(&tv);
+ timenow = tcp_tv_to_lusectick(&tv);
+ if (timenow >= ent->deadline) {
+ /* No time left we do DGP only */
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent);
+ rack->r_ctl.bw_rate_cap = 0;
+ return;
+ }
+ /* We have the time */
+ timeleft = rack->r_ctl.rc_last_sft->deadline - timenow;
+ if (timeleft < HPTS_MSEC_IN_SEC) {
+ /* If there is less than a ms left just use DGPs rate */
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent);
+ rack->r_ctl.bw_rate_cap = 0;
+ return;
+ }
+ /*
+ * Now lets find the amount of data left to send.
+ *
+ * Now ideally we want to use the end_seq to figure out how much more
+ * but it might not be possible (only if we have the TRACK_FG_COMP on the entry..
+ */
+ if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
+ if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una))
+ lenleft = ent->end_seq - rack->rc_tp->snd_una;
+ else {
+ /* TSNH, we should catch it at the send */
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent);
+ rack->r_ctl.bw_rate_cap = 0;
+ return;
+ }
+ } else {
+ /*
+ * The hard way, figure out how much is gone and then
+ * take that away from the total the client asked for
+ * (thats off by tls overhead if this is tls).
+ */
+ if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq))
+ lengone = rack->rc_tp->snd_una - ent->start_seq;
+ else
+ lengone = 0;
+ if (lengone < (ent->end - ent->start))
+ lenleft = (ent->end - ent->start) - lengone;
+ else {
+ /* TSNH, we should catch it at the send */
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent);
+ rack->r_ctl.bw_rate_cap = 0;
+ return;
+ }
+ }
+ if (lenleft == 0) {
+ /* We have it all sent */
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent);
+ if (rack->r_ctl.bw_rate_cap)
+ goto normal_ratecap;
+ else
+ return;
+ }
+ calcbw = lenleft * HPTS_USEC_IN_SEC;
+ calcbw /= timeleft;
+ /* Now we must compensate for IP/TCP overhead */
+ calcbw = rack_compensate_for_linerate(rack, calcbw);
+ /* Update the bit rate cap */
+ rack->r_ctl.bw_rate_cap = calcbw;
+ if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
+ (rack_hybrid_allow_set_maxseg == 1) &&
+ ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
+ /* Lets set in a smaller mss possibly here to match our rate-cap */
+ uint32_t orig_max;
+
+ orig_max = rack->r_ctl.rc_pace_max_segs;
+ rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
+ rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp));
+ rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
+ }
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent);
+ if ((calcbw > 0) && (*bw > calcbw)) {
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent);
+ *capped = 1;
+ *bw = calcbw;
+ }
+ return;
}
- if (rack->r_ctl.gp_bw == 0) {
+normal_ratecap:
+#endif
+ if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) {
+#ifdef TCP_REQUEST_TRK
+ if (rack->rc_hybrid_mode &&
+ rack->rc_catch_up &&
+ (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
+ (rack_hybrid_allow_set_maxseg == 1) &&
+ ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
+ /* Lets set in a smaller mss possibly here to match our rate-cap */
+ uint32_t orig_max;
+
+ orig_max = rack->r_ctl.rc_pace_max_segs;
+ rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
+ rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp));
+ rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
+ }
+#endif
+ *capped = 1;
+ *bw = rack->r_ctl.bw_rate_cap;
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ *bw, 0, 0,
+ HYBRID_LOG_RATE_CAP, 1, NULL);
+ }
+}
+
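[The dynamic cap computed above reduces to the bytes still owed on the request divided by the time left to its deadline, later run through the line-rate compensation. A minimal sketch of that core calculation, with the clock in microseconds as in the diff; the helper name and parameters are hypothetical.]

    #include <stdint.h>

    #define USECS_PER_SEC   1000000ULL

    /*
     * Return the bytes/sec needed to move 'lenleft' bytes before
     * 'deadline_us', or 0 if the deadline has passed or is under a
     * millisecond away (the function above falls back to plain DGP
     * in those cases).
     */
    static uint64_t
    deadline_rate(uint64_t lenleft, uint64_t now_us, uint64_t deadline_us)
    {
        uint64_t timeleft;

        if (now_us >= deadline_us)
            return (0);
        timeleft = deadline_us - now_us;
        if (timeleft < 1000)            /* less than one millisecond left */
            return (0);
        return ((lenleft * USECS_PER_SEC) / timeleft);
    }

    int
    main(void)
    {
        /* 3 MB left, 2 seconds to the deadline -> 1500000 bytes/sec */
        return (deadline_rate(3000000, 0, 2000000) == 1500000 ? 0 : 1);
    }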
+static uint64_t
+rack_get_gp_est(struct tcp_rack *rack)
+{
+ uint64_t bw, lt_bw, ret_bw;
+
+ if (rack->rc_gp_filled == 0) {
/*
* We have yet no b/w measurement,
* if we have a user set initial bw
@@ -1815,15 +2264,20 @@ rack_get_bw(struct tcp_rack *rack)
* so if we have like IW=30, we are not
* calculating a "huge" b/w.
*/
- uint64_t bw, srtt;
+ uint64_t srtt;
+
+ lt_bw = rack_get_lt_bw(rack);
+ if (lt_bw) {
+ /*
+ * No goodput bw but a long-term b/w does exist
+ * lets use that.
+ */
+ ret_bw = lt_bw;
+ goto compensate;
+ }
if (rack->r_ctl.init_rate)
return (rack->r_ctl.init_rate);
- /* Has the user set a max peak rate? */
-#ifdef NETFLIX_PEAKRATE
- if (rack->rc_tp->t_maxpeakrate)
- return (rack->rc_tp->t_maxpeakrate);
-#endif
/* Ok lets come up with the IW guess, if we have a srtt */
if (rack->rc_tp->t_srtt == 0) {
/*
@@ -1837,32 +2291,71 @@ rack_get_bw(struct tcp_rack *rack)
srtt = (uint64_t)rack->rc_tp->t_srtt;
bw *= (uint64_t)USECS_IN_SECOND;
bw /= srtt;
- if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
- bw = rack->r_ctl.bw_rate_cap;
- return (bw);
+ ret_bw = bw;
+ goto compensate;
+
+ }
+ if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
+ /* Averaging is done, we can return the value */
+ bw = rack->r_ctl.gp_bw;
} else {
- uint64_t bw;
+ /* Still doing initial average must calculate */
+ bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
+ }
+ lt_bw = rack_get_lt_bw(rack);
+ if (lt_bw == 0) {
+ /* If we don't have one then equate it to the gp_bw */
+ lt_bw = rack->r_ctl.gp_bw;
+ }
+ if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){
+ /* if clamped take the lowest */
+ if (lt_bw < bw)
+ ret_bw = lt_bw;
+ else
+ ret_bw = bw;
+ } else {
+ /* If not set for clamped to get lowest, take the highest */
+ if (lt_bw > bw)
+ ret_bw = lt_bw;
+ else
+ ret_bw = bw;
+ }
+ /*
+ * Now lets compensate based on the TCP/IP overhead. Our
+ * Goodput estimate does not include this so we must pace out
+ * a bit faster since our pacing calculations do. The pacing
+ * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz
+ * we are using to do this, so we do that here in the opposite
+ * direction as well. This means that if we are tunneled and the
+ * segsiz is say 1200 bytes we will get quite a boost, but its
+ * compensated for in the pacing time the opposite way.
+ */
+compensate:
+ ret_bw = rack_compensate_for_linerate(rack, ret_bw);
+ return(ret_bw);
+}
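[rack_get_gp_est() above ends by choosing between the goodput average and the long-term estimate: the lower of the two when the cwnd has been clamped and the option asks for conservative behaviour, otherwise the higher, and either way the result is then line-rate compensated. A small sketch of just that selection, with plain parameters standing in for the rack fields.]

    #include <stdint.h>

    static uint64_t
    select_bw(uint64_t gp_bw, uint64_t lt_bw, int clamped_prefers_lower)
    {
        if (lt_bw == 0)
            lt_bw = gp_bw;  /* no long-term sample yet: fall back to goodput */
        if (clamped_prefers_lower)
            return (lt_bw < gp_bw ? lt_bw : gp_bw); /* conservative while clamped */
        return (lt_bw > gp_bw ? lt_bw : gp_bw);     /* otherwise the stronger estimate */
    }

    int
    main(void)
    {
        return (select_bw(800000, 600000, 1) == 600000 &&
            select_bw(800000, 600000, 0) == 800000 ? 0 : 1);
    }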
- if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
- /* Averaging is done, we can return the value */
- bw = rack->r_ctl.gp_bw;
- } else {
- /* Still doing initial average must calculate */
- bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
- }
+
+static uint64_t
+rack_get_bw(struct tcp_rack *rack)
+{
+ uint64_t bw;
+
+ if (rack->use_fixed_rate) {
+ /* Return the fixed pacing rate */
+ return (rack_get_fixed_pacing_bw(rack));
+ }
+ bw = rack_get_gp_est(rack);
#ifdef NETFLIX_PEAKRATE
- if ((rack->rc_tp->t_maxpeakrate) &&
- (bw > rack->rc_tp->t_maxpeakrate)) {
- /* The user has set a peak rate to pace at
- * don't allow us to pace faster than that.
- */
- return (rack->rc_tp->t_maxpeakrate);
- }
-#endif
- if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
- bw = rack->r_ctl.bw_rate_cap;
- return (bw);
+ if ((rack->rc_tp->t_maxpeakrate) &&
+ (bw > rack->rc_tp->t_maxpeakrate)) {
+ /* The user has set a peak rate to pace at
+ * don't allow us to pace faster than that.
+ */
+ return (rack->rc_tp->t_maxpeakrate);
}
+#endif
+ return (bw);
}
static uint16_t
@@ -1996,9 +2489,28 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm,
uint64_t bw_est, high_rate;
uint64_t gain;
- gain = (uint64_t)rack_get_output_gain(rack, rsm);
- bw_est = bw * gain;
- bw_est /= (uint64_t)100;
+ if ((rack->r_pacing_discount == 0) ||
+ (rack_full_buffer_discount == 0)) {
+ /*
+ * No buffer level based discount from client buffer
+ * level is enabled or the feature is disabled.
+ */
+ gain = (uint64_t)rack_get_output_gain(rack, rsm);
+ bw_est = bw * gain;
+ bw_est /= (uint64_t)100;
+ } else {
+ /*
+ * We have a discount in place apply it with
+ * just a 100% gain (we get no boost if the buffer
+ * is full).
+ */
+ uint64_t discount;
+
+ discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm);
+ discount /= 100;
+ /* What %% of the b/w do we discount */
+ bw_est = bw - discount;
+ }
/* Never fall below the minimum (def 64kbps) */
if (bw_est < RACK_MIN_BW)
bw_est = RACK_MIN_BW;
@@ -2009,6 +2521,11 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm,
high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
if (bw_est >= high_rate) {
/* We are capping bw at the highest rate table entry */
+ if (rack_hw_rate_cap_per &&
+ (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) {
+ rack->r_rack_hw_rate_caps = 0;
+ goto done;
+ }
rack_log_hdwr_pacing(rack,
bw_est, high_rate, __LINE__,
0, 3);
@@ -2039,6 +2556,7 @@ rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm,
}
}
}
+done:
return (bw_est);
}
@@ -2049,7 +2567,9 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
union tcp_log_stackspecific log;
struct timeval tv;
- if ((mod != 1) && (rack_verbose_logging == 0)) {
+ if (rack->sack_attack_disable > 0)
+ goto log_anyway;
+ if ((mod != 1) && (rack_verbose_logging == 0)) {
/*
* We get 3 values currently for mod
* 1 - We are retransmitting and this tells the reason.
@@ -2061,6 +2581,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
*/
return;
}
+log_anyway:
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = tsused;
log.u_bbr.flex2 = thresh;
@@ -2109,9 +2630,11 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
- log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
+ log.u_bbr.cwnd_gain = rack->rack_deferred_inited;
+ log.u_bbr.pkt_epoch = rack->rc_has_collapsed;
log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
log.u_bbr.lost = rack_rto_min;
+ log.u_bbr.epoch = rack->r_ctl.roundends;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2350,6 +2873,29 @@ rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time
}
+static void
+rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ /* Convert our ms to a microsecond */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = idx;
+ log.u_bbr.flex2 = rack_ts_to_msec(tsv);
+ log.u_bbr.flex3 = tsecho;
+ log.u_bbr.flex7 = 3;
+ log.u_bbr.rttProp = tsv;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_LOG_RTT, 0,
+ 0, &log, false, &tv);
+ }
+}
+
static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
@@ -2379,7 +2925,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
}
static void
-rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
+rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line)
{
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
@@ -2391,6 +2937,9 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
log.u_bbr.flex2 = 0;
else
log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.flex5 = rack->r_ctl.ack_during_sd;
+ log.u_bbr.flex6 = line;
log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
log.u_bbr.flex8 = rack->rc_in_persist;
log.u_bbr.timeStamp = cts;
@@ -2459,6 +3008,7 @@ rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = arg1;
log.u_bbr.flex5 = arg2;
+ log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs;
log.u_bbr.flex6 = arg3;
log.u_bbr.flex8 = frm;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -2658,6 +3208,7 @@ rack_log_sad(struct tcp_rack *rack, int event)
log.u_bbr.applimited = tcp_map_minimum;
log.u_bbr.flex7 = rack->sack_attack_disable;
log.u_bbr.flex8 = event;
+ log.u_bbr.bbr_state = rack->rc_suspicious;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.delivered = tcp_sad_decay_val;
@@ -2673,6 +3224,7 @@ rack_log_sad(struct tcp_rack *rack, int event)
static void
rack_counter_destroy(void)
{
+ counter_u64_free(rack_total_bytes);
counter_u64_free(rack_fto_send);
counter_u64_free(rack_fto_rsm_send);
counter_u64_free(rack_nfto_resend);
@@ -2687,6 +3239,7 @@ rack_counter_destroy(void)
counter_u64_free(rack_move_some);
counter_u64_free(rack_sack_attacks_detected);
counter_u64_free(rack_sack_attacks_reversed);
+ counter_u64_free(rack_sack_attacks_suspect);
counter_u64_free(rack_sack_used_next_merge);
counter_u64_free(rack_sack_used_prev_merge);
counter_u64_free(rack_tlp_tot);
@@ -2705,6 +3258,8 @@ rack_counter_destroy(void)
counter_u64_free(rack_alloc_limited_conns);
counter_u64_free(rack_split_limited);
counter_u64_free(rack_multi_single_eq);
+ counter_u64_free(rack_rxt_clamps_cwnd);
+ counter_u64_free(rack_rxt_clamps_cwnd_uniq);
counter_u64_free(rack_proc_non_comp_ack);
counter_u64_free(rack_sack_proc_all);
counter_u64_free(rack_sack_proc_restart);
@@ -2794,15 +3349,26 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
if (limit_type) {
/* currently there is only one limit type */
- if (V_tcp_map_split_limit > 0 &&
+ if (rack->r_ctl.rc_split_limit > 0 &&
(rack->do_detection == 0) &&
- rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
+ rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) {
+ counter_u64_add(rack_split_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ return (NULL);
+#ifdef NETFLIX_EXP_DETECTION
+ } else if ((tcp_sad_limit != 0) &&
+ (rack->do_detection == 1) &&
+ (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) {
counter_u64_add(rack_split_limited, 1);
if (!rack->alloc_limit_reported) {
rack->alloc_limit_reported = 1;
counter_u64_add(rack_alloc_limited_conns, 1);
}
return (NULL);
+#endif
}
}
@@ -2816,6 +3382,24 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
}
static void
+rack_free_trim(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ /*
+ * Free up all the tail entries until
+ * we get our list down to the limit.
+ */
+ while (rack->rc_free_cnt > rack_free_cache) {
+ rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
+ rack->rc_free_cnt--;
+ rack->r_ctl.rc_num_maps_alloced--;
+ uma_zfree(rack_zone, rsm);
+ }
+}
+
+static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
if (rsm->r_flags & RACK_APP_LIMITED) {
@@ -2830,13 +3414,8 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
if (rsm == rack->r_ctl.rc_first_appl) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_first_appl = NULL;
- else {
- /* Follow the next one out */
- struct rack_sendmap fe;
-
- fe.r_start = rsm->r_nseq_appl;
- rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
- }
+ else
+ rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl);
}
if (rsm == rack->r_ctl.rc_resend)
rack->r_ctl.rc_resend = NULL;
@@ -2847,28 +3426,14 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
if (rack->r_ctl.rc_sacklast == rsm)
rack->r_ctl.rc_sacklast = NULL;
memset(rsm, 0, sizeof(struct rack_sendmap));
+ /* Make sure we are not going to overrun our count limit of 0xff */
+ if ((rack->rc_free_cnt + 1) > 0xff) {
+ rack_free_trim(rack);
+ }
TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
rack->rc_free_cnt++;
}
-static void
-rack_free_trim(struct tcp_rack *rack)
-{
- struct rack_sendmap *rsm;
-
- /*
- * Free up all the tail entries until
- * we get our list down to the limit.
- */
- while (rack->rc_free_cnt > rack_free_cache) {
- rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
- TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
- rack->rc_free_cnt--;
- uma_zfree(rack_zone, rsm);
- }
-}
-
-
static uint32_t
rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
{
@@ -2956,15 +3521,34 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_
/*
* Has enough time passed for the GP measurement to be valid?
*/
+ if (SEQ_LT(th_ack, tp->gput_seq)) {
+ /* Not enough bytes yet */
+ return (0);
+ }
if ((tp->snd_max == tp->snd_una) ||
(th_ack == tp->snd_max)){
- /* All is acked */
+ /*
+ * All is acked quality of all acked is
+ * usually low or medium, but we in theory could split
+ * all acked into two cases, where you got
+ * a significant amount of your window and
+ * where you did not. For now we leave it
+ * but it is something to contemplate in the
+ * future. The danger here is that delayed ack
+ * is effecting the last byte (which is a 50:50 chance).
+ */
*quality = RACK_QUALITY_ALLACKED;
return (1);
}
- if (SEQ_LT(th_ack, tp->gput_seq)) {
- /* Not enough bytes yet */
- return (0);
+ if (SEQ_GEQ(th_ack, tp->gput_ack)) {
+ /*
+ * We obtained our entire window of data we wanted
+ * no matter if we are in recovery or not then
+ * its ok since expanding the window does not
+ * make things fuzzy (or at least not as much).
+ */
+ *quality = RACK_QUALITY_HIGH;
+ return (1);
}
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
if (SEQ_LT(th_ack, tp->gput_ack) &&
@@ -2984,7 +3568,13 @@ rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_
/* Now what about time? */
srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
- if (tim >= srtts) {
+ if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
+ /*
+ * We do not allow a measurement if we are in recovery
+ * that would shrink the goodput window we wanted.
+ * This is to prevent cloudiness of when the last send
+ * was actually made.
+ */
*quality = RACK_QUALITY_HIGH;
return (1);
}
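[A condensed view of the measurement-validity ladder in the two hunks above: ack must have reached the goodput window, then either everything is acked, the full window has been covered, or enough srtt-multiples have elapsed while not in recovery. It omits the "within a few segments of gput_ack" branch and the quality codes, and uses plain comparisons where the kernel uses the wrap-safe SEQ_* macros; illustrative only.]

    #include <stdint.h>

    static int
    gp_measurement_ready(uint32_t th_ack, uint32_t gput_seq, uint32_t gput_ack,
        uint32_t snd_max, uint64_t elapsed_us, uint64_t gp_srtt_us,
        uint32_t min_srtts, int in_recovery)
    {
        if (th_ack < gput_seq)
            return (0);     /* nothing of the window acked yet */
        if (th_ack == snd_max)
            return (1);     /* everything acked (quality: all-acked) */
        if (th_ack >= gput_ack)
            return (1);     /* full window covered (quality: high) */
        if (!in_recovery && elapsed_us >= gp_srtt_us * min_srtts)
            return (1);     /* enough time has passed (quality: high) */
        return (0);
    }

    int
    main(void)
    {
        /* full window acked: measurement is ready */
        return (gp_measurement_ready(5000, 1000, 5000, 9000, 0, 30000, 3, 0) ? 0 : 1);
    }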
@@ -3188,10 +3778,10 @@ extra_boost:
calc = 0xffff;
logged |= 1;
rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
- if (rack_per_upper_bound_ss &&
+ if (rack->r_ctl.rack_per_upper_bound_ca &&
(rack->rc_dragged_bottom == 0) &&
- (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
- rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
+ (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca))
+ rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca;
}
if (rack->rc_gp_saw_ca &&
(rack->rc_gp_saw_ss == 0) &&
@@ -3203,10 +3793,10 @@ extra_boost:
calc = 0xffff;
logged |= 2;
rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
- if (rack_per_upper_bound_ca &&
+ if (rack->r_ctl.rack_per_upper_bound_ca &&
(rack->rc_dragged_bottom == 0) &&
- (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
- rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
+ (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca))
+ rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca;
}
if (rack->rc_gp_saw_ss &&
rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
@@ -3216,10 +3806,10 @@ extra_boost:
if (calc > 0xffff)
calc = 0xffff;
rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
- if (rack_per_upper_bound_ss &&
+ if (rack->r_ctl.rack_per_upper_bound_ss &&
(rack->rc_dragged_bottom == 0) &&
- (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
- rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
+ (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss))
+ rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss;
logged |= 4;
}
if (logged &&
@@ -3569,7 +4159,7 @@ rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
* We need to mark these as app-limited so we
* don't collapse the b/w.
*/
- rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
@@ -3836,7 +4426,7 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last
if (rack->r_ctl.rc_no_push_at_mrtt > 1)
rack_validate_multipliers_at_or_below_100(rack);
rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
- } else if ((last_bw_est < low_bnd) && !losses) {
+ } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) {
/*
* We are decreasing this is a bit complicated this
* means we are loosing ground. This could be
@@ -3858,8 +4448,7 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last
rack->rc_gp_bwred = 1;
rack->rc_gp_timely_dec_cnt = 0;
}
- if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
- (timely_says == 0)) {
+ if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) {
/*
* Push another time with a faster pacing
* to try to gain back (we include override to
@@ -3905,8 +4494,8 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last
__LINE__, 3);
rack->r_ctl.last_gp_comp_bw = cur_bw;
if (rack->rc_gp_saw_ss &&
- rack_per_upper_bound_ss &&
- (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
+ rack->r_ctl.rack_per_upper_bound_ss &&
+ (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) {
/*
* In cases where we can't go higher
* we should just use timely.
@@ -3914,8 +4503,8 @@ rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last
goto use_timely;
}
if (rack->rc_gp_saw_ca &&
- rack_per_upper_bound_ca &&
- (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
+ rack->r_ctl.rack_per_upper_bound_ca &&
+ (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) {
/*
* In cases where we can't go higher
* we should just use timely.
@@ -4027,11 +4616,134 @@ rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff
return (timely_says);
}
+static __inline int
+rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)
+{
+ if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
+ SEQ_LEQ(rsm->r_end, tp->gput_ack)) {
+ /**
+ * This covers the case that the
+ * resent is completely inside
+ * the gp range or up to it.
+ * |----------------|
+ * |-----| <or>
+ * |----|
+ * <or> |---|
+ */
+ return (1);
+ } else if (SEQ_LT(rsm->r_start, tp->gput_seq) &&
+ SEQ_GT(rsm->r_end, tp->gput_seq)){
+ /**
+ * This covers the case of
+ * |--------------|
+ * |-------->|
+ */
+ return (1);
+ } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
+ SEQ_LT(rsm->r_start, tp->gput_ack) &&
+ SEQ_GEQ(rsm->r_end, tp->gput_ack)) {
+
+ /**
+ * This covers the case of
+ * |--------------|
+ * |-------->|
+ */
+ return (1);
+ }
+ return (0);
+}
+
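[The three diagrammed branches above amount to an interval-overlap test between the send [r_start, r_end) and the goodput window [gput_seq, gput_ack], ignoring degenerate empty ranges. A tiny self-contained check with made-up sequence numbers covering each case; plain comparisons here, SEQ_* macros in the real code.]

    #include <assert.h>
    #include <stdint.h>

    static int
    touches_gp_window(uint32_t start, uint32_t end, uint32_t wstart, uint32_t wend)
    {
        return (start < wend && end > wstart);
    }

    int
    main(void)
    {
        /* window 1000..5000, example sends matching the diagrams above */
        assert(touches_gp_window(2000, 3000, 1000, 5000));  /* fully inside */
        assert(touches_gp_window(500, 1500, 1000, 5000));   /* straddles the start */
        assert(touches_gp_window(4000, 6000, 1000, 5000));  /* straddles the end */
        assert(!touches_gp_window(6000, 7000, 1000, 5000)); /* beyond the window */
        return (0);
    }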
+static __inline void
+rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)
+{
+
+ if ((tp->t_flags & TF_GPUTINPROG) == 0)
+ return;
+ /*
+ * We have a Goodput measurement in progress. Mark
+ * the send if its within the window. If its not
+ * in the window make sure it does not have the mark.
+ */
+ if (rack_in_gp_window(tp, rsm))
+ rsm->r_flags |= RACK_IN_GP_WIN;
+ else
+ rsm->r_flags &= ~RACK_IN_GP_WIN;
+}
+
+static __inline void
+rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /* A GP measurement is ending, clear all marks on the send map*/
+ struct rack_sendmap *rsm = NULL;
+
+ rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
+ if (rsm == NULL) {
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ }
+ /* Nothing left? */
+ while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){
+ rsm->r_flags &= ~RACK_IN_GP_WIN;
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
+ }
+}
+
+
+static __inline void
+rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm = NULL;
+
+ if (tp->snd_una == tp->snd_max) {
+ /* Nothing outstanding yet, nothing to do here */
+ return;
+ }
+ if (SEQ_GT(tp->gput_seq, tp->snd_una)) {
+ /*
+ * We are measuring ahead of some outstanding
+ * data. We need to walk through up until we get
+ * to gp_seq marking so that no rsm is set incorrectly
+ * with RACK_IN_GP_WIN.
+ */
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ while (rsm != NULL) {
+ rack_mark_in_gp_win(tp, rsm);
+ if (SEQ_GEQ(rsm->r_end, tp->gput_seq))
+ break;
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
+ }
+ }
+ if (rsm == NULL) {
+ /*
+ * Need to find the GP seq, if rsm is
+ * set we stopped as we hit it.
+ */
+ rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
+ if (rsm == NULL)
+ return;
+ rack_mark_in_gp_win(tp, rsm);
+ }
+ /*
+ * Now we may need to mark already sent rsm, ahead of
+ * gput_seq in the window since they may have been sent
+ * *before* we started our measurement. The rsm, if non-null
+ * has been marked (note if rsm would have been NULL we would have
+ * returned in the previous block). So we go to the next, and continue
+ * until we run out of entries or we exceed the gp_ack value.
+ */
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
+ while (rsm) {
+ rack_mark_in_gp_win(tp, rsm);
+ if (SEQ_GT(rsm->r_end, tp->gput_ack))
+ break;
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
+ }
+}
+
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq th_ack, int line, uint8_t quality)
{
- uint64_t tim, bytes_ps, ltim, stim, utim;
+ uint64_t tim, bytes_ps, stim, utim;
uint32_t segsiz, bytes, reqbytes, us_cts;
int32_t gput, new_rtt_diff, timely_says;
uint64_t resid_bw, subpart = 0, addpart = 0, srtt;
@@ -4058,10 +4770,8 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
utim = max(tim, 1);
else
utim = max(stim, 1);
- /* Lets get a msec time ltim too for the old stuff */
- ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
- gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
+ rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL);
if ((tim == 0) && (stim == 0)) {
/*
* Invalid measurement time, maybe
@@ -4171,10 +4881,10 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
}
}
timely_says = rack_make_timely_judgement(rack,
- rack->r_ctl.rc_gp_srtt,
- rack->r_ctl.rc_rtt_diff,
- rack->r_ctl.rc_prev_gp_srtt
- );
+ rack->r_ctl.rc_gp_srtt,
+ rack->r_ctl.rc_rtt_diff,
+ rack->r_ctl.rc_prev_gp_srtt
+ );
bytes_ps *= HPTS_USEC_IN_SEC;
bytes_ps /= utim;
if (bytes_ps > rack->r_ctl.last_max_bw) {
@@ -4229,6 +4939,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
/* We have collected enough to move forward */
rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
}
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
did_add = 3;
} else {
/*
@@ -4334,12 +5045,15 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.gp_bw = resid_bw + addpart;
}
}
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
}
if ((rack->gp_ready == 0) &&
(rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
/* We have enough measurements now */
rack->gp_ready = 1;
- rack_set_cc_pacing(rack);
+ if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) ||
+ rack->rack_hibeta)
+ rack_set_cc_pacing(rack);
if (rack->defer_options)
rack_apply_deferred_options(rack);
}
@@ -4352,10 +5066,18 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.rc_rtt_diff);
rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
rack_get_bw(rack), 3, line, NULL, quality);
+ rack_log_pacing_delay_calc(rack,
+ bytes, /* flex2 */
+ tim, /* flex1 */
+ bytes_ps, /* bw_inuse */
+ rack->r_ctl.gp_bw, /* delRate */
+ rack_get_lt_bw(rack), /* rttProp */
+ 20, line, NULL, 0);
/* reset the gp srtt and setup the new prev */
rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
/* Record the lost count for the next measurement */
rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
+skip_measurement:
/*
* We restart our diffs based on the gpsrtt in the
* measurement window.
@@ -4365,24 +5087,30 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
rack->rc_gp_saw_ca = 0;
rack->rc_gp_saw_ss = 0;
rack->rc_dragged_bottom = 0;
-skip_measurement:
+ if (quality == RACK_QUALITY_HIGH) {
+ /*
+ * Gput in the stats world is in kbps where bytes_ps is
+ * bytes per second so we do ((x * 8)/ 1000).
+ */
+ gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000);
#ifdef STATS
- stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
- gput);
- /*
- * XXXLAS: This is a temporary hack, and should be
- * chained off VOI_TCP_GPUT when stats(9) grows an
- * API to deal with chained VOIs.
- */
- if (tp->t_stats_gput_prev > 0)
- stats_voi_update_abs_s32(tp->t_stats,
- VOI_TCP_GPUT_ND,
- ((gput - tp->t_stats_gput_prev) * 100) /
- tp->t_stats_gput_prev);
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+ gput);
+ /*
+ * XXXLAS: This is a temporary hack, and should be
+ * chained off VOI_TCP_GPUT when stats(9) grows an
+ * API to deal with chained VOIs.
+ */
+ if (tp->t_stats_gput_prev > 0)
+ stats_voi_update_abs_s32(tp->t_stats,
+ VOI_TCP_GPUT_ND,
+ ((gput - tp->t_stats_gput_prev) * 100) /
+ tp->t_stats_gput_prev);
#endif
+ tp->t_stats_gput_prev = gput;
+ }
tp->t_flags &= ~TF_GPUTINPROG;
- tp->t_stats_gput_prev = gput;
/*
* Now are we app limited now and there is space from where we
* were to where we want to go?
@@ -4399,7 +5127,7 @@ skip_measurement:
/*
* Yep there is enough outstanding to make a measurement here.
*/
- struct rack_sendmap *rsm, fe;
+ struct rack_sendmap *rsm;
rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
@@ -4443,8 +5171,8 @@ skip_measurement:
* Now we need to find the timestamp of the send at tp->gput_seq
* for the send based measurement.
*/
- fe.r_start = tp->gput_seq;
- rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ rack->r_ctl.rc_gp_cumack_ts = 0;
+ rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
if (rsm) {
/* Ok send-based limit is set */
if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
@@ -4456,11 +5184,21 @@ skip_measurement:
*/
tp->gput_seq = rsm->r_start;
}
- if (rsm->r_flags & RACK_ACKED)
+ if (rsm->r_flags & RACK_ACKED) {
+ struct rack_sendmap *nrsm;
+
tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
- else
+ tp->gput_seq = rsm->r_end;
+ nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
+ if (nrsm)
+ rsm = nrsm;
+ else {
+ rack->app_limited_needs_set = 1;
+ }
+ } else
rack->app_limited_needs_set = 1;
- rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ /* We always go from the first send */
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
} else {
/*
* If we don't find the rsm due to some
@@ -4472,14 +5210,22 @@ skip_measurement:
microuptime(&tv);
rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
}
+ rack_tend_gp_marks(tp, rack);
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
tp->gput_ack,
(uint64_t)rsm,
tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
9,
- __LINE__, NULL, quality);
+ __LINE__, rsm, quality);
+ rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
+ } else {
+ /*
+ * To make sure proper timestamp merging occurs, we need to clear
+ * all GP marks if we don't start a measurement.
+ */
+ rack_clear_gp_marks(tp, rack);
}
}
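For reference, the rate arithmetic above boils down to bytes_ps = bytes * HPTS_USEC_IN_SEC / utim (bytes per second over the measured microseconds) and, on the stats path, gput = (bytes_ps * 8) / 1000 in kbps. A small worked example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bytes = 1500000;		/* bytes covered by the measurement */
	uint64_t utim = 100000;			/* elapsed microseconds */
	uint64_t usec_per_sec = 1000000;	/* HPTS_USEC_IN_SEC */

	uint64_t bytes_ps = bytes * usec_per_sec / utim;	/* 15,000,000 B/s */
	uint64_t gput_kbps = (bytes_ps << 3) / 1000;		/* 120,000 kbps */

	printf("bytes_ps=%ju B/s gput=%ju kbps\n",
	    (uintmax_t)bytes_ps, (uintmax_t)gput_kbps);
	return (0);
}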
@@ -4509,6 +5255,20 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint
stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
#endif
+ if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
+		/* This ack covers everything
+		 * outstanding, so end any running
+		 * lt_bw measurement until something
+		 * new is sent.
+ */
+ struct timeval tv;
+
+ rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
+ rack->r_ctl.lt_seq = tp->snd_max;
+ (void)tcp_get_usecs(&tv);
+ rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
+ rack->lt_bw_up = 0;
+ }
quality = RACK_QUALITY_NONE;
if ((tp->t_flags & TF_GPUTINPROG) &&
rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
@@ -4546,7 +5306,8 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint
}
prior_cwnd = tp->snd_cwnd;
if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
- (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
+ (rack_client_low_buf && rack->client_bufferlvl &&
+ (rack->client_bufferlvl < rack_client_low_buf)))
labc_to_use = rack->rc_labc;
else
labc_to_use = rack_max_abc_post_recovery;
@@ -4565,7 +5326,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint
log.u_bbr.flex7 = V_tcp_do_newsack;
log.u_bbr.flex8 = 1;
lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
}
if (CC_ALGO(tp)->ack_received != NULL) {
/* XXXLAS: Find a way to live without this */
@@ -4634,6 +5395,348 @@ tcp_rack_partialack(struct tcpcb *tp)
rack->r_wanted_output = 1;
}
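The lt_bw bookkeeping added in rack_ack_received() above (and suspended/resumed in rack_enter_persist()/rack_exit_persist() later in this diff) is a running accumulator: bytes delivered and active time are summed over the intervals where lt_bw_up is set. A hedged sketch of that accounting pattern; the struct and the final-estimate helper below are illustrative, since rack_get_lt_bw()'s body is not part of this hunk:

#include <stdint.h>

struct lt_bw_acct {
	uint64_t bytes;		/* cf. lt_bw_bytes: bytes credited so far */
	uint64_t time_us;	/* cf. lt_bw_time: active microseconds so far */
	uint64_t timemark;	/* cf. lt_timemark: start of the open interval */
	uint32_t seq;		/* cf. lt_seq: highest sequence already credited */
	int up;			/* cf. lt_bw_up: an interval is currently open */
};

/* Close the open interval, e.g. when everything outstanding is acked. */
void
lt_bw_suspend(struct lt_bw_acct *a, uint32_t snd_max, uint64_t now_us)
{
	a->bytes += snd_max - a->seq;
	a->seq = snd_max;
	a->time_us += now_us - a->timemark;
	a->up = 0;
}

/* Presumed shape of the long-term estimate: accumulated bytes per second. */
uint64_t
lt_bw_estimate(const struct lt_bw_acct *a)
{
	return (a->time_us ? (a->bytes * 1000000) / a->time_us : 0);
}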
+static inline void
+rack_set_most_aggr(struct tcp_rack *rack)
+{
+ rack->r_fill_less_agg = 0;
+	/* Once the cwnd has been clamped we don't do fill_cw */
+ if (rack->r_cwnd_was_clamped == 0)
+ rack->rc_pace_to_cwnd = 1;
+ rack->r_pacing_discount = 0;
+}
+
+static inline void
+rack_limit_fillcw(struct tcp_rack *rack)
+{
+ rack->r_fill_less_agg = 1;
+	/* Once the cwnd has been clamped we don't do fill_cw */
+ if (rack->r_cwnd_was_clamped == 0)
+ rack->rc_pace_to_cwnd = 1;
+ rack->r_pacing_discount = 0;
+}
+
+static inline void
+rack_disable_fillcw(struct tcp_rack *rack)
+{
+ rack->r_fill_less_agg = 1;
+ rack->rc_pace_to_cwnd = 0;
+ rack->r_pacing_discount = 0;
+}
+
+static void
+rack_client_buffer_level_set(struct tcp_rack *rack)
+{
+ /*
+ * Only if DGP is on do we do anything that
+ * changes stack behavior. If DGP is off all
+ * we will do is issue a BB log (if BB logging is
+ * on) and return.
+ */
+ if (rack->dgp_on == 0) {
+ rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl,
+ 0, 0, 0, 30, __LINE__, NULL, 0);
+ return;
+ }
+ if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) {
+ goto set_most_agg;
+ }
+ /*
+ * We are in DGP so what setting should we
+ * apply based on where the client is?
+ */
+ switch(rack->r_ctl.rc_dgp_bl_agg) {
+ default:
+ case DGP_LEVEL0:
+set_most_agg:
+ rack_set_most_aggr(rack);
+ break;
+ case DGP_LEVEL1:
+ if (rack->client_bufferlvl == 4)
+ rack_limit_fillcw(rack);
+ else if (rack->client_bufferlvl == 5)
+ rack_disable_fillcw(rack);
+ else
+ rack_set_most_aggr(rack);
+ break;
+ case DGP_LEVEL2:
+ if (rack->client_bufferlvl == 3)
+ rack_limit_fillcw(rack);
+ else if (rack->client_bufferlvl == 4)
+ rack_disable_fillcw(rack);
+ else if (rack->client_bufferlvl == 5) {
+ rack_disable_fillcw(rack);
+ rack->r_pacing_discount = 1;
+ rack->r_ctl.pacing_discount_amm = 1;
+ } else
+ rack_set_most_aggr(rack);
+ break;
+ case DGP_LEVEL3:
+ if (rack->client_bufferlvl == 2)
+ rack_limit_fillcw(rack);
+ else if (rack->client_bufferlvl == 3)
+ rack_disable_fillcw(rack);
+ else if (rack->client_bufferlvl == 4) {
+ rack_disable_fillcw(rack);
+ rack->r_pacing_discount = 1;
+ rack->r_ctl.pacing_discount_amm = 1;
+ } else if (rack->client_bufferlvl == 5) {
+ rack_disable_fillcw(rack);
+ rack->r_pacing_discount = 1;
+ rack->r_ctl.pacing_discount_amm = 2;
+ } else
+ rack_set_most_aggr(rack);
+ break;
+ }
+ rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0,
+ 0, 0, 30, __LINE__, NULL, 0);
+}
+
+static void
+do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /*
+	 * Can we unclamp? We unclamp if more than
+ * N rounds have transpired with no loss.
+ */
+ uint64_t snds, rxts, rxt_per;
+ uint32_t rnds;
+
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
+ if ((rack_unclamp_round_thresh > 0) &&
+ (rnds >= rack_unclamp_round_thresh)) {
+ snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
+		KASSERT((snds > 0), ("rack:%p tp:%p snds:%lu is 0", rack, tp, snds));
+ rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
+ rxt_per = rxts * 1000;
+ rxt_per /= snds;
+ if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) {
+ /* Unclamp */
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex3 = rnds;
+ log.u_bbr.flex4 = rack_unclamp_round_thresh;
+ log.u_bbr.flex5 = (uint32_t)rxt_per;
+ log.u_bbr.flex8 = 6;
+ log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
+ log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
+ log.u_bbr.applimited = rack->r_ctl.max_clamps;
+ log.u_bbr.epoch = rack->r_ctl.clamp_options;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.bw_inuse = rack_get_lt_bw(rack);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
+ log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
+ tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ rack->r_ctl.num_of_clamps_applied = 0;
+ rack->r_cwnd_was_clamped = 0;
+ rack->excess_rxt_on = 1;
+ if (rack->r_ctl.clamp_options) {
+ /*
+ * We only allow fillcw to be toggled
+ * if you are setting a max seg too.
+ */
+ if (rack->r_ctl.clamp_options & 0x1) {
+ if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
+ /* turn on fill cw for non-dgp*/
+ rack->rc_pace_to_cwnd = 0;
+ } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
+ /* For DGP we want it off */
+ rack->rc_pace_to_cwnd = 1;
+ }
+ }
+ }
+ if (rack->dgp_on) {
+ /* Reset all multipliers to 100.0 so just the measured bw */
+ /* Crash any per boosts down to 100% */
+ rack->r_ctl.rack_per_of_gp_rec = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 100;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* Set in an upper bound for ss/ca % increase */
+ rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
+ rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
+ }
+ }
+ }
+}
+
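The unclamp check in do_rack_check_for_unclamp() above works in tenths of a percent: rxt_per = (retransmitted bytes * 1000) / sent bytes, compared against rack_unclamp_rxt_thresh (5, i.e. 0.5%). A worked example with illustrative byte counts:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t snds = 1000000;	/* bytes sent since the last clamp point */
	uint64_t rxts = 4000;		/* bytes retransmitted in that span */
	uint32_t thresh = 5;		/* rack_unclamp_rxt_thresh: 0.5% */

	uint64_t rxt_per = rxts * 1000 / snds;	/* 4 -> 0.4% */

	printf("rxt_per=%ju (tenths of a percent): %s\n", (uintmax_t)rxt_per,
	    (rxt_per <= thresh) ? "unclamp" : "stay clamped");
	return (0);
}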
+static void
+do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /*
+ * Rack excess rxt accounting is turned on. If we
+ * are above a threshold of rxt's in at least N
+ * rounds, then back off the cwnd and ssthresh
+ * to fit into the long-term b/w.
+ */
+ uint64_t snds, rxts, rxt_per, lt_bw, bdp;
+ uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0;
+
+ /* Is it shut off by 0 rounds? */
+ if (rack_rxt_min_rnds == 0)
+ return;
+ if ((rack->r_ctl.max_clamps > 0) &&
+ (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) {
+ /*
+	 * N times did not work, then there is no sense
+ * N times did not work again, then there is no sense
+ * clamping it again. The link is just a lossy link and
+ * our clamps are doing no good. Turn it off so we don't come
+ * back here again.
+ */
+ rack->excess_rxt_on = 0;
+ rack->r_cwnd_was_clamped = 0;
+ rack->r_ctl.num_of_clamps_applied = 0;
+ return;
+ }
+ snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
+ rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
+ /* Has enough rounds progressed for us to re-measure? */
+ if ((rnds >= rack_rxt_min_rnds) &&
+ (rack->r_ctl.rxt_threshold > 0)){
+ rxt_per = rxts * 1000;
+ rxt_per /= snds;
+ if (rxt_per >= rack->r_ctl.rxt_threshold) {
+ /*
+ * Action required:
+ * We are above our excess retransmit level, lets
+ * cut down the cwnd and ssthresh to match the long-term
+ * b/w we are getting.
+ */
+ /* First disable scwnd if enabled */
+#ifdef NETFLIX_SHARED_CWND
+ rack->rack_enable_scwnd = 0;
+ if (rack->r_ctl.rc_scw) {
+ uint32_t limit;
+
+ shared_cwnd_was_enabled = 1;
+ if (rack->r_limit_scw)
+ limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
+ else
+ limit = 0;
+ tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
+ rack->r_ctl.rc_scw_index,
+ limit);
+ rack->r_ctl.rc_scw = NULL;
+ }
+
+#endif
+ /* Calculate what the cwnd and ssthresh should be */
+ tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT);
+ lt_bw = rack_get_lt_bw(rack);
+ if (lt_bw == 0) {
+ /*
+				 * No lt_bw, let's chop things to one MSS
+ * and the ssthresh to the iwnd.
+ */
+reset_to_iw:
+ new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp));
+ } else {
+ rtt = rack->rc_rack_rtt;
+ if (rtt == 0) {
+ /* If we have no rack_rtt drop to the IW situation */
+ goto reset_to_iw;
+ }
+ bdp = lt_bw * (uint64_t)rtt;
+ bdp /= HPTS_USEC_IN_SEC;
+ new_cwnd = (uint32_t)bdp;
+ new_ssthresh = new_cwnd - 1;
+ if (new_cwnd < ctf_fixed_maxseg(tp)) {
+ /* Rock bottom, goto IW settings */
+ goto reset_to_iw;
+ }
+ }
+ rack->r_cwnd_was_clamped = 1;
+ rack->r_ctl.num_of_clamps_applied++;
+			/* Reset the counter from now */
+ tp->t_bytes_acked = 0;
+ /*
+ * Now what about options?
+ * We look at the bottom 8 bits:
+ * F = fill cw bit (toggle it if set)
+ * S = Segment bits
+ * M = set max segment bit
+ *
+ * SSSS SSMF
+ */
+ if (rack->r_ctl.clamp_options) {
+ if (rack->r_ctl.clamp_options & 0x1) {
+ if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
+ /* turn on fill cw for non-dgp*/
+ rack->rc_pace_to_cwnd = 1;
+ } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
+ /* For DGP we want it off */
+ rack->rc_pace_to_cwnd = 0;
+ }
+ }
+ }
+ if (rack->dgp_on) {
+ /* Reset all multipliers to 100.0 so just the measured bw */
+ /* Crash any per boosts down to 100% */
+ rack->r_ctl.rack_per_of_gp_rec = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 100;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* Set in an upper bound for ss/ca % increase */
+ rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper;
+ rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper;
+ /* Now move to the lt_bw */
+ rack->r_ctl.gp_bw = lt_bw;
+ rack->rc_gp_filled = 1;
+ rack->r_ctl.num_measurements = RACK_REQ_AVG;
+ }
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = new_cwnd;
+ log.u_bbr.flex2 = new_ssthresh;
+ log.u_bbr.flex3 = rnds;
+ log.u_bbr.flex4 = rack_rxt_min_rnds;
+ log.u_bbr.flex5 = rtt;
+ log.u_bbr.flex6 = shared_cwnd_was_enabled;
+ log.u_bbr.flex8 = 5;
+ log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
+ log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
+ log.u_bbr.applimited = rack->r_ctl.max_clamps;
+ log.u_bbr.epoch = rack->r_ctl.clamp_options;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.rxt_threshold;
+ log.u_bbr.bw_inuse = lt_bw;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
+ log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
+ tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ /* Update our point where we did it */
+ if (rack->r_ctl.already_had_a_excess == 0) {
+ rack->r_ctl.already_had_a_excess = 1;
+ counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1);
+ }
+ counter_u64_add(rack_rxt_clamps_cwnd, 1);
+ rack->r_ctl.last_sndbytes = tp->t_sndbytes;
+ rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes;
+ rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round;
+ if (new_cwnd < tp->snd_cwnd)
+ tp->snd_cwnd = new_cwnd;
+ if (new_ssthresh < tp->snd_ssthresh)
+ tp->snd_ssthresh = new_ssthresh;
+ }
+ }
+}
+
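When the excess-retransmit clamp in do_rack_excess_rxt() above fires, the new cwnd is sized to the long-term bandwidth-delay product, bdp = lt_bw * rtt / HPTS_USEC_IN_SEC, with lt_bw in bytes per second and rtt in microseconds. A worked example with assumed values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t lt_bw = 2500000;	/* long-term b/w: 2.5 MB/s (~20 Mbit/s) */
	uint64_t rtt_us = 40000;	/* rack_rtt: 40 ms in microseconds */

	uint64_t bdp = lt_bw * rtt_us / 1000000;	/* HPTS_USEC_IN_SEC */
	uint32_t new_cwnd = (uint32_t)bdp;		/* 100,000 bytes */
	uint32_t new_ssthresh = new_cwnd - 1;

	printf("new_cwnd=%u new_ssthresh=%u\n", new_cwnd, new_ssthresh);
	return (0);
}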
static void
rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
{
@@ -4672,7 +5775,7 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.flex8 = 2;
tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
+ 0, &log, false, NULL, __func__, __LINE__, &tv);
}
if ((rack->rack_no_prr == 0) &&
(rack->no_prr_addback == 0) &&
@@ -4707,6 +5810,8 @@ rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
}
EXIT_RECOVERY(tp->t_flags);
+ if (rack->r_ctl.full_dgp_in_rec)
+ rack_client_buffer_level_set(rack);
}
static void
@@ -4731,8 +5836,15 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
tp->t_flags &= ~TF_WASFRECOVERY;
tp->t_flags &= ~TF_WASCRECOVERY;
if (!IN_FASTRECOVERY(tp->t_flags)) {
+ if (rack->dgp_on && rack->r_cwnd_was_clamped) {
+ /* Reset the gains so that on exit we will be softer longer */
+ rack->r_ctl.rack_per_of_gp_rec = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 98;
+ rack->r_ctl.rack_per_of_gp_ca = 98;
+ }
rack->r_ctl.rc_prr_delivered = 0;
rack->r_ctl.rc_prr_out = 0;
+ rack->r_fast_output = 0;
if (rack->rack_no_prr == 0) {
rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
rack_log_to_prr(rack, 2, in_rec_at_entry, line);
@@ -4752,6 +5864,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
SEQ_GEQ(ack, tp->snd_recover)) {
EXIT_CONGRECOVERY(tp->t_flags);
KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
+ rack->r_fast_output = 0;
tp->snd_recover = tp->snd_max + 1;
if (tp->t_flags2 & TF2_ECN_PERMIT)
tp->t_flags2 |= TF2_ECN_SND_CWR;
@@ -4760,6 +5873,7 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
case CC_RTO:
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
+ rack->r_fast_output = 0;
EXIT_RECOVERY(tp->t_flags);
tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
@@ -4794,6 +5908,8 @@ rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
}
if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
rack_log_to_prr(rack, 15, cwnd_enter, line);
+ if (rack->r_ctl.full_dgp_in_rec)
+ rack_client_buffer_level_set(rack);
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.retran_during_recovery = 0;
rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
@@ -4880,7 +5996,8 @@ rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
* should be the last segment (which it was not).
*/
prsm = rsm;
- RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
+
+ TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) {
if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
continue;
}
@@ -5091,7 +6208,7 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
uint32_t srtt, thresh;
rack = (struct tcp_rack *)tp->t_fb_ptr;
- if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
+ if (tqhash_empty(rack->r_ctl.tqh)) {
return (NULL);
}
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
@@ -5158,6 +6275,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_
}
rack->rc_on_min_to = 0;
if ((tp->t_state < TCPS_ESTABLISHED) ||
+ (rack->sack_attack_disable > 0) ||
((tp->t_flags & TF_SACK_PERMIT) == 0)) {
goto activate_rxt;
}
@@ -5372,8 +6490,10 @@ activate_tlp:
}
static void
-rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
+rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
{
+ struct timeval tv;
+
if (rack->rc_in_persist == 0) {
if (tp->t_flags & TF_GPUTINPROG) {
/*
@@ -5389,7 +6509,19 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
rack->rack_scwnd_is_idle = 1;
}
#endif
- rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
+ rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv);
+ if (rack->lt_bw_up) {
+ /* Suspend our LT BW measurement */
+ uint64_t tmark;
+
+ rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
+ rack->r_ctl.lt_seq = snd_una;
+ tmark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ rack->lt_bw_up = 0;
+ rack->r_persist_lt_bw_off = 1;
+ }
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
@@ -5406,6 +6538,9 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
+ struct timeval tv;
+ uint32_t t_time;
+
if (tcp_in_hpts(rack->rc_inp)) {
tcp_hpts_remove(rack->rc_inp);
rack->r_ctl.rc_hpts_flags = 0;
@@ -5416,6 +6551,7 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
rack->rack_scwnd_is_idle = 0;
}
#endif
+ t_time = tcp_get_usecs(&tv);
if (rack->rc_gp_dyn_mul &&
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
@@ -5425,7 +6561,7 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
*/
uint32_t time_idle, idle_min;
- time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
+ time_idle = t_time - rack->r_ctl.rc_went_idle_time;
idle_min = rack_min_probertt_hold;
if (rack_probertt_gpsrtt_cnt_div) {
uint64_t extra;
@@ -5449,6 +6585,12 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
}
}
}
+ if (rack->r_persist_lt_bw_off) {
+ /* Continue where we left off */
+ rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
+ rack->lt_bw_up = 1;
+ rack->r_persist_lt_bw_off = 0;
+ }
rack->rc_in_persist = 0;
rack->r_ctl.rc_went_idle_time = 0;
tp->t_rxtshift = 0;
@@ -5600,13 +6742,10 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
rack->r_late = 0;
}
}
- if (slot) {
- /* We are pacing too */
- rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
- }
hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
#ifdef NETFLIX_EXP_DETECTION
if (rack->sack_attack_disable &&
+ (rack->r_ctl.ack_during_sd > 0) &&
(slot < tcp_sad_pacing_interval)) {
/*
* We have a potential attacker on
@@ -5619,6 +6758,8 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* micro-seconds, so lets convert to msecs.
*/
slot = tcp_sad_pacing_interval;
+ rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
+ rack->r_ctl.ack_during_sd = 0;
}
#endif
if (tp->t_flags & TF_DELACK) {
@@ -5738,6 +6879,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
*/
inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
if (slot) {
+ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
rack->r_ctl.rc_last_output_to = us_cts + slot;
/*
* A pacing timer (slot) is being set, in
@@ -5753,10 +6895,27 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* even a SACK should not disturb us (with
* the exception of r_rr_config 3).
*/
- if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
- (rack->r_rr_config != 3))
- inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
- if (rack->rc_ack_can_sendout_data) {
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) {
+ if (rack->r_rr_config != 3)
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ else if (rack->rc_pace_dnd) {
+ if (IN_RECOVERY(tp->t_flags)) {
+ /*
+ * When DND is on, we only let a sack
+ * interrupt us if we are not in recovery.
+ *
+ * If DND is off, then we never hit here
+ * and let all sacks wake us up.
+ *
+ */
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ }
+ }
+ }
+ /* For sack attackers we want to ignore sack */
+ if (rack->sack_attack_disable == 1) {
+ inp->inp_flags2 |= (INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
+ } else if (rack->rc_ack_can_sendout_data) {
/*
* Ahh but wait, this is that special case
* where the pacing timer can be disturbed
@@ -5791,6 +6950,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* but it may change the prr stats so letting it in (the set defaults
* at the start of this block) are good enough.
*/
+ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
@@ -5806,7 +6966,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
}
rack->rc_tmr_stopped = 0;
if (slot)
- rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
+ rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
}
/*
@@ -5859,32 +7019,62 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
return (0);
}
+
+
static void
rack_adjust_orig_mlen(struct rack_sendmap *rsm)
{
- if (rsm->m->m_len > rsm->orig_m_len) {
+
+ if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) {
/*
- * Mbuf grew, caused by sbcompress, our offset does
- * not change.
+		 * The trailing space changed; mbufs can grow
+		 * at the tail but they can't shrink from
+		 * it (the KASSERT enforces this). Adjust orig_m_len
+		 * to compensate for this change.
*/
- rsm->orig_m_len = rsm->m->m_len;
- } else if (rsm->m->m_len < rsm->orig_m_len) {
+ KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)),
+ ("mbuf:%p rsm:%p trailing_space:%lu ots:%u oml:%u mlen:%u\n",
+ rsm->m,
+ rsm,
+ M_TRAILINGROOM(rsm->m),
+ rsm->orig_t_space,
+ rsm->orig_m_len,
+ rsm->m->m_len));
+ rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m));
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ }
+ if (rsm->m->m_len < rsm->orig_m_len) {
/*
* Mbuf shrank, trimmed off the top by an ack, our
* offset changes.
*/
- rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
+ KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)),
+ ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n",
+ rsm->m, rsm->m->m_len,
+ rsm, rsm->orig_m_len,
+ rsm->soff));
+ if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len))
+ rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
+ else
+ rsm->soff = 0;
rsm->orig_m_len = rsm->m->m_len;
+#ifdef INVARIANTS
+ } else if (rsm->m->m_len > rsm->orig_m_len) {
+ panic("rsm:%p m:%p m_len grew outside of t_space compensation",
+ rsm, rsm->m);
+#endif
}
}
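The reworked rack_adjust_orig_mlen() above compensates for sbcompress() appending data into the tail of an mbuf already referenced by an rsm: if the recorded trailing room (orig_t_space) exceeds the current M_TRAILINGROOM(), the difference is exactly how much the mbuf grew, so orig_m_len is bumped by that amount. A plain-integer sketch of the bookkeeping (toy struct, no real mbufs):

#include <stdint.h>
#include <assert.h>

struct toy_map {
	uint32_t orig_m_len;	/* m_len recorded when the rsm was set up */
	uint32_t orig_t_space;	/* trailing room recorded at the same time */
};

/* cur_trailing is what M_TRAILINGROOM(rsm->m) would report now. */
void
toy_adjust_orig_mlen(struct toy_map *tm, uint32_t cur_trailing)
{
	if (cur_trailing != tm->orig_t_space) {
		/* Trailing room can only shrink (tail growth), never grow. */
		assert(tm->orig_t_space > cur_trailing);
		tm->orig_m_len += tm->orig_t_space - cur_trailing;
		tm->orig_t_space = cur_trailing;
	}
	/* e.g. orig_t_space=1024, 256 left now -> the mbuf grew by 768 bytes */
}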
static void
-rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
+rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
{
struct mbuf *m;
uint32_t soff;
- if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) {
+ if (src_rsm->m &&
+ ((src_rsm->orig_m_len != src_rsm->m->m_len) ||
+ (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) {
/* Fix up the orig_m_len and possibly the mbuf offset */
rack_adjust_orig_mlen(src_rsm);
}
@@ -5897,10 +7087,25 @@ rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm
KASSERT((m != NULL),
("rsm:%p nrsm:%p hit at soff:%u null m",
src_rsm, rsm, soff));
+ if (m == NULL) {
+ /* This should *not* happen which is why there is a kassert */
+ src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
+ (src_rsm->r_start - rack->rc_tp->snd_una),
+ &src_rsm->soff);
+ src_rsm->orig_m_len = src_rsm->m->m_len;
+ src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m);
+ rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
+ (rsm->r_start - rack->rc_tp->snd_una),
+ &rsm->soff);
+ rsm->orig_m_len = rsm->m->m_len;
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ return;
+ }
}
rsm->m = m;
rsm->soff = soff;
rsm->orig_m_len = m->m_len;
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
}
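rack_setup_offset_for_rsm() above keeps each rsm pointed at the mbuf and in-mbuf byte offset (rsm->m, rsm->soff) where its first byte lives; when that mapping goes stale it is rebuilt with sbsndmbuf() from the sequence offset r_start - snd_una. A simplified chain walk showing what that (m, soff) pair means; the toy types below stand in for struct mbuf and the real sbsndmbuf():

#include <stdint.h>
#include <stddef.h>

struct toy_mbuf {
	uint32_t m_len;			/* bytes of data in this buffer */
	struct toy_mbuf *m_next;	/* next buffer in the send chain */
};

/*
 * Given a byte offset into the send buffer, return the buffer holding
 * that byte and the offset within it.
 */
struct toy_mbuf *
toy_sndmbuf(struct toy_mbuf *m, uint32_t off, uint32_t *soff)
{
	while (m != NULL && off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}
	*soff = off;
	return (m);
}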
static __inline void
@@ -5917,6 +7122,7 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
nrsm->r_rtr_bytes = 0;
nrsm->r_fas = rsm->r_fas;
+ nrsm->r_bas = rsm->r_bas;
rsm->r_end = nrsm->r_start;
nrsm->r_just_ret = rsm->r_just_ret;
for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
@@ -5944,7 +7150,7 @@ rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
(rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
if (rsm->m)
- rack_setup_offset_for_rsm(rsm, nrsm);
+ rack_setup_offset_for_rsm(rack, rsm, nrsm);
}
static struct rack_sendmap *
@@ -5962,9 +7168,6 @@ rack_merge_rsm(struct tcp_rack *rack,
* is any reason we need to try to find
* the oldest (or last oldest retransmitted).
*/
-#ifdef INVARIANTS
- struct rack_sendmap *rm;
-#endif
rack_log_map_chg(rack->rc_tp, rack, NULL,
l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
l_rsm->r_end = r_rsm->r_end;
@@ -5997,21 +7200,40 @@ rack_merge_rsm(struct tcp_rack *rack,
if (r_rsm == rack->r_ctl.rc_first_appl)
rack->r_ctl.rc_first_appl = l_rsm;
}
-#ifndef INVARIANTS
- (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
-#else
- rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
- if (rm != r_rsm) {
- panic("removing head in rack:%p rsm:%p rm:%p",
- rack, r_rsm, rm);
+ tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE);
+ /*
+ * We keep the largest value, which is the newest
+ * send. We do this in case a segment that is
+ * joined together and not part of a GP estimate
+ * later gets expanded into the GP estimate.
+ *
+ * We prohibit the merging of unlike kinds i.e.
+ * all pieces that are in the GP estimate can be
+ * merged and all pieces that are not in a GP estimate
+	 * can be merged, but not dissimilar pieces. Combine
+ * this with taking the highest here and we should
+ * be ok unless of course the client reneges. Then
+ * all bets are off.
+ */
+	if (l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] <
+ r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) {
+ l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)];
}
-#endif
+ /*
+ * When merging two RSM's we also need to consider the ack time and keep
+ * newest. If the ack gets merged into a measurement then that is the
+ * one we will want to be using.
+ */
+	if (l_rsm->r_ack_arrival < r_rsm->r_ack_arrival)
+ l_rsm->r_ack_arrival = r_rsm->r_ack_arrival;
+
if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
/* Transfer the split limit to the map we free */
r_rsm->r_limit_type = l_rsm->r_limit_type;
l_rsm->r_limit_type = 0;
}
rack_free(rack, r_rsm);
+ l_rsm->r_flags |= RACK_MERGED;
return (l_rsm);
}
@@ -6030,9 +7252,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
* Tail Loss Probe.
*/
struct rack_sendmap *rsm = NULL;
-#ifdef INVARIANTS
- struct rack_sendmap *insret;
-#endif
+ int insret __diagused;
struct socket *so = tptosocket(tp);
uint32_t amm;
uint32_t out, avail;
@@ -6136,7 +7356,7 @@ need_retran:
if (rack_always_send_oldest)
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
else {
- rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
rsm = rack_find_high_nonack(rack, rsm);
}
@@ -6149,24 +7369,17 @@ need_retran:
}
} else {
/*
- * We must find the last segment
- * that was acceptable by the client.
+		 * We had a collapsed window, let's find
+ * the point before the collapse.
*/
- RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
- if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
- /* Found one */
- break;
- }
+ if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una))
+ rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1));
+ else {
+ rsm = tqhash_min(rack->r_ctl.tqh);
}
if (rsm == NULL) {
- /* None? if so send the first */
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- if (rsm == NULL) {
-#ifdef TCP_BLACKBOX
- tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
-#endif
- goto out;
- }
+ /* Huh */
+ goto out;
}
}
if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
@@ -6187,11 +7400,10 @@ need_retran:
(rsm->r_end - ctf_fixed_maxseg(tp)));
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -6303,6 +7515,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
rack->r_ctl.persist_lost_ends++;
}
counter_u64_add(rack_persists_sends, 1);
+ counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6414,9 +7627,11 @@ rack_remxt_tmr(struct tcpcb *tp)
* back in its seq ordered place.
*/
TAILQ_INIT(&rack->r_ctl.rc_tmap);
- RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
+
+ TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
rsm->r_dupack = 0;
- rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ if (rack_verbose_logging)
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
/* We must re-add it back to the tlist */
if (trsm == NULL) {
TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -6439,7 +7654,7 @@ rack_remxt_tmr(struct tcpcb *tp)
rack->r_ctl.rc_agg_early = 0;
rack->r_late = 0;
/* Clear the tlp rtx mark */
- rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
if (rack->r_ctl.rc_resend != NULL)
rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
rack->r_ctl.rc_prr_sndcnt = 0;
@@ -6465,46 +7680,7 @@ rack_remxt_tmr(struct tcpcb *tp)
static void
rack_convert_rtts(struct tcpcb *tp)
{
- if (tp->t_srtt > 1) {
- uint32_t val, frac;
-
- val = tp->t_srtt >> TCP_RTT_SHIFT;
- frac = tp->t_srtt & 0x1f;
- tp->t_srtt = TICKS_2_USEC(val);
- /*
- * frac is the fractional part of the srtt (if any)
- * but its in ticks and every bit represents
- * 1/32nd of a hz.
- */
- if (frac) {
- if (hz == 1000) {
- frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
- } else {
- frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
- }
- tp->t_srtt += frac;
- }
- }
- if (tp->t_rttvar) {
- uint32_t val, frac;
-
- val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
- frac = tp->t_rttvar & 0x1f;
- tp->t_rttvar = TICKS_2_USEC(val);
- /*
- * frac is the fractional part of the srtt (if any)
- * but its in ticks and every bit represents
- * 1/32nd of a hz.
- */
- if (frac) {
- if (hz == 1000) {
- frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
- } else {
- frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
- }
- tp->t_rttvar += frac;
- }
- }
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
tp->t_rxtcur = RACK_REXMTVAL(tp);
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
@@ -6613,7 +7789,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
(TP_KEEPINIT(tp) != 0)) {
struct rack_sendmap *rsm;
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_min(rack->r_ctl.tqh);
if (rsm) {
/* Ok we have something outstanding to test keepinit with */
if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
@@ -6891,8 +8067,11 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8
}
/*
* Ok our timer went off early and we are not paced false
- * alarm, go back to sleep.
+	 * alarm, go back to sleep. Make sure the no-sack
+	 * wakeup suppression is cleared since we no longer
+	 * have a PKT_OUTPUT flag in place.
*/
+ rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
ret = -3;
left = rack->r_ctl.rc_timer_exp - cts;
tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left));
@@ -6971,23 +8150,20 @@ rack_stopall(struct tcpcb *tp)
}
static void
-rack_stop_all_timers(struct tcpcb *tp)
+rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
{
- struct tcp_rack *rack;
-
/*
* Assure no timers are running.
*/
if (tcp_timer_active(tp, TT_PERSIST)) {
/* We enter in persists, set the flag appropriately */
- rack = (struct tcp_rack *)tp->t_fb_ptr;
rack->rc_in_persist = 1;
}
}
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
+ struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz)
{
int32_t idx;
@@ -7019,8 +8195,11 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
}
+ /* Lets make sure it really is in or not the GP window */
+ rack_mark_in_gp_win(tp, rsm);
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
+ rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz);
/* Take off the must retransmit flag, if its on */
if (rsm->r_flags & RACK_MUST_RXT) {
if (rack->r_must_retran)
@@ -7035,6 +8214,8 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
}
rsm->r_flags &= ~RACK_MUST_RXT;
}
+ /* Remove any collapsed flag */
+ rsm->r_flags &= ~RACK_RWND_COLLAPSED;
if (rsm->r_flags & RACK_SACK_PASSED) {
/* We have retransmitted due to the SACK pass */
rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -7044,16 +8225,14 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
+ struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz)
{
/*
* We (re-)transmitted starting at rsm->r_start for some length
* (possibly less than r_end.
*/
struct rack_sendmap *nrsm;
-#ifdef INVARIANTS
- struct rack_sendmap *insret;
-#endif
+ int insret __diagused;
uint32_t c_end;
int32_t len;
@@ -7064,7 +8243,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
* We retransmitted the whole piece or more than the whole
* slopping into the next rsm.
*/
- rack_update_rsm(tp, rack, rsm, ts, add_flag);
+ rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
if (c_end == rsm->r_end) {
*lenp = 0;
return (0);
@@ -7101,11 +8280,10 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
nrsm->r_dupack = 0;
rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -7114,7 +8292,7 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
nrsm->r_in_tmap = 1;
}
rsm->r_flags &= (~RACK_HAS_FIN);
- rack_update_rsm(tp, rack, rsm, ts, add_flag);
+ rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
/* Log a split of rsm into rsm and nrsm */
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
*lenp = 0;
@@ -7124,13 +8302,13 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
- struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
+ struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb,
+ uint32_t s_moff, int hw_tls, int segsiz)
{
struct tcp_rack *rack;
- struct rack_sendmap *rsm, *nrsm, fe;
-#ifdef INVARIANTS
- struct rack_sendmap *insret;
-#endif
+ struct rack_sendmap *rsm, *nrsm;
+ int insret __diagused;
+
register uint32_t snd_max, snd_una;
/*
@@ -7216,6 +8394,7 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
/* First question is it a retransmission or new? */
if (seq_out == snd_max) {
/* Its new */
+ rack_chk_http_and_hybrid_on_out(rack, seq_out, len, cts);
again:
rsm = rack_alloc(rack);
if (rsm == NULL) {
@@ -7241,6 +8420,7 @@ again:
}
rsm->r_start = seq_out;
rsm->r_end = rsm->r_start + len;
+ rack_mark_in_gp_win(tp, rsm);
rsm->r_dupack = 0;
/*
* save off the mbuf location that
@@ -7280,17 +8460,20 @@ again:
rsm->m = lm;
}
rsm->orig_m_len = rsm->m->m_len;
- } else
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ } else {
rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ }
+ rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz);
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
/* Log a new rsm */
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, rsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -7306,7 +8489,7 @@ again:
(ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
struct rack_sendmap *prsm;
- prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ prsm = tqhash_prev(rack->r_ctl.tqh, rsm);
if (prsm)
prsm->r_one_out_nr = 1;
}
@@ -7315,7 +8498,6 @@ again:
/*
* If we reach here its a retransmission and we need to find it.
*/
- memset(&fe, 0, sizeof(fe));
more:
if (hintrsm && (hintrsm->r_start == seq_out)) {
rsm = hintrsm;
@@ -7325,7 +8507,7 @@ more:
rsm = NULL;
}
if ((rsm) && (rsm->r_start == seq_out)) {
- seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
+ seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
if (len == 0) {
return;
} else {
@@ -7334,11 +8516,10 @@ more:
}
/* Ok it was not the last pointer go through it the hard way. */
refind:
- fe.r_start = seq_out;
- rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ rsm = tqhash_find(rack->r_ctl.tqh, seq_out);
if (rsm) {
if (rsm->r_start == seq_out) {
- seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
+ seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
if (len == 0) {
return;
} else {
@@ -7353,7 +8534,7 @@ refind:
*/
nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
- rack_update_rsm(tp, rack, rsm, cts, add_flag);
+ rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz);
return;
}
/*
@@ -7363,11 +8544,10 @@ refind:
rack_clone_rsm(rack, nrsm, rsm, seq_out);
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -7376,7 +8556,7 @@ refind:
nrsm->r_in_tmap = 1;
}
rsm->r_flags &= (~RACK_HAS_FIN);
- seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
+ seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz);
if (len == 0) {
return;
} else if (len > 0)
@@ -7394,7 +8574,7 @@ refind:
printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
seq_out, len, tp->snd_una, tp->snd_max);
printf("Starting Dump of all rack entries\n");
- RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
+ TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
printf("rsm:%p start:%u end:%u\n",
rsm, rsm->r_start, rsm->r_end);
}
@@ -7465,7 +8645,7 @@ tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t u
rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
rack->r_ctl.rack_rs.confidence = confidence;
rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
- } else if (confidence || rack->r_ctl.rack_rs.confidence) {
+ } else if (confidence != 0) {
/*
* Once we have a confident number,
* we can update it with a smaller
@@ -7625,7 +8805,7 @@ tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
/* Send in the microsecond rtt has close to the path RTT as we can get */
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
}
-
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
#endif
/*
* the retransmit should happen at rtt + 4 * rttvar. Because of the
@@ -7795,9 +8975,13 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
}
}
- if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
+ (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
+ (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
/* New more recent rack_tmit_time */
rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ if (rack->r_ctl.rc_rack_tmit_time == 0)
+ rack->r_ctl.rc_rack_tmit_time = 1;
rack->rc_rack_rtt = t;
}
return (1);
@@ -7858,10 +9042,13 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
- (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
+ (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
+ (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
/* New more recent rack_tmit_time */
rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ if (rack->r_ctl.rc_rack_tmit_time == 0)
+ rack->r_ctl.rc_rack_tmit_time = 1;
rack->rc_rack_rtt = t;
}
rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
@@ -7870,6 +9057,12 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
return (1);
}
}
+ /* If we are logging log out the sendmap */
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ for (i = 0; i < rsm->r_rtr_cnt; i++) {
+ rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr);
+ }
+ }
goto ts_not_found;
} else {
/*
@@ -7906,9 +9099,13 @@ ts_not_found:
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
+ if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
+ (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
+ (uint32_t)rsm->r_tim_lastsent[i]))) {
/* New more recent rack_tmit_time */
rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
+ if (rack->r_ctl.rc_rack_tmit_time == 0)
+ rack->r_ctl.rc_rack_tmit_time = 1;
rack->rc_rack_rtt = t;
}
return (1);
@@ -7969,6 +9166,7 @@ rack_need_set_test(struct tcpcb *tp,
int line,
int use_which)
{
+ struct rack_sendmap *s_rsm;
if ((tp->t_flags & TF_GPUTINPROG) &&
SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
@@ -8006,26 +9204,24 @@ rack_need_set_test(struct tcpcb *tp,
* up some of the rsm, we set RACK_USE_BEG
* since whats at r_start (i.e. th_ack)
* is left unacked and thats where the
- * measurement not starts.
+ * measurement now starts.
*/
tp->gput_seq = rsm->r_start;
- rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
}
if ((use_which == RACK_USE_END) &&
SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
- /*
- * We use the end when the cumack
- * is moving forward and completely
- * deleting the rsm passed so basically
- * r_end holds th_ack.
- *
- * For SACK's we also want to use the end
- * since this piece just got sacked and
- * we want to target anything after that
- * in our measurement.
- */
- tp->gput_seq = rsm->r_end;
- rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ /*
+ * We use the end when the cumack
+ * is moving forward and completely
+ * deleting the rsm passed so basically
+ * r_end holds th_ack.
+ *
+ * For SACK's we also want to use the end
+ * since this piece just got sacked and
+ * we want to target anything after that
+ * in our measurement.
+ */
+ tp->gput_seq = rsm->r_end;
}
if (use_which == RACK_USE_END_OR_THACK) {
/*
@@ -8038,7 +9234,28 @@ rack_need_set_test(struct tcpcb *tp,
tp->gput_seq = th_ack;
else
tp->gput_seq = rsm->r_end;
- rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ }
+ if (SEQ_LT(tp->gput_seq, tp->snd_max))
+ s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
+ else
+ s_rsm = NULL;
+ /*
+	 * Pick up the correct send time if we can; the rsm passed in
+ * may be equal to s_rsm if the RACK_USE_BEG was set. For the other
+ * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will
+ * find a different seq i.e. the next send up.
+ *
+ * If that has not been sent, s_rsm will be NULL and we must
+ * arrange it so this function will get called again by setting
+ * app_limited_needs_set.
+ */
+ if (s_rsm)
+ rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0];
+ else {
+ /* If we hit here we have to have *not* sent tp->gput_seq */
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
+ /* Set it up so we will go through here again */
+ rack->app_limited_needs_set = 1;
}
if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
/*
@@ -8060,7 +9277,10 @@ rack_need_set_test(struct tcpcb *tp,
(SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
rack->measure_saw_probe_rtt = 0;
rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
- seq, tp->gput_seq, 0, 5, line, NULL, 0);
+ seq, tp->gput_seq,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
+ (uint64_t)rack->r_ctl.rc_gp_output_ts),
+ 5, line, NULL, 0);
if (rack->rc_gp_filled &&
((tp->gput_ack - tp->gput_seq) <
max(rc_init_window(rack), (MIN_GP_WIN *
@@ -8077,7 +9297,10 @@ rack_need_set_test(struct tcpcb *tp,
*/
tp->t_flags &= ~TF_GPUTINPROG;
rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
- 0, 0, 0, 6, __LINE__, NULL, 0);
+ 0, 0,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
+ (uint64_t)rack->r_ctl.rc_gp_output_ts),
+ 6, __LINE__, NULL, 0);
} else {
/*
* Reset the window further out.
@@ -8085,6 +9308,8 @@ rack_need_set_test(struct tcpcb *tp,
tp->gput_ack = tp->gput_seq + ideal_amount;
}
}
+ rack_tend_gp_marks(tp, rack);
+ rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm);
}
}
@@ -8104,23 +9329,43 @@ is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm
}
+
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
- struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
+ struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
+ int *no_extra,
+ int *moved_two, uint32_t segsiz)
{
uint32_t start, end, changed = 0;
struct rack_sendmap stack_map;
- struct rack_sendmap *rsm, *nrsm, fe, *prev, *next;
-#ifdef INVARIANTS
- struct rack_sendmap *insret;
-#endif
+ struct rack_sendmap *rsm, *nrsm, *prev, *next;
+ int insret __diagused;
int32_t used_ref = 1;
int moved = 0;
+#ifdef NETFLIX_EXP_DETECTION
+ int allow_segsiz;
+ int first_time_through = 1;
+#endif
+ int noextra = 0;
+ int can_use_hookery = 0;
start = sack->start;
end = sack->end;
rsm = *prsm;
- memset(&fe, 0, sizeof(fe));
+#ifdef NETFLIX_EXP_DETECTION
+ /*
+	 * There are a surprising number of proxies and middleboxes in the world
+ * that seem to cut up segments on different boundaries. This gets us
+ * smaller sacks that are still ok in terms of it being an attacker.
+ * We use the base segsiz to calculate an allowable smallness but
+ * also enforce a min on the segsiz in case it is an attacker playing
+ * games with MSS. So basically if the sack arrives and it is
+	 * larger than a worst-case 960 bytes, we don't classify the sender
+	 * as suspicious.
+ */
+ allow_segsiz = max(segsiz, 1200) * sad_seg_size_per;
+ allow_segsiz /= 1000;
+#endif
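The allow_segsiz computation above scales the effective segment size by sad_seg_size_per (800, i.e. 80.0%) with a 1200-byte floor, which is where the "worst-case 960 bytes" in the comment comes from. Worked numbers with a typical MSS:

#include <stdio.h>

int
main(void)
{
	int sad_seg_size_per = 800;	/* 80.0% */
	int segsiz = 1448;		/* e.g. 1500-byte MTU with timestamps */

	int floored = (segsiz > 1200) ? segsiz : 1200;		/* max(segsiz, 1200) */
	int allow_segsiz = floored * sad_seg_size_per / 1000;	/* 1158 */

	printf("allow_segsiz=%d (floor case: %d)\n",
	    allow_segsiz, 1200 * sad_seg_size_per / 1000);	/* floor: 960 */
	return (0);
}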
do_rest_ofb:
if ((rsm == NULL) ||
(SEQ_LT(end, rsm->r_start)) ||
@@ -8131,14 +9376,106 @@ do_rest_ofb:
* find the correct spot in the tree.
*/
used_ref = 0;
- fe.r_start = start;
- rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ rsm = tqhash_find(rack->r_ctl.tqh, start);
moved++;
}
if (rsm == NULL) {
/* TSNH */
goto out;
}
+#ifdef NETFLIX_EXP_DETECTION
+	/* Now we must check for suspicious activity */
+ if ((first_time_through == 1) &&
+ ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) &&
+ ((rsm->r_flags & RACK_PMTU_CHG) == 0) &&
+ ((rsm->r_flags & RACK_TLP) == 0)) {
+ /*
+		 * It is less than a full MSS or smaller than the segment
+		 * being acked; this should only happen if the rsm in
+		 * question had the r_just_ret flag set <and> the end
+		 * matches the end of the rsm block.
+ *
+		 * Note we do not look at segments that have had TLP's on
+		 * them, since an unreported rwnd collapse can cause us to
+		 * TLP and then get back a sack block that covers only a
+		 * small part from the start.
+ *
+ */
+ int loss, ok;
+
+ ok = 0;
+ if (SEQ_GEQ(end, rsm->r_end)) {
+ if (rsm->r_just_ret == 1) {
+ /* This was at the end of a send which is ok */
+ ok = 1;
+ } else {
+				/* A bit harder: was it the end of our segment? */
+ int segs, len;
+
+ len = (rsm->r_end - rsm->r_start);
+ segs = len / segsiz;
+ segs *= segsiz;
+ if ((segs + (rsm->r_end - start)) == len) {
+ /*
+					 * So this last bit was the
+					 * end of our send when cut
+					 * into segsiz pieces, so it's ok.
+ */
+ ok = 1;
+ }
+ }
+ }
+ if (ok == 0) {
+ /*
+			 * This peer is doing something suspicious;
+			 * let's start detection.
+ */
+ if (rack->rc_suspicious == 0) {
+ tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT);
+ counter_u64_add(rack_sack_attacks_suspect, 1);
+ rack->rc_suspicious = 1;
+ rack_log_sad(rack, 4);
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = end;
+ log.u_bbr.flex2 = start;
+ log.u_bbr.flex3 = rsm->r_end;
+ log.u_bbr.flex4 = rsm->r_start;
+ log.u_bbr.flex5 = segsiz;
+ log.u_bbr.flex6 = rsm->r_fas;
+ log.u_bbr.flex7 = rsm->r_bas;
+ log.u_bbr.flex8 = 5;
+ log.u_bbr.pkts_out = rsm->r_flags;
+ log.u_bbr.bbr_state = rack->rc_suspicious;
+ log.u_bbr.bbr_substate = rsm->r_just_ret;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_SAD_DETECTION, 0,
+ 0, &log, false, &tv);
+ }
+ }
+			/* You lose some ack count every time you sack
+			 * a small bit that does not butt up against the
+			 * end of what we have sent. This is because we never
+			 * send small bits unless it is the end of the sb.
+			 * Anyone sending a sack that is not at the end
+			 * is thus very, very suspicious.
+			 */
+ loss = (segsiz/2) / (end - start);
+ if (loss < rack->r_ctl.ack_count)
+ rack->r_ctl.ack_count -= loss;
+ else
+ rack->r_ctl.ack_count = 0;
+ }
+ }
+ first_time_through = 0;
+#endif
/* Ok we have an ACK for some piece of this rsm */
if (rsm->r_start != start) {
if ((rsm->r_flags & RACK_ACKED) == 0) {
@@ -8198,8 +9535,30 @@ do_rest_ofb:
* the next guy and it is already sacked.
*
*/
- next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (next && (next->r_flags & RACK_ACKED) &&
+ /*
+ * Hookery can only be used if the two entries
+ * are in the same bucket and neither one of
+		 * them straddles the bucket line.
+ */
+ next = tqhash_next(rack->r_ctl.tqh, rsm);
+ if (next &&
+ (rsm->bindex == next->bindex) &&
+ ((rsm->r_flags & RACK_STRADDLE) == 0) &&
+ ((next->r_flags & RACK_STRADDLE) == 0) &&
+ (rsm->r_flags & RACK_IN_GP_WIN) &&
+ (next->r_flags & RACK_IN_GP_WIN))
+ can_use_hookery = 1;
+ else if (next &&
+ (rsm->bindex == next->bindex) &&
+ ((rsm->r_flags & RACK_STRADDLE) == 0) &&
+ ((next->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
+ ((next->r_flags & RACK_IN_GP_WIN) == 0))
+ can_use_hookery = 1;
+ else
+ can_use_hookery = 0;
+ if (next && can_use_hookery &&
+ (next->r_flags & RACK_ACKED) &&
SEQ_GEQ(end, next->r_start)) {
/**
* So the next one is already acked, and
@@ -8218,13 +9577,44 @@ do_rest_ofb:
* use to update all the gizmos.
*/
/* Copy up our fudge block */
+ noextra++;
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
/* Now adjust our tree blocks */
rsm->r_end = start;
next->r_start = start;
+ rsm->r_flags |= RACK_SHUFFLED;
+ next->r_flags |= RACK_SHUFFLED;
/* Now we must adjust back where next->m is */
- rack_setup_offset_for_rsm(rsm, next);
+ rack_setup_offset_for_rsm(rack, rsm, next);
+ /*
+ * Which timestamp do we keep? It is rather
+ * important in GP measurements to have the
+ * accurate end of the send window.
+ *
+ * We keep the largest value, which is the newest
+ * send. We do this in case a segment that is
+ * joined together and not part of a GP estimate
+ * later gets expanded into the GP estimate.
+ *
+ * We prohibit the merging of unlike kinds i.e.
+ * all pieces that are in the GP estimate can be
+ * merged and all pieces that are not in a GP estimate
+ * can be merged, but not dissimilar pieces. Combine
+ * this with taking the highest here and we should
+ * be ok unless of course the client reneges. Then
+ * all bets are off.
+ */
+ if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] <
+ nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)])
+ next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)];
+ /*
+ * And we must keep the newest ack arrival time.
+ */
+ if (next->r_ack_arrival <
+ rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
+ next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
+
/* We don't need to adjust rsm, it did not change */
/* Clear out the dup ack count of the remainder */
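A compact sketch of the merge rule described in the comment above: when two like sendmap pieces are joined, keep the newest send timestamp and the newest ack arrival so a later expansion of the GP window still sees the correct end-of-send time. The field names below are simplified stand-ins for the rack_sendmap members:

#include <stdint.h>

/*
 * When merging two like pieces, retain the newest last-sent time and
 * the newest ack-arrival time on the surviving entry.
 */
static void
merge_keep_newest(uint64_t *dst_lastsent, uint64_t src_lastsent,
    uint32_t *dst_ack_arrival, uint32_t now_usec)
{
	if (*dst_lastsent < src_lastsent)
		*dst_lastsent = src_lastsent;
	if (*dst_ack_arrival < now_usec)
		*dst_ack_arrival = now_usec;
}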
@@ -8238,9 +9628,14 @@ do_rest_ofb:
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
changed += (nrsm->r_end - nrsm->r_start);
+ /* You get a count for acking a whole segment or more */
+ if ((nrsm->r_end - nrsm->r_start) >= segsiz)
+ rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
/*
* Now we want to go up from rsm (the
@@ -8271,7 +9666,7 @@ do_rest_ofb:
counter_u64_add(rack_sack_used_next_merge, 1);
/* Postion for the next block */
start = next->r_end;
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
+ rsm = tqhash_next(rack->r_ctl.tqh, next);
if (rsm == NULL)
goto out;
} else {
@@ -8302,13 +9697,13 @@ do_rest_ofb:
}
counter_u64_add(rack_sack_splits, 1);
rack_clone_rsm(rack, nrsm, rsm, start);
+ moved++;
rsm->r_just_ret = 0;
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -8327,12 +9722,12 @@ do_rest_ofb:
moved++;
if (end == rsm->r_end) {
/* Done with block */
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
goto out;
} else if (SEQ_LT(end, rsm->r_end)) {
/* A partial sack to a already sacked block */
moved++;
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
goto out;
} else {
/*
@@ -8341,7 +9736,7 @@ do_rest_ofb:
* next block.
*/
start = rsm->r_end;
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
if (rsm == NULL)
goto out;
}
@@ -8397,6 +9792,9 @@ do_rest_ofb:
}
rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
changed += (rsm->r_end - rsm->r_start);
+ /* You get a count for acking a whole segment or more */
+ if ((rsm->r_end - rsm->r_start) >= segsiz)
+ rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
if (rsm->r_in_tmap) /* should be true */
rack_log_sack_passed(tp, rack, rsm);
@@ -8404,6 +9802,8 @@ do_rest_ofb:
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
@@ -8426,7 +9826,7 @@ do_rest_ofb:
* There is more not coverend by this rsm move on
* to the next block in the RB tree.
*/
- nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
start = rsm->r_end;
rsm = nrsm;
if (rsm == NULL)
@@ -8478,8 +9878,30 @@ do_rest_ofb:
rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
}
}
- prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ /*
+ * Hookery can only be used if the two entries
+ * are in the same bucket and neither one of
+ * them straddles the bucket line.
+ */
+ prev = tqhash_prev(rack->r_ctl.tqh, rsm);
if (prev &&
+ (rsm->bindex == prev->bindex) &&
+ ((rsm->r_flags & RACK_STRADDLE) == 0) &&
+ ((prev->r_flags & RACK_STRADDLE) == 0) &&
+ (rsm->r_flags & RACK_IN_GP_WIN) &&
+ (prev->r_flags & RACK_IN_GP_WIN))
+ can_use_hookery = 1;
+ else if (prev &&
+ (rsm->bindex == prev->bindex) &&
+ ((rsm->r_flags & RACK_STRADDLE) == 0) &&
+ ((prev->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
+ ((prev->r_flags & RACK_IN_GP_WIN) == 0))
+ can_use_hookery = 1;
+ else
+ can_use_hookery = 0;
+
+ if (prev && can_use_hookery &&
(prev->r_flags & RACK_ACKED)) {
/**
* Goal, we want the right remainder of rsm to shrink
@@ -8498,22 +9920,55 @@ do_rest_ofb:
* Note if either prev/rsm is a TLP we don't
* do this.
*/
+ noextra++;
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
prev->r_end = end;
rsm->r_start = end;
+ rsm->r_flags |= RACK_SHUFFLED;
+ prev->r_flags |= RACK_SHUFFLED;
/* Now adjust nrsm (stack copy) to be
* the one that is the small
* piece that was "sacked".
*/
nrsm->r_end = end;
rsm->r_dupack = 0;
+ /*
+ * Which timestamp do we keep? It is rather
+ * important in GP measurements to have the
+ * accurate end of the send window.
+ *
+ * We keep the largest value, which is the newest
+ * send. We do this in case a segment that is
+ * joined together and not part of a GP estimate
+ * later gets expanded into the GP estimate.
+ *
+ * We prohibit the merging of unlike kinds i.e.
+ * all pieces that are in the GP estimate can be
+ * merged and all pieces that are not in a GP estimate
+ * can be merged, but not dissimilar pieces. Combine
+ * this with taking the highest here and we should
+ * be ok unless of course the client reneges. Then
+ * all bets are off.
+ */
+ if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] <
+ nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) {
+ prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ }
+ /*
+ * And we must keep the newest ack arrival time.
+ */
+
+ if(prev->r_ack_arrival <
+ rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
+ prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
+
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
/*
* Now that the rsm has had its start moved forward
* lets go ahead and get its new place in the world.
*/
- rack_setup_offset_for_rsm(prev, rsm);
+ rack_setup_offset_for_rsm(rack, prev, rsm);
/*
* Now nrsm is our new little piece
* that is acked (which was merged
@@ -8524,9 +9979,15 @@ do_rest_ofb:
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
changed += (nrsm->r_end - nrsm->r_start);
+ /* You get a count for acking a whole segment or more */
+ if ((nrsm->r_end - nrsm->r_start) >= segsiz)
+ rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
+
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
rsm = prev;
@@ -8550,32 +10011,32 @@ do_rest_ofb:
*/
if (rack->rc_last_tlp_acked_set &&
(is_rsm_inside_declared_tlp_block(rack, rsm))) {
- /*
- * We already turned this on since this block is inside
- * the previous one was a partially sack now we
- * are getting another one (maybe all of it).
- */
- rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
- /*
- * Lets make sure we have all of it though.
- */
- if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
- rack->r_ctl.last_tlp_acked_start = rsm->r_start;
- rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
- rack->r_ctl.last_tlp_acked_end);
- }
- if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
- rack->r_ctl.last_tlp_acked_end = rsm->r_end;
- rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
- rack->r_ctl.last_tlp_acked_end);
- }
- } else {
- rack->r_ctl.last_tlp_acked_start = rsm->r_start;
- rack->r_ctl.last_tlp_acked_end = rsm->r_end;
- rack->rc_last_tlp_acked_set = 1;
- rack->rc_last_tlp_past_cumack = 0;
- rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
- }
+ /*
+ * We already turned this on since this block is inside
+ * the previous one was a partially sack now we
+ * are getting another one (maybe all of it).
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_acked_set = 1;
+ rack->rc_last_tlp_past_cumack = 0;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
}
/**
* In this case nrsm becomes
@@ -8597,14 +10058,14 @@ do_rest_ofb:
*/
counter_u64_add(rack_sack_splits, 1);
rack_clone_rsm(rack, nrsm, rsm, end);
+ moved++;
rsm->r_flags &= (~RACK_HAS_FIN);
rsm->r_just_ret = 0;
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:% rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -8616,6 +10077,10 @@ do_rest_ofb:
rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
changed += (rsm->r_end - rsm->r_start);
+ /* You get a count for acking a whole segment or more */
+ if ((rsm->r_end - rsm->r_start) >= segsiz)
+ rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
+
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
if (rsm->r_in_tmap) /* should be true */
rack_log_sack_passed(tp, rack, rsm);
@@ -8623,6 +10088,8 @@ do_rest_ofb:
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
@@ -8650,26 +10117,58 @@ out:
* with either the previous or
* next block?
*/
- next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ next = tqhash_next(rack->r_ctl.tqh, rsm);
while (next) {
if (next->r_flags & RACK_TLP)
break;
+ /* Only allow merges between ones in or out of GP window */
+ if ((next->r_flags & RACK_IN_GP_WIN) &&
+ ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
+ break;
+ }
+ if ((rsm->r_flags & RACK_IN_GP_WIN) &&
+ ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
+ break;
+ }
+ if (rsm->bindex != next->bindex)
+ break;
+ if (rsm->r_flags & RACK_STRADDLE)
+ break;
+ if (next->r_flags & RACK_STRADDLE)
+ break;
if (next->r_flags & RACK_ACKED) {
- /* yep this and next can be merged */
+ /* yep this and next can be merged */
rsm = rack_merge_rsm(rack, rsm, next);
- next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ noextra++;
+ next = tqhash_next(rack->r_ctl.tqh, rsm);
} else
break;
}
/* Now what about the previous? */
- prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ prev = tqhash_prev(rack->r_ctl.tqh, rsm);
while (prev) {
if (prev->r_flags & RACK_TLP)
break;
+ /* Only allow merges between ones in or out of GP window */
+ if ((prev->r_flags & RACK_IN_GP_WIN) &&
+ ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
+ break;
+ }
+ if ((rsm->r_flags & RACK_IN_GP_WIN) &&
+ ((prev->r_flags & RACK_IN_GP_WIN) == 0)) {
+ break;
+ }
+ if (rsm->bindex != prev->bindex)
+ break;
+ if (rsm->r_flags & RACK_STRADDLE)
+ break;
+ if (prev->r_flags & RACK_STRADDLE)
+ break;
if (prev->r_flags & RACK_ACKED) {
/* yep the previous and this can be merged */
rsm = rack_merge_rsm(rack, prev, rsm);
- prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ noextra++;
+ prev = tqhash_prev(rack->r_ctl.tqh, rsm);
} else
break;
}
@@ -8680,13 +10179,11 @@ out:
counter_u64_add(rack_sack_proc_short, 1);
}
/* Save off the next one for quick reference. */
- if (rsm)
- nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- else
- nrsm = NULL;
+ nrsm = tqhash_find(rack->r_ctl.tqh, end);
*prsm = rack->r_ctl.rc_sacklast = nrsm;
/* Pass back the moved. */
*moved_two = moved;
+ *no_extra = noextra;
return (changed);
}
@@ -8715,7 +10212,7 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac
tmap = rsm;
}
tmap->r_in_tmap = 1;
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
}
/*
* Now lets possibly clear the sack filter so we start
@@ -8751,6 +10248,8 @@ rack_do_decay(struct tcp_rack *rack)
* Check for decay of non-SAD,
* we want all SAD detection metrics to
* decay 1/4 per second (or more) passed.
+ * Current default is 800, so each metric decays
+ * to 80% of its value every second.
*/
#ifdef NETFLIX_EXP_DETECTION
uint32_t pkt_delta;
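A minimal sketch of the decay arithmetic referenced above, assuming the decay value is expressed in parts per thousand (the stated default of 800 retains 80% per second, e.g. 1000 -> 800 -> 640 -> 512):

#include <stdint.h>

/* Apply one second's decay to a SAD metric. */
static uint32_t
decay_metric(uint32_t val, uint32_t decay_per_thousand)
{
	uint64_t v = val;

	v *= decay_per_thousand;	/* e.g. 800 keeps 80% */
	v /= 1000;
	return ((uint32_t)v);
}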
@@ -8784,20 +10283,110 @@ rack_do_decay(struct tcp_rack *rack)
}
}
+static void inline
+rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from)
+{
+ /*
+ * We look at advancing the end send time for our GP
+ * measurement tracking only as the cumulative acknowledgment
+ * moves forward. You might wonder about this, why not
+ * at every transmission or retransmission within the
+ * GP window update the rc_gp_cumack_ts? Well it's rather
+ * nuanced, but basically the GP window *may* expand (as
+ * it does below) or, worse and harder to track, it may shrink.
+ *
+ * This last makes it impossible to track at the time of
+ * the send, since you may set forward your rc_gp_cumack_ts
+ * when you send, because that send *is* in your currently
+ * "guessed" window, but then it shrinks. Now which was
+ * the send time of the last bytes in the window, by the
+ * time you ask that question that part of the sendmap
+ * is freed. So you don't know and you will have too
+ * long of send window. Instead by updating the time
+ * marker only when the cumack advances this assures us
+ * that we will have only the sends in the window of our
+ * GP measurement.
+ *
+ * Another complication from this is the
+ * merging of sendmap entries. During SACK processing this
+ * can happen to conserve the sendmap size. That breaks
+ * everything down in tracking the send window of the GP
+ * estimate. So to prevent that and keep it working with
+ * a tiny bit more limited merging, we only allow like
+ * types to be merged. I.e. if two sends are in the GP window
+ * then its ok to merge them together. If two sends are not
+ * in the GP window its ok to merge them together too. Though
+ * one send in and one send out cannot be merged. We combine
+ * this with never allowing the shrinking of the GP window when
+ * we are in recovery so that we can properly calculate the
+ * sending times.
+ *
+ * This all of course seems complicated, because it is.. :)
+ *
+ * The cum-ack is being advanced upon the sendmap.
+ * If we are not doing a GP estimate don't
+ * proceed.
+ */
+ uint64_t ts;
+
+ if ((tp->t_flags & TF_GPUTINPROG) == 0)
+ return;
+ /*
+ * If this sendmap entry is going
+ * beyond the measurement window we had picked,
+ * expand the measurement window by that much.
+ */
+ if (SEQ_GT(rsm->r_end, tp->gput_ack)) {
+ tp->gput_ack = rsm->r_end;
+ }
+ /*
+ * If we have not set up an ack, then we
+ * have no idea if the newly acked pieces
+ * will be "in our seq measurement range". If
+ * they are, the timestamp will be updated when
+ * we clear the app_limited_needs_set flag.
+ */
+ if (rack->app_limited_needs_set)
+ return;
+ /*
+ * Finally, we grab the latest timestamp
+ * at which this packet was sent and then see
+ * if:
+ * a) The packet touches our newly defined GP range.
+ * b) The time is greater (newer) than the
+ * one we currently have. If so we update
+ * our sending end time window.
+ *
+ * Note we *do not* do this at send time. The reason
+ * is that if you do you *may* pick up a newer timestamp
+ * for a range you are not going to measure. We project
+ * out how far and then sometimes modify that to be
+ * smaller. If that occurs then you will have a send
+ * that does not belong to the range included.
+ */
+ if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <=
+ rack->r_ctl.rc_gp_cumack_ts)
+ return;
+ if (rack_in_gp_window(tp, rsm)) {
+ rack->r_ctl.rc_gp_cumack_ts = ts;
+ rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end,
+ __LINE__, from, rsm);
+ }
+}
+
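A minimal sketch of the rule rack_rsm_sender_update() enforces: the GP end-of-send marker only moves forward from the cum-ack path, and only when the freed piece lies inside the GP window and carries a newer send time. The names are illustrative:

#include <stdint.h>

/*
 * Advance the GP end-of-send timestamp only for newer sends that are
 * inside the current goodput measurement window.
 */
static void
gp_cumack_update(uint64_t *gp_cumack_ts, uint64_t piece_lastsent,
    int piece_in_gp_window)
{
	if (piece_lastsent <= *gp_cumack_ts)
		return;		/* not newer, nothing to do */
	if (piece_in_gp_window)
		*gp_cumack_ts = piece_lastsent;
}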
static void
-rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
+rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime)
{
struct rack_sendmap *rsm;
-#ifdef INVARIANTS
- struct rack_sendmap *rm;
-#endif
-
/*
* The ACK point is advancing to th_ack, we must drop off
* the packets in the rack log and calculate any eligble
* RTT's.
*/
+
rack->r_wanted_output = 1;
+ if (SEQ_GT(th_ack, tp->snd_una))
+ rack->r_ctl.last_cumack_advance = acktime;
/* Tend any TLP that has been marked for 1/2 the seq space (its old) */
if ((rack->rc_last_tlp_acked_set == 1)&&
@@ -8848,7 +10437,7 @@ rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_
rack->rc_last_sent_tlp_past_cumack = 1;
}
more:
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_min(rack->r_ctl.tqh);
if (rsm == NULL) {
if ((th_ack - 1) == tp->iss) {
/*
@@ -8932,17 +10521,12 @@ more:
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
rsm->r_rtr_bytes = 0;
- /* Record the time of highest cumack sent */
- rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
-#ifndef INVARIANTS
- (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#else
- rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (rm != rsm) {
- panic("removing head in rack:%p rsm:%p rm:%p",
- rack, rsm, rm);
- }
-#endif
+ /*
+ * Record the time of highest cumack sent if its in our measurement
+ * window and possibly bump out the end.
+ */
+ rack_rsm_sender_update(rack, tp, rsm, 4);
+ tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
@@ -8965,6 +10549,8 @@ more:
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
if (rack->r_ent_rec_ns) {
/*
* We have sent no more, and we saw an sack
@@ -8998,7 +10584,7 @@ more:
goto more;
}
/* Check for reneging */
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_min(rack->r_ctl.tqh);
if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
/*
* The peer has moved snd_una up to
@@ -9046,17 +10632,21 @@ more:
}
/*
* Update where the piece starts and record
- * the time of send of highest cumack sent.
+ * the time of send of highest cumack sent if
+ * its in our GP range.
*/
- rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
/* Now we need to move our offset forward too */
- if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
+ if (rsm->m &&
+ ((rsm->orig_m_len != rsm->m->m_len) ||
+ (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
/* Fix up the orig_m_len and possibly the mbuf offset */
rack_adjust_orig_mlen(rsm);
}
rsm->soff += (th_ack - rsm->r_start);
- rsm->r_start = th_ack;
+ rack_rsm_sender_update(rack, tp, rsm, 5);
+ /* The trim will move th_ack into r_start for us */
+ tqhash_trim(rack->r_ctl.tqh, th_ack);
/* Now do we need to move the mbuf fwd too? */
if (rsm->m) {
while (rsm->soff >= rsm->m->m_len) {
@@ -9067,8 +10657,10 @@ more:
rsm, rsm->soff));
}
rsm->orig_m_len = rsm->m->m_len;
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
}
- if (rack->app_limited_needs_set)
+ if (rack->app_limited_needs_set &&
+ SEQ_GEQ(th_ack, tp->gput_seq))
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
}
@@ -9114,13 +10706,76 @@ rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
}
#ifdef NETFLIX_EXP_DETECTION
+
+static void
+rack_merge_out_sacks(struct tcp_rack *rack)
+{
+ struct rack_sendmap *cur, *next, *rsm, *trsm = NULL;
+
+ cur = tqhash_min(rack->r_ctl.tqh);
+ while (cur) {
+ next = tqhash_next(rack->r_ctl.tqh, cur);
+ /*
+ * The idea is to go through all of them and merge back
+ * together the pieces that were sent together.
+ */
+ if ((next != NULL) &&
+ (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) {
+ rack_merge_rsm(rack, cur, next);
+ } else {
+ cur = next;
+ }
+ }
+ /*
+ * Now treat it like a rxt event: everything is outstanding
+ * and sent, nothing is acked, and dupacks are all zero. If this
+ * is not an attacker it will have to dupack its way through
+ * it all.
+ */
+ TAILQ_INIT(&rack->r_ctl.rc_tmap);
+ TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
+ rsm->r_dupack = 0;
+ /* We must re-add it back to the tlist */
+ if (trsm == NULL) {
+ TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ } else {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
+ }
+ rsm->r_in_tmap = 1;
+ trsm = rsm;
+ rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
+ }
+ sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una);
+}
+
static void
rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz)
{
- if ((rack->do_detection || tcp_force_detection) &&
- tcp_sack_to_ack_thresh &&
- tcp_sack_to_move_thresh &&
- ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
+ int do_detection = 0;
+
+ if (rack->sack_attack_disable || rack->rc_suspicious) {
+ /*
+ * If we have been disabled we must detect
+ * to possibly reverse it. Or if the guy has
+ * sent in suspicious sacks we want to do detection too.
+ */
+ do_detection = 1;
+
+ } else if ((rack->do_detection || tcp_force_detection) &&
+ (tcp_sack_to_ack_thresh > 0) &&
+ (tcp_sack_to_move_thresh > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) {
+ /*
+ * We only detect here if:
+ * 1) System wide forcing is on <or> do_detection is on
+ * <and>
+ * 2) We have thresholds for move and ack (set one to 0 and we are off)
+ * <and>
+ * 3) We have maps allocated larger than our min (500).
+ */
+ do_detection = 1;
+ }
+ if (do_detection > 0) {
/*
* We have thresholds set to find
* possible attackers and disable sack.
@@ -9130,39 +10785,74 @@ rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_
/* Log detecting */
rack_log_sad(rack, 1);
- ackratio = (uint64_t)(rack->r_ctl.sack_count);
- ackratio *= (uint64_t)(1000);
- if (rack->r_ctl.ack_count)
- ackratio /= (uint64_t)(rack->r_ctl.ack_count);
- else {
- /* We really should not hit here */
- ackratio = 1000;
+ /* Do we establish a ack ratio */
+ if ((rack->r_ctl.sack_count > tcp_map_minimum) ||
+ (rack->rc_suspicious == 1) ||
+ (rack->sack_attack_disable > 0)) {
+ ackratio = (uint64_t)(rack->r_ctl.sack_count);
+ ackratio *= (uint64_t)(1000);
+ if (rack->r_ctl.ack_count)
+ ackratio /= (uint64_t)(rack->r_ctl.ack_count);
+ else {
+ /* We can hit this due to ack total degradation (via small sacks) */
+ ackratio = 1000;
+ }
+ } else {
+ /*
+ * No ack ratio needed if we have not
+ * seen more sacks than the number of map entries.
+ * The exception to that is if we have disabled sack then
+ * we need to find a ratio.
+ */
+ ackratio = 0;
}
+
if ((rack->sack_attack_disable == 0) &&
(ackratio > rack_highest_sack_thresh_seen))
rack_highest_sack_thresh_seen = (uint32_t)ackratio;
- movetotal = rack->r_ctl.sack_moved_extra;
- movetotal += rack->r_ctl.sack_noextra_move;
- moveratio = rack->r_ctl.sack_moved_extra;
- moveratio *= (uint64_t)1000;
- if (movetotal)
- moveratio /= movetotal;
- else {
- /* No moves, thats pretty good */
+ /* Do we establish a move ratio? */
+ if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) ||
+ (rack->rc_suspicious == 1) ||
+ (rack->sack_attack_disable > 0)) {
+ /*
+ * We need to have more sack moves than maps
+ * allocated to have a move ratio considered.
+ */
+ movetotal = rack->r_ctl.sack_moved_extra;
+ movetotal += rack->r_ctl.sack_noextra_move;
+ moveratio = rack->r_ctl.sack_moved_extra;
+ moveratio *= (uint64_t)1000;
+ if (movetotal)
+ moveratio /= movetotal;
+ else {
+ /* No moves, thats pretty good */
+ moveratio = 0;
+ }
+ } else {
+ /*
+ * Not enough moves have occurred to consider
+ * if we are out of whack in that ratio.
+ * The exception to that is if we have disabled sack then
+ * we need to find a ratio.
+ */
moveratio = 0;
}
if ((rack->sack_attack_disable == 0) &&
(moveratio > rack_highest_move_thresh_seen))
rack_highest_move_thresh_seen = (uint32_t)moveratio;
+ /* Now the tests */
if (rack->sack_attack_disable == 0) {
+ /* Not disabled, do we need to disable? */
if ((ackratio > tcp_sack_to_ack_thresh) &&
(moveratio > tcp_sack_to_move_thresh)) {
/* Disable sack processing */
+ tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED);
rack->sack_attack_disable = 1;
- if (rack->r_rep_attack == 0) {
- rack->r_rep_attack = 1;
- counter_u64_add(rack_sack_attacks_detected, 1);
- }
+ /* set it so we have the built in delay */
+ rack->r_ctl.ack_during_sd = 1;
+ if (rack_merge_out_sacks_on_attack)
+ rack_merge_out_sacks(rack);
+ counter_u64_add(rack_sack_attacks_detected, 1);
tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED);
/* Clamp the cwnd at flight size */
rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
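Both ratios used in the detection logic above are expressed in parts per thousand, and detection only trips when both exceed their sysctl thresholds. A standalone sketch of that arithmetic (parameter names are illustrative):

#include <stdint.h>

/*
 * Returns non-zero when both the sack/ack ratio and the extra-move
 * ratio (each in parts per thousand) exceed their thresholds.
 */
static int
sad_should_disable(uint64_t sacks, uint64_t acks,
    uint64_t moved_extra, uint64_t no_extra_moves,
    uint64_t ack_thresh, uint64_t move_thresh)
{
	uint64_t ackratio, moveratio, movetotal;

	ackratio = acks ? (sacks * 1000) / acks : 1000;
	movetotal = moved_extra + no_extra_moves;
	moveratio = movetotal ? (moved_extra * 1000) / movetotal : 0;
	return ((ackratio > ack_thresh) && (moveratio > move_thresh));
}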
@@ -9172,20 +10862,20 @@ rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_
} else {
/* We are sack-disabled check for false positives */
if ((ackratio <= tcp_restoral_thresh) ||
- (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
+ ((rack_merge_out_sacks_on_attack == 0) &&
+ (rack->rc_suspicious == 0) &&
+ (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) {
rack->sack_attack_disable = 0;
rack_log_sad(rack, 3);
/* Restart counting */
rack->r_ctl.sack_count = 0;
rack->r_ctl.sack_moved_extra = 0;
rack->r_ctl.sack_noextra_move = 1;
+ rack->rc_suspicious = 0;
rack->r_ctl.ack_count = max(1,
- (bytes_this_ack / segsiz));
+ (bytes_this_ack / segsiz));
- if (rack->r_rep_reverse == 0) {
- rack->r_rep_reverse = 1;
- counter_u64_add(rack_sack_attacks_reversed, 1);
- }
+ counter_u64_add(rack_sack_attacks_reversed, 1);
/* Restore the cwnd */
if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
@@ -9339,7 +11029,8 @@ rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_s
}
static void
-rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
+rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck,
+ int *dsack_seen, int *sacks_seen)
{
uint32_t changed;
struct tcp_rack *rack;
@@ -9348,8 +11039,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
register uint32_t th_ack;
int32_t i, j, k, num_sack_blks = 0;
uint32_t cts, acked, ack_point;
- int loop_start = 0, moved_two = 0;
+ int loop_start = 0, moved_two = 0, no_extra = 0;
uint32_t tsused;
+ uint32_t segsiz, o_cnt;
INP_WLOCK_ASSERT(tptoinpcb(tp));
@@ -9359,12 +11051,13 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
}
rack = (struct tcp_rack *)tp->t_fb_ptr;
cts = tcp_get_usecs(NULL);
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_min(rack->r_ctl.tqh);
changed = 0;
th_ack = th->th_ack;
if (rack->sack_attack_disable == 0)
rack_do_decay(rack);
- if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
+ segsiz = ctf_fixed_maxseg(rack->rc_tp);
+ if (BYTES_THIS_ACK(tp, th) >= segsiz) {
/*
* You only get credit for
* MSS and greater (and you get extra
@@ -9391,7 +11084,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
if (rsm && SEQ_GT(th_ack, rsm->r_start))
changed = th_ack - rsm->r_start;
if (changed) {
- rack_process_to_cumack(tp, rack, th_ack, cts, to);
+ rack_process_to_cumack(tp, rack, th_ack, cts, to,
+ tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
}
if ((to->to_flags & TOF_SACK) == 0) {
/* We are done nothing left and no sack. */
@@ -9427,6 +11121,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
SEQ_LEQ(sack.end, th_ack)) {
int was_tlp;
+ if (dsack_seen != NULL)
+ *dsack_seen = 1;
was_tlp = rack_note_dsack(rack, sack.start, sack.end);
/*
* Its a D-SACK block.
@@ -9446,13 +11142,52 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered
* Sort the SACK blocks so we can update the rack scoreboard with
* just one pass.
*/
+ o_cnt = num_sack_blks;
num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
num_sack_blks, th->th_ack);
ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
+ if (sacks_seen != NULL)
+ *sacks_seen = num_sack_blks;
if (num_sack_blks == 0) {
- /* Nothing to sack (DSACKs?) */
+ /* Nothing to sack, but we need to update counts */
+ if ((o_cnt == 1) &&
+ (*dsack_seen != 1))
+ rack->r_ctl.sack_count++;
+ else if (o_cnt > 1)
+ rack->r_ctl.sack_count++;
goto out_with_totals;
}
+ if (rack->sack_attack_disable) {
+ /*
+ * An attacker disablement is in place, for
+ * every sack block that is not at least a full MSS
+ * count up sack_count.
+ */
+ for (i = 0; i < num_sack_blks; i++) {
+ if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) {
+ rack->r_ctl.sack_count++;
+ }
+ if (rack->r_ctl.sack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ }
+ goto out;
+ }
+ /* Its a sack of some sort */
+ rack->r_ctl.sack_count += num_sack_blks;
+ if (rack->r_ctl.sack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
if (num_sack_blks < 2) {
/* Only one, we don't need to sort */
goto do_sack_work;
@@ -9520,7 +11255,7 @@ do_sack_work:
* We probably did the FR and the next
* SACK in continues as we would expect.
*/
- acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
+ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz);
if (acked) {
rack->r_wanted_output = 1;
changed += acked;
@@ -9547,43 +11282,42 @@ do_sack_work:
rack->r_ctl.ack_count /= 2;
rack->r_ctl.sack_count /= 2;
}
+ if (moved_two) {
+ /*
+ * If we did not get a SACK for at least a MSS and
+ * had to move at all, or if we moved more than our
+ * threshold, it counts against the "extra" move.
+ */
+ rack->r_ctl.sack_moved_extra += moved_two;
+ rack->r_ctl.sack_noextra_move += no_extra;
+ counter_u64_add(rack_move_some, 1);
+ } else {
+ /*
+ * else we did not have to move
+ * any more than we would expect.
+ */
+ rack->r_ctl.sack_noextra_move += no_extra;
+ rack->r_ctl.sack_noextra_move++;
+ counter_u64_add(rack_move_none, 1);
+ }
+ if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
+ (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
+ rack->r_ctl.sack_moved_extra /= 2;
+ rack->r_ctl.sack_noextra_move /= 2;
+ }
goto out_with_totals;
} else {
/*
* Start the loop through the
* rest of blocks, past the first block.
*/
- moved_two = 0;
loop_start = 1;
}
}
- /* Its a sack of some sort */
- rack->r_ctl.sack_count++;
- if (rack->r_ctl.sack_count > 0xfff00000) {
- /*
- * reduce the number to keep us under
- * a uint32_t.
- */
- rack->r_ctl.ack_count /= 2;
- rack->r_ctl.sack_count /= 2;
- }
counter_u64_add(rack_sack_total, 1);
- if (rack->sack_attack_disable) {
- /* An attacker disablement is in place */
- if (num_sack_blks > 1) {
- rack->r_ctl.sack_count += (num_sack_blks - 1);
- rack->r_ctl.sack_moved_extra++;
- counter_u64_add(rack_move_some, 1);
- if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
- rack->r_ctl.sack_moved_extra /= 2;
- rack->r_ctl.sack_noextra_move /= 2;
- }
- }
- goto out;
- }
rsm = rack->r_ctl.rc_sacklast;
for (i = loop_start; i < num_sack_blks; i++) {
- acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
+ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz);
if (acked) {
rack->r_wanted_output = 1;
changed += acked;
@@ -9595,15 +11329,22 @@ do_sack_work:
* threshold, it counts against the "extra" move.
*/
rack->r_ctl.sack_moved_extra += moved_two;
+ rack->r_ctl.sack_noextra_move += no_extra;
counter_u64_add(rack_move_some, 1);
} else {
/*
* else we did not have to move
* any more than we would expect.
*/
+ rack->r_ctl.sack_noextra_move += no_extra;
rack->r_ctl.sack_noextra_move++;
counter_u64_add(rack_move_none, 1);
}
+ if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
+ (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
+ rack->r_ctl.sack_moved_extra /= 2;
+ rack->r_ctl.sack_noextra_move /= 2;
+ }
if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
/*
* If the SACK was not a full MSS then
@@ -9612,6 +11353,10 @@ do_sack_work:
* a MSS if its a TSO send) we had to skip by.
*/
rack->r_ctl.sack_count += moved_two;
+ if (rack->r_ctl.sack_count > 0xfff00000) {
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
counter_u64_add(rack_sack_total, moved_two);
}
/*
@@ -9621,16 +11366,8 @@ do_sack_work:
* the various counts, and then clear out
* moved_two.
*/
- if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
- (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
- rack->r_ctl.sack_moved_extra /= 2;
- rack->r_ctl.sack_noextra_move /= 2;
- }
- if (rack->r_ctl.sack_count > 0xfff00000) {
- rack->r_ctl.ack_count /= 2;
- rack->r_ctl.sack_count /= 2;
- }
moved_two = 0;
+ no_extra = 0;
}
out_with_totals:
if (num_sack_blks > 1) {
@@ -9707,21 +11444,17 @@ rack_strike_dupack(struct tcp_rack *rack)
struct rack_sendmap *rsm;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
- while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
- rsm = TAILQ_NEXT(rsm, r_tnext);
- if (rsm->r_flags & RACK_MUST_RXT) {
- /* Sendmap entries that are marked to
- * be retransmitted do not need dupack's
- * struck. We get these marks for a number
- * of reasons (rxt timeout with no sack,
- * mtu change, or rwnd collapses). When
- * these events occur, we know we must retransmit
- * them and mark the sendmap entries. Dupack counting
- * is not needed since we are already set to retransmit
- * it as soon as we can.
- */
+ while (rsm) {
+ /*
+ * We need to skip anything already set
+ * to be retransmitted.
+ */
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ (rsm->r_flags & RACK_MUST_RXT)) {
+ rsm = TAILQ_NEXT(rsm, r_tnext);
continue;
}
+ break;
}
if (rsm && (rsm->r_dupack < 0xff)) {
rsm->r_dupack++;
@@ -9755,13 +11488,12 @@ rack_strike_dupack(struct tcp_rack *rack)
static void
rack_check_bottom_drag(struct tcpcb *tp,
struct tcp_rack *rack,
- struct socket *so, int32_t acked)
+ struct socket *so)
{
uint32_t segsiz, minseg;
segsiz = ctf_fixed_maxseg(tp);
minseg = segsiz;
-
if (tp->snd_max == tp->snd_una) {
/*
* We are doing dynamic pacing and we are way
@@ -9772,43 +11504,19 @@ rack_check_bottom_drag(struct tcpcb *tp,
* addition to any earlier boosting of
* the multiplier.
*/
+ uint64_t lt_bw;
+
+ lt_bw = rack_get_lt_bw(rack);
rack->rc_dragged_bottom = 1;
rack_validate_multipliers_at_or_above100(rack);
- /*
- * Lets use the segment bytes acked plus
- * the lowest RTT seen as the basis to
- * form a b/w estimate. This will be off
- * due to the fact that the true estimate
- * should be around 1/2 the time of the RTT
- * but we can settle for that.
- */
if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
- acked) {
- uint64_t bw, calc_bw, rtt;
-
- rtt = rack->r_ctl.rack_rs.rs_us_rtt;
- if (rtt == 0) {
- /* no us sample is there a ms one? */
- if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
- rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
- } else {
- goto no_measurement;
- }
- }
- bw = acked;
- calc_bw = bw * 1000000;
- calc_bw /= rtt;
- if (rack->r_ctl.last_max_bw &&
- (rack->r_ctl.last_max_bw < calc_bw)) {
- /*
- * If we have a last calculated max bw
- * enforce it.
- */
- calc_bw = rack->r_ctl.last_max_bw;
- }
- /* now plop it in */
+ (lt_bw > 0)) {
+ /*
+ * Lets use the long-term b/w we have
+ * been getting as a base.
+ */
if (rack->rc_gp_filled == 0) {
- if (calc_bw > ONE_POINT_TWO_MEG) {
+ if (lt_bw > ONE_POINT_TWO_MEG) {
/*
* If we have no measurement
* don't let us set in more than
@@ -9817,19 +11525,19 @@ rack_check_bottom_drag(struct tcpcb *tp,
* will hopefully have a max b/w
* available to sanity check things.
*/
- calc_bw = ONE_POINT_TWO_MEG;
+ lt_bw = ONE_POINT_TWO_MEG;
}
rack->r_ctl.rc_rtt_diff = 0;
- rack->r_ctl.gp_bw = calc_bw;
+ rack->r_ctl.gp_bw = lt_bw;
rack->rc_gp_filled = 1;
if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
rack->r_ctl.num_measurements = RACK_REQ_AVG;
rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
- } else if (calc_bw > rack->r_ctl.gp_bw) {
+ } else if (lt_bw > rack->r_ctl.gp_bw) {
rack->r_ctl.rc_rtt_diff = 0;
if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
rack->r_ctl.num_measurements = RACK_REQ_AVG;
- rack->r_ctl.gp_bw = calc_bw;
+ rack->r_ctl.gp_bw = lt_bw;
rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
} else
rack_increase_bw_mul(rack, -1, 0, 0, 1);
@@ -9837,21 +11545,16 @@ rack_check_bottom_drag(struct tcpcb *tp,
(rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
/* We have enough measurements now */
rack->gp_ready = 1;
- rack_set_cc_pacing(rack);
+ if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) ||
+ rack->rack_hibeta)
+ rack_set_cc_pacing(rack);
if (rack->defer_options)
rack_apply_deferred_options(rack);
}
- /*
- * For acks over 1mss we do a extra boost to simulate
- * where we would get 2 acks (we want 110 for the mul).
- */
- if (acked > segsiz)
- rack_increase_bw_mul(rack, -1, 0, 0, 1);
} else {
/*
* zero rtt possibly?, settle for just an old increase.
*/
-no_measurement:
rack_increase_bw_mul(rack, -1, 0, 0, 1);
}
} else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
@@ -9875,7 +11578,225 @@ no_measurement:
}
}
+#ifdef TCP_REQUEST_TRK
+static void
+rack_log_hybrid(struct tcp_rack *rack, uint32_t seq,
+ struct http_sendfile_track *cur, uint8_t mod, int line, int err)
+{
+ int do_log;
+ do_log = tcp_bblogging_on(rack->rc_tp);
+ if (do_log == 0) {
+ if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) == 0)
+ return;
+ /* We only allow the three below with point logging on */
+ if ((mod != HYBRID_LOG_RULES_APP) &&
+ (mod != HYBRID_LOG_RULES_SET) &&
+ (mod != HYBRID_LOG_REQ_COMP))
+ return;
+
+ }
+ if (do_log) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ /* Convert our ms to a microsecond */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = seq;
+ log.u_bbr.cwnd_gain = line;
+ if (cur != NULL) {
+ uint64_t off;
+
+ log.u_bbr.flex2 = cur->start_seq;
+ log.u_bbr.flex3 = cur->end_seq;
+ log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff);
+ log.u_bbr.flex6 = cur->flags;
+ log.u_bbr.pkts_out = cur->hybrid_flags;
+ log.u_bbr.rttProp = cur->timestamp;
+ log.u_bbr.cur_del_rate = cur->cspr;
+ log.u_bbr.bw_inuse = cur->start;
+ log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff);
+ log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ;
+ log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
+ log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ;
+ log.u_bbr.bbr_state = 1;
+ off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]);
+ log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track));
+ } else {
+ log.u_bbr.flex2 = err;
+ }
+ /*
+ * Fill in flex7 to be CHD (catchup|hybrid|DGP)
+ */
+ log.u_bbr.flex7 = rack->rc_catch_up;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->rc_hybrid_mode;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->dgp_on;
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
+ log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start;
+ log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error;
+ log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop;
+ tcp_log_event(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_HYBRID_PACING_LOG, 0,
+ 0, &log, false, NULL, __func__, __LINE__, &tv);
+ }
+}
+#endif
+
+#ifdef TCP_REQUEST_TRK
+static void
+rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len)
+{
+ struct http_sendfile_track *rc_cur;
+ struct tcpcb *tp;
+ int err = 0;
+
+ rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, seq);
+ if (rc_cur == NULL) {
+ /* If not in the beginning what about the end piece */
+ rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
+ rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, (seq + len - 1));
+ } else {
+ err = 12345;
+ }
+ /* If we find no parameters we are in straight DGP mode */
+ if(rc_cur == NULL) {
+ /* None found for this seq, just DGP for now */
+ rack->r_ctl.client_suggested_maxseg = 0;
+ rack->rc_catch_up = 0;
+ rack->r_ctl.bw_rate_cap = 0;
+ rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
+ if (rack->r_ctl.rc_last_sft) {
+ rack->r_ctl.rc_last_sft = NULL;
+ }
+ return;
+ }
+ /*
+ * Ok if we have a new entry *or* have never
+ * set up an entry we need to proceed. If
+ * we have already set it up this entry we
+ * just continue along with what we already
+ * setup.
+ */
+ tp = rack->rc_tp;
+ if ((rack->r_ctl.rc_last_sft != NULL) &&
+ (rack->r_ctl.rc_last_sft == rc_cur)) {
+ /* Its already in place */
+ rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0);
+ return;
+ }
+ if (rack->rc_hybrid_mode == 0) {
+ rack->r_ctl.rc_last_sft = rc_cur;
+ rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
+ return;
+ }
+ if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){
+ /* Compensate for all the header overhead's */
+ rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
+ } else
+ rack->r_ctl.bw_rate_cap = 0;
+ if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
+ rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
+ else
+ rack->r_ctl.client_suggested_maxseg = 0;
+ if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
+ (rc_cur->cspr > 0)) {
+ uint64_t len;
+
+ rack->rc_catch_up = 1;
+ /*
+ * Calculate the deadline time, first set the
+ * time to when the request arrived.
+ */
+ rc_cur->deadline = rc_cur->localtime;
+ /*
+ * Next calculate the length and compensate for
+ * TLS if need be.
+ */
+ len = rc_cur->end - rc_cur->start;
+ if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) {
+ /*
+ * This session is doing TLS. Take a swag guess
+ * at the overhead.
+ */
+ len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len);
+ }
+ /*
+ * Now considering the size, and the cspr, what is the time that
+ * would be required at the cspr rate. Here we use the raw
+ * cspr value since the client only looks at the raw data. We
+ * do use len which includes TLS overhead, but not the TCP/IP etc.
+ * That will get made up for in the CU pacing rate set.
+ */
+ len *= HPTS_USEC_IN_SEC;
+ len /= rc_cur->cspr;
+ rc_cur->deadline += len;
+ } else {
+ rack->rc_catch_up = 0;
+ rc_cur->deadline = 0;
+ }
+ if (rack->r_ctl.client_suggested_maxseg != 0) {
+ /*
+ * We need to reset the max pace segs if we have a
+ * client_suggested_maxseg.
+ */
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
+ }
+ rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
+ /* Remember it for next time and for CU mode */
+ rack->r_ctl.rc_last_sft = rc_cur;
+}
+#endif
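A minimal sketch of the catch-up deadline math above, assuming cspr is a client-supplied rate in bytes per second: the deadline is the request arrival time plus the time the remaining bytes (optionally inflated for TLS overhead) would take at that rate. USEC_PER_SEC stands in for HPTS_USEC_IN_SEC:

#include <stdint.h>

#define	USEC_PER_SEC	1000000ULL

/* Compute the catch-up deadline for a hybrid-paced request. */
static uint64_t
catchup_deadline(uint64_t arrival_usec, uint64_t nbytes,
    uint64_t tls_overhead, uint64_t cspr_bps)
{
	uint64_t usecs;

	if (cspr_bps == 0)
		return (0);	/* no rate, no deadline */
	usecs = (nbytes + tls_overhead) * USEC_PER_SEC / cspr_bps;
	return (arrival_usec + usecs);
}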
+
+static void
+rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
+{
+#ifdef TCP_REQUEST_TRK
+ struct http_sendfile_track *ent;
+
+ ent = rack->r_ctl.rc_last_sft;
+ if ((ent == NULL) ||
+ (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) ||
+ (SEQ_GEQ(seq, ent->end_seq))) {
+ /* Time to update the track. */
+ rack_set_dgp_hybrid_mode(rack, seq, len);
+ ent = rack->r_ctl.rc_last_sft;
+ }
+ /* Out of all */
+ if (ent == NULL) {
+ return;
+ }
+ if (SEQ_LT(ent->end_seq, (seq + len))) {
+ /*
+ * This is the case where our end_seq guess
+ * was wrong. This is usually due to TLS having
+ * more bytes than our guess. It could also be the
+ * case that the client sent in two requests closely
+ * and the SB is full of both so we are sending part
+ * of each (end|beg). In such a case let's move this
+ * guy's end to match the end of this send. That
+ * way it will complete when all of it is acked.
+ */
+ ent->end_seq = (seq + len);
+ rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent);
+ }
+ /* Now validate we have set the send time of this one */
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_FSND) == 0) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_FSND;
+ ent->first_send = cts;
+ ent->sent_at_fs = rack->rc_tp->t_sndbytes;
+ ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
+ }
+#endif
+}
static void
rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
@@ -9918,7 +11839,7 @@ rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket
}
static void
-rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
+rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb)
{
/*
* Here any sendmap entry that points to the
@@ -9928,7 +11849,7 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
* 2) snd_una adjusted to its new position.
*
* Note that (2) implies rack_ack_received has also
- * been called.
+ * been called and all the sbcut's have been done.
*
* We grab the first mbuf in the socket buffer and
* then go through the front of the sendmap, recalculating
@@ -9939,17 +11860,30 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
* not be a penalty though, since we just referenced the sb
* to go in and trim off the mbufs that we freed (of course
* there will be a penalty for the sendmap references though).
+ *
+ * Note also with INVARIANT on, we validate with a KASSERT
+ * that the first sendmap entry has a soff of 0.
+ *
*/
struct mbuf *m;
struct rack_sendmap *rsm;
+ tcp_seq snd_una;
+#ifdef INVARIANTS
+ int first_processed = 0;
+#endif
+ snd_una = rack->rc_tp->snd_una;
SOCKBUF_LOCK_ASSERT(sb);
m = sb->sb_mb;
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_min(rack->r_ctl.tqh);
if ((rsm == NULL) || (m == NULL)) {
/* Nothing outstanding */
return;
}
+ /* The very first RSM's mbuf must point to the head mbuf in the sb */
+ KASSERT((rsm->m == m),
+ ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb",
+ rack, sb, rsm));
while (rsm->m && (rsm->m == m)) {
/* one to adjust */
#ifdef INVARIANTS
@@ -9957,10 +11891,17 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
uint32_t soff;
tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
- if (rsm->orig_m_len != m->m_len) {
+ if ((rsm->orig_m_len != m->m_len) ||
+ (rsm->orig_t_space != M_TRAILINGROOM(m))){
rack_adjust_orig_mlen(rsm);
}
- if (rsm->soff != soff) {
+ if (first_processed == 0) {
+ KASSERT((rsm->soff == 0),
+ ("Rack:%p rsm:%p -- rsm at head but soff not zero",
+ rack, rsm));
+ first_processed = 1;
+ }
+ if ((rsm->soff != soff) || (rsm->m != tm)) {
/*
* This is not a fatal error, we anticipate it
* might happen (the else code), so we count it here
@@ -9971,24 +11912,104 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
}
rsm->m = tm;
rsm->soff = soff;
- if (tm)
+ if (tm) {
rsm->orig_m_len = rsm->m->m_len;
- else
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ } else {
rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ }
#else
rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
- if (rsm->m)
+ if (rsm->m) {
rsm->orig_m_len = rsm->m->m_len;
- else
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ } else {
rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ }
#endif
- rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
- rsm);
+ rsm = tqhash_next(rack->r_ctl.tqh, rsm);
if (rsm == NULL)
break;
}
}
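A rough sketch, with hypothetical types and a stand-in for sbsndmbuf(), of the head resync performed above: walk the front of the sendmap while entries still reference the socket buffer's head mbuf and recompute each entry's mbuf/offset pair from its sequence distance to snd_una:

#include <stdint.h>

struct ent {
	struct ent	*next;
	void		*m;		/* mbuf the entry's data starts in */
	uint32_t	soff;		/* offset of r_start within that mbuf */
	uint32_t	r_start;	/* starting sequence number */
};

/* Stand-in for sbsndmbuf(): map a send-buffer byte offset to an mbuf
 * pointer and an offset within it. */
void	*find_mbuf_for_offset(void *sb, uint32_t off, uint32_t *soff);

static void
resync_front(struct ent *e, void *sb, void *sb_head, uint32_t snd_una)
{
	while (e != NULL && e->m == sb_head) {
		e->m = find_mbuf_for_offset(sb, e->r_start - snd_una, &e->soff);
		e = e->next;
	}
}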
+#ifdef TCP_REQUEST_TRK
+static inline void
+rack_http_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack)
+{
+ struct http_sendfile_track *ent;
+ int i;
+
+ if ((rack->rc_hybrid_mode == 0) &&
+ (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) {
+ /*
+ * Just do normal completions; hybrid pacing is not on
+ * and CLDL is off as well.
+ */
+ tcp_http_check_for_comp(rack->rc_tp, th_ack);
+ return;
+ }
+ /*
+ * Originally I was just going to find the th_ack associated
+ * with an entry. But then I realized a large stretch ack could
+ * in theory ack two or more requests at once. So instead we
+ * need to find all entries that are completed by th_ack not
+ * just a single entry and do our logging.
+ */
+ ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
+ while (ent != NULL) {
+ /*
+ * We may be doing hybrid pacing or CLDL and need more details possibly
+ * so we do it manually instead of calling
+ * tcp_http_check_for_comp()
+ */
+ uint64_t laa, tim, data, cbw, ftim;
+
+ /* Ok this ack frees it */
+ rack_log_hybrid(rack, th_ack,
+ ent, HYBRID_LOG_REQ_COMP, __LINE__, 0);
+ /* calculate the time based on the ack arrival */
+ data = ent->end - ent->start;
+ laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ if (ent->flags & TCP_HTTP_TRACK_FLG_FSND) {
+ if (ent->first_send > ent->localtime)
+ ftim = ent->first_send;
+ else
+ ftim = ent->localtime;
+ } else {
+ /* TSNH */
+ ftim = ent->localtime;
+ }
+ if (laa > ent->localtime)
+ tim = laa - ftim;
+ else
+ tim = 0;
+ cbw = data * HPTS_USEC_IN_SEC;
+ if (tim > 0)
+ cbw /= tim;
+ else
+ cbw = 0;
+ rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent);
+ /*
+ * Check to see if we are freeing what we are pointing to send-wise;
+ * if so be sure to NULL the pointer so we know we are no longer
+ * set to anything.
+ */
+ if (ent == rack->r_ctl.rc_last_sft)
+ rack->r_ctl.rc_last_sft = NULL;
+ /* Generate the log that the tcp_netflix call would have */
+ tcp_http_log_req_info(rack->rc_tp, ent,
+ i, TCP_HTTP_REQ_LOG_FREED, 0, 0);
+ /* Free it and see if there is another one */
+ tcp_http_free_a_slot(rack->rc_tp, ent);
+ ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
+ }
+}
+#endif
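A compact sketch of the per-request goodput computation above: bytes delivered for the completed request divided by the elapsed time from first send to the completing ack, scaled to bytes per second (times in microseconds):

#include <stdint.h>

/* Bytes-per-second goodput for a completed request. */
static uint64_t
request_goodput(uint64_t start_off, uint64_t end_off,
    uint64_t first_send_usec, uint64_t ack_arrival_usec)
{
	uint64_t data, tim;

	data = end_off - start_off;
	tim = (ack_arrival_usec > first_send_usec) ?
	    (ack_arrival_usec - first_send_usec) : 0;
	return (tim ? (data * 1000000ULL) / tim : 0);
}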
+
+
/*
* Return value of 1, we do not need to call rack_process_data().
* return value of 0, rack_process_data can be called.
@@ -10025,6 +12046,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
int in_rec, dup_ack_struck = 0;
+ int dsack_seen = 0, sacks_seen = 0;
in_rec = IN_FASTRECOVERY(tp->t_flags);
if (rack->rc_in_persist) {
@@ -10032,13 +12054,33 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
}
+
if ((th->th_ack == tp->snd_una) &&
(tiwin == tp->snd_wnd) &&
((to->to_flags & TOF_SACK) == 0)) {
rack_strike_dupack(rack);
dup_ack_struck = 1;
}
- rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
+ rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
+ dup_ack_struck, &dsack_seen, &sacks_seen);
+ if ((rack->sack_attack_disable > 0) &&
+ (th->th_ack == tp->snd_una) &&
+ (tiwin == tp->snd_wnd) &&
+ (dsack_seen == 0) &&
+ (sacks_seen > 0)) {
+ /*
+ * If sacks have been disabled we may
+ * want to strike a dup-ack "ignoring" the
+ * sack as long as the sack was not a "dsack". Note
+ * that if no sack is sent (TOF_SACK is off) then the
+ * normal dsack code above rack_log_ack() would have
+ * already struck. So this is just to catch the case
+ * where we are ignoring sacks from this guy due to
+ * it being a suspected attacker.
+ */
+ rack_strike_dupack(rack);
+ }
+
}
if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
/*
@@ -10048,6 +12090,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
return (0);
}
@@ -10110,8 +12154,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
-#ifdef NETFLIX_HTTP_LOGGING
- tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
+#ifdef TCP_REQUEST_TRK
+ rack_http_check_for_comp(rack, th->th_ack);
#endif
}
/*
@@ -10169,11 +12213,19 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
ourfinisacked = 1;
}
tp->snd_una = th->th_ack;
+ /* wakeups? */
if (acked_amount && sbavail(&so->so_snd))
- rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ rack_adjust_sendmap_head(rack, &so->so_snd);
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
+ /* now check the rxt clamps */
+ if ((recovery == 1) &&
+ (rack->excess_rxt_on) &&
+ (rack->r_cwnd_was_clamped == 0)) {
+ do_rack_excess_rxt(tp, rack);
+ } else if (rack->r_cwnd_was_clamped)
+ do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -10187,7 +12239,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack->rc_gp_dyn_mul &&
rack->rc_always_pace) {
/* Check if we are dragging bottom */
- rack_check_bottom_drag(tp, rack, so, acked);
+ rack_check_bottom_drag(tp, rack, so);
}
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
@@ -10201,6 +12253,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (sbavail(&tptosocket(tp)->so_snd) == 0)
tp->t_acktime = 0;
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ rack->rc_suspicious = 0;
/* Set need output so persist might get set */
rack->r_wanted_output = 1;
sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
@@ -10261,7 +12314,7 @@ rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t
}
static void
-rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
+rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line)
{
/*
* Here all we do is mark the collapsed point and set the flag.
@@ -10271,28 +12324,26 @@ rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
*/
tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
if ((rack->rc_has_collapsed == 0) ||
- (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
+ (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd)))
counter_u64_add(rack_collapsed_win_seen, 1);
- rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
+ rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd;
rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
rack->rc_has_collapsed = 1;
rack->r_collapse_point_valid = 1;
- rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
+ rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
}
static void
rack_un_collapse_window(struct tcp_rack *rack, int line)
{
- struct rack_sendmap *nrsm, *rsm, fe;
+ struct rack_sendmap *nrsm, *rsm;
int cnt = 0, split = 0;
-#ifdef INVARIANTS
- struct rack_sendmap *insret;
-#endif
+ int insret __diagused;
+
- memset(&fe, 0, sizeof(fe));
+ tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
rack->rc_has_collapsed = 0;
- fe.r_start = rack->r_ctl.last_collapse_point;
- rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
if (rsm == NULL) {
/* Nothing to do maybe the peer ack'ed it all */
rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
@@ -10312,11 +12363,10 @@ rack_un_collapse_window(struct tcp_rack *rack, int line)
split = 1;
rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
- if (insret != NULL) {
- panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
+ panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
nrsm, insret, rack, rsm);
}
#endif
@@ -10332,8 +12382,10 @@ rack_un_collapse_window(struct tcp_rack *rack, int line)
*/
rsm = nrsm;
}
+
no_split:
- RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
+ TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) {
+ cnt++;
nrsm->r_flags |= RACK_RWND_COLLAPSED;
rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
cnt++;
@@ -10349,24 +12401,12 @@ rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
int32_t tlen, int32_t tfo_syn)
{
if (DELAY_ACK(tp, tlen) || tfo_syn) {
- if (rack->rc_dack_mode &&
- (tlen > 500) &&
- (rack->rc_dack_toggle == 1)) {
- goto no_delayed_ack;
- }
rack_timer_cancel(tp, rack,
rack->r_ctl.rc_rcvtime, __LINE__);
tp->t_flags |= TF_DELACK;
} else {
-no_delayed_ack:
rack->r_wanted_output = 1;
tp->t_flags |= TF_ACKNOW;
- if (rack->rc_dack_mode) {
- if (tp->t_flags & TF_DELACK)
- rack->rc_dack_toggle = 1;
- else
- rack->rc_dack_toggle = 0;
- }
}
}
@@ -10451,7 +12491,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
if (tp->snd_wnd < ctf_outstanding(tp))
/* The peer collapsed the window */
- rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
+ rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
@@ -10479,7 +12519,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing is outstanding, and there is
* data to send. Enter persists.
*/
- rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
}
if (tp->t_flags2 & TF2_DROP_AF_DATA) {
m_freem(m);
@@ -10593,7 +12633,6 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
* DSACK actually handled in the fastpath
* above.
*/
- RACK_OPTS_INC(tcp_sack_path_1);
tcp_update_sack_list(tp, save_start,
save_start + save_tlen);
} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
@@ -10603,22 +12642,18 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Partial overlap, recorded at todrop
* above.
*/
- RACK_OPTS_INC(tcp_sack_path_2a);
tcp_update_sack_list(tp,
tp->sackblks[0].start,
tp->sackblks[0].end);
} else {
- RACK_OPTS_INC(tcp_sack_path_2b);
tcp_update_dsack_list(tp, save_start,
save_start + save_tlen);
}
} else if (tlen >= save_tlen) {
/* Update of sackblks. */
- RACK_OPTS_INC(tcp_sack_path_3);
tcp_update_dsack_list(tp, save_start,
save_start + save_tlen);
} else if (tlen > 0) {
- RACK_OPTS_INC(tcp_sack_path_4);
tcp_update_dsack_list(tp, save_start,
save_start + tlen);
}
@@ -10882,7 +12917,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
under_pacing = 1;
}
nsegs = max(1, m->m_pkthdr.lro_nsegs);
- rack_log_ack(tp, to, th, 0, 0);
+ rack_log_ack(tp, to, th, 0, 0, NULL, NULL);
/* Did the window get updated? */
if (tiwin != tp->snd_wnd) {
tp->snd_wnd = tiwin;
@@ -10910,7 +12945,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing is outstanding, and there is
* data to send. Enter persists.
*/
- rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack);
}
/*
* If last ACK falls within this segment's sequence numbers, record
@@ -10960,7 +12995,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
mfree = sbcut_locked(&so->so_snd, acked);
tp->snd_una = th->th_ack;
/* Note we want to hold the sb lock through the sendmap adjust */
- rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ rack_adjust_sendmap_head(rack, &so->so_snd);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
sowwakeup_locked(so);
@@ -10976,8 +13011,9 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
-#ifdef NETFLIX_HTTP_LOGGING
- tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
+
+#ifdef TCP_REQUEST_TRK
+ rack_http_check_for_comp(rack, th->th_ack);
#endif
}
/*
@@ -10987,7 +13023,7 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (tp->snd_wnd < ctf_outstanding(tp)) {
/* The peer collapsed the window */
- rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
+ rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
} else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
@@ -11014,11 +13050,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack->rc_gp_dyn_mul &&
rack->rc_always_pace) {
/* Check if we are dragging bottom */
- rack_check_bottom_drag(tp, rack, so, acked);
+ rack_check_bottom_drag(tp, rack, so);
}
if (tp->snd_una == tp->snd_max) {
tp->t_flags &= ~TF_PREVVALID;
rack->r_ctl.retran_during_recovery = 0;
+ rack->rc_suspicious = 0;
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
@@ -11125,7 +13162,6 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
rack->r_wanted_output = 1;
tp->t_flags |= TF_ACKNOW;
- rack->rc_dack_toggle = 0;
}
tcp_ecn_input_syn_sent(tp, thflags, iptos);
@@ -12195,6 +14231,15 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u
int chged = 0;
uint32_t user_max, orig_min, orig_max;
+#ifdef TCP_REQUEST_TRK
+ if (rack->rc_hybrid_mode &&
+ (rack->r_ctl.rc_pace_max_segs != 0) &&
+ (rack_hybrid_allow_set_maxseg == 1) &&
+ (rack->r_ctl.rc_last_sft != NULL)) {
+ rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS;
+ return;
+ }
+#endif
orig_min = rack->r_ctl.rc_pace_min_segs;
orig_max = rack->r_ctl.rc_pace_max_segs;
user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
@@ -12214,13 +14259,18 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u
rack->r_ctl.rc_pace_max_segs = user_max;
} else {
/* We are pacing right at the hardware rate */
- uint32_t segsiz;
+ uint32_t segsiz, pace_one;
+ if (rack_pace_one_seg ||
+ (rack->r_ctl.rc_user_set_min_segs == 1))
+ pace_one = 1;
+ else
+ pace_one = 0;
segsiz = min(ctf_fixed_maxseg(tp),
rack->r_ctl.rc_pace_min_segs);
- rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
- tp, bw_est, segsiz, 0,
- rack->r_ctl.crte, NULL);
+ rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(
+ tp, bw_est, segsiz, pace_one,
+ rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
}
} else if (rack->rc_always_pace) {
if (rack->r_ctl.gp_bw ||
@@ -12266,7 +14316,7 @@ rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, u
static void
-rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
+rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags)
{
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
@@ -12296,6 +14346,7 @@ rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
tcpip_fillheaders(rack->rc_inp,
tp->t_port,
ip6, rack->r_ctl.fsb.th);
+ rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL);
} else
#endif /* INET6 */
#ifdef INET
@@ -12317,8 +14368,11 @@ rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
tcpip_fillheaders(rack->rc_inp,
tp->t_port,
ip, rack->r_ctl.fsb.th);
+ rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl;
}
#endif
+ rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0),
+ (long)TCP_MAXWIN << tp->rcv_scale);
rack->r_fsb_inited = 1;
}
@@ -12343,17 +14397,485 @@ rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
return (0);
}
+static void
+rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod)
+{
+ /*
+ * Types of logs (mod value)
+ * 20 - Initial round setup
+ * 21 - Rack declares a new round.
+ */
+ struct tcpcb *tp;
+
+ tp = rack->rc_tp;
+ if (tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = rack->r_ctl.roundends;
+ log.u_bbr.flex3 = high_seq;
+ log.u_bbr.flex4 = tp->snd_max;
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
+ log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ TCP_HYSTART, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static void
+rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ rack->rack_deferred_inited = 1;
+ rack->r_ctl.roundends = tp->snd_max;
+ rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
+ rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
+}
+
+static void
+rack_init_retransmit_value(struct tcp_rack *rack, int ctl)
+{
+ /* Retransmit bit controls.
+ *
+ * The setting of these values controls one of
+ * three settings you can have and dictates
+ * how rack does retransmissions. Note this
+ * is in *any* mode, i.e. pacing on or off, DGP,
+ * fixed rate pacing, or just bursting rack.
+ *
+ * 1 - Use full sized retransmits i.e. limit
+ * the size to whatever the pace_max_segments
+ * size is.
+ *
+ * 2 - Use pacer min granularity as a guide to
+ * the size combined with the current calculated
+ * goodput b/w measurement. So for example if
+ * the goodput is measured at 20Mbps we would
+ * calculate 8125 (pacer minimum 250usec in
+ * that b/w) and then round it up to the next
+ * MSS i.e. for 1448 mss 6 MSS or 8688 bytes.
+ *
+ * 0 - The rack default of 1 MSS (anything other than 1 or 2
+ * falls here too if we are setting via rack_init()).
+ *
+ */
+ if (ctl == 1) {
+ rack->full_size_rxt = 1;
+ rack->shape_rxt_to_pacing_min = 0;
+ } else if (ctl == 2) {
+ rack->full_size_rxt = 0;
+ rack->shape_rxt_to_pacing_min = 1;
+ } else {
+ rack->full_size_rxt = 0;
+ rack->shape_rxt_to_pacing_min = 0;
+ }
+}
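/*
 * Editor's sketch (not part of the patch): the mode-2 sizing described in
 * the comment above, shown as standalone arithmetic.  The helper name, its
 * parameters and the 250 usec / 100 Mb/s figures are illustrative
 * assumptions only; the real sizing lives elsewhere in rack.c.
 */
#include <stdint.h>

static uint32_t
rxt_size_for_pacing_min(uint64_t gp_bw_Bps, uint32_t pace_min_usec, uint32_t mss)
{
	uint64_t bytes;

	/* Bytes that fit in one pacer-minimum interval at the goodput rate. */
	bytes = (gp_bw_Bps * pace_min_usec) / 1000000;
	if (bytes < mss)		/* never retransmit less than one MSS */
		return (mss);
	/* Round up to the next full MSS. */
	return ((uint32_t)(((bytes + mss - 1) / mss) * mss));
}

/*
 * Example: at 12,500,000 bytes/sec (100 Mb/s) with a 250 usec pacer minimum,
 * 3125 bytes fit in one interval, which rounds up to 3 MSS (4344 bytes) for
 * a 1448-byte MSS.
 */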
+
+static void
+rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod,
+ uint32_t flex1,
+ uint32_t flex2,
+ uint32_t flex3)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0,
+ 0, &log, false, NULL, __func__, __LINE__, &tv);
+ }
+}
+
static int
-rack_init(struct tcpcb *tp, void **ptr)
+rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr)
+{
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ int i;
+
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ switch (reqr->req) {
+ case TCP_QUERY_SENDMAP:
+ if ((reqr->req_param == tp->snd_max) ||
+ (tp->snd_max == tp->snd_una)){
+ /* Unlikely */
+ return (0);
+ }
+ rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param);
+ if (rsm == NULL) {
+ /* Can't find that seq -- unlikely */
+ return (0);
+ }
+ reqr->sendmap_start = rsm->r_start;
+ reqr->sendmap_end = rsm->r_end;
+ reqr->sendmap_send_cnt = rsm->r_rtr_cnt;
+ reqr->sendmap_fas = rsm->r_fas;
+ if (reqr->sendmap_send_cnt > SNDMAP_NRTX)
+ reqr->sendmap_send_cnt = SNDMAP_NRTX;
+ for(i=0; i<reqr->sendmap_send_cnt; i++)
+ reqr->sendmap_time[i] = rsm->r_tim_lastsent[i];
+ reqr->sendmap_ack_arrival = rsm->r_ack_arrival;
+ reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK;
+ reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes;
+ reqr->sendmap_dupacks = rsm->r_dupack;
+ rack_log_chg_info(tp, rack, 1,
+ rsm->r_start,
+ rsm->r_end,
+ rsm->r_flags);
+ return(1);
+ break;
+ case TCP_QUERY_TIMERS_UP:
+ if (rack->r_ctl.rc_hpts_flags == 0) {
+ /* no timers up */
+ return (0);
+ }
+ reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags;
+ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to;
+ }
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp;
+ }
+ rack_log_chg_info(tp, rack, 2,
+ rack->r_ctl.rc_hpts_flags,
+ rack->r_ctl.rc_last_output_to,
+ rack->r_ctl.rc_timer_exp);
+ return (1);
+ break;
+ case TCP_QUERY_RACK_TIMES:
+ /* Reordering items */
+ reqr->rack_num_dsacks = rack->r_ctl.num_dsack;
+ reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts;
+ /* Timestamps and timers */
+ reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time;
+ reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt;
+ reqr->rack_rtt = rack->rc_rack_rtt;
+ reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time;
+ reqr->rack_srtt_measured = rack->rc_srtt_measure_made;
+ /* PRR data */
+ reqr->rack_sacked = rack->r_ctl.rc_sacked;
+ reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt;
+ reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered;
+ reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs;
+ reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt;
+ reqr->rack_prr_out = rack->r_ctl.rc_prr_out;
+ /* TLP and persists info */
+ reqr->rack_tlp_out = rack->rc_tlp_in_progress;
+ reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out;
+ if (rack->rc_in_persist) {
+ reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time;
+ reqr->rack_in_persist = 1;
+ } else {
+ reqr->rack_time_went_idle = 0;
+ reqr->rack_in_persist = 0;
+ }
+ if (rack->r_wanted_output)
+ reqr->rack_wanted_output = 1;
+ else
+ reqr->rack_wanted_output = 0;
+ return (1);
+ break;
+ default:
+ return (-EINVAL);
+ }
+}
+
+static void
+rack_switch_failed(struct tcpcb *tp)
{
+ /*
+ * This method gets called if a stack switch was
+ * attempted and it failed. We are left in place,
+ * but our hpts timers were stopped and we
+ * need to validate time units and inp_flags2.
+ */
struct inpcb *inp = tptoinpcb(tp);
- struct tcp_rack *rack = NULL;
+ struct tcp_rack *rack;
+ struct timeval tv;
+ uint32_t cts;
+ uint32_t toval;
+ struct hpts_diag diag;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
+ if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
+ inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ else
+ inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ if (inp->inp_in_hpts) {
+ /* Strange */
+ return;
+ }
+ cts = tcp_get_usecs(&tv);
+ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
+ toval = rack->r_ctl.rc_last_output_to - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
+ toval = rack->r_ctl.rc_timer_exp - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else
+ toval = HPTS_TICKS_PER_SLOT;
+ (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ __LINE__, &diag);
+ rack_log_hpts_diag(rack, cts, &diag, &tv);
+}
+
+static int
+rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr)
+{
+ struct rack_sendmap *rsm, *ersm;
+ int insret __diagused;
+ /*
+ * When initing outstanding, we must be quite careful
+ * to not refer to tp->t_fb_ptr. This has the old rack
+ * pointer in it, not the "new" one (when we are doing
+ * a stack switch).
+ */
+
+
+ if (tp->t_fb->tfb_chg_query == NULL) {
+ /* Create a send map for the current outstanding data */
+
+ rsm = rack_alloc(rack);
+ if (rsm == NULL) {
+ uma_zfree(rack_pcb_zone, ptr);
+ return (ENOMEM);
+ }
+ rsm->r_no_rtt_allowed = 1;
+ rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
+ rsm->r_rtr_cnt = 1;
+ rsm->r_rtr_bytes = 0;
+ if (tp->t_flags & TF_SENTFIN)
+ rsm->r_flags |= RACK_HAS_FIN;
+ rsm->r_end = tp->snd_max;
+ if (tp->snd_una == tp->iss) {
+ /* The data space is one beyond snd_una */
+ rsm->r_flags |= RACK_HAS_SYN;
+ rsm->r_start = tp->iss;
+ rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
+ } else
+ rsm->r_start = tp->snd_una;
+ rsm->r_dupack = 0;
+ if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
+ rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
+ if (rsm->m) {
+ rsm->orig_m_len = rsm->m->m_len;
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ } else {
+ rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ }
+ } else {
+ /*
+ * This can happen if we have a stand-alone FIN or
+ * SYN.
+ */
+ rsm->m = NULL;
+ rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ rsm->soff = 0;
+ }
#ifdef INVARIANTS
- struct rack_sendmap *insret;
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
+ panic("Insert in rb tree fails ret:%d rack:%p rsm:%p",
+ insret, rack, rsm);
+ }
+#else
+ (void)tqhash_insert(rack->r_ctl.tqh, rsm);
#endif
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ } else {
+ /* We have a query mechanism, lets use it */
+ struct tcp_query_resp qr;
+ int i;
+ tcp_seq at;
+
+ at = tp->snd_una;
+ while (at != tp->snd_max) {
+ memset(&qr, 0, sizeof(qr));
+ qr.req = TCP_QUERY_SENDMAP;
+ qr.req_param = at;
+ if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
+ break;
+ /* Move forward */
+ at = qr.sendmap_end;
+ /* Now lets build the entry for this one */
+ rsm = rack_alloc(rack);
+ if (rsm == NULL) {
+ uma_zfree(rack_pcb_zone, ptr);
+ return (ENOMEM);
+ }
+ memset(rsm, 0, sizeof(struct rack_sendmap));
+ /* Now configure the rsm and insert it */
+ rsm->r_dupack = qr.sendmap_dupacks;
+ rsm->r_start = qr.sendmap_start;
+ rsm->r_end = qr.sendmap_end;
+ if (qr.sendmap_fas)
+ rsm->r_fas = qr.sendmap_end;
+ else
+ rsm->r_fas = rsm->r_start - tp->snd_una;
+ /*
+ * We have carefully aligned the bits
+ * so that all we have to do is copy over
+ * the bits with the mask.
+ */
+ rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK;
+ rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes;
+ rsm->r_rtr_cnt = qr.sendmap_send_cnt;
+ rsm->r_ack_arrival = qr.sendmap_ack_arrival;
+ for (i=0 ; i<rsm->r_rtr_cnt; i++)
+ rsm->r_tim_lastsent[i] = qr.sendmap_time[i];
+ rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
+ (rsm->r_start - tp->snd_una), &rsm->soff);
+ if (rsm->m) {
+ rsm->orig_m_len = rsm->m->m_len;
+ rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
+ } else {
+ rsm->orig_m_len = 0;
+ rsm->orig_t_space = 0;
+ }
+#ifdef INVARIANTS
+ if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
+ panic("Insert in rb tree fails ret:%d rack:%p rsm:%p",
+ insret, rack, rsm);
+ }
+#else
+ (void)tqhash_insert(rack->r_ctl.tqh, rsm);
+#endif
+ if ((rsm->r_flags & RACK_ACKED) == 0) {
+ TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) {
+ if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] >
+ rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) {
+ /*
+ * If the existing ersm was sent at
+ * a later time than the new one, then
+ * the new one should appear ahead of this
+ * ersm.
+ */
+ rsm->r_in_tmap = 1;
+ TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext);
+ break;
+ }
+ }
+ if (rsm->r_in_tmap == 0) {
+ /*
+ * Not found so shove it on the tail.
+ */
+ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 1;
+ }
+ } else {
+ if ((rack->r_ctl.rc_sacklast == NULL) ||
+ (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) {
+ rack->r_ctl.rc_sacklast = rsm;
+ }
+ }
+ rack_log_chg_info(tp, rack, 3,
+ rsm->r_start,
+ rsm->r_end,
+ rsm->r_flags);
+ }
+ }
+ return (0);
+}
+
+static void
+rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval)
+{
+ /*
+ * P = percent bits
+ * F = fill cw bit -- Toggle fillcw if this bit is set.
+ * S = Segment bits
+ * M = set max segment bit
+ * U = Unclaimed (unused)
+ * C = If set to non-zero, sets the max number of clamps.
+ * L = Bit to indicate if clamped gets lower.
+ *
+ * CCCC CCCC UUUU UULF PPPP PPPP PPPP PPPP
+ *
+ * The lowest 16 bits are the percentage, 0.1 - 6553.5%,
+ * where 10.1% = 101, max 6553.5%.
+ * The upper 16 bits hold some options.
+ * The F bit will turn fill-cw on if you are
+ * not pacing; it will turn it off if dgp is on.
+ * The L bit will change it so when clamped we get
+ * the min(gp, lt-bw) for dgp.
+ */
+ uint16_t per;
+
+ rack->r_ctl.saved_rxt_clamp_val = optval;
+ per = optval & 0x0000ffff;
+ rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff);
+ if (optval > 0) {
+ uint16_t clamp_opt;
+
+ rack->excess_rxt_on = 1;
+ clamp_opt = ((optval & 0xffff0000) >> 16);
+ rack->r_ctl.clamp_options = clamp_opt & 0x00ff;
+ if (clamp_opt & 0xff00) {
+ /* A max clamps is also present */
+ rack->r_ctl.max_clamps = (clamp_opt >> 8);
+ } else {
+ /* No specified clamps means no limit */
+ rack->r_ctl.max_clamps = 0;
+ }
+ if (rack->r_ctl.clamp_options & 0x0002) {
+ rack->r_clamped_gets_lower = 1;
+ } else {
+ rack->r_clamped_gets_lower = 0;
+ }
+ } else {
+ /* Turn it off back to default */
+ rack->excess_rxt_on = 0;
+ rack->r_clamped_gets_lower = 0;
+ }
+
+}
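/*
 * Editor's sketch (not part of the patch): decoding one example value with
 * the layout documented above, mirroring the masks used by
 * rack_translate_clamp_value().  The value 0x03020065 and the helper names
 * are made up for illustration only.
 *
 *   optval = 0x03020065
 *     P (bits  0-15) = 0x0065 = 101 -> rxt threshold of 10.1%
 *     F (bit     16) = 0            -> no fill-cw toggle requested
 *     L (bit     17) = 1            -> clamped cwnd uses min(gp, lt-bw)
 *     C (bits 24-31) = 0x03         -> at most 3 clamps applied
 */
#include <stdint.h>

struct clamp_decode {
	uint16_t per_tenths;	/* threshold in tenths of a percent */
	uint8_t  options;	/* F (0x01), L (0x02) and unused bits */
	uint8_t  max_clamps;	/* 0 means no limit */
};

static struct clamp_decode
decode_clamp_optval(uint32_t optval)
{
	struct clamp_decode d;
	uint16_t clamp_opt;

	d.per_tenths = optval & 0x0000ffff;
	clamp_opt = (optval & 0xffff0000) >> 16;
	d.options = clamp_opt & 0x00ff;
	d.max_clamps = clamp_opt >> 8;
	return (d);
}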
+
+
+static int32_t
+rack_init(struct tcpcb *tp, void **ptr)
+{
+ struct inpcb *inp = tptoinpcb(tp);
+ struct tcp_rack *rack = NULL;
uint32_t iwin, snt, us_cts;
- int err;
+ int err, no_query;
+ /*
+ * First, are we the initial stack or a switched-in stack?
+ * If we are initing via tcp_newtcppcb the ptr passed
+ * will be &tp->t_fb_ptr. If it is a stack switch that
+ * has a previous stack we can query, ptr will be a local
+ * var that will in the end be set into t_fb_ptr.
+ */
+ if (ptr == &tp->t_fb_ptr)
+ no_query = 1;
+ else
+ no_query = 0;
*ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
if (*ptr == NULL) {
/*
@@ -12362,12 +14884,16 @@ rack_init(struct tcpcb *tp, void **ptr)
* scheme to drop the locks fails :(
*
*/
- return (ENOMEM);
+ return(ENOMEM);
}
- memset(ptr, 0, sizeof(struct tcp_rack));
-
- rack = (struct tcp_rack *)ptr;
- RB_INIT(&rack->r_ctl.rc_mtree);
+ memset(*ptr, 0, sizeof(struct tcp_rack));
+ rack = (struct tcp_rack *)*ptr;
+ rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT);
+ if (rack->r_ctl.tqh == NULL) {
+ uma_zfree(rack_pcb_zone, rack);
+ return(ENOMEM);
+ }
+ tqhash_init(rack->r_ctl.tqh);
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
rack->rc_tp = tp;
@@ -12387,7 +14913,11 @@ rack_init(struct tcpcb *tp, void **ptr)
* rc_pacing_cc_set. That way whenever we turn off pacing
* or switch off this stack, we will know to go restore
* the saved values.
+ *
+ * We specifically put the ecn beta value into the beta for pacing.
*/
+ rack->rc_new_rnd_needed = 1;
+ rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
/* We want abe like behavior as well */
@@ -12395,9 +14925,21 @@ rack_init(struct tcpcb *tp, void **ptr)
rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
- rack->r_ctl.roundends = tp->snd_max;
+ if (rack_rxt_clamp_thresh) {
+ rack_translate_clamp_value(rack, rack_rxt_clamp_thresh);
+ rack->excess_rxt_on = 1;
+ }
+ if (rack_uses_full_dgp_in_rec)
+ rack->r_ctl.full_dgp_in_rec = 1;
+ if (rack_fill_cw_state)
+ rack->rc_pace_to_cwnd = 1;
+ if (rack_pacing_min_seg)
+ rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg;
if (use_rack_rr)
rack->use_rack_rr = 1;
+ if (rack_dnd_default) {
+ rack->rc_pace_dnd = 1;
+ }
if (V_tcp_delack_enabled)
tp->t_delayed_ack = 1;
else
@@ -12407,13 +14949,16 @@ rack_init(struct tcpcb *tp, void **ptr)
tp->t_flags2 |= TF2_TCP_ACCOUNTING;
}
#endif
+ rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
+ rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
if (rack_enable_shared_cwnd)
rack->rack_enable_scwnd = 1;
+ rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
rack->rc_user_set_max_segs = rack_hptsi_segments;
rack->rc_force_max_seg = 0;
- if (rack_use_imac_dack)
- rack->rc_dack_mode = 1;
TAILQ_INIT(&rack->r_ctl.opt_list);
+ if (rack_hibeta_setting)
+ rack->rack_hibeta = 1;
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
@@ -12429,7 +14974,7 @@ rack_init(struct tcpcb *tp, void **ptr)
rack->rc_gp_no_rec_chg = 1;
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
+ if ((rack->gp_ready) && (rack->rc_always_pace && (rack->use_fixed_rate == 0)))
rack_set_cc_pacing(rack);
} else
rack->rc_always_pace = 0;
@@ -12437,18 +14982,13 @@ rack_init(struct tcpcb *tp, void **ptr)
rack->r_mbuf_queue = 1;
else
rack->r_mbuf_queue = 0;
- if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
- inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- else
- inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
rack_set_pace_segments(tp, rack, __LINE__, NULL);
if (rack_limits_scwnd)
rack->r_limit_scw = 1;
else
rack->r_limit_scw = 0;
+ rack_init_retransmit_value(rack, rack_rxt_controls);
rack->rc_labc = V_tcp_abc_l_var;
- rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
- rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
@@ -12493,70 +15033,103 @@ rack_init(struct tcpcb *tp, void **ptr)
rack->rack_hdw_pace_ena = 1;
if (rack_hw_rate_caps)
rack->r_rack_hw_rate_caps = 1;
- /* Do we force on detection? */
#ifdef NETFLIX_EXP_DETECTION
- if (tcp_force_detection)
- rack->do_detection = 1;
- else
+ rack->do_detection = 1;
+#else
+ rack->do_detection = 0;
#endif
- rack->do_detection = 0;
if (rack_non_rxt_use_cr)
rack->rack_rec_nonrxt_use_cr = 1;
+ /* Lets setup the fsb block */
err = rack_init_fsb(tp, rack);
if (err) {
- uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
- tp->t_fb_ptr = NULL;
+ uma_zfree(rack_pcb_zone, *ptr);
+ *ptr = NULL;
return (err);
}
- if (tp->snd_una != tp->snd_max) {
- /* Create a send map for the current outstanding data */
- struct rack_sendmap *rsm;
-
- rsm = rack_alloc(rack);
- if (rsm == NULL) {
- uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
- tp->t_fb_ptr = NULL;
- return (ENOMEM);
- }
- rsm->r_no_rtt_allowed = 1;
- rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
- rsm->r_rtr_cnt = 1;
- rsm->r_rtr_bytes = 0;
- if (tp->t_flags & TF_SENTFIN)
- rsm->r_flags |= RACK_HAS_FIN;
- if ((tp->snd_una == tp->iss) &&
- !TCPS_HAVEESTABLISHED(tp->t_state))
- rsm->r_flags |= RACK_HAS_SYN;
- rsm->r_start = tp->snd_una;
- rsm->r_end = tp->snd_max;
- rsm->r_dupack = 0;
- if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
- rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
- if (rsm->m)
- rsm->orig_m_len = rsm->m->m_len;
- else
- rsm->orig_m_len = 0;
- } else {
+ if (rack_do_hystart) {
+ tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
+ if (rack_do_hystart > 1)
+ tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
+ if (rack_do_hystart > 2)
+ tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
+ }
+ /* Log what we will do with queries */
+ rack_log_chg_info(tp, rack, 7,
+ no_query, 0, 0);
+ if (rack_def_profile)
+ rack_set_profile(rack, rack_def_profile);
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ if ((tp->t_state != TCPS_CLOSED) &&
+ (tp->t_state != TCPS_TIME_WAIT)) {
+ /*
+ * We are already open, we may
+ * need to adjust a few things.
+ */
+ if (SEQ_GT(tp->snd_max, tp->iss))
+ snt = tp->snd_max - tp->iss;
+ else
+ snt = 0;
+ iwin = rc_init_window(rack);
+ if ((snt < iwin) &&
+ (no_query == 1)) {
+ /* We are not past the initial window
+ * on the first init (i.e. a stack switch
+ * has not yet occurred) so we need to make
+ * sure cwnd and ssthresh are correct.
+ */
+ if (tp->snd_cwnd < iwin)
+ tp->snd_cwnd = iwin;
/*
- * This can happen if we have a stand-alone FIN or
- * SYN.
+ * If we are within the initial window
+ * we want ssthresh to be unlimited. Setting
+ * it to the rwnd (which the default stack does
+ * and older racks) is not really a good idea
+ * since we want to be in SS and grow both the
+ * cwnd and the rwnd (via dynamic rwnd growth). If
+ * we set it to the rwnd then as the peer grows its
+ * rwnd we will be stuck in CA and never hit SS.
+ *
+ * It's far better to raise it up high (this takes the
+ * risk that there has been a loss already, probably
+ * we should have an indicator in all stacks of loss
+ * but we don't), but considering the normal use this
+ * is a risk worth taking. The consequences of not
+ * hitting SS are far worse than going one more time
+ * into it early on (before we have sent even an IW).
+ * It is highly unlikely that we will have had a loss
+ * before getting the IW out.
*/
- rsm->m = NULL;
- rsm->orig_m_len = 0;
- rsm->soff = 0;
+ tp->snd_ssthresh = 0xffffffff;
}
-#ifndef INVARIANTS
- (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#else
- insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (insret != NULL) {
- panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
- insret, rack, rsm);
+ /*
+ * Any init based on sequence numbers
+ * should be done in the deferred init path
+ * since we can be CLOSED and not have them
+ * inited when rack_init() is called. We
+ * are not closed so lets call it.
+ */
+ rack_deferred_init(tp, rack);
+ }
+ if ((tp->t_state != TCPS_CLOSED) &&
+ (tp->t_state != TCPS_TIME_WAIT) &&
+ (no_query == 0) &&
+ (tp->snd_una != tp->snd_max)) {
+ err = rack_init_outstanding(tp, rack, us_cts, *ptr);
+ if (err) {
+ *ptr = NULL;
+ return(err);
}
-#endif
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
- rsm->r_in_tmap = 1;
}
+ rack_stop_all_timers(tp, rack);
+ /* Setup all the inp_flags2 */
+ if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
+ tptoinpcb(tp)->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ else
+ tptoinpcb(tp)->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
/*
* Timers in Rack are kept in microseconds so lets
* convert any initial incoming variables
@@ -12567,57 +15140,110 @@ rack_init(struct tcpcb *tp, void **ptr)
* these to get the full precision.
*/
rack_convert_rtts(tp);
- tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
- if (rack_do_hystart) {
- tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
- if (rack_do_hystart > 1)
- tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
- if (rack_do_hystart > 2)
- tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
+ rack_log_hystart_event(rack, rack->r_ctl.roundends, 20);
+ if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) {
+ /* We do not start any timers on DROPPED connections */
+ if (tp->t_fb->tfb_chg_query == NULL) {
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
+ } else {
+ struct tcp_query_resp qr;
+ int ret;
+
+ memset(&qr, 0, sizeof(qr));
+
+ /* Get the misc time stamps and such for rack */
+ qr.req = TCP_QUERY_RACK_TIMES;
+ ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
+ if (ret == 1) {
+ rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts;
+ rack->r_ctl.num_dsack = qr.rack_num_dsacks;
+ rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time;
+ rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt;
+ rack->rc_rack_rtt = qr.rack_rtt;
+ rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time;
+ rack->r_ctl.rc_sacked = qr.rack_sacked;
+ rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt;
+ rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered;
+ rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs;
+ rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt;
+ rack->r_ctl.rc_prr_out = qr.rack_prr_out;
+ if (qr.rack_tlp_out) {
+ rack->rc_tlp_in_progress = 1;
+ rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out;
+ } else {
+ rack->rc_tlp_in_progress = 0;
+ rack->r_ctl.rc_tlp_cnt_out = 0;
+ }
+ if (qr.rack_srtt_measured)
+ rack->rc_srtt_measure_made = 1;
+ if (qr.rack_in_persist == 1) {
+ rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle;
+#ifdef NETFLIX_SHARED_CWND
+ if (rack->r_ctl.rc_scw) {
+ tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
+ rack->rack_scwnd_is_idle = 1;
+ }
+#endif
+ rack->r_ctl.persist_lost_ends = 0;
+ rack->probe_not_answered = 0;
+ rack->forced_ack = 0;
+ tp->t_rxtshift = 0;
+ rack->rc_in_persist = 1;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
+ }
+ if (qr.rack_wanted_output)
+ rack->r_wanted_output = 1;
+ rack_log_chg_info(tp, rack, 6,
+ qr.rack_min_rtt,
+ qr.rack_rtt,
+ qr.rack_reorder_ts);
+ }
+ /* Get the old stack timers */
+ qr.req_param = 0;
+ qr.req = TCP_QUERY_TIMERS_UP;
+ ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
+ if (ret) {
+ /*
+ * non-zero return means we have a timer('s)
+ * to start. Zero means no timer (no keepalive
+ * I suppose).
+ */
+ uint32_t tov = 0;
+
+ rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags;
+ if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) {
+ rack->r_ctl.rc_last_output_to = qr.timer_pacing_to;
+ if (TSTMP_GT(qr.timer_pacing_to, us_cts))
+ tov = qr.timer_pacing_to - us_cts;
+ else
+ tov = HPTS_TICKS_PER_SLOT;
+ }
+ if (qr.timer_hpts_flags & PACE_TMR_MASK) {
+ rack->r_ctl.rc_timer_exp = qr.timer_timer_exp;
+ if (tov == 0) {
+ if (TSTMP_GT(qr.timer_timer_exp, us_cts))
+ tov = qr.timer_timer_exp - us_cts;
+ else
+ tov = HPTS_TICKS_PER_SLOT;
+ }
+ }
+ rack_log_chg_info(tp, rack, 4,
+ rack->r_ctl.rc_hpts_flags,
+ rack->r_ctl.rc_last_output_to,
+ rack->r_ctl.rc_timer_exp);
+ if (tov) {
+ struct hpts_diag diag;
+
+ (void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov),
+ __LINE__, &diag);
+ rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
+ }
+ }
+ }
+ rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur,
+ __LINE__, RACK_RTTS_INIT);
}
- if (rack_def_profile)
- rack_set_profile(rack, rack_def_profile);
- /* Cancel the GP measurement in progress */
- tp->t_flags &= ~TF_GPUTINPROG;
- if (SEQ_GT(tp->snd_max, tp->iss))
- snt = tp->snd_max - tp->iss;
- else
- snt = 0;
- iwin = rc_init_window(rack);
- if (snt < iwin) {
- /* We are not past the initial window
- * so we need to make sure cwnd is
- * correct.
- */
- if (tp->snd_cwnd < iwin)
- tp->snd_cwnd = iwin;
- /*
- * If we are within the initial window
- * we want ssthresh to be unlimited. Setting
- * it to the rwnd (which the default stack does
- * and older racks) is not really a good idea
- * since we want to be in SS and grow both the
- * cwnd and the rwnd (via dynamic rwnd growth). If
- * we set it to the rwnd then as the peer grows its
- * rwnd we will be stuck in CA and never hit SS.
- *
- * Its far better to raise it up high (this takes the
- * risk that there as been a loss already, probably
- * we should have an indicator in all stacks of loss
- * but we don't), but considering the normal use this
- * is a risk worth taking. The consequences of not
- * hitting SS are far worse than going one more time
- * into it early on (before we have sent even a IW).
- * It is highly unlikely that we will have had a loss
- * before getting the IW out.
- */
- tp->snd_ssthresh = 0xffffffff;
- }
- rack_stop_all_timers(tp);
- /* Lets setup the fsb block */
- rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
- rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur,
- __LINE__, RACK_RTTS_INIT);
return (0);
}
@@ -12660,42 +15286,25 @@ rack_handoff_ok(struct tcpcb *tp)
return (EINVAL);
}
-
static void
rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
- struct inpcb *inp = tptoinpcb(tp);
if (tp->t_fb_ptr) {
+ uint32_t cnt_free = 0;
struct tcp_rack *rack;
- struct rack_sendmap *rsm, *nrsm;
-#ifdef INVARIANTS
- struct rack_sendmap *rm;
-#endif
+ struct rack_sendmap *rsm;
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- if (tp->t_in_pkt) {
- /*
- * It is unsafe to process the packets since a
- * reset may be lurking in them (its rare but it
- * can occur). If we were to find a RST, then we
- * would end up dropping the connection and the
- * INP lock, so when we return the caller (tcp_usrreq)
- * will blow up when it trys to unlock the inp.
- */
- struct mbuf *save, *m;
-
- m = tp->t_in_pkt;
- tp->t_in_pkt = NULL;
- tp->t_tail_pkt = NULL;
- while (m) {
- save = m->m_nextpkt;
- m->m_nextpkt = NULL;
- m_freem(m);
- m = save;
- }
- }
+ tcp_handle_orphaned_packets(tp);
tp->t_flags &= ~TF_FORCEDATA;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack_log_pacing_delay_calc(rack,
+ 0,
+ 0,
+ 0,
+ rack_get_gp_est(rack), /* delRate */
+ rack_get_lt_bw(rack), /* rttProp */
+ 20, __LINE__, NULL, 0);
#ifdef NETFLIX_SHARED_CWND
if (rack->r_ctl.rc_scw) {
uint32_t limit;
@@ -12715,51 +15324,6 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
rack->r_ctl.fsb.tcp_ip_hdr = NULL;
rack->r_ctl.fsb.th = NULL;
}
- /* Convert back to ticks, with */
- if (tp->t_srtt > 1) {
- uint32_t val, frac;
-
- val = USEC_2_TICKS(tp->t_srtt);
- frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
- tp->t_srtt = val << TCP_RTT_SHIFT;
- /*
- * frac is the fractional part here is left
- * over from converting to hz and shifting.
- * We need to convert this to the 5 bit
- * remainder.
- */
- if (frac) {
- if (hz == 1000) {
- frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
- } else {
- frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
- }
- tp->t_srtt += frac;
- }
- }
- if (tp->t_rttvar) {
- uint32_t val, frac;
-
- val = USEC_2_TICKS(tp->t_rttvar);
- frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
- tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
- /*
- * frac is the fractional part here is left
- * over from converting to hz and shifting.
- * We need to convert this to the 5 bit
- * remainder.
- */
- if (frac) {
- if (hz == 1000) {
- frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
- } else {
- frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
- }
- tp->t_rttvar += frac;
- }
- }
- tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
- tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
if (rack->rc_always_pace) {
tcp_decrement_paced_conn();
rack_undo_cc_pacing(rack);
@@ -12782,35 +15346,57 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
#ifdef TCP_BLACKBOX
tcp_log_flowend(tp);
#endif
- RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
-#ifndef INVARIANTS
- (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
-#else
- rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (rm != rsm) {
- panic("At fini, rack:%p rsm:%p rm:%p",
- rack, rsm, rm);
- }
-#endif
+ /*
+ * Lets take a different approach to purging: just
+ * get each one and free it like a cum-ack would, and
+ * not use a foreach loop.
+ */
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ while (rsm) {
+ tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
+ rack->r_ctl.rc_num_maps_alloced--;
uma_zfree(rack_zone, rsm);
+ rsm = tqhash_min(rack->r_ctl.tqh);
}
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
while (rsm) {
TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
+ rack->r_ctl.rc_num_maps_alloced--;
+ rack->rc_free_cnt--;
+ cnt_free++;
uma_zfree(rack_zone, rsm);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
}
+ if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
+ (tcp_bblogging_on(tp))) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex8 = 10;
+ log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced;
+ log.u_bbr.flex2 = rack->rc_free_cnt;
+ log.u_bbr.flex3 = cnt_free;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ log.u_bbr.delRate = (uint64_t)rsm;
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
+ log.u_bbr.cur_del_rate = (uint64_t)rsm;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.pkt_epoch = __LINE__;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ KASSERT((rack->r_ctl.rc_num_maps_alloced == 0),
+ ("rack:%p num_aloc:%u after freeing all?",
+ rack,
+ rack->r_ctl.rc_num_maps_alloced));
rack->rc_free_cnt = 0;
+ free(rack->r_ctl.tqh, M_TCPFSB);
+ rack->r_ctl.tqh = NULL;
uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
tp->t_fb_ptr = NULL;
}
- inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
- inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
- inp->inp_flags2 &= ~INP_MBUF_ACKCMP;
- /* Cancel the GP measurement in progress */
- tp->t_flags &= ~TF_GPUTINPROG;
- inp->inp_flags2 &= ~INP_MBUF_L_ACKS;
/* Make sure snd_nxt is correctly set */
tp->snd_nxt = tp->snd_max;
}
@@ -12836,7 +15422,6 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
rack->r_substate = rack_do_established;
break;
case TCPS_CLOSE_WAIT:
- rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_CLOSE_WAIT;
rack->r_substate = rack_do_close_wait;
break;
@@ -12856,7 +15441,6 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
rack->r_substate = rack_do_lastack;
break;
case TCPS_FIN_WAIT_2:
- rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_FIN_WAIT_2;
rack->r_substate = rack_do_fin_wait_2;
break;
@@ -12956,7 +15540,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
static void
-rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
+rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts)
{
if ((SEQ_LT(tp->snd_wl1, seq) ||
(tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
@@ -12980,6 +15564,8 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
/* Not a valid win update */
return;
}
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
/* Do we exit persists? */
if ((rack->rc_in_persist != 0) &&
(tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -12999,7 +15585,7 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
* nothing is outstanding, and there is
* data to send. Enter persists.
*/
- rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack);
}
}
@@ -13036,6 +15622,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
log.u_bbr.use_lt_bw <<= 1;
log.u_bbr.use_lt_bw |= rack->r_might_revert;
log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
+ log.u_bbr.bbr_state = rack->rc_free_cnt;
log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
@@ -13236,7 +15823,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
int win_up_req = 0;
#endif
int nsegs = 0;
- int under_pacing = 1;
+ int under_pacing = 0;
int recovery = 0;
#ifdef TCP_ACCOUNTING
sched_pin();
@@ -13244,8 +15831,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (rack->gp_ready &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
- under_pacing = 0;
- else
under_pacing = 1;
if (rack->r_state != tp->t_state)
@@ -13302,6 +15887,14 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
#endif
rack_clear_rate_sample(rack);
ae = ((mtod(m, struct tcp_ackent *)) + i);
+ if (ae->flags & TH_FIN)
+ rack_log_pacing_delay_calc(rack,
+ 0,
+ 0,
+ 0,
+ rack_get_gp_est(rack), /* delRate */
+ rack_get_lt_bw(rack), /* rttProp */
+ 20, __LINE__, NULL, 0);
/* Setup the window */
tiwin = ae->win << tp->snd_scale;
if (tiwin > rack->r_ctl.rc_high_rwnd)
@@ -13320,6 +15913,10 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
/* Case C */
ae->ack_val_set = ACK_RWND;
}
+ if (rack->sack_attack_disable > 0) {
+ rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
+ rack->r_ctl.ack_during_sd++;
+ }
rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
/* Validate timestamp */
if (ae->flags & HAS_TSTMP) {
@@ -13395,6 +15992,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
*/
if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ if (rack->r_ctl.rc_reorder_ts == 0)
+ rack->r_ctl.rc_reorder_ts = 1;
}
} else if (ae->ack_val_set == ACK_DUPACK) {
/* Case D */
@@ -13419,7 +16018,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
win_upd_ack = ae->ack;
win_seq = ae->seq;
the_win = tiwin;
- rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
+ rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
} else {
/* Case A */
if (SEQ_GT(ae->ack, tp->snd_max)) {
@@ -13439,7 +16038,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
win_upd_ack = ae->ack;
win_seq = ae->seq;
the_win = tiwin;
- rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
+ rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
}
#ifdef TCP_ACCOUNTING
/* Account for the acks */
@@ -13448,35 +16047,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
}
#endif
high_seq = ae->ack;
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = high_seq;
- log.u_bbr.flex2 = rack->r_ctl.roundends;
- log.u_bbr.flex3 = rack->r_ctl.current_round;
- log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
- log.u_bbr.flex8 = 8;
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- /*
- * The draft (v3) calls for us to use SEQ_GEQ, but that
- * causes issues when we are just going app limited. Lets
- * instead use SEQ_GT <or> where its equal but more data
- * is outstanding.
- */
- if ((SEQ_GT(high_seq, rack->r_ctl.roundends)) ||
- ((high_seq == rack->r_ctl.roundends) &&
- SEQ_GT(tp->snd_max, tp->snd_una))) {
- rack->r_ctl.current_round++;
- rack->r_ctl.roundends = tp->snd_max;
- if (CC_ALGO(tp)->newround != NULL) {
- CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
- }
- }
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
+ rack_log_hystart_event(rack, high_seq, 8);
/* Setup our act_rcv_time */
if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
ts.tv_sec = ae->timestamp / 1000000000;
@@ -13486,7 +16058,11 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
} else {
rack->r_ctl.act_rcv_time = *tv;
}
- rack_process_to_cumack(tp, rack, ae->ack, cts, to);
+ rack_process_to_cumack(tp, rack, ae->ack, cts, to,
+ tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
+#ifdef TCP_REQUEST_TRK
+ rack_http_check_for_comp(rack, high_seq);
+#endif
if (rack->rc_dsack_round_seen) {
/* Is the dsack round over? */
if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
@@ -13516,7 +16092,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
/* Tend to any collapsed window */
if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
/* The peer collapsed the window */
- rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
+ rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__);
} else if (rack->rc_has_collapsed)
rack_un_collapse_window(rack, __LINE__);
if ((rack->r_collapse_point_valid) &&
@@ -13525,6 +16101,28 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
/*
+ * The draft (v3) calls for us to use SEQ_GEQ, but that
+ * causes issues when we are just going app limited. Lets
+ * instead use SEQ_GT <or> where its equal but more data
+ * is outstanding.
+ *
+ * Also make sure we are on the last ack of a series. We
+ * have to have all the ack's processed in queue to know
+ * if there is something left outstanding.
+ *
+ */
+ if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
+ (rack->rc_new_rnd_needed == 0) &&
+ (nxt_pkt == 0)) {
+ rack_log_hystart_event(rack, high_seq, 21);
+ rack->r_ctl.current_round++;
+ /* Force the next send to setup the next round */
+ rack->rc_new_rnd_needed = 1;
+ if (CC_ALGO(tp)->newround != NULL) {
+ CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
+ }
+ }
+ /*
* Clear the probe not answered flag
* since cum-ack moved forward.
*/
@@ -13624,10 +16222,16 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
mfree = sbcut_locked(&so->so_snd, acked_amount);
tp->snd_una = high_seq;
/* Note we want to hold the sb lock through the sendmap adjust */
- rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ rack_adjust_sendmap_head(rack, &so->so_snd);
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
sowwakeup_locked(so);
+ if ((recovery == 1) &&
+ (rack->excess_rxt_on) &&
+ (rack->r_cwnd_was_clamped == 0)) {
+ do_rack_excess_rxt(tp, rack);
+ } else if (rack->r_cwnd_was_clamped)
+ do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
}
/* update progress */
@@ -13651,9 +16255,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
*/
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
-#ifdef NETFLIX_HTTP_LOGGING
- tcp_http_check_for_comp(rack->rc_tp, high_seq);
-#endif
tp->snd_wl2 = high_seq;
tp->t_dupacks = 0;
if (under_pacing &&
@@ -13662,11 +16263,12 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
rack->rc_gp_dyn_mul &&
rack->rc_always_pace) {
/* Check if we are dragging bottom */
- rack_check_bottom_drag(tp, rack, so, acked);
+ rack_check_bottom_drag(tp, rack, so);
}
if (tp->snd_una == tp->snd_max) {
tp->t_flags &= ~TF_PREVVALID;
rack->r_ctl.retran_during_recovery = 0;
+ rack->rc_suspicious = 0;
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
@@ -13916,6 +16518,15 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
* tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
*/
rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->rack_deferred_inited == 0) {
+ /*
+ * If we are the connecting socket we will
+ * hit rack_init() when no sequence numbers
+ * are set up. This makes it so we must defer
+ * some initialization. Call that now.
+ */
+ rack_deferred_init(tp, rack);
+ }
if (m->m_flags & M_ACKCMP) {
/*
* All compressed ack's are ack's by definition so
@@ -13924,6 +16535,20 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
rack->rc_ack_required = 0;
return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
}
+ thflags = tcp_get_flags(th);
+ /*
+ * If there is a RST or FIN lets dump out the bw;
+ * with a FIN the connection may go on, but we
+ * may not.
+ */
+ if ((thflags & TH_FIN) || (thflags & TH_RST))
+ rack_log_pacing_delay_calc(rack,
+ rack->r_ctl.gp_bw,
+ 0,
+ 0,
+ rack_get_gp_est(rack), /* delRate */
+ rack_get_lt_bw(rack), /* rttProp */
+ 20, __LINE__, NULL, 0);
if (m->m_flags & M_ACKCMP) {
panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
}
@@ -13931,7 +16556,6 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
ms_cts = tcp_tv_to_mssectick(tv);
nsegs = m->m_pkthdr.lro_nsegs;
counter_u64_add(rack_proc_non_comp_ack, 1);
- thflags = tcp_get_flags(th);
#ifdef TCP_ACCOUNTING
sched_pin();
if (thflags & TH_ACK)
@@ -14035,6 +16659,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
log.u_bbr.use_lt_bw <<= 1;
log.u_bbr.use_lt_bw |= rack->r_might_revert;
log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
+ log.u_bbr.bbr_state = rack->rc_free_cnt;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
log.u_bbr.flex3 = m->m_flags;
@@ -14094,6 +16719,10 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* Remove ack required flag if set, we have one */
if (thflags & TH_ACK)
rack->rc_ack_required = 0;
+ if (rack->sack_attack_disable > 0) {
+ rack->r_ctl.ack_during_sd++;
+ rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
+ }
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
way_out = 4;
retval = 0;
@@ -14274,9 +16903,16 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (rack->r_state != tp->t_state)
rack_set_state(tp, rack);
if (SEQ_GT(th->th_ack, tp->snd_una) &&
- (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
+ (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL)
kern_prefetch(rsm, &prev_state);
prev_state = rack->r_state;
+ if ((thflags & TH_RST) &&
+ ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) {
+ /* The connection will be killed by a reset, check the tracepoint */
+ tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV);
+ }
retval = (*rack->r_substate) (m, th, so,
tp, &to, drop_hdrlen,
tlen, tiwin, thflags, nxt_pkt, iptos);
@@ -14328,38 +16964,37 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (nxt_pkt == 0) {
if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
do_output_now:
- if (tcp_output(tp) < 0)
+ if (tcp_output(tp) < 0) {
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
return (1);
+ }
did_out = 1;
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
rack_free_trim(rack);
}
/* Update any rounds needed */
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = high_seq;
- log.u_bbr.flex2 = rack->r_ctl.roundends;
- log.u_bbr.flex3 = rack->r_ctl.current_round;
- log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
- log.u_bbr.flex8 = 9;
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
+ rack_log_hystart_event(rack, high_seq, 8);
/*
* The draft (v3) calls for us to use SEQ_GEQ, but that
* causes issues when we are just going app limited. Lets
* instead use SEQ_GT <or> where its equal but more data
* is outstanding.
+ *
+ * Also make sure we are on the last ack of a series. We
+ * have to have all the ack's processed in queue to know
+ * if there is something left outstanding.
*/
- if ((SEQ_GT(tp->snd_una, rack->r_ctl.roundends)) ||
- ((tp->snd_una == rack->r_ctl.roundends) && SEQ_GT(tp->snd_max, tp->snd_una))) {
+ if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
+ (rack->rc_new_rnd_needed == 0) &&
+ (nxt_pkt == 0)) {
+ rack_log_hystart_event(rack, tp->snd_una, 21);
rack->r_ctl.current_round++;
- rack->r_ctl.roundends = tp->snd_max;
+ /* Force the next send to setup the next round */
+ rack->rc_new_rnd_needed = 1;
if (CC_ALGO(tp)->newround != NULL) {
CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
}
@@ -14449,9 +17084,10 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
struct rack_sendmap *rsm = NULL;
int32_t idx;
uint32_t srtt = 0, thresh = 0, ts_low = 0;
+ int no_sack = 0;
/* Return the next guy to be re-transmitted */
- if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
+ if (tqhash_empty(rack->r_ctl.tqh)) {
return (NULL);
}
if (tp->t_flags & TF_SENTFIN) {
@@ -14471,13 +17107,16 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
return (NULL);
}
check_it:
- if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
+ if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) ||
+ (rack->sack_attack_disable > 0)) {
+ no_sack = 1;
+ }
+ if ((no_sack > 0) &&
(rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
/*
* No sack so we automatically do the 3 strikes and
* retransmit (no rack timer would be started).
*/
-
return (rsm);
}
if (rsm->r_flags & RACK_ACKED) {
@@ -14580,6 +17219,14 @@ rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
log.u_bbr.cwnd_gain <<= 1;
log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
log.u_bbr.bbr_substate = quality;
+ log.u_bbr.bbr_state = rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->r_fill_less_agg;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
+ log.u_bbr.bbr_state <<= 2;
+ log.u_bbr.bbr_state |= rack->r_pacing_discount;
+ log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -14591,7 +17238,7 @@ rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
{
- uint32_t new_tso, user_max;
+ uint32_t new_tso, user_max, pace_one;
user_max = rack->rc_user_set_max_segs * mss;
if (rack->rc_force_max_seg) {
@@ -14603,9 +17250,23 @@ rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
/* Use the user mss since we are not exactly matched */
return (user_max);
}
- new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
+ if (rack_pace_one_seg ||
+ (rack->r_ctl.rc_user_set_min_segs == 1))
+ pace_one = 1;
+ else
+ pace_one = 0;
+
+ new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss,
+ pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
if (new_tso > user_max)
new_tso = user_max;
+ if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) {
+ if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso)
+ new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss;
+ }
+ if (rack->r_ctl.rc_user_set_min_segs &&
+ ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso))
+ new_tso = rack->r_ctl.rc_user_set_min_segs * mss;
return (new_tso);
}
@@ -14630,30 +17291,40 @@ pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t se
}
/*
* first lets calculate the b/w based on the last us-rtt
- * and the sndwnd.
+	 * and the smallest send window.
*/
- fill_bw = rack->r_ctl.cwnd_to_use;
+ fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
/* Take the rwnd if its smaller */
if (fill_bw > rack->rc_tp->snd_wnd)
fill_bw = rack->rc_tp->snd_wnd;
+ /* Now lets make it into a b/w */
+ fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
+ fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
if (rack->r_fill_less_agg) {
/*
- * Now take away the inflight (this will reduce our
- * aggressiveness and yeah, if we get that much out in 1RTT
- * we will have had acks come back and still be behind).
+ * We want the average of the rate_wanted
+ * and our fill-cw calculated bw. We also want
+ * to cap any increase to be no more than
+ * X times the lt_bw (where X is the rack_bw_multipler).
*/
- fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ uint64_t lt_bw, rate;
+
+ lt_bw = rack_get_lt_bw(rack);
+ if (lt_bw > *rate_wanted)
+ rate = lt_bw;
+ else
+ rate = *rate_wanted;
+ fill_bw += rate;
+ fill_bw /= 2;
+ if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) {
+ fill_bw = rate * rack_bw_multipler;
+ }
}
- /* Now lets make it into a b/w */
- fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
- fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
/* We are below the min b/w */
if (non_paced)
*rate_wanted = fill_bw;
if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
return (slot);
- if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
- fill_bw = rack->r_ctl.bw_rate_cap;
rack->r_via_fill_cw = 1;
if (rack->r_rack_hw_rate_caps &&
(rack->r_ctl.crte != NULL)) {
@@ -14695,9 +17366,15 @@ pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t se
}
}
}
+ if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL);
+ fill_bw = rack->r_ctl.bw_rate_cap;
+ }
/*
* Ok fill_bw holds our mythical b/w to fill the cwnd
- * in a rtt, what does that time wise equate too?
+ * in an rtt (unless it was capped), what does that
+	 * time wise equate to?
*/
lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
lentim /= fill_bw;
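/*
 * Illustrative sketch (not part of the patch): the arithmetic the fill-cw
 * path above uses to turn the smaller of cwnd and rwnd into a bandwidth
 * and then into a pacing delay for len bytes.  The helper name and the
 * literal 1000000 (HPTS_USEC_IN_SEC) are assumptions for this example.
 */
static inline uint64_t
example_fill_cw_delay(uint64_t cwnd, uint64_t rwnd, uint64_t last_us_rtt,
    uint64_t len)
{
	uint64_t fill_bw;

	if (last_us_rtt == 0)
		return (0);
	/* Window we could drain in one RTT, clamped by the peer's rwnd */
	fill_bw = (cwnd < rwnd) ? cwnd : rwnd;
	/* Convert the window into bytes per second */
	fill_bw = (fill_bw * 1000000) / last_us_rtt;
	if (fill_bw == 0)
		return (0);
	/* Microseconds needed to emit len bytes at that rate */
	return ((len * 1000000) / fill_bw);
}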
@@ -14715,9 +17392,16 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
{
uint64_t srtt;
int32_t slot = 0;
+ int32_t minslot = 0;
int can_start_hw_pacing = 1;
int err;
+ int pace_one;
+ if (rack_pace_one_seg ||
+ (rack->r_ctl.rc_user_set_min_segs == 1))
+ pace_one = 1;
+ else
+ pace_one = 0;
if (rack->rc_always_pace == 0) {
/*
* We use the most optimistic possible cwnd/srtt for
@@ -14774,9 +17458,12 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
} else
rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
+ /*******************************************************/
+ /* RRS: We insert non-paced call to stats here for len */
+ /*******************************************************/
} else {
uint64_t bw_est, res, lentim, rate_wanted;
- uint32_t orig_val, segs, oh;
+ uint32_t segs, oh;
int capped = 0;
int prev_fill;
@@ -14804,6 +17491,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
*/
goto old_method;
}
+ rack_rate_cap_bw(rack, &rate_wanted, &capped);
/* We need to account for all the overheads */
segs = (len + segsiz - 1) / segsiz;
/*
@@ -14812,27 +17500,58 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* means we may be off if we are larger than 1500 bytes
* or smaller. But this just makes us more conservative.
*/
- if (rack_hw_rate_min &&
- (bw_est < rack_hw_rate_min))
- can_start_hw_pacing = 0;
- if (ETHERNET_SEGMENT_SIZE > segsiz)
- oh = ETHERNET_SEGMENT_SIZE - segsiz;
- else
- oh = 0;
+
+ oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr);
+ if (rack->r_is_v6) {
+#ifdef INET6
+ oh += sizeof(struct ip6_hdr);
+#endif
+ } else {
+#ifdef INET
+ oh += sizeof(struct ip);
+#endif
+ }
+ /* We add a fixed 14 for the ethernet header */
+ oh += 14;
segs *= oh;
lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
res = lentim / rate_wanted;
slot = (uint32_t)res;
- orig_val = rack->r_ctl.rc_pace_max_segs;
- if (rack->r_ctl.crte == NULL) {
+ if (rack_hw_rate_min &&
+ (rate_wanted < rack_hw_rate_min)) {
+ can_start_hw_pacing = 0;
+ if (rack->r_ctl.crte) {
+ /*
+ * Ok we need to release it, we
+ * have fallen too low.
+ */
+ tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
+ rack->r_ctl.crte = NULL;
+ rack->rack_attempt_hdwr_pace = 0;
+ rack->rack_hdrw_pacing = 0;
+ }
+ }
+ if (rack->r_ctl.crte &&
+ (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
/*
- * Only do this if we are not hardware pacing
- * since if we are doing hw-pacing below we will
- * set make a call after setting up or changing
- * the rate.
+ * We want more than the hardware can give us,
+ * don't start any hw pacing.
*/
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
- } else if (rack->rc_inp->inp_snd_tag == NULL) {
+ can_start_hw_pacing = 0;
+ if (rack->r_rack_hw_rate_caps == 0) {
+ /*
+ * Ok we need to release it, we
+ * want more than the card can give us and
+ * no rate cap is in place. Set it up so
+ * when we want less we can retry.
+ */
+ tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
+ rack->r_ctl.crte = NULL;
+ rack->rack_attempt_hdwr_pace = 0;
+ rack->rack_hdrw_pacing = 0;
+ }
+ }
+ if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) {
/*
* We lost our rate somehow, this can happen
* if the interface changed underneath us.
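/*
 * Worked example (illustrative only) of the per-segment overhead math in
 * the hunk above, assuming IPv4 with TCP timestamps on a 1500 byte MTU:
 * segsiz = 1448, t_maxseg = 1460, so
 *
 *	oh = (1460 - 1448) + 20 (tcp) + 20 (ip) + 14 (ether) = 66 bytes
 *
 * For len = 43440 (30 segments) at rate_wanted = 12500000 bytes/sec:
 *
 *	lentim = (43440 + 30 * 66) * 1000000 = 45420000000
 *	slot   = lentim / 12500000          ~= 3633 usec
 */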
@@ -14846,9 +17565,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
rate_wanted, bw_est, __LINE__,
0, 6);
}
- /* Did we change the TSO size, if so log it */
- if (rack->r_ctl.rc_pace_max_segs != orig_val)
- rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0);
prev_fill = rack->r_via_fill_cw;
if ((rack->rc_pace_to_cwnd) &&
(capped == 0) &&
@@ -14860,6 +17576,28 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
* fill the cwnd to the max if its not full.
*/
slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
+ /* Re-check to make sure we are not exceeding our max b/w */
+ if ((rack->r_ctl.crte != NULL) &&
+ (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
+ /*
+ * We want more than the hardware can give us,
+ * don't start any hw pacing.
+ */
+ can_start_hw_pacing = 0;
+ if (rack->r_rack_hw_rate_caps == 0) {
+ /*
+ * Ok we need to release it, we
+ * want more than the card can give us and
+ * no rate cap is in place. Set it up so
+ * when we want less we can retry.
+ */
+ tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
+ rack->r_ctl.crte = NULL;
+ rack->rack_attempt_hdwr_pace = 0;
+ rack->rack_hdrw_pacing = 0;
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
+ }
+ }
}
if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
(rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
@@ -14879,9 +17617,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
&err, &rack->r_ctl.crte_prev_rate);
if (rack->r_ctl.crte) {
rack->rack_hdrw_pacing = 1;
- rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
- 0, rack->r_ctl.crte,
- NULL);
+ rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz,
+ pace_one, rack->r_ctl.crte,
+ NULL, rack->r_ctl.pace_len_divisor);
rack_log_hdwr_pacing(rack,
rate_wanted, rack->r_ctl.crte->rate, __LINE__,
err, 0);
@@ -14933,13 +17671,16 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
goto done_w_hdwr;
}
nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
- rack->rc_tp,
- rack->rc_inp->inp_route.ro_nh->nh_ifp,
- rate_wanted,
- RS_PACING_GEQ,
- &err, &rack->r_ctl.crte_prev_rate);
+ rack->rc_tp,
+ rack->rc_inp->inp_route.ro_nh->nh_ifp,
+ rate_wanted,
+ RS_PACING_GEQ,
+ &err, &rack->r_ctl.crte_prev_rate);
if (nrte == NULL) {
- /* Lost the rate */
+ /*
+ * Lost the rate, lets drop hardware pacing
+ * period.
+ */
rack->rack_hdrw_pacing = 0;
rack->r_ctl.crte = NULL;
rack_log_hdwr_pacing(rack,
@@ -14949,10 +17690,9 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
counter_u64_add(rack_hw_pace_lost, 1);
} else if (nrte != rack->r_ctl.crte) {
rack->r_ctl.crte = nrte;
- rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
- segsiz, 0,
- rack->r_ctl.crte,
- NULL);
+ rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted,
+ segsiz, pace_one, rack->r_ctl.crte,
+ NULL, rack->r_ctl.pace_len_divisor);
rack_log_hdwr_pacing(rack,
rate_wanted, rack->r_ctl.crte->rate, __LINE__,
err, 2);
@@ -14968,17 +17708,10 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
}
}
}
- if ((rack->r_ctl.crte != NULL) &&
- (rack->r_ctl.crte->rate == rate_wanted)) {
- /*
- * We need to add a extra if the rates
- * are exactly matched. The idea is
- * we want the software to make sure the
- * queue is empty before adding more, this
- * gives us N MSS extra pace times where
- * N is our sysctl
- */
- slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
+ if (minslot && (minslot > slot)) {
+ rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim,
+ 98, __LINE__, NULL, 0);
+ slot = minslot;
}
done_w_hdwr:
if (rack_limit_time_with_srtt &&
@@ -15006,6 +17739,9 @@ done_w_hdwr:
slot = srtt;
}
}
+ /*******************************************************************/
+ /* RRS: We insert paced call to stats here for len and rate_wanted */
+ /*******************************************************************/
rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
}
if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
@@ -15033,7 +17769,6 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq startseq, uint32_t sb_offset)
{
struct rack_sendmap *my_rsm = NULL;
- struct rack_sendmap fe;
if (tp->t_state < TCPS_ESTABLISHED) {
/*
@@ -15057,6 +17792,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
}
}
tp->t_flags |= TF_GPUTINPROG;
+ rack->r_ctl.rc_gp_cumack_ts = 0;
rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
tp->gput_seq = startseq;
@@ -15067,7 +17803,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
(SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
rack->measure_saw_probe_rtt = 0;
if (rack->rc_gp_filled)
- tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ tp->gput_ts = rack->r_ctl.last_cumack_advance;
else {
/* Special case initial measurement */
struct timeval tv;
@@ -15092,9 +17828,11 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tp->gput_ack,
0,
tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
9,
__LINE__, NULL, 0);
+ rack_tend_gp_marks(tp, rack);
+ rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
return;
}
if (sb_offset) {
@@ -15102,6 +17840,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
* We are out somewhere in the sb
* can we use the already outstanding data?
*/
+
if (rack->r_ctl.rc_app_limited_cnt == 0) {
/*
* Yes first one is good and in this case
@@ -15109,7 +17848,7 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
* the last ack that arrived (no need to
* set things up when an ack comes in).
*/
- my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ my_rsm = tqhash_min(rack->r_ctl.tqh);
if ((my_rsm == NULL) ||
(my_rsm->r_rtr_cnt != 1)) {
/* retransmission? */
@@ -15137,13 +17876,11 @@ rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
* next with space i.e. over 1 MSS or the one
* after that (after the app-limited).
*/
- my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
- rack->r_ctl.rc_first_appl);
+ my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl);
if (my_rsm) {
if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
/* Have to use the next one */
- my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
- my_rsm);
+ my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
else {
/* Use after the first MSS of it is acked */
tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
@@ -15165,19 +17902,44 @@ start_set:
/*
* This one has been acked use the arrival ack time
*/
+ struct rack_sendmap *nrsm;
+
tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
rack->app_limited_needs_set = 0;
+ /*
+ * Ok in this path we need to use the r_end now
+ * since this guy is the starting ack.
+ */
+ tp->gput_seq = my_rsm->r_end;
+ /*
+ * We also need to adjust up the sendtime
+ * to the send of the next data after my_rsm.
+ */
+ nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
+ if (nrsm != NULL)
+ my_rsm = nrsm;
+ else {
+ /*
+			 * The next has not been sent, that is the
+			 * case for using the latest.
+ */
+ goto use_latest;
+ }
}
- rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
+ rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
+ rack->r_ctl.rc_gp_cumack_ts = 0;
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
tp->gput_ack,
(uint64_t)my_rsm,
tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
9,
- __LINE__, NULL, 0);
+ __LINE__, my_rsm, 0);
+ /* Now lets make sure all are marked as they should be */
+ rack_tend_gp_marks(tp, rack);
+ rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
return;
}
@@ -15191,11 +17953,11 @@ use_latest:
*/
rack->app_limited_needs_set = 1;
tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
+ rack->r_ctl.rc_gp_cumack_ts = 0;
/* Find this guy so we can pull the send time */
- fe.r_start = startseq;
- my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ my_rsm = tqhash_find(rack->r_ctl.tqh, startseq);
if (my_rsm) {
- rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
+ rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
if (my_rsm->r_flags & RACK_ACKED) {
/*
* Unlikely since its probably what was
@@ -15219,13 +17981,15 @@ use_latest:
microuptime(&tv);
rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
}
+ rack_tend_gp_marks(tp, rack);
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
tp->gput_ack,
(uint64_t)my_rsm,
tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
+ (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
9, __LINE__, NULL, 0);
+ rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
}
static inline uint32_t
@@ -15299,7 +18063,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
- len, &log, false, NULL, NULL, 0, &tv);
+ len, &log, false, NULL, __func__, __LINE__, &tv);
}
}
@@ -15442,6 +18206,7 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
* come in.
*/
fsb->o_m_len = smb->m_len;
+ fsb->o_t_len = M_TRAILINGROOM(smb);
} else {
/*
* This is the case where the next mbuf went to NULL. This
@@ -15452,6 +18217,7 @@ rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
* us send more.
*/
fsb->o_m_len = 0;
+ fsb->o_t_len = 0;
}
}
return (top);
@@ -15473,24 +18239,48 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
struct mbuf *m, *n;
int32_t soff;
- soff = rack->r_ctl.fsb.off;
m = rack->r_ctl.fsb.m;
- if (rack->r_ctl.fsb.o_m_len > m->m_len) {
+ if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) {
/*
- * The mbuf had the front of it chopped off by an ack
- * we need to adjust the soff/off by that difference.
+ * The trailing space changed, mbufs can grow
+ * at the tail but they can't shrink from
+ * it, KASSERT that. Adjust the orig_m_len to
+ * compensate for this change.
*/
- uint32_t delta;
-
- delta = rack->r_ctl.fsb.o_m_len - m->m_len;
- soff -= delta;
- } else if (rack->r_ctl.fsb.o_m_len < m->m_len) {
+ KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)),
+ ("mbuf:%p rack:%p trailing_space:%lu ots:%u oml:%u mlen:%u\n",
+ m,
+ rack,
+ M_TRAILINGROOM(m),
+ rack->r_ctl.fsb.o_t_len,
+ rack->r_ctl.fsb.o_m_len,
+ m->m_len));
+ rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m));
+ rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m);
+ }
+ if (m->m_len < rack->r_ctl.fsb.o_m_len) {
/*
- * The mbuf was expanded probably by
- * a m_compress. Just update o_m_len.
+ * Mbuf shrank, trimmed off the top by an ack, our
+ * offset changes.
*/
+ KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)),
+ ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n",
+ m, m->m_len,
+ rack, rack->r_ctl.fsb.o_m_len,
+ rack->r_ctl.fsb.off));
+
+ if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len))
+ rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len);
+ else
+ rack->r_ctl.fsb.off = 0;
rack->r_ctl.fsb.o_m_len = m->m_len;
+#ifdef INVARIANTS
+ } else if (m->m_len > rack->r_ctl.fsb.o_m_len) {
+ panic("rack:%p m:%p m_len grew outside of t_space compensation",
+ rack, m);
+#endif
}
+ soff = rack->r_ctl.fsb.off;
KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
@@ -15505,6 +18295,105 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
return (n);
}
+/* Log the buffer level */
+static void
+rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
+ int len, struct timeval *tv,
+ uint32_t cts)
+{
+ uint32_t p_rate = 0, p_queue = 0, err = 0;
+ union tcp_log_stackspecific log;
+
+#ifdef RATELIMIT
+ err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
+ err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
+#endif
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.flex1 = p_rate;
+ log.u_bbr.flex2 = p_queue;
+ log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
+ log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
+ log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
+ log.u_bbr.flex7 = 99;
+ log.u_bbr.flex8 = 0;
+ log.u_bbr.pkts_out = err;
+ log.u_bbr.delRate = rack->r_ctl.crte->rate;
+ log.u_bbr.timeStamp = cts;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
+ len, &log, false, NULL, __func__, __LINE__, tv);
+
+}
+
+static uint32_t
+rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
+ struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
+{
+ uint64_t lentime = 0;
+#ifdef RATELIMIT
+ uint32_t p_rate = 0, p_queue = 0, err;
+ union tcp_log_stackspecific log;
+ uint64_t bw;
+
+ err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
+ /* Failed or queue is zero */
+ if (err || (p_queue == 0)) {
+ lentime = 0;
+ goto out;
+ }
+ err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
+ if (err) {
+ lentime = 0;
+ goto out;
+ }
+ /*
+ * If we reach here we have some bytes in
+ * the queue. The number returned is a value
+ * between 0 and 0xffff where ffff is full
+ * and 0 is empty. So how best to make this into
+ * something usable?
+ *
+	 * The "safer" way is to take the b/w gotten
+	 * from the query (which should be our b/w rate)
+	 * and pretend that a full send (our rc_pace_max_segs)
+	 * is outstanding. We factor it so it's as if a full
+	 * number of our MSS segments, in terms of full
+	 * ethernet segments, were outstanding.
+ */
+ bw = p_rate / 8;
+ if (bw) {
+ lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
+ lentime *= ETHERNET_SEGMENT_SIZE;
+ lentime *= (uint64_t)HPTS_USEC_IN_SEC;
+ lentime /= bw;
+ } else {
+ /* TSNH -- KASSERT? */
+ lentime = 0;
+ }
+out:
+ if (tcp_bblogging_on(tp)) {
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+ log.u_bbr.flex1 = p_rate;
+ log.u_bbr.flex2 = p_queue;
+ log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
+ log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
+ log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
+ log.u_bbr.flex7 = 99;
+ log.u_bbr.flex8 = 0;
+ log.u_bbr.pkts_out = err;
+ log.u_bbr.delRate = rack->r_ctl.crte->rate;
+ log.u_bbr.cur_del_rate = lentime;
+ log.u_bbr.timeStamp = cts;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
+ len, &log, false, NULL, __func__, __LINE__,tv);
+ }
+#endif
+ return ((uint32_t)lentime);
+}
+
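/*
 * Illustrative numbers for rack_check_queue_level() above (assumed values,
 * not taken from the patch): with a 100 Mbit/s hardware rate p_rate,
 * bw = p_rate / 8 = 12500000 bytes/sec.  If rc_pace_max_segs covers 10
 * segments and the on-wire segment size is taken to be 1514 bytes, the
 * queue is treated as if 10 * 1514 = 15140 bytes were outstanding, so
 *
 *	lentime = 15140 * 1000000 / 12500000 ~= 1211 usec
 *
 * i.e. the next send is delayed as though a full burst were still queued.
 */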
static int
rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
@@ -15535,6 +18424,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
uint16_t flags;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
+ int32_t ip_sendflag = IP_NO_SND_TAG_RL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
@@ -15609,7 +18499,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
/* Establish the len to send */
if (len > max_val)
len = max_val;
- if ((tso) && (len + optlen > tp->t_maxseg)) {
+ if ((tso) && (len + optlen > segsiz)) {
uint32_t if_hw_tsomax;
int32_t max_len;
@@ -15643,6 +18533,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
if ((tso == 0) && (len > segsiz))
len = segsiz;
+ (void)tcp_get_usecs(tv);
if ((len == 0) ||
(len <= MHLEN - hdrlen - max_linkhdr)) {
goto failed;
@@ -15683,7 +18574,9 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
#endif
if (rsm->m == NULL)
goto failed;
- if (rsm->orig_m_len != rsm->m->m_len) {
+ if (rsm->m &&
+ ((rsm->orig_m_len != rsm->m->m_len) ||
+ (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
/* Fix up the orig_m_len and possibly the mbuf offset */
rack_adjust_orig_mlen(rsm);
}
@@ -15726,6 +18619,13 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
ip->ip_tos |= ect;
}
}
+ if (rack->r_ctl.crte != NULL) {
+ /* See if we can send via the hw queue */
+ slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
+ /* If there is nothing in queue (no pacing time) we can send via the hw queue */
+ if (slot == 0)
+ ip_sendflag = 0;
+ }
tcp_set_flags(th, flags);
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
#ifdef INET6
@@ -15770,10 +18670,15 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
#endif
if (tso) {
- KASSERT(len > tp->t_maxseg - optlen,
+ /*
+ * Here we use segsiz since we have no added options besides
+ * any standard timestamp options (no DSACKs or SACKS are sent
+ * via either fast-path).
+ */
+ KASSERT(len > segsiz,
("%s: len <= tso_segsz tp:%p", __func__, tp));
m->m_pkthdr.csum_flags |= CSUM_TSO;
- m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
+ m->m_pkthdr.tso_segsz = segsiz;
}
#ifdef INET6
if (rack->r_is_v6) {
@@ -15802,6 +18707,19 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
}
#endif
+ if (doing_tlp == 0) {
+ /* Set we retransmitted */
+ rack->rc_gp_saw_rec = 1;
+ } else {
+		/* It's a TLP; set ca or ss */
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ /* Set we sent in CA */
+ rack->rc_gp_saw_ca = 1;
+ } else {
+ /* Set we sent in SS */
+ rack->rc_gp_saw_ss = 1;
+ }
+ }
/* Time to copy in our header */
cpto = mtod(m, uint8_t *);
memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
@@ -15829,11 +18747,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = max_val;
- log.u_bbr.flex5 = 0;
/* Save off the early/late values */
log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
if (doing_tlp == 0)
log.u_bbr.flex8 = 1;
else
@@ -15843,17 +18761,43 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.timeStamp = cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ if (rsm && (rsm->r_rtr_cnt > 0)) {
+ /*
+ * When we have a retransmit we want to log the
+ * burst at send and flight at send from before.
+ */
+ log.u_bbr.flex5 = rsm->r_fas;
+ log.u_bbr.bbr_substate = rsm->r_bas;
+ } else {
+ /*
+ * This is currently unlikely until we do the
+ * packet pair probes but I will add it for completeness.
+ */
+ log.u_bbr.flex5 = log.u_bbr.inflight;
+ log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
+ }
log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
log.u_bbr.delivered = 0;
+ log.u_bbr.rttProp = (uint64_t)rsm;
+ log.u_bbr.delRate = rsm->r_flags;
+ log.u_bbr.delRate <<= 31;
+ log.u_bbr.delRate |= rack->r_must_retran;
+ log.u_bbr.delRate <<= 1;
+ log.u_bbr.delRate |= 1;
+ log.u_bbr.pkt_epoch = __LINE__;
lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
- len, &log, false, NULL, NULL, 0, tv);
+ len, &log, false, NULL, __func__, __LINE__, tv);
} else
lgb = NULL;
+ if ((rack->r_ctl.crte != NULL) &&
+ tcp_bblogging_on(tp)) {
+ rack_log_queue_level(tp, rack, len, tv, cts);
+ }
#ifdef INET6
if (rack->r_is_v6) {
error = ip6_output(m, NULL,
&inp->inp_route6,
- 0, NULL, NULL, inp);
+ ip_sendflag, NULL, NULL, inp);
}
else
#endif
@@ -15861,7 +18805,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
{
error = ip_output(m, NULL,
&inp->inp_route,
- 0, 0, inp);
+ ip_sendflag, 0, inp);
}
#endif
m = NULL;
@@ -15871,14 +18815,22 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
}
if (error) {
goto failed;
+ } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) {
+ rack->rc_hw_nobuf = 0;
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_early = 0;
+ rack->r_late = 0;
+ rack->r_ctl.rc_agg_early = 0;
}
+
rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
- rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls);
- if (doing_tlp && (rack->fast_rsm_hack == 0)) {
+ rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
+ if (doing_tlp) {
rack->rc_tlp_in_progress = 1;
rack->r_ctl.rc_tlp_cnt_out++;
}
if (error == 0) {
+ counter_u64_add(rack_total_bytes, len);
tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
if (doing_tlp) {
rack->rc_last_sent_tlp_past_cumack = 0;
@@ -15909,6 +18861,8 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
if (error && (error == ENOBUFS)) {
if (rack->r_ctl.crte != NULL) {
tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
+ if (tcp_bblogging_on(rack->rc_tp))
+ rack_log_queue_level(tp, rack, len, tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
@@ -15916,6 +18870,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
rack->rc_enobuf++;
if (slot < (10 * HPTS_USEC_IN_MSEC))
slot = 10 * HPTS_USEC_IN_MSEC;
+ if (rack->r_ctl.crte != NULL) {
+ counter_u64_add(rack_saw_enobuf_hw, 1);
+ tcp_rl_log_enobuf(rack->r_ctl.crte);
+ }
+ counter_u64_add(rack_saw_enobuf, 1);
} else
slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
if ((slot == 0) ||
@@ -16101,7 +19060,7 @@ again:
/* Establish the len to send */
if (len > max_val)
len = max_val;
- if ((tso) && (len + optlen > tp->t_maxseg)) {
+ if ((tso) && (len + optlen > segsiz)) {
uint32_t if_hw_tsomax;
int32_t max_len;
@@ -16135,6 +19094,7 @@ again:
}
if ((tso == 0) && (len > segsiz))
len = segsiz;
+ (void)tcp_get_usecs(tv);
if ((len == 0) ||
(len <= MHLEN - hdrlen - max_linkhdr)) {
goto failed;
@@ -16172,7 +19132,7 @@ again:
}
if (rack->r_ctl.fsb.rfo_apply_push &&
(len == rack->r_ctl.fsb.left_to_send)) {
- flags |= TH_PUSH;
+ tcp_set_flags(th, flags | TH_PUSH);
add_flag |= RACK_HAD_PUSH;
}
if ((m->m_next == NULL) || (len <= 0)){
@@ -16250,10 +19210,15 @@ again:
}
#endif
if (tso) {
- KASSERT(len > tp->t_maxseg - optlen,
+ /*
+ * Here we use segsiz since we have no added options besides
+ * any standard timestamp options (no DSACKs or SACKS are sent
+ * via either fast-path).
+ */
+ KASSERT(len > segsiz,
("%s: len <= tso_segsz tp:%p", __func__, tp));
m->m_pkthdr.csum_flags |= CSUM_TSO;
- m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
+ m->m_pkthdr.tso_segsz = segsiz;
}
#ifdef INET6
if (rack->r_is_v6) {
@@ -16282,6 +19247,13 @@ again:
}
}
#endif
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ /* Set we sent in CA */
+ rack->rc_gp_saw_ca = 1;
+ } else {
+ /* Set we sent in SS */
+ rack->rc_gp_saw_ss = 1;
+ }
/* Time to copy in our header */
cpto = mtod(m, uint8_t *);
memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
@@ -16292,6 +19264,10 @@ again:
} else {
th->th_off = sizeof(struct tcphdr) >> 2;
}
+ if ((rack->r_ctl.crte != NULL) &&
+ tcp_bblogging_on(tp)) {
+ rack_log_queue_level(tp, rack, len, tv, cts);
+ }
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
@@ -16304,21 +19280,29 @@ again:
log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = max_val;
- log.u_bbr.flex5 = 0;
/* Save off the early/late values */
log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
log.u_bbr.flex8 = 0;
log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
log.u_bbr.flex7 = 44;
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.timeStamp = cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.flex5 = log.u_bbr.inflight;
log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
log.u_bbr.delivered = 0;
+ log.u_bbr.rttProp = 0;
+ log.u_bbr.delRate = rack->r_must_retran;
+ log.u_bbr.delRate <<= 1;
+ log.u_bbr.pkt_epoch = __LINE__;
+ /* For fast output no retrans so just inflight and how many mss we send */
+ log.u_bbr.flex5 = log.u_bbr.inflight;
+ log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
- len, &log, false, NULL, NULL, 0, tv);
+ len, &log, false, NULL, __func__, __LINE__, tv);
} else
lgb = NULL;
#ifdef INET6
@@ -16346,17 +19330,29 @@ again:
*send_err = error;
m = NULL;
goto failed;
+ } else if (rack->rc_hw_nobuf) {
+ rack->rc_hw_nobuf = 0;
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_early = 0;
+ rack->r_late = 0;
+ rack->r_ctl.rc_agg_early = 0;
+ }
+ if ((error == 0) && (rack->lt_bw_up == 0)) {
+ /* Unlikely */
+ rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv);
+ rack->r_ctl.lt_seq = tp->snd_una;
+ rack->lt_bw_up = 1;
}
rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
- NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls);
+ NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
m = NULL;
if (tp->snd_una == tp->snd_max) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
tp->t_acktime = ticks;
}
- if (error == 0)
- tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
+ counter_u64_add(rack_total_bytes, len);
+ tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
rack->forced_ack = 0; /* If we send something zap the FA flag */
tot_len += len;
@@ -16364,6 +19360,15 @@ again:
rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
tp->snd_max += len;
tp->snd_nxt = tp->snd_max;
+ if (rack->rc_new_rnd_needed) {
+ /*
+		 * Update the rnd to start ticking; note that from a
+		 * time perspective all of the preceding idle time is
+		 * "in the round".
+ */
+ rack->rc_new_rnd_needed = 0;
+ rack->r_ctl.roundends = tp->snd_max;
+ }
{
int idx;
@@ -16426,16 +19431,65 @@ failed:
return (-1);
}
+static inline void
+rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack,
+ struct sockbuf *sb,
+ int len, int orig_len, int segsiz, uint32_t pace_max_seg,
+ bool hw_tls,
+ uint16_t flags)
+{
+ rack->r_fast_output = 1;
+ rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
+ rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
+ rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
+ rack->r_ctl.fsb.tcp_flags = flags;
+ rack->r_ctl.fsb.left_to_send = orig_len - len;
+ if (rack->r_ctl.fsb.left_to_send < pace_max_seg) {
+ /* Less than a full sized pace, lets not */
+ rack->r_fast_output = 0;
+ return;
+ } else {
+ /* Round down to the nearest pace_max_seg */
+ rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg);
+ }
+ if (hw_tls)
+ rack->r_ctl.fsb.hw_tls = 1;
+ else
+ rack->r_ctl.fsb.hw_tls = 0;
+ KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
+ ("rack:%p left_to_send:%u sbavail:%u out:%u",
+ rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
+ (tp->snd_max - tp->snd_una)));
+ if (rack->r_ctl.fsb.left_to_send < segsiz)
+ rack->r_fast_output = 0;
+ else {
+ if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
+ rack->r_ctl.fsb.rfo_apply_push = 1;
+ else
+ rack->r_ctl.fsb.rfo_apply_push = 0;
+ }
+}
+
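/*
 * Example of the rounddown in rack_setup_fast_output() above (numbers are
 * assumed, not from the patch): with orig_len = 65536, len = 14480 and
 * pace_max_seg = 14480, left_to_send starts at 51056 and is rounded down
 * to 3 * 14480 = 43440, so the fast-output path only claims whole
 * pacer-sized bursts and leaves the tail for the regular output path.
 */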
+static uint32_t
+rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz)
+{
+ uint64_t min_time;
+ uint32_t maxlen;
+
+ min_time = (uint64_t)get_hpts_min_sleep_time();
+ maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC);
+ maxlen = roundup(maxlen, segsiz);
+ return (maxlen);
+}
+
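/*
 * Example for rack_get_hpts_pacing_min_for_bw() above (values assumed):
 * with a 250 usec hpts minimum sleep time and gp_bw = 12500000 bytes/sec,
 * the smallest send that fills one pacer wakeup is
 * 12500000 * 250 / 1000000 = 3125 bytes, which roundup() turns into
 * 3 * 1448 = 4344 bytes for a 1448 byte segsiz.
 */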
static struct rack_sendmap *
rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
{
struct rack_sendmap *rsm = NULL;
- struct rack_sendmap fe;
int thresh;
restart:
- fe.r_start = rack->r_ctl.last_collapse_point;
- rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
/* Nothing, strange turn off validity */
rack->r_collapse_point_valid = 0;
@@ -16478,6 +19532,35 @@ restart:
return (NULL);
}
+static inline void
+rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
+{
+ if ((rack->full_size_rxt == 0) &&
+ (rack->shape_rxt_to_pacing_min == 0) &&
+ (*len >= segsiz)) {
+ *len = segsiz;
+ } else if (rack->shape_rxt_to_pacing_min &&
+ rack->gp_ready) {
+ /* We use pacing min as shaping len req */
+ uint32_t maxlen;
+
+ maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
+ if (*len > maxlen)
+ *len = maxlen;
+ } else {
+ /*
+		 * The else case is full_size_rxt on, so send it all.
+		 * Note we do need to check for exceeding our max
+		 * segment size because we sometimes merge chunks
+		 * together, i.e. we cannot just assume that we will
+		 * never have a chunk greater than pace_max_seg.
+ */
+ if (*len > pace_max_seg)
+ *len = pace_max_seg;
+ }
+}
+
static int
rack_output(struct tcpcb *tp)
{
@@ -16500,6 +19583,7 @@ rack_output(struct tcpcb *tp)
struct tcphdr *th;
uint8_t pass = 0;
uint8_t mark = 0;
+ uint8_t check_done = 0;
uint8_t wanted_cookie = 0;
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen, ulen=0;
@@ -16558,6 +19642,15 @@ rack_output(struct tcpcb *tp)
return (tcp_offload_output(tp));
}
#endif
+ if (rack->rack_deferred_inited == 0) {
+ /*
+ * If we are the connecting socket we will
+ * hit rack_init() when no sequence numbers
+ * are setup. This makes it so we must defer
+ * some initialization. Call that now.
+ */
+ rack_deferred_init(tp, rack);
+ }
/*
* For TFO connections in SYN_RECEIVED, only allow the initial
* SYN|ACK and those sent by the retransmit timer.
@@ -16603,7 +19696,7 @@ rack_output(struct tcpcb *tp)
int retval;
retval = rack_process_timers(tp, rack, cts, hpts_calling,
- &doing_tlp);
+ &doing_tlp);
if (retval != 0) {
counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
#ifdef TCP_ACCOUNTING
@@ -16752,8 +19845,11 @@ rack_output(struct tcpcb *tp)
}
idle = 0;
}
- if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED))
- rack_init_fsb_block(tp, rack);
+ if (rack_use_fsb &&
+ (rack->r_ctl.fsb.tcp_ip_hdr) &&
+ (rack->r_fsb_inited == 0) &&
+ (rack->r_state != TCPS_CLOSED))
+ rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
@@ -16811,8 +19907,7 @@ again:
__func__, __LINE__,
rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
- if (len >= segsiz)
- len = segsiz;
+ rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
} else if (rack->r_collapse_point_valid &&
((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
/*
@@ -16832,10 +19927,7 @@ again:
len = rsm->r_end - rsm->r_start;
sb_offset = rsm->r_start - tp->snd_una;
sendalot = 0;
- if ((rack->full_size_rxt == 0) &&
- (rack->shape_rxt_to_pacing_min == 0) &&
- (len >= segsiz))
- len = segsiz;
+ rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
/* We have a retransmit that takes precedence */
if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16857,13 +19949,12 @@ again:
rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
sendalot = 0;
- if (len >= segsiz)
- len = segsiz;
+ rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
if (len > 0) {
sack_rxmit = 1;
KMOD_TCPSTAT_INC(tcps_sack_rexmits);
KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
- min(len, segsiz));
+ min(len, segsiz));
}
} else if (rack->r_ctl.rc_tlpsend) {
/* Tail loss probe */
@@ -16955,6 +20046,15 @@ again:
(rack->shape_rxt_to_pacing_min == 0) &&
(len >= segsiz))
len = segsiz;
+ else if (rack->shape_rxt_to_pacing_min &&
+ rack->gp_ready) {
+ /* We use pacing min as shaping len req */
+ uint32_t maxlen;
+
+ maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
+ if (len > maxlen)
+ len = maxlen;
+ }
/*
* Delay removing the flag RACK_MUST_RXT so
* that the fastpath for retransmit will
@@ -16990,7 +20090,8 @@ again:
flags &= ~TH_FIN;
}
}
- if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
+ if (rsm && rack->r_fsb_inited &&
+ rack_use_rsm_rfo &&
((rsm->r_flags & RACK_HAS_FIN) == 0)) {
int ret;
@@ -17029,8 +20130,8 @@ again:
if (rack->r_ctl.rc_scw) {
/* First lets update and get the cwnd */
rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
- rack->r_ctl.rc_scw_index,
- tp->snd_cwnd, tp->snd_wnd, segsiz);
+ rack->r_ctl.rc_scw_index,
+ tp->snd_cwnd, tp->snd_wnd, segsiz);
}
}
#endif
@@ -17092,7 +20193,11 @@ again:
} else {
len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
}
- if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
+ if ((rack->r_ctl.crte == NULL) &&
+ IN_FASTRECOVERY(tp->t_flags) &&
+ (rack->full_size_rxt == 0) &&
+ (rack->shape_rxt_to_pacing_min == 0) &&
+ (len > segsiz)) {
/*
* For prr=off, we need to send only 1 MSS
* at a time. We do this because another sack could
@@ -17101,7 +20206,15 @@ again:
* that keeps us from sending out the retransmit.
*/
len = segsiz;
- }
+ } else if (rack->shape_rxt_to_pacing_min &&
+ rack->gp_ready) {
+ /* We use pacing min as shaping len req */
+ uint32_t maxlen;
+
+ maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
+ if (len > maxlen)
+ len = maxlen;
+			} /* The else case: full_size_rxt is on, so send it all */
} else {
uint32_t outstanding;
/*
@@ -17259,7 +20372,7 @@ again:
(TCPS_HAVEESTABLISHED(tp->t_state)) &&
(tp->snd_una == tp->snd_max) &&
(sb_offset < (int)sbavail(sb))) {
- rack_enter_persist(tp, rack, cts);
+ rack_enter_persist(tp, rack, cts, tp->snd_una);
}
} else if ((rsm == NULL) &&
(doing_tlp == 0) &&
@@ -17287,9 +20400,9 @@ again:
* Nothing out we can
* go into persists.
*/
- rack_enter_persist(tp, rack, cts);
+ rack_enter_persist(tp, rack, cts, tp->snd_una);
}
- } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
+ } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
(ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
(len < (int)(sbavail(sb) - sb_offset)) &&
(len < minseg)) {
@@ -17346,7 +20459,6 @@ again:
* larger TSO's out).
*/
len = 0;
-
}
}
@@ -17418,7 +20530,7 @@ again:
}
}
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
- (long)TCP_MAXWIN << tp->rcv_scale);
+ (long)TCP_MAXWIN << tp->rcv_scale);
/*
* Sender silly window avoidance. We transmit under the following
@@ -17519,7 +20631,7 @@ again:
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
oldwin = (tp->rcv_adv - tp->rcv_nxt);
if (adv > oldwin)
- adv -= oldwin;
+ adv -= oldwin;
else {
/* We can't increase the window */
adv = 0;
@@ -17591,6 +20703,7 @@ just_return_nolock:
(tp->rcv_numsacks == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
+ ((IN_RECOVERY(tp->t_flags)) == 0) &&
(rack->r_must_retran == 0) &&
((tp->t_flags & TF_NEEDFIN) == 0) &&
(len > 0) && (orig_len > 0) &&
@@ -17599,28 +20712,8 @@ just_return_nolock:
((optlen == 0) ||
((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
/* We can send at least one more MSS using our fsb */
-
- rack->r_fast_output = 1;
- rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
- rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
- rack->r_ctl.fsb.tcp_flags = flags;
- rack->r_ctl.fsb.left_to_send = orig_len - len;
- if (hw_tls)
- rack->r_ctl.fsb.hw_tls = 1;
- else
- rack->r_ctl.fsb.hw_tls = 0;
- KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
- ("rack:%p left_to_send:%u sbavail:%u out:%u",
- rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
- (tp->snd_max - tp->snd_una)));
- if (rack->r_ctl.fsb.left_to_send < segsiz)
- rack->r_fast_output = 0;
- else {
- if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
- rack->r_ctl.fsb.rfo_apply_push = 1;
- else
- rack->r_ctl.fsb.rfo_apply_push = 0;
- }
+ rack_setup_fast_output(tp, rack, sb, len, orig_len,
+ segsiz, pace_max_seg, hw_tls, flags);
} else
rack->r_fast_output = 0;
@@ -17634,7 +20727,7 @@ just_return_nolock:
int end_window = 0;
uint32_t seq = tp->gput_ack;
- rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm) {
/*
* Mark the last sent that we just-returned (hinting
@@ -17653,12 +20746,12 @@ just_return_nolock:
/* We are limited by the rwnd */
app_limited = CTF_JR_RWND_LIMITED;
if (IN_FASTRECOVERY(tp->t_flags))
- rack->r_ctl.rc_prr_sndcnt = 0;
+ rack->r_ctl.rc_prr_sndcnt = 0;
} else if (ctf_outstanding(tp) >= sbavail(sb)) {
/* We are limited by whats available -- app limited */
app_limited = CTF_JR_APP_LIMITED;
if (IN_FASTRECOVERY(tp->t_flags))
- rack->r_ctl.rc_prr_sndcnt = 0;
+ rack->r_ctl.rc_prr_sndcnt = 0;
} else if ((idle == 0) &&
((tp->t_flags & TF_NODELAY) == 0) &&
((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
@@ -17754,7 +20847,7 @@ just_return_nolock:
log = 1;
}
/* Mark the last packet has app limited */
- rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ rsm = tqhash_max(rack->r_ctl.tqh);
if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
@@ -17784,7 +20877,7 @@ just_return_nolock:
(sbavail(sb) > tp->snd_wnd) &&
(tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
/* Yes lets make sure to move to persist before timer-start */
- rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
}
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
@@ -17822,6 +20915,26 @@ just_return_nolock:
return (0);
send:
+ if ((rack->r_ctl.crte != NULL) &&
+ (rsm == NULL) &&
+ ((rack->rc_hw_nobuf == 1) ||
+ (rack_hw_check_queue && (check_done == 0)))) {
+ /*
+	 * We only want to do this once with the hw_check_queue;
+	 * for the enobuf case, if we come around again the flag
+	 * will be clear.
+ */
+ check_done = 1;
+ slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
+ if (slot) {
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_ctl.rc_agg_early = 0;
+ rack->r_early = 0;
+ rack->r_late = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto skip_all_send;
+ }
+ }
if (rsm || sack_rxmit)
counter_u64_add(rack_nfto_resend, 1);
else
@@ -18037,7 +21150,7 @@ send:
* In case there are too many small fragments don't
* use TSO:
*/
- if (len <= segsiz) {
+ if (len <= max_len) {
mark = 4;
tso = 0;
}
@@ -18515,6 +21628,10 @@ send:
* not the case for IPv6.
*/
if (tso) {
+ /*
+ * Here we must use t_maxseg and the optlen since
+ * the optlen may include SACK's (or DSACK).
+ */
KASSERT(len > tp->t_maxseg - optlen,
("%s: len <= tso_segsz", __func__));
m->m_pkthdr.csum_flags |= CSUM_TSO;
@@ -18528,6 +21645,11 @@ send:
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
+ if ((rack->r_ctl.crte != NULL) &&
+ (rack->rc_hw_nobuf == 0) &&
+ tcp_bblogging_on(tp)) {
+ rack_log_queue_level(tp, rack, len, &tv, cts);
+ }
/* We're getting ready to send; log now. */
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
@@ -18545,6 +21667,7 @@ send:
log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
log.u_bbr.flex8 = 0;
if (rsm) {
if (rsm->r_flags & RACK_RWND_COLLAPSED) {
@@ -18559,8 +21682,6 @@ send:
} else {
if (doing_tlp)
log.u_bbr.flex8 = 3;
- else
- log.u_bbr.flex8 = 0;
}
log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
log.u_bbr.flex7 = mark;
@@ -18569,10 +21690,39 @@ send:
log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.timeStamp = cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ if (rsm && (rsm->r_rtr_cnt > 0)) {
+ /*
+ * When we have a retransmit we want to log the
+ * burst at send and flight at send from before.
+ */
+ log.u_bbr.flex5 = rsm->r_fas;
+ log.u_bbr.bbr_substate = rsm->r_bas;
+ } else {
+ /*
+ * New transmits we log in flex5 the inflight again as
+ * well as the number of segments in our send in the
+ * substate field.
+ */
+ log.u_bbr.flex5 = log.u_bbr.inflight;
+ log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
+ }
log.u_bbr.lt_epoch = cwnd_to_use;
log.u_bbr.delivered = sendalot;
+ log.u_bbr.rttProp = (uint64_t)rsm;
+ log.u_bbr.pkt_epoch = __LINE__;
+ if (rsm) {
+ log.u_bbr.delRate = rsm->r_flags;
+ log.u_bbr.delRate <<= 31;
+ log.u_bbr.delRate |= rack->r_must_retran;
+ log.u_bbr.delRate <<= 1;
+ log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
+ } else {
+ log.u_bbr.delRate = rack->r_must_retran;
+ log.u_bbr.delRate <<= 1;
+ log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
+ }
lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
- len, &log, false, NULL, NULL, 0, &tv);
+ len, &log, false, NULL, __func__, __LINE__, &tv);
} else
lgb = NULL;
@@ -18684,7 +21834,31 @@ out:
* In transmit state, time the transmission and arrange for the
* retransmit. In persist state, just set snd_max.
*/
+ rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
+ rack_to_usec_ts(&tv),
+ rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
if (error == 0) {
+ if (rsm == NULL) {
+ if (rack->lt_bw_up == 0) {
+ rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_seq = tp->snd_una;
+ rack->lt_bw_up = 1;
+ } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) {
+ /*
+ * Need to record what we have since we are
+ * approaching seq wrap.
+ */
+ uint64_t tmark;
+
+ rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
+ rack->r_ctl.lt_seq = tp->snd_una;
+ tmark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ }
+ }
+ rack->forced_ack = 0; /* If we send something zap the FA flag */
+ counter_u64_add(rack_total_bytes, len);
tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
if (rsm && doing_tlp) {
rack->rc_last_sent_tlp_past_cumack = 0;
@@ -18692,7 +21866,13 @@ out:
rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
}
- rack->forced_ack = 0; /* If we send something zap the FA flag */
+ if (rack->rc_hw_nobuf) {
+ rack->rc_hw_nobuf = 0;
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_early = 0;
+ rack->r_late = 0;
+ rack->r_ctl.rc_agg_early = 0;
+ }
if (rsm && (doing_tlp == 0)) {
/* Set we retransmitted */
rack->rc_gp_saw_rec = 1;
@@ -18710,11 +21890,9 @@ out:
tp->rcv_numsacks > 0)
tcp_clean_dsack_blocks(tp);
tot_len_this_send += len;
- if (len == 0)
+ if (len == 0) {
counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
- else if (len == 1) {
- counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
- } else if (len > 1) {
+ } else {
int idx;
idx = (len / segsiz) + 3;
@@ -18740,9 +21918,6 @@ out:
/* If its a resend without TLP then it must not have the flag */
rsm->r_flags &= ~RACK_TLP;
}
- rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
- rack_to_usec_ts(&tv),
- rsm, add_flag, s_mb, s_moff, hw_tls);
if ((error == 0) &&
@@ -18803,6 +21978,15 @@ out:
tp->t_acktime = ticks;
}
tp->snd_max = tp->snd_nxt;
+ if (rack->rc_new_rnd_needed) {
+ /*
+		 * Update the rnd to start ticking; note that from a
+		 * time perspective all of the preceding idle time is
+		 * "in the round".
+ */
+ rack->rc_new_rnd_needed = 0;
+ rack->r_ctl.roundends = tp->snd_max;
+ }
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
@@ -18837,6 +22021,7 @@ out:
if (rack->r_fast_output) {
rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
+ rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
}
}
}
@@ -18878,6 +22063,8 @@ nomore:
*/
if (rack->r_ctl.crte != NULL) {
tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
+ if (tcp_bblogging_on(rack->rc_tp))
+ rack_log_queue_level(tp, rack, len, &tv, cts);
} else
tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
@@ -18903,8 +22090,13 @@ nomore:
if (tso)
tp->t_flags &= ~TF_TSO;
if (mtu != 0) {
+ int saved_mtu;
+
+ saved_mtu = tp->t_maxseg;
tcp_mss_update(tp, -1, mtu, NULL, NULL);
- goto again;
+ if (saved_mtu > tp->t_maxseg) {
+ goto again;
+ }
}
slot = 10 * HPTS_USEC_IN_MSEC;
rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
@@ -19048,6 +22240,7 @@ enobufs:
(tp->rcv_numsacks == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
+ ((IN_RECOVERY(tp->t_flags)) == 0) &&
(rack->r_must_retran == 0) &&
((tp->t_flags & TF_NEEDFIN) == 0) &&
(len > 0) && (orig_len > 0) &&
@@ -19056,28 +22249,8 @@ enobufs:
((optlen == 0) ||
((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
/* We can send at least one more MSS using our fsb */
-
- rack->r_fast_output = 1;
- rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
- rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
- rack->r_ctl.fsb.tcp_flags = flags;
- rack->r_ctl.fsb.left_to_send = orig_len - len;
- if (hw_tls)
- rack->r_ctl.fsb.hw_tls = 1;
- else
- rack->r_ctl.fsb.hw_tls = 0;
- KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
- ("rack:%p left_to_send:%u sbavail:%u out:%u",
- rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
- (tp->snd_max - tp->snd_una)));
- if (rack->r_ctl.fsb.left_to_send < segsiz)
- rack->r_fast_output = 0;
- else {
- if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
- rack->r_ctl.fsb.rfo_apply_push = 1;
- else
- rack->r_ctl.fsb.rfo_apply_push = 0;
- }
+ rack_setup_fast_output(tp, rack, sb, len, orig_len,
+ segsiz, pace_max_seg, hw_tls, flags);
} else
rack->r_fast_output = 0;
rack_log_fsb(rack, tp, so, flags,
@@ -19097,6 +22270,7 @@ enobufs:
(rack->r_must_retran == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
+ ((IN_RECOVERY(tp->t_flags)) == 0) &&
((tp->t_flags & TF_NEEDFIN) == 0) &&
(len > 0) && (orig_len > 0) &&
(orig_len > len) &&
@@ -19104,31 +22278,9 @@ enobufs:
((optlen == 0) ||
((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
/* we can use fast_output for more */
-
- rack->r_fast_output = 1;
- rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
- rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
- rack->r_ctl.fsb.tcp_flags = flags;
- rack->r_ctl.fsb.left_to_send = orig_len - len;
- if (hw_tls)
- rack->r_ctl.fsb.hw_tls = 1;
- else
- rack->r_ctl.fsb.hw_tls = 0;
- KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
- ("rack:%p left_to_send:%u sbavail:%u out:%u",
- rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
- (tp->snd_max - tp->snd_una)));
- if (rack->r_ctl.fsb.left_to_send < segsiz) {
- rack->r_fast_output = 0;
- }
+ rack_setup_fast_output(tp, rack, sb, len, orig_len,
+ segsiz, pace_max_seg, hw_tls, flags);
if (rack->r_fast_output) {
- if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
- rack->r_ctl.fsb.rfo_apply_push = 1;
- else
- rack->r_ctl.fsb.rfo_apply_push = 0;
- rack_log_fsb(rack, tp, so, flags,
- ipoptlen, orig_len, len, error,
- (rsm == NULL), optlen, __LINE__, 3);
error = 0;
ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
if (ret >= 0)
@@ -19141,6 +22293,7 @@ enobufs:
goto again;
}
/* Assure when we leave that snd_nxt will point to top */
+skip_all_send:
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
@@ -19208,7 +22361,7 @@ rack_mtu_change(struct tcpcb *tp)
rack->r_must_retran = 1;
/* Mark all inflight to needing to be rxt'd */
TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
- rsm->r_flags |= RACK_MUST_RXT;
+ rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG);
}
}
sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
@@ -19217,125 +22370,110 @@ rack_mtu_change(struct tcpcb *tp)
}
static int
-rack_set_profile(struct tcp_rack *rack, int prof)
+rack_set_dgp(struct tcp_rack *rack)
{
- int err = EINVAL;
- if (prof == 1) {
- /* pace_always=1 */
- if (rack->rc_always_pace == 0) {
- if (tcp_can_enable_pacing() == 0)
- return (EBUSY);
- }
- rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
- rack_set_cc_pacing(rack);
- rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- rack->rack_attempt_hdwr_pace = 0;
- /* cmpack=1 */
- if (rack_use_cmp_acks)
- rack->r_use_cmp_ack = 1;
- if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
- rack->r_use_cmp_ack)
- rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
- /* scwnd=1 */
- rack->rack_enable_scwnd = 1;
- /* dynamic=100 */
- rack->rc_gp_dyn_mul = 1;
- /* gp_inc_ca */
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* rrr_conf=3 */
- rack->r_rr_config = 3;
- /* npush=2 */
- rack->r_ctl.rc_no_push_at_mrtt = 2;
- /* fillcw=1 */
+ /* pace_always=1 */
+ if (rack->rc_always_pace == 0) {
+ if (tcp_can_enable_pacing() == 0)
+ return (EBUSY);
+ }
+ rack->dgp_on = 1;
+ rack->rc_always_pace = 1;
+ rack->use_fixed_rate = 0;
+ if (rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ rack->rack_attempt_hdwr_pace = 0;
+ /* rxt settings */
+ rack->full_size_rxt = 1;
+ rack->shape_rxt_to_pacing_min = 0;
+ /* cmpack=1 */
+ rack->r_use_cmp_ack = 1;
+ if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
+ rack->r_use_cmp_ack)
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ /* scwnd=1 */
+ rack->rack_enable_scwnd = 1;
+ /* dynamic=100 */
+ rack->rc_gp_dyn_mul = 1;
+ /* gp_inc_ca */
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* rrr_conf=3 */
+ rack->r_rr_config = 3;
+ /* npush=2 */
+ rack->r_ctl.rc_no_push_at_mrtt = 2;
+ /* fillcw=1 */
+ if (rack->r_cwnd_was_clamped == 0) {
rack->rc_pace_to_cwnd = 1;
- rack->rc_pace_fill_if_rttin_range = 0;
- rack->rtt_limit_mul = 0;
- /* noprr=1 */
- rack->rack_no_prr = 1;
- /* lscwnd=1 */
- rack->r_limit_scw = 1;
- /* gp_inc_rec */
- rack->r_ctl.rack_per_of_gp_rec = 90;
- err = 0;
-
- } else if (prof == 3) {
- /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */
- /* pace_always=1 */
- if (rack->rc_always_pace == 0) {
- if (tcp_can_enable_pacing() == 0)
- return (EBUSY);
- }
- rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
- rack_set_cc_pacing(rack);
- rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- rack->rack_attempt_hdwr_pace = 0;
- /* cmpack=1 */
- if (rack_use_cmp_acks)
- rack->r_use_cmp_ack = 1;
- if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
- rack->r_use_cmp_ack)
- rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
- /* scwnd=1 */
- rack->rack_enable_scwnd = 1;
- /* dynamic=100 */
- rack->rc_gp_dyn_mul = 1;
- /* gp_inc_ca */
+ } else {
+ rack->rc_pace_to_cwnd = 0;
+ /* Reset all multipliers to 100.0 so just the measured bw */
+ rack->r_ctl.rack_per_of_gp_ss = 100;
rack->r_ctl.rack_per_of_gp_ca = 100;
- /* rrr_conf=3 */
- rack->r_rr_config = 3;
- /* npush=2 */
- rack->r_ctl.rc_no_push_at_mrtt = 2;
- /* fillcw=2 */
- rack->rc_pace_to_cwnd = 1;
- rack->r_fill_less_agg = 1;
- rack->rc_pace_fill_if_rttin_range = 0;
- rack->rtt_limit_mul = 0;
- /* noprr=1 */
- rack->rack_no_prr = 1;
- /* lscwnd=1 */
- rack->r_limit_scw = 1;
- /* gp_inc_rec */
- rack->r_ctl.rack_per_of_gp_rec = 90;
- err = 0;
+ }
+ rack->rc_pace_fill_if_rttin_range = 0;
+ rack->rtt_limit_mul = 0;
+ /* noprr=1 */
+ rack->rack_no_prr = 1;
+ /* lscwnd=1 */
+ rack->r_limit_scw = 1;
+ /* gp_inc_rec */
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ rack_client_buffer_level_set(rack);
+ return (0);
+}
+
+static int
+rack_set_profile(struct tcp_rack *rack, int prof)
+{
+ int err = EINVAL;
+ if (prof == 1) {
+ /*
+ * Profile 1 is "standard" DGP. It ignores
+ * client buffer level.
+ */
+ rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0;
+ err = rack_set_dgp(rack);
+ if (err)
+ return (err);
} else if (prof == 2) {
- /* cmpack=1 */
- if (rack->rc_always_pace == 0) {
- if (tcp_can_enable_pacing() == 0)
- return (EBUSY);
- }
- rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
- rack_set_cc_pacing(rack);
- rack->r_use_cmp_ack = 1;
- if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
- rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
- /* pace_always=1 */
- rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- /* scwnd=1 */
- rack->rack_enable_scwnd = 1;
- /* dynamic=100 */
- rack->rc_gp_dyn_mul = 1;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* rrr_conf=3 */
- rack->r_rr_config = 3;
- /* npush=2 */
- rack->r_ctl.rc_no_push_at_mrtt = 2;
- /* fillcw=1 */
- rack->rc_pace_to_cwnd = 1;
- rack->rc_pace_fill_if_rttin_range = 0;
- rack->rtt_limit_mul = 0;
- /* noprr=1 */
- rack->rack_no_prr = 1;
- /* lscwnd=0 */
- rack->r_limit_scw = 0;
- err = 0;
+ /*
+ * Profile 2 is DGP. Less aggressive with
+ * respect to client buffer level.
+ */
+ rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1;
+ err = rack_set_dgp(rack);
+ if (err)
+ return (err);
+ } else if (prof == 3) {
+ /*
+		 * Profile 3 is DGP. Even less aggressive with
+ * respect to client buffer level.
+ */
+ rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2;
+ err = rack_set_dgp(rack);
+ if (err)
+ return (err);
+ } else if (prof == 4) {
+ /*
+ * Profile 4 is DGP with the most responsiveness
+ * to client buffer level.
+ */
+ rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3;
+ err = rack_set_dgp(rack);
+ if (err)
+ return (err);
} else if (prof == 0) {
/* This changes things back to the default settings */
+ rack->dgp_on = 0;
+ rack->rc_hybrid_mode = 0;
err = 0;
+ if (rack_fill_cw_state)
+ rack->rc_pace_to_cwnd = 1;
+ else
+ rack->rc_pace_to_cwnd = 0;
if (rack->rc_always_pace) {
tcp_decrement_paced_conn();
rack_undo_cc_pacing(rack);
@@ -19343,7 +22481,7 @@ rack_set_profile(struct tcp_rack *rack, int prof)
}
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
+ if ((rack->gp_ready) && (rack->use_fixed_rate == 0))
rack_set_cc_pacing(rack);
} else
rack->rc_always_pace = 0;
@@ -19407,6 +22545,7 @@ rack_set_profile(struct tcp_rack *rack, int prof)
rack->r_limit_scw = 1;
else
rack->r_limit_scw = 0;
+ rack_init_retransmit_value(rack, rack_rxt_controls);
err = 0;
}
return (err);
@@ -19432,8 +22571,90 @@ rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
}
static int
+process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
+{
+#ifdef TCP_REQUEST_TRK
+ struct http_sendfile_track *sft;
+ struct timeval tv;
+ tcp_seq seq;
+ int err;
+
+ microuptime(&tv);
+
+ /*
+	 * If BB logging is not already on we need to look at the DTL flag;
+	 * if it is on already, whatever enabled it overrides the DTL input.
+	 * This holds for any request: DTL can turn BB logging on, but it is
+	 * never turned off by a hybrid pacing request.
+ */
+ if (tcp_bblogging_on(rack->rc_tp) == 0) {
+ if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) {
+ /* Turn on BB point logging */
+ tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS,
+ TCP_BBPOINT_REQ_LEVEL_LOGGING);
+ }
+ }
+ /* Make sure no fixed rate is on */
+ rack->use_fixed_rate = 0;
+ rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
+ rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
+ rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
+ /* Now allocate or find our entry that will have these settings */
+ sft = tcp_http_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0);
+ if (sft == NULL) {
+ rack->rc_tp->tcp_hybrid_error++;
+ /* no space, where would it have gone? */
+ seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc;
+ rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
+ return (ENOSPC);
+ }
+ /* The seq will be snd_una + everything in the buffer */
+ seq = sft->start_seq;
+ if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
+ /* Disabling hybrid pacing */
+ if (rack->rc_hybrid_mode) {
+ rack_set_profile(rack, 0);
+ rack->rc_tp->tcp_hybrid_stop++;
+ }
+ rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0);
+ return (0);
+ }
+ if (rack->dgp_on == 0) {
+ /*
+ * If we have not yet turned DGP on, do so
+ * now setting pure DGP mode, no buffer level
+ * response.
+ */
+ if ((err = rack_set_profile(rack, 1)) != 0){
+ /* Failed to turn pacing on */
+ rack->rc_tp->tcp_hybrid_error++;
+ rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0);
+ return (err);
+ }
+ }
+ /* Now set in our flags */
+ sft->hybrid_flags = hybrid->hybrid_flags;
+ if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
+ sft->cspr = hybrid->cspr;
+ else
+ sft->cspr = 0;
+ if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS)
+ sft->hint_maxseg = hybrid->hint_maxseg;
+ else
+ sft->hint_maxseg = 0;
+ rack->rc_hybrid_mode = 1;
+ rack->rc_tp->tcp_hybrid_start++;
+ rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
+ return (0);
+#else
+ return (ENOTSUP);
+#endif
+}
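For context, process_hybrid_pacing() above is reached from userland through the new TCP_HYBRID_PACING socket option handled later in this diff. Below is a minimal, hypothetical sketch of the caller side; it assumes struct tcp_hybrid_req and the TCP_HYBRID_PACING_* flags are exported through <netinet/tcp.h> on a system carrying this change, and it leaves the req member (the byte range being paced, whose layout is not shown in this hunk) zeroed. It illustrates the option's calling convention and is not code from this commit.

/*
 * Hypothetical userland sketch: request hybrid pacing on socket 'fd'.
 * Field and flag names come from the kernel hunk above; their userland
 * visibility depends on the headers shipped with this change.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int
request_hybrid_pacing(int fd, uint64_t client_rate)
{
	struct tcp_hybrid_req hybrid;

	memset(&hybrid, 0, sizeof(hybrid));
	/* Enable hybrid pacing and pass a client-suggested pacing rate. */
	hybrid.hybrid_flags = TCP_HYBRID_PACING_ENABLE | TCP_HYBRID_PACING_CSPR;
	hybrid.cspr = client_rate;
	if (setsockopt(fd, IPPROTO_TCP, TCP_HYBRID_PACING,
	    &hybrid, sizeof(hybrid)) == -1) {
		perror("setsockopt(TCP_HYBRID_PACING)");
		return (-1);
	}
	return (0);
}

Passing hybrid_flags without TCP_HYBRID_PACING_ENABLE takes the disable path above, which drops back to rack_set_profile(rack, 0) and counts a tcp_hybrid_stop when hybrid mode was active.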
+
+static int
rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
- uint32_t optval, uint64_t loptval)
+ uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
+
{
struct epoch_tracker et;
struct sockopt sopt;
@@ -19444,7 +22665,17 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
uint16_t ca, ss;
switch (sopt_name) {
-
+ case TCP_RACK_SET_RXT_OPTIONS:
+ if ((optval >= 0) && (optval <= 2)) {
+ rack_init_retransmit_value(rack, optval);
+ } else {
+ /*
+			 * You must send in 0, 1 or 2; all else is
+ * invalid.
+ */
+ error = EINVAL;
+ }
+ break;
case TCP_RACK_DSACK_OPT:
RACK_OPTS_INC(tcp_rack_dsack_opt);
if (optval & 0x1) {
@@ -19459,6 +22690,24 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
}
rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
break;
+ case TCP_RACK_PACING_DIVISOR:
+ RACK_OPTS_INC(tcp_rack_pacing_divisor);
+ if (optval == 0) {
+ rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
+ } else {
+ if (optval < RL_MIN_DIVISOR)
+ rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR;
+ else
+ rack->r_ctl.pace_len_divisor = optval;
+ }
+ break;
+ case TCP_RACK_HI_BETA:
+ RACK_OPTS_INC(tcp_rack_hi_beta);
+ if (optval)
+ rack->rack_hibeta = 1;
+ else
+ rack->rack_hibeta = 0;
+ break;
case TCP_RACK_PACING_BETA:
RACK_OPTS_INC(tcp_rack_beta);
if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
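The three new cases in the hunk above (TCP_RACK_SET_RXT_OPTIONS, TCP_RACK_PACING_DIVISOR and TCP_RACK_HI_BETA) are ordinary integer-valued IPPROTO_TCP options. A hedged userland sketch follows, assuming the TCP_RACK_* constants are visible from <netinet/tcp.h> on a patched system; it is illustrative only, not part of the commit.

/* Illustrative only: tune the new RACK knobs on a connected socket 'fd'. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
tune_rack_knobs(int fd)
{
	int rxt_opts = 1;	/* rack_init_retransmit_value(): 0, 1 or 2 only */
	int divisor = 0;	/* 0 selects rack_default_pacing_divisor */
	int hibeta = 1;		/* non-zero sets rack->rack_hibeta */

	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_SET_RXT_OPTIONS,
	    &rxt_opts, sizeof(rxt_opts)) == -1)
		return (-1);
	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACING_DIVISOR,
	    &divisor, sizeof(divisor)) == -1)
		return (-1);
	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_HI_BETA,
	    &hibeta, sizeof(hibeta)) == -1)
		return (-1);
	return (0);
}

Per the handler above, a non-zero divisor below RL_MIN_DIVISOR is clamped up to RL_MIN_DIVISOR, and any rxt-options value outside 0-2 returns EINVAL.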
@@ -19575,6 +22824,13 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
RACK_OPTS_INC(tcp_pacing_rate_cap);
rack->r_ctl.bw_rate_cap = loptval;
break;
+ case TCP_HYBRID_PACING:
+ if (hybrid == NULL) {
+ error = EINVAL;
+ break;
+ }
+ error = process_hybrid_pacing(rack, hybrid);
+ break;
case TCP_RACK_PROFILE:
RACK_OPTS_INC(tcp_profile);
error = rack_set_profile(rack, optval);
@@ -19599,6 +22855,17 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
else
rack->r_limit_scw = 0;
break;
+ case TCP_RACK_DGP_IN_REC:
+ RACK_OPTS_INC(tcp_dgp_in_rec);
+ if (optval)
+ rack->r_ctl.full_dgp_in_rec = 1;
+ else
+ rack->r_ctl.full_dgp_in_rec = 0;
+ break;
+ case TCP_RXT_CLAMP:
+ RACK_OPTS_INC(tcp_rxt_clamp);
+ rack_translate_clamp_value(rack, optval);
+ break;
case TCP_RACK_PACE_TO_FILL:
RACK_OPTS_INC(tcp_fillcw);
if (optval == 0)
@@ -19699,7 +22966,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
RACK_OPTS_INC(tcp_rack_tlp_reduce);
rack->r_ctl.rc_tlp_cwnd_reduce = optval;
break;
- /* Pacing related ones */
+ /* Pacing related ones */
case TCP_RACK_PACE_ALWAYS:
/*
* zero is old rack method, 1 is new
@@ -19712,7 +22979,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
break;
} else if (tcp_can_enable_pacing()) {
rack->rc_always_pace = 1;
- if (rack->use_fixed_rate || rack->gp_ready)
+ if ((rack->gp_ready) && (rack->use_fixed_rate == 0))
rack_set_cc_pacing(rack);
}
else {
@@ -19803,6 +23070,11 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
else
rack->rc_force_max_seg = 0;
break;
+ case TCP_RACK_PACE_MIN_SEG:
+ RACK_OPTS_INC(tcp_rack_min_seg);
+ rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval);
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
+ break;
case TCP_RACK_PACE_MAX_SEG:
/* Max segments size in a pace in bytes */
RACK_OPTS_INC(tcp_rack_max_seg);
@@ -19818,7 +23090,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
rack->use_fixed_rate = 1;
- if (rack->rc_always_pace)
+ if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
@@ -19836,7 +23108,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
rack->use_fixed_rate = 1;
- if (rack->rc_always_pace)
+ if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
@@ -19854,7 +23126,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
rack->use_fixed_rate = 1;
- if (rack->rc_always_pace)
+ if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
@@ -19914,6 +23186,12 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
else
rack->r_rr_config = 0;
break;
+ case TCP_PACING_DND: /* URL:dnd */
+ if (optval > 0)
+ rack->rc_pace_dnd = 1;
+ else
+ rack->rc_pace_dnd = 0;
+ break;
case TCP_HDWR_RATE_CAP:
RACK_OPTS_INC(tcp_hdwr_rate_cap);
if (optval) {
@@ -19925,6 +23203,10 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
rack->r_rack_hw_rate_caps = 0;
}
break;
+ case TCP_RACK_SPLIT_LIMIT:
+ RACK_OPTS_INC(tcp_split_limit);
+ rack->r_ctl.rc_split_limit = optval;
+ break;
case TCP_BBR_HDWR_PACE:
RACK_OPTS_INC(tcp_hdwr_pacing);
if (optval){
@@ -19945,7 +23227,7 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
#endif
}
break;
- /* End Pacing related ones */
+ /* End Pacing related ones */
case TCP_RACK_PRR_SENDALOT:
/* Allow PRR to send more than one seg */
RACK_OPTS_INC(tcp_rack_prr_sendalot);
@@ -20002,13 +23284,6 @@ rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
else
rack->use_rack_rr = 0;
break;
- case TCP_FAST_RSM_HACK:
- RACK_OPTS_INC(tcp_rack_fastrsm_hack);
- if (optval)
- rack->fast_rsm_hack = 1;
- else
- rack->fast_rsm_hack = 0;
- break;
case TCP_RACK_PKT_DELAY:
/* RACK added ms i.e. rack-rtt + reord + N */
RACK_OPTS_INC(tcp_rack_pkt_delay);
@@ -20065,7 +23340,7 @@ rack_apply_deferred_options(struct tcp_rack *rack)
TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
/* Disadvantage of deferal is you loose the error return */
s_optval = (uint32_t)dol->optval;
- (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
+ (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL);
free(dol, M_TCPDO);
}
}
@@ -20091,6 +23366,33 @@ rack_pru_options(struct tcpcb *tp, int flags)
return (0);
}
+static bool
+rack_wake_check(struct tcpcb *tp)
+{
+ struct tcp_rack *rack;
+ struct timeval tv;
+ uint32_t cts;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->r_ctl.rc_hpts_flags) {
+ cts = tcp_get_usecs(&tv);
+ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){
+ /*
+ * Pacing timer is up, check if we are ready.
+ */
+ if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to))
+ return (true);
+ } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) {
+ /*
+ * A timer is up, check if we are ready.
+ */
+ if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp))
+ return (true);
+ }
+ }
+ return (false);
+}
+
static struct tcp_function_block __tcp_rack = {
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = rack_output,
@@ -20106,6 +23408,9 @@ static struct tcp_function_block __tcp_rack = {
.tfb_tcp_mtu_chg = rack_mtu_change,
.tfb_pru_options = rack_pru_options,
.tfb_hwtls_change = rack_hw_tls_change,
+ .tfb_chg_query = rack_chg_query,
+ .tfb_switch_failed = rack_switch_failed,
+ .tfb_early_wake_check = rack_wake_check,
.tfb_compute_pipe = rack_compute_pipe,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
@@ -20127,8 +23432,9 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
#endif
struct tcpcb *tp;
struct tcp_rack *rack;
+ struct tcp_hybrid_req hybrid;
uint64_t loptval;
- int32_t error = 0, optval;
+ int32_t error = 0, mask, optval, tclass;
tp = intotcpcb(inp);
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -20153,10 +23459,15 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
break;
case IPV6_TCLASS:
/*
- * The DSCP codepoint has changed, update the fsb.
+ * The DSCP codepoint has changed, update the fsb
+ * by overwriting any previous traffic class.
*/
- ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
- (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
+ if (inp->in6p_outputopts) {
+ mask = 0xfc;
+ tclass = inp->in6p_outputopts->ip6po_tclass;
+ ip6->ip6_flow &= htonl((~mask) << 20);
+ ip6->ip6_flow |= htonl((tclass & mask) << 20);
+ }
break;
}
INP_WUNLOCK(inp);
@@ -20181,67 +23492,95 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
INP_WUNLOCK(inp);
return (0);
#endif
- }
+#ifdef SO_PEERPRIO
+ case SOL_SOCKET:
+ switch (sopt->sopt_name) {
+ case SO_PEERPRIO: /* SC-URL:bs */
+ /* Already read in and sanity checked in sosetopt(). */
+ if (inp->inp_socket) {
+ rack->client_bufferlvl = inp->inp_socket->so_peerprio;
+ rack_client_buffer_level_set(rack);
+ }
+ break;
+ }
+ INP_WUNLOCK(inp);
+ return (0);
+#endif
+ case IPPROTO_TCP:
+ switch (sopt->sopt_name) {
+ case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */
+ /* Pacing related ones */
+ case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
+ case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
+ case TCP_BBR_IWINTSO: /* URL:tso_iwin */
+ case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */
+ case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
+ case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
+ case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */
+ case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/
+ case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */
+ case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */
+ case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */
+ case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */
+ case TCP_RACK_RR_CONF: /* URL:rrr_conf */
+ case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */
+ case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */
+ case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */
+ case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
+ case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
+ case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
+ case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
+ case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */
+ /* End pacing related */
+ case TCP_RXT_CLAMP: /* URL:rxtclamp */
+ case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
+ case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
+ case TCP_RACK_MIN_TO: /* URL:min_to */
+ case TCP_RACK_EARLY_SEG: /* URL:early_seg */
+ case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */
+ case TCP_RACK_REORD_FADE: /* URL:reord_fade */
+ case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */
+ case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */
+ case TCP_RACK_TLP_USE: /* URL:tlp_use */
+ case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */
+ case TCP_BBR_USE_RACK_RR: /* URL:rackrr */
+ case TCP_RACK_DO_DETECTION: /* URL:detect */
+ case TCP_NO_PRR: /* URL:noprr */
+ case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */
+ case TCP_DATA_AFTER_CLOSE: /* no URL */
+ case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */
+ case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */
+ case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */
+ case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
+ case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
+ case TCP_RACK_PROFILE: /* URL:profile */
+ case TCP_HYBRID_PACING: /* URL:hybrid */
+ case TCP_USE_CMP_ACKS: /* URL:cmpack */
+ case TCP_RACK_ABC_VAL: /* URL:labc */
+ case TCP_REC_ABC_VAL: /* URL:reclabc */
+ case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */
+ case TCP_DEFER_OPTIONS: /* URL:defer */
+ case TCP_RACK_DSACK_OPT: /* URL:dsack */
+ case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */
+ case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */
+ case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */
+ case TCP_RACK_HI_BETA: /* URL:hibeta */
+ case TCP_RACK_SPLIT_LIMIT: /* URL:split */
+ case TCP_RACK_PACING_DIVISOR: /* URL:divisor */
+ case TCP_PACING_DND: /* URL:dnd */
+ goto process_opt;
+ break;
+ default:
+ /* Filter off all unknown options to the base stack */
+ return (tcp_default_ctloutput(inp, sopt));
+ break;
+ }
- switch (sopt->sopt_name) {
- case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */
- /* Pacing related ones */
- case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
- case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
- case TCP_BBR_IWINTSO: /* URL:tso_iwin */
- case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
- case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
- case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */
- case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/
- case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */
- case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */
- case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */
- case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */
- case TCP_RACK_RR_CONF: /* URL:rrr_conf */
- case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */
- case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */
- case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */
- case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
- /* End pacing related */
- case TCP_FAST_RSM_HACK: /* URL:frsm_hack */
- case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
- case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
- case TCP_RACK_MIN_TO: /* URL:min_to */
- case TCP_RACK_EARLY_SEG: /* URL:early_seg */
- case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */
- case TCP_RACK_REORD_FADE: /* URL:reord_fade */
- case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */
- case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */
- case TCP_RACK_TLP_USE: /* URL:tlp_use */
- case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */
- case TCP_BBR_USE_RACK_RR: /* URL:rackrr */
- case TCP_RACK_DO_DETECTION: /* URL:detect */
- case TCP_NO_PRR: /* URL:noprr */
- case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */
- case TCP_DATA_AFTER_CLOSE: /* no URL */
- case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */
- case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */
- case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */
- case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
- case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
- case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
- case TCP_RACK_PROFILE: /* URL:profile */
- case TCP_USE_CMP_ACKS: /* URL:cmpack */
- case TCP_RACK_ABC_VAL: /* URL:labc */
- case TCP_REC_ABC_VAL: /* URL:reclabc */
- case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */
- case TCP_DEFER_OPTIONS: /* URL:defer */
- case TCP_RACK_DSACK_OPT: /* URL:dsack */
- case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
- case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
- case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */
- case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */
- break;
default:
- /* Filter off all unknown options to the base stack */
- return (tcp_default_ctloutput(inp, sopt));
- break;
+ INP_WUNLOCK(inp);
+ return (0);
}
+process_opt:
INP_WUNLOCK(inp);
if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
@@ -20250,6 +23589,8 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
* means rates > 34Gbps won't show right, but thats probably ok.
*/
optval = (uint32_t)loptval;
+ } else if (sopt->sopt_name == TCP_HYBRID_PACING) {
+ error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
} else {
error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
/* Save it in 64 bit form too */
@@ -20258,17 +23599,15 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
if (error)
return (error);
INP_WLOCK(inp);
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
if (tp->t_fb != &__tcp_rack) {
INP_WUNLOCK(inp);
return (ENOPROTOOPT);
}
if (rack->defer_options && (rack->gp_ready == 0) &&
(sopt->sopt_name != TCP_DEFER_OPTIONS) &&
+ (sopt->sopt_name != TCP_HYBRID_PACING) &&
(sopt->sopt_name != TCP_RACK_PACING_BETA) &&
+ (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
(sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
(sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
/* Options are beind deferred */
@@ -20281,7 +23620,7 @@ rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
return (ENOMEM);
}
}
- error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
+ error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
INP_WUNLOCK(inp);
return (error);
}
@@ -20440,8 +23779,14 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
}
}
break;
- case TCP_FAST_RSM_HACK:
- optval = rack->fast_rsm_hack;
+ case TCP_RACK_DGP_IN_REC:
+ optval = rack->r_ctl.full_dgp_in_rec;
+ break;
+ case TCP_RACK_HI_BETA:
+ optval = rack->rack_hibeta;
+ break;
+ case TCP_RXT_CLAMP:
+ optval = rack->r_ctl.saved_rxt_clamp_val;
break;
case TCP_DEFER_OPTIONS:
optval = rack->defer_options;
@@ -20465,6 +23810,10 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
/* You cannot retrieve a profile, its write only */
error = EINVAL;
break;
+ case TCP_HYBRID_PACING:
+		/* You cannot retrieve hybrid pacing information, it's write only */
+ error = EINVAL;
+ break;
case TCP_USE_CMP_ACKS:
optval = rack->r_use_cmp_ack;
break;
@@ -20517,6 +23866,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
case TCP_RACK_FORCE_MSEG:
optval = rack->rc_force_max_seg;
break;
+ case TCP_RACK_PACE_MIN_SEG:
+ optval = rack->r_ctl.rc_user_set_min_segs;
+ break;
case TCP_RACK_PACE_MAX_SEG:
/* Max segments in a pace */
optval = rack->rc_user_set_max_segs;
@@ -20533,6 +23885,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
/* Minimum time between rack t-o's in ms */
optval = rack->r_ctl.rc_min_to;
break;
+ case TCP_RACK_SPLIT_LIMIT:
+ optval = rack->r_ctl.rc_split_limit;
+ break;
case TCP_RACK_EARLY_SEG:
/* If early recovery max segments */
optval = rack->r_ctl.rc_early_recovery_segs;
@@ -20569,6 +23924,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
case TCP_RACK_TLP_USE:
optval = rack->rack_tlp_threshold_use;
break;
+ case TCP_PACING_DND:
+ optval = rack->rc_pace_dnd;
+ break;
case TCP_RACK_PACE_RATE_CA:
optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
break;
@@ -20584,6 +23942,9 @@ rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
case TCP_RACK_GP_INCREASE_CA:
optval = rack->r_ctl.rack_per_of_gp_ss;
break;
+ case TCP_RACK_PACING_DIVISOR:
+ optval = rack->r_ctl.pace_len_divisor;
+ break;
case TCP_BBR_RACK_RTT_USE:
optval = rack->r_ctl.rc_rate_sample_method;
break;
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index 7e71e764990d..d4a70c529386 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
-#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
@@ -51,9 +50,6 @@ __FBSDID("$FreeBSD$");
#include <sys/qmath.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
-#ifdef KERN_TLS
-#include <sys/ktls.h>
-#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tree.h>
@@ -130,36 +126,6 @@ __FBSDID("$FreeBSD$");
* Common TCP Functions - These are shared by borth
* rack and BBR.
*/
-#ifdef KERN_TLS
-uint32_t
-ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
-{
- struct ktls_session *tls;
- uint32_t len;
-
-again:
- tls = so->so_snd.sb_tls_info;
- len = tls->params.max_frame_len; /* max tls payload */
- len += tls->params.tls_hlen; /* tls header len */
- len += tls->params.tls_tlen; /* tls trailer len */
- if ((len * 4) > rwnd) {
- /*
- * Stroke this will suck counter and what
- * else should we do Drew? From the
- * TCP perspective I am not sure
- * what should be done...
- */
- if (tls->params.max_frame_len > 4096) {
- tls->params.max_frame_len -= 4096;
- if (tls->params.max_frame_len < 4096)
- tls->params.max_frame_len = 4096;
- goto again;
- }
- }
- return (len);
-}
-#endif
-
static int
ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
{
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h
index 688c64dd92c3..e9c38c01c3c8 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.h
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -87,9 +87,6 @@
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
-#ifdef KERN_TLS
-uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
-#endif
int
ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
struct mbuf *m, int has_pkt);
diff --git a/sys/netinet/tcp_stacks/tailq_hash.c b/sys/netinet/tcp_stacks/tailq_hash.c
new file mode 100644
index 000000000000..2e3b57be2388
--- /dev/null
+++ b/sys/netinet/tcp_stacks/tailq_hash.c
@@ -0,0 +1,344 @@
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
+#include <sys/param.h>
+#include <sys/arb.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#ifdef STATS
+#include <sys/qmath.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
+#else
+#include <sys/tree.h>
+#endif
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/tim_filter.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+#include <sys/protosw.h>
+#ifdef TCP_ACCOUNTING
+#include <sys/sched.h>
+#include <machine/cpu.h>
+#endif
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet/tcp.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_ratelimit.h>
+#include <netinet/tcp_accounting.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#include <netinet/cc/cc_newreno.h>
+#include <netinet/tcp_fastopen.h>
+#include <netinet/tcp_lro.h>
+#ifdef NETFLIX_SHARED_CWND
+#include <netinet/tcp_shared_cwnd.h>
+#endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ecn.h>
+
+#include <netipsec/ipsec_support.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "sack_filter.h"
+#include "tcp_rack.h"
+#include "tailq_hash.h"
+
+
+struct rack_sendmap *
+tqhash_min(struct tailq_hash *hs)
+{
+ struct rack_sendmap *rsm;
+
+ rsm = tqhash_find(hs, hs->min);
+ return(rsm);
+}
+
+struct rack_sendmap *
+tqhash_max(struct tailq_hash *hs)
+{
+ struct rack_sendmap *rsm;
+
+ rsm = tqhash_find(hs, (hs->max - 1));
+ return (rsm);
+}
+
+int
+tqhash_empty(struct tailq_hash *hs)
+{
+ if (hs->count == 0)
+ return(1);
+ return(0);
+}
+
+struct rack_sendmap *
+tqhash_find(struct tailq_hash *hs, uint32_t seq)
+{
+ struct rack_sendmap *e;
+ int bindex, pbucket, fc = 1;
+
+ if ((SEQ_LT(seq, hs->min)) ||
+ (hs->count == 0) ||
+ (SEQ_GEQ(seq, hs->max))) {
+ /* Not here */
+ return (NULL);
+ }
+ bindex = seq / SEQ_BUCKET_SIZE;
+ bindex %= MAX_HASH_ENTRIES;
+ /* Lets look through the bucket it belongs to */
+ if (TAILQ_EMPTY(&hs->ht[bindex])) {
+ goto look_backwards;
+ }
+ TAILQ_FOREACH(e, &hs->ht[bindex], next) {
+ if (fc == 1) {
+ /*
+ * Special check for when a cum-ack
+			 * has moved up over a seq and now it is
+			 * a bucket behind where it belongs. In
+			 * the case of SACKs, which create new rsm's,
+ * this won't occur.
+ */
+ if (SEQ_GT(e->r_start, seq)) {
+ goto look_backwards;
+ }
+ fc = 0;
+ }
+ if (SEQ_GEQ(seq, e->r_start) &&
+ (SEQ_LT(seq, e->r_end))) {
+			/* It's in this block */
+ return (e);
+ }
+ }
+ /* Did not find it */
+ return (NULL);
+look_backwards:
+ if (bindex == 0)
+ pbucket = MAX_HASH_ENTRIES - 1;
+ else
+ pbucket = bindex - 1;
+ TAILQ_FOREACH_REVERSE(e, &hs->ht[pbucket], rack_head, next) {
+ if (SEQ_GEQ(seq, e->r_start) &&
+ (SEQ_LT(seq, e->r_end))) {
+			/* It's in this block */
+ return (e);
+ }
+ if (SEQ_GEQ(e->r_end, seq))
+ break;
+ }
+ return (NULL);
+}
+
+struct rack_sendmap *
+tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm)
+{
+ struct rack_sendmap *e;
+
+ e = TAILQ_NEXT(rsm, next);
+ if (e == NULL) {
+ /* Move to next bucket */
+ int nxt;
+
+ nxt = rsm->bindex + 1;
+ if (nxt >= MAX_HASH_ENTRIES)
+ nxt = 0;
+ e = TAILQ_FIRST(&hs->ht[nxt]);
+ }
+ return(e);
+}
+
+struct rack_sendmap *
+tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm)
+{
+ struct rack_sendmap *e;
+
+ e = TAILQ_PREV(rsm, rack_head, next);
+ if (e == NULL) {
+ int prev;
+
+ if (rsm->bindex > 0)
+ prev = rsm->bindex - 1;
+ else
+ prev = MAX_HASH_ENTRIES - 1;
+ e = TAILQ_LAST(&hs->ht[prev], rack_head);
+ }
+ return (e);
+}
+
+void
+tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type)
+{
+ TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
+ hs->count--;
+ if (hs->count == 0) {
+ hs->min = hs->max;
+ } else if (type == REMOVE_TYPE_CUMACK) {
+ hs->min = rsm->r_end;
+ }
+}
+
+int
+tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm)
+{
+ struct rack_sendmap *e, *l;
+ int inserted = 0;
+ uint32_t ebucket;
+
+ if (hs->count > 0) {
+ if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) {
+ return (-1);
+ }
+ e = tqhash_find(hs, rsm->r_start);
+ if (e) {
+ return (-2);
+ }
+ }
+ rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE;
+ rsm->bindex %= MAX_HASH_ENTRIES;
+ ebucket = rsm->r_end / SEQ_BUCKET_SIZE;
+ ebucket %= MAX_HASH_ENTRIES;
+ if (ebucket != rsm->bindex) {
+ /* This RSM straddles the bucket boundary */
+ rsm->r_flags |= RACK_STRADDLE;
+ } else {
+ rsm->r_flags &= ~RACK_STRADDLE;
+ }
+ if (hs->count == 0) {
+ /* Special case */
+ hs->min = rsm->r_start;
+ hs->max = rsm->r_end;
+ hs->count = 1;
+ } else {
+ hs->count++;
+ if (SEQ_GT(rsm->r_end, hs->max))
+ hs->max = rsm->r_end;
+ if (SEQ_LT(rsm->r_start, hs->min))
+ hs->min = rsm->r_start;
+ }
+ /* Check the common case of inserting at the end */
+ l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head);
+ if ((l == NULL) || (SEQ_GT(rsm->r_start, l->r_start))) {
+ TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next);
+ return (0);
+ }
+ TAILQ_FOREACH(e, &hs->ht[rsm->bindex], next) {
+ if (SEQ_LEQ(rsm->r_start, e->r_start)) {
+ inserted = 1;
+ TAILQ_INSERT_BEFORE(e, rsm, next);
+ break;
+ }
+ }
+ if (inserted == 0) {
+ TAILQ_INSERT_TAIL(&hs->ht[rsm->bindex], rsm, next);
+ }
+ return (0);
+}
+
+void
+tqhash_init(struct tailq_hash *hs)
+{
+ int i;
+
+ for(i = 0; i < MAX_HASH_ENTRIES; i++) {
+ TAILQ_INIT(&hs->ht[i]);
+ }
+ hs->min = hs->max = 0;
+ hs->count = 0;
+}
+
+int
+tqhash_trim(struct tailq_hash *hs, uint32_t th_ack)
+{
+ struct rack_sendmap *rsm;
+
+ if (SEQ_LT(th_ack, hs->min)) {
+ /* It can't be behind our current min */
+ return (-1);
+ }
+ if (SEQ_GEQ(th_ack, hs->max)) {
+ /* It can't be beyond or at our current max */
+ return (-2);
+ }
+ rsm = tqhash_min(hs);
+ if (rsm == NULL) {
+ /* nothing to trim */
+ return (-3);
+ }
+ if (SEQ_GEQ(th_ack, rsm->r_end)) {
+ /*
+ * You can't trim all bytes instead
+ * you need to remove it.
+ */
+ return (-4);
+ }
+ if (SEQ_GT(th_ack, hs->min))
+ hs->min = th_ack;
+ /*
+ * Should we trim it for the caller?
+	 * They may have done so already, which is ok...
+ */
+ if (SEQ_GT(th_ack, rsm->r_start)) {
+ rsm->r_start = th_ack;
+ }
+ return (0);
+}
+
diff --git a/sys/netinet/tcp_stacks/tailq_hash.h b/sys/netinet/tcp_stacks/tailq_hash.h
new file mode 100644
index 000000000000..ae8d3e00f558
--- /dev/null
+++ b/sys/netinet/tcp_stacks/tailq_hash.h
@@ -0,0 +1,73 @@
+#ifndef __tailq_hash__
+#define __tailq_hash__
+
+/* Must be powers of 2 */
+#define MAX_HASH_ENTRIES 128
+#define SEQ_BUCKET_SIZE 262144
+/*
+ * The max seq range that can be stored is
+ * 127 x 262144, or roughly 32Meg (MAX_HASH_ENTRIES - 1
+ * buckets of SEQ_BUCKET_SIZE). We keep one extra slot for
+ * fall-over so the hash never wraps onto other valid entries.
+ */
+#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1))
+
+struct tailq_hash {
+ struct rack_head ht[MAX_HASH_ENTRIES];
+ uint32_t min;
+ uint32_t max;
+ uint32_t count;
+};
+
+struct rack_sendmap *
+tqhash_min(struct tailq_hash *hs);
+
+struct rack_sendmap *
+tqhash_max(struct tailq_hash *hs);
+
+int
+tqhash_empty(struct tailq_hash *hs);
+
+struct rack_sendmap *
+tqhash_find(struct tailq_hash *hs, uint32_t seq);
+
+struct rack_sendmap *
+tqhash_next(struct tailq_hash *hs, struct rack_sendmap *rsm);
+
+struct rack_sendmap *
+tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm);
+
+#define REMOVE_TYPE_CUMACK 1 /* Cumack moved */
+#define REMOVE_TYPE_MERGE 2 /* Merging two blocks */
+#define REMOVE_TYPE_FINI 3 /* The connection is over */
+
+void
+tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type);
+
+int
+tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm);
+
+void
+tqhash_init(struct tailq_hash *hs);
+
+int
+tqhash_trim(struct tailq_hash *hs, uint32_t th_ack);
+
+
+#define TQHASH_FOREACH(var, head) \
+ for ((var) = tqhash_min((head)); \
+ (var); \
+ (var) = tqhash_next((head), (var)))
+
+#define TQHASH_FOREACH_FROM(var, head, fvar) \
+ for ((var) = ((fvar) ? (fvar) : tqhash_min((head))); \
+ (var); \
+ (var) = tqhash_next((head), (var)))
+
+#define TQHASH_FOREACH_REVERSE_FROM(var, head) \
+ for ((var) = ((var) ? (var) : tqhash_max((head))); \
+ (var); \
+ (var) = tqhash_prev((head), (var)))
+
+
+#endif
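As a quick sanity check of the constants above, the following standalone userland program (not part of the commit) reproduces the bucket-index arithmetic used by tqhash_insert()/tqhash_find() and the RACK_STRADDLE condition; the sample sequence numbers are made up for illustration.

/* Standalone sketch; the defines mirror tailq_hash.h, the rest is illustrative. */
#include <stdint.h>
#include <stdio.h>

#define MAX_HASH_ENTRIES 128
#define SEQ_BUCKET_SIZE 262144
#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES - 1))

static uint32_t
seq_to_bucket(uint32_t seq)
{
	/* Same computation as rsm->bindex in tqhash_insert(). */
	return ((seq / SEQ_BUCKET_SIZE) % MAX_HASH_ENTRIES);
}

int
main(void)
{
	uint32_t r_start = 262000;		/* hypothetical rsm start */
	uint32_t r_end = r_start + 1448;	/* one segment, end is 1 beyond */

	printf("start bucket %u, end bucket %u%s\n",
	    seq_to_bucket(r_start), seq_to_bucket(r_end),
	    (seq_to_bucket(r_start) != seq_to_bucket(r_end)) ?
	    " -> RACK_STRADDLE would be set" : "");
	printf("max trackable range: %d bytes\n", MAX_ALLOWED_SEQ_RANGE);
	return (0);
}

With 128 buckets of 256 KB each and one kept spare, the tracked send range must stay under roughly 32 MB; tqhash_insert() returns -1 once r_end - min exceeds MAX_ALLOWED_SEQ_RANGE.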
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 6f447d5ea470..fbf7b4bddda9 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -45,6 +45,11 @@
#define RACK_SENT_FP 0x004000/* sent in fast path */
#define RACK_HAD_PUSH 0x008000/* Push was sent on original send */
#define RACK_MUST_RXT 0x010000/* We must retransmit this rsm (non-sack/mtu chg)*/
+#define RACK_IN_GP_WIN 0x020000/* Send was in GP window when sent */
+#define RACK_SHUFFLED 0x040000/* The RSM was shuffled some data from one to another */
+#define RACK_MERGED 0x080000/* The RSM was merged */
+#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */
+#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@@ -52,7 +57,9 @@
#define RACK_REQ_AVG 3 /* Must be less than 256 */
struct rack_sendmap {
+ TAILQ_ENTRY(rack_sendmap) next;
TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
+ uint32_t bindex;
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
@@ -60,7 +67,8 @@ struct rack_sendmap {
r_rtr_cnt : 8; /* Retran count, index this -1 to get time */
struct mbuf *m;
uint32_t soff;
- uint32_t orig_m_len;
+ uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */
+ uint32_t orig_t_space; /* The original trailing space when we sent (can update) */
uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */
uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
@@ -72,8 +80,8 @@ struct rack_sendmap {
r_avail : 4;
uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
- RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_fas; /* Flight at send */
+ uint8_t r_bas; /* The burst size (burst at send = bas) */
};
struct deferred_opt_list {
@@ -201,11 +209,11 @@ struct rack_opts_stats {
uint64_t tcp_rack_pace_rate_ss;
uint64_t tcp_rack_pace_rate_rec;
/* Temp counters for dsack */
- uint64_t tcp_sack_path_1;
- uint64_t tcp_sack_path_2a;
- uint64_t tcp_sack_path_2b;
- uint64_t tcp_sack_path_3;
- uint64_t tcp_sack_path_4;
+ uint64_t tcp_sack_path_1; /* not used */
+ uint64_t tcp_sack_path_2a; /* not used */
+ uint64_t tcp_sack_path_2b; /* not used */
+ uint64_t tcp_sack_path_3; /* not used */
+ uint64_t tcp_sack_path_4; /* not used */
/* non temp counters */
uint64_t tcp_rack_scwnd;
uint64_t tcp_rack_noprr;
@@ -227,11 +235,16 @@ struct rack_opts_stats {
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
- uint64_t tcp_rack_fastrsm_hack;
+ uint64_t tcp_rxt_clamp;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
uint64_t tcp_rack_dsack_opt;
+ uint64_t tcp_rack_hi_beta;
+ uint64_t tcp_split_limit;
+ uint64_t tcp_rack_pacing_divisor;
+ uint64_t tcp_rack_min_seg;
+ uint64_t tcp_dgp_in_rec;
};
/* RTT shrink reasons */
@@ -261,38 +274,6 @@ struct rack_opts_stats {
#define RACK_QUALITY_PROBERTT 4 /* A measurement where we went into or exited probe RTT */
#define RACK_QUALITY_ALLACKED 5 /* All data is now acknowledged */
-/*********************/
-/* Rack Trace points */
-/*********************/
-/*
- * Rack trace points are interesting points within
- * the rack code that the author/debugger may want
- * to have BB logging enabled if we hit that point.
- * In order to enable a trace point you set the
- * sysctl var net.inet.tcp.<stack>.tp.number to
- * one of the numbers listed below. You also
- * must make sure net.inet.tcp.<stack>.tp.bbmode is
- * non-zero, the default is 4 for continuous tracing.
- * You also set in the number of connections you want
- * have get BB logs in net.inet.tcp.<stack>.tp.count.
- *
- * Count will decrement every time BB logging is assigned
- * to a connection that hit your tracepoint.
- *
- * You can enable all trace points by setting the number
- * to 0xffffffff. You can disable all trace points by
- * setting number to zero (or count to 0).
- *
- * Below are the enumerated list of tracepoints that
- * have currently been defined in the code. Add more
- * as you add a call to rack_trace_point(rack, <name>);
- * where <name> is defined below.
- */
-#define RACK_TP_HWENOBUF 0x00000001 /* When we are doing hardware pacing and hit enobufs */
-#define RACK_TP_ENOBUF 0x00000002 /* When we hit enobufs with software pacing */
-#define RACK_TP_COLLAPSED_WND 0x00000003 /* When a peer to collapses its rwnd on us */
-#define RACK_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
-
#define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */
#ifdef _KERNEL
#define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t))
@@ -356,14 +337,17 @@ struct rack_fast_send_blk {
struct udphdr *udp;
struct mbuf *m;
uint32_t o_m_len;
+ uint32_t o_t_len;
uint32_t rfo_apply_push : 1,
hw_tls : 1,
unused : 30;
};
+struct tailq_hash;
+
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
- struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
+ struct tailq_hash *tqh; /* Tree of all segments Lock(a) */
struct rack_head rc_tmap; /* List in transmit order Lock(a) */
struct rack_sendmap *rc_tlpsend; /* Remembered place for
* tlp_sending Lock(a) */
@@ -371,8 +355,8 @@ struct rack_control {
* resend */
struct rack_fast_send_blk fsb; /* The fast-send block */
uint32_t timer_slop;
- uint32_t input_pkt;
- uint32_t saved_input_pkt;
+ uint16_t pace_len_divisor;
+ uint16_t rc_user_set_min_segs;
uint32_t rc_hpts_flags;
uint32_t rc_fixed_pacing_rate_ca;
uint32_t rc_fixed_pacing_rate_rec;
@@ -387,6 +371,7 @@ struct rack_control {
uint64_t last_hw_bw_req;
uint64_t crte_prev_rate;
uint64_t bw_rate_cap;
+ uint64_t last_cumack_advance; /* Last time cumack moved forward */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP
@@ -401,6 +386,7 @@ struct rack_control {
uint32_t last_sent_tlp_seq; /* Last tlp sequence that was retransmitted Lock(a) */
uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
+
uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */
uint16_t last_sent_tlp_len; /* Number of bytes in the last sent tlp */
@@ -418,6 +404,7 @@ struct rack_control {
* have allocated */
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
+ uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
@@ -462,7 +449,20 @@ struct rack_control {
uint64_t last_max_bw; /* Our calculated max b/w last */
struct time_filter_small rc_gp_min_rtt;
struct def_opt_head opt_list;
+ uint64_t lt_bw_time; /* Total time with data outstanding (lt_bw = long term bandwidth) */
+ uint64_t lt_bw_bytes; /* Total bytes acked */
+ uint64_t lt_timemark; /* 64 bit timestamp when we started sending */
+ struct http_sendfile_track *rc_last_sft;
+ uint32_t lt_seq; /* Seq at start of lt_bw gauge */
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
+ uint64_t last_sndbytes;
+ uint64_t last_snd_rxt_bytes;
+ uint64_t rxt_threshold;
+ uint32_t last_rnd_rxt_clamped;
+ uint32_t num_of_clamps_applied;
+ uint32_t clamp_options;
+ uint32_t max_clamps;
+
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
@@ -502,6 +502,10 @@ struct rack_control {
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
uint32_t persist_lost_ends;
+ uint32_t ack_during_sd;
+ uint32_t input_pkt;
+ uint32_t saved_input_pkt;
+ uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -516,6 +520,8 @@ struct rack_control {
*/
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint8_t rack_per_upper_bound_ss;
+ uint8_t rack_per_upper_bound_ca;
uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
@@ -523,9 +529,55 @@ struct rack_control {
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
+ uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */
+ uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
+ uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
+ uint8_t pacing_discount_amm; /*
+					 * This is a multiplier to the base discount that
+ * can be used to increase the discount.
+ */
+ uint8_t already_had_a_excess;
};
#endif
+/* DGP with no buffer level mitigations */
+#define DGP_LEVEL0 0
+
+/*
+ * DGP with buffer level mitigation where BL:4 caps fillcw and BL:5
+ * turns off fillcw.
+ */
+#define DGP_LEVEL1 1
+
+/*
+ * DGP with buffer level mitigation where BL:3 caps fillcw and BL:4 turns off fillcw
+ * and BL:5 reduces by 10%
+ */
+#define DGP_LEVEL2 2
+
+/*
+ * DGP with buffer level mitigation where BL:2 caps fillcw and BL:3 turns off
+ * fillcw, BL:4 reduces by 10% and BL:5 reduces by 20%
+ */
+#define DGP_LEVEL3 3
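Read together, the four levels form a small table keyed by the client buffer level (BL). The sketch below merely restates the comments above in code form; the action macros and helper are invented for illustration and are not part of this commit.

/* Hypothetical restatement of the DGP buffer-level table described above. */
#define DGP_ACT_NONE	0	/* no mitigation */
#define DGP_ACT_CAP	1	/* cap fillcw */
#define DGP_ACT_OFF	2	/* turn fillcw off */
#define DGP_ACT_RED10	3	/* reduce by 10% */
#define DGP_ACT_RED20	4	/* reduce by 20% */

static int
dgp_bl_action(int dgp_level, int bl)
{
	switch (dgp_level) {
	case DGP_LEVEL1:
		if (bl >= 5) return (DGP_ACT_OFF);
		if (bl == 4) return (DGP_ACT_CAP);
		break;
	case DGP_LEVEL2:
		if (bl >= 5) return (DGP_ACT_RED10);
		if (bl == 4) return (DGP_ACT_OFF);
		if (bl == 3) return (DGP_ACT_CAP);
		break;
	case DGP_LEVEL3:
		if (bl >= 5) return (DGP_ACT_RED20);
		if (bl == 4) return (DGP_ACT_RED10);
		if (bl == 3) return (DGP_ACT_OFF);
		if (bl == 2) return (DGP_ACT_CAP);
		break;
	case DGP_LEVEL0:
	default:
		break;
	}
	return (DGP_ACT_NONE);
}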
+
+/* Hybrid pacing log defines */
+#define HYBRID_LOG_NO_ROOM 0 /* No room for the clients request */
+#define HYBRID_LOG_TURNED_OFF 1 /* Turned off hybrid pacing */
+#define HYBRID_LOG_NO_PACING 2 /* Failed to set pacing on */
+#define HYBRID_LOG_RULES_SET 3 /* Hybrid pacing for this chunk is set */
+#define HYBRID_LOG_NO_RANGE 4 /* In DGP mode, no range found */
+#define HYBRID_LOG_RULES_APP 5 /* The specified rules were applied */
+#define HYBRID_LOG_REQ_COMP 6 /* The request completed */
+#define HYBRID_LOG_BW_MEASURE 7 /* Follow up b/w measurements to the previous completed log */
+#define HYBRID_LOG_RATE_CAP 8 /* We had a rate cap apply */
+#define HYBRID_LOG_CAP_CALC 9 /* How we calculate the cap */
+#define HYBRID_LOG_ISSAME 10 /* Same as before -- temp */
+#define HYBRID_LOG_ALLSENT 11 /* We sent it all no more rate-cap */
+#define HYBRID_LOG_OUTOFTIME 12 /* We are past the deadline DGP */
+#define HYBRID_LOG_CAPERROR 13 /* Hit one of the TSNH cases */
+#define HYBRID_LOG_EXTEND 14 /* We extended the end */
+
#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */
#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */
@@ -558,11 +610,11 @@ struct tcp_rack {
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
rc_ack_required: 1,
- spare : 1;
+ r_pacing_discount : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
- fast_rsm_hack: 1,
+ excess_rxt_on: 1, /* Are actions on for excess retransmissions? */
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
@@ -590,7 +642,8 @@ struct tcp_rack {
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
probe_not_answered: 1,
- avail_bytes : 2;
+ rack_hibeta : 1,
+ lt_bw_up : 1;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
@@ -616,11 +669,15 @@ struct tcp_rack {
r_use_labc_for_rec: 1,
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
- rc_dack_mode : 1, /* Mac O/S emulation of d-ack */
- rc_dack_toggle : 1, /* For Mac O/S emulation of d-ack */
+ rc_pace_dnd : 1, /* The pace do not disturb bit */
+ rc_avali2 : 1,
rc_gp_filled : 1,
- rc_is_spare : 1;
- uint8_t r_state; /* Current rack state Lock(a) */
+ rc_hw_nobuf : 1;
+ uint8_t r_state : 4, /* Current rack state Lock(a) */
+ rc_catch_up : 1, /* catch up mode in dgp */
+ rc_hybrid_mode : 1, /* We are in hybrid mode */
+ rc_suspicious : 1, /* Suspect sacks have been given */
+ rc_new_rnd_needed: 1;
uint8_t rc_tmr_stopped : 7,
t_timers_stopped : 1;
uint8_t rc_enobuf : 7, /* count of enobufs on connection provides */
@@ -636,8 +693,8 @@ struct tcp_rack {
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
- r_rep_attack : 1,
- r_rep_reverse : 1,
+ r_cwnd_was_clamped : 1,
+ r_clamped_gets_lower : 1,
rack_hdrw_pacing : 1, /* We are doing Hardware pacing */
rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */
rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */
@@ -660,8 +717,8 @@ struct tcp_rack {
r_wanted_output: 1,
r_rr_config : 2,
r_persist_lt_bw_off : 1,
- r_collapse_point_valid : 1,
- rc_avail_bit : 2;
+ r_collapse_point_valid : 1,
+ dgp_on : 1;
uint16_t rc_init_win : 8,
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 1f2256c6b6f9..fcd430f270f3 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -158,6 +158,11 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+int32_t tcp_sad_limit = 10000;
+SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+ CTLFLAG_RW,
+ &tcp_sad_limit, 10000,
+ "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");
int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
CTLFLAG_RW,