/*
* Copyright (c) 2020-2021 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include "tcp_includes.h"
#define REORDERING_WINDOW_FLOOR (2) /* If min_rtt is too small, at least wait for a reordering window of 2ms */
/* RACK loss detection is implemented following RFC 8985 */
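/*
* Record RACK transmit state for a (re)transmitted segment [start, end)
* sent at xmit_ts (a descriptive summary of the function below)
*/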
void
tcp_rack_transmit_seg(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
uint32_t xmit_ts, uint8_t flags)
{
seg->start_seq = start;
seg->end_seq = end;
seg->xmit_ts = xmit_ts;
/*
* At the start of a (re)transmission round, set dsack_round_end
* to the smallest sequence number sent so far in the round
*/
if (SEQ_LT(start, tp->rack.dsack_round_end)) {
tp->rack.dsack_round_end = start;
}
seg->flags |= flags;
if (seg->flags & TCP_RACK_RETRANSMITTED) {
tp->bytes_retransmitted += tcp_seg_len(seg);
}
/*
* Set segs_retransmitted ONLY when it is not already set; otherwise a new
* (non-retransmitted) segment could clear it while retransmitted segments
* are still outstanding
*/
if (!tp->rack.segs_retransmitted) {
tp->rack.segs_retransmitted = !!(flags & TCP_RACK_RETRANSMITTED);
}
}
/* Returns true if segment (t1, seq1) was sent after segment (t2, seq2) */
static bool
tcp_rack_sent_after(uint32_t t1, uint32_t seq1, uint32_t t2, uint32_t seq2)
{
/* Timestamps are in ms and can wrap, so use a wraparound-safe comparison */
return TSTMP_GT(t1, t2) || (t1 == t2 && SEQ_GT(seq1, seq2));
}
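/*
* Decay the reordering window persistence counter; per RFC 8985 the inflated
* reordering window is kept for up to 16 loss recoveries before being reset
*/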
void
tcp_rack_update_reordering_win_persist(struct tcpcb *tp)
{
if (tp->rack.reo_wnd_persist != 0) {
tp->rack.reo_wnd_persist--;
}
}
void
tcp_rack_bad_rexmt_restore(struct tcpcb *tp)
{
/* Force RACK to re-examine losses */
tp->rack.advanced = 1;
/* Restore reordering window persist value */
tp->rack.reo_wnd_persist = MIN(tp->rack.reo_wnd_persist + 1,
TCP_RACK_RECOVERY_PERSIST_MAX);
}
void
tcp_rack_reset_segs_retransmitted(struct tcpcb *tp)
{
tp->rack.segs_retransmitted = false;
}
/* MUST be called before we have processed dup ACKs and made a decision to enter recovery */
static uint32_t
tcp_rack_reordering_window(struct tcpcb *tp, uint32_t dup_acks, bool in_rto)
{
if (tp->t_reordered_pkts == 0) {
/*
* When no reordering has been observed, RACK.reo_wnd is set to 0
* during both fast recovery and RTO recovery, as well as when we are
* about to enter fast recovery because SACKed segments/dup ACKs have
* reached DupThresh. With reo_wnd set to 0, a segment is declared
* lost once RACK.RTT has elapsed since it was sent.
*/
if (IN_FASTRECOVERY(tp) || dup_acks >= tp->t_rexmtthresh || in_rto) {
return 0;
}
}
/*
* reordering window = reo_wnd_multi * min_RTT/4,
* capped at SRTT and floored at REORDERING_WINDOW_FLOOR below.
*/
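/*
* Illustrative numbers (hypothetical): with reo_wnd_multi == 1,
* min_RTT == 40ms and SRTT == 60ms, this yields
* MIN(1 * (40 >> 2), 60) = 10ms; the SRTT cap only takes effect once
* the multiplier has grown sufficiently large.
*/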
uint32_t reordering_window = MIN((tp->rack.reo_wnd_multi * (get_base_rtt(tp) >> 2)),
(uint32_t)(tp->t_srtt >> TCP_RTT_SHIFT));
reordering_window = MAX(reordering_window, REORDERING_WINDOW_FLOOR);
return reordering_window;
}
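/*
* Returns 0 if the segment was skipped or has just been marked lost;
* otherwise returns the remaining time (in ms) until the segment can be
* declared lost.
*/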
static uint32_t
tcp_rack_detect_segment_lost(struct tcpcb *tp, struct tcp_seg_sent *seg,
uint32_t reordering_window, bool *loss_detected)
{
/* Skip segments that are already marked lost but not yet retransmitted */
if (seg->flags & TCP_SEGMENT_LOST &&
!(seg->flags & TCP_RACK_RETRANSMITTED)) {
return 0;
}
if (seg->flags & TCP_SEGMENT_SACKED) {
return 0;
}
/* After the segment is sent, wait for (RTT + reordering window) */
uint32_t wait_ts = seg->xmit_ts + tp->rack.rtt + reordering_window;
if (TSTMP_GEQ(tcp_now, wait_ts)) {
/*
* Mark the segment as lost since at least (RTT + reordering window)
* has elapsed since it was sent.
*/
tcp_mark_seg_lost(tp, seg);
if (loss_detected != NULL) {
*loss_detected = true;
}
return 0;
}
return wait_ts - tcp_now;
}
/*
* RFC 8985,
* Step 1: Update RACK.min_RTT (done in tcp_input)
* Step 2: Update the state for the most recently sent segment that has been delivered.
*/
void
tcp_rack_update_segment_acked(struct tcpcb *tp, uint32_t tsecr,
uint32_t xmit_ts, uint32_t end_seq,
bool retransmitted)
{
/*
* Step 1 (update RACK.min_RTT) is done in tcp_input ACK processing
*/
uint32_t rtt = tcp_now - xmit_ts;
if (rtt == 0) {
/*
* As rtt has millisecond precision,
* adjust for sub-millisecond RTTs
*/
rtt = 1;
}
/*
* RFC 8985 - An ACK can acknowledge retransmitted data and, because
* retransmissions can be spurious, ACKs for such retransmitted segments
* must be ignored. Ignore a segment if any part of its sequence range has
* been retransmitted before and either of the following is true:
* 1. The TSecr of the ACK's timestamp option (if available) indicates the
*    ACK was not acknowledging the last (re)transmission, or TSecr is
*    invalid (greater than tcp_now).
* 2. TSecr is not available or the ACK arrived immediately after the last
*    retransmission, i.e. the segment was last (re)transmitted less than
*    RACK.min_rtt ago.
*/
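/*
* Illustration (hypothetical timestamps): if the original transmission was
* at xmit_ts 100 and the retransmission at 200, an ACK carrying TSecr 100
* echoes the original transmission, so TSTMP_LT(tsecr, xmit_ts) holds and
* the ACK is ignored for RACK purposes below.
*/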
if (retransmitted) {
if ((tsecr != 0 && (TSTMP_LT(tsecr, xmit_ts) || TSTMP_GT(tsecr, tcp_now)))
|| rtt < get_base_rtt(tp)) {
// Spurious inference
TCP_LOG(tp, "Spurious inference as either "
"tsecr (%u) doesn't lie between xmit_ts(%u) and now (%u) OR "
"the rtt (%u) is less than base-rtt (%u). end_seq is:%u",
tsecr, xmit_ts, tcp_now, rtt, get_base_rtt(tp), end_seq);
return;
}
}
if (tcp_rack_sent_after(xmit_ts, end_seq, tp->rack.xmit_ts, tp->rack.end_seq)) {
tp->rack.advanced = 1;
tp->rack.xmit_ts = xmit_ts;
tp->rack.end_seq = end_seq;
tp->rack.rtt = rtt;
/* Cancel the RACK reordering timer as we have received a new ACK */
tp->t_timer[TCPT_REORDER] = 0;
}
}
/*
* Step 3: Reordering detection is done in tcp_sack_detect_reordering
* Step 4: Update the RACK reordering window.
*/
void
tcp_rack_update_reordering_window(struct tcpcb *tp, tcp_seq th_ack)
{
/*
* RACK.reo_wnd starts with a value of RACK.min_RTT/4. After that, RACK
* dynamically adapts to higher degrees of reordering using DSACK
* option from the receiver.
* To deal with temporary reordering, RACK persists using the inflated
* RACK.reo_wnd for up to 16 loss recoveries, after which it resets
* RACK.reo_wnd to its starting value.
*/
/*
* Ignore the DSACK if an RTT hasn't passed since the current DSACK round
* started, i.e. th_ack is still below the previous dsack_round_end
*/
if (SEQ_LT(th_ack, tp->rack.dsack_round_end)) {
tp->rack.dsack_round_seen = 0;
}
/*
* Start of a new DSACK round.
* Grow the reordering window once per round that sees a DSACK on an ACK.
* The inflated reordering window persists for 16 loss recoveries (that
* don't receive a DSACK). On receiving a DSACK, we reset the persist
* count to 16, as it indicates that reordering is still happening.
*/
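/*
* For illustration (hypothetical progression): starting from
* reo_wnd_multi == 1, a first DSACK round grows the window to
* 2 * min_RTT/4 and a second to 3 * min_RTT/4; if 16 consecutive loss
* recoveries then pass without a DSACK, the multiplier is reset to 1 below.
*/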
if (tp->rack.dsack_round_seen == 1) {
tp->rack.dsack_round_seen = 0;
tp->rack.dsack_round_end = tp->snd_nxt;
tp->rack.reo_wnd_multi = (uint8_t)(min(0xFF, tp->rack.reo_wnd_multi + 1));
tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_PERSIST_MAX;
} else if (tp->rack.reo_wnd_persist == 0) {
tp->rack.reo_wnd_multi = 1;
}
}
/*
* Step 5: Detect losses.
* Call this only after SACK/ACK processing, so that acked segments have
* been removed and SACKed segments marked accordingly
*/
static uint32_t
tcp_rack_detect_loss(struct tcpcb *tp, uint32_t dup_acks, bool *loss_detected)
{
struct tcp_seg_sent *seg = NULL;
uint32_t reordering_timeout = 0;
uint32_t reordering_window = tcp_rack_reordering_window(tp, dup_acks, false);
TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
/*
* RACK.segment (the most recently delivered segment) was not sent
* after this segment, so this segment and everything sent later
* can't be judged for loss yet
*/
if (!tcp_rack_sent_after(tp->rack.xmit_ts, tp->rack.end_seq,
seg->xmit_ts, seg->end_seq)) {
break;
}
uint32_t remaining = tcp_rack_detect_segment_lost(tp, seg, reordering_window, loss_detected);
if (remaining) {
/*
* We only want to arm the timer at max wait time as we are
* expecting to get ACKs to do RACK processing. Only in the
* worst case, when we don't receive ACKs, we set the timeout
* to be the wait time for the most recently sent packet.
*/
reordering_timeout = max(remaining, reordering_timeout);
}
}
return reordering_timeout;
}
/*
* Call during input processing to detect loss.
* If loss is detected, enter_fr will be true and
* tcp_input will enter fast recovery
*/
bool
tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks)
{
uint32_t reordering_timeout = 0;
bool loss_detected = false;
if (!tp->rack.advanced) {
return false;
}
/* Cancel any existing RACK reordering timer as we will re-arm it below if needed */
tp->t_timer[TCPT_REORDER] = 0;
reordering_timeout = tcp_rack_detect_loss(tp, dup_acks, &loss_detected);
if (reordering_timeout) {
tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp,
reordering_timeout + REORDERING_WINDOW_FLOOR);
/* Since losses can be marked at a future point, clear the TLP timer */
tp->t_timer[TCPT_PTO] = 0;
} else {
/* Cancel any pending timers */
tp->t_timer[TCPT_REORDER] = 0;
}
return loss_detected;
}
/* Reordering timeout has expired, detect loss and enter recovery */
void
tcp_rack_reordering_timeout(struct tcpcb *tp, uint32_t dup_acks)
{
bool enter_fr = false;
tcp_rack_detect_loss(tp, dup_acks, &enter_fr);
if (enter_fr) {
/* Some packets have been marked as lost */
if (!IN_FASTRECOVERY(tp)) {
tcp_rexmt_save_state(tp);
tcp_enter_fast_recovery(tp);
}
tcpstat.tcps_rack_reordering_timeout_recovery_episode++;
tp->t_rack_reo_timeout_recovery_episode++;
tcp_output(tp);
}
}
void
tcp_rack_loss_on_rto(struct tcpcb *tp, bool in_rto)
{
struct tcp_seg_sent *seg = NULL;
uint32_t reordering_window = tcp_rack_reordering_window(tp, 0, in_rto);
TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
/* Mark the first unacknowledged segment as lost */
if (seg->start_seq == tp->snd_una) {
tcp_mark_seg_lost(tp, seg);
}
/*
* Mark as lost any segment for which the time elapsed since it was
* transmitted is at least (RTT + reordering window)
*/
tcp_rack_detect_segment_lost(tp, seg, reordering_window, NULL);
}
}
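/*
* Walk the sent segments in sequence order: advance snd_nxt past any SACKed
* block it falls into and accumulate the length of un-SACKed segments ending
* beyond snd_nxt, stopping once cwin is covered.
*/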
uint32_t
tcp_rack_adjust(struct tcpcb *tp, uint32_t cwin)
{
uint32_t max_len = 0;
struct tcp_seg_sent *seg = NULL;
/*
* We traverse the RB tree (instead of the time-ordered list)
* as it is faster for finding a seg such that
* seg->start_seq <= snd_nxt < seg->end_seq
*/
RB_FOREACH(seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
if (max_len >= cwin) {
break;
}
if (seg->flags & TCP_SEGMENT_SACKED) {
if (SEQ_LT(tp->snd_nxt, seg->end_seq) &&
SEQ_GEQ(tp->snd_nxt, seg->start_seq)) {
tp->snd_nxt = seg->end_seq;
}
continue;
}
if (SEQ_LT(tp->snd_nxt, seg->end_seq)) {
max_len += tcp_seg_len(seg);
}
}
return max_len;
}
/* This function is only used during retransmissions. */
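/*
* It returns the first segment in the time-ordered list that is marked lost
* and has not yet been retransmitted; *rack_seg_len is capped to
* min(cwin, t_maxseg) since retransmissions are sent one MSS at a time.
*/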
struct tcp_seg_sent *
tcp_rack_output(struct tcpcb *tp, uint32_t cwin, uint16_t *rack_seg_len)
{
struct tcp_seg_sent *seg = NULL;
TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
if (seg->flags & TCP_SEGMENT_SACKED) {
continue;
}
if (seg->flags & TCP_SEGMENT_LOST && !(seg->flags & TCP_RACK_RETRANSMITTED)) {
/* We don't do TSO for retransmissions and only send MSS-sized segments */
uint16_t allowed_size = (uint16_t)min(cwin, tp->t_maxseg);
/*
* When the entire segment can be retransmitted, the lost
* segment is moved to the end of the time-ordered list in
* tcp_seg_sent_insert.
*
* When the entire segment can't be retransmitted, seg->start_seq
* is advanced by the amount of data retransmitted, also in
* tcp_seg_sent_insert.
*/
*rack_seg_len = tcp_seg_len(seg) <= allowed_size ?
(uint16_t)tcp_seg_len(seg) : allowed_size;
break;
}
}
return seg;
}
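/*
* Illustrative use from the retransmission path (a hypothetical sketch, not
* the actual tcp_output() logic):
*
*	uint16_t rack_len = 0;
*	struct tcp_seg_sent *seg = tcp_rack_output(tp, cwin, &rack_len);
*	if (seg != NULL)
*		...retransmit rack_len bytes starting at seg->start_seq...
*/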
/*
* Check if a retransmitted segment was completely covered by the received
* (first) DSACK block
*/
void
tcp_rack_detect_reordering_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end)
{
struct tcp_seg_sent *seg = NULL;
TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
if (seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) {
if (SEQ_LEQ(start, seg->start_seq) && SEQ_GEQ(end, seg->end_seq)) {
tp->t_reordered_pkts++;
}
}
}
}
void
tcp_rack_detect_reordering_acked(struct tcpcb *tp, struct tcp_seg_sent *seg)
{
/*
* A never-retransmitted segment below snd_fack was delivered,
* which indicates reordering. Ignore segments that have already
* been SACKed before
*/
if (SEQ_LT(seg->end_seq, tp->snd_fack) &&
(seg->flags & (TCP_SEGMENT_SACKED | TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)) == 0) {
tp->t_reordered_pkts++;
}
}