/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

#define REORDERING_WINDOW_FLOOR (2) /* If min_rtt is too small, at least wait for a reordering window of 2ms */

/* RACK is implemented by following RFC 8985 */
void
tcp_rack_transmit_seg(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, tcp_seq end,
    uint32_t xmit_ts, uint8_t flags)
{
	seg->start_seq = start;
	seg->end_seq = end;
	seg->xmit_ts = xmit_ts;

	/*
	 * At the start of a (re)transmission round, set dsack_round_end
	 * to the smallest sequence number sent in the round
	 */
	if (SEQ_LT(start, tp->rack.dsack_round_end)) {
		tp->rack.dsack_round_end = start;
	}
	seg->flags |= flags;
	if (seg->flags & TCP_RACK_RETRANSMITTED) {
		tp->bytes_retransmitted += tcp_seg_len(seg);
	}
	/*
	 * Set segs_retransmitted ONLY when it is not already set; otherwise
	 * new segments could clear it even while retransmitted segments
	 * are still outstanding
	 */
	if (!tp->rack.segs_retransmitted) {
		tp->rack.segs_retransmitted = !!(flags & TCP_RACK_RETRANSMITTED);
	}
}

/* Returns true if segment (t1, seq1) was sent after segment (t2, seq2) */
static bool
tcp_rack_sent_after(uint32_t t1, uint32_t seq1, uint32_t t2, uint32_t seq2)
{
	return (t1 > t2) || (t1 == t2 && SEQ_GT(seq1, seq2));
}
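
/*
 * Worked example with hypothetical values: if both segments carry
 * xmit_ts = 1000, the timestamps tie and the sequence numbers break
 * the tie, so (1000, seq 5000) counts as sent after (1000, seq 3000).
 * The tiebreak matters because several segments transmitted in one
 * burst can share the same millisecond timestamp.
 */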

void
tcp_rack_update_reordering_win_persist(struct tcpcb *tp)
{
	if (tp->rack.reo_wnd_persist != 0) {
		tp->rack.reo_wnd_persist--;
	}
}

void
tcp_rack_bad_rexmt_restore(struct tcpcb *tp)
{
	/* Force RACK to re-examine losses */
	tp->rack.advanced = 1;

	/* Restore reordering window persist value */
	tp->rack.reo_wnd_persist = MIN(tp->rack.reo_wnd_persist + 1,
	    TCP_RACK_RECOVERY_PERSIST_MAX);
}

void
tcp_rack_reset_segs_retransmitted(struct tcpcb *tp)
{
	tp->rack.segs_retransmitted = false;
}

/* MUST be called before we have processed dup ACKs and made a decision to enter recovery */
static uint32_t
tcp_rack_reordering_window(struct tcpcb *tp, uint32_t dup_acks, bool in_rto)
{
	if (tp->t_reordered_pkts == 0) {
		/*
		 * When no reordering has been observed, RACK.reo_wnd is set
		 * to 0 both during fast recovery and RTO recovery, as well as
		 * when we are entering fast recovery because SACKed
		 * segments/dup ACKs have reached DupThresh.
		 * When reo_wnd is 0, a segment is detected as lost once
		 * RACK.RTT has elapsed since it was sent.
		 */
		if (IN_FASTRECOVERY(tp) || dup_acks >= tp->t_rexmtthresh || in_rto) {
			return 0;
		}
	}
	/*
	 * reordering window = N * Min_RTT/4,
	 * limited to a max value of SRTT.
	 */
	uint32_t reordering_window = MIN((tp->rack.reo_wnd_multi * (get_base_rtt(tp) >> 2)),
	    (uint32_t)(tp->t_srtt >> TCP_RTT_SHIFT));
	reordering_window = MAX(reordering_window, REORDERING_WINDOW_FLOOR);

	return reordering_window;
}
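
/*
 * Worked example with hypothetical values: with reo_wnd_multi = 1,
 * min_rtt = 40ms and srtt = 50ms, the window is
 * min(1 * 40/4, 50) = 10ms. Were min_rtt only 4ms, the 1ms result
 * would be raised to the 2ms REORDERING_WINDOW_FLOOR.
 */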

static uint32_t
tcp_rack_detect_segment_lost(struct tcpcb *tp, struct tcp_seg_sent *seg,
    uint32_t reordering_window, bool *loss_detected)
{
	/* Skip segments already marked lost but not yet retransmitted */
	if (seg->flags & TCP_SEGMENT_LOST &&
	    !(seg->flags & TCP_RACK_RETRANSMITTED)) {
		return 0;
	}

	if (seg->flags & TCP_SEGMENT_SACKED) {
		return 0;
	}

	/* After the segment is sent, wait for (RTT + reordering window) */
	uint32_t wait_ts = seg->xmit_ts + tp->rack.rtt + reordering_window;
	if (TSTMP_GEQ(tcp_now, wait_ts)) {
		/*
		 * Mark the segment as lost, as it was sent at least
		 * (RTT + reordering window) ago
		 */
		tcp_mark_seg_lost(tp, seg);
		if (loss_detected != NULL) {
			*loss_detected = true;
		}
		return 0;
	}
	return wait_ts - tcp_now;
}
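
/*
 * Worked example with hypothetical values: for a segment with
 * xmit_ts = 1000, RACK.rtt = 40ms and a 10ms reordering window,
 * wait_ts is 1050. At tcp_now = 1050 the segment is marked lost and
 * 0 is returned; at tcp_now = 1030 the function returns the 20ms
 * remaining, which the caller uses to arm the reordering timer.
 */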

/*
 * RFC 8985,
 * Step 1: Update RACK.min_RTT (done in tcp_input)
 * Step 2: Update the state for the most recently sent segment that has been delivered.
 */
void
tcp_rack_update_segment_acked(struct tcpcb *tp, uint32_t tsecr,
    uint32_t xmit_ts, uint32_t end_seq,
    bool retransmitted)
{
	/*
	 * Step 1: Update RACK.min_RTT - is done in tcp_input ACK processing
	 */
	uint32_t rtt = tcp_now - xmit_ts;
	if (rtt == 0) {
		/*
		 * As rtt has millisecond precision,
		 * round a sub-millisecond RTT up to 1 ms
		 */
		rtt = 1;
	}
	/*
	 * RFC 8985 - An ACK can acknowledge retransmitted data, and because
	 * retransmissions can be spurious, ignore ACKs for such retransmitted
	 * segments. Ignore a segment if any of its sequence range has been
	 * retransmitted before and either of two conditions is true:
	 * 1. The TSecr of the ACK's timestamp option (if available) indicates
	 *    that the ACK was not acknowledging the last (re)transmission, or
	 *    the TSecr was invalid (greater than tcp_now).
	 * 2. If the TSecr is not available or the ACK arrived immediately after
	 *    the last retransmission, the segment was last (re)transmitted less
	 *    than RACK.min_rtt ago.
	 */
	if (retransmitted) {
		if ((tsecr != 0 && (TSTMP_LT(tsecr, xmit_ts) || TSTMP_GT(tsecr, tcp_now)))
		    || rtt < get_base_rtt(tp)) {
			// Spurious inference
			TCP_LOG(tp, "Spurious inference as either "
			    "tsecr (%u) doesn't lie between xmit_ts(%u) and now (%u) OR "
			    "the rtt (%u) is less than base-rtt (%u). end_seq is:%u",
			    tsecr, xmit_ts, tcp_now, rtt, get_base_rtt(tp), end_seq);
			return;
		}
	}

	if (tcp_rack_sent_after(xmit_ts, end_seq, tp->rack.xmit_ts, tp->rack.end_seq)) {
		tp->rack.advanced = 1;
		tp->rack.xmit_ts = xmit_ts;
		tp->rack.end_seq = end_seq;
		tp->rack.rtt = rtt;

		/* Cancel the RACK reordering timer as we have received a new ACK */
		tp->t_timer[TCPT_REORDER] = 0;
	}
}
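
/*
 * Worked example with hypothetical values: a segment retransmitted at
 * xmit_ts = 2000 is cumulatively ACKed at tcp_now = 2005 with
 * tsecr = 1990. As the tsecr predates the retransmission, the ACK was
 * triggered by the original transmission, so the 5ms sample is
 * discarded instead of polluting RACK.rtt with a spuriously low value.
 */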

/*
 * Step 3: Reordering detection is done in tcp_sack_detect_reordering
 * Step 4: Update the RACK reordering window.
 */
void
tcp_rack_update_reordering_window(struct tcpcb *tp, tcp_seq th_ack)
{
	/*
	 * RACK.reo_wnd starts with a value of RACK.min_RTT/4. After that, RACK
	 * dynamically adapts to higher degrees of reordering using DSACK
	 * option from the receiver.
	 * To deal with temporary reordering, RACK persists using the inflated
	 * RACK.reo_wnd for up to 16 loss recoveries, after which it resets
	 * RACK.reo_wnd to its starting value.
	 */

	/*
	 * Ignore the DSACK if an RTT hasn't passed, i.e. th_ack is below
	 * the previous dsack_round_end
	 */
	if (SEQ_LT(th_ack, tp->rack.dsack_round_end)) {
		tp->rack.dsack_round_seen = 0;
	}
	/*
	 * Start of a new DSACK round.
	 * Grow the reordering window once per round that sees a DSACK on an ACK.
	 * The inflated reordering window persists for 16 loss recoveries
	 * (that don't receive a DSACK). On receiving a DSACK, we reset the
	 * persist count to 16, as it indicates that reordering is still
	 * happening.
	 */
	if (tp->rack.dsack_round_seen == 1) {
		tp->rack.dsack_round_seen = 0;
		tp->rack.dsack_round_end = tp->snd_nxt;
		tp->rack.reo_wnd_multi = (uint8_t)(min(0xFF, tp->rack.reo_wnd_multi + 1));
		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_PERSIST_MAX;
	} else if (tp->rack.reo_wnd_persist == 0) {
		tp->rack.reo_wnd_multi = 1;
	}
}
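
/*
 * Worked example with hypothetical values: if reo_wnd_multi is 2 when
 * a DSACK arrives in a new round, reo_wnd_multi grows to 3 and
 * reo_wnd_persist is reset to 16. If 16 subsequent loss recoveries
 * then complete without any DSACK, reo_wnd_persist decays to 0 and
 * reo_wnd_multi falls back to its initial value of 1.
 */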

/*
 * Step 5: Detect losses.
 * Call this only after SACK/ACK processing, so that SACKed/ACKed
 * segments have been removed or marked accordingly
 */
static uint32_t
tcp_rack_detect_loss(struct tcpcb *tp, uint32_t dup_acks, bool *loss_detected)
{
	struct tcp_seg_sent *seg = NULL;
	uint32_t reordering_timeout = 0;
	uint32_t reordering_window = tcp_rack_reordering_window(tp, dup_acks, false);

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/*
		 * RACK.segment (the most recently delivered segment) was not
		 * sent after this segment, so neither this segment nor any
		 * segment sent after it can be marked lost yet
		 */
		if (!tcp_rack_sent_after(tp->rack.xmit_ts, tp->rack.end_seq,
		    seg->xmit_ts, seg->end_seq)) {
			break;
		}

		uint32_t remaining = tcp_rack_detect_segment_lost(tp, seg, reordering_window, loss_detected);
		if (remaining) {
			/*
			 * We only want to arm the timer at max wait time as we are
			 * expecting to get ACKs to do RACK processing. Only in the
			 * worst case, when we don't receive ACKs, we set the timeout
			 * to be the wait time for the most recently sent packet.
			 */
			reordering_timeout = max(remaining, reordering_timeout);
		}
	}
	return reordering_timeout;
}
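
/*
 * Worked example with hypothetical values: with three outstanding
 * un-SACKed segments whose remaining wait times are 0ms, 12ms and
 * 20ms, the first is marked lost immediately and 20ms, the largest
 * remainder, is returned so that a single reordering timeout can
 * re-examine every still-pending segment.
 */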

/*
 * Called during input processing to detect loss.
 * Returns true when loss is detected, in which case
 * tcp_input will enter fast recovery
 */
bool
tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks)
{
	uint32_t reordering_timeout = 0;
	bool loss_detected = false;

	if (!tp->rack.advanced) {
		return false;
	}

	/* Cancel any existing RACK reordering timer as we are going to re-fire it if needed */
	tp->t_timer[TCPT_REORDER] = 0;

	reordering_timeout = tcp_rack_detect_loss(tp, dup_acks, &loss_detected);
	if (reordering_timeout) {
		tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp,
		    reordering_timeout + REORDERING_WINDOW_FLOOR);
		/* Since losses can be marked at a future point, clear the TLP timer */
		tp->t_timer[TCPT_PTO] = 0;
	}

	return loss_detected;
}

/* Reordering timeout has expired, detect loss and enter recovery */
void
tcp_rack_reordering_timeout(struct tcpcb *tp, uint32_t dup_acks)
{
	bool enter_fr = false;

	tcp_rack_detect_loss(tp, dup_acks, &enter_fr);

	if (enter_fr) {
		/* Some packets have been marked as lost */
		if (!IN_FASTRECOVERY(tp)) {
			tcp_rexmt_save_state(tp);
			tcp_enter_fast_recovery(tp);
		}
		tcpstat.tcps_rack_reordering_timeout_recovery_episode++;
		tp->t_rack_reo_timeout_recovery_episode++;
		tcp_output(tp);
	}
}

void
tcp_rack_loss_on_rto(struct tcpcb *tp, bool in_rto)
{
	struct tcp_seg_sent *seg = NULL;
	uint32_t reordering_window = tcp_rack_reordering_window(tp, 0, in_rto);

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		/* Mark the first unacknowledged segment as lost */
		if (seg->start_seq == tp->snd_una) {
			tcp_mark_seg_lost(tp, seg);
		}
		/*
		 * Mark as lost any segment for which the time elapsed since
		 * transmission is at least the sum of the recent RTT and the
		 * reordering window
		 */
		tcp_rack_detect_segment_lost(tp, seg, reordering_window, NULL);
	}
}
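
/*
 * Worked example with hypothetical values: on an RTO with
 * t_reordered_pkts == 0, tcp_rack_reordering_window() returns 0, so
 * besides the first unacknowledged segment (marked lost
 * unconditionally), every segment sent at least RACK.rtt ago is
 * marked lost as well.
 */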

uint32_t
tcp_rack_adjust(struct tcpcb *tp, uint32_t cwin)
{
	uint32_t max_len = 0;
	struct tcp_seg_sent *seg = NULL;

	/*
	 * We traverse the RB tree (instead of the time-ordered list)
	 * as it is faster to look for a seg such that
	 * seg->start_seq <= snd_nxt < seg->end_seq
	 */
	RB_FOREACH(seg, tcp_seg_sent_tree_head, &tp->t_segs_sent_tree) {
		if (max_len >= cwin) {
			break;
		}
		if (seg->flags & TCP_SEGMENT_SACKED) {
			if (SEQ_LT(tp->snd_nxt, seg->end_seq) &&
			    SEQ_GEQ(tp->snd_nxt, seg->start_seq)) {
				tp->snd_nxt = seg->end_seq;
			}
			continue;
		}
		if (SEQ_LT(tp->snd_nxt, seg->end_seq)) {
			max_len += tcp_seg_len(seg);
		}
	}

	return max_len;
}
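
/*
 * Worked example with hypothetical values: if snd_nxt = 100 lands
 * inside a SACKed segment covering [90, 140), snd_nxt is advanced to
 * 140 so that already-delivered bytes are not sent again, while the
 * lengths of un-SACKed segments ending beyond snd_nxt accumulate
 * into max_len until cwin is reached.
 */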

/* This function is only used during retransmissions. */
struct tcp_seg_sent *
tcp_rack_output(struct tcpcb *tp, uint32_t cwin, uint16_t *rack_seg_len)
{
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		if (seg->flags & TCP_SEGMENT_SACKED) {
			continue;
		}
		if (seg->flags & TCP_SEGMENT_LOST && !(seg->flags & TCP_RACK_RETRANSMITTED)) {
			/* We don't do TSO for retransmissions and only send MSS-sized segments */
			uint16_t allowed_size = (uint16_t)min(cwin, tp->t_maxseg);
			/*
			 * When the entire segment can be retransmitted, the
			 * lost segment is moved to the end of the time-ordered
			 * list in tcp_seg_sent_insert.
			 *
			 * When the entire segment can't be retransmitted,
			 * seg->start_seq is advanced by the amount of data
			 * retransmitted, also in tcp_seg_sent_insert
			 */
			*rack_seg_len = tcp_seg_len(seg) <= allowed_size ?
			    (uint16_t)tcp_seg_len(seg) : allowed_size;

			break;
		}
	}

	return seg;
}
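
/*
 * Worked example with hypothetical values: for a 3000-byte lost
 * segment with t_maxseg = 1448 and ample cwin, *rack_seg_len is
 * clamped to 1448. Per the comment above, the remainder stays on the
 * list with seg->start_seq advanced in tcp_seg_sent_insert, so later
 * calls retransmit the rest.
 */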

/*
 * Check if a retransmitted segment was completely covered by the
 * received (first) DSACK block
 */
void
tcp_rack_detect_reordering_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end)
{
	struct tcp_seg_sent *seg = NULL;

	TAILQ_FOREACH(seg, &tp->t_segs_sent, tx_link) {
		if (seg->flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE) {
			if (SEQ_LEQ(start, seg->start_seq) && SEQ_GEQ(end, seg->end_seq)) {
				tp->t_reordered_pkts++;
			}
		}
	}
}
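
/*
 * Worked example with hypothetical values: a DSACK block [4000, 5448)
 * that fully covers a retransmitted segment [4000, 5448) means the
 * receiver got both copies, i.e. the original was delayed rather than
 * lost, so t_reordered_pkts is incremented as evidence of reordering.
 */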

void
tcp_rack_detect_reordering_acked(struct tcpcb *tp, struct tcp_seg_sent *seg)
{
	/*
	 * A never-retransmitted segment below snd_fack was delivered,
	 * which indicates reordering. Ignore segments that have already
	 * been SACKed before
	 */
	if (SEQ_LT(seg->end_seq, tp->snd_fack) &&
	    (seg->flags & (TCP_SEGMENT_SACKED | TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)) == 0) {
		tp->t_reordered_pkts++;
	}
}
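
/*
 * Worked example with hypothetical values: with snd_fack = 9000, an
 * ACK that newly delivers a never-retransmitted, never-SACKed segment
 * ending at 7000 implies that segment arrived after data above it,
 * so t_reordered_pkts is incremented and future reordering windows
 * are kept non-zero.
 */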