/*
* Copyright (c) 2016-2021 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
* uint32_t len, uint32_t sum0);
*
* input :
* src : source starting address
* dst : destination starting address
* len : byte stream length
* sum0 : initial 32-bit sum
*
* output :
* the source byte stream is copied into the destination buffer
* the function returns the partial 16-bit checksum accumulated
* in a 32-bit variable (without 1's complement); caller is
* responsible for folding the 32-bit sum into 16-bit and
* performing the 1's complement if applicable
*/
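/*
 * Illustrative caller-side handling (a sketch only, not part of this file;
 * the helper name in_cksum_fold is hypothetical):
 *
 *	static uint16_t
 *	in_cksum_fold(uint32_t sum)
 *	{
 *		sum = (sum >> 16) + (sum & 0xffff);	// fold carries into low 16 bits
 *		sum = (sum >> 16) + (sum & 0xffff);	// fold again in case of a new carry
 *		return (uint16_t)(~sum & 0xffff);	// final 1's complement
 *	}
 *
 *	uint32_t partial = os_cpu_copy_in_cksum(src, dst, len, 0);
 *	uint16_t cksum = in_cksum_fold(partial);
 */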
/*
* the following definitions default the implementation to little-endian
* architectures
*/
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN
/*
* renaming registers to ease code porting from arm64
*/
#define v0 q0
#define v1 q1
#define v2 q2
#define v3 q3
#define v8 q8
#define v9 q9
#define v10 q10
#define v11 q11
#define v12 q12
#define v13 q13
#define v14 q14
#define v15 q15
.syntax unified
.align 2
.code 16
.thumb_func _os_cpu_copy_in_cksum
.text
.globl _os_cpu_copy_in_cksum
_os_cpu_copy_in_cksum:
#define src r0
#define dst r1
#define len r2
#define sum r3
#define need_swap r4
#define partial r5
#define t r12
push {r4,r5,r7,lr}
add r7, sp, #8 /* set up base pointer for debug tracing */
cmp len, #0
mov partial, #0 /* partial = 0; */
mov need_swap, #0 /* needs_swap = 0; */
cbnz len, 0f
b L_len_0
0:
/*
 * Deal with an odd-addressed leading byte: use partial to hold the temporary
 * sum, depositing this byte into the high byte of its low 16 bits and
 * remembering (via need_swap) to byte-rotate the final sum
 *
 *	partial = 0;
 *	if ((uintptr_t)src & 1) {
 *		partial = *src << 8;
 *		*dst++ = *src++;
 *		--len;
 *		need_swap = 1;
 *	}
 */
tst src, #1
beq 1f
ldrb partial, [src]
add src, src, #1
strb partial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
lsl partial, partial, #8
#endif
subs len, len, #1
mov need_swap, #1
beq L_len_0
1:
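/*
 * in the kernel the NEON registers may hold live thread state, so save
 * the ones used below and restore them before returning to scalar code
 */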
#ifdef KERNEL
vpush {v8-v15}
vpush {v0-v3}
#endif
/*
 * if fewer than 8*16 bytes remain, skip the main loop and try
 * 4*16 bytes next.
 * v0,v1 will hold the temporary result after we exit the L128 loop
 */
veor v0, v0, v0
veor v1, v1, v1
cmp len, #8*16
	vmov	s0, partial		/* move partial into 1st 32-bit lane of v0 */
blt L64_bytes
/*
 * pre-load the first 8*16 bytes; 16-bit pairs are accumulated into the
 * 16 32-bit lanes of v0-v3.
 * branch to the finish-up code if fewer than 2*8*16 bytes remain
 */
vld1.8 {q8,q9}, [src]!
veor v2, v2, v2
vld1.8 {q10,q11}, [src]!
veor v3, v3, v3
vld1.8 {q12,q13}, [src]!
subs len, len, #2*8*16
vld1.8 {q14,q15}, [src]!
blt L128_finishup
/*
 * main loop: copy 8*16 bytes per iteration, folding adjacent 16-bit
 * pairs into the 32-bit lanes of the v0-v3 accumulators while the next
 * 8*16 bytes are loaded
 */
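/*
 * Rough C equivalent of one L128_loop iteration (illustrative only; the
 * real code interleaves the stores of the previous block with the loads
 * of the next one, and the exact lane assignment differs, but the total
 * is the same since addition is commutative):
 *
 *	for (i = 0; i < 64; i++) {
 *		uint16_t w = ((const uint16_t *)src)[i];
 *		((uint16_t *)dst)[i] = w;
 *		acc32[i & 15] += w;		// one of 16 32-bit lanes in v0-v3
 *	}
 *	src += 128; dst += 128; len -= 128;
 */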
L128_loop:
vpadal.u16 v0, v8
vst1.8 {q8,q9}, [dst]!
vpadal.u16 v1, v9
vld1.8 {q8,q9}, [src]!
vpadal.u16 v2, v10
vst1.8 {q10,q11}, [dst]!
vpadal.u16 v3, v11
vld1.8 {q10,q11}, [src]!
vpadal.u16 v0, v12
vst1.8 {q12,q13}, [dst]!
vpadal.u16 v1, v13
vld1.8 {q12,q13}, [src]!
vpadal.u16 v2, v14
vst1.8 {q14,q15}, [dst]!
vpadal.u16 v3, v15
vld1.8 {q14,q15}, [src]!
subs len, len, #8*16
bge L128_loop
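/*
 * drain the final pre-loaded 8*16 bytes (the loop loads one block ahead),
 * give back the extra pre-decrement of len, and merge the four
 * accumulators down to v0,v1
 */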
L128_finishup:
vpadal.u16 v0, v8
vst1.8 {q8,q9}, [dst]!
vpadal.u16 v1, v9
vpadal.u16 v2, v10
vst1.8 {q10,q11}, [dst]!
vpadal.u16 v3, v11
vpadal.u16 v0, v12
vst1.8 {q12,q13}, [dst]!
vpadal.u16 v1, v13
vpadal.u16 v2, v14
vst1.8 {q14,q15}, [dst]!
vpadal.u16 v3, v15
add len, len, #8*16
vadd.i32 v0, v0, v2
vadd.i32 v1, v1, v3
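/*
 * handle any remaining 4*16-, 2*16- and 16-byte chunks with the same
 * copy-and-accumulate scheme, then an 8-byte chunk, before reducing the
 * vector sum and falling back to scalar code for the last 0-7 bytes
 */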
L64_bytes:
cmp len, #4*16
blt L32_bytes
vld1.8 {q8,q9}, [src]!
vld1.8 {q10,q11}, [src]!
vpadal.u16 v0, v8
vst1.8 {q8,q9}, [dst]!
vpadal.u16 v1, v9
vpadal.u16 v0, v10
vst1.8 {q10,q11}, [dst]!
vpadal.u16 v1, v11
sub len, len, #4*16
L32_bytes:
cmp len, #2*16
blt L16_bytes
vld1.8 {q8,q9}, [src]!
vpadal.u16 v0, v8
vst1.8 {q8,q9}, [dst]!
vpadal.u16 v1, v9
sub len, len, #2*16
L16_bytes:
vadd.i32 v0, v0, v1
cmp len, #16
blt L8_bytes
vld1.8 {q8}, [src]!
vpadal.u16 v0, v8
vst1.8 {q8}, [dst]!
sub len, len, #16
L8_bytes:
veor v1, v1, v1
tst len, #8
beq L4_bytes
vld1.8 {d2}, [src]!
vst1.8 {d2}, [dst]!
vpadal.u16 v0, v1
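/*
 * horizontal reduction: the two vpadd steps fold the four 32-bit lanes of
 * q0 (d0,d1) into a single 32-bit sum in s0, which is moved back into
 * partial; ands leaves only the remaining 0-7 bytes in len and sets the
 * flags tested after the register restore below
 */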
L4_bytes:
ands len, len, #7
vpadd.i32 d0, d0, d1
vpadd.i32 d0, d0, d1
vmov partial, s0
#ifdef KERNEL
vpop {q0-q1}
vpop {q2-q3}
vpop {q8-q9}
vpop {q10-q11}
vpop {q12-q13}
vpop {q14-q15}
#endif
beq L_len_0
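/*
 * scalar tail, equivalent to:
 *
 *	while (len >= 2) {
 *		t = *(uint16_t *)src;
 *		*(uint16_t *)dst = t;
 *		src += 2; dst += 2;
 *		partial += t;
 *		len -= 2;
 *	}
 */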
subs len, len, #2
blt L_trailing_bytes
L2_bytes:
ldrh t, [src], #2
strh t, [dst], #2
add partial, partial, t
subs len, len, #2
bge L2_bytes
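/*
 * copy and accumulate the final odd byte, if any (on big-endian it is
 * first shifted into the high byte of the 16-bit word):
 *
 *	if (len & 1) {
 *		t = *src++;
 *		*dst++ = t;
 *		partial += t;
 *	}
 */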
L_trailing_bytes:
tst len, #1
beq L_len_0
ldrb t,[src],#1
strb t,[dst],#1
#if BYTE_ORDER != LITTLE_ENDIAN
lsl t, t, #8
#endif
add partial, partial, t
L_len_0:
/*
* if (needs_swap)
* partial = (partial << 8) + (partial >> 24);
*/
cbz need_swap, 1f
lsl t, partial, #8
add partial, t, partial, lsr #24
1:
movw lr, #0xffff
/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
and r0, sum, lr
add r0, r0, sum, lsr #16
/* final_acc += (partial >> 16) + (partial & 0xffff); */
add r0, r0, partial, lsr #16
and partial, partial, lr
add r0, r0, partial
/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
and t, r0, lr
add r0, t, r0, lsr #16
/*
* One final fold in case of carry from the previous one.
* final_acc = (final_acc >> 16) + (final_acc & 0xffff);
*/
and t, r0, lr
add r0, t, r0, lsr #16
/*
* return (~final_acc & 0xffff);
*
* mvn r0, r0
* and r0, r0, lr
*/
pop {r4,r5,r7,pc}