/*
* Copyright (c) 2016-2021 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
* uint32_t len, uint32_t sum0);
*
* input :
* src : source starting address
* dst : destination starting address
* len : byte stream length
* sum0 : initial 32-bit sum
*
* output :
* the source byte stream is copied into the destination buffer
* the function returns the partial 16-bit checksum accumulated
* in a 32-bit variable (without 1's complement); caller is
* responsible for folding the 32-bit sum into 16 bits and
* performing the 1's complement if applicable
*/
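/*
 * For illustration only (not part of this file's interface): a caller
 * would typically finish the checksum along these lines, where the
 * variable names are made up for the example:
 *
 * uint32_t sum = os_cpu_copy_in_cksum(src, dst, len, 0);
 * sum = (sum >> 16) + (sum & 0xffff); // fold 32-bit sum into 16 bits
 * sum = (sum >> 16) + (sum & 0xffff); // absorb any remaining carry
 * uint16_t cksum = (uint16_t)~sum;    // 1's complement, if applicable
 */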
/*
* The following definitions default the implementation to little-endian
* architectures.
*/
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN
/*
* ARM64 kernel mode -- just like user mode -- no longer requires saving
* the vector registers, since it's done by the exception handler code.
*/
#define SAVE_REGISTERS 0
.globl _os_cpu_copy_in_cksum
.text
.align 4
_os_cpu_copy_in_cksum:
#define src x0
#define dst x1
#define len x2
#define sum x3
#define need_swap x5
#define t x6
#define partial x7
#define wpartial w7
mov partial, #0 // partial = 0;
mov need_swap, #0 // needs_swap = 0;
cbz len, L_len_0
/*
* Deal with an odd-addressed leading byte: use w7 to store the temporary
* sum and deposit this byte into the high byte of the 16-bit sum in w7
*
* t = 0;
* if ((uintptr_t)src & 1) {
* t = *src << 8;
* *dst++ = *src++;
* --len;
* }
*/
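/*
 * Note on need_swap: once this leading byte has been pulled off, the rest
 * of the stream is summed starting one byte later, so the two byte lanes
 * of its 16-bit sum end up exchanged relative to the caller's byte
 * positions.  The leading byte is therefore deposited in the high lane
 * here, and the folded sum is byte-rotated once near the end of the
 * routine:
 *
 * if (needs_swap)
 *     partial = (partial << 8) + (partial >> 24);
 *
 * which puts every byte, including this one, back into its proper lane.
 */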
tst src, #1
b.eq 1f
ldrb wpartial, [src]
add src, src, #1
strb wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
lsl partial, partial, #8
#endif
sub len, len, #1
mov need_swap, #1
cbz len, L_len_0
1:
#if SAVE_REGISTERS
/*
* we will always use v0-v3, and v4-v7/v16-v19 if len>=128
* so allocate 12*16 bytes on the stack, and store v0-v3 now,
* keep x11 as the pointer
*/
sub sp, sp, #12*16
mov x11, sp
st1.4s {v0, v1, v2, v3}, [x11], #4*16
#endif
/*
* compare len against 8*16; if there are fewer than 8*16 bytes, try
* 4*16 bytes next.
* v0,v1 will store the temporary result after we exit the L128 loop
*/
eor.16b v0, v0, v0
eor.16b v1, v1, v1
cmp len, #8*16
mov v0.d[0], partial // move partial to 1st 64b lane in v0
b.lt L64_bytes
#if SAVE_REGISTERS
/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
st1.4s {v4, v5, v6, v7}, [x11], #4*16
st1.4s {v16, v17, v18, v19}, [x11], #4*16
#endif
/*
* accumulate pairs of 32-bit words into the 8 64-bit lanes of v0-v3;
* load the first 128 bytes into q4-q7/q16-q19, and clear v2-v3
* (v0,v1 were cleared above)
*/
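/*
 * In scalar terms the NEON accumulation below is just a wide sum of the
 * data taken as 32-bit little-endian words: uadalp.2d pairwise-adds the
 * four 32-bit lanes of each source vector into two 64-bit accumulator
 * lanes, so no carry can be lost before the final folds.  A rough scalar
 * model (illustrative only, for a whole number of 32-bit words):
 *
 * uint64_t acc = 0;
 * for (i = 0; i < len / 4; i++)
 *     acc += ((const uint32_t *)src)[i];
 * // acc is later folded 64->32->16 bits, as done after L0_bytes
 */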
ldr q4, [src], #8*16
eor.16b v2, v2, v2
ldr q5, [src, #-7*16]
eor.16b v3, v3, v3
ldr q6, [src, #-6*16]
ldr q7, [src, #-5*16]
ldr q16, [src, #-4*16]
ldr q17, [src, #-3*16]
ldr q18, [src, #-2*16]
ldr q19, [src, #-1*16]
/* branch to finish off if len<128 */
subs len, len, #2*8*16
b.lt L128_finishup
/*
* loop for loading and accumulating 32 32-bit words (16 pairs) into
* 8 8-byte accumulators per iteration
*/
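/*
 * Each iteration below copies 128 bytes while folding the 128 bytes
 * loaded on the previous iteration into the accumulators; roughly
 * (illustrative only):
 *
 * store q4-q7/q16-q19 to dst, dst += 128;
 * uadalp each of them into v0-v3;
 * load the next 128 bytes from src into q4-q7/q16-q19, src += 128;
 * len -= 128;
 *
 * with loads and stores interleaved to hide memory latency.
 */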
L128_loop:
str q4, [dst], #16*8
uadalp.2d v0, v4
str q5, [dst, #-7*16]
uadalp.2d v1, v5
ldr q4, [src], #16*8
ldr q5, [src, #-7*16]
str q6, [dst, #-6*16]
uadalp.2d v2, v6
str q7, [dst, #-5*16]
uadalp.2d v3, v7
ldr q6, [src, #-6*16]
ldr q7, [src, #-5*16]
str q16, [dst, #-4*16]
uadalp.2d v0, v16
str q17, [dst, #-3*16]
uadalp.2d v1, v17
ldr q16, [src, #-4*16]
ldr q17, [src, #-3*16]
str q18, [dst, #-2*16]
uadalp.2d v2, v18
str q19, [dst, #-1*16]
uadalp.2d v3, v19
ldr q18, [src, #-2*16]
ldr q19, [src, #-1*16]
subs len, len, #8*16
b.ge L128_loop
L128_finishup:
str q4, [dst], #16*8
uadalp.2d v0, v4
str q5, [dst, #-7*16]
uadalp.2d v1, v5
str q6, [dst, #-6*16]
uadalp.2d v2, v6
str q7, [dst, #-5*16]
uadalp.2d v3, v7
str q16, [dst, #-4*16]
uadalp.2d v0, v16
str q17, [dst, #-3*16]
uadalp.2d v1, v17
str q18, [dst, #-2*16]
uadalp.2d v2, v18
str q19, [dst, #-1*16]
uadalp.2d v3, v19
add len, len, #8*16
add.2d v0, v0, v2
add.2d v1, v1, v3
#if SAVE_REGISTERS
/* restore v4-v7/v16-v19 as they won't be used any more */
add x11, sp, #4*16
ld1.4s {v4, v5, v6, v7}, [x11], #4*16
ld1.4s {v16, v17, v18, v19}, [x11], #4*16
#endif
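/*
 * Fewer than 8*16 bytes remain at this point; they are handled with
 * progressively smaller vector chunks, roughly (illustrative only):
 *
 * if (len >= 64) { copy + accumulate 64 bytes; len -= 64; }
 * if (len >= 32) { copy + accumulate 32 bytes; len -= 32; }
 * if (len >= 16) { copy + accumulate 16 bytes; len -= 16; }
 * // then the sub-16-byte tail below
 */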
L64_bytes:
cmp len, #4*16
b.lt L32_bytes
ldr q2, [src], #4*16
ldr q3, [src, #-3*16]
str q2, [dst], #4*16
uadalp.2d v0, v2
str q3, [dst, #-3*16]
uadalp.2d v1, v3
ldr q2, [src, #-2*16]
ldr q3, [src, #-1*16]
str q2, [dst, #-2*16]
uadalp.2d v0, v2
str q3, [dst, #-1*16]
uadalp.2d v1, v3
sub len, len, #4*16
L32_bytes:
cmp len, #2*16
b.lt L16_bytes
ldr q2, [src], #2*16
ldr q3, [src, #-1*16]
str q2, [dst], #2*16
uadalp.2d v0, v2
str q3, [dst, #-1*16]
uadalp.2d v1, v3
sub len, len, #2*16
L16_bytes:
add.2d v0, v0, v1
cmp len, #16
b.lt L8_bytes
ldr q2, [src], #16
str q2, [dst], #16
uadalp.2d v0, v2
sub len, len, #16
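/*
 * At most 15 bytes remain past this point.  They are copied and summed
 * as an optional 8-, 4-, 2- and 1-byte chunk, each loaded into a zeroed
 * scratch vector (v1, v2, v3) and folded into v0 with uadalp; roughly
 * (illustrative only):
 *
 * if (len & 8) { copy 8 bytes; acc += both 32-bit words of the chunk; }
 * if (len & 4) { copy 4 bytes; acc += the 32-bit word; }
 * if (len & 2) { copy 2 bytes; acc += the 16-bit half-word; }
 * if (len & 1) { copy 1 byte;  acc += the byte (low lane on LE); }
 */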
L8_bytes:
eor.16b v1, v1, v1
eor.16b v2, v2, v2
eor.16b v3, v3, v3
tst len, #8
b.eq L4_bytes
ldr d1,[src],#8
str d1,[dst],#8
L4_bytes:
tst len, #4
b.eq L2_bytes
ldr s2,[src],#4
str s2,[dst],#4
L2_bytes:
uadalp.2d v0, v1
eor.16b v1, v1, v1
tst len, #2
b.eq L_trailing_bytes
ldr h3,[src],#2
str h3,[dst],#2
L_trailing_bytes:
tst len, #1
b.eq L0_bytes
ldr b1,[src],#1
str b1,[dst],#1
#if BYTE_ORDER != LITTLE_ENDIAN
shl.4h v1, v1, #8 // partial <<= 8;
#endif
L0_bytes:
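/*
 * fold the remaining scratch accumulators (v1-v3) into v0, then reduce
 * the two 64-bit lanes of v0 into a single scalar sum in partial
 */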
uadalp.2d v2, v3
uadalp.2d v0, v1
uadalp.2d v0, v2
addp.2d d0, v0
fmov partial, d0
#if SAVE_REGISTERS
/* restore v0-v3 and deallocate stack space */
ld1.4s {v0, v1, v2, v3}, [sp]
add sp, sp, #12*16
#endif
/* partial = (partial >> 32) + (partial & 0xffffffff); */
and t, partial, #0xffffffff
add partial, t, partial, lsr #32
/* partial = (partial >> 16) + (partial & 0xffff); */
and t, partial, #0xffff
add partial, t, partial, lsr #16
L_len_0:
/*
* if (needs_swap)
* partial = (partial << 8) + (partial >> 24);
*/
cbz need_swap, 1f
lsl t, partial, #8
add partial, t, partial, lsr #24
1:
/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
and x0, sum, #0xffff
add x0, x0, sum, lsr #16
/* final_acc += (partial >> 16) + (partial & 0xffff); */
add x0, x0, partial, lsr #16
and partial, partial, #0xffff
add x0, x0, partial
/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
and t, x0, #0xffff
add x0, t, x0, lsr #16
/*
* One final fold in case of carry from the previous one.
* final_acc = (final_acc >> 16) + (final_acc & 0xffff);
*/
and t, x0, #0xffff
add x0, t, x0, lsr #16
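/*
 * (e.g. a first-fold result of 0x10000 becomes 0x0001 here; values at or
 * below 0xffff pass through unchanged)
 */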
/*
 * The 1's complement is intentionally left to the caller (see the header
 * comment above); if it were performed here, it would be:
 *
 * return (~final_acc & 0xffff);
 *
 * mvn w0, w0
 * and w0, w0, #0xffff
 */
ret lr