/*
* Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*
* This file implements the following function for the arm64 architecture:
*
* int strncmp(const char *s1, const char *s2, size_t n);
*
* Returns 0 if the two strings are equal up to the first n bytes or to the
* end of the string, whichever comes first. Otherwise, returns the difference
* of the first mismatched characters interpreted as uint8_t.
*/
#include <arm64/asm.h>
.globl _strncmp
/*****************************************************************************
* Macros *
*****************************************************************************/
.macro EstablishFrame
ARM64_STACK_PROLOG
stp fp, lr, [sp, #-16]!
mov fp, sp
.endm
.macro ClearFrameAndReturn
ldp fp, lr, [sp], #16
ARM64_STACK_EPILOG
.endm
#include "../mach/arm/vm_param.h"
#define kVectorSize 16
/*****************************************************************************
* Constants *
*****************************************************************************/
.text
.align 5
L_mask:
.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
/*****************************************************************************
* Entrypoints *
*****************************************************************************/
_strncmp:
EstablishFrame
eor x3, x3, x3
cbz x2, L_scalarDone
// Compare one byte at a time until s1 has vector alignment.
0: tst x0, #(kVectorSize-1)
b.eq L_s1aligned
ldrb w4, [x0],#1 // load byte from src1
ldrb w5, [x1],#1 // load byte from src2
subs x3, x4, x5 // if the are not equal
ccmp w4, #0, #4, eq // or we find an EOS
b.eq L_scalarDone // return the difference
subs x2, x2, #1 // decrement length
b.ne 0b // continue loop if non-zero
// We found a mismatch or EOS before s1 became aligned. Simply return the
// difference between the last bytes that we loaded.
L_scalarDone:
mov x0, x3
ClearFrameAndReturn
L_s1aligned:
// If s2 is similarly aligned to s1, then we can use a naive vector comparison
// from this point on without worrying about spurious page faults; none of our
// loads will ever cross a page boundary, because they are all aligned.
tst x1, #(kVectorSize-1)
b.eq L_naiveVector
/*****************************************************************************
* Careful chunk comparison *
*****************************************************************************/
// Otherwise, we need to be careful; although vector loads from s1 cannot
// cross a page boundary because they are aligned, s2 is not aligned. We
// compute the multiple of vector size that we can safely load before reaching
// a page boundary, and compare only that far before switching over to scalar
// comparisons to step across the page boundary. If this number happens to
// be zero, we jump directly to the scalar comparison.
neg x7, x1
ands x7, x7, #(PAGE_MIN_SIZE-kVectorSize)
b.eq 2f
.align 4
// If n is less than the number of bytes before a page-crossing load, jump
// into the naive vector path instead, since we will not even reach a page
// crossing. Otherwise, decrement n by that number before we monkey with it,
// and set the decremented value aside.
0: cmp x2, x7
b.ls L_naiveVector
sub x6, x2, x7
// Use vector comparisons until a mismatch or EOS is encountered, or the next
// vector load from s2 would be page-crossing.
1: ldr q0, [x0],#(kVectorSize)
ldr q1, [x1],#(kVectorSize)
cmeq.16b v1, v0, v1
and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS
uminv.16b b1, v0
fmov w3, s1 // zero only iff comparison is finished
cbz w3, L_vectorDone
subs x7, x7, #(kVectorSize)
b.ne 1b
// Restore the updated n to x2
mov x2, x6
// The next vector load will cross a page boundary. Instead, compare one byte
// at a time until s1 again has vector alignment, at which point we will have
// compared exactly 16 bytes.
2: ldrb w4, [x0],#1 // load byte from src1
ldrb w5, [x1],#1 // load byte from src2
subs x3, x4, x5 // if the are not equal
ccmp w4, #0, #4, eq // or we find an EOS
b.eq L_scalarDone // return the difference
subs x2, x2, #1 // decrement length
b.eq L_scalarDone // exit loop if zero.
tst x0, #(kVectorSize-1)
b.ne 2b
// Having compared one vector's worth of bytes using a scalar comparison, we
// know that we are safely across the page boundary. Initialize x7 and jump
// back into the vector comparison part of the loop.
mov x7, #(PAGE_MIN_SIZE-kVectorSize)
b 0b
/*****************************************************************************
* Naive vector comparison *
*****************************************************************************/
L_naiveVector:
subs x3, x2, #(kVectorSize)
b.lo L_scalar
add x4, x0, x3 // save the addresses of the last vectors
add x5, x1, x3
mov x2, x3 // length -= kVectorSize
.align 4
0:
ldr q0, [x0],#(kVectorSize)
ldr q1, [x1],#(kVectorSize)
cmeq.16b v1, v0, v1
and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS
uminv.16b b1, v0
fmov w3, s1 // zero only iff comparison is finished
cbz w3, L_vectorDone
subs x2, x2, #(kVectorSize)
b.hi 0b
// compare the last vector
mov x0, x4
mov x1, x5
ldr q0, [x0],#(kVectorSize)
ldr q1, [x1],#(kVectorSize)
cmeq.16b v1, v0, v1
and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS
uminv.16b b1, v0
fmov w3, s1 // zero only iff comparison is finished
cbz w3, L_vectorDone
L_readNBytes:
eor x0, x0, x0
ClearFrameAndReturn
L_vectorDone:
// Load the bytes corresponding to the first mismatch or EOS and return
// their difference.
eor.16b v1, v1, v1
cmhi.16b v0, v0, v1 // force non-zero lanes to 0xff
ldr q1, L_mask
orr.16b v0, v0, v1 // lane index in lanes containing mismatch or EOS
uminv.16b b1, v0
fmov w3, s1
sub x3, x3, #(kVectorSize)
ldrb w4, [x0, x3]
ldrb w5, [x1, x3]
sub x0, x4, x5
ClearFrameAndReturn
L_scalar:
ldrb w4, [x0],#1 // load byte from src1
ldrb w5, [x1],#1 // load byte from src2
subs x3, x4, x5 // if the are not equal
ccmp w4, #0, #4, eq // or we find an EOS
b.eq 1f // return the difference
subs x2, x2, #1 // decrement length
b.ne L_scalar // continue loop if non-zero
1:
mov x0, x3
ClearFrameAndReturn