source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/memchr.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 5.3 KB
Line 
1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2014, ARM Limited
5 * All rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *     * Redistributions of source code must retain the above copyright
10 *       notice, this list of conditions and the following disclaimer.
11 *     * Redistributions in binary form must reproduce the above copyright
12 *       notice, this list of conditions and the following disclaimer in the
13 *       documentation and/or other materials provided with the distribution.
14 *     * Neither the name of the company nor the names of its contributors
15 *       may be used to endorse or promote products derived from this
16 *       software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
32/* See memchr-stub.c  */
33#else
34/* Assumptions:
35 *
36 * ARMv8-a, AArch64
37 * Neon Available.
38 */
39
40/* Arguments and results.  */
41#define srcin           x0
42#define chrin           w1
43#define cntin           x2
44
45#define result          x0
46
47#define src             x3
48#define tmp             x4
49#define wtmp2           w5
50#define synd            x6
51#define soff            x9
52#define cntrem          x10
53
54#define vrepchr         v0
55#define vdata1          v1
56#define vdata2          v2
57#define vhas_chr1       v3
58#define vhas_chr2       v4
59#define vrepmask        v5
60#define vend            v6
61
62/*
63 * Core algorithm:
64 *
65 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
66 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
67 * requested character and bit 1 is not used (faster than using a 32bit
68 * syndrome). Since the bits in the syndrome reflect exactly the order in which
69 * things occur in the original string, counting trailing zeros allows to
70 * identify exactly which byte has matched.
71 */
72
73        .macro def_fn f p2align=0
74        .text
75        .p2align \p2align
76        .global \f
77        .type \f, %function
78\f:
79        .endm
80
81def_fn memchr
82        /* Do not dereference srcin if no bytes to compare.  */
83        cbz     cntin, .Lzero_length
84        /*
85         * Magic constant 0x40100401 allows us to identify which lane matches
86         * the requested byte.
87         */
88        mov     wtmp2, #0x0401
89        movk    wtmp2, #0x4010, lsl #16
90        dup     vrepchr.16b, chrin
91        /* Work with aligned 32-byte chunks */
92        bic     src, srcin, #31
93        dup     vrepmask.4s, wtmp2
94        ands    soff, srcin, #31
95        and     cntrem, cntin, #31
96        b.eq    .Lloop
97
98        /*
99         * Input string is not 32-byte aligned. We calculate the syndrome
100         * value for the aligned 32 bytes block containing the first bytes
101         * and mask the irrelevant part.
102         */
103
104        ld1     {vdata1.16b, vdata2.16b}, [src], #32
105        sub     tmp, soff, #32
106        adds    cntin, cntin, tmp
107        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
108        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
109        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
110        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
111        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b          /* 256->128 */
112        addp    vend.16b, vend.16b, vend.16b                    /* 128->64 */
113        mov     synd, vend.2d[0]
114        /* Clear the soff*2 lower bits */
115        lsl     tmp, soff, #1
116        lsr     synd, synd, tmp
117        lsl     synd, synd, tmp
118        /* The first block can also be the last */
119        b.ls    .Lmasklast
120        /* Have we found something already? */
121        cbnz    synd, .Ltail
122
123.Lloop:
124        ld1     {vdata1.16b, vdata2.16b}, [src], #32
125        subs    cntin, cntin, #32
126        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
127        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
128        /* If we're out of data we finish regardless of the result */
129        b.ls    .Lend
130        /* Use a fast check for the termination condition */
131        orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
132        addp    vend.2d, vend.2d, vend.2d
133        mov     synd, vend.2d[0]
134        /* We're not out of data, loop if we haven't found the character */
135        cbz     synd, .Lloop
136
137.Lend:
138        /* Termination condition found, let's calculate the syndrome value */
139        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
140        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
141        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b          /* 256->128 */
142        addp    vend.16b, vend.16b, vend.16b                    /* 128->64 */
143        mov     synd, vend.2d[0]
144        /* Only do the clear for the last possible block */
145        b.hi    .Ltail
146
147.Lmasklast:
148        /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
149        add     tmp, cntrem, soff
150        and     tmp, tmp, #31
151        sub     tmp, tmp, #32
152        neg     tmp, tmp, lsl #1
153        lsl     synd, synd, tmp
154        lsr     synd, synd, tmp
155
156.Ltail:
157        /* Count the trailing zeros using bit reversing */
158        rbit    synd, synd
159        /* Compensate the last post-increment */
160        sub     src, src, #32
161        /* Check that we have found a character */
162        cmp     synd, #0
163        /* And count the leading zeros */
164        clz     synd, synd
165        /* Compute the potential result */
166        add     result, src, synd, lsr #1
167        /* Select result or NULL */
168        csel    result, xzr, result, eq
169        ret
170
171.Lzero_length:
172        mov     result, #0
173        ret
174
175        .size   memchr, . - memchr
176#endif
Note: See TracBrowser for help on using the repository browser.