source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/strlen.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 7.6 KB
Line 
1/* Copyright (c) 2013-2015, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
28/* See strlen-stub.c  */
29#else
30
31/* Assumptions:
32 *
33 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
34 */
35
36/* To test the page crossing code path more thoroughly, compile with
37   -DTEST_PAGE_CROSS - this will force all calls through the slower
38   entry path.  This option is not intended for production use.  */
39
40/* Arguments and results.  */
41#define srcin           x0
42#define len             x0
43
44/* Locals and temporaries.  */
45#define src             x1
46#define data1           x2
47#define data2           x3
48#define has_nul1        x4
49#define has_nul2        x5
50#define tmp1            x4
51#define tmp2            x5
52#define tmp3            x6
53#define tmp4            x7
54#define zeroones        x8
55
56#define L(l) .L ## l
57
58        .macro def_fn f p2align=0
59        .text
60        .p2align \p2align
61        .global \f
62        .type \f, %function
63\f:
64        .endm
65
66        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
67           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
68           can be done in parallel across the entire word. A faster check
69           (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
70           false hits for characters 129..255.  */
71
72#define REP8_01 0x0101010101010101
73#define REP8_7f 0x7f7f7f7f7f7f7f7f
74#define REP8_80 0x8080808080808080
75
76#ifdef TEST_PAGE_CROSS
77# define MIN_PAGE_SIZE 15
78#else
79# define MIN_PAGE_SIZE 4096
80#endif
81
82        /* Since strings are short on average, we check the first 16 bytes
83           of the string for a NUL character.  In order to do an unaligned ldp
84           safely we have to do a page cross check first.  If there is a NUL
85           byte we calculate the length from the 2 8-byte words using
86           conditional select to reduce branch mispredictions (it is unlikely
87           strlen will be repeatedly called on strings with the same length).
88
89           If the string is longer than 16 bytes, we align src so don't need
90           further page cross checks, and process 32 bytes per iteration
91           using the fast NUL check.  If we encounter non-ASCII characters,
92           fallback to a second loop using the full NUL check.
93
94           If the page cross check fails, we read 16 bytes from an aligned
95           address, remove any characters before the string, and continue
96           in the main loop using aligned loads.  Since strings crossing a
97           page in the first 16 bytes are rare (probability of
98           16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
99
100           AArch64 systems have a minimum page size of 4k.  We don't bother
101           checking for larger page sizes - the cost of setting up the correct
102           page size is just not worth the extra gain from a small reduction in
103           the cases taking the slow path.  Note that we only care about
104           whether the first fetch, which may be misaligned, crosses a page
105           boundary.  */
106
107def_fn strlen p2align=6
108        and     tmp1, srcin, MIN_PAGE_SIZE - 1
109        mov     zeroones, REP8_01
110        cmp     tmp1, MIN_PAGE_SIZE - 16
111        b.gt    L(page_cross)
112        ldp     data1, data2, [srcin]
113#ifdef __AARCH64EB__
114        /* For big-endian, carry propagation (if the final byte in the
115           string is 0x01) means we cannot use has_nul1/2 directly.
116           Since we expect strings to be small and early-exit,
117           byte-swap the data now so has_null1/2 will be correct.  */
118        rev     data1, data1
119        rev     data2, data2
120#endif
121        sub     tmp1, data1, zeroones
122        orr     tmp2, data1, REP8_7f
123        sub     tmp3, data2, zeroones
124        orr     tmp4, data2, REP8_7f
125        bics    has_nul1, tmp1, tmp2
126        bic     has_nul2, tmp3, tmp4
127        ccmp    has_nul2, 0, 0, eq
128        beq     L(main_loop_entry)
129
130        /* Enter with C = has_nul1 == 0.  */
131        csel    has_nul1, has_nul1, has_nul2, cc
132        mov     len, 8
133        rev     has_nul1, has_nul1
134        clz     tmp1, has_nul1
135        csel    len, xzr, len, cc
136        add     len, len, tmp1, lsr 3
137        ret
138
139        /* The inner loop processes 32 bytes per iteration and uses the fast
140           NUL check.  If we encounter non-ASCII characters, use a second
141           loop with the accurate NUL check.  */
142        .p2align 4
143L(main_loop_entry):
144        bic     src, srcin, 15
145        sub     src, src, 16
146L(main_loop):
147        ldp     data1, data2, [src, 32]!
148.Lpage_cross_entry:
149        sub     tmp1, data1, zeroones
150        sub     tmp3, data2, zeroones
151        orr     tmp2, tmp1, tmp3
152        tst     tmp2, zeroones, lsl 7
153        bne     1f
154        ldp     data1, data2, [src, 16]
155        sub     tmp1, data1, zeroones
156        sub     tmp3, data2, zeroones
157        orr     tmp2, tmp1, tmp3
158        tst     tmp2, zeroones, lsl 7
159        beq     L(main_loop)
160        add     src, src, 16
1611:
162        /* The fast check failed, so do the slower, accurate NUL check.  */
163        orr     tmp2, data1, REP8_7f
164        orr     tmp4, data2, REP8_7f
165        bics    has_nul1, tmp1, tmp2
166        bic     has_nul2, tmp3, tmp4
167        ccmp    has_nul2, 0, 0, eq
168        beq     L(nonascii_loop)
169
170        /* Enter with C = has_nul1 == 0.  */
171L(tail):
172#ifdef __AARCH64EB__
173        /* For big-endian, carry propagation (if the final byte in the
174           string is 0x01) means we cannot use has_nul1/2 directly.  The
175           easiest way to get the correct byte is to byte-swap the data
176           and calculate the syndrome a second time.  */
177        csel    data1, data1, data2, cc
178        rev     data1, data1
179        sub     tmp1, data1, zeroones
180        orr     tmp2, data1, REP8_7f
181        bic     has_nul1, tmp1, tmp2
182#else
183        csel    has_nul1, has_nul1, has_nul2, cc
184#endif
185        sub     len, src, srcin
186        rev     has_nul1, has_nul1
187        add     tmp2, len, 8
188        clz     tmp1, has_nul1
189        csel    len, len, tmp2, cc
190        add     len, len, tmp1, lsr 3
191        ret
192
193L(nonascii_loop):
194        ldp     data1, data2, [src, 16]!
195        sub     tmp1, data1, zeroones
196        orr     tmp2, data1, REP8_7f
197        sub     tmp3, data2, zeroones
198        orr     tmp4, data2, REP8_7f
199        bics    has_nul1, tmp1, tmp2
200        bic     has_nul2, tmp3, tmp4
201        ccmp    has_nul2, 0, 0, eq
202        bne     L(tail)
203        ldp     data1, data2, [src, 16]!
204        sub     tmp1, data1, zeroones
205        orr     tmp2, data1, REP8_7f
206        sub     tmp3, data2, zeroones
207        orr     tmp4, data2, REP8_7f
208        bics    has_nul1, tmp1, tmp2
209        bic     has_nul2, tmp3, tmp4
210        ccmp    has_nul2, 0, 0, eq
211        beq     L(nonascii_loop)
212        b       L(tail)
213
214        /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
215           srcin to 0x7f, so we ignore any NUL bytes before the string.
216           Then continue in the aligned loop.  */
217L(page_cross):
218        bic     src, srcin, 15
219        ldp     data1, data2, [src]
220        lsl     tmp1, srcin, 3
221        mov     tmp4, -1
222#ifdef __AARCH64EB__
223        /* Big-endian.  Early bytes are at MSB.  */
224        lsr     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
225#else
226        /* Little-endian.  Early bytes are at LSB.  */
227        lsl     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
228#endif
229        orr     tmp1, tmp1, REP8_80
230        orn     data1, data1, tmp1
231        orn     tmp2, data2, tmp1
232        tst     srcin, 8
233        csel    data1, data1, tmp4, eq
234        csel    data2, data2, tmp2, eq
235        b       L(page_cross_entry)
236
237        .size   strlen, . - strlen
238#endif
Note: See TracBrowser for help on using the repository browser.