source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/strnlen.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 5.8 KB
Line 
1/* strnlen - calculate the length of a string with limit.
2
3   Copyright (c) 2013, Linaro Limited
4   All rights reserved.
5
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of the Linaro nor the
14         names of its contributors may be used to endorse or promote products
15         derived from this software without specific prior written permission.
16
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
28
29#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
30/* See strnlen-stub.c  */
31#else
32
33/* Assumptions:
34 *
35 * ARMv8-a, AArch64
36 */
37
38/* Arguments and results.  */
39#define srcin           x0      /* In: pointer to the string.  */
40#define len             x0      /* Out: result; same register as srcin.  */
41#define limit           x1      /* In: maximum length to report.  */
42
43/* Locals and temporaries.  */
44#define src             x2      /* 16-byte-aligned read cursor.  */
45#define data1           x3      /* First Dword of the current Qword.  */
46#define data2           x4      /* Second Dword of the current Qword.  */
47#define data2a          x5      /* data2 with leading bytes forced to 0xff.  */
48#define has_nul1        x6      /* NUL syndrome for data1.  */
49#define has_nul2        x7      /* NUL syndrome for data2.  */
50#define tmp1            x8      /* tmp1-tmp4: scratch.  */
51#define tmp2            x9
52#define tmp3            x10
53#define tmp4            x11
54#define zeroones        x12     /* Holds REP8_01.  */
55#define pos             x13     /* Bit index of the first NUL (from clz).  */
56#define limit_wd        x14     /* Qwords still allowed by limit, minus 1.  */
57
58        .macro def_fn f p2align=0       /* Open global function f in .text.  */
59        .text
60        .p2align \p2align               /* Align to 2^p2align bytes; default 0 adds no padding.  */
61        .global \f
62        .type \f, %function             /* Mark the symbol as a function.  */
63\f:
64        .endm
65
66#define REP8_01 0x0101010101010101      /* 0x01 in every byte.  */
67#define REP8_7f 0x7f7f7f7f7f7f7f7f      /* 0x7f in every byte.  */
68#define REP8_80 0x8080808080808080      /* 0x80 in every byte; not referenced in the code visible here.  */
69
70        .text
71        .p2align        6               /* 2^6 = 64-byte alignment.  */
72.Lstart:
73        /* Pre-pad to ensure critical loop begins an icache line.  */
          /* Layout: 7 nops here + 2 instructions at .Lhit_limit + the 7
             instructions at the strnlen entry before .Lloop make 16
             instructions = 64 bytes at 4 bytes each (assumes the REP8_01
             mov assembles to a single instruction -- TODO confirm with a
             disassembly).  Adding or removing an instruction anywhere
             before .Lloop moves the loop off the 64-byte boundary.  */
74        .rep 7
75        nop
76        .endr
77        /* Put this code here to avoid wasting more space with pre-padding.  */
78.Lhit_limit:
79        mov     len, limit              /* No NUL within limit: return limit.  */
80        ret
81
/* strnlen: return MIN (offset of the first NUL byte at srcin, limit).
   In:   srcin (x0) = string pointer, limit (x1) = maximum length.
   Out:  len (x0).
   Uses x0-x14 and the condition flags only; makes no calls and does
   not use the stack.
   Memory is read only as whole 16-byte-aligned Qwords, so bytes of
   the first and last Qword that lie outside the string are loaded
   and then masked off or ignored.  */
82def_fn strnlen
83        cbz     limit, .Lhit_limit      /* limit == 0: return 0.  */
84        mov     zeroones, #REP8_01
85        bic     src, srcin, #15         /* src = srcin rounded down to 16.  */
86        ands    tmp1, srcin, #15        /* tmp1 = srcin % 16.  */
87        b.ne    .Lmisaligned
88        /* Calculate the number of full and partial words -1.  */
89        sub     limit_wd, limit, #1     /* Limit != 0, so no underflow.  */
90        lsr     limit_wd, limit_wd, #4  /* Convert to Qwords.  */
91
92        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
93           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
94           can be done in parallel across the entire word.  */
95        /* The inner loop deals with two Dwords at a time.  This has a
96           slightly higher start-up cost, but we should win quite quickly,
97           especially on cores with a high number of issue slots per
98           cycle, as we get much better parallelism out of the operations.  */
99
100        /* Start of critical section -- keep to one 64Byte cache line.  */
101.Lloop:
102        ldp     data1, data2, [src], #16        /* Load one Qword; src += 16.  */
103.Lrealigned:
104        sub     tmp1, data1, zeroones
105        orr     tmp2, data1, #REP8_7f
106        sub     tmp3, data2, zeroones
107        orr     tmp4, data2, #REP8_7f
108        bic     has_nul1, tmp1, tmp2
109        bic     has_nul2, tmp3, tmp4
110        subs    limit_wd, limit_wd, #1  /* Goes negative (mi) once limit is used up.  */
111        orr     tmp1, has_nul1, has_nul2
112        ccmp    tmp1, #0, #0, pl        /* pl: flags = cmp tmp1, #0; else NZCV = 0000 (ne).  */
113        b.eq    .Lloop                  /* Loop while Qwords remain and no NUL seen.  */
114        /* End of critical section -- keep to one 64Byte cache line.  */
115
           /* NOTE(review): tmp1 already holds has_nul1 | has_nul2 from the
              last loop iteration, so the orr below looks redundant.  */
116        orr     tmp1, has_nul1, has_nul2
117        cbz     tmp1, .Lhit_limit       /* No null in final Qword.  */
118
119        /* We know there's a null in the final Qword.  The easiest thing
120           to do now is work out the length of the string and return
121           MIN (len, limit).  */
122
123        sub     len, src, srcin         /* src is past the final Qword; overwrites srcin (same register).  */
124        cbz     has_nul1, .Lnul_in_data2
125#ifdef __AARCH64EB__
126        mov     data2, data1
127#endif
128        sub     len, len, #8            /* NUL is in data1, the first 8 bytes.  */
129        mov     has_nul2, has_nul1
130.Lnul_in_data2:
131#ifdef __AARCH64EB__
132        /* For big-endian, carry propagation (if the final byte in the
133           string is 0x01) means we cannot use has_nul directly.  The
134           easiest way to get the correct byte is to byte-swap the data
135           and calculate the syndrome a second time.  */
136        rev     data2, data2
137        sub     tmp1, data2, zeroones
138        orr     tmp2, data2, #REP8_7f
139        bic     has_nul2, tmp1, tmp2
140#endif
141        sub     len, len, #8            /* len = offset of the Dword holding the NUL.  */
142        rev     has_nul2, has_nul2      /* Put the earliest string byte at the MSB.  */
143        clz     pos, has_nul2           /* Bit index of the first NUL marker.  */
144        add     len, len, pos, lsr #3           /* Bits to bytes.  */
145        cmp     len, limit
146        csel    len, len, limit, ls             /* Return the lower value.  */
147        ret
148
149.Lmisaligned:
150        /* Deal with a partial first word.
151           We're doing two things in parallel here;
152           1) Calculate the number of words (but avoiding overflow if
153              limit is near ULONG_MAX) - to do this we need to work out
154              limit + tmp1 - 1 as a 65-bit value before shifting it;
155           2) Load and mask the initial data words - we force the bytes
156              before the ones we are interested in to 0xff - this ensures
157              early bytes will not hit any zero detection.  */
158        sub     limit_wd, limit, #1
159        neg     tmp4, tmp1
160        cmp     tmp1, #8                /* Flags are consumed by csinv/csel below; nothing in between sets them.  */
161
162        and     tmp3, limit_wd, #15     /* (limit - 1) % 16.  */
163        lsr     limit_wd, limit_wd, #4  /* (limit - 1) / 16.  */
164        mov     tmp2, #~0               /* All-ones seed for the byte mask.  */
165
166        ldp     data1, data2, [src], #16
167        lsl     tmp4, tmp4, #3          /* Bytes beyond alignment -> bits.  */
168        add     tmp3, tmp3, tmp1        /* Low parts of limit - 1 + tmp1; at most 30.  */
169
170#ifdef __AARCH64EB__
171        /* Big-endian.  Early bytes are at MSB.  */
172        lsl     tmp2, tmp2, tmp4        /* Shift by (tmp4 & 63).  */
173#else
174        /* Little-endian.  Early bytes are at LSB.  */
175        lsr     tmp2, tmp2, tmp4        /* Shift by (tmp4 & 63).  */
176#endif
177        add     limit_wd, limit_wd, tmp3, lsr #4        /* Add the carry out of the low 4 bits.  */
178
179        orr     data1, data1, tmp2
180        orr     data2a, data2, tmp2
181
182        csinv   data1, data1, xzr, le   /* tmp1 > 8: data1 is all padding, force to ~0.  */
183        csel    data2, data2, data2a, le        /* tmp1 > 8: use the masked data2.  */
184        b       .Lrealigned             /* Join the loop after its ldp.  */
185        .size   strnlen, . - .Lstart    /* Include pre-padding in size.  */
186
187#endif
Note: See TracBrowser for help on using the repository browser.