source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/strncmp.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 6.9 KB
Line 
1/* Copyright (c) 2013, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
28/* See strcmp-stub.c  */
29#else
30
31/* Assumptions:
32 *
33 * ARMv8-a, AArch64
34 */
35
/* def_fn f [p2align]: begin the definition of the global function \f.
   The macro switches to the .text section, aligns the location counter
   to 2^\p2align bytes (the default of 0 requests no extra alignment),
   exports \f, tags the symbol as a function and finally emits the
   label itself.  The caller supplies the body and the matching .size
   directive.  */
36        .macro def_fn f p2align=0
37        .text
38        .p2align \p2align
39        .global \f
40        .type \f, %function
41\f:
42        .endm
43
/* Byte-replicated 64-bit constants for the word-at-a-time NUL test.
   REP8_01 and REP8_7f are used by the comparison loop; REP8_80 is not
   referenced by the code visible in this listing.  */
44#define REP8_01 0x0101010101010101     /* 0x01 in every byte.  */
45#define REP8_7f 0x7f7f7f7f7f7f7f7f     /* 0x7f in every byte.  */
46#define REP8_80 0x8080808080808080     /* 0x80 in every byte.  */
47
48/* Parameters and result.  */
49#define src1            x0      /* First string; advanced as it is read.  */
50#define src2            x1      /* Second string; advanced as it is read.  */
51#define limit           x2      /* Maximum number of bytes to compare.  */
52#define result          x0      /* Return value; shares x0 with src1.  */
53
54/* Internal variables.  All of them live in x3-x15; the routine makes no
   calls and does not touch the stack.  */
55#define data1           x3      /* Current Dword (or byte) from src1.  */
56#define data1w          w3      /* 32-bit view of data1, used by ldrb.  */
57#define data2           x4      /* Current Dword (or byte) from src2.  */
58#define data2w          w4      /* 32-bit view of data2, used by ldrb.  */
59#define has_nul         x5      /* Non-zero if data1 holds a NUL byte.  */
60#define diff            x6      /* data1 ^ data2.  */
61#define syndrome        x7      /* diff | has_nul.  */
62#define tmp1            x8      /* Scratch.  */
63#define tmp2            x9      /* Scratch.  */
64#define tmp3            x10     /* Scratch.  */
65#define zeroones        x11     /* Holds REP8_01.  */
66#define pos             x12     /* Leading-zero count of the syndrome.  */
67#define limit_wd        x13     /* Dwords left, minus one; negative on the last.  */
68#define mask            x14     /* Selects the bytes beyond the limit.  */
69#define endloop         x15     /* Non-zero on a difference or on the last Dword.  */
70
/* strncmp: compare the strings at src1 (x0) and src2 (x1), looking at
   no more than `limit` (x2) bytes, and return the outcome in result (x0).

   Returns zero when limit is zero or when no difference is found before
   a NUL byte or the limit.  Otherwise the value is negative or positive
   according to the first difference, with bytes compared as unsigned.

   Four paths are taken from the entry sequence:
     - limit == 0: .Lret0 returns 0 at once.
     - (src1 ^ src2) & 7 != 0: .Lmisaligned8 compares byte by byte.
     - src1 & 7 != 0 with matching low bits: .Lmutual_align rounds both
       pointers down, neutralises the leading bytes and joins the loop.
     - otherwise: .Lloop_aligned compares eight bytes per iteration.

   Only x0-x15 and the flags are written; no stack is used and no other
   routine is called.

   Layout: the section is aligned to 64 bytes and then padded with seven
   nops (28 bytes).  The nine entry instructions add another 36 bytes,
   provided each one assembles to a single 4-byte instruction, so
   .Lloop_aligned begins on a 64-byte boundary and its ten instructions
   sit inside one 64-byte line.  Inserting or removing an instruction
   ahead of the loop would shift it off that boundary.  */
71        .text
72        .p2align 6
73        .rep 7
74        nop     /* Pad so that the loop below fits a cache line.  */
75        .endr
76def_fn strncmp
77        cbz     limit, .Lret0           /* Nothing to compare.  */
78        eor     tmp1, src1, src2
79        mov     zeroones, #REP8_01
80        tst     tmp1, #7
81        b.ne    .Lmisaligned8           /* Low three address bits differ.  */
82        ands    tmp1, src1, #7          /* tmp1 = byte offset within the Dword.  */
83        b.ne    .Lmutual_align          /* Same offset, but it is not zero.  */
84        /* Calculate the number of full and partial words -1.  */
85        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
86        lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
87
88        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
89           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
90           can be done in parallel across the entire word.  */
91        /* Start of performance-critical section  -- one 64B cache line.  */
92.Lloop_aligned:
93        ldr     data1, [src1], #8
94        ldr     data2, [src2], #8
95.Lstart_realigned:
96        subs    limit_wd, limit_wd, #1  /* Goes negative on the last Dword.  */
97        sub     tmp1, data1, zeroones
98        orr     tmp2, data1, #REP8_7f
99        eor     diff, data1, data2      /* Non-zero if differences found.  */
100        csinv   endloop, diff, xzr, pl  /* Last Dword or differences.  */
101        bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
102        ccmp    endloop, #0, #0, eq     /* NE if a NUL was seen, else test endloop.  */
103        b.eq    .Lloop_aligned          /* Loop while no NUL and endloop == 0.  */
104        /* End of performance-critical section  -- one 64B cache line.  */
105
106        /* If limit_wd is still non-negative (bit 63 clear) the limit has
   not been reached, so the loop ended on a NUL or a difference.  */
107        tbz     limit_wd, #63, .Lnot_limit
108
109        /* Limit % 8 == 0 => all bytes significant.  */
110        ands    limit, limit, #7
111        b.eq    .Lnot_limit
112
/* The limit falls inside this Dword.  Build a mask that covers the
   bytes beyond the limit, clear those bytes in both data words and
   flag them in has_nul so that they behave like a terminator.  */
113        lsl     limit, limit, #3        /* Bytes -> bits.  */
114        mov     mask, #~0
115#ifdef __AARCH64EB__
116        lsr     mask, mask, limit
117#else
118        lsl     mask, mask, limit
119#endif
120        bic     data1, data1, mask
121        bic     data2, data2, mask
122
123        /* Make sure that the NUL byte is marked in the syndrome.  */
124        orr     has_nul, has_nul, mask
125
126.Lnot_limit:
127        orr     syndrome, diff, has_nul
128
129#ifndef __AARCH64EB__
/* Little-endian: byte-reverse so that the earliest string byte becomes
   the most significant byte before counting leading zeros.  */
130        rev     syndrome, syndrome
131        rev     data1, data1
132        /* The MS-non-zero bit of the syndrome marks either the first bit
133           that is different, or the top bit of the first zero byte.
134           Shifting left now will bring the critical information into the
135           top bits.  */
136        clz     pos, syndrome
137        rev     data2, data2
138        lsl     data1, data1, pos
139        lsl     data2, data2, pos
140        /* But we need to zero-extend (char is unsigned) the value and then
141           perform a signed 32-bit subtraction.  */
/* NOTE(review): the sub below operates on the 64-bit X registers; both
   operands are 8-bit fields after the shifts by 56.  */
142        lsr     data1, data1, #56
143        sub     result, data1, data2, lsr #56
144        ret
145#else
146        /* For big-endian we cannot use the trick with the syndrome value
147           as carry-propagation can corrupt the upper bits if the trailing
148           bytes in the string contain 0x01.  */
149        /* However, if there is no NUL byte in the dword, we can generate
150           the result directly.  We can't just subtract the bytes as the
151           MSB might be significant.  */
152        cbnz    has_nul, 1f
153        cmp     data1, data2
154        cset    result, ne              /* 1 if the Dwords differ, else 0.  */
155        cneg    result, result, lo      /* Negate if data1 < data2 (unsigned).  */
156        ret
1571:
158        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
159        rev     tmp3, data1
160        sub     tmp1, tmp3, zeroones
161        orr     tmp2, tmp3, #REP8_7f
162        bic     has_nul, tmp1, tmp2
163        rev     has_nul, has_nul
164        orr     syndrome, diff, has_nul
165        clz     pos, syndrome
166        /* The MS-non-zero bit of the syndrome marks either the first bit
167           that is different, or the top bit of the first zero byte.
168           Shifting left now will bring the critical information into the
169           top bits.  */
170        lsl     data1, data1, pos
171        lsl     data2, data2, pos
172        /* But we need to zero-extend (char is unsigned) the value and then
173           perform a signed 32-bit subtraction.  */
/* NOTE(review): the sub below operates on the 64-bit X registers; both
   operands are 8-bit fields after the shifts by 56.  */
174        lsr     data1, data1, #56
175        sub     result, data1, data2, lsr #56
176        ret
177#endif
178
179.Lmutual_align:
180        /* Sources are mutually aligned, but are not currently at an
181           alignment boundary.  Round down the addresses and then mask off
182           the bytes that precede the start point.
183           We also need to adjust the limit calculations, but without
184           overflowing if the limit is near ULONG_MAX.  */
/* On entry tmp1 holds src1 & 7, the number of bytes that precede the
   start point inside the first Dword.  Those bytes are forced to 0xff
   in both data words, so they contain no NUL and no difference.  */
185        bic     src1, src1, #7
186        bic     src2, src2, #7
187        ldr     data1, [src1], #8
188        neg     tmp3, tmp1, lsl #3      /* 64 - bits(bytes beyond align). */
189        ldr     data2, [src2], #8
190        mov     tmp2, #~0
191        sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
192#ifdef __AARCH64EB__
193        /* Big-endian.  Early bytes are at MSB.  */
194        lsl     tmp2, tmp2, tmp3        /* Shift by (tmp3 & 63) = 64 - 8 * tmp1.  */
195#else
196        /* Little-endian.  Early bytes are at LSB.  */
197        lsr     tmp2, tmp2, tmp3        /* Shift by (tmp3 & 63) = 64 - 8 * tmp1.  */
198#endif
199        and     tmp3, limit_wd, #7
200        lsr     limit_wd, limit_wd, #3
201        /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
202        add     limit, limit, tmp1
203        add     tmp3, tmp3, tmp1
204        orr     data1, data1, tmp2
205        orr     data2, data2, tmp2
206        add     limit_wd, limit_wd, tmp3, lsr #3  /* One more Dword if the skipped bytes push past a boundary.  */
207        b       .Lstart_realigned
208
209.Lret0:
210        mov     result, #0
211        ret
212
213        .p2align 6
214.Lmisaligned8:
/* Byte-at-a-time comparison for sources whose low address bits differ.
   limit is kept as the number of bytes left minus one, so the subs in
   the loop clears the carry flag when the final byte has been loaded.  */
215        sub     limit, limit, #1        /* limit != 0 was checked at entry.  */
2161:
217        /* Perhaps we can do better than this.  */
/* The two ccmp instructions chain three tests: bytes remain (carry set
   by subs), data1w is not NUL (carry set by the compare with 1) and
   data1w equals data2w.  When a test fails the flags are forced to
   0b0000, which reads as "not equal" and ends the loop.  */
218        ldrb    data1w, [src1], #1
219        ldrb    data2w, [src2], #1
220        subs    limit, limit, #1
221        ccmp    data1w, #1, #0, cs      /* NZCV = 0b0000.  */
222        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
223        b.eq    1b
224        sub     result, data1, data2    /* Both bytes were zero-extended by ldrb.  */
225        ret
226        .size strncmp, . - strncmp
227
228#endif
Note: See TracBrowser for help on using the repository browser.