source: trunk/libs/newlib/src/newlib/libc/machine/arm/memchr.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 11.4 KB
Line 
1/* Copyright (c) 2010-2011, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7
8      * Redistributions of source code must retain the above copyright
9      notice, this list of conditions and the following disclaimer.
10
11      * Redistributions in binary form must reproduce the above copyright
12      notice, this list of conditions and the following disclaimer in the
13      documentation and/or other materials provided with the distribution.
14
15      * Neither the name of Linaro Limited nor the names of its
16      contributors may be used to endorse or promote products derived
17      from this software without specific prior written permission.
18
19   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31   Written by Dave Gilbert <david.gilbert@linaro.org>
32
33   This memchr routine is optimised on a Cortex-A9 and should work on
34   all ARMv7 processors.   It has a fast path for short sizes, and has
35   an optimised path for large data sets; the worst case is finding the
36   match early in a large data set. */
37
38/* Copyright (c) 2015 ARM Ltd.
39   All rights reserved.
40
41   Redistribution and use in source and binary forms, with or without
42   modification, are permitted provided that the following conditions are met:
43       * Redistributions of source code must retain the above copyright
44         notice, this list of conditions and the following disclaimer.
45       * Redistributions in binary form must reproduce the above copyright
46         notice, this list of conditions and the following disclaimer in the
47         documentation and/or other materials provided with the distribution.
48       * Neither the name of the Linaro nor the
49         names of its contributors may be used to endorse or promote products
50         derived from this software without specific prior written permission.
51
52   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
53   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
54   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
55   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
56   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
63
64@ 2011-02-07 david.gilbert@linaro.org
65@    Extracted from local git a5b438d861
66@ 2011-07-14 david.gilbert@linaro.org
67@    Import endianness fix from local git ea786f1b
68@ 2011-10-11 david.gilbert@linaro.org
69@    Import from cortex-strings bzr rev 63
70@    Flip to ldrd (as suggested by Greta Yorsh)
71@    Make conditional on CPU type
72@    tidy
73
74@ This code requires armv6t2 or later.  Uses Thumb2.
75
76        .syntax unified
77
78#include "acle-compat.h"
79
80@ NOTE: This ifdef MUST match the one in memchr-stub.c
81#if defined (__ARM_NEON__) || defined (__ARM_NEON)
82        .arch   armv7-a
83        .fpu    neon
84
85
86/* Arguments */
87#define srcin           r0
88#define chrin           r1
89#define cntin           r2
90
91/* Retval */
92#define result          r0      /* Live range does not overlap with srcin */
93
94/* Working registers */
95#define src             r1      /* Live range does not overlap with chrin */
96#define tmp             r3
97#define synd            r0      /* No overlap with srcin or result */
98#define soff            r12
99
100/* Working NEON registers */
101#define vrepchr         q0
102#define vdata0          q1
103#define vdata0_0        d2      /* Lower half of vdata0 */
104#define vdata0_1        d3      /* Upper half of vdata0 */
105#define vdata1          q2
106#define vdata1_0        d4      /* Lower half of vhas_chr0 */
107#define vdata1_1        d5      /* Upper half of vhas_chr0 */
108#define vrepmask        q3
109#define vrepmask0       d6
110#define vrepmask1       d7
111#define vend            q4
112#define vend0           d8
113#define vend1           d9
114
115/*
116 * Core algorithm:
117 *
118 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
119 * byte. Each bit is set if the relevant byte matched the requested character
120 * and cleared otherwise. Since the bits in the syndrome reflect exactly the
121 * order in which things occur in the original string, counting trailing zeros
122 * allows to identify exactly which byte has matched.
123 */
124
125        .text
126        .thumb_func
127        .align 4
128        .p2align 4,,15
129        .global memchr
130        .type memchr,%function
131
132memchr:
133        .cfi_sections .debug_frame
134        .cfi_startproc
135        /* Use a simple loop if there are less than 8 bytes to search.  */
136        cmp     cntin, #7
137        bhi     .Llargestr
138        and     chrin, chrin, #0xff
139
140.Lsmallstr:
141        subs    cntin, cntin, #1
142        blo     .Lnotfound      /* Return not found if reached end.  */
143        ldrb    tmp, [srcin], #1
144        cmp     tmp, chrin
145        bne     .Lsmallstr      /* Loop again if not found.  */
146        /* Otherwise fixup address and return.  */
147        sub     result, result, #1
148        bx      lr
149
150
151.Llargestr:
152        vdup.8  vrepchr, chrin  /* Duplicate char across all lanes. */
153        /*
154         * Magic constant 0x8040201008040201 allows us to identify which lane
155         * matches the requested byte.
156         */
157        movw    tmp, #0x0201
158        movt    tmp, #0x0804
159        lsl     soff, tmp, #4
160        vmov    vrepmask0, tmp, soff
161        vmov    vrepmask1, tmp, soff
162        /* Work with aligned 32-byte chunks */
163        bic     src, srcin, #31
164        ands    soff, srcin, #31
165        beq     .Lloopintro     /* Go straight to main loop if it's aligned. */
166
167        /*
168         * Input string is not 32-byte aligned. We calculate the syndrome
169         * value for the aligned 32 bytes block containing the first bytes
170         * and mask the irrelevant part.
171         */
172        vld1.8          {vdata0, vdata1}, [src:256]!
173        sub             tmp, soff, #32
174        adds            cntin, cntin, tmp
175        vceq.i8         vdata0, vdata0, vrepchr
176        vceq.i8         vdata1, vdata1, vrepchr
177        vand            vdata0, vdata0, vrepmask
178        vand            vdata1, vdata1, vrepmask
179        vpadd.i8        vdata0_0, vdata0_0, vdata0_1
180        vpadd.i8        vdata1_0, vdata1_0, vdata1_1
181        vpadd.i8        vdata0_0, vdata0_0, vdata1_0
182        vpadd.i8        vdata0_0, vdata0_0, vdata0_0
183        vmov            synd, vdata0_0[0]
184
185        /* Clear the soff lower bits */
186        lsr             synd, synd, soff
187        lsl             synd, synd, soff
188        /* The first block can also be the last */
189        bls             .Lmasklast
190        /* Have we found something already? */
191        cbnz            synd, .Ltail
192
193
194.Lloopintro:
195        vpush   {vend}
196        /* 264/265 correspond to d8/d9 for q4 */
197        .cfi_adjust_cfa_offset  16
198        .cfi_rel_offset 264, 0
199        .cfi_rel_offset 265, 8
200        .p2align 3,,7
201.Lloop:
202        vld1.8          {vdata0, vdata1}, [src:256]!
203        subs            cntin, cntin, #32
204        vceq.i8         vdata0, vdata0, vrepchr
205        vceq.i8         vdata1, vdata1, vrepchr
206        /* If we're out of data we finish regardless of the result. */
207        bls             .Lend
208        /* Use a fast check for the termination condition. */
209        vorr            vend, vdata0, vdata1
210        vorr            vend0, vend0, vend1
211        vmov            synd, tmp, vend0
212        orrs            synd, synd, tmp
213        /* We're not out of data, loop if we haven't found the character. */
214        beq             .Lloop
215
216.Lend:
217        vpop            {vend}
218        .cfi_adjust_cfa_offset  -16
219        .cfi_restore    264
220        .cfi_restore    265
221
222        /* Termination condition found, let's calculate the syndrome value. */
223        vand            vdata0, vdata0, vrepmask
224        vand            vdata1, vdata1, vrepmask
225        vpadd.i8        vdata0_0, vdata0_0, vdata0_1
226        vpadd.i8        vdata1_0, vdata1_0, vdata1_1
227        vpadd.i8        vdata0_0, vdata0_0, vdata1_0
228        vpadd.i8        vdata0_0, vdata0_0, vdata0_0
229        vmov            synd, vdata0_0[0]
230        cbz             synd, .Lnotfound
231        bhi             .Ltail
232
233
234.Lmasklast:
235        /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
236        neg     cntin, cntin
237        lsl     synd, synd, cntin
238        lsrs    synd, synd, cntin
239        it      eq
240        moveq   src, #0 /* If no match, set src to 0 so the retval is 0. */
241
242
243.Ltail:
244        /* Count the trailing zeros using bit reversing */
245        rbit    synd, synd
246        /* Compensate the last post-increment */
247        sub     src, src, #32
248        /* Count the leading zeros */
249        clz     synd, synd
250        /* Compute the potential result and return */
251        add     result, src, synd
252        bx      lr
253
254
255.Lnotfound:
256        /* Set result to NULL if not found and return */
257        mov     result, #0
258        bx      lr
259
260        .cfi_endproc
261        .size   memchr, . - memchr
262
263#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
264
265#if __ARM_ARCH_PROFILE == 'M'
266       .arch armv7e-m
267#else
268       .arch armv6t2
269#endif
270
271@ this lets us check a flag in a 00/ff byte easily in either endianness
272#ifdef __ARMEB__
273#define CHARTSTMASK(c) 1<<(31-(c*8))
274#else
275#define CHARTSTMASK(c) 1<<(c*8)
276#endif
277        .text
278        .thumb
279
280@ ---------------------------------------------------------------------------
281        .thumb_func
282        .align 2
283        .p2align 4,,15
284        .global memchr
285        .type memchr,%function
286memchr:
287        @ r0 = start of memory to scan
288        @ r1 = character to look for
289        @ r2 = length
290        @ returns r0 = pointer to character or NULL if not found
291        and     r1,r1,#0xff     @ Don't trust the caller to pass a char
292
293        cmp     r2,#16          @ If short don't bother with anything clever
294        blt     20f
295
296        tst     r0, #7          @ If it's already aligned skip the next bit
297        beq     10f
298
299        @ Work up to an aligned point
3005:
301        ldrb    r3, [r0],#1
302        subs    r2, r2, #1
303        cmp     r3, r1
304        beq     50f             @ If it matches exit found
305        tst     r0, #7
306        cbz     r2, 40f         @ If we run off the end, exit not found
307        bne     5b              @ If not aligned yet then do next byte
308       
30910:
310        @ We are aligned, we know we have at least 8 bytes to work with
311        push    {r4,r5,r6,r7}
312        orr     r1, r1, r1, lsl #8      @ expand the match word across all bytes
313        orr     r1, r1, r1, lsl #16
314        bic     r4, r2, #7      @ Number of double words to work with * 8
315        mvns    r7, #0          @ all F's
316        movs    r3, #0
317       
31815:
319        ldrd    r5,r6,[r0],#8
320        subs    r4, r4, #8
321        eor     r5,r5, r1       @ r5,r6 have 00's where bytes match the target
322        eor     r6,r6, r1
323        uadd8   r5, r5, r7      @ Par add 0xff - sets GE bits for bytes!=0
324        sel     r5, r3, r7      @ bytes are 00 for none-00 bytes,
325                                @ or ff for 00 bytes - NOTE INVERSION
326        uadd8   r6, r6, r7      @ Par add 0xff - sets GE bits for bytes!=0
327        sel     r6, r5, r7      @ chained....bytes are 00 for none-00 bytes
328                                @ or ff for 00 bytes - NOTE INVERSION
329        cbnz    r6, 60f
330        bne     15b             @ (Flags from the subs above)
331
332        pop     {r4,r5,r6,r7}
333        and     r1,r1,#0xff     @ r1 back to a single character
334        and     r2,r2,#7        @ Leave the count remaining as the number
335                                @ after the double words have been done
336 
33720:
338        cbz     r2, 40f         @ 0 length or hit the end already then not found
339
34021:  @ Post aligned section, or just a short call
341        ldrb    r3,[r0],#1
342        subs    r2,r2,#1
343        eor     r3,r3,r1        @ r3 = 0 if match - doesn't break flags from sub
344        cbz     r3, 50f
345        bne     21b             @ on r2 flags
346
34740:
348        movs    r0,#0           @ not found
349        bx      lr
350
35150:
352        subs    r0,r0,#1        @ found
353        bx      lr
354
35560:  @ We're here because the fast path found a hit
356     @ now we have to track down exactly which word it was
357        @ r0 points to the start of the double word after the one tested
358        @ r5 has the 00/ff pattern for the first word, r6 has the chained value
359        cmp     r5, #0
360        itte    eq
361        moveq   r5, r6          @ the end is in the 2nd word
362        subeq   r0,r0,#3        @ Points to 2nd byte of 2nd word
363        subne   r0,r0,#7        @ or 2nd byte of 1st word
364
365        @ r0 currently points to the 2nd byte of the word containing the hit
366        tst     r5, # CHARTSTMASK(0)    @ 1st character
367        bne     61f
368        adds    r0,r0,#1
369        tst     r5, # CHARTSTMASK(1)    @ 2nd character
370        ittt    eq
371        addeq   r0,r0,#1
372        tsteq   r5, # (3<<15)           @ 2nd & 3rd character
373        @ If not the 3rd must be the last one
374        addeq   r0,r0,#1
375
37661:
377        pop     {r4,r5,r6,r7}
378        subs    r0,r0,#1
379        bx      lr
380#else
381  /* Defined in memchr-stub.c.  */
382#endif
Note: See TracBrowser for help on using the repository browser.