source: trunk/libs/newlib/src/newlib/libc/machine/arm/strcmp-armv7.S @ 444

/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

        /* Implementation of strcmp for ARMv7 when DSP instructions are
           available.  Use ldrd to support wider loads, provided the data
           is sufficiently aligned.  Use saturating arithmetic to optimize
           the compares.  */
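
        /* A note on the zero-byte test used throughout this file: UADD8
           adds 0xff to each byte of data1, which carries out (setting
           that byte's GE flag) exactly when the byte is non-zero, and
           SEL then replaces each byte whose GE flag is clear with 0xff.
           A rough C sketch of the syndrome computation (the helper
           names here are purely illustrative):

               ge       = per_byte_carry (data1 + 0xffffffff);
               syndrome = select_bytes (data1 ^ data2, 0xffffffff, ge);

           so the syndrome is non-zero iff the two words differ or data1
           contains a NUL byte.  */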

        /* Build Options:
           STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
           byte in the string.  If comparing completely random strings
           the pre-check will save time, since there is a very high
           probability of a mismatch in the first character: we save
           significant overhead if this is the common case.  However,
           if strings are likely to be identical (e.g. because we're
           verifying a hit in a hash table), then this check is largely
           redundant.  */
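
        /* STRCMP_NO_PRECHECK is an ordinary preprocessor macro, so it
           would typically be set from the build system, e.g.
           (illustratively) by passing -DSTRCMP_NO_PRECHECK in the
           assembler flags.  */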

        /* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified
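
/* Note that def_fn, the endian-neutral shifts S2LO/S2HI, and the
   MSB/LSB and BYTEn_OFFSET constants are not defined in this file; in
   newlib they are expected to be supplied by the wrapper (strcmp.S)
   that #includes it.  */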

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2


        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm
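
        /* Worked example for the little-endian path: if the first
           mismatch (or NUL) is in the byte at the lowest address, the
           syndrome's set bits lie in bits 0-7; REV moves them to bits
           24-31, so CLZ yields a value in the range 0-7 and BIC #7
           rounds it down to 0.  The LSRs then leave exactly that byte
           in bits 0-7 of each word for the final subtraction.  */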

        .text
        .p2align        5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
def_fn  strcmp
#ifndef STRCMP_NO_PRECHECK
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        .cfi_sections .debug_frame
        .cfi_startproc
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     .Lstart_realigned8
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8
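
        /* The ORN sequence above forces every byte that precedes the
           real start of the strings to 0xff in both data words, so
           those bytes can neither produce a spurious difference nor
           look like a NUL terminator.  */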

        /* Unroll the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
        beq     .Lloop_aligned8
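
        /* The post-increment of 16 at the top of the loop covers both
           halves of the unrolled body; the second half simply re-reads
           the most recent eight bytes at offset -8.  */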

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1

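        /* In the byte-by-byte loop below, UXTB with a ROR of
           #BYTEn_OFFSET rotates byte n of data1 into the low eight bits
           and zero-extends it, so each byte of the already-loaded src1
           word can be compared without touching memory again.  */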
#ifdef STRCMP_NO_PRECHECK
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1              /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2              /* C=1, Z=1 => src2[1:0] = 0b10.  */
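        /* In the .LoverlapN fragments below, N is the number of bytes
           of each aligned src2 word that pair with the current src1
           word, i.e. 4 minus the low two bits of the original src2.  */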

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 Not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
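        /* The sequence below recomputes the syndrome with both words
           now in big-endian byte order; CLZ then locates the first
           mismatched (or NUL) byte, shifting both data words left by
           that amount moves the decisive bytes into bits 24-31, and
           the LSR #24/SUB pair turns them into the signed result.  */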
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr
        .cfi_endproc
        .size strcmp, . - .Lstrcmp_start_addr