source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/strcpy.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 10.3 KB
Line 
1/*
2   strcpy/stpcpy - copy a string returning pointer to start/end.
3
4   Copyright (c) 2013, 2014, 2015 ARM Ltd.
5   All Rights Reserved.
6
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions are met:
9       * Redistributions of source code must retain the above copyright
10         notice, this list of conditions and the following disclaimer.
11       * Redistributions in binary form must reproduce the above copyright
12         notice, this list of conditions and the following disclaimer in the
13         documentation and/or other materials provided with the distribution.
14       * Neither the name of the company nor the names of its contributors
15         may be used to endorse or promote products derived from this
16         software without specific prior written permission.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
29
30#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
31/* See strcpy-stub.c  */
32#else
33
34/* Assumptions:
35 *
36 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
37 */
38
39/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
40
41   To test the page crossing code path more thoroughly, compile with
42   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
43   entry path.  This option is not intended for production use.  */
44
45/* Arguments and results.  */
46#define dstin           x0      /* Destination pointer on entry; the result register.  */
47#define srcin           x1      /* Source pointer on entry; never written.  */
48
49/* Locals and temporaries.  */
/* Every alias below lives in x2-x17, which AAPCS64 treats as caller-saved,
   so the routine has nothing to save or restore.  */
50#define src             x2      /* Source read cursor.  */
51#define dst             x3      /* Destination write cursor.  */
52#define data1           x4      /* First 8 bytes of the current 16-byte chunk.  */
53#define data1w          w4      /* Low 32 bits of data1.  */
54#define data2           x5      /* Second 8 bytes of the current 16-byte chunk.  */
55#define data2w          w5      /* Low 32 bits of data2.  */
56#define has_nul1        x6      /* NUL syndrome of data1: non-zero iff it holds a zero byte.  */
57#define has_nul2        x7      /* NUL syndrome of data2.  */
58#define tmp1            x8      /* Scratch.  */
59#define tmp2            x9      /* Scratch.  */
60#define tmp3            x10     /* Scratch.  */
61#define tmp4            x11     /* Scratch.  */
62#define zeroones        x12     /* Holds REP8_01 for the whole routine.  */
63#define data1a          x13     /* Scratch for the 128-bit rotate in .Lpage_cross.  */
64#define data2a          x14     /* data2 with pre-string bytes forced to 0xff (.Lpage_cross).  */
65#define pos             x15     /* Bit offset of the terminating NUL, derived from clz.  */
66#define len             x16     /* Not referenced by the code visible in this file.  */
67#define to_align        x17     /* srcin & 15; .Lbulk_entry reuses it as (srcin & 15) - 16.  */
68
/* STRCPY is the name of the symbol this file exports: stpcpy when
   BUILD_STPCPY is defined, strcpy otherwise.  */
69#ifdef BUILD_STPCPY
70#define STRCPY stpcpy
71#else
72#define STRCPY strcpy
73#endif
74
/* def_fn f [p2align=N]: switch to .text, align to 2^N bytes (N defaults
   to 0), and open a global symbol \f typed as a function.  The label is
   emitted last so that it lands on the aligned address.  */
75        .macro def_fn f p2align=0
76        .text
77        .p2align \p2align
78        .global \f
79        .type \f, %function
80\f:
81        .endm
82
83        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
84           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
85           can be done in parallel across the entire word.  */
86
        /* Each REP8_xx constant repeats the byte value xx in all eight bytes
           of a 64-bit word, so that the test above covers eight characters
           at once.  */
87#define REP8_01 0x0101010101010101      /* The "X - 1" term, per byte.  */
88#define REP8_7f 0x7f7f7f7f7f7f7f7f      /* The "X | 0x7f" term, per byte.  */
89#define REP8_80 0x8080808080808080      /* Not referenced by the code visible in this file.  */
90
91        /* AArch64 systems have a minimum page size of 4k.  We can do a quick
92           page size check for crossing this boundary on entry and if we
93           do not, then we can short-circuit much of the entry code.  We
94           expect early page-crossing strings to be rare (probability of
95           16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
96           predictable, even with random strings.
97
98           We don't bother checking for larger page sizes; the cost of setting
99           up the correct page size is just not worth the extra gain from
100           a small reduction in the cases taking the slow path.  Note that
101           we only care about whether the first fetch, which may be
102           misaligned, crosses a page boundary - after that we move to aligned
103           fetches for the remainder of the string.  */
104
/* MIN_PAGE_P2 is log2 of the page size assumed by the entry check:
   4 (16-byte "pages") in the STRCPY_TEST_PAGE_CROSS build, so that any
   source address with (srcin & 15) != 0 branches to .Lpage_cross, and
   12 (4 KiB pages) otherwise.  */
105#ifdef STRCPY_TEST_PAGE_CROSS
106        /* Make everything that isn't Qword aligned look like a page cross.  */
107#define MIN_PAGE_P2 4
108#else
109#define MIN_PAGE_P2 12
110#endif
111
112#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
113
/* STRCPY: copy the NUL-terminated string at srcin (x1) to dstin (x0),
   terminator included.

   In:      x0 = destination, x1 = source.
   Out:     x0.  The strcpy build never writes x0, so it still holds the
            destination pointer; the BUILD_STPCPY build replaces it with
            the address of the NUL it stored.
   Scratch: x2-x15, x17 and the NZCV flags.  The stack is not used.

   The entry is aligned to 64 bytes (p2align=6).  */
114def_fn STRCPY p2align=6
115        /* For moderately short strings, the fastest way to do the copy is to
116           calculate the length of the string in the same way as strlen, then
117           essentially do a memcpy of the result.  This avoids the need for
118           multiple byte copies and further means that by the time we
119           reach the bulk copy loop we know we can always use DWord
120           accesses.  We expect strcpy to rarely be called repeatedly
121           with the same source string, so branch prediction is likely to
122           always be difficult - we mitigate against this by preferring
123           conditional select operations over branches whenever this is
124           feasible.  */
        /* tmp2 = offset of srcin within its MIN_PAGE_SIZE page and
           to_align = srcin mod 16.  tmp1 = -to_align is consumed only by
           .Lpage_cross; the fast path below overwrites it.  */
125        and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
126        mov     zeroones, #REP8_01
127        and     to_align, srcin, #15
128        cmp     tmp2, #(MIN_PAGE_SIZE - 16)
129        neg     tmp1, to_align
130        /* The first fetch will straddle a (possible) page boundary iff
131           srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
132           aligned string will never fail the page align check, so will
133           always take the fast path.  */
        /* The flags tested here are those of the cmp above; the neg in
           between does not set flags.  */
134        b.gt    .Lpage_cross
135
136.Lpage_cross_ok:
        /* Load 16 bytes straight from srcin, which need not be aligned.
           This label is also the re-entry point from .Lpage_cross when its
           aligned probe found no NUL.  */
137        ldp     data1, data2, [srcin]
138#ifdef __AARCH64EB__
139        /* Because we expect the end to be found within 16 characters
140           (profiling shows this is the most common case), it's worth
141           swapping the bytes now to save having to recalculate the
142           termination syndrome later.  We preserve data1 and data2
143           so that we can re-use the values later on.  */
144        rev     tmp2, data1
145        sub     tmp1, tmp2, zeroones
146        orr     tmp2, tmp2, #REP8_7f
147        bics    has_nul1, tmp1, tmp2
148        b.ne    .Lfp_le8
149        rev     tmp4, data2
150        sub     tmp3, tmp4, zeroones
151        orr     tmp4, tmp4, #REP8_7f
152#else
        /* has_nul1 = (data1 - 0x01..01) & ~(data1 | 0x7f..7f).  A non-zero
           result (Z clear) means the NUL is within the first 8 bytes.  */
153        sub     tmp1, data1, zeroones
154        orr     tmp2, data1, #REP8_7f
155        bics    has_nul1, tmp1, tmp2
156        b.ne    .Lfp_le8
157        sub     tmp3, data2, zeroones
158        orr     tmp4, data2, #REP8_7f
159#endif
        /* Reached only when data1 holds no NUL.  If data2 holds none either,
           the first 16 bytes are all string data: go to the bulk copy.
           Otherwise fall through to .Lfp_gt8 with has_nul2 set.  */
160        bics    has_nul2, tmp3, tmp4
161        b.eq    .Lbulk_entry
162
163        /* The string is short (<=16 bytes).  We don't know exactly how
164           short though, yet.  Work out the exact length so that we can
165           quickly select the optimal copy strategy.  */
166.Lfp_gt8:
        /* Entered with data1 NUL-free, the first NUL somewhere in data2 and
           has_nul2 holding the syndrome of data2.  After rev + clz, pos is
           8 * k, where k (0..7) is the index of that NUL within data2.  */
167        rev     has_nul2, has_nul2
168        clz     pos, has_nul2
169        mov     tmp2, #56
        /* dst = dstin + k.  pos then becomes 56 - 8 * k, the shift that moves
           the NUL byte of data2 into the last byte written by an 8-byte
           store.  */
170        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
171        sub     pos, tmp2, pos
172#ifdef __AARCH64EB__
173        lsr     data2, data2, pos
174#else
175        lsl     data2, data2, pos
176#endif
        /* Two overlapping 8-byte stores.  The shifted data2 covers
           dstin + k + 1 .. dstin + k + 8 and ends exactly on the NUL, but the
           shift left zeros in the bytes that land below dstin + 8.  data1 is
           therefore stored second so that it overwrites those zeros with
           the real first 8 bytes.  */
177        str     data2, [dst, #1]
178        str     data1, [dstin]
179#ifdef BUILD_STPCPY
        /* The NUL was written at dstin + 8 + k, which is dst + 8.  */
180        add     dstin, dst, #8
181#endif
182        ret
183
184.Lfp_le8:
        /* Entered with the first NUL inside data1 and has_nul1 holding the
           syndrome of data1.  After rev + clz, pos is 8 * k, where k (0..7)
           is the index of the NUL, and dst = dstin + k is where that NUL
           will be written.  */
185        rev     has_nul1, has_nul1
186        clz     pos, has_nul1
187        add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
        /* tmp2 = 8 * (k - 3).  A negative result means k < 3, i.e. fewer than
           four bytes to write.  */
188        subs    tmp2, pos, #24                  /* Pos in bits. */
189        b.lt    .Lfp_lt4
190#ifdef __AARCH64EB__
191        mov     tmp2, #56
192        sub     pos, tmp2, pos
193        lsr     data2, data1, pos
194        lsr     data1, data1, #32
195#else
196        lsr     data2, data1, tmp2
197#endif
198        /* 4->8 bytes to copy, terminator included.  */
        /* Two 4-byte stores that may overlap: the low word of data2 is the
           four bytes ending on the NUL (written at dst - 3 .. dst), and the
           low word of data1 is the first four bytes of the string.  */
199        str     data2w, [dst, #-3]
200        str     data1w, [dstin]
201#ifdef BUILD_STPCPY
        /* dst is the address of the NUL just written.  */
202        mov     dstin, dst
203#endif
204        ret
205.Lfp_lt4:
        /* pos == 0 means the very first byte is the NUL (empty string).  */
206        cbz     pos, .Lfp_lt2
207        /* 2->3 bytes to copy.  */
208#ifdef __AARCH64EB__
209        lsr     data1, data1, #48
210#endif
        /* Write the first two source bytes.  When k == 1 the second of them
           is already the NUL; the strb below writes it again, which is
           harmless.  */
211        strh    data1w, [dstin]
212        /* Fall-through, one byte (max) to go.  */
213.Lfp_lt2:
214        /* Null-terminated string.  Last character must be zero!  */
215        strb    wzr, [dst]
216#ifdef BUILD_STPCPY
        /* dst is the address of the NUL just written.  */
217        mov     dstin, dst
218#endif
219        ret
220
221        .p2align 6
222        /* Aligning here ensures that the entry code and main loop all lie
223           within one 64-byte cache line.  */
224.Lbulk_entry:
        /* Entered from the fast path with data1/data2 = the first 16 source
           bytes, none of which is NUL.  Store them, then step src up to the
           next 16-byte boundary of the source (srcin + 16 when srcin is
           already aligned) and move dst by the same distance.  to_align
           becomes (srcin & 15) - 16, so subtracting it adds 1..16 bytes.
           Any bytes between src and srcin + 16 get copied a second time by
           the loop.  */
225        sub     to_align, to_align, #16
226        stp     data1, data2, [dstin]
227        sub     src, srcin, to_align
228        sub     dst, dstin, to_align
229        b       .Lentry_no_page_cross
230
231        /* The inner loop deals with two Dwords at a time.  This has a
232           slightly higher start-up cost, but we should win quite quickly,
233           especially on cores with a high number of issue slots per
234           cycle, as we get much better parallelism out of the operations.  */
235.Lmain_loop:
        /* Store the pair loaded by the previous pass, which is known to be
           NUL-free, and advance dst by 16.  */
236        stp     data1, data2, [dst], #16
237.Lentry_no_page_cross:
        /* Load the next 16 source bytes and advance src by 16.  On entry
           from .Lbulk_entry src is 16-byte aligned, and it stays so.  */
238        ldp     data1, data2, [src], #16
239        sub     tmp1, data1, zeroones
240        orr     tmp2, data1, #REP8_7f
241        sub     tmp3, data2, zeroones
242        orr     tmp4, data2, #REP8_7f
243        bic     has_nul1, tmp1, tmp2
        /* bics leaves Z set iff data2 has no NUL.  Only in that case does
           ccmp compare has_nul1 with 0; otherwise it forces NZCV = 0000
           ("ne").  The b.eq therefore loops only while both words are
           NUL-free.  */
244        bics    has_nul2, tmp3, tmp4
245        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
246        b.eq    .Lmain_loop
247
248        /* Since we know we are copying at least 16 bytes, the fastest way
249           to deal with the tail is to determine the location of the
250           trailing NUL, then (re)copy the 16 bytes leading up to that.  */
        /* "ne" after this cmp means the NUL is in data1.  The flags stay live
           until the csel of pos below: none of the instructions in between
           sets flags.  */
251        cmp     has_nul1, #0
252#ifdef __AARCH64EB__
253        /* For big-endian, carry propagation (if the final byte in the
254           string is 0x01) means we cannot use has_nul directly.  The
255           easiest way to get the correct byte is to byte-swap the data
256           and calculate the syndrome a second time.  */
257        csel    data1, data1, data2, ne
258        rev     data1, data1
259        sub     tmp1, data1, zeroones
260        orr     tmp2, data1, #REP8_7f
261        bic     has_nul1, tmp1, tmp2
262#else
263        csel    has_nul1, has_nul1, has_nul2, ne
264#endif
        /* After rev + clz, pos = 8 * (index of the NUL within its word).
           Turn that into the bit count up to and including the NUL within
           this 16-byte chunk: +8 when the NUL is in data1, +72 (eight more
           bytes) when it is in data2.  */
265        rev     has_nul1, has_nul1
266        clz     pos, has_nul1
267        add     tmp1, pos, #72
268        add     pos, pos, #8
269        csel    pos, pos, tmp1, ne
        /* Advance both cursors by that many bytes.  src had already been
           moved 16 past this chunk by the post-indexed ldp while dst had
           not, hence the different offsets below: [src, #-32] and
           [dst, #-16] both address the 16 bytes that end on the NUL.  */
270        add     src, src, pos, lsr #3
271        add     dst, dst, pos, lsr #3
272        ldp     data1, data2, [src, #-32]
273        stp     data1, data2, [dst, #-16]
274#ifdef BUILD_STPCPY
        /* dst is one byte past the NUL just written.  */
275        sub     dstin, dst, #1
276#endif
277        ret
278
279.Lpage_cross:
        /* Entered from the STRCPY entry sequence when a 16-byte load at srcin
           could cross a MIN_PAGE_SIZE boundary.  On entry to_align =
           srcin & 15 (non-zero on this path) and tmp1 = -to_align.  */
280        bic     src, srcin, #15
281        /* Start by loading two words at [srcin & ~15], then forcing the
282           bytes that precede srcin to 0xff.  This means they never look
283           like termination bytes.  */
284        ldp     data1, data2, [src]
285        lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
        /* tmp2 = all ones when (to_align & 7) != 0, else zero.  The zero case
           is needed because the register shift below uses its amount
           modulo 64, so a whole-word shift would otherwise leave the mask
           fully set.  After the shift, tmp2 has 0xff in exactly the
           (to_align & 7) bytes that come first in memory.  */
286        tst     to_align, #7
287        csetm   tmp2, ne
288#ifdef __AARCH64EB__
289        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
290#else
291        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
292#endif
        /* to_align < 8: the string starts inside data1, so mask data1 and
           keep data2 as loaded.  to_align >= 8: all of data1 precedes the
           string, so make it all ones and use the masked data2 instead.  */
293        orr     data1, data1, tmp2
294        orr     data2a, data2, tmp2
295        cmp     to_align, #8
296        csinv   data1, data1, xzr, lt
297        csel    data2, data2, data2a, lt
        /* Only the presence of a NUL is tested here, not its position, so no
           byte swap is done even for big-endian.  The bics/ccmp pair leaves
           "eq" only when neither word contains a NUL.  */
298        sub     tmp1, data1, zeroones
299        orr     tmp2, data1, #REP8_7f
300        sub     tmp3, data2, zeroones
301        orr     tmp4, data2, #REP8_7f
302        bic     has_nul1, tmp1, tmp2
303        bics    has_nul2, tmp3, tmp4
304        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        /* No NUL between srcin and the end of this aligned 16-byte block, so
           the string continues past it: resume on the normal path, which
           reloads 16 bytes from srcin itself.  */
305        b.eq    .Lpage_cross_ok
306        /* We now need to make data1 and data2 look like they've been
307           loaded directly from srcin.  Do a rotate on the 128-bit value.  */
        /* tmp1 = 8 * to_align and tmp2 = -8 * to_align; the register shifts
           below use these amounts modulo 64.  */
308        lsl     tmp1, to_align, #3      /* Bytes->bits.  */
309        neg     tmp2, to_align, lsl #3
310#ifdef __AARCH64EB__
311        lsl     data1a, data1, tmp1
312        lsr     tmp4, data2, tmp2
313        lsl     data2, data2, tmp1
314        orr     tmp4, tmp4, data1a
315        cmp     to_align, #8
316        csel    data1, tmp4, data2, lt
        /* Recompute both syndromes on byte-reversed copies, as the fast path
           does for big-endian, so that .Lfp_le8 / .Lfp_gt8 can locate the
           NUL with rev + clz.  */
317        rev     tmp2, data1
318        rev     tmp4, data2
319        sub     tmp1, tmp2, zeroones
320        orr     tmp2, tmp2, #REP8_7f
321        sub     tmp3, tmp4, zeroones
322        orr     tmp4, tmp4, #REP8_7f
323#else
        /* to_align < 8: data1 = the remaining bytes of the old data1 joined
           with the leading bytes of data2, and data2 = the rest of data2
           shifted down.  to_align >= 8: data1 = data2 shifted down by
           (to_align - 8) bytes.  The shifts fill the vacated trailing bytes
           with zeros; these lie after the real NUL, so the first NUL found
           is still the terminator.  */
324        lsr     data1a, data1, tmp1
325        lsl     tmp4, data2, tmp2
326        lsr     data2, data2, tmp1
327        orr     tmp4, tmp4, data1a
328        cmp     to_align, #8
329        csel    data1, tmp4, data2, lt
330        sub     tmp1, data1, zeroones
331        orr     tmp2, data1, #REP8_7f
332        sub     tmp3, data2, zeroones
333        orr     tmp4, data2, #REP8_7f
334#endif
        /* data1 and data2 now look as if loaded from srcin.  Hand over to the
           short-string code: .Lfp_le8 when the NUL is in data1, otherwise
           .Lfp_gt8 with has_nul2 computed for it.  */
335        bic     has_nul1, tmp1, tmp2
336        cbnz    has_nul1, .Lfp_le8
337        bic     has_nul2, tmp3, tmp4
338        b       .Lfp_gt8
339
        /* Record the size of the function symbol: from its entry label to
           this point.  */
340        .size   STRCPY, . - STRCPY
341#endif
Note: See TracBrowser for help on using the repository browser.