source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/memcpy.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 6.9 KB
Line 
1/* Copyright (c) 2012-2013, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27/*
28 * Copyright (c) 2015 ARM Ltd
29 * All rights reserved.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 *    notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 *    notice, this list of conditions and the following disclaimer in the
38 *    documentation and/or other materials provided with the distribution.
39 * 3. The name of the company may not be used to endorse or promote
40 *    products derived from this software without specific prior written
41 *    permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
45 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
46 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
48 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
49 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
50 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
51 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55/* Assumptions:
56 *
57 * ARMv8-a, AArch64, unaligned accesses.
58 *
59 */
60
61#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
62/* See memcpy-stub.c  */
63#else
64
/* Register aliases used by memcpy below.
   dstin, src and count name x0, x1 and x2 (the first three argument
   registers).  memcpy never writes dstin.  */
65#define dstin   x0
66#define src     x1
67#define count   x2
/* Working pointers.  memcpy sets srcend = src + count and
   dstend = dstin + count on entry; dst is the 16-byte-aligned destination
   cursor used only by the large-copy path.  */
68#define dst     x3
69#define srcend  x4
70#define dstend  x5
/* Data registers.  Each letter is a low/high pair moved with ldp/stp.
   The *_lw / *_hw names are the 32-bit (w) views of the same registers,
   used for the word and byte accesses of the small-copy path.  */
71#define A_l     x6
72#define A_lw    w6
73#define A_h     x7
74#define A_hw    w7
75#define B_l     x8
76#define B_lw    w8
77#define B_h     x9
78#define C_l     x10
79#define C_h     x11
80#define D_l     x12
81#define D_h     x13
/* E and F are not separate registers: they reuse src, count, srcend and
   dst.  Loading E or F therefore destroys those values, so they may only
   be loaded once the corresponding pointer/count is no longer needed.  */
82#define E_l     src
83#define E_h     count
84#define F_l     srcend
85#define F_h     dst
/* tmp1 shares x9 with B_h: its value is lost as soon as the B pair is
   loaded.  */
86#define tmp1    x9
87
/* L(name) pastes the .L prefix onto name, giving an assembler-local
   (non-exported) label.  */
88#define L(l) .L ## l
89
/* def_fn f [p2align]: begin the definition of function symbol f.
   Switches to the .text section, aligns the location counter to
   2^p2align bytes (p2align defaults to 0), declares f as a global symbol
   of type %function and emits the label f: at the current location.
   The macro does not emit a .size directive; the user of the macro
   supplies it after the function body.  */
90        .macro def_fn f p2align=0
91        .text
92        .p2align \p2align
93        .global \f
94        .type \f, %function
95\f:
96        .endm
97
98/* Copies are split into 3 main cases: small copies of up to 16 bytes,
99   medium copies of 17..96 bytes which are fully unrolled. Large copies
100   of more than 96 bytes align the destination and use an unrolled loop
101   processing 64 bytes per iteration.
102   Small and medium copies read all data before writing, allowing any
103   kind of overlap, and memmove tailcalls memcpy for these cases as
104   well as non-overlapping copies.
105*/
106
/* memcpy: copy count bytes from src to dstin.
   In:    dstin (x0) = destination, src (x1) = source, count (x2) = size.
   Out:   x0 is never written, so it still holds dstin at every ret.
   Uses:  x1-x13 and the condition flags may be modified.  sp is not
          referenced, so the routine uses no stack.
   The entry point is aligned to 64 bytes (p2align=6).
   Dispatch: count <= 16 -> L(copy16), count > 96 -> L(copy_long),
   otherwise the medium path that follows the dispatch.  */
107def_fn memcpy p2align=6
108        prfm    PLDL1KEEP, [src]        /* Prefetch hint for the start of the source.  */
109        add     srcend, src, count      /* srcend = one past the last source byte.  */
110        add     dstend, dstin, count    /* dstend = one past the last destination byte.  */
111        cmp     count, 16
112        b.ls    L(copy16)               /* count <= 16 (unsigned compare).  */
113        cmp     count, 96
114        b.hi    L(copy_long)            /* count > 96.  */
115
116        /* Medium copies: 17..96 bytes.  */
117        sub     tmp1, count, 1          /* tmp1 = count - 1, in 16..95 here.  */
118        ldp     A_l, A_h, [src]         /* First 16 source bytes.  */
119        tbnz    tmp1, 6, L(copy96)      /* Bit 6 set: count is 65..96.  */
120        ldp     D_l, D_h, [srcend, -16] /* Last 16 source bytes.  */
121        tbz     tmp1, 5, 1f             /* Bit 5 clear: count is 17..32, A and D suffice.  */
        /* count is 33..64: also copy the second 16 bytes and the 16 bytes
           before the last 16.  Loading B_h overwrites tmp1 (both are x9);
           tmp1 is not tested again.  All four loads are issued before the
           first store.  */
122        ldp     B_l, B_h, [src, 16]
123        ldp     C_l, C_h, [srcend, -32]
124        stp     B_l, B_h, [dstin, 16]
125        stp     C_l, C_h, [dstend, -32]
1261:
127        stp     A_l, A_h, [dstin]
128        stp     D_l, D_h, [dstend, -16]
129        ret
130
131        .p2align 4
132        /* Small copies: 0..16 bytes.  */
133L(copy16):
134        cmp     count, 8
135        b.lo    1f                      /* count < 8.  */
        /* count is 8..16: the first 8 and the last 8 bytes together cover
           the whole range (the two accesses overlap when count < 16).  */
136        ldr     A_l, [src]
137        ldr     A_h, [srcend, -8]
138        str     A_l, [dstin]
139        str     A_h, [dstend, -8]
140        ret
141        .p2align 4
1421:
143        tbz     count, 2, 1f            /* count < 8 here; bit 2 clear: count is 0..3.  */
        /* count is 4..7: same scheme with the first and last 4 bytes.  */
144        ldr     A_lw, [src]
145        ldr     A_hw, [srcend, -4]
146        str     A_lw, [dstin]
147        str     A_hw, [dstend, -4]
148        ret
149
150        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
151           byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1521:
153        cbz     count, 2f               /* count == 0: nothing to copy.  */
154        lsr     tmp1, count, 1          /* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3.  */
155        ldrb    A_lw, [src]             /* First byte.  */
156        ldrb    A_hw, [srcend, -1]      /* Last byte.  */
157        ldrb    B_lw, [src, tmp1]       /* Byte at index tmp1.  */
158        strb    A_lw, [dstin]
159        strb    B_lw, [dstin, tmp1]
160        strb    A_hw, [dstend, -1]
1612:      ret
162
163        .p2align 4
164        /* Copy 65..96 bytes.  Copy 64 bytes from the start and
165           32 bytes from the end.  */
        /* Reached from the medium path with A already holding the first
           16 bytes.  E and F alias src/count and srcend/dst, so the two
           loads from srcend come last: after them src, count, srcend and
           dst no longer hold pointers, and only dstin/dstend are used.  */
166L(copy96):
167        ldp     B_l, B_h, [src, 16]
168        ldp     C_l, C_h, [src, 32]
169        ldp     D_l, D_h, [src, 48]
170        ldp     E_l, E_h, [srcend, -32] /* Overwrites src and count.  */
171        ldp     F_l, F_h, [srcend, -16] /* Overwrites srcend and dst.  */
172        stp     A_l, A_h, [dstin]
173        stp     B_l, B_h, [dstin, 16]
174        stp     C_l, C_h, [dstin, 32]
175        stp     D_l, D_h, [dstin, 48]
176        stp     E_l, E_h, [dstend, -32]
177        stp     F_l, F_h, [dstend, -16]
178        ret
179
180        /* Align DST to 16 byte alignment so that we don't cross cache line
181           boundaries on both loads and stores.  There are at least 96 bytes
182           to copy, so copy 16 bytes unaligned and then align.  The loop
183           copies 64 bytes per iteration and prefetches one iteration ahead.  */
184
185        .p2align 4
186L(copy_long):
187        and     tmp1, dstin, 15         /* tmp1 = dstin modulo 16 (0..15).  */
188        bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
189        ldp     D_l, D_h, [src]         /* First 16 source bytes.  */
190        sub     src, src, tmp1          /* Move src back by the same amount as dst.  */
191        add     count, count, tmp1      /* Count is now 16 too large.  */
192        ldp     A_l, A_h, [src, 16]
193        stp     D_l, D_h, [dstin]       /* Unaligned store of the first 16 bytes.  */
194        ldp     B_l, B_h, [src, 32]     /* Overwrites tmp1 (x9); it is not used again.  */
195        ldp     C_l, C_h, [src, 48]
196        ldp     D_l, D_h, [src, 64]!    /* Pre-indexed: src += 64.  */
197        subs    count, count, 128 + 16  /* Test and readjust count.  */
198        b.ls    2f                      /* Taken when count was <= 144: skip the loop.  */
        /* Loop: store the 64 bytes loaded on the previous pass at
           dst + 16 .. dst + 79 and load the next 64 from src + 16 .. src + 79.
           The pre-indexed stp/ldp on the D pair advance dst and src by 64.  */
1991:
200        stp     A_l, A_h, [dst, 16]
201        ldp     A_l, A_h, [src, 16]
202        stp     B_l, B_h, [dst, 32]
203        ldp     B_l, B_h, [src, 32]
204        stp     C_l, C_h, [dst, 48]
205        ldp     C_l, C_h, [src, 48]
206        stp     D_l, D_h, [dst, 64]!
207        ldp     D_l, D_h, [src, 64]!
208        subs    count, count, 64
209        b.hi    1b                      /* Repeat while count was > 64 before the subtract.  */
210
211        /* Write the last full set of 64 bytes.  The remainder is at most 64
212           bytes, so it is safe to always copy 64 bytes from the end even if
213           there is just 1 byte left.  */
        /* The pending A..D data is stored at dst + 16 .. dst + 79 while the
           final 64 source bytes are loaded into E, A, B, C; each of A, B, C
           is reloaded only after its pending value has been stored.  The
           four stores relative to dstend come after all loads.  */
2142:
215        ldp     E_l, E_h, [srcend, -64] /* Overwrites src and count; neither is used again.  */
216        stp     A_l, A_h, [dst, 16]
217        ldp     A_l, A_h, [srcend, -48]
218        stp     B_l, B_h, [dst, 32]
219        ldp     B_l, B_h, [srcend, -32]
220        stp     C_l, C_h, [dst, 48]
221        ldp     C_l, C_h, [srcend, -16]
222        stp     D_l, D_h, [dst, 64]
223        stp     E_l, E_h, [dstend, -64]
224        stp     A_l, A_h, [dstend, -48]
225        stp     B_l, B_h, [dstend, -32]
226        stp     C_l, C_h, [dstend, -16]
227        ret
228
229        .size   memcpy, . - memcpy
230#endif
Note: See TracBrowser for help on using the repository browser.