source: trunk/libs/newlib/src/newlib/libc/machine/aarch64/memset.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 6.5 KB
Line 
1/* Copyright (c) 2012-2013, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27/*
28 * Copyright (c) 2015 ARM Ltd
29 * All rights reserved.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 *    notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 *    notice, this list of conditions and the following disclaimer in the
38 *    documentation and/or other materials provided with the distribution.
39 * 3. The name of the company may not be used to endorse or promote
40 *    products derived from this software without specific prior written
41 *    permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
44 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
45 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
46 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
48 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
49 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
50 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
51 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55/* Assumptions:
56 *
57 * ARMv8-a, AArch64, unaligned accesses
58 *
59 */
60
61#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
62/* See memset-stub.c  */
63#else
64
65#define dstin   x0      /* Destination start; only read by the code below.  */
66#define val     x1      /* Fill value, 64-bit view.  */
67#define valw    w1      /* Fill value, 32-bit view; its low byte is replicated.  */
68#define count   x2      /* Byte count; reused as the loop counter.  */
69#define dst     x3      /* Working store pointer for the long paths.  */
70#define dstend  x4      /* dstin + count, one past the last byte to set.  */
71#define tmp1    x5      /* Scratch.  */
72#define tmp1w   w5      /* Scratch, 32-bit view of tmp1.  */
73#define tmp2    x6      /* Scratch.  */
74#define tmp2w   w6      /* Scratch, 32-bit view of tmp2.  */
75#define zva_len x7      /* DC ZVA block size in bytes (generic ZVA path).  */
76#define zva_lenw w7     /* 32-bit view of zva_len.  */
77
78#define L(l) .L ## l    /* Paste a .L prefix onto a label name.  */
79
/* def_fn f [p2align=N]: open a global function symbol named f in .text,
   aligned to 2^N bytes (N defaults to 0, i.e. no extra alignment).  */
80        .macro def_fn f p2align=0
81        .text                           /* Emit into the code section.  */
82        .p2align \p2align               /* Align the entry point.  */
83        .global \f                      /* Make the symbol visible to the linker.  */
84        .type \f, %function             /* Mark the symbol as a function.  */
85\f:
86        .endm
87
/* void *memset (void *dstin, int val, size_t count)
 *
 * Store the low byte of val into count bytes starting at dstin.
 *
 * In:    x0 = dstin, w1 = val, x2 = count.
 * Out:   x0 is never written below, so it still holds dstin at every ret.
 * Uses:  x1, x3-x7, v0 and the condition flags.
 *
 * Dispatch on count:
 *    0..15   scalar stores from both ends, which may overlap.
 *   16..96   16-byte vector stores from both ends (set_medium / set96).
 *   > 96     set_long: 64-byte stp loop, or DC ZVA when count >= 256 and
 *            the fill byte is zero.
 */
88def_fn memset p2align=6
89
90        dup     v0.16B, valw            /* v0 = fill byte in all 16 lanes.  */
91        add     dstend, dstin, count    /* dstend = one past the last byte.  */
92
93        cmp     count, 96
94        b.hi    L(set_long)             /* count > 96.  */
95        cmp     count, 16
96        b.hs    L(set_medium)           /* 16 <= count <= 96.  */
97        mov     val, v0.D[0]            /* val = fill byte repeated 8 times.  */
98
99        /* Set 0..15 bytes with stores from both ends that may overlap.  */
100        tbz     count, 3, 1f            /* Bit 3 clear: count < 8.  */
101        str     val, [dstin]            /* count is 8..15: first 8 bytes.  */
102        str     val, [dstend, -8]       /* Last 8 bytes.  */
103        ret
104        nop                             /* Follows ret; padding only.  */
1051:      tbz     count, 2, 2f            /* Bit 2 clear: count < 4.  */
106        str     valw, [dstin]           /* count is 4..7: first 4 bytes.  */
107        str     valw, [dstend, -4]      /* Last 4 bytes.  */
108        ret
1092:      cbz     count, 3f               /* count == 0: nothing to store.  */
110        strb    valw, [dstin]           /* count is 1..3: first byte.  */
111        tbz     count, 1, 3f            /* Bit 1 clear: count == 1, done.  */
112        strh    valw, [dstend, -2]      /* count is 2..3: last two bytes.  */
1133:      ret
114
115        /* Set 16..96 bytes.  */
116L(set_medium):
117        str     q0, [dstin]             /* First 16 bytes.  */
118        tbnz    count, 6, L(set96)      /* Bit 6 set: count is 64..96.  */
119        str     q0, [dstend, -16]       /* Last 16 bytes.  */
120        tbz     count, 5, 1f            /* Bit 5 clear: count is 16..31, done.  */
121        str     q0, [dstin, 16]         /* count is 32..63: second 16 bytes.  */
122        str     q0, [dstend, -32]       /* And the 16 bytes before the last 16.  */
1231:      ret
124
125        .p2align 4
126        /* Set 64..96 bytes.  Write 64 bytes from the start and
127           32 bytes from the end.  */
128L(set96):
129        str     q0, [dstin, 16]         /* Bytes 16..31; 0..15 already stored.  */
130        stp     q0, q0, [dstin, 32]     /* Bytes 32..63.  */
131        stp     q0, q0, [dstend, -32]   /* Last 32 bytes.  */
132        ret
133
134        .p2align 3
135        nop
136L(set_long):
137        and     valw, valw, 255         /* Keep only the fill byte for the zero test.  */
138        bic     dst, dstin, 15          /* dst = dstin rounded down to 16 bytes.  */
139        str     q0, [dstin]             /* First 16 bytes, at the unaligned start.  */
140        cmp     count, 256
141        ccmp    valw, 0, 0, cs          /* If count >= 256 compare byte with 0, else force NE.  */
142        b.eq    L(try_zva)              /* Taken only when count >= 256 and byte == 0.  */
143L(no_zva):
144        sub     count, dstend, dst      /* Count is 16 too large.  */
145        add     dst, dst, 16            /* Skip the 16 bytes covered by the first store.  */
146        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
1471:      stp     q0, q0, [dst], 64       /* Store 32 bytes, then advance dst by 64.  */
148        stp     q0, q0, [dst, -32]      /* Second 32 bytes of the same 64-byte chunk.  */
149L(tail64):
150        subs    count, count, 64
151        b.hi    1b                      /* Loop while more than 64 bytes remain.  */
1522:      stp     q0, q0, [dstend, -64]   /* Last 64 bytes, addressed from the end.  */
153        stp     q0, q0, [dstend, -32]
154        ret
155
156        .p2align 3
157L(try_zva):
158        mrs     tmp1, dczid_el0         /* Read the DC ZVA ID register.  */
159        tbnz    tmp1w, 4, L(no_zva)     /* Bit 4 set: do not use DC ZVA.  */
160        and     tmp1w, tmp1w, 15        /* Low 4 bits encode the block size.  */
161        cmp     tmp1w, 4        /* ZVA size is 64 bytes.  */
162        b.ne     L(zva_128)
163
164        /* Write the first and last 64 byte aligned block using stp rather
165           than using DC ZVA.  This is faster on some cores.
166         */
167L(zva_64):
168        str     q0, [dst, 16]           /* With the earlier store: dstin .. dst + 32.  */
169        stp     q0, q0, [dst, 32]       /* Up to dst + 64.  */
170        bic     dst, dst, 63            /* Round dst down to a 64-byte boundary.  */
171        stp     q0, q0, [dst, 64]       /* Aligned block at dst + 64 ...  */
172        stp     q0, q0, [dst, 96]       /* ... through dst + 128.  */
173        sub     count, dstend, dst      /* Count is now 128 too large.  */
174        sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
175        add     dst, dst, 128           /* First block to zero with DC ZVA.  */
176        nop
1771:      dc      zva, dst                /* Zero one 64-byte block.  */
178        add     dst, dst, 64
179        subs    count, count, 64
180        b.hi    1b
181        stp     q0, q0, [dst, 0]        /* Last aligned block, written with stp.  */
182        stp     q0, q0, [dst, 32]
183        stp     q0, q0, [dstend, -64]   /* Last 64 bytes, addressed from the end.  */
184        stp     q0, q0, [dstend, -32]
185        ret
186
187        .p2align 3
188L(zva_128):
189        cmp     tmp1w, 5        /* ZVA size is 128 bytes.  */
190        b.ne    L(zva_other)
191
192        str     q0, [dst, 16]           /* With the earlier store: dstin .. dst + 32.  */
193        stp     q0, q0, [dst, 32]
194        stp     q0, q0, [dst, 64]
195        stp     q0, q0, [dst, 96]       /* Up to dst + 128.  */
196        bic     dst, dst, 127           /* Round dst down to a 128-byte boundary.  */
197        sub     count, dstend, dst      /* Count is now 128 too large.  */
198        sub     count, count, 128+128   /* Adjust count and bias for loop.  */
199        add     dst, dst, 128           /* First block to zero with DC ZVA.  */
2001:      dc      zva, dst                /* Zero one 128-byte block.  */
201        add     dst, dst, 128
202        subs    count, count, 128
203        b.hi    1b
204        stp     q0, q0, [dstend, -128]  /* Last 128 bytes, addressed from the end.  */
205        stp     q0, q0, [dstend, -96]
206        stp     q0, q0, [dstend, -64]
207        stp     q0, q0, [dstend, -32]
208        ret
209
210L(zva_other):
211        mov     tmp2w, 4
212        lsl     zva_lenw, tmp2w, tmp1w  /* zva_len = 4 << size field, in bytes.  */
213        add     tmp1, zva_len, 64       /* Max alignment bytes written.  */
214        cmp     count, tmp1
215        blo     L(no_zva)               /* Too short; no_zva recomputes count.  */
216
217        sub     tmp2, zva_len, 1        /* Mask of the low address bits.  */
218        add     tmp1, dst, zva_len
219        add     dst, dst, 16            /* Skip the 16 bytes covered by the first store.  */
220        subs    count, tmp1, dst        /* Actual alignment bytes to write.  */
221        bic     tmp1, tmp1, tmp2        /* Aligned dc zva start address.  */
222        beq     2f                      /* No alignment bytes to write.  */
2231:      stp     q0, q0, [dst], 64       /* Fill up to the aligned start, 64 bytes a pass.  */
224        stp     q0, q0, [dst, -32]
225        subs    count, count, 64
226        b.hi    1b
2272:      mov     dst, tmp1               /* Continue from the aligned address.  */
228        sub     count, dstend, tmp1     /* Remaining bytes to write.  */
229        subs    count, count, zva_len
230        b.lo    4f                      /* Less than one whole block left.  */
2313:      dc      zva, dst                /* Zero one zva_len-byte block.  */
232        add     dst, dst, zva_len
233        subs    count, count, zva_len
234        b.hs    3b                      /* Another whole block fits.  */
2354:      add     count, count, zva_len   /* Undo the last subtraction.  */
236        b       L(tail64)               /* Finish with the stp loop and final 64 bytes.  */
237
238        .size   memset, . - memset      /* Record the symbol size.  */
239#endif
Note: See TracBrowser for help on using the repository browser.