source: trunk/libs/newlib/src/newlib/libc/machine/arm/memcpy-armv7m.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 7.7 KB
Line 
1/*
2 * Copyright (c) 2013 ARM Ltd
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. The name of the company may not be used to endorse or promote
14 *    products derived from this software without specific prior written
15 *    permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
30   unaligned access.
31
32   If compiled with GCC, this file should be enclosed within following
33   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
35
36   Prototype: void *memcpy (void *dst, const void *src, size_t count);
37
38   The job will be done in 5 steps.
39   Step 1: Align src/dest pointers, copy mis-aligned if fail to align both
40   Step 2: Repeatedly copy big block size of __OPT_BIG_BLOCK_SIZE
41   Step 3: Repeatedly copy big block size of __OPT_MID_BLOCK_SIZE
42   Step 4: Copy word by word
43   Step 5: Copy byte-to-byte
44
45   Tunable options:
46     __OPT_BIG_BLOCK_SIZE: Size of big block in words.  Default to 64.
     __OPT_MID_BLOCK_SIZE: Size of mid block in words.  Default to 16.
48 */
/* Tunable block sizes, in BYTES (4 * number of words copied per
   iteration).  Each value must match one of the unroll lists below.  */
#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

/* BEGIN_UNROLL_BIG_BLOCK .. END_UNROLL bracket a one-word copy template
   and repeat it once per word of the big block, with \offset stepping
   through 0,4,8,...  Only the block sizes listed here are supported.  */
#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

/* Same scheme for the mid block.  */
#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

/* Terminates the .irp opened by a BEGIN_UNROLL_* macro.  */
#define END_UNROLL .endr
81
        .syntax unified
        .text
        .align  2
        .global memcpy
        .thumb
        .thumb_func
        .type   memcpy, %function

/*-----------------------------------------------------------------------
 * void *memcpy (void *dst, const void *src, size_t count)
 *
 * In:    r0 = dst, r1 = src, r2 = count (bytes)
 * Out:   r0 = original dst (AAPCS memcpy contract)
 * Scratch: r3 throughout; r4/r5 only on the software-misaligned path,
 *        where they are pushed/popped.  ip holds either the saved
 *        return value or a source-rewind amount, depending on the path.
 *-----------------------------------------------------------------------*/
memcpy:
        @ r0: dst
        @ r1: src
        @ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
        /* In case of UNALIGNED access supported, ip is not used in
           function body, so keep the return value (dst) in ip instead
           of spilling it to the stack.  */
        mov     ip, r0
#else
        push    {r0}
#endif
        /* (src | dst) has a low bit set iff at least one pointer is not
           word-aligned.  */
        orr     r3, r1, r0
        ands    r3, r3, #3
        bne     .Lmisaligned_copy

.Lbig_block:
        /* Pre-subtract one big block; borrow (blo) means fewer than
           __OPT_BIG_BLOCK_SIZE bytes remain.  r2 stays biased by one
           block for the whole loop.  */
        subs    r2, __OPT_BIG_BLOCK_SIZE
        blo     .Lmid_block

        /* Kernel loop for big block copy */
        .align 2
.Lbig_block_loop:
        BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
        /* v7E-M: post-indexed load/store.  \offset is unused here; the
           .irp merely repeats this pair once per word of the block.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        /* v7-M: immediate-offset addressing, pointers advanced once per
           block after the unrolled copies.  */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_BIG_BLOCK_SIZE
        adds    r1, __OPT_BIG_BLOCK_SIZE
#endif
        subs    r2, __OPT_BIG_BLOCK_SIZE
        bhs .Lbig_block_loop

.Lmid_block:
        /* r2 currently = len - BIG; rebias so that r2 = len - MID.  */
        adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
        blo     .Lcopy_word_by_word

        /* Kernel loop for mid-block copy */
        .align 2
.Lmid_block_loop:
        BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_MID_BLOCK_SIZE
        adds    r1, __OPT_MID_BLOCK_SIZE
#endif
        subs    r2, __OPT_MID_BLOCK_SIZE
        bhs     .Lmid_block_loop

.Lcopy_word_by_word:
        /* Rebias again: r2 = len - 4.  */
        adds    r2, __OPT_MID_BLOCK_SIZE - 4
        blo     .Lcopy_less_than_4

        /* Kernel loop for small block copy */
        .align 2
.Lcopy_word_by_word_loop:
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, #4
        bhs     .Lcopy_word_by_word_loop

.Lcopy_less_than_4:
        adds    r2, #4                  @ r2 = remaining bytes, 0..3
        beq     .Ldone

        /* Shift the 2-bit count left by 31: bit0 lands in the result
           (Z clear => count odd => copy one byte), bit1 lands in the
           carry (C set => two more bytes to copy).  */
        lsls    r2, r2, #31
        itt ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        bcc     .Ldone
#ifdef __ARM_FEATURE_UNALIGNED
        /* Hardware tolerates a misaligned halfword access.  */
        ldrh    r3, [r1]
        strh    r3, [r0]
#else
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip                  @ return the original dst
#else
        pop     {r0}
#endif
        bx      lr

        .align 2
.Lmisaligned_copy:
#ifdef __ARM_FEATURE_UNALIGNED
        /* Define label DST_ALIGNED to BIG_BLOCK.  It will go to aligned
           copy once destination is adjusted to aligned.  (cpp rewrites
           the identifier after the '.', so ".Ldst_aligned" expands to
           ".Lbig_block".)  */
#define Ldst_aligned Lbig_block

        /* Copy word by word using LDR when alignment can be done in hardware,
        i.e., SCTLR.A is set, supporting unaligned access in LDR and STR.  */

        cmp     r2, #8
        blo     .Lbyte_copy

        /* if src is aligned, just go to the big block loop: the
           hardware absorbs any dst misalignment.  */
        lsls    r3, r1, #30
        beq     .Ldst_aligned
#else
        /* If len < 12, the misalignment adjustment has more overhead
           than a plain byte-to-byte copy.  len must also be >= 8 for
           the code below to work correctly.  */
        cmp     r2, #12
        blo     .Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

        /* Align dst only; we do not try to align src as well, because
           handling an aligned src with a misaligned dst costs more than
           the misaligned-src merge loop below.  Worst case (src
           initially aligned) up to 4 extra bytes go through the slow
           path, which is acceptable.  */

        ands    r3, r0, #3              @ r3 = dst & 3
        beq     .Ldst_aligned

        rsb     r3, #4                  @ r3 = 4 - (dst & 3), bytes to align dst
        subs    r2, r3                  @ remove them from the length

        /* Same flag trick as .Lcopy_less_than_4: bit0 -> Z, bit1 -> C.  */
        lsls    r3, r3, #31
        itt ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        bcc .Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
        b       .Ldst_aligned
#else
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        /* Now that dst is aligned */
.Ldst_aligned:
        /* if r1 is aligned now, it means r0/r1 has the same misalignment,
        and they are both aligned now.  Go aligned copy.  */
        ands    r3, r1, #3              @ r3 = src & 3
        beq     .Lbig_block

        /* dst is aligned, but src isn't.  Misaligned copy.  */

        push    {r4, r5}
        subs    r2, #4                  @ merge loop reads one word ahead

        /* Backward r1 by misaligned bytes, to make r1 aligned.
        Since we need to restore r1 to unaligned address after the loop,
        we need keep the offset bytes to ip and sub it from r1 afterward.  */
        subs    r1, r3
        rsb     ip, r3, #4

        /* Pre-load on word */
        ldr     r4, [r1], #4

        cmp     r3, #2
        beq     .Lmisaligned_copy_2_2
        cmp     r3, #3
        beq     .Lmisaligned_copy_3_1

        /* Merge loop: r4 holds the previous source word; discard its
           already-consumed bytes, load the next word into r3, and fill
           the gap with that word's leading bytes.  \shift = 8*(src&3).
           Exits when the biased count borrows.  */
        .macro mis_src_copy shift
1:
#ifdef __ARM_BIG_ENDIAN
        lsls    r4, r4, \shift
#else
        lsrs    r4, r4, \shift
#endif
        ldr     r3, [r1], #4
#ifdef __ARM_BIG_ENDIAN
        lsrs    r5, r3, 32-\shift
#else
        lsls    r5, r3, 32-\shift
#endif
        orr     r4, r4, r5
        str     r4, [r0], #4
        mov     r4, r3
        subs    r2, #4
        bhs     1b
        .endm

.Lmisaligned_copy_1_3:
        /* src % 4 == 1: 3 useful bytes in each pre-loaded word.  */
        mis_src_copy shift=8
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
        /* src % 4 == 3: 1 useful byte in each pre-loaded word.  */
        mis_src_copy shift=24
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
        /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
        mis_src_copy shift=16

.Lsrc_misaligned_tail:
        adds    r2, #4                  @ undo the look-ahead bias
        subs    r1, ip                  @ rewind r1 to first unconsumed byte
        pop     {r4, r5}

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
        subs    r2, #4
        blo     .Lcopy_less_than_4      @ fewer than 4 bytes left

.Lbyte_copy_loop:
        subs    r2, #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        bhs     .Lbyte_copy_loop

        /* The loop above copies len-3 bytes; finish the last three.  */
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
        ldrb    r3, [r1, #2]
        strb    r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip                  @ return the original dst
#else
        pop     {r0}
#endif
        bx      lr

        .size   memcpy, .-memcpy
Note: See TracBrowser for help on using the repository browser.