source: trunk/libs/newlib/src/newlib/libc/machine/mips/memcpy.S @ 444

1/*
2 * Copyright (c) 2012-2015
3 *      MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#ifdef ANDROID_CHANGES
31# include "machine/asm.h"
32# include "machine/regdef.h"
33# define USE_MEMMOVE_FOR_OVERLAP
34# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
35# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
36#elif _LIBC
37# include <sysdep.h>
38# include <regdef.h>
39# include <sys/asm.h>
40# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
41# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
42#elif _COMPILING_NEWLIB
43# include "machine/asm.h"
44# include "machine/regdef.h"
45# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
46# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
47#else
48# include <regdef.h>
49# include <sys/asm.h>
50#endif
51
52/* Check to see if the MIPS architecture we are compiling for supports
53 * prefetching.
54 */
55
56#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
57# ifndef DISABLE_PREFETCH
58#  define USE_PREFETCH
59# endif
60#endif
61
62#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
63# ifndef DISABLE_DOUBLE
64#  define USE_DOUBLE
65# endif
66#endif
67
68
69#if __mips_isa_rev > 5
70# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
71#  undef PREFETCH_STORE_HINT
72#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
73# endif
74# define R6_CODE
75#endif
76
77/* Some asm.h files do not have the L macro definition.  */
78#ifndef L
79# if _MIPS_SIM == _ABIO32
80#  define L(label) $L ## label
81# else
82#  define L(label) .L ## label
83# endif
84#endif
85
86/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
87#ifndef PTR_ADDIU
88# ifdef USE_DOUBLE
89#  define PTR_ADDIU     daddiu
90# else
91#  define PTR_ADDIU     addiu
92# endif
93#endif
94
95/* Some asm.h files do not have the PTR_SRA macro definition.  */
96#ifndef PTR_SRA
97# ifdef USE_DOUBLE
98#  define PTR_SRA       dsra
99# else
100#  define PTR_SRA       sra
101# endif
102#endif
103
104/* New R6 instructions that may not be in asm.h.  */
105#ifndef PTR_LSA
106# if _MIPS_SIM == _ABI64
107#  define PTR_LSA       dlsa
108# else
109#  define PTR_LSA       lsa
110# endif
111#endif
112
113/*
114 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
115 * prefetches appears to offer a slight performance advantage.
116 *
117 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
118 * or PREFETCH_STORE_STREAMED offers a large performance advantage
119 * but PREPAREFORSTORE has some special restrictions to consider.
120 *
121 * Prefetch with the 'prepare for store' hint does not copy a memory
122 * location into the cache, it just allocates a cache line and zeros
123 * it out.  This means that if you do not write to the entire cache
124 * line before writing it out to memory some data will get zero'ed out
125 * when the cache line is written back to memory and data will be lost.
126 *
127 * Also if you are using this memcpy to copy overlapping buffers it may
128 * not behave correctly when using the 'prepare for store' hint.  If you
129 * use the 'prepare for store' prefetch on a memory area that is in the
130 * memcpy source (as well as the memcpy destination), then you will get
131 * some data zero'ed out before you have a chance to read it and data will
132 * be lost.
133 *
134 * If you are going to use this memcpy routine with the 'prepare for store'
135 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
136 * the problem of running memcpy on overlapping buffers.
137 *
138 * There are ifdef'ed sections of this memcpy to make sure that it does not
139 * do prefetches on cache lines that are not going to be completely written.
140 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
141 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
142 * 32 bytes and if the cache line is larger it will not work correctly.
143 */
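/*
 * To make the restriction above concrete, here is a hedged C-level sketch
 * (illustration only, not part of this file or of the build): a 'prepare
 * for store' prefetch is only safe on a cache line that the copy will
 * overwrite completely.  CACHE_LINE and the helper name are assumptions
 * matching the 32-byte cache-line assumption stated above.
 *
 *   #define CACHE_LINE 32
 *
 *   // Nonzero when every byte of the line starting at 'line' lies inside
 *   // the destination range [dst, dst_end), so the whole line will be
 *   // rewritten and none of the zeroed bytes can survive.
 *   static inline int safe_to_prepare_for_store(const char *line,
 *                                               const char *dst,
 *                                               const char *dst_end)
 *   {
 *     return line >= dst && line + CACHE_LINE <= dst_end;
 *   }
 */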
144
145#ifdef USE_PREFETCH
146# define PREFETCH_HINT_LOAD             0
147# define PREFETCH_HINT_STORE            1
148# define PREFETCH_HINT_LOAD_STREAMED    4
149# define PREFETCH_HINT_STORE_STREAMED   5
150# define PREFETCH_HINT_LOAD_RETAINED    6
151# define PREFETCH_HINT_STORE_RETAINED   7
152# define PREFETCH_HINT_WRITEBACK_INVAL  25
153# define PREFETCH_HINT_PREPAREFORSTORE  30
154
155/*
156 * If we have not picked out what hints to use at this point use the
157 * standard load and store prefetch hints.
158 */
159# ifndef PREFETCH_STORE_HINT
160#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
161# endif
162# ifndef PREFETCH_LOAD_HINT
163#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
164# endif
165
166/*
167 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
168 * get 64 bytes in that case.  The assumption is that each individual
169 * prefetch brings in 32 bytes.
170 */
171
172# ifdef USE_DOUBLE
173#  define PREFETCH_CHUNK 64
174#  define PREFETCH_FOR_LOAD(chunk, reg) \
175 pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
176 pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
177#  define PREFETCH_FOR_STORE(chunk, reg) \
178 pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
179 pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
180# else
181#  define PREFETCH_CHUNK 32
182#  define PREFETCH_FOR_LOAD(chunk, reg) \
183 pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
184#  define PREFETCH_FOR_STORE(chunk, reg) \
185 pref PREFETCH_STORE_HINT, (chunk)*32(reg)
186# endif
187/* MAX_PREFETCH_SIZE is the maximum size of a prefetch; it must not be less
188 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
189 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
190 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
191 * used then MAX_PREFETCH_SIZE does not matter.  */
192# define MAX_PREFETCH_SIZE 128
193/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
194 * than 5 on a STORE prefetch and that a single prefetch can never be larger
195 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
196 * we actually do two prefetches in that case, one 32 bytes after the other.  */
197# ifdef USE_DOUBLE
198#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
199# else
200#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
201# endif
202# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
203    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
204/* We cannot handle this because the initial prefetches may fetch bytes that
205 * are before the buffer being copied.  We start copies with an offset
206 * of 4 so avoid this situation when using PREPAREFORSTORE.  */
207#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
208# endif
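/*
 * Worked numbers for the limit above (a sketch under the stated assumptions;
 * the enum names are illustrative and are not used by this file):
 *
 *   enum {
 *     prefetch_limit_32bit = (5 * 32) + 128,       // 288 bytes, no USE_DOUBLE
 *     prefetch_limit_64bit = (5 * 64) + 32 + 128   // 480 bytes, USE_DOUBLE
 *   };
 *
 * A 'prepare for store' prefetch is therefore only issued while the
 * destination pointer is at least this far from the end of the buffer.
 */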
209#else /* USE_PREFETCH not defined */
210# define PREFETCH_FOR_LOAD(offset, reg)
211# define PREFETCH_FOR_STORE(offset, reg)
212#endif
213
214/* Allow the routine to be named something else if desired.  */
215#ifndef MEMCPY_NAME
216# define MEMCPY_NAME memcpy
217#endif
218
219/* We use these 32/64 bit registers as temporaries to do the copying.  */
220#define REG0 t0
221#define REG1 t1
222#define REG2 t2
223#define REG3 t3
224#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
225# define REG4 t4
226# define REG5 t5
227# define REG6 t6
228# define REG7 t7
229#else
230# define REG4 ta0
231# define REG5 ta1
232# define REG6 ta2
233# define REG7 ta3
234#endif
235
236/* We load/store 64 bits at a time when USE_DOUBLE is true.
237 * The C_ prefix stands for CHUNK and is used to avoid macro name
238 * conflicts with system header files.  */
239
240#ifdef USE_DOUBLE
241# define C_ST   sd
242# define C_LD   ld
243# if __MIPSEB
244#  define C_LDHI        ldl     /* high part is left in big-endian      */
245#  define C_STHI        sdl     /* high part is left in big-endian      */
246#  define C_LDLO        ldr     /* low part is right in big-endian      */
247#  define C_STLO        sdr     /* low part is right in big-endian      */
248# else
249#  define C_LDHI        ldr     /* high part is right in little-endian  */
250#  define C_STHI        sdr     /* high part is right in little-endian  */
251#  define C_LDLO        ldl     /* low part is left in little-endian    */
252#  define C_STLO        sdl     /* low part is left in little-endian    */
253# endif
254# define C_ALIGN        dalign  /* r6 align instruction                 */
255#else
256# define C_ST   sw
257# define C_LD   lw
258# if __MIPSEB
259#  define C_LDHI        lwl     /* high part is left in big-endian      */
260#  define C_STHI        swl     /* high part is left in big-endian      */
261#  define C_LDLO        lwr     /* low part is right in big-endian      */
262#  define C_STLO        swr     /* low part is right in big-endian      */
263# else
264#  define C_LDHI        lwr     /* high part is right in little-endian  */
265#  define C_STHI        swr     /* high part is right in little-endian  */
266#  define C_LDLO        lwl     /* low part is left in little-endian    */
267#  define C_STLO        swl     /* low part is left in little-endian    */
268# endif
269# define C_ALIGN        align   /* r6 align instruction                 */
270#endif
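/*
 * A hedged C analogue (illustration only) of what a C_LDHI/C_LDLO pair does
 * in the pre-R6 paths below: one unaligned word read built from two partial
 * accesses.  The helper name is an assumption; portable C expresses the same
 * thing as a memcpy into a local, which a pre-R6 MIPS compiler is free to
 * lower to lwl/lwr (or ldl/ldr for the 64-bit chunks).
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static inline uint32_t load_unaligned_word(const void *p)
 *   {
 *     uint32_t v;
 *     memcpy(&v, p, sizeof v);   // p need not be 4-byte aligned
 *     return v;
 *   }
 */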
271
272/* Bookkeeping values for 32 vs. 64 bit mode.  */
273#ifdef USE_DOUBLE
274# define NSIZE 8
275# define NSIZEMASK 0x3f
276# define NSIZEDMASK 0x7f
277#else
278# define NSIZE 4
279# define NSIZEMASK 0x1f
280# define NSIZEDMASK 0x3f
281#endif
282#define UNIT(unit) ((unit)*NSIZE)
283#define UNITM1(unit) (((unit)*NSIZE)-1)
284
285#ifdef ANDROID_CHANGES
286LEAF(MEMCPY_NAME, 0)
287#else
288LEAF(MEMCPY_NAME)
289#endif
290        .set    nomips16
291        .set    noreorder
292/*
293 * Below we handle the case where memcpy is called with overlapping src and dst.
294 * Although memcpy is not required to handle this case, some parts of Android
295 * like Skia rely on such usage. We call memmove to handle such cases.
296 */
297#ifdef USE_MEMMOVE_FOR_OVERLAP
298        PTR_SUBU t0,a0,a1
299        PTR_SRA t2,t0,31
300        xor     t1,t0,t2
301        PTR_SUBU t0,t1,t2
302        sltu    t2,t0,a2
303        beq     t2,zero,L(memcpy)
304        la      t9,memmove
305        jr      t9
306         nop
307L(memcpy):
308#endif
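/*
 * For reference, a minimal C sketch of the dispatch above (it applies only
 * when USE_MEMMOVE_FOR_OVERLAP is defined; the function name is an
 * illustration, not part of newlib): the absolute distance between dst and
 * src is compared with the byte count, and overlapping buffers are handed
 * to memmove.
 *
 *   #include <stddef.h>
 *   #include <string.h>
 *
 *   static void *copy_with_overlap_check(void *dst, const void *src, size_t n)
 *   {
 *     ptrdiff_t d = (char *)dst - (const char *)src;
 *     if (d < 0)
 *       d = -d;                  // |dst - src|, as the xor/subtract computes
 *     if ((size_t)d < n)         // the ranges overlap
 *       return memmove(dst, src, n);
 *     return memcpy(dst, src, n);
 *   }
 */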
309/*
310 * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
311 * size, copy dst pointer to v0 for the return value.
312 */
313        slti    t2,a2,(2 * NSIZE)
314        bne     t2,zero,L(lasts)
315#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
316        move    v0,zero
317#else
318        move    v0,a0
319#endif
320
321#ifndef R6_CODE
322
323/*
324 * If src and dst have different alignments, go to L(unaligned); if they
325 * have the same alignment (but are not actually aligned) do a partial
326 * load/store to make them aligned.  If they are both already aligned
327 * we can start copying at L(aligned).
328 */
329        xor     t8,a1,a0
330        andi    t8,t8,(NSIZE-1)         /* t8 is a0/a1 word-displacement */
331        bne     t8,zero,L(unaligned)
332        PTR_SUBU a3, zero, a0
333
334        andi    a3,a3,(NSIZE-1)         /* copy a3 bytes to align a0/a1   */
335        beq     a3,zero,L(aligned)      /* if a3=0, it is already aligned */
336        PTR_SUBU a2,a2,a3               /* a2 is the remaining byte count */
337
338        C_LDHI  t8,0(a1)
339        PTR_ADDU a1,a1,a3
340        C_STHI  t8,0(a0)
341        PTR_ADDU a0,a0,a3
342
343#else /* R6_CODE */
344
345/*
346 * Align the destination and hope that the source gets aligned too.  If it
347 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
348 * align instruction.
349 */
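/*
 * A hedged C sketch of the destination-alignment step that follows
 * (illustration only; names are assumptions, not registers): the low three
 * bits of dst select an entry in L(atable), and the fall-through chain
 * L(lb7)..L(lb1) copies exactly the bytes needed to bring dst up to the
 * next 8-byte boundary.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   static void align_destination(char **dst, const char **src, size_t *n)
 *   {
 *     unsigned mis = (uintptr_t)*dst & 7;      // t8 in the code below
 *     if (mis != 0) {
 *       unsigned head = 8 - mis;               // bytes copied by lb7..lb1
 *       for (unsigned i = 0; i < head; i++)
 *         (*dst)[i] = (*src)[i];
 *       *dst += head;  *src += head;  *n -= head;
 *     }
 *   }
 */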
350        andi    t8,a0,7
351        lapc    t9,L(atable)
352        PTR_LSA t9,t8,t9,2
353        jrc     t9
354L(atable):
355        bc      L(lb0)
356        bc      L(lb7)
357        bc      L(lb6)
358        bc      L(lb5)
359        bc      L(lb4)
360        bc      L(lb3)
361        bc      L(lb2)
362        bc      L(lb1)
363L(lb7):
364        lb      a3, 6(a1)
365        sb      a3, 6(a0)
366L(lb6):
367        lb      a3, 5(a1)
368        sb      a3, 5(a0)
369L(lb5):
370        lb      a3, 4(a1)
371        sb      a3, 4(a0)
372L(lb4):
373        lb      a3, 3(a1)
374        sb      a3, 3(a0)
375L(lb3):
376        lb      a3, 2(a1)
377        sb      a3, 2(a0)
378L(lb2):
379        lb      a3, 1(a1)
380        sb      a3, 1(a0)
381L(lb1):
382        lb      a3, 0(a1)
383        sb      a3, 0(a0)
384
385        li      t9,8
386        subu    t8,t9,t8
387        PTR_SUBU a2,a2,t8
388        PTR_ADDU a0,a0,t8
389        PTR_ADDU a1,a1,t8
390L(lb0):
391
392        andi    t8,a1,(NSIZE-1)
393        lapc    t9,L(jtable)
394        PTR_LSA t9,t8,t9,2
395        jrc     t9
396L(jtable):
397        bc      L(aligned)
398        bc      L(r6_unaligned1)
399        bc      L(r6_unaligned2)
400        bc      L(r6_unaligned3)
401# ifdef USE_DOUBLE
402        bc      L(r6_unaligned4)
403        bc      L(r6_unaligned5)
404        bc      L(r6_unaligned6)
405        bc      L(r6_unaligned7)
406# endif
407#endif /* R6_CODE */
408
409L(aligned):
410
411/*
412 * Now dst/src are both aligned to word (or double word) boundaries.
413 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
414 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
415 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
416 * equals a3.
417 */
418
419        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
420        beq     a2,t8,L(chkw)    /* if a2==t8, no 64-byte/128-byte chunks */
421        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
422        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */
423
424/* Inside the loop we may prefetch with the 'prepare to store' hint;
425 * in that case a0+x must not be past the "t0-32" address.  This
426 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
427 * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
428 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
429 */
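/*
 * Hedged restatement of that bound in C (names are descriptive and do not
 * come from asm.h): t0 below is the one-past-the-end destination address
 * and t9 the last destination address for which a 'prepare for store'
 * prefetch is still known to stay inside the buffer.
 *
 *   // dst_end plays the role of t0, last_safe the role of t9.
 *   static inline int store_prefetch_allowed(const char *a0,
 *                                            const char *last_safe)
 *   {
 *     // The copy loops skip the store prefetch once a0 > last_safe,
 *     // i.e. the same test as "sltu v1,t9,a0; bgtz v1,L(skip_pref)".
 *     return a0 <= last_safe;
 *   }
 */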
430#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
431        PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
432        PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
433#endif
434        PREFETCH_FOR_LOAD  (0, a1)
435        PREFETCH_FOR_LOAD  (1, a1)
436        PREFETCH_FOR_LOAD  (2, a1)
437        PREFETCH_FOR_LOAD  (3, a1)
438#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
439        PREFETCH_FOR_STORE (1, a0)
440        PREFETCH_FOR_STORE (2, a0)
441        PREFETCH_FOR_STORE (3, a0)
442#endif
443#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
444# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
445        sltu    v1,t9,a0
446        bgtz    v1,L(skip_set)
447        nop
448        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
449L(skip_set):
450# else
451        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
452# endif
453#endif
454#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
455    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
456        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
457# ifdef USE_DOUBLE
458        PTR_ADDIU v0,v0,32
459# endif
460#endif
461L(loop16w):
462        C_LD    t0,UNIT(0)(a1)
463#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
464        sltu    v1,t9,a0                /* If a0 > t9 don't use next prefetch */
465        bgtz    v1,L(skip_pref)
466#endif
467        C_LD    t1,UNIT(1)(a1)
468#ifndef R6_CODE
469        PREFETCH_FOR_STORE (4, a0)
470        PREFETCH_FOR_STORE (5, a0)
471#else
472        PREFETCH_FOR_STORE (2, a0)
473#endif
474#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
475        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
476# ifdef USE_DOUBLE
477        PTR_ADDIU v0,v0,32
478# endif
479#endif
480L(skip_pref):
481        C_LD    REG2,UNIT(2)(a1)
482        C_LD    REG3,UNIT(3)(a1)
483        C_LD    REG4,UNIT(4)(a1)
484        C_LD    REG5,UNIT(5)(a1)
485        C_LD    REG6,UNIT(6)(a1)
486        C_LD    REG7,UNIT(7)(a1)
487#ifndef R6_CODE
488        PREFETCH_FOR_LOAD (4, a1)
489#else
490        PREFETCH_FOR_LOAD (3, a1)
491#endif
492        C_ST    t0,UNIT(0)(a0)
493        C_ST    t1,UNIT(1)(a0)
494        C_ST    REG2,UNIT(2)(a0)
495        C_ST    REG3,UNIT(3)(a0)
496        C_ST    REG4,UNIT(4)(a0)
497        C_ST    REG5,UNIT(5)(a0)
498        C_ST    REG6,UNIT(6)(a0)
499        C_ST    REG7,UNIT(7)(a0)
500
501        C_LD    t0,UNIT(8)(a1)
502        C_LD    t1,UNIT(9)(a1)
503        C_LD    REG2,UNIT(10)(a1)
504        C_LD    REG3,UNIT(11)(a1)
505        C_LD    REG4,UNIT(12)(a1)
506        C_LD    REG5,UNIT(13)(a1)
507        C_LD    REG6,UNIT(14)(a1)
508        C_LD    REG7,UNIT(15)(a1)
509#ifndef R6_CODE
510        PREFETCH_FOR_LOAD (5, a1)
511#endif
512        C_ST    t0,UNIT(8)(a0)
513        C_ST    t1,UNIT(9)(a0)
514        C_ST    REG2,UNIT(10)(a0)
515        C_ST    REG3,UNIT(11)(a0)
516        C_ST    REG4,UNIT(12)(a0)
517        C_ST    REG5,UNIT(13)(a0)
518        C_ST    REG6,UNIT(14)(a0)
519        C_ST    REG7,UNIT(15)(a0)
520        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
521        bne     a0,a3,L(loop16w)
522        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
523        move    a2,t8
524
525/* Here we have src and dest word-aligned but less than 64 bytes or
526 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
527 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
528 * the copy.
529 */
530
531L(chkw):
532        PREFETCH_FOR_LOAD (0, a1)
533        andi    t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk?  */
534                                /* t8 is the remainder count past 32 bytes */
535        beq     a2,t8,L(chk1w)  /* When a2=t8, no 32-byte chunk  */
536        nop
537        C_LD    t0,UNIT(0)(a1)
538        C_LD    t1,UNIT(1)(a1)
539        C_LD    REG2,UNIT(2)(a1)
540        C_LD    REG3,UNIT(3)(a1)
541        C_LD    REG4,UNIT(4)(a1)
542        C_LD    REG5,UNIT(5)(a1)
543        C_LD    REG6,UNIT(6)(a1)
544        C_LD    REG7,UNIT(7)(a1)
545        PTR_ADDIU a1,a1,UNIT(8)
546        C_ST    t0,UNIT(0)(a0)
547        C_ST    t1,UNIT(1)(a0)
548        C_ST    REG2,UNIT(2)(a0)
549        C_ST    REG3,UNIT(3)(a0)
550        C_ST    REG4,UNIT(4)(a0)
551        C_ST    REG5,UNIT(5)(a0)
552        C_ST    REG6,UNIT(6)(a0)
553        C_ST    REG7,UNIT(7)(a0)
554        PTR_ADDIU a0,a0,UNIT(8)
555
556/*
557 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
558 * copy one word (or double word) at a time.  Set a2 to count how many
559 * bytes we have to copy after all the word (or double word) chunks are
560 * copied and a3 to the dst pointer after all the (d)word chunks have
561 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
562 */
563L(chk1w):
564        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
565        beq     a2,t8,L(lastw)
566        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
567        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */
568
569/* copying in words (4-byte or 8-byte chunks) */
570L(wordCopy_loop):
571        C_LD    REG3,UNIT(0)(a1)
572        PTR_ADDIU a0,a0,UNIT(1)
573        PTR_ADDIU a1,a1,UNIT(1)
574        bne     a0,a3,L(wordCopy_loop)
575        C_ST    REG3,UNIT(-1)(a0)
576
577/* If we have been copying double words, see if we can copy a single word
578   before doing byte copies.  We can have, at most, one word to copy.  */
579
580L(lastw):
581#ifdef USE_DOUBLE
582        andi    t8,a2,3         /* a2 is the remainder past 4 byte chunks.  */
583        beq     t8,a2,L(lastb)
584        move    a2,t8
585        lw      REG3,0(a1)
586        sw      REG3,0(a0)
587        PTR_ADDIU a0,a0,4
588        PTR_ADDIU a1,a1,4
589#endif
590
591/* Copy the last 8 (or 16) bytes */
592L(lastb):
593        blez    a2,L(leave)
594        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
595L(lastbloop):
596        lb      v1,0(a1)
597        PTR_ADDIU a0,a0,1
598        PTR_ADDIU a1,a1,1
599        bne     a0,a3,L(lastbloop)
600        sb      v1,-1(a0)
601L(leave):
602        j       ra
603        nop
604
605/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
606   whether or not USE_DOUBLE is defined.  Instead of just doing byte
607   copies, check the alignment and size and use lw/sw if possible.
608   Otherwise, do byte copies.  */
609
610L(lasts):
611        andi    t8,a2,3
612        beq     t8,a2,L(lastb)
613
614        andi    t9,a0,3
615        bne     t9,zero,L(lastb)
616        andi    t9,a1,3
617        bne     t9,zero,L(lastb)
618
619        PTR_SUBU a3,a2,t8
620        PTR_ADDU a3,a0,a3
621
622L(wcopy_loop):
623        lw      REG3,0(a1)
624        PTR_ADDIU a0,a0,4
625        PTR_ADDIU a1,a1,4
626        bne     a0,a3,L(wcopy_loop)
627        sw      REG3,-4(a0)
628
629        b       L(lastb)
630        move    a2,t8
631
632#ifndef R6_CODE
633/*
634 * UNALIGNED case, got here with a3 = "negu a0"
635 * This code is nearly identical to the aligned code above
636 * but only the destination (not the source) gets aligned
637 * so we need to do partial loads of the source followed
638 * by normal stores to the destination (once we have aligned
639 * the destination).
640 */
641
642L(unaligned):
643        andi    a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
644        beqz    a3,L(ua_chk16w) /* if a3=0, it is already aligned */
645        PTR_SUBU a2,a2,a3       /* a2 is the remaining byte count */
646
647        C_LDHI  v1,UNIT(0)(a1)
648        C_LDLO  v1,UNITM1(1)(a1)
649        PTR_ADDU a1,a1,a3
650        C_STHI  v1,UNIT(0)(a0)
651        PTR_ADDU a0,a0,a3
652
653/*
654 *  Now the destination (but not the source) is aligned
655 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
656 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
657 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
658 * equals a3.
659 */
660
661L(ua_chk16w):
662        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
663        beq     a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
664        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
665        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */
666
667# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
668        PTR_ADDU t0,a0,a2         /* t0 is the "past the end" address */
669        PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
670# endif
671        PREFETCH_FOR_LOAD  (0, a1)
672        PREFETCH_FOR_LOAD  (1, a1)
673        PREFETCH_FOR_LOAD  (2, a1)
674# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
675        PREFETCH_FOR_STORE (1, a0)
676        PREFETCH_FOR_STORE (2, a0)
677        PREFETCH_FOR_STORE (3, a0)
678# endif
679# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
680#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
681        sltu    v1,t9,a0
682        bgtz    v1,L(ua_skip_set)
683        nop
684        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
685L(ua_skip_set):
686#  else
687        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
688#  endif
689# endif
690L(ua_loop16w):
691        PREFETCH_FOR_LOAD  (3, a1)
692        C_LDHI  t0,UNIT(0)(a1)
693        C_LDHI  t1,UNIT(1)(a1)
694        C_LDHI  REG2,UNIT(2)(a1)
695# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
696        sltu    v1,t9,a0
697        bgtz    v1,L(ua_skip_pref)
698# endif
699        C_LDHI  REG3,UNIT(3)(a1)
700        PREFETCH_FOR_STORE (4, a0)
701        PREFETCH_FOR_STORE (5, a0)
702L(ua_skip_pref):
703        C_LDHI  REG4,UNIT(4)(a1)
704        C_LDHI  REG5,UNIT(5)(a1)
705        C_LDHI  REG6,UNIT(6)(a1)
706        C_LDHI  REG7,UNIT(7)(a1)
707        C_LDLO  t0,UNITM1(1)(a1)
708        C_LDLO  t1,UNITM1(2)(a1)
709        C_LDLO  REG2,UNITM1(3)(a1)
710        C_LDLO  REG3,UNITM1(4)(a1)
711        C_LDLO  REG4,UNITM1(5)(a1)
712        C_LDLO  REG5,UNITM1(6)(a1)
713        C_LDLO  REG6,UNITM1(7)(a1)
714        C_LDLO  REG7,UNITM1(8)(a1)
715        PREFETCH_FOR_LOAD (4, a1)
716        C_ST    t0,UNIT(0)(a0)
717        C_ST    t1,UNIT(1)(a0)
718        C_ST    REG2,UNIT(2)(a0)
719        C_ST    REG3,UNIT(3)(a0)
720        C_ST    REG4,UNIT(4)(a0)
721        C_ST    REG5,UNIT(5)(a0)
722        C_ST    REG6,UNIT(6)(a0)
723        C_ST    REG7,UNIT(7)(a0)
724        C_LDHI  t0,UNIT(8)(a1)
725        C_LDHI  t1,UNIT(9)(a1)
726        C_LDHI  REG2,UNIT(10)(a1)
727        C_LDHI  REG3,UNIT(11)(a1)
728        C_LDHI  REG4,UNIT(12)(a1)
729        C_LDHI  REG5,UNIT(13)(a1)
730        C_LDHI  REG6,UNIT(14)(a1)
731        C_LDHI  REG7,UNIT(15)(a1)
732        C_LDLO  t0,UNITM1(9)(a1)
733        C_LDLO  t1,UNITM1(10)(a1)
734        C_LDLO  REG2,UNITM1(11)(a1)
735        C_LDLO  REG3,UNITM1(12)(a1)
736        C_LDLO  REG4,UNITM1(13)(a1)
737        C_LDLO  REG5,UNITM1(14)(a1)
738        C_LDLO  REG6,UNITM1(15)(a1)
739        C_LDLO  REG7,UNITM1(16)(a1)
740        PREFETCH_FOR_LOAD (5, a1)
741        C_ST    t0,UNIT(8)(a0)
742        C_ST    t1,UNIT(9)(a0)
743        C_ST    REG2,UNIT(10)(a0)
744        C_ST    REG3,UNIT(11)(a0)
745        C_ST    REG4,UNIT(12)(a0)
746        C_ST    REG5,UNIT(13)(a0)
747        C_ST    REG6,UNIT(14)(a0)
748        C_ST    REG7,UNIT(15)(a0)
749        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
750        bne     a0,a3,L(ua_loop16w)
751        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
752        move    a2,t8
753
754/* Here we have the dst word-aligned (the src may not be) and less than 64
755 * or 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
756 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
757 * the copy.  */
758
759L(ua_chkw):
760        PREFETCH_FOR_LOAD (0, a1)
761        andi    t8,a2,NSIZEMASK   /* Is there a 32-byte/64-byte chunk?  */
762                                  /* t8 is the remainder count past 32 bytes */
763        beq     a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
764        nop
765        C_LDHI  t0,UNIT(0)(a1)
766        C_LDHI  t1,UNIT(1)(a1)
767        C_LDHI  REG2,UNIT(2)(a1)
768        C_LDHI  REG3,UNIT(3)(a1)
769        C_LDHI  REG4,UNIT(4)(a1)
770        C_LDHI  REG5,UNIT(5)(a1)
771        C_LDHI  REG6,UNIT(6)(a1)
772        C_LDHI  REG7,UNIT(7)(a1)
773        C_LDLO  t0,UNITM1(1)(a1)
774        C_LDLO  t1,UNITM1(2)(a1)
775        C_LDLO  REG2,UNITM1(3)(a1)
776        C_LDLO  REG3,UNITM1(4)(a1)
777        C_LDLO  REG4,UNITM1(5)(a1)
778        C_LDLO  REG5,UNITM1(6)(a1)
779        C_LDLO  REG6,UNITM1(7)(a1)
780        C_LDLO  REG7,UNITM1(8)(a1)
781        PTR_ADDIU a1,a1,UNIT(8)
782        C_ST    t0,UNIT(0)(a0)
783        C_ST    t1,UNIT(1)(a0)
784        C_ST    REG2,UNIT(2)(a0)
785        C_ST    REG3,UNIT(3)(a0)
786        C_ST    REG4,UNIT(4)(a0)
787        C_ST    REG5,UNIT(5)(a0)
788        C_ST    REG6,UNIT(6)(a0)
789        C_ST    REG7,UNIT(7)(a0)
790        PTR_ADDIU a0,a0,UNIT(8)
791/*
792 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
793 * copy one word (or double word) at a time.
794 */
795L(ua_chk1w):
796        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
797        beq     a2,t8,L(ua_smallCopy)
798        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
799        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */
800
801/* copying in words (4-byte or 8-byte chunks) */
802L(ua_wordCopy_loop):
803        C_LDHI  v1,UNIT(0)(a1)
804        C_LDLO  v1,UNITM1(1)(a1)
805        PTR_ADDIU a0,a0,UNIT(1)
806        PTR_ADDIU a1,a1,UNIT(1)
807        bne     a0,a3,L(ua_wordCopy_loop)
808        C_ST    v1,UNIT(-1)(a0)
809
810/* Copy the last 8 (or 16) bytes */
811L(ua_smallCopy):
812        beqz    a2,L(leave)
813        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
814L(ua_smallCopy_loop):
815        lb      v1,0(a1)
816        PTR_ADDIU a0,a0,1
817        PTR_ADDIU a1,a1,1
818        bne     a0,a3,L(ua_smallCopy_loop)
819        sb      v1,-1(a0)
820
821        j       ra
822        nop
823
824#else /* R6_CODE */
825
826# if __MIPSEB
827#  define SWAP_REGS(X,Y) X, Y
828#  define ALIGN_OFFSET(N) (N)
829# else
830#  define SWAP_REGS(X,Y) Y, X
831#  define ALIGN_OFFSET(N) (NSIZE-N)
832# endif
833# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
834        andi    REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
835        beq     REG7, a2, L(lastb); /* Check for bytes to copy by word     */ \
836        PTR_SUBU a3, a2, REG7;  /* a3 is number of bytes to be copied in   */ \
837                                /* (d)word chunks.                         */ \
838        move    a2, REG7;       /* a2 is # of bytes to copy byte by byte   */ \
839                                /* after word loop is finished.            */ \
840        PTR_ADDU REG6, a0, a3;  /* REG6 is the dst address after loop.     */ \
841        PTR_SUBU REG2, a1, t8;  /* REG2 is the aligned src address.        */ \
842        PTR_ADDU a1, a1, a3;    /* a1 is addr of source after word loop.   */ \
843        C_LD    t0, UNIT(0)(REG2);  /* Load first part of source.          */ \
844L(r6_ua_wordcopy##BYTEOFFSET):                                                \
845        C_LD    t1, UNIT(1)(REG2);  /* Load second part of source.         */ \
846        C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);             \
847        PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.      */ \
848        PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
849        move    t0, t1;         /* Move second part of source to first.    */ \
850        bne     a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);                       \
851        C_ST    REG3, UNIT(-1)(a0);                                           \
852        j       L(lastb);                                                     \
853        nop
854
855        /* We are generating R6 code, the destination is 4 byte aligned and
856           the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the
857           alignment of the source.  */
858
859L(r6_unaligned1):
860        R6_UNALIGNED_WORD_COPY(1)
861L(r6_unaligned2):
862        R6_UNALIGNED_WORD_COPY(2)
863L(r6_unaligned3):
864        R6_UNALIGNED_WORD_COPY(3)
865# ifdef USE_DOUBLE
866L(r6_unaligned4):
867        R6_UNALIGNED_WORD_COPY(4)
868L(r6_unaligned5):
869        R6_UNALIGNED_WORD_COPY(5)
870L(r6_unaligned6):
871        R6_UNALIGNED_WORD_COPY(6)
872L(r6_unaligned7):
873        R6_UNALIGNED_WORD_COPY(7)
874# endif
875#endif /* R6_CODE */
876
877        .set    at
878        .set    reorder
879END(MEMCPY_NAME)
880#ifndef ANDROID_CHANGES
881# ifdef _LIBC
882libc_hidden_builtin_def (MEMCPY_NAME)
883# endif
884#endif