Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

memcpy.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago
add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 25.7 KB

Rev	Line
[444]	1	/*
	2	* Copyright (c) 2012-2015
	3	* MIPS Technologies, Inc., California.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
	14	* contributors may be used to endorse or promote products derived from
	15	* this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
	18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
	21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	27	* SUCH DAMAGE.
	28	*/
	29
		/*
		 * Build-environment selection. Depending on ANDROID_CHANGES, _LIBC or
		 * _COMPILING_NEWLIB, pull in the matching headers and choose the load
		 * and store prefetch hints. Only the ANDROID_CHANGES branch defines
		 * USE_MEMMOVE_FOR_OVERLAP. The final #else branch selects no hints
		 * here; defaults are filled in later if USE_PREFETCH gets defined.
		 */
	30	#ifdef ANDROID_CHANGES
	31	# include "machine/asm.h"
	32	# include "machine/regdef.h"
	33	# define USE_MEMMOVE_FOR_OVERLAP
	34	# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
	35	# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
	36	#elif _LIBC
	37	# include <sysdep.h>
	38	# include <regdef.h>
	39	# include <sys/asm.h>
	40	# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
	41	# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
	42	#elif _COMPILING_NEWLIB
	43	# include "machine/asm.h"
	44	# include "machine/regdef.h"
	45	# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
	46	# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
	47	#else
	48	# include <regdef.h>
	49	# include <sys/asm.h>
	50	#endif
	51
	52	/* Check to see if the MIPS architecture we are compiling for supports
	53	* prefetching. Defining DISABLE_PREFETCH suppresses USE_PREFETCH even
		* when the ISA level matches.
	54	*/
	55
	56	#if (__mips == 4) \|\| (__mips == 5) \|\| (__mips == 32) \|\| (__mips == 64)
	57	# ifndef DISABLE_PREFETCH
	58	# define USE_PREFETCH
	59	# endif
	60	#endif
	61
		/* Copy in 64-bit (doubleword) units on the n64 and n32 ABIs unless
		* DISABLE_DOUBLE is defined. USE_DOUBLE later selects ld/sd and the
		* 8-byte NSIZE. */
	62	#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) \|\| (_MIPS_SIM == _ABIN32))
	63	# ifndef DISABLE_DOUBLE
	64	# define USE_DOUBLE
	65	# endif
	66	#endif
	67
	68
		/* ISA revision 6 and later: enable the R6_CODE paths and replace the
		* store hint with PREFETCH_HINT_STORE_STREAMED.
		* NOTE(review): the numeric PREFETCH_HINT_* values are defined further
		* down in this file, so unless an included header already provides them
		* the comparison below is between undefined identifiers (which the
		* preprocessor evaluates as 0) -- confirm this is intended. */
	69	#if __mips_isa_rev > 5
	70	# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	71	# undef PREFETCH_STORE_HINT
	72	# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
	73	# endif
	74	# define R6_CODE
	75	#endif
	76
		/*
		 * Fallback definitions for helper macros that the included asm.h may
		 * not supply. Each one is defined only when it is not already defined.
		 */
	77	/* Some asm.h files do not have the L macro definition. */
		/* L(label) builds a local label name: $L<label> for the o32 ABI,
		* .L<label> otherwise. */
	78	#ifndef L
	79	# if _MIPS_SIM == _ABIO32
	80	# define L(label) $L ## label
	81	# else
	82	# define L(label) .L ## label
	83	# endif
	84	#endif
	85
	86	/* Some asm.h files do not have the PTR_ADDIU macro definition. */
	87	#ifndef PTR_ADDIU
	88	# ifdef USE_DOUBLE
	89	# define PTR_ADDIU daddiu
	90	# else
	91	# define PTR_ADDIU addiu
	92	# endif
	93	#endif
	94
	95	/* Some asm.h files do not have the PTR_SRA macro definition. */
	96	#ifndef PTR_SRA
	97	# ifdef USE_DOUBLE
	98	# define PTR_SRA dsra
	99	# else
	100	# define PTR_SRA sra
	101	# endif
	102	#endif
	103
	104	/* New R6 instructions that may not be in asm.h. */
		/* Unlike the two fallbacks above, this one is keyed on the n64 ABI
		* rather than on USE_DOUBLE. */
	105	#ifndef PTR_LSA
	106	# if _MIPS_SIM == _ABI64
	107	# define PTR_LSA dlsa
	108	# else
	109	# define PTR_LSA lsa
	110	# endif
	111	#endif
	112
	113	/*
	114	* Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
	115	* prefetches appears to offer a slight performance advantage.
	116	*
	117	* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
	118	* or PREFETCH_STORE_STREAMED offers a large performance advantage
	119	* but PREPAREFORSTORE has some special restrictions to consider.
	120	*
	121	* Prefetch with the 'prepare for store' hint does not copy a memory
	122	* location into the cache, it just allocates a cache line and zeros
	123	* it out. This means that if you do not write to the entire cache
	124	* line before writing it out to memory some data will get zero'ed out
	125	* when the cache line is written back to memory and data will be lost.
	126	*
	127	* Also if you are using this memcpy to copy overlapping buffers it may
	128	* not behave correctly when using the 'prepare for store' hint. If you
	129	* use the 'prepare for store' prefetch on a memory area that is in the
	130	* memcpy source (as well as the memcpy destination), then you will get
	131	* some data zero'ed out before you have a chance to read it and data will
	132	* be lost.
	133	*
	134	* If you are going to use this memcpy routine with the 'prepare for store'
	135	* prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
	136	* the problem of running memcpy on overlapping buffers.
	137	*
	138	* There are ifdef'ed sections of this memcpy to make sure that it does not
	139	* do prefetches on cache lines that are not going to be completely written.
	140	* This code is only needed and only used when PREFETCH_STORE_HINT is set to
	141	* PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are
	142	* 32 bytes and if the cache line is larger it will not work correctly.
	143	*/
	144
	145	#ifdef USE_PREFETCH
	146	# define PREFETCH_HINT_LOAD 0
	147	# define PREFETCH_HINT_STORE 1
	148	# define PREFETCH_HINT_LOAD_STREAMED 4
	149	# define PREFETCH_HINT_STORE_STREAMED 5
	150	# define PREFETCH_HINT_LOAD_RETAINED 6
	151	# define PREFETCH_HINT_STORE_RETAINED 7
	152	# define PREFETCH_HINT_WRITEBACK_INVAL 25
	153	# define PREFETCH_HINT_PREPAREFORSTORE 30
	154
	155	/*
	156	* If we have not picked out what hints to use at this point use the
	157	* standard load and store prefetch hints.
	158	*/
	159	# ifndef PREFETCH_STORE_HINT
	160	# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
	161	# endif
	162	# ifndef PREFETCH_LOAD_HINT
	163	# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
	164	# endif
	165
	166	/*
	167	* We double everything when USE_DOUBLE is true so we do 2 prefetches to
	168	* get 64 bytes in that case. The assumption is that each individual
	169	* prefetch brings in 32 bytes.
	170	*/
	171
	172	# ifdef USE_DOUBLE
	173	# define PREFETCH_CHUNK 64
	174	# define PREFETCH_FOR_LOAD(chunk, reg) \
	175	pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
	176	pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
	177	# define PREFETCH_FOR_STORE(chunk, reg) \
	178	pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
	179	pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
	180	# else
	181	# define PREFETCH_CHUNK 32
	182	# define PREFETCH_FOR_LOAD(chunk, reg) \
	183	pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
	184	# define PREFETCH_FOR_STORE(chunk, reg) \
	185	pref PREFETCH_STORE_HINT, (chunk)*32(reg)
	186	# endif
	187	/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
	188	* than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size
	189	* of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
	190	* hint is used, the code will not work correctly. If PREPAREFORSTORE is not
	191	* used then MAX_PREFETCH_SIZE does not matter. */
	192	# define MAX_PREFETCH_SIZE 128
	193	/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
	194	* than 5 on a STORE prefetch and that a single prefetch can never be larger
	195	* than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because
	196	* we actually do two prefetches in that case, one 32 bytes after the other. */
	197	# ifdef USE_DOUBLE
	198	# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
	199	# else
	200	# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
	201	# endif
	202	# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
	203	&& ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
	204	/* We cannot handle this because the initial prefetches may fetch bytes that
	205	* are before the buffer being copied. We start copies with an offset
	206	* of 4 so avoid this situation when using PREPAREFORSTORE. */
	207	#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
	208	# endif
	209	#else /* USE_PREFETCH not defined */
	210	# define PREFETCH_FOR_LOAD(offset, reg)
	211	# define PREFETCH_FOR_STORE(offset, reg)
	212	#endif
	213
	214	/* Allow the routine to be named something else if desired. */
	215	#ifndef MEMCPY_NAME
	216	# define MEMCPY_NAME memcpy
	217	#endif
	218
	219	/* We use these 32/64 bit registers as temporaries to do the copying. */
		/* REG4-REG7 are t4-t7 on the o32/o64 ABIs and ta0-ta3 otherwise. */
	220	#define REG0 t0
	221	#define REG1 t1
	222	#define REG2 t2
	223	#define REG3 t3
	224	#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 \|\| _MIPS_SIM == _ABIO64)
	225	# define REG4 t4
	226	# define REG5 t5
	227	# define REG6 t6
	228	# define REG7 t7
	229	#else
	230	# define REG4 ta0
	231	# define REG5 ta1
	232	# define REG6 ta2
	233	# define REG7 ta3
	234	#endif
	235
	236	/* We load/store 64 bits at a time when USE_DOUBLE is true.
	237	* The C_ prefix stands for CHUNK and is used to avoid macro name
	238	* conflicts with system header files.
		*
		* C_LD/C_ST are the aligned load/store. C_LDHI/C_LDLO and C_STHI/C_STLO
		* are the partial (left/right) forms; which of the left/right
		* instructions plays the HI or LO role depends on __MIPSEB. */
	239
	240	#ifdef USE_DOUBLE
	241	# define C_ST sd
	242	# define C_LD ld
	243	# if __MIPSEB
	244	# define C_LDHI ldl /* high part is left in big-endian */
	245	# define C_STHI sdl /* high part is left in big-endian */
	246	# define C_LDLO ldr /* low part is right in big-endian */
	247	# define C_STLO sdr /* low part is right in big-endian */
	248	# else
	249	# define C_LDHI ldr /* high part is right in little-endian */
	250	# define C_STHI sdr /* high part is right in little-endian */
	251	# define C_LDLO ldl /* low part is left in little-endian */
	252	# define C_STLO sdl /* low part is left in little-endian */
	253	# endif
	254	# define C_ALIGN dalign /* r6 align instruction */
	255	#else
	256	# define C_ST sw
	257	# define C_LD lw
	258	# if __MIPSEB
	259	# define C_LDHI lwl /* high part is left in big-endian */
	260	# define C_STHI swl /* high part is left in big-endian */
	261	# define C_LDLO lwr /* low part is right in big-endian */
	262	# define C_STLO swr /* low part is right in big-endian */
	263	# else
	264	# define C_LDHI lwr /* high part is right in little-endian */
	265	# define C_STHI swr /* high part is right in little-endian */
	266	# define C_LDLO lwl /* low part is left in little-endian */
	267	# define C_STLO swl /* low part is left in little-endian */
	268	# endif
	269	# define C_ALIGN align /* r6 align instruction */
	270	#endif
	271
	272	/* Bookkeeping values for 32 vs. 64 bit mode. */
		/* NSIZE is the copy unit in bytes. NSIZEMASK is 8*NSIZE-1 (one
		* 8-unit chunk) and NSIZEDMASK is 16*NSIZE-1 (one 16-unit chunk). */
	273	#ifdef USE_DOUBLE
	274	# define NSIZE 8
	275	# define NSIZEMASK 0x3f
	276	# define NSIZEDMASK 0x7f
	277	#else
	278	# define NSIZE 4
	279	# define NSIZEMASK 0x1f
	280	# define NSIZEDMASK 0x3f
	281	#endif
		/* UNIT(n) is the byte offset of unit n; UNITM1(n) is the byte offset
		* of the last byte of unit n-1. */
	282	#define UNIT(unit) ((unit)*NSIZE)
	283	#define UNITM1(unit) (((unit)*NSIZE)-1)
	284
	285	#ifdef ANDROID_CHANGES
	286	LEAF(MEMCPY_NAME, 0)
	287	#else
	288	LEAF(MEMCPY_NAME)
	289	#endif
	290	.set nomips16
	291	.set noreorder
	292	/*
	293	* Below we handle the case where memcpy is called with overlapping src and dst.
	294	* Although memcpy is not required to handle this case, some parts of Android
	295	* like Skia rely on such usage. We call memmove to handle such cases.
	296	*/
	297	#ifdef USE_MEMMOVE_FOR_OVERLAP
	298	PTR_SUBU t0,a0,a1
	299	PTR_SRA t2,t0,31
	300	xor t1,t0,t2
	301	PTR_SUBU t0,t1,t2
	302	sltu t2,t0,a2
	303	beq t2,zero,L(memcpy)
	304	la t9,memmove
	305	jr t9
	306	nop
	307	L(memcpy):
	308	#endif
	309	/*
	310	* If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of
	311	* size, copy dst pointer to v0 for the return value.
	312	*/
	313	slti t2,a2,(2 * NSIZE)
	314	bne t2,zero,L(lasts)
	315	#if defined(RETURN_FIRST_PREFETCH) \|\| defined(RETURN_LAST_PREFETCH)
	316	move v0,zero
	317	#else
	318	move v0,a0
	319	#endif
	320
		/*
		 * Alignment preamble. Two variants: the pre-R6 code uses partial
		 * (left/right) load/store, the R6 code copies leading bytes one at a
		 * time and then dispatches on the source misalignment.
		 */
	321	#ifndef R6_CODE
	322
	323	/*
	324	* If src and dst have different alignments, go to L(unaligned), if they
	325	* have the same alignment (but are not actually aligned) do a partial
	326	* load/store to make them aligned. If they are both already aligned
	327	* we can start copying at L(aligned).
	328	*/
	329	xor t8,a1,a0
	330	andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
	331	bne t8,zero,L(unaligned)
	332	PTR_SUBU a3, zero, a0 /* delay slot: a3 = -a0, also used at L(unaligned) */
	333
	334	andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
	335	beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
	336	PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
	337
	338	C_LDHI t8,0(a1)
	339	PTR_ADDU a1,a1,a3
	340	C_STHI t8,0(a0)
	341	PTR_ADDU a0,a0,a3
	342
	343	#else /* R6_CODE */
	344
	345	/*
	346	* Align the destination and hope that the source gets aligned too. If it
	347	* doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
	348	* align instruction.
		*
		* t8 = dst & 7 selects an entry of L(atable). Entry k branches to
		* L(lb<8-k>), which copies 8-k leading bytes by falling through the
		* following L(lbN) labels; entry 0 skips straight to L(lb0).
	349	*/
	350	andi t8,a0,7
	351	lapc t9,L(atable)
	352	PTR_LSA t9,t8,t9,2
	353	jrc t9
	354	L(atable):
	355	bc L(lb0)
	356	bc L(lb7)
	357	bc L(lb6)
	358	bc L(lb5)
	359	bc L(lb4)
	360	bc L(lb3)
	361	bc L(lb2)
	362	bc L(lb1)
	363	L(lb7):
	364	lb a3, 6(a1)
	365	sb a3, 6(a0)
	366	L(lb6):
	367	lb a3, 5(a1)
	368	sb a3, 5(a0)
	369	L(lb5):
	370	lb a3, 4(a1)
	371	sb a3, 4(a0)
	372	L(lb4):
	373	lb a3, 3(a1)
	374	sb a3, 3(a0)
	375	L(lb3):
	376	lb a3, 2(a1)
	377	sb a3, 2(a0)
	378	L(lb2):
	379	lb a3, 1(a1)
	380	sb a3, 1(a0)
	381	L(lb1):
	382	lb a3, 0(a1)
	383	sb a3, 0(a0)
	384
		/* t8 = 8 - (dst & 7) bytes were copied; advance pointers and count. */
	385	li t9,8
	386	subu t8,t9,t8
	387	PTR_SUBU a2,a2,t8
	388	PTR_ADDU a0,a0,t8
	389	PTR_ADDU a1,a1,t8
	390	L(lb0):
	391
		/* Dispatch on the source misalignment t8 = src & (NSIZE-1): 0 goes to
		* L(aligned), anything else to the matching L(r6_unaligned<t8>). */
	392	andi t8,a1,(NSIZE-1)
	393	lapc t9,L(jtable)
	394	PTR_LSA t9,t8,t9,2
	395	jrc t9
	396	L(jtable):
	397	bc L(aligned)
	398	bc L(r6_unaligned1)
	399	bc L(r6_unaligned2)
	400	bc L(r6_unaligned3)
	401	# ifdef USE_DOUBLE
	402	bc L(r6_unaligned4)
	403	bc L(r6_unaligned5)
	404	bc L(r6_unaligned6)
	405	bc L(r6_unaligned7)
	406	# endif
	407	#endif /* R6_CODE */
	408
	409	L(aligned):
	410
	411	/*
	412	* Now dst/src are both aligned to (word or double word) aligned addresses
	413	* Set a2 to count how many bytes we have to copy after all the 64/128 byte
	414	* chunks are copied and a3 to the dst pointer after all the 64/128 byte
	415	* chunks have been copied. We will loop, incrementing a0 and a1 until a0
	416	* equals a3.
	417	*/
	418
	419	andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	420	beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	421	PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
	422	PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
	423
	424	/* When in the loop we may prefetch with the 'prepare to store' hint,
	425	* in this case the a0+x should not be past the "t0-32" address. This
	426	* means: for x=128 the last "safe" a0 address is "t0-160". Alternatively,
	427	* for x=64 the last "safe" a0 address is "t0-96" In the current version we
	428	* will use "prefetch hint,128(a0)", so "t0-160" is the limit.
	429	*/
	430	#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
		/* t0 is only a scratch value here; the loop below reloads t0 with
		* data, while t9 keeps the limit for the whole loop. */
	431	PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
	432	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
	433	#endif
	434	PREFETCH_FOR_LOAD (0, a1)
	435	PREFETCH_FOR_LOAD (1, a1)
	436	PREFETCH_FOR_LOAD (2, a1)
	437	PREFETCH_FOR_LOAD (3, a1)
	438	#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	439	PREFETCH_FOR_STORE (1, a0)
	440	PREFETCH_FOR_STORE (2, a0)
	441	PREFETCH_FOR_STORE (3, a0)
	442	#endif
		/* With RETURN_FIRST_PREFETCH / RETURN_LAST_PREFETCH defined, v0 is
		* loaded with a prefetch address instead of dst. */
	443	#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
	444	# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
	445	sltu v1,t9,a0
	446	bgtz v1,L(skip_set)
	447	nop
	448	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
	449	L(skip_set):
	450	# else
	451	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
	452	# endif
	453	#endif
	454	#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
	455	&& (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	456	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
	457	# ifdef USE_DOUBLE
	458	PTR_ADDIU v0,v0,32
	459	# endif
	460	#endif
		/* Main aligned loop: each pass copies 16 units (64 or 128 bytes) as
		* two groups of 8 loads followed by 8 stores. */
	461	L(loop16w):
	462	C_LD t0,UNIT(0)(a1)
	463	#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	464	sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
	465	bgtz v1,L(skip_pref)
	466	#endif
		/* When the bgtz above is assembled, this load is its delay slot. */
	467	C_LD t1,UNIT(1)(a1)
	468	#ifndef R6_CODE
	469	PREFETCH_FOR_STORE (4, a0)
	470	PREFETCH_FOR_STORE (5, a0)
	471	#else
	472	PREFETCH_FOR_STORE (2, a0)
	473	#endif
	474	#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
	475	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
	476	# ifdef USE_DOUBLE
	477	PTR_ADDIU v0,v0,32
	478	# endif
	479	#endif
	480	L(skip_pref):
	481	C_LD REG2,UNIT(2)(a1)
	482	C_LD REG3,UNIT(3)(a1)
	483	C_LD REG4,UNIT(4)(a1)
	484	C_LD REG5,UNIT(5)(a1)
	485	C_LD REG6,UNIT(6)(a1)
	486	C_LD REG7,UNIT(7)(a1)
	487	#ifndef R6_CODE
	488	PREFETCH_FOR_LOAD (4, a1)
	489	#else
	490	PREFETCH_FOR_LOAD (3, a1)
	491	#endif
	492	C_ST t0,UNIT(0)(a0)
	493	C_ST t1,UNIT(1)(a0)
	494	C_ST REG2,UNIT(2)(a0)
	495	C_ST REG3,UNIT(3)(a0)
	496	C_ST REG4,UNIT(4)(a0)
	497	C_ST REG5,UNIT(5)(a0)
	498	C_ST REG6,UNIT(6)(a0)
	499	C_ST REG7,UNIT(7)(a0)
	500
	501	C_LD t0,UNIT(8)(a1)
	502	C_LD t1,UNIT(9)(a1)
	503	C_LD REG2,UNIT(10)(a1)
	504	C_LD REG3,UNIT(11)(a1)
	505	C_LD REG4,UNIT(12)(a1)
	506	C_LD REG5,UNIT(13)(a1)
	507	C_LD REG6,UNIT(14)(a1)
	508	C_LD REG7,UNIT(15)(a1)
	509	#ifndef R6_CODE
	510	PREFETCH_FOR_LOAD (5, a1)
	511	#endif
	512	C_ST t0,UNIT(8)(a0)
	513	C_ST t1,UNIT(9)(a0)
	514	C_ST REG2,UNIT(10)(a0)
	515	C_ST REG3,UNIT(11)(a0)
	516	C_ST REG4,UNIT(12)(a0)
	517	C_ST REG5,UNIT(13)(a0)
	518	C_ST REG6,UNIT(14)(a0)
	519	C_ST REG7,UNIT(15)(a0)
	520	PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
	521	bne a0,a3,L(loop16w)
	522	PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
	523	move a2,t8 /* a2 = bytes left after the 16-unit chunks */
	524
	525	/* Here we have src and dest word-aligned but less than 64-bytes or
	526	* 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
	527	* is one. Otherwise jump down to L(chk1w) to handle the tail end of
	528	* the copy.
	529	*/
	530
	531	L(chkw):
	532	PREFETCH_FOR_LOAD (0, a1)
	533	andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
	534	/* The t8 is the remainder count past 32-bytes */
	535	beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
	536	nop
		/* Copy one 8-unit chunk: 8 loads, then 8 stores. */
	537	C_LD t0,UNIT(0)(a1)
	538	C_LD t1,UNIT(1)(a1)
	539	C_LD REG2,UNIT(2)(a1)
	540	C_LD REG3,UNIT(3)(a1)
	541	C_LD REG4,UNIT(4)(a1)
	542	C_LD REG5,UNIT(5)(a1)
	543	C_LD REG6,UNIT(6)(a1)
	544	C_LD REG7,UNIT(7)(a1)
	545	PTR_ADDIU a1,a1,UNIT(8)
	546	C_ST t0,UNIT(0)(a0)
	547	C_ST t1,UNIT(1)(a0)
	548	C_ST REG2,UNIT(2)(a0)
	549	C_ST REG3,UNIT(3)(a0)
	550	C_ST REG4,UNIT(4)(a0)
	551	C_ST REG5,UNIT(5)(a0)
	552	C_ST REG6,UNIT(6)(a0)
	553	C_ST REG7,UNIT(7)(a0)
	554	PTR_ADDIU a0,a0,UNIT(8)
	555
	556	/*
	557	* Here we have less than 32(64) bytes to copy. Set up for a loop to
	558	* copy one word (or double word) at a time. Set a2 to count how many
	559	* bytes we have to copy after all the word (or double word) chunks are
	560	* copied and a3 to the dst pointer after all the (d)word chunks have
	561	* been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
	562	*/
	563	L(chk1w):
	564	andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
	565	beq a2,t8,L(lastw)
	566	PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
	567	PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
	568
	569	/* copying in words (4-byte or 8-byte chunks) */
	570	L(wordCopy_loop):
	571	C_LD REG3,UNIT(0)(a1)
	572	PTR_ADDIU a0,a0,UNIT(1)
	573	PTR_ADDIU a1,a1,UNIT(1)
	574	bne a0,a3,L(wordCopy_loop)
	575	C_ST REG3,UNIT(-1)(a0) /* delay slot: a0 already advanced, hence -1 */
	576
	577	/* If we have been copying double words, see if we can copy a single word
	578	before doing byte copies. We can have, at most, one word to copy. */
	579
	580	L(lastw):
	581	#ifdef USE_DOUBLE
	582	andi t8,a2,3 /* t8 is the remainder past 4 byte chunks. */
	583	beq t8,a2,L(lastb) /* fewer than 4 bytes left: no word to copy */
	584	move a2,t8 /* delay slot: a2 = bytes left for the byte loop */
	585	lw REG3,0(a1)
	586	sw REG3,0(a0)
	587	PTR_ADDIU a0,a0,4
	588	PTR_ADDIU a1,a1,4
	589	#endif
	590
	591	/* Copy the remaining a2 bytes (fewer than 8 or 16) one byte at a time,
		then return through L(leave). */
	592	L(lastb):
	593	blez a2,L(leave) /* nothing left to copy */
	594	PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
	595	L(lastbloop):
	596	lb v1,0(a1)
	597	PTR_ADDIU a0,a0,1
	598	PTR_ADDIU a1,a1,1
	599	bne a0,a3,L(lastbloop)
	600	sb v1,-1(a0) /* delay slot: a0 already advanced, hence -1 */
	601	L(leave):
	602	j ra
	603	nop
	604
	605	/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
	606	whether or not USE_DOUBLE is defined. Instead of just doing byte
	607	copies, check the alignment and size and use lw/sw if possible.
	608	Otherwise, do byte copies.
		Each instruction written after a branch below is that branch's delay
		slot and runs whether or not the branch is taken. */
	609
	610	L(lasts):
	611	andi t8,a2,3 /* t8 = bytes left after whole words */
	612	beq t8,a2,L(lastb) /* fewer than 4 bytes: byte copy only */
	613
	614	andi t9,a0,3
	615	bne t9,zero,L(lastb) /* dst not 4-byte aligned */
	616	andi t9,a1,3
	617	bne t9,zero,L(lastb) /* src not 4-byte aligned */
	618
	619	PTR_SUBU a3,a2,t8 /* a3 = bytes to copy as words */
	620	PTR_ADDU a3,a0,a3 /* a3 = dst address after the word loop */
	621
	622	L(wcopy_loop):
	623	lw REG3,0(a1)
	624	PTR_ADDIU a0,a0,4
	625	PTR_ADDIU a1,a1,4
	626	bne a0,a3,L(wcopy_loop)
	627	sw REG3,-4(a0)
	628
	629	b L(lastb)
	630	move a2,t8 /* delay slot: a2 = leftover byte count */
	631
	632	#ifndef R6_CODE
	633	/*
	634	* UNALIGNED case, got here with a3 = "negu a0"
	635	* This code is nearly identical to the aligned code above
	636	* but only the destination (not the source) gets aligned
	637	* so we need to do partial loads of the source followed
	638	* by normal stores to the destination (once we have aligned
	639	* the destination).
	640	*/
	641
	642	L(unaligned):
	643	andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
	644	beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
	645	PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
	646
		/* Assemble one full unit from the unaligned source with the HI/LO
		* pair, then store only its leading part with C_STHI. */
	647	C_LDHI v1,UNIT(0)(a1)
	648	C_LDLO v1,UNITM1(1)(a1)
	649	PTR_ADDU a1,a1,a3
	650	C_STHI v1,UNIT(0)(a0)
	651	PTR_ADDU a0,a0,a3
	652
	653	/*
	654	* Now the destination (but not the source) is aligned
	655	* Set a2 to count how many bytes we have to copy after all the 64/128 byte
	656	* chunks are copied and a3 to the dst pointer after all the 64/128 byte
	657	* chunks have been copied. We will loop, incrementing a0 and a1 until a0
	658	* equals a3.
	659	*/
	660
	661	L(ua_chk16w):
	662	andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	663	beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	664	PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
	665	PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
	666
	667	# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	668	PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
	669	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
	670	# endif
	671	PREFETCH_FOR_LOAD (0, a1)
	672	PREFETCH_FOR_LOAD (1, a1)
	673	PREFETCH_FOR_LOAD (2, a1)
	674	# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	675	PREFETCH_FOR_STORE (1, a0)
	676	PREFETCH_FOR_STORE (2, a0)
	677	PREFETCH_FOR_STORE (3, a0)
	678	# endif
	679	# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
	680	# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	681	sltu v1,t9,a0
	682	bgtz v1,L(ua_skip_set)
	683	nop
	684	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
	685	L(ua_skip_set):
	686	# else
	687	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
	688	# endif
	689	# endif
		/* Main unaligned-source loop: each pass copies 16 units. Every unit
		* is assembled with a C_LDHI/C_LDLO pair and written with an aligned
		* C_ST. */
	690	L(ua_loop16w):
	691	PREFETCH_FOR_LOAD (3, a1)
	692	C_LDHI t0,UNIT(0)(a1)
	693	C_LDHI t1,UNIT(1)(a1)
	694	C_LDHI REG2,UNIT(2)(a1)
	695	# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	696	sltu v1,t9,a0 /* If a0 > t9 skip the store prefetches */
	697	bgtz v1,L(ua_skip_pref)
	698	# endif
		/* When the bgtz above is assembled, this load is its delay slot. */
	699	C_LDHI REG3,UNIT(3)(a1)
	700	PREFETCH_FOR_STORE (4, a0)
	701	PREFETCH_FOR_STORE (5, a0)
	702	L(ua_skip_pref):
	703	C_LDHI REG4,UNIT(4)(a1)
	704	C_LDHI REG5,UNIT(5)(a1)
	705	C_LDHI REG6,UNIT(6)(a1)
	706	C_LDHI REG7,UNIT(7)(a1)
	707	C_LDLO t0,UNITM1(1)(a1)
	708	C_LDLO t1,UNITM1(2)(a1)
	709	C_LDLO REG2,UNITM1(3)(a1)
	710	C_LDLO REG3,UNITM1(4)(a1)
	711	C_LDLO REG4,UNITM1(5)(a1)
	712	C_LDLO REG5,UNITM1(6)(a1)
	713	C_LDLO REG6,UNITM1(7)(a1)
	714	C_LDLO REG7,UNITM1(8)(a1)
	715	PREFETCH_FOR_LOAD (4, a1)
	716	C_ST t0,UNIT(0)(a0)
	717	C_ST t1,UNIT(1)(a0)
	718	C_ST REG2,UNIT(2)(a0)
	719	C_ST REG3,UNIT(3)(a0)
	720	C_ST REG4,UNIT(4)(a0)
	721	C_ST REG5,UNIT(5)(a0)
	722	C_ST REG6,UNIT(6)(a0)
	723	C_ST REG7,UNIT(7)(a0)
	724	C_LDHI t0,UNIT(8)(a1)
	725	C_LDHI t1,UNIT(9)(a1)
	726	C_LDHI REG2,UNIT(10)(a1)
	727	C_LDHI REG3,UNIT(11)(a1)
	728	C_LDHI REG4,UNIT(12)(a1)
	729	C_LDHI REG5,UNIT(13)(a1)
	730	C_LDHI REG6,UNIT(14)(a1)
	731	C_LDHI REG7,UNIT(15)(a1)
	732	C_LDLO t0,UNITM1(9)(a1)
	733	C_LDLO t1,UNITM1(10)(a1)
	734	C_LDLO REG2,UNITM1(11)(a1)
	735	C_LDLO REG3,UNITM1(12)(a1)
	736	C_LDLO REG4,UNITM1(13)(a1)
	737	C_LDLO REG5,UNITM1(14)(a1)
	738	C_LDLO REG6,UNITM1(15)(a1)
	739	C_LDLO REG7,UNITM1(16)(a1)
	740	PREFETCH_FOR_LOAD (5, a1)
	741	C_ST t0,UNIT(8)(a0)
	742	C_ST t1,UNIT(9)(a0)
	743	C_ST REG2,UNIT(10)(a0)
	744	C_ST REG3,UNIT(11)(a0)
	745	C_ST REG4,UNIT(12)(a0)
	746	C_ST REG5,UNIT(13)(a0)
	747	C_ST REG6,UNIT(14)(a0)
	748	C_ST REG7,UNIT(15)(a0)
	749	PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
	750	bne a0,a3,L(ua_loop16w)
	751	PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
	752	move a2,t8 /* a2 = bytes left after the 16-unit chunks */
	753
	754	/* Here we have dest word-aligned (the source is not) but less than 64-bytes
	755	* or 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
	756	* is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
	757	* the copy. */
	758
	759	L(ua_chkw):
	760	PREFETCH_FOR_LOAD (0, a1)
	761	andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
	762	/* t8 is the remainder count past 32-bytes */
	763	beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
	764	nop
		/* Copy one 8-unit chunk using HI/LO partial loads and aligned stores. */
	765	C_LDHI t0,UNIT(0)(a1)
	766	C_LDHI t1,UNIT(1)(a1)
	767	C_LDHI REG2,UNIT(2)(a1)
	768	C_LDHI REG3,UNIT(3)(a1)
	769	C_LDHI REG4,UNIT(4)(a1)
	770	C_LDHI REG5,UNIT(5)(a1)
	771	C_LDHI REG6,UNIT(6)(a1)
	772	C_LDHI REG7,UNIT(7)(a1)
	773	C_LDLO t0,UNITM1(1)(a1)
	774	C_LDLO t1,UNITM1(2)(a1)
	775	C_LDLO REG2,UNITM1(3)(a1)
	776	C_LDLO REG3,UNITM1(4)(a1)
	777	C_LDLO REG4,UNITM1(5)(a1)
	778	C_LDLO REG5,UNITM1(6)(a1)
	779	C_LDLO REG6,UNITM1(7)(a1)
	780	C_LDLO REG7,UNITM1(8)(a1)
	781	PTR_ADDIU a1,a1,UNIT(8)
	782	C_ST t0,UNIT(0)(a0)
	783	C_ST t1,UNIT(1)(a0)
	784	C_ST REG2,UNIT(2)(a0)
	785	C_ST REG3,UNIT(3)(a0)
	786	C_ST REG4,UNIT(4)(a0)
	787	C_ST REG5,UNIT(5)(a0)
	788	C_ST REG6,UNIT(6)(a0)
	789	C_ST REG7,UNIT(7)(a0)
	790	PTR_ADDIU a0,a0,UNIT(8)
	791	/*
	792	* Here we have less than 32(64) bytes to copy. Set up for a loop to
	793	* copy one word (or double word) at a time. a2 becomes the byte count
		* left after the whole (d)words and a3 the dst address at which the
		* (d)word loop stops.
	794	*/
	795	L(ua_chk1w):
	796	andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
	797	beq a2,t8,L(ua_smallCopy)
	798	PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
	799	PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
	800
	801	/* copying in words (4-byte or 8-byte chunks) */
	802	L(ua_wordCopy_loop):
	803	C_LDHI v1,UNIT(0)(a1)
	804	C_LDLO v1,UNITM1(1)(a1)
	805	PTR_ADDIU a0,a0,UNIT(1)
	806	PTR_ADDIU a1,a1,UNIT(1)
	807	bne a0,a3,L(ua_wordCopy_loop)
	808	C_ST v1,UNIT(-1)(a0) /* delay slot: a0 already advanced, hence -1 */
	809
	810	/* Copy the remaining a2 bytes (fewer than 8 or 16) one byte at a time,
		then return. */
	811	L(ua_smallCopy):
	812	beqz a2,L(leave) /* nothing left to copy */
	813	PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
	814	L(ua_smallCopy_loop):
	815	lb v1,0(a1)
	816	PTR_ADDIU a0,a0,1
	817	PTR_ADDIU a1,a1,1
	818	bne a0,a3,L(ua_smallCopy_loop)
	819	sb v1,-1(a0) /* delay slot: a0 already advanced, hence -1 */
	820
	821	j ra
	822	nop
	823
	824	#else /* R6_CODE */
	825
		/* SWAP_REGS orders the two source registers and ALIGN_OFFSET converts
		* the source byte offset for C_ALIGN, both depending on __MIPSEB. */
	826	# if __MIPSEB
	827	# define SWAP_REGS(X,Y) X, Y
	828	# define ALIGN_OFFSET(N) (N)
	829	# else
	830	# define SWAP_REGS(X,Y) Y, X
	831	# define ALIGN_OFFSET(N) (NSIZE-N)
	832	# endif
		/*
		 * R6_UNALIGNED_WORD_COPY(BYTEOFFSET): copy the whole (d)words of the
		 * remaining a2 bytes when the source is misaligned by BYTEOFFSET bytes.
		 * Expects t8 to hold that misalignment (it is subtracted from a1 to get
		 * the aligned source address). Aligned (d)words are loaded and each
		 * adjacent pair is combined with C_ALIGN before the store. On exit a2
		 * holds the leftover byte count and control goes to L(lastb).
		 */
	833	# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
	834	andi REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy by bytes. */ \
	835	beq REG7, a2, L(lastb); /* Check for bytes to copy by word */ \
	836	PTR_SUBU a3, a2, REG7; /* a3 is number of bytes to be copied in */ \
	837	/* (d)word chunks. */ \
	838	move a2, REG7; /* a2 is # of bytes to copy byte by byte */ \
	839	/* after word loop is finished. */ \
	840	PTR_ADDU REG6, a0, a3; /* REG6 is the dst address after loop. */ \
	841	PTR_SUBU REG2, a1, t8; /* REG2 is the aligned src address. */ \
	842	PTR_ADDU a1, a1, a3; /* a1 is addr of source after word loop. */ \
	843	C_LD t0, UNIT(0)(REG2); /* Load first part of source. */ \
	844	L(r6_ua_wordcopy##BYTEOFFSET): \
	845	C_LD t1, UNIT(1)(REG2); /* Load second part of source. */ \
	846	C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \
	847	PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer. */ \
	848	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
	849	move t0, t1; /* Move second part of source to first. */ \
	850	bne a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \
	851	C_ST REG3, UNIT(-1)(a0); \
	852	j L(lastb); \
	853	nop
	854
	855	/* We are generating R6 code, the destination is aligned and the source
	856	is not (d)word aligned. t8 is 1, 2, or 3 (up to 7 with USE_DOUBLE)
	857	depending on the alignment of the source. Each label below is the
		target of the matching L(jtable) entry and expands the copy macro for
		that byte offset. */
	858
	859	L(r6_unaligned1):
	860	R6_UNALIGNED_WORD_COPY(1)
	861	L(r6_unaligned2):
	862	R6_UNALIGNED_WORD_COPY(2)
	863	L(r6_unaligned3):
	864	R6_UNALIGNED_WORD_COPY(3)
	865	# ifdef USE_DOUBLE
	866	L(r6_unaligned4):
	867	R6_UNALIGNED_WORD_COPY(4)
	868	L(r6_unaligned5):
	869	R6_UNALIGNED_WORD_COPY(5)
	870	L(r6_unaligned6):
	871	R6_UNALIGNED_WORD_COPY(6)
	872	L(r6_unaligned7):
	873	R6_UNALIGNED_WORD_COPY(7)
	874	# endif
	875	#endif /* R6_CODE */
	876
		/* End of the routine: switch the assembler back to "at" and "reorder"
		* mode and close the function opened by LEAF. The hidden-builtin
		* definition is emitted only for _LIBC builds without ANDROID_CHANGES. */
	877	.set at
	878	.set reorder
	879	END(MEMCPY_NAME)
	880	#ifndef ANDROID_CHANGES
	881	# ifdef _LIBC
	882	libc_hidden_builtin_def (MEMCPY_NAME)
	883	# endif
	884	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format