/* Copyright (c) 2015, Synopsys, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   1) Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

   2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.  */

/* This implementation is optimized for performance.  For code size a generic
   implementation of this function from newlib/libc/string/memcpy.c will be
   used.  */
#if !defined (__OPTIMIZE_SIZE__) && !defined (PREFER_SIZE_OVER_SPEED)

#include "asm.h"

#if defined (__ARCHS__)

#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

#ifdef __ARC_LL64__
# define PREFETCH_READ(RX)	prefetch	[RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch	[RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif

#ifdef __ARC_ALIGNED_ACCESS__
ENTRY (memcpy)
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2
; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2

	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

; CASE 0: Both source and destination are 32bit aligned
; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopy32_64bytes
	; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND	; Last remaining 31 bytes
.Lsmallchunk:
	lpnz	@.Lcopyremainingbytes
	; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
; END CASE 0
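; The source is not 32bit aligned (r4 = src & 0x03).  Offsets 2 and 3 branch
; to their own cases below; offset 1 falls through to CASE 1.  Each case first
; reads just enough leading bytes to bring the source pointer up to a word
; boundary, then the copy loops read aligned words and combine each new word
; with the carried remainder of the previous one (SHIFT_1/SHIFT_2, carry held
; in r5), so every store to the already aligned destination is a full word.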
.Lsourceunaligned:
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]

; CASE 1: The source is unaligned, off by 1
	; Hence I need to read 1 byte for a 16bit alignment
	; and 2 bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6

; Both src and dst are aligned
	lpnz	@.Lcopy8bytes_1
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1, 4]
	prefetchw [r3, 32]	; Prefetch the next write location

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; Last 8 bytes
	lpnz	@.Lcopybytewise_1
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1

; Both src and dst are aligned
; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1, 4]
	prefetchw [r3, 32]	; Prefetch the next write location

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]

	and.f	lp_count, r2, 0x07	; Last 8 bytes
	lpnz	@.Lcopybytewise_2
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
; CASE 3: The source is unaligned, off by 3
; Hence, I need to read 1 byte to achieve the 32bit alignment

; Both src and dst are aligned
; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1, 4]
	prefetchw [r3, 32]	; Prefetch the next write location

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; Last 8 bytes
	lpnz	@.Lcopybytewise_3
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

ENDFUNC (memcpy)
#else
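; __ARC_ALIGNED_ACCESS__ is not defined: the core is expected to handle
; unaligned loads and stores itself, so this variant does no pointer
; alignment at all.  The main loop issues LOADX/STOREX sized accesses on the
; raw pointers, and the tail is copied in 64-bit chunks (when __ARC_LL64__ is
; available) and finally byte by byte.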
ENTRY(memcpy)
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

;;; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2

;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopyfast
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopyfast:

#ifdef __ARC_LL64__
	and	r2, r2, ZOLAND		; Remaining 31 bytes
	lsr.f	lp_count, r2, 3		; Convert to 64-bit words.
	lpnz	@.Lcopy64b
	;; LOOP START
	ldd.ab	r6, [r1, 8]
	std.ab	r6, [r3, 8]
.Lcopy64b:

	and.f	lp_count, r2, 0x07	; Last 7 bytes
#else
	and.f	lp_count, r2, ZOLAND
#endif

.Lsmallchunk:
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
ENDFUNC(memcpy)
#endif

#endif /* __ARCHS__ */

#endif /* !__OPTIMIZE_SIZE__ && !PREFER_SIZE_OVER_SPEED */