[444] | 1 | /* |
---|
| 2 | Copyright (c) 2015, Synopsys, Inc. All rights reserved. |
---|
| 3 | |
---|
| 4 | Redistribution and use in source and binary forms, with or without |
---|
| 5 | modification, are permitted provided that the following conditions are met: |
---|
| 6 | |
---|
| 7 | 1) Redistributions of source code must retain the above copyright notice, |
---|
| 8 | this list of conditions and the following disclaimer. |
---|
| 9 | |
---|
| 10 | 2) Redistributions in binary form must reproduce the above copyright notice, |
---|
| 11 | this list of conditions and the following disclaimer in the documentation |
---|
| 12 | and/or other materials provided with the distribution. |
---|
| 13 | |
---|
| 14 | 3) Neither the name of the Synopsys, Inc., nor the names of its contributors |
---|
| 15 | may be used to endorse or promote products derived from this software |
---|
| 16 | without specific prior written permission. |
---|
| 17 | |
---|
| 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
| 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
| 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
| 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
---|
| 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
| 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
| 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
| 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
| 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
| 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
---|
| 28 | POSSIBILITY OF SUCH DAMAGE. |
---|
| 29 | */ |
---|
| 30 | |
---|
| 31 | /* This implementation is optimized for performance. For code size a generic |
---|
| 32 | implementation of this function from newlib/libc/string/memcpy.c will be |
---|
| 33 | used. */ |
---|
| 34 | #if !defined (__OPTIMIZE_SIZE__) && !defined (PREFER_SIZE_OVER_SPEED) |
---|
| 35 | |
---|
| 36 | #include "asm.h" |
---|
| 37 | |
---|
| 38 | #if defined (__ARCHS__) |
---|
| 39 | |
---|
; Endian-specific helpers for the misaligned-source copy paths.
; SHIFT_1 and SHIFT_2 shift in opposite directions so that pieces of two
; consecutive source words can be OR-ed together into one destination word.
; MERGE_1 and MERGE_2 position the first bytes read in the off-by-1 case
; (MERGE_2 expands to nothing on little endian).  EXTRACT_1 and EXTRACT_2
; pull the trailing halfword and byte back out of the carry register.
; Little-endian EXTRACT_1 and big-endian EXTRACT_2 ignore their IMM argument.
| 40 | #ifdef __LITTLE_ENDIAN__ |
---|
| 41 | # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << |
---|
| 42 | # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> |
---|
| 43 | # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM |
---|
| 44 | # define MERGE_2(RX,RY,IMM) |
---|
| 45 | # define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF |
---|
| 46 | # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM |
---|
| 47 | #else |
---|
| 48 | # define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> |
---|
| 49 | # define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << |
---|
| 50 | # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << |
---|
| 51 | # define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << |
---|
| 52 | # define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM |
---|
| 53 | # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 |
---|
| 54 | #endif |
---|
| 55 | |
---|
; Tuning for the unrolled bulk-copy loop.  With __ARC_LL64__ each LOADX or
; STOREX moves 8 bytes (ldd/std, post-incrementing the pointer by 8);
; otherwise each moves 4 bytes (ld/st, post-increment by 4).  The bulk loops
; issue four LOADX and four STOREX per iteration, so ZOLSHFT is the shift
; that turns a byte count into an iteration count (32 or 16 bytes per
; iteration) and ZOLAND masks out the bytes left over after the loop.
| 56 | #ifdef __ARC_LL64__ |
---|
| 57 | # define PREFETCH_READ(RX) prefetch [RX, 56] |
---|
| 58 | # define PREFETCH_WRITE(RX) prefetchw [RX, 64] |
---|
| 59 | # define LOADX(DST,RX) ldd.ab DST, [RX, 8] |
---|
| 60 | # define STOREX(SRC,RX) std.ab SRC, [RX, 8] |
---|
| 61 | # define ZOLSHFT 5 |
---|
| 62 | # define ZOLAND 0x1F |
---|
| 63 | #else |
---|
| 64 | # define PREFETCH_READ(RX) prefetch [RX, 28] |
---|
| 65 | # define PREFETCH_WRITE(RX) prefetchw [RX, 32] |
---|
| 66 | # define LOADX(DST,RX) ld.ab DST, [RX, 4] |
---|
| 67 | # define STOREX(SRC,RX) st.ab SRC, [RX, 4] |
---|
| 68 | # define ZOLSHFT 4 |
---|
| 69 | # define ZOLAND 0xF |
---|
| 70 | #endif |
---|
| 71 | |
---|
| 72 | #ifdef __ARC_ALIGNED_ACCESS__ |
---|
| 73 | ENTRY (memcpy) |
---|
; memcpy for builds where __ARC_ALIGNED_ACCESS__ is defined.
; In: r0 = destination, r1 = source, r2 = byte count.
; r0 is never written, so it still holds the destination on return;
; r3 is the running write pointer and r1 the running read pointer.
; Copies of up to 8 bytes go straight to the byte loop at .Lsmallchunk.
; Otherwise bytes are copied until the destination is 32-bit aligned, then
; the low two bits of the source address select CASE 0 (source aligned as
; well) or CASES 1-3 (source off by 1, 2 or 3).  Cases 1-3 first read up to
; the next source word boundary, then build every destination word from two
; neighbouring source words with shifts, carrying the leftover in r5.
| 74 | prefetch [r1] ; Prefetch the read location |
---|
| 75 | prefetchw [r0] ; Prefetch the write location |
---|
| 76 | mov.f 0, r2 |
---|
| 77 | ; if size is zero, return (the mov below is the delay slot of jz.d) |
---|
| 78 | jz.d [blink] |
---|
| 79 | mov r3, r0 ; don't clobber ret val |
---|
| 80 | |
---|
| 81 | ; if size <= 8 |
---|
; The delay-slot mov.f loads lp_count with the size and sets the flags
; that the lpnz at .Lsmallchunk tests.
| 82 | cmp r2, 8 |
---|
| 83 | bls.d @.Lsmallchunk |
---|
| 84 | mov.f lp_count, r2 |
---|
| 85 | |
---|
; Copy 4 - (dest & 3) leading bytes so the destination becomes 32-bit
; aligned; the loop is skipped when the destination is aligned already.
| 86 | and.f r4, r0, 0x03 |
---|
| 87 | rsub lp_count, r4, 4 |
---|
| 88 | lpnz @.Laligndestination |
---|
| 89 | ; LOOP BEGIN |
---|
| 90 | ldb.ab r5, [r1,1] |
---|
| 91 | sub r2, r2, 1 |
---|
| 92 | stb.ab r5, [r3,1] |
---|
| 93 | .Laligndestination: |
---|
| 94 | |
---|
| 95 | ; Check the alignment of the source (r4 = source & 3) |
---|
| 96 | and.f r4, r1, 0x03 |
---|
| 97 | bnz.d @.Lsourceunaligned |
---|
| 98 | |
---|
| 99 | ; CASE 0: Both source and destination are 32bit aligned |
---|
| 100 | ; lp_count = len >> ZOLSHFT, four loads and four stores per iteration |
---|
; (the lsr.f below sits in the delay slot of the bnz.d above)
| 101 | lsr.f lp_count, r2, ZOLSHFT |
---|
| 102 | lpnz @.Lcopy32_64bytes |
---|
| 103 | ; LOOP START |
---|
| 104 | LOADX (r6, r1) |
---|
| 105 | PREFETCH_READ (r1) |
---|
| 106 | PREFETCH_WRITE (r3) |
---|
| 107 | LOADX (r8, r1) |
---|
| 108 | LOADX (r10, r1) |
---|
| 109 | LOADX (r4, r1) |
---|
| 110 | STOREX (r6, r3) |
---|
| 111 | STOREX (r8, r3) |
---|
| 112 | STOREX (r10, r3) |
---|
| 113 | STOREX (r4, r3) |
---|
| 114 | .Lcopy32_64bytes: |
---|
| 115 | |
---|
| 116 | and.f lp_count, r2, ZOLAND ;Bytes left over after the unrolled loop |
---|
| 117 | .Lsmallchunk: |
---|
| 118 | lpnz @.Lcopyremainingbytes |
---|
| 119 | ; LOOP START |
---|
| 120 | ldb.ab r5, [r1,1] |
---|
| 121 | stb.ab r5, [r3,1] |
---|
| 122 | .Lcopyremainingbytes: |
---|
| 123 | |
---|
| 124 | j [blink] |
---|
| 125 | ; END CASE 0 |
---|
| 126 | |
---|
| 127 | .Lsourceunaligned: |
---|
; r4 = source & 3, which is 1, 2 or 3 here.  The sub in the delay slot of
; beq.d runs for all three cases.  The ldb.ab in the delay slot of bhi.d
; runs for cases 1 and 3 and leaves the first source byte in r5.
| 128 | cmp r4, 2 |
---|
| 129 | beq.d @.LunalignedOffby2 |
---|
| 130 | sub r2, r2, 1 |
---|
| 131 | |
---|
| 132 | bhi.d @.LunalignedOffby3 |
---|
| 133 | ldb.ab r5, [r1, 1] |
---|
| 134 | |
---|
| 135 | ; CASE 1: The source is unaligned, off by 1 |
---|
| 136 | ; One byte (already loaded into r5) gives 16-bit alignment, |
---|
| 137 | ; and 2 more bytes are read to reach 32-bit alignment |
---|
| 138 | ldh.ab r6, [r1, 2] |
---|
| 139 | sub r2, r2, 2 |
---|
| 140 | ; Convert to words, unfold x2 |
---|
| 141 | lsr.f lp_count, r2, 3 |
---|
; Combine the byte and the halfword into r5, the 24-bit carry-over
| 142 | MERGE_1 (r6, r6, 8) |
---|
| 143 | MERGE_2 (r5, r5, 24) |
---|
| 144 | or r5, r5, r6 |
---|
| 145 | |
---|
| 146 | ; Both src and dst are aligned |
---|
| 147 | lpnz @.Lcopy8bytes_1 |
---|
| 148 | ; LOOP START |
---|
| 149 | ld.ab r6, [r1, 4] |
---|
| 150 | prefetch [r1, 28] ;Prefetch the next read location |
---|
| 151 | ld.ab r8, [r1,4] |
---|
| 152 | prefetchw [r3, 32] ;Prefetch the next write location |
---|
| 153 | |
---|
| 154 | SHIFT_1 (r7, r6, 24) |
---|
| 155 | or r7, r7, r5 |
---|
| 156 | SHIFT_2 (r5, r6, 8) |
---|
| 157 | |
---|
| 158 | SHIFT_1 (r9, r8, 24) |
---|
| 159 | or r9, r9, r5 |
---|
| 160 | SHIFT_2 (r5, r8, 8) |
---|
| 161 | |
---|
| 162 | st.ab r7, [r3, 4] |
---|
| 163 | st.ab r9, [r3, 4] |
---|
| 164 | .Lcopy8bytes_1: |
---|
| 165 | |
---|
| 166 | ; Write back the remaining 16bits |
---|
| 167 | EXTRACT_1 (r6, r5, 16) |
---|
| 168 | sth.ab r6, [r3, 2] |
---|
| 169 | ; Write back the remaining 8bits |
---|
| 170 | EXTRACT_2 (r5, r5, 16) |
---|
| 171 | stb.ab r5, [r3, 1] |
---|
| 172 | |
---|
| 173 | and.f lp_count, r2, 0x07 ;Last 0-7 bytes |
---|
| 174 | lpnz @.Lcopybytewise_1 |
---|
| 175 | ; LOOP START |
---|
| 176 | ldb.ab r6, [r1,1] |
---|
| 177 | stb.ab r6, [r3,1] |
---|
| 178 | .Lcopybytewise_1: |
---|
| 179 | j [blink] |
---|
| 180 | |
---|
| 181 | .LunalignedOffby2: |
---|
| 182 | ; CASE 2: The source is unaligned, off by 2 |
---|
; Read one halfword to reach 32-bit alignment; r5 carries it forward
| 183 | ldh.ab r5, [r1, 2] |
---|
| 184 | sub r2, r2, 1 |
---|
| 185 | |
---|
| 186 | ; Both src and dst are aligned |
---|
| 187 | ; Convert to words, unfold x2 |
---|
| 188 | lsr.f lp_count, r2, 3 |
---|
; The .nz shift here and the one after the loop both use the flags from the
; lsr.f above, so r5 is only repositioned when the loop actually runs.
| 189 | #ifdef __BIG_ENDIAN__ |
---|
| 190 | asl.nz r5, r5, 16 |
---|
| 191 | #endif |
---|
| 192 | lpnz @.Lcopy8bytes_2 |
---|
| 193 | ; LOOP START |
---|
| 194 | ld.ab r6, [r1, 4] |
---|
| 195 | prefetch [r1, 28] ;Prefetch the next read location |
---|
| 196 | ld.ab r8, [r1,4] |
---|
| 197 | prefetchw [r3, 32] ;Prefetch the next write location |
---|
| 198 | |
---|
| 199 | SHIFT_1 (r7, r6, 16) |
---|
| 200 | or r7, r7, r5 |
---|
| 201 | SHIFT_2 (r5, r6, 16) |
---|
| 202 | |
---|
| 203 | SHIFT_1 (r9, r8, 16) |
---|
| 204 | or r9, r9, r5 |
---|
| 205 | SHIFT_2 (r5, r8, 16) |
---|
| 206 | |
---|
| 207 | st.ab r7, [r3, 4] |
---|
| 208 | st.ab r9, [r3, 4] |
---|
| 209 | .Lcopy8bytes_2: |
---|
| 210 | |
---|
| 211 | #ifdef __BIG_ENDIAN__ |
---|
| 212 | lsr.nz r5, r5, 16 |
---|
| 213 | #endif |
---|
; Write back the carried halfword
| 214 | sth.ab r5, [r3, 2] |
---|
| 215 | |
---|
| 216 | and.f lp_count, r2, 0x07 ;Last 0-7 bytes |
---|
| 217 | lpnz @.Lcopybytewise_2 |
---|
| 218 | ; LOOP START |
---|
| 219 | ldb.ab r6, [r1,1] |
---|
| 220 | stb.ab r6, [r3,1] |
---|
| 221 | .Lcopybytewise_2: |
---|
| 222 | j [blink] |
---|
| 223 | |
---|
| 224 | .LunalignedOffby3: |
---|
| 225 | ; CASE 3: The source is unaligned, off by 3 |
---|
| 226 | ; One byte reaches 32-bit alignment; it was loaded into r5 before the branch here |
---|
| 227 | |
---|
| 228 | ; Both src and dst are aligned |
---|
| 229 | ; Convert to words, unfold x2 |
---|
| 230 | lsr.f lp_count, r2, 3 |
---|
; As in CASE 2, the conditional shifts of r5 follow the flags of the lsr.f
| 231 | #ifdef __BIG_ENDIAN__ |
---|
| 232 | asl.ne r5, r5, 24 |
---|
| 233 | #endif |
---|
| 234 | lpnz @.Lcopy8bytes_3 |
---|
| 235 | ; LOOP START |
---|
| 236 | ld.ab r6, [r1, 4] |
---|
| 237 | prefetch [r1, 28] ;Prefetch the next read location |
---|
| 238 | ld.ab r8, [r1,4] |
---|
| 239 | prefetchw [r3, 32] ;Prefetch the next write location |
---|
| 240 | |
---|
| 241 | SHIFT_1 (r7, r6, 8) |
---|
| 242 | or r7, r7, r5 |
---|
| 243 | SHIFT_2 (r5, r6, 24) |
---|
| 244 | |
---|
| 245 | SHIFT_1 (r9, r8, 8) |
---|
| 246 | or r9, r9, r5 |
---|
| 247 | SHIFT_2 (r5, r8, 24) |
---|
| 248 | |
---|
| 249 | st.ab r7, [r3, 4] |
---|
| 250 | st.ab r9, [r3, 4] |
---|
| 251 | .Lcopy8bytes_3: |
---|
| 252 | |
---|
| 253 | #ifdef __BIG_ENDIAN__ |
---|
| 254 | lsr.nz r5, r5, 24 |
---|
| 255 | #endif |
---|
; Write back the carried byte
| 256 | stb.ab r5, [r3, 1] |
---|
| 257 | |
---|
| 258 | and.f lp_count, r2, 0x07 ;Last 0-7 bytes |
---|
| 259 | lpnz @.Lcopybytewise_3 |
---|
| 260 | ; LOOP START |
---|
| 261 | ldb.ab r6, [r1,1] |
---|
| 262 | stb.ab r6, [r3,1] |
---|
| 263 | .Lcopybytewise_3: |
---|
| 264 | j [blink] |
---|
| 265 | |
---|
| 266 | ENDFUNC (memcpy) |
---|
| 267 | |
---|
| 268 | #else |
---|
| 269 | |
---|
| 270 | ENTRY(memcpy) |
---|
;;; memcpy for builds where __ARC_ALIGNED_ACCESS__ is not defined.
;;; In: r0 = destination, r1 = source, r2 = byte count.
;;; r0 is never written, so it still holds the destination on return;
;;; r3 is the running write pointer and r1 the running read pointer.
;;; No alignment fix-up is done: the wide loads and stores are issued on
;;; the pointers exactly as passed in.
;;; NOTE(review): presumably this relies on the core handling unaligned
;;; word accesses in this configuration - confirm.
| 271 | prefetch [r1] ; Prefetch the read location |
---|
| 272 | prefetchw [r0] ; Prefetch the write location |
---|
| 273 | mov.f 0, r2 |
---|
| 274 | ;;; if size is zero, return (the mov below is the delay slot of jz.d) |
---|
| 275 | jz.d [blink] |
---|
| 276 | mov r3, r0 ; don't clobber ret val |
---|
| 277 | |
---|
| 278 | ;;; if size <= 8 |
---|
;;; The delay-slot mov.f loads lp_count with the size and sets the flags
;;; that the lpnz at .Lsmallchunk tests.
| 279 | cmp r2, 8 |
---|
| 280 | bls.d @.Lsmallchunk |
---|
| 281 | mov.f lp_count, r2 |
---|
| 282 | |
---|
| 283 | ;;; lp_count = len >> ZOLSHFT, four loads and four stores per iteration |
---|
| 284 | lsr.f lp_count, r2, ZOLSHFT |
---|
| 285 | lpnz @.Lcopyfast |
---|
| 286 | ;; LOOP START |
---|
| 287 | LOADX (r6, r1) |
---|
| 288 | PREFETCH_READ (r1) |
---|
| 289 | PREFETCH_WRITE (r3) |
---|
| 290 | LOADX (r8, r1) |
---|
| 291 | LOADX (r10, r1) |
---|
| 292 | LOADX (r4, r1) |
---|
| 293 | STOREX (r6, r3) |
---|
| 294 | STOREX (r8, r3) |
---|
| 295 | STOREX (r10, r3) |
---|
| 296 | STOREX (r4, r3) |
---|
| 297 | .Lcopyfast: |
---|
| 298 | |
---|
| 299 | #ifdef __ARC_LL64__ |
---|
| 300 | and r2, r2, ZOLAND ;Remaining 0-31 bytes |
---|
| 301 | lsr.f lp_count, r2, 3 ;Convert to 64-bit words. |
---|
| 302 | lpnz @.Lcopy64b |
---|
| 303 | ;; LOOP START |
---|
| 304 | ldd.ab r6,[r1,8] |
---|
| 305 | std.ab r6,[r3,8] |
---|
| 306 | .Lcopy64b: |
---|
| 307 | |
---|
| 308 | and.f lp_count, r2, 0x07 ; Last 0-7 bytes |
---|
| 309 | #else |
---|
| 310 | and.f lp_count, r2, ZOLAND |
---|
| 311 | #endif |
---|
| 312 | |
---|
| 313 | .Lsmallchunk: |
---|
;;; Byte-by-byte tail; also the entry point for copies of 8 bytes or fewer
| 314 | lpnz @.Lcopyremainingbytes |
---|
| 315 | ;; LOOP START |
---|
| 316 | ldb.ab r5, [r1,1] |
---|
| 317 | stb.ab r5, [r3,1] |
---|
| 318 | .Lcopyremainingbytes: |
---|
| 319 | |
---|
| 320 | j [blink] |
---|
| 321 | |
---|
| 322 | ENDFUNC(memcpy) |
---|
| 323 | #endif |
---|
| 324 | |
---|
| 325 | #endif /* __ARCHS__ */ |
---|
| 326 | |
---|
| 327 | #endif /* !__OPTIMIZE_SIZE__ && !PREFER_SIZE_OVER_SPEED */ |
---|