[444] | 1 | #include "setarch.h" |
---|
| 2 | |
---|
| 3 | #include "defines.h" |
---|
| 4 | |
---|
| 5 | #ifdef __H8300SX__ |
---|
| 6 | |
---|
; memcpy, H8SX version (selected by __H8300SX__).
;
; Register use, as read from the code below:
;   er0    destination pointer on entry; never written here, so it is
;          still intact when the routine returns
;   er1    source pointer on entry
;   (e)r2  number of bytes to copy
;   er4    element count handed to movmd
;   er5    running source pointer
;   er6    running destination pointer
; er4-er6 are pushed on entry and popped again by every rts/l exit.
; The LEN macro is supplied by defines.h; it appears to pick the register
; width that matches size_t (r2 vs. er2) - confirm against that header.
	.global	_memcpy
_memcpy:
	stm.l	er4-er6,@-er7

	; Load the pointer registers used by movmd:
	; er6 = destination, er5 = source.
	mov.l	er0,er6
	mov.l	er1,er5

	; Copies shorter than 16 bytes go straight to the bytewise code.
	; The movmd.l path is able to cope with anything over 6 bytes, but
	; its set-up checks can cost more than a plain movmd.b on short
	; copies, hence the larger cut-off.
	;
	; How expensive those checks are depends on the alignment of the
	; length and of both pointers: they are cheaper when
	; er0 & 3 == er1 & 3 == er2 & 3, and cheapest when all three are 0.
	; The value 16 is a compromise across those cases.
	cmp	#16,LEN(r2)
	blo	bytewise_copy

	; movmd.l needs even addresses.  Carry = bit 0 of the source XOR
	; bit 0 of the destination; if exactly one pointer is odd they can
	; never both be made even, so use the bytewise code instead.
	bld	#0,r5l
	bxor	#0,r6l
	bcs	bytewise_copy

	; Both pointers have the same parity here.  If they are odd, copy
	; a single byte so that both become even.
	bld	#0,r5l
	bcc	ptrs_even
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

ptrs_even:
	; If bit 1 of the destination is set, copying one word makes the
	; destination longword aligned.  This only really pays off when it
	; aligns the source too, but it is no worse when it does not, so
	; an extra "band" test is not worth its overhead.
	bld	#1,r6l
	bcc	longword_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

longword_copy:
	; (e)r4 = remaining length / 4 = number of whole longwords.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; Pointers and size_t are 16 bits wide, so a single movmd.l covers
	; the whole count.  r4 cannot be zero at this point.
	movmd.l
	; Keep only the 0-3 bytes that did not fit in a longword and fall
	; through to the bytewise code.
	and.w	#3,r2
bytewise_copy:
	; Copy r2 bytes; skip the movmd.b altogether when r2 is zero.
	mov.w	r2,r4
	beq	copy_done
	movmd.b
copy_done:
	rts/l	er4-er6
#else
	; When the low 16 bits of the longword count are zero (the count
	; is a multiple of 0x10000), skip the first movmd.l.
	mov.w	r4,r4
	beq	longword_loop_next

	; The first pass copies r4 (non-zero) longwords; every later pass
	; copies 65536 longwords.  e4 counts the passes still to do.
longword_loop:
	movmd.l
longword_loop_next:
	sub.w	#1,e4
	bhs	longword_loop

	; Copy the 0-3 left-over bytes.  Falling through to the bytewise
	; code after the "and" would also work, but this dedicated tail is
	; quicker and costs only 10 more bytes.
	and.w	#3,r2
	beq	copy_done
	mov.w	r2,r4
	movmd.b
copy_done:
	rts/l	er4-er6

bytewise_copy:
	; Plain byte-by-byte copy; it has to cope with every length, zero
	; included.  r2 supplies the count for the first movmd.b and e2
	; the number of further passes.
	mov.w	r2,r4
	beq	bytewise_loop_next
bytewise_loop:
	movmd.b
bytewise_loop_next:
	sub.w	#1,e2
	bhs	bytewise_loop
	rts/l	er4-er6
#endif
| 100 | |
---|
| 101 | #else |
---|
| 102 | |
---|
; memcpy, generic version (used when __H8300SX__ is not defined).
;
; The A0P/A1P/A2P/A3P register names and the MOVP/ADDP/CMPP operations
; are macros from the included headers and are not visible in this file.
; Going by the commented-out loads below: A0P = dst, A1P = src, A2P = len.
; The copy runs backwards, from the last element down to the first, so
; A0P is equal to the original dst again when the routine returns.
	.global	_memcpy
_memcpy:
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	MOVP	A0P,A3P		; A3P = dst, the address at which to stop
	ADDP	A2P,A0P		; A0P = dst + len (one past the end)
	CMPP	A0P,A3P		; is len zero?
	beq	copy_done	; yes: nothing to copy

	ADDP	A2P,A1P		; A1P = src + len (one past the end)

	; Word copies are possible only if every value involved is even.
	; A2L already holds the low part of len; OR in the low parts of
	; the three addresses and test bit 0 of the result.
	or	A0L,A2L		; dst + len
	or	A3L,A2L		; dst
	or	A1L,A2L		; src + len
	btst	#0,A2L		; is any of them odd?
	bne	copy_bytes	; yes: copy byte by byte

copy_words:
#ifdef __NORMAL_MODE__
	sub	#2,A1P		; step src back one word
#else
	subs	#2,A1P		; step src back one word
#endif
	mov.w	@A1P,A2		; load the word (A2 is scratch from here on)
	mov.w	A2,@-A0P	; pre-decrement dst and store the word
	CMPP	A0P,A3P		; back at the start of dst?
	bne	copy_words
	rts

copy_bytes:
#ifdef __NORMAL_MODE__
	sub	#1,A1P		; step src back one byte
#else
	subs	#1,A1P		; step src back one byte
#endif
	mov.b	@A1P,A2L	; load the byte
	mov.b	A2L,@-A0P	; pre-decrement dst and store the byte
	CMPP	A0P,A3P		; back at the start of dst?
	bne	copy_bytes

	; A0P is back at dst here; that is the value handed back.
copy_done:
	rts
| 148 | |
---|
| 149 | #endif |
---|