1 | /* Copyright (c) 2010-2011, Linaro Limited |
---|
2 | All rights reserved. |
---|
3 | |
---|
4 | Redistribution and use in source and binary forms, with or without |
---|
5 | modification, are permitted provided that the following conditions |
---|
6 | are met: |
---|
7 | |
---|
8 | * Redistributions of source code must retain the above copyright |
---|
9 | notice, this list of conditions and the following disclaimer. |
---|
10 | |
---|
11 | * Redistributions in binary form must reproduce the above copyright |
---|
12 | notice, this list of conditions and the following disclaimer in the |
---|
13 | documentation and/or other materials provided with the distribution. |
---|
14 | |
---|
15 | * Neither the name of Linaro Limited nor the names of its |
---|
16 | contributors may be used to endorse or promote products derived |
---|
17 | from this software without specific prior written permission. |
---|
18 | |
---|
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
---|
20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
---|
21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
---|
22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
---|
23 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
---|
25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
---|
26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
---|
27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
---|
28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
---|
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
30 | |
---|
31 | Written by Dave Gilbert <david.gilbert@linaro.org> |
---|
32 | |
---|
33 | This memchr routine is optimised on a Cortex-A9 and should work on |
---|
34 | all ARMv7 processors. It has a fast path for short sizes, and has |
---|
35 | an optimised path for large data sets; the worst case is finding the |
---|
36 | match early in a large data set. */ |
---|
37 | |
---|
38 | /* Copyright (c) 2015 ARM Ltd. |
---|
39 | All rights reserved. |
---|
40 | |
---|
41 | Redistribution and use in source and binary forms, with or without |
---|
42 | modification, are permitted provided that the following conditions are met: |
---|
43 | * Redistributions of source code must retain the above copyright |
---|
44 | notice, this list of conditions and the following disclaimer. |
---|
45 | * Redistributions in binary form must reproduce the above copyright |
---|
46 | notice, this list of conditions and the following disclaimer in the |
---|
47 | documentation and/or other materials provided with the distribution. |
---|
48 | * Neither the name of the Linaro nor the |
---|
49 | names of its contributors may be used to endorse or promote products |
---|
50 | derived from this software without specific prior written permission. |
---|
51 | |
---|
52 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
---|
53 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
---|
54 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
---|
55 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
---|
56 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
57 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
---|
58 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
---|
59 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
---|
60 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
---|
61 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
---|
62 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ |
---|
63 | |
---|
@ 2011-02-07 david.gilbert@linaro.org
@     Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@     Import endianness fix from local git ea786f1b
@ 2011-10-11 david.gilbert@linaro.org
@     Import from cortex-strings bzr rev 63
@     Flip to ldrd (as suggested by Greta Yorsh)
@     Make conditional on CPU type
@     tidy

@ This code requires ARMv6T2 or later.  Uses Thumb2.
75 | |
---|
76 | .syntax unified |
---|
77 | |
---|
78 | #include "acle-compat.h" |
---|
79 | |
---|
@ NOTE: This ifdef MUST match the one in memchr-stub.c
81 | #if defined (__ARM_NEON__) || defined (__ARM_NEON) |
---|
82 | .arch armv7-a |
---|
83 | .fpu neon |
---|
84 | |
---|
85 | |
---|
86 | /* Arguments */ |
---|
87 | #define srcin r0 |
---|
88 | #define chrin r1 |
---|
89 | #define cntin r2 |
---|
90 | |
---|
91 | /* Retval */ |
---|
92 | #define result r0 /* Live range does not overlap with srcin */ |
---|
93 | |
---|
94 | /* Working registers */ |
---|
95 | #define src r1 /* Live range does not overlap with chrin */ |
---|
96 | #define tmp r3 |
---|
97 | #define synd r0 /* No overlap with srcin or result */ |
---|
98 | #define soff r12 |
---|
99 | |
---|
100 | /* Working NEON registers */ |
---|
101 | #define vrepchr q0 |
---|
102 | #define vdata0 q1 |
---|
103 | #define vdata0_0 d2 /* Lower half of vdata0 */ |
---|
104 | #define vdata0_1 d3 /* Upper half of vdata0 */ |
---|
105 | #define vdata1 q2 |
---|
106 | #define vdata1_0 d4 /* Lower half of vhas_chr0 */ |
---|
107 | #define vdata1_1 d5 /* Upper half of vhas_chr0 */ |
---|
108 | #define vrepmask q3 |
---|
109 | #define vrepmask0 d6 |
---|
110 | #define vrepmask1 d7 |
---|
111 | #define vend q4 |
---|
112 | #define vend0 d8 |
---|
113 | #define vend1 d9 |
---|
114 | |
---|
/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit
 * per byte.  Each bit is set if the relevant byte matched the requested
 * character and cleared otherwise.  Since the bits in the syndrome reflect
 * exactly the order in which things occur in the original string, counting
 * trailing zeros allows us to identify exactly which byte has matched.
 */
124 | |
---|
125 | .text |
---|
126 | .thumb_func |
---|
127 | .align 4 |
---|
128 | .p2align 4,,15 |
---|
129 | .global memchr |
---|
130 | .type memchr,%function |
---|
131 | |
---|
132 | memchr: |
---|
133 | .cfi_sections .debug_frame |
---|
134 | .cfi_startproc |
---|
135 | /* Use a simple loop if there are less than 8 bytes to search. */ |
---|
136 | cmp cntin, #7 |
---|
137 | bhi .Llargestr |
---|
138 | and chrin, chrin, #0xff |
---|
139 | |
---|
140 | .Lsmallstr: |
---|
141 | subs cntin, cntin, #1 |
---|
142 | blo .Lnotfound /* Return not found if reached end. */ |
---|
143 | ldrb tmp, [srcin], #1 |
---|
144 | cmp tmp, chrin |
---|
145 | bne .Lsmallstr /* Loop again if not found. */ |
---|
146 | /* Otherwise fixup address and return. */ |
---|
147 | sub result, result, #1 |
---|
148 | bx lr |
---|
149 | |
---|
150 | |
---|
151 | .Llargestr: |
---|
152 | vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ |
---|
153 | /* |
---|
154 | * Magic constant 0x8040201008040201 allows us to identify which lane |
---|
155 | * matches the requested byte. |
---|
156 | */ |
---|
157 | movw tmp, #0x0201 |
---|
158 | movt tmp, #0x0804 |
---|
159 | lsl soff, tmp, #4 |
---|
160 | vmov vrepmask0, tmp, soff |
---|
161 | vmov vrepmask1, tmp, soff |
---|
162 | /* Work with aligned 32-byte chunks */ |
---|
163 | bic src, srcin, #31 |
---|
164 | ands soff, srcin, #31 |
---|
165 | beq .Lloopintro /* Go straight to main loop if it's aligned. */ |
---|
166 | |
---|
167 | /* |
---|
168 | * Input string is not 32-byte aligned. We calculate the syndrome |
---|
169 | * value for the aligned 32 bytes block containing the first bytes |
---|
170 | * and mask the irrelevant part. |
---|
171 | */ |
---|
172 | vld1.8 {vdata0, vdata1}, [src:256]! |
---|
173 | sub tmp, soff, #32 |
---|
174 | adds cntin, cntin, tmp |
---|
175 | vceq.i8 vdata0, vdata0, vrepchr |
---|
176 | vceq.i8 vdata1, vdata1, vrepchr |
---|
177 | vand vdata0, vdata0, vrepmask |
---|
178 | vand vdata1, vdata1, vrepmask |
---|
179 | vpadd.i8 vdata0_0, vdata0_0, vdata0_1 |
---|
180 | vpadd.i8 vdata1_0, vdata1_0, vdata1_1 |
---|
181 | vpadd.i8 vdata0_0, vdata0_0, vdata1_0 |
---|
182 | vpadd.i8 vdata0_0, vdata0_0, vdata0_0 |
---|
183 | vmov synd, vdata0_0[0] |
---|
184 | |
---|
185 | /* Clear the soff lower bits */ |
---|
186 | lsr synd, synd, soff |
---|
187 | lsl synd, synd, soff |
---|
188 | /* The first block can also be the last */ |
---|
189 | bls .Lmasklast |
---|
190 | /* Have we found something already? */ |
---|
191 | cbnz synd, .Ltail |
---|
192 | |
---|
193 | |
---|
194 | .Lloopintro: |
---|
195 | vpush {vend} |
---|
196 | /* 264/265 correspond to d8/d9 for q4 */ |
---|
197 | .cfi_adjust_cfa_offset 16 |
---|
198 | .cfi_rel_offset 264, 0 |
---|
199 | .cfi_rel_offset 265, 8 |
---|
200 | .p2align 3,,7 |
---|
201 | .Lloop: |
---|
202 | vld1.8 {vdata0, vdata1}, [src:256]! |
---|
203 | subs cntin, cntin, #32 |
---|
204 | vceq.i8 vdata0, vdata0, vrepchr |
---|
205 | vceq.i8 vdata1, vdata1, vrepchr |
---|
206 | /* If we're out of data we finish regardless of the result. */ |
---|
207 | bls .Lend |
---|
208 | /* Use a fast check for the termination condition. */ |
---|
209 | vorr vend, vdata0, vdata1 |
---|
210 | vorr vend0, vend0, vend1 |
---|
211 | vmov synd, tmp, vend0 |
---|
212 | orrs synd, synd, tmp |
---|
213 | /* We're not out of data, loop if we haven't found the character. */ |
---|
214 | beq .Lloop |
---|
215 | |
---|
216 | .Lend: |
---|
217 | vpop {vend} |
---|
218 | .cfi_adjust_cfa_offset -16 |
---|
219 | .cfi_restore 264 |
---|
220 | .cfi_restore 265 |
---|
221 | |
---|
222 | /* Termination condition found, let's calculate the syndrome value. */ |
---|
223 | vand vdata0, vdata0, vrepmask |
---|
224 | vand vdata1, vdata1, vrepmask |
---|
225 | vpadd.i8 vdata0_0, vdata0_0, vdata0_1 |
---|
226 | vpadd.i8 vdata1_0, vdata1_0, vdata1_1 |
---|
227 | vpadd.i8 vdata0_0, vdata0_0, vdata1_0 |
---|
228 | vpadd.i8 vdata0_0, vdata0_0, vdata0_0 |
---|
229 | vmov synd, vdata0_0[0] |
---|
230 | cbz synd, .Lnotfound |
---|
231 | bhi .Ltail |
---|
232 | |
---|
233 | |
---|
234 | .Lmasklast: |
---|
235 | /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ |
---|
236 | neg cntin, cntin |
---|
237 | lsl synd, synd, cntin |
---|
238 | lsrs synd, synd, cntin |
---|
239 | it eq |
---|
240 | moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ |
---|
241 | |
---|
242 | |
---|
243 | .Ltail: |
---|
244 | /* Count the trailing zeros using bit reversing */ |
---|
245 | rbit synd, synd |
---|
246 | /* Compensate the last post-increment */ |
---|
247 | sub src, src, #32 |
---|
248 | /* Count the leading zeros */ |
---|
249 | clz synd, synd |
---|
250 | /* Compute the potential result and return */ |
---|
251 | add result, src, synd |
---|
252 | bx lr |
---|
253 | |
---|
254 | |
---|
255 | .Lnotfound: |
---|
256 | /* Set result to NULL if not found and return */ |
---|
257 | mov result, #0 |
---|
258 | bx lr |
---|
259 | |
---|
260 | .cfi_endproc |
---|
261 | .size memchr, . - memchr |
---|
262 | |
---|
263 | #elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP) |
---|
264 | |
---|
265 | #if __ARM_ARCH_PROFILE == 'M' |
---|
266 | .arch armv7e-m |
---|
267 | #else |
---|
268 | .arch armv6t2 |
---|
269 | #endif |
---|
270 | |
---|
271 | @ this lets us check a flag in a 00/ff byte easily in either endianness |
---|
272 | #ifdef __ARMEB__ |
---|
273 | #define CHARTSTMASK(c) 1<<(31-(c*8)) |
---|
274 | #else |
---|
275 | #define CHARTSTMASK(c) 1<<(c*8) |
---|
276 | #endif |
---|
277 | .text |
---|
278 | .thumb |
---|
279 | |
---|
280 | @ --------------------------------------------------------------------------- |
---|
281 | .thumb_func |
---|
282 | .align 2 |
---|
283 | .p2align 4,,15 |
---|
284 | .global memchr |
---|
285 | .type memchr,%function |
---|
286 | memchr: |
---|
287 | @ r0 = start of memory to scan |
---|
288 | @ r1 = character to look for |
---|
289 | @ r2 = length |
---|
290 | @ returns r0 = pointer to character or NULL if not found |
---|
291 | and r1,r1,#0xff @ Don't trust the caller to pass a char |
---|
292 | |
---|
293 | cmp r2,#16 @ If short don't bother with anything clever |
---|
294 | blt 20f |
---|
295 | |
---|
296 | tst r0, #7 @ If it's already aligned skip the next bit |
---|
297 | beq 10f |
---|
298 | |
---|
299 | @ Work up to an aligned point |
---|
300 | 5: |
---|
301 | ldrb r3, [r0],#1 |
---|
302 | subs r2, r2, #1 |
---|
303 | cmp r3, r1 |
---|
304 | beq 50f @ If it matches exit found |
---|
305 | tst r0, #7 |
---|
306 | cbz r2, 40f @ If we run off the end, exit not found |
---|
307 | bne 5b @ If not aligned yet then do next byte |
---|
308 | |
---|
309 | 10: |
---|
310 | @ We are aligned, we know we have at least 8 bytes to work with |
---|
311 | push {r4,r5,r6,r7} |
---|
312 | orr r1, r1, r1, lsl #8 @ expand the match word across all bytes |
---|
313 | orr r1, r1, r1, lsl #16 |
---|
314 | bic r4, r2, #7 @ Number of double words to work with * 8 |
---|
315 | mvns r7, #0 @ all F's |
---|
316 | movs r3, #0 |
---|
317 | |
---|
318 | 15: |
---|
319 | ldrd r5,r6,[r0],#8 |
---|
320 | subs r4, r4, #8 |
---|
321 | eor r5,r5, r1 @ r5,r6 have 00's where bytes match the target |
---|
322 | eor r6,r6, r1 |
---|
323 | uadd8 r5, r5, r7 @ Par add 0xff - sets GE bits for bytes!=0 |
---|
324 | sel r5, r3, r7 @ bytes are 00 for none-00 bytes, |
---|
325 | @ or ff for 00 bytes - NOTE INVERSION |
---|
326 | uadd8 r6, r6, r7 @ Par add 0xff - sets GE bits for bytes!=0 |
---|
327 | sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes |
---|
328 | @ or ff for 00 bytes - NOTE INVERSION |
---|
329 | cbnz r6, 60f |
---|
330 | bne 15b @ (Flags from the subs above) |
---|
331 | |
---|
332 | pop {r4,r5,r6,r7} |
---|
333 | and r1,r1,#0xff @ r1 back to a single character |
---|
334 | and r2,r2,#7 @ Leave the count remaining as the number |
---|
335 | @ after the double words have been done |
---|
336 | |
---|
337 | 20: |
---|
338 | cbz r2, 40f @ 0 length or hit the end already then not found |
---|
339 | |
---|
340 | 21: @ Post aligned section, or just a short call |
---|
341 | ldrb r3,[r0],#1 |
---|
342 | subs r2,r2,#1 |
---|
343 | eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub |
---|
344 | cbz r3, 50f |
---|
345 | bne 21b @ on r2 flags |
---|
346 | |
---|
347 | 40: |
---|
348 | movs r0,#0 @ not found |
---|
349 | bx lr |
---|
350 | |
---|
351 | 50: |
---|
352 | subs r0,r0,#1 @ found |
---|
353 | bx lr |
---|
354 | |
---|
355 | 60: @ We're here because the fast path found a hit |
---|
356 | @ now we have to track down exactly which word it was |
---|
357 | @ r0 points to the start of the double word after the one tested |
---|
358 | @ r5 has the 00/ff pattern for the first word, r6 has the chained value |
---|
359 | cmp r5, #0 |
---|
360 | itte eq |
---|
361 | moveq r5, r6 @ the end is in the 2nd word |
---|
362 | subeq r0,r0,#3 @ Points to 2nd byte of 2nd word |
---|
363 | subne r0,r0,#7 @ or 2nd byte of 1st word |
---|
364 | |
---|
365 | @ r0 currently points to the 2nd byte of the word containing the hit |
---|
366 | tst r5, # CHARTSTMASK(0) @ 1st character |
---|
367 | bne 61f |
---|
368 | adds r0,r0,#1 |
---|
369 | tst r5, # CHARTSTMASK(1) @ 2nd character |
---|
370 | ittt eq |
---|
371 | addeq r0,r0,#1 |
---|
372 | tsteq r5, # (3<<15) @ 2nd & 3rd character |
---|
373 | @ If not the 3rd must be the last one |
---|
374 | addeq r0,r0,#1 |
---|
375 | |
---|
376 | 61: |
---|
377 | pop {r4,r5,r6,r7} |
---|
378 | subs r0,r0,#1 |
---|
379 | bx lr |
---|
380 | #else |
---|
381 | /* Defined in memchr-stub.c. */ |
---|
382 | #endif |
---|