Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

strncmp.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago
add newlib, libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 6.9 KB

Line
1	/* Copyright (c) 2013, Linaro Limited
2	All rights reserved.
3
4	Redistribution and use in source and binary forms, with or without
5	modification, are permitted provided that the following conditions are met:
6	* Redistributions of source code must retain the above copyright
7	notice, this list of conditions and the following disclaimer.
8	* Redistributions in binary form must reproduce the above copyright
9	notice, this list of conditions and the following disclaimer in the
10	documentation and/or other materials provided with the distribution.
11	* Neither the name of the Linaro nor the
12	names of its contributors may be used to endorse or promote products
13	derived from this software without specific prior written permission.
14
15	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27	#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
28	/* Size-optimised builds assemble nothing from this file. See strcmp-stub.c. NOTE(review): this file defines strncmp, so strncmp-stub.c may be the intended reference - confirm. */
29	#else
30
31	/* Assumptions:
32	*
33	* ARMv8-a, AArch64
34	*/
35	/* def_fn f [p2align]: open a global function symbol f in .text, aligned to 2^p2align bytes (default 2^0, i.e. no extra alignment). */
36	.macro def_fn f p2align=0
37	.text
38	.p2align \p2align
39	.global \f
40	.type \f, %function
41	\f:
42	.endm
43
44	#define REP8_01 0x0101010101010101	/* 0x01 in every byte; loaded into zeroones for the NUL test. */
45	#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f in every byte; ORed with the data in the NUL test. */
46	#define REP8_80 0x8080808080808080	/* 0x80 in every byte; not referenced by the code in this file. */
47
48	/* Parameters and result. */
49	#define src1 x0	/* Pointer to the first string; advanced as it is read. */
50	#define src2 x1	/* Pointer to the second string; advanced as it is read. */
51	#define limit x2	/* Maximum number of bytes to compare. */
52	#define result x0	/* Return value; shares x0 with src1. */
53
54	/* Internal variables. */
55	#define data1 x3	/* Current Dword (or byte) loaded from src1. */
56	#define data1w w3	/* 32-bit view of data1, used for byte loads. */
57	#define data2 x4	/* Current Dword (or byte) loaded from src2. */
58	#define data2w w4	/* 32-bit view of data2, used for byte loads. */
59	#define has_nul x5	/* Non-zero if data1 contains a NUL byte. */
60	#define diff x6	/* data1 EOR data2; non-zero if the Dwords differ. */
61	#define syndrome x7	/* diff ORed with has_nul; locates the deciding byte. */
62	#define tmp1 x8	/* Scratch. */
63	#define tmp2 x9	/* Scratch. */
64	#define tmp3 x10	/* Scratch. */
65	#define zeroones x11	/* Holds REP8_01. */
66	#define pos x12	/* Leading-zero count of the syndrome. */
67	#define limit_wd x13	/* Remaining Dword count derived from limit. */
68	#define mask x14	/* Selects the bytes of the last Dword that lie beyond the limit. */
69	#define endloop x15	/* Non-zero when the aligned loop must stop. */
70	/* strncmp: compare the strings at src1 and src2 over at most limit bytes. result is 0 when no difference is found within limit bytes or before a NUL, otherwise its sign orders the first differing bytes. */
71	.text
72	.p2align 6	/* Align to a 64-byte boundary. */
73	.rep 7
74	nop /* Pad so that the loop below fits a cache line. */
75	.endr
76	def_fn strncmp
77	cbz limit, .Lret0	/* Nothing to compare: return 0. */
78	eor tmp1, src1, src2
79	mov zeroones, #REP8_01
80	tst tmp1, #7	/* Do the two pointers differ in their low 3 bits? */
81	b.ne .Lmisaligned8	/* Yes: use the byte-by-byte loop. */
82	ands tmp1, src1, #7	/* tmp1 = offset of src1 (and src2) within its Dword. */
83	b.ne .Lmutual_align	/* Same non-zero offset in both: realign first. */
84	/* Calculate the number of full and partial words -1. */
85	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
86	lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
87
88	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
89	(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
90	can be done in parallel across the entire word. */
91	/* Start of performance-critical section -- one 64B cache line. */
92	.Lloop_aligned:
93	ldr data1, [src1], #8	/* Load a Dword and post-increment the pointer. */
94	ldr data2, [src2], #8
95	.Lstart_realigned:
96	subs limit_wd, limit_wd, #1	/* Goes negative once the last Dword has been loaded. */
97	sub tmp1, data1, zeroones	/* X - 1 in every byte. */
98	orr tmp2, data1, #REP8_7f	/* X | 0x7f in every byte. */
99	eor diff, data1, data2 /* Non-zero if differences found. */
100	csinv endloop, diff, xzr, pl /* diff while Dwords remain, all ones on the last Dword. */
101	bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
102	ccmp endloop, #0, #0, eq	/* No NUL: test endloop against zero. Otherwise NZCV = 0b0000 (ne). */
103	b.eq .Lloop_aligned	/* Loop while there is no NUL, no difference and the limit is not reached. */
104	/* End of performance-critical section -- one 64B cache line. */
105
106	/* If the limit has not been reached, we must have found the end of a string or a difference. */
107	tbz limit_wd, #63, .Lnot_limit
108
109	/* Limit % 8 == 0 => all bytes significant. */
110	ands limit, limit, #7
111	b.eq .Lnot_limit
112	/* Only limit % 8 bytes of this last Dword count: mask off the rest. */
113	lsl limit, limit, #3 /* Bytes -> bits. */
114	mov mask, #~0
115	#ifdef __AARCH64EB__
116	lsr mask, mask, limit
117	#else
118	lsl mask, mask, limit
119	#endif
120	bic data1, data1, mask	/* Clear the bytes that lie beyond the limit. */
121	bic data2, data2, mask
122
123	/* Make sure that the NUL byte is marked in the syndrome. */
124	orr has_nul, has_nul, mask
125
126	.Lnot_limit:
127	orr syndrome, diff, has_nul
128
129	#ifndef __AARCH64EB__
130	rev syndrome, syndrome	/* Byte-reverse so that the first string byte becomes the most significant. */
131	rev data1, data1
132	/* The MS-non-zero bit of the syndrome marks either the first bit
133	that is different, or the top bit of the first zero byte.
134	Shifting left now will bring the critical information into the
135	top bits. */
136	clz pos, syndrome
137	rev data2, data2
138	lsl data1, data1, pos
139	lsl data2, data2, pos
140	/* But we need to zero-extend (char is unsigned) the value and then
141	perform a signed 32-bit subtraction. */
142	lsr data1, data1, #56
143	sub result, data1, data2, lsr #56
144	ret
145	#else
146	/* For big-endian we cannot use the trick with the syndrome value
147	as carry-propagation can corrupt the upper bits if the trailing
148	bytes in the string contain 0x01. */
149	/* However, if there is no NUL byte in the dword, we can generate
150	the result directly. We can't just subtract the bytes as the
151	MSB might be significant. */
152	cbnz has_nul, 1f
153	cmp data1, data2
154	cset result, ne	/* result = 1 if the Dwords differ, else 0. */
155	cneg result, result, lo	/* Negate when data1 is the lower (unsigned) value. */
156	ret
157	1:
158	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
159	rev tmp3, data1
160	sub tmp1, tmp3, zeroones
161	orr tmp2, tmp3, #REP8_7f
162	bic has_nul, tmp1, tmp2
163	rev has_nul, has_nul
164	orr syndrome, diff, has_nul
165	clz pos, syndrome
166	/* The MS-non-zero bit of the syndrome marks either the first bit
167	that is different, or the top bit of the first zero byte.
168	Shifting left now will bring the critical information into the
169	top bits. */
170	lsl data1, data1, pos
171	lsl data2, data2, pos
172	/* But we need to zero-extend (char is unsigned) the value and then
173	perform a signed 32-bit subtraction. */
174	lsr data1, data1, #56
175	sub result, data1, data2, lsr #56
176	ret
177	#endif
178
179	.Lmutual_align:
180	/* Sources are mutually aligned, but are not currently at an
181	alignment boundary. Round down the addresses and then mask off
182	the bytes that precede the start point.
183	We also need to adjust the limit calculations, but without
184	overflowing if the limit is near ULONG_MAX. */
185	bic src1, src1, #7
186	bic src2, src2, #7
187	ldr data1, [src1], #8
188	neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
189	ldr data2, [src2], #8
190	mov tmp2, #~0
191	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
192	#ifdef __AARCH64EB__
193	/* Big-endian. Early bytes are at MSB. */
194	lsl tmp2, tmp2, tmp3 /* Shift by (tmp3 & 63) = 64 - 8 * tmp1 bits. */
195	#else
196	/* Little-endian. Early bytes are at LSB. */
197	lsr tmp2, tmp2, tmp3 /* Shift by (tmp3 & 63) = 64 - 8 * tmp1 bits. */
198	#endif
199	and tmp3, limit_wd, #7	/* tmp3 = (limit - 1) % 8. */
200	lsr limit_wd, limit_wd, #3	/* limit_wd = (limit - 1) / 8. */
201	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
202	add limit, limit, tmp1
203	add tmp3, tmp3, tmp1
204	orr data1, data1, tmp2	/* Force the bytes before the start point to 0xff in both Dwords, */
205	orr data2, data2, tmp2	/* so they compare equal and are never seen as NUL. */
206	add limit_wd, limit_wd, tmp3, lsr #3	/* One more Dword if the offset pushes the last byte into the next one. */
207	b .Lstart_realigned
208
209	.Lret0:
210	mov result, #0
211	ret
212
213	.p2align 6
214	.Lmisaligned8:
215	sub limit, limit, #1	/* Pre-decrement so that the subs below borrows on the last permitted byte. */
216	1:
217	/* Perhaps we can do better than this. */
218	ldrb data1w, [src1], #1
219	ldrb data2w, [src2], #1
220	subs limit, limit, #1	/* C is cleared on the last permitted byte. */
221	ccmp data1w, #1, #0, cs /* If bytes remain, C = (byte from src1 is not NUL), else NZCV = 0b0000. */
222	ccmp data1w, data2w, #0, cs /* If still going, Z = (bytes equal), else NZCV = 0b0000. */
223	b.eq 1b
224	sub result, data1, data2	/* Difference of the last pair of bytes loaded. */
225	ret
226	.size strncmp, . - strncmp
227
228	#endif /* !(__OPTIMIZE_SIZE__ || PREFER_SIZE_OVER_SPEED) */

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format