Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

strcpy.S @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago
add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 10.3 KB

Line
1	/*
2	strcpy/stpcpy - copy a string returning pointer to start/end.
3
4	Copyright (c) 2013, 2014, 2015 ARM Ltd.
5	All Rights Reserved.
6
7	Redistribution and use in source and binary forms, with or without
8	modification, are permitted provided that the following conditions are met:
9	* Redistributions of source code must retain the above copyright
10	notice, this list of conditions and the following disclaimer.
11	* Redistributions in binary form must reproduce the above copyright
12	notice, this list of conditions and the following disclaimer in the
13	documentation and/or other materials provided with the distribution.
14	* Neither the name of the company nor the names of its contributors
15	may be used to endorse or promote products derived from this
16	software without specific prior written permission.
17
18	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
29
30	#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
31	/* Size-optimized builds assemble nothing from this file. See strchr-stub.c (NOTE(review): this file is strcpy.S, so presumably strcpy-stub.c is meant - confirm). */
32	#else
33
34	/* Assumptions:
35	*
36	* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
37	*/
38
39	/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
40
41	To test the page crossing code path more thoroughly, compile with
42	-DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
43	entry path. This option is not intended for production use. */
44
45	/* Arguments and results. */
46	#define dstin x0 /* Destination pointer argument; also the register the result is returned in. */
47	#define srcin x1 /* Source string pointer argument. */
48
49	/* Locals and temporaries. */
50	#define src x2 /* Working source pointer. */
51	#define dst x3 /* Working destination pointer. */
52	#define data1 x4 /* First dword of the 16 bytes currently being examined. */
53	#define data1w w4 /* 32-bit view of data1. */
54	#define data2 x5 /* Second dword of the 16 bytes currently being examined. */
55	#define data2w w5 /* 32-bit view of data2. */
56	#define has_nul1 x6 /* NUL syndrome for data1: non-zero iff a zero byte was found. */
57	#define has_nul2 x7 /* NUL syndrome for data2: non-zero iff a zero byte was found. */
58	#define tmp1 x8 /* Scratch. */
59	#define tmp2 x9 /* Scratch. */
60	#define tmp3 x10 /* Scratch. */
61	#define tmp4 x11 /* Scratch. */
62	#define zeroones x12 /* Loaded with REP8_01 at function entry. */
63	#define data1a x13 /* Scratch copy used on the page-cross path. */
64	#define data2a x14 /* Scratch copy used on the page-cross path. */
65	#define pos x15 /* Bit offset of the terminating NUL, derived with clz. */
66	#define len x16 /* NOTE(review): not referenced anywhere in the visible code. */
67	#define to_align x17 /* srcin & 15 on entry; .Lbulk_entry later subtracts 16 from it. */
68
69	#ifdef BUILD_STPCPY
70	#define STRCPY stpcpy /* Entry symbol is stpcpy when BUILD_STPCPY is defined. */
71	#else
72	#define STRCPY strcpy /* Default entry symbol. */
73	#endif
74
75	.macro def_fn f p2align=0 /* Start a global function named by f; p2align defaults to 0. */
76	.text /* Place the function in the .text section. */
77	.p2align \p2align /* Align the entry point to 2^p2align bytes. */
78	.global \f /* Export the symbol. */
79	.type \f, %function /* Give the symbol function type. */
80	\f: /* Entry label; the body follows the macro invocation. */
81	.endm
82
83	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
84	(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
85	can be done in parallel across the entire word. */
86
87	#define REP8_01 0x0101010101010101 /* 0x01 in each of the eight bytes; the "1" subtracted per byte. */
88	#define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in each of the eight bytes; ORed in before the bit-clear. */
89	#define REP8_80 0x8080808080808080 /* 0x80 in each byte. NOTE(review): not referenced in the visible code. */
90
91	/* AArch64 systems have a minimum page size of 4k. We can do a quick
92	page size check for crossing this boundary on entry and if we
93	do not, then we can short-circuit much of the entry code. We
94	expect early page-crossing strings to be rare (probability of
95	16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
96	predictable, even with random strings.
97
98	We don't bother checking for larger page sizes, the cost of setting
99	up the correct page size is just not worth the extra gain from
100	a small reduction in the cases taking the slow path. Note that
101	we only care about whether the first fetch, which may be
102	misaligned, crosses a page boundary - after that we move to aligned
103	fetches for the remainder of the string. */
104
105	#ifdef STRCPY_TEST_PAGE_CROSS
106	/* Make everything that isn't Qword aligned look like a page cross. */
107	#define MIN_PAGE_P2 4 /* Test build: the entry check behaves as if pages were 16 bytes. */
108	#else
109	#define MIN_PAGE_P2 12 /* log2 of the 4k minimum page size. */
110	#endif
111
112	#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) /* Page size in bytes used by the entry check. */
113
114	def_fn STRCPY p2align=6 /* Function entry, aligned to 64 bytes. In: dstin, srcin. */
115	/* For moderately short strings, the fastest way to do the copy is to
116	calculate the length of the string in the same way as strlen, then
117	essentially do a memcpy of the result. This avoids the need for
118	multiple byte copies and further means that by the time we
119	reach the bulk copy loop we know we can always use DWord
120	accesses. We expect strcpy to rarely be called repeatedly
121	with the same source string, so branch prediction is likely to
122	always be difficult - we mitigate against this by preferring
123	conditional select operations over branches whenever this is
124	feasible. */
125	and tmp2, srcin, #(MIN_PAGE_SIZE - 1) /* tmp2 = offset of srcin within its MIN_PAGE_SIZE page. */
126	mov zeroones, #REP8_01 /* Constant used by every NUL test below. */
127	and to_align, srcin, #15 /* to_align = srcin modulo 16. */
128	cmp tmp2, #(MIN_PAGE_SIZE - 16) /* Sets the flags consumed by b.gt below. */
129	neg tmp1, to_align /* tmp1 = -to_align; only read on the .Lpage_cross path. Leaves the flags intact. */
130	/* The first fetch will straddle a (possible) page boundary iff
131	srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
132	aligned string will never fail the page align check, so will
133	always take the fast path. */
134	b.gt .Lpage_cross /* Taken when a 16-byte load from srcin would run past the end of the page. */
135
136	.Lpage_cross_ok: /* Fast path: the 16-byte load from srcin is safe. Also re-entered from .Lpage_cross. */
137	ldp data1, data2, [srcin] /* Load the first 16 bytes of the string. */
138	#ifdef __AARCH64EB__
139	/* Because we expect the end to be found within 16 characters
140	(profiling shows this is the most common case), it's worth
141	swapping the bytes now to save having to recalculate the
142	termination syndrome later. We preserve data1 and data2
143	so that we can re-use the values later on. */
144	rev tmp2, data1
145	sub tmp1, tmp2, zeroones
146	orr tmp2, tmp2, #REP8_7f
147	bics has_nul1, tmp1, tmp2 /* Syndrome of byte-swapped data1; also sets the flags. */
148	b.ne .Lfp_le8 /* NUL is within the first 8 bytes. */
149	rev tmp4, data2
150	sub tmp3, tmp4, zeroones
151	orr tmp4, tmp4, #REP8_7f
152	#else
153	sub tmp1, data1, zeroones /* X - 1 per byte. */
154	orr tmp2, data1, #REP8_7f /* X | 0x7f per byte. */
155	bics has_nul1, tmp1, tmp2 /* has_nul1 = (X - 1) & ~(X | 0x7f); also sets the flags. */
156	b.ne .Lfp_le8 /* NUL is within the first 8 bytes. */
157	sub tmp3, data2, zeroones
158	orr tmp4, data2, #REP8_7f
159	#endif
160	bics has_nul2, tmp3, tmp4 /* Syndrome for the second dword; also sets the flags. */
161	b.eq .Lbulk_entry /* No NUL in the first 16 bytes: use the bulk copy loop. */
162
163	/* The string is short (<=16 bytes). We don't know exactly how
164	short though, yet. Work out the exact length so that we can
165	quickly select the optimal copy strategy. */
166	.Lfp_gt8: /* In: data1, data2 = first 16 bytes; has_nul2 = syndrome locating the NUL in data2. */
167	rev has_nul2, has_nul2
168	clz pos, has_nul2 /* pos = 8 * (byte index of the first NUL within data2). */
169	mov tmp2, #56
170	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
171	sub pos, tmp2, pos /* Shift count that moves the NUL byte to the last byte of the stored dword. */
172	#ifdef __AARCH64EB__
173	lsr data2, data2, pos
174	#else
175	lsl data2, data2, pos
176	#endif
177	str data2, [dst, #1] /* Store 8 bytes whose last byte is the NUL. */
178	str data1, [dstin] /* Store the first 8 bytes; issued second so it overwrites any overlap with the store above. */
179	#ifdef BUILD_STPCPY
180	add dstin, dst, #8 /* stpcpy result: address of the NUL just written. */
181	#endif
182	ret
183
184	.Lfp_le8: /* In: data1 = first 8 bytes; has_nul1 = syndrome locating the NUL in data1. */
185	rev has_nul1, has_nul1
186	clz pos, has_nul1 /* pos = 8 * (byte index of the first NUL within data1). */
187	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
188	subs tmp2, pos, #24 /* Pos in bits. */
189	b.lt .Lfp_lt4 /* NUL is within the first three bytes. */
190	#ifdef __AARCH64EB__
191	mov tmp2, #56
192	sub pos, tmp2, pos
193	lsr data2, data1, pos
194	lsr data1, data1, #32
195	#else
196	lsr data2, data1, tmp2 /* Low word of data2 = the four bytes ending at the NUL. */
197	#endif
198	/* 4->7 bytes to copy. */
199	str data2w, [dst, #-3] /* Store the four bytes that end with the NUL. */
200	str data1w, [dstin] /* Store the first four bytes; the two stores may overlap. */
201	#ifdef BUILD_STPCPY
202	mov dstin, dst /* stpcpy result: dst is dstin plus the NUL's byte index. */
203	#endif
204	ret
205	.Lfp_lt4: /* In: pos = 8 * (NUL byte index), below 24; dst = dstin plus that byte index. */
206	cbz pos, .Lfp_lt2 /* pos == 0: the first byte is the NUL, so only the terminator is written. */
207	/* 2->3 bytes to copy. */
208	#ifdef __AARCH64EB__
209	lsr data1, data1, #48
210	#endif
211	strh data1w, [dstin] /* Store the first two bytes of the string. */
212	/* Fall-through, one byte (max) to go. */
213	.Lfp_lt2:
214	/* Null-terminated string. Last character must be zero! */
215	strb wzr, [dst] /* Write the terminating NUL at dst. */
216	#ifdef BUILD_STPCPY
217	mov dstin, dst /* stpcpy result: address of the NUL just written. */
218	#endif
219	ret
220
221	.p2align 6
222	/* Aligning here ensures that the entry code and main loop all lies
223	within one 64-byte cache line. */
224	.Lbulk_entry: /* In: data1, data2 = first 16 bytes, known to contain no NUL. */
225	sub to_align, to_align, #16 /* to_align = (srcin & 15) - 16, a negative byte count. */
226	stp data1, data2, [dstin] /* Copy the first 16 bytes. */
227	sub src, srcin, to_align /* src = srcin rounded down to 16 bytes, plus 16. */
228	sub dst, dstin, to_align /* dst advanced by the same amount as src. */
229	b .Lentry_no_page_cross /* Enter the loop at its load, skipping the store. */
230
231	/* The inner loop deals with two Dwords at a time. This has a
232	slightly higher start-up cost, but we should win quite quickly,
233	especially on cores with a high number of issue slots per
234	cycle, as we get much better parallelism out of the operations. */
235	.Lmain_loop:
236	stp data1, data2, [dst], #16 /* Store the previous NUL-free 16 bytes; dst += 16. */
237	.Lentry_no_page_cross:
238	ldp data1, data2, [src], #16 /* Load the next 16 bytes; src += 16. */
239	sub tmp1, data1, zeroones
240	orr tmp2, data1, #REP8_7f
241	sub tmp3, data2, zeroones
242	orr tmp4, data2, #REP8_7f
243	bic has_nul1, tmp1, tmp2 /* Syndrome for data1 (flags untouched). */
244	bics has_nul2, tmp3, tmp4 /* Syndrome for data2; Z set iff it is zero. */
245	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
246	b.eq .Lmain_loop /* Loop while both syndromes are zero. */
247
248	/* Since we know we are copying at least 16 bytes, the fastest way
249	to deal with the tail is to determine the location of the
250	trailing NUL, then (re)copy the 16 bytes leading up to that. */
251	cmp has_nul1, #0 /* ne: the NUL is in data1. These flags are consumed by the csel instructions below. */
252	#ifdef __AARCH64EB__
253	/* For big-endian, carry propagation (if the final byte in the
254	string is 0x01) means we cannot use has_nul directly. The
255	easiest way to get the correct byte is to byte-swap the data
256	and calculate the syndrome a second time. */
257	csel data1, data1, data2, ne
258	rev data1, data1
259	sub tmp1, data1, zeroones
260	orr tmp2, data1, #REP8_7f
261	bic has_nul1, tmp1, tmp2
262	#else
263	csel has_nul1, has_nul1, has_nul2, ne /* Keep the syndrome of the dword that holds the NUL. */
264	#endif
265	rev has_nul1, has_nul1
266	clz pos, has_nul1 /* pos = 8 * (NUL byte index within the selected dword). */
267	add tmp1, pos, #72 /* Candidate if the NUL is in data2: bits up to and including the NUL. */
268	add pos, pos, #8 /* Candidate if the NUL is in data1: bits up to and including the NUL. */
269	csel pos, pos, tmp1, ne /* Select using the flags from the cmp above. */
270	add src, src, pos, lsr #3 /* src was already advanced past this 16-byte chunk by the ldp. */
271	add dst, dst, pos, lsr #3 /* dst = one past where the NUL will be written. */
272	ldp data1, data2, [src, #-32] /* Reload the 16 source bytes that end with the NUL. */
273	stp data1, data2, [dst, #-16] /* Store them so that the NUL lands at dst - 1. */
274	#ifdef BUILD_STPCPY
275	sub dstin, dst, #1 /* stpcpy result: address of the NUL just written. */
276	#endif
277	ret
278
279	.Lpage_cross: /* In: tmp1 = -to_align, to_align = srcin & 15, zeroones = REP8_01. */
280	bic src, srcin, #15 /* src = srcin rounded down to a 16-byte boundary. */
281	/* Start by loading two words at [srcin & ~15], then forcing the
282	bytes that precede srcin to 0xff. This means they never look
283	like termination bytes. */
284	ldp data1, data2, [src]
285	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
286	tst to_align, #7
287	csetm tmp2, ne /* tmp2 = all ones if to_align is not a multiple of 8, else zero. */
288	#ifdef __AARCH64EB__
289	lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
290	#else
291	lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
292	#endif
293	orr data1, data1, tmp2 /* Candidate data1 with the partial-dword mask applied. */
294	orr data2a, data2, tmp2 /* Candidate data2 with the partial-dword mask applied. */
295	cmp to_align, #8
296	csinv data1, data1, xzr, lt /* to_align >= 8: all of data1 precedes srcin, so force it to all ones. */
297	csel data2, data2, data2a, lt /* to_align >= 8: use the masked data2; otherwise data2 is unchanged. */
298	sub tmp1, data1, zeroones
299	orr tmp2, data1, #REP8_7f
300	sub tmp3, data2, zeroones
301	orr tmp4, data2, #REP8_7f
302	bic has_nul1, tmp1, tmp2
303	bics has_nul2, tmp3, tmp4
304	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
305	b.eq .Lpage_cross_ok /* No NUL in the aligned 16 bytes: rejoin the fast path, which reloads from srcin. */
306	/* We now need to make data1 and data2 look like they've been
307	loaded directly from srcin. Do a rotate on the 128-bit value. */
308	lsl tmp1, to_align, #3 /* Bytes->bits. */
309	neg tmp2, to_align, lsl #3 /* Negated bit count, used as the complementary shift amount. */
310	#ifdef __AARCH64EB__
311	lsl data1a, data1, tmp1
312	lsr tmp4, data2, tmp2
313	lsl data2, data2, tmp1
314	orr tmp4, tmp4, data1a
315	cmp to_align, #8
316	csel data1, tmp4, data2, lt
317	rev tmp2, data1
318	rev tmp4, data2
319	sub tmp1, tmp2, zeroones
320	orr tmp2, tmp2, #REP8_7f
321	sub tmp3, tmp4, zeroones
322	orr tmp4, tmp4, #REP8_7f
323	#else
324	lsr data1a, data1, tmp1 /* Part of data1 at or after srcin, moved to the low end. */
325	lsl tmp4, data2, tmp2 /* Low part of data2, moved up to fill the rest of the first dword. */
326	lsr data2, data2, tmp1 /* Remaining part of data2, moved to the low end. */
327	orr tmp4, tmp4, data1a /* Combined first dword for the to_align < 8 case. */
328	cmp to_align, #8
329	csel data1, tmp4, data2, lt /* to_align >= 8: the first dword comes entirely from the shifted data2. */
330	sub tmp1, data1, zeroones
331	orr tmp2, data1, #REP8_7f
332	sub tmp3, data2, zeroones
333	orr tmp4, data2, #REP8_7f
334	#endif
335	bic has_nul1, tmp1, tmp2 /* Syndrome for the realigned first dword. */
336	cbnz has_nul1, .Lfp_le8 /* NUL within the first 8 bytes from srcin. */
337	bic has_nul2, tmp3, tmp4 /* Syndrome for the realigned second dword. */
338	b .Lfp_gt8
339
340	.size STRCPY, . - STRCPY /* Record the function's size for the symbol. */
341	#endif /* Closes the #if/#else on __OPTIMIZE_SIZE__ / PREFER_SIZE_OVER_SPEED. */

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format