/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#include "assym.inc"

	.syntax	unified

/*
 * Literal pool: PC-relative words holding the addresses of the optional
 * hardware-assisted copy/zero hook pointers (_arm_memcpy/_arm_bzero) and
 * the minimum-length thresholds below which the hooks are not worth using.
 * The hook pointers are NULL when no accelerated implementation is installed.
 */
.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry (memset/do_memset; for bzero below: r0 = dest, r1 = length):
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero	/* r3 = &_arm_bzero (hook pointer) */
	ldr	r3, [r3]
	cmp	r3, #0			/* hook installed? */
	beq	.Lnormal0		/* no - use the software path */
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2			/* len >= _min_bzero_size? */
	blt	.Lnormal0		/* too short for the hook */
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0			/* _arm_bzero(dest, len, flags=0) */
	mov	lr, pc			/* ARMv4-style indirect call ... */
	mov	pc, r3			/* ... through the hook pointer */
	cmp	r0, #0			/* hook returns 0 on success */
	ldmfd	sp!, {r0, r1, lr}	/* (ldm does not disturb the flags) */
	RETeq				/* done if the hook handled it */
.Lnormal0:
	mov	r3, #0x00		/* fill byte = 0, then share memset */
	b	do_memset
END(bzero)
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
/*
 * memset(dst=r0, c=r1, len=r2): fill len bytes at dst with byte c.
 * Returns dst in r0; 'ip' is used as the running store pointer so that
 * r0 survives untouched.  bzero enters at do_memset with r3 = fill byte
 * and r1 = length already set up.
 */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2			/* r1 = byte count from here on */
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0			/* ip = store pointer; keep r0 = dst */
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04		/* (strd below needs 8-byte align) */
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef _ARM_ARCH_5E
	strdge	r2, [ip], #0x08		/* 16 x 8-byte strd = 128 bytes */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
#else
	stmiage	ip!, {r2-r3}		/* 16 x 8-byte stm = 128 bytes */
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
#else
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
#else
	stmiage	ip!, {r2-r3}
	stmiage	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2			/* r1 is len-4 here: -3..-1 remain */
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004		/* r2 = bytes needed to word-align */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strbgt	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */
EEND(memset)
/* NOTE(review): END(bzero) also appears right after .Lnormal0 above; a
 * second END for the same symbol looks suspicious — confirm the asm.h
 * END/EEND macros tolerate this, or that one of the two is redundant. */
END(bzero)
283
/*
 * int bcmp(const void *b1 (r0), const void *b2 (r1), size_t len (r2))
 *
 * Returns 0 if the two buffers match, otherwise the (signed) difference
 * of the first mismatching byte pair.  len == 6 takes a dedicated
 * hand-scheduled path (common for MAC addresses in the network stack).
 * 'ip' shadows b1 so r0 is free to build the return value.
 */
ENTRY(bcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eorsne	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	/*
	 * r3 = 3 * ((b2 - 5) & 3); each byte-compare group below is six
	 * instructions (24 bytes), so pc += r3 * 8 lands on the group that
	 * consumes exactly the bytes needed to word-align:
	 * align 1 -> fall through (3 bytes), 2 -> skip one group (2 bytes),
	 * 3 -> two groups (1 byte), 0 -> straight to the word loop.
	 */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET
END(bcmp)
393
/*
 * void bcopy(const void *src (r0), void *dst (r1), size_t len (r2))
 * void *memmove(void *dst (r0), const void *src (r1), size_t len (r2))
 *
 * bcopy swaps its first two arguments with the 3-eor trick and falls
 * into memmove.  If the buffers do not overlap the work is delegated to
 * memcpy; otherwise a forward or backward copy is chosen so overlapping
 * regions are handled correctly.  memmove returns dst in r0.
 * Labels: .Lmemmove_f* = forward copy, .Lmemmove_b* = backward copy;
 * the *srcul* sections handle a source that cannot be word-aligned
 * together with the destination and reassemble words with shifts/ors
 * (separate big-/little-endian variants).
 */
ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
EENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards	/* src < dst: copy from the end */

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4..7 bytes left: move one word */
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}		/* >= 8: move two words */
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes to reach word align */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
	/* r12 = src misalignment (1..3); lr holds the partially-used word */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3		/* rewind to the true byte position */
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2		/* rewind to the true byte position */
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1		/* rewind to the true byte position */
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	/* Overlapping with dst > src: copy high-to-low from the ends */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, r12}
	stmdbge	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	/* r12 = src misalignment (1..3); r3 holds the partially-used word */
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3		/* rewind to the true byte position */
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2		/* rewind to the true byte position */
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1		/* rewind to the true byte position */
	b	.Lmemmove_bl4
EEND(memmove)
END(bcopy)
938
#if !defined(_ARM_ARCH_5E)
/*
 * void *memcpy(void *dst (r0), const void *src (r1), size_t len (r2))
 * (pre-ARMv5E variant)
 *
 * May dispatch to the _arm_memcpy hardware-assist hook when one is
 * installed and len >= _min_memcpy_size (skipped when executing from
 * flash, since the hook may not be usable there — TODO confirm against
 * the platforms that define FLASHADDR).  Returns dst in r0.
 * The .Lmemcpy_srcul* sections reassemble words with shift/or when the
 * source cannot be word-aligned together with the destination.
 */
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#if defined(FLASHADDR) && defined(PHYSADDR)
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy	/* hook pointer installed? */
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3			/* len >= _min_memcpy_size? */
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0			/* _arm_memcpy(dst, src, len, 0) */
	ldr	r4, .L_arm_memcpy
	mov	lr, pc			/* ARMv4-style indirect call */
	ldr	pc, [r4]
	cmp	r0, #0			/* hook returns 0 on success */
	ldmfd	sp!, {r0-r2, r4, lr}	/* (ldm does not disturb the flags) */
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4..7 bytes left: move one word */
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}		/* >= 8: move two words */
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26_
	ldmiaeq sp!, {r0, pc}^		/* done */
#else
	ldmiaeq	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4		/* r12 = bytes to reach word align */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	/* r12 = src misalignment (1..3); lr holds the partially-used word */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3		/* rewind to the true byte position */
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2		/* rewind to the true byte position */
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1		/* rewind to the true byte position */
	b	.Lmemcpy_l4
END(memcpy)
1174
1175#else
1176/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]			/* prefetch first source cache line */
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	/* Skip the platform-hook path when executing out of flash */
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	/*
	 * If a platform copy hook (_arm_memcpy) is installed and the copy
	 * is at least _min_memcpy_size bytes, try it first.  The pointers
	 * are reached through a literal pool (.L_arm_memcpy /
	 * .L_min_memcpy_size, defined outside this chunk).
	 */
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0			/* hook installed? */
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3			/* copy big enough to be worth it? */
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0			/* 4th hook argument — presumably a flags/wait
					 * parameter; confirm against _arm_memcpy */
	ldr	r4, .L_arm_memcpy
	mov	lr, pc			/* manual indirect call: lr = return address */
	ldr	pc, [r4]
	cmp	r0, #0			/* hook returns 0 on success */
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq				/* done if the hook handled the copy */
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02		/* flags select 1, 2 or 3 head bytes below */
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01		/* 1st byte (always) */
	ldrble	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strble	ip, [r3], #0x01		/* 2nd byte if (dst & 3) <= 2 */
	ldrblt	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strblt	ip, [r3], #0x01		/* 3rd byte if (dst & 3) == 1 */

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer (for the strd stores below) */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/*
	 * Copy 128 bytes at a time.  Loads and strd stores are interleaved
	 * to keep the load/store pipelines busy; each strd stores the
	 * {rN, rN+1} pair, which is why dst was 8-byte aligned above.
	 */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq			/* Return now if done */

	/*
	 * Computed jump over unneeded 8-byte stanzas.  r4 = 0x18 - (len &
	 * 0x18); each stanza below is 4 insns = 16 bytes, so skipping
	 * r4 * 2 bytes (pc reads as this insn + 8; the nop absorbs the
	 * extra word) lands on the first stanza that is actually needed.
	 * Do NOT change the size of the stanzas below.
	 */
	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04		/* one final word if >= 4 left */
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01		/* then 1-3 trailing bytes */
	cmp	r2, #0x02
	ldrbge	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrbgt	ip, [r1]
	strbge	r2, [r3], #0x01
	strbgt	ip, [r3]
	RET
/* Place a literal pool here for the above ldr instructions to use */
.ltorg
1369
1370
1371/*
1372 * At this point, it has not been possible to word align both buffers.
1373 * The destination buffer is word aligned, but the source buffer is not.
1374 */
1375.Lmemcpy_bad_align:
1376	stmfd	sp!, {r4-r7}
1377	bic	r1, r1, #0x03
1378	cmp	ip, #2
1379	ldr	ip, [r1], #0x04
1380	bgt	.Lmemcpy_bad3
1381	beq	.Lmemcpy_bad2
1382	b	.Lmemcpy_bad1
1383
1384.Lmemcpy_bad1_loop16:
1385#ifdef __ARMEB__
1386	mov	r4, ip, lsl #8
1387#else
1388	mov	r4, ip, lsr #8
1389#endif
1390	ldr	r5, [r1], #0x04
1391	pld	[r1, #0x018]
1392	ldr	r6, [r1], #0x04
1393	ldr	r7, [r1], #0x04
1394	ldr	ip, [r1], #0x04
1395#ifdef __ARMEB__
1396	orr	r4, r4, r5, lsr #24
1397	mov	r5, r5, lsl #8
1398	orr	r5, r5, r6, lsr #24
1399	mov	r6, r6, lsl #8
1400	orr	r6, r6, r7, lsr #24
1401	mov	r7, r7, lsl #8
1402	orr	r7, r7, ip, lsr #24
1403#else
1404	orr	r4, r4, r5, lsl #24
1405	mov	r5, r5, lsr #8
1406	orr	r5, r5, r6, lsl #24
1407	mov	r6, r6, lsr #8
1408	orr	r6, r6, r7, lsl #24
1409	mov	r7, r7, lsr #8
1410	orr	r7, r7, ip, lsl #24
1411#endif
1412	str	r4, [r3], #0x04
1413	str	r5, [r3], #0x04
1414	str	r6, [r3], #0x04
1415	str	r7, [r3], #0x04
1416.Lmemcpy_bad1:
1417	subs	r2, r2, #0x10
1418	bge	.Lmemcpy_bad1_loop16
1419
1420	adds	r2, r2, #0x10
1421	ldmfdeq	sp!, {r4-r7}
1422	RETeq			/* Return now if done */
1423	subs	r2, r2, #0x04
1424	sublt	r1, r1, #0x03
1425	blt	.Lmemcpy_bad_done
1426
1427.Lmemcpy_bad1_loop4:
1428#ifdef __ARMEB__
1429	mov	r4, ip, lsl #8
1430#else
1431	mov	r4, ip, lsr #8
1432#endif
1433	ldr	ip, [r1], #0x04
1434	subs	r2, r2, #0x04
1435#ifdef __ARMEB__
1436	orr	r4, r4, ip, lsr #24
1437#else
1438	orr	r4, r4, ip, lsl #24
1439#endif
1440	str	r4, [r3], #0x04
1441	bge	.Lmemcpy_bad1_loop4
1442	sub	r1, r1, #0x03
1443	b	.Lmemcpy_bad_done
1444
/* src is 2 bytes past word alignment: shift by 16/16 bits per word */
.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04		/* ip carries into the next iteration */
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02		/* step src back to its true byte position */
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02		/* step src back to its true byte position */
	b	.Lmemcpy_bad_done
1505
/* src is 3 bytes past word alignment: shift by 24/8 bits per word */
.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04		/* ip carries into the next iteration */
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01		/* step src back to its true byte position */
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01		/* step src back to its true byte position */

/* Common tail for the bad-alignment paths: 0-3 bytes may remain */
.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04		/* undo the last bias; 0 => all done */
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrbge	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrbgt	ip, [r1]
	strbge	r2, [r3], #0x01
	strbgt	ip, [r3]
	RET
1578
1579
1580/*
1581 * Handle short copies (less than 16 bytes), possibly misaligned.
1582 * Some of these are *very* common, thanks to the network stack,
1583 * and so are handled specially.
1584 */
1585.Lmemcpy_short:
1586	add	pc, pc, r2, lsl #2
1587	nop
1588	RET			/* 0x00 */
1589	b	.Lmemcpy_bytewise	/* 0x01 */
1590	b	.Lmemcpy_bytewise	/* 0x02 */
1591	b	.Lmemcpy_bytewise	/* 0x03 */
1592	b	.Lmemcpy_4		/* 0x04 */
1593	b	.Lmemcpy_bytewise	/* 0x05 */
1594	b	.Lmemcpy_6		/* 0x06 */
1595	b	.Lmemcpy_bytewise	/* 0x07 */
1596	b	.Lmemcpy_8		/* 0x08 */
1597	b	.Lmemcpy_bytewise	/* 0x09 */
1598	b	.Lmemcpy_bytewise	/* 0x0a */
1599	b	.Lmemcpy_bytewise	/* 0x0b */
1600	b	.Lmemcpy_c		/* 0x0c */
1601.Lmemcpy_bytewise:
1602	mov	r3, r0			/* We must not clobber r0 */
1603	ldrb	ip, [r1], #0x01
16041:	subs	r2, r2, #0x01
1605	strb	ip, [r3], #0x01
1606	ldrbne	ip, [r1], #0x01
1607	bne	1b
1608	RET
1609
1610/******************************************************************************
1611 * Special case for 4 byte copies
1612 */
1613#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
1614#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
1615	LMEMCPY_4_PAD
1616.Lmemcpy_4:
1617	and	r2, r1, #0x03
1618	orr	r2, r2, r0, lsl #2
1619	ands	r2, r2, #0x0f
1620	sub	r3, pc, #0x14
1621	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
1622
1623/*
1624 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1625 */
1626	ldr	r2, [r1]
1627	str	r2, [r0]
1628	RET
1629	LMEMCPY_4_PAD
1630
1631/*
1632 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1633 */
1634	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
1635	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
1636#ifdef __ARMEB__
1637	mov	r3, r3, lsl #8		/* r3 = 012. */
1638	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
1639#else
1640	mov	r3, r3, lsr #8		/* r3 = .210 */
1641	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
1642#endif
1643	str	r3, [r0]
1644	RET
1645	LMEMCPY_4_PAD
1646
1647/*
1648 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1649 */
1650#ifdef __ARMEB__
1651	ldrh	r3, [r1]
1652	ldrh	r2, [r1, #0x02]
1653#else
1654	ldrh	r3, [r1, #0x02]
1655	ldrh	r2, [r1]
1656#endif
1657	orr	r3, r2, r3, lsl #16
1658	str	r3, [r0]
1659	RET
1660	LMEMCPY_4_PAD
1661
1662/*
1663 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1664 */
1665	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
1666	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
1667#ifdef __ARMEB__
1668	mov	r3, r3, lsl #24		/* r3 = 0... */
1669	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
1670#else
1671	mov	r3, r3, lsr #24		/* r3 = ...0 */
1672	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
1673#endif
1674	str	r3, [r0]
1675	RET
1676	LMEMCPY_4_PAD
1677
1678/*
1679 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1680 */
1681	ldr	r2, [r1]
1682#ifdef __ARMEB__
1683	strb	r2, [r0, #0x03]
1684	mov	r3, r2, lsr #8
1685	mov	r1, r2, lsr #24
1686	strb	r1, [r0]
1687#else
1688	strb	r2, [r0]
1689	mov	r3, r2, lsr #8
1690	mov	r1, r2, lsr #24
1691	strb	r1, [r0, #0x03]
1692#endif
1693	strh	r3, [r0, #0x01]
1694	RET
1695	LMEMCPY_4_PAD
1696
1697/*
1698 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1699 */
1700	ldrb	r2, [r1]
1701	ldrh	r3, [r1, #0x01]
1702	ldrb	r1, [r1, #0x03]
1703	strb	r2, [r0]
1704	strh	r3, [r0, #0x01]
1705	strb	r1, [r0, #0x03]
1706	RET
1707	LMEMCPY_4_PAD
1708
1709/*
1710 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1711 */
1712	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1713	ldrh	r3, [r1, #0x02]		/* LE:r3 = ..23  LE:r3 = ..32 */
1714#ifdef __ARMEB__
1715	mov	r1, r2, lsr #8		/* r1 = ...0 */
1716	strb	r1, [r0]
1717	mov	r2, r2, lsl #8		/* r2 = .01. */
1718	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
1719#else
1720	strb	r2, [r0]
1721	mov	r2, r2, lsr #8		/* r2 = ...1 */
1722	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
1723	mov	r3, r3, lsr #8		/* r3 = ...3 */
1724#endif
1725	strh	r2, [r0, #0x01]
1726	strb	r3, [r0, #0x03]
1727	RET
1728	LMEMCPY_4_PAD
1729
1730/*
1731 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1732 */
1733	ldrb	r2, [r1]
1734	ldrh	r3, [r1, #0x01]
1735	ldrb	r1, [r1, #0x03]
1736	strb	r2, [r0]
1737	strh	r3, [r0, #0x01]
1738	strb	r1, [r0, #0x03]
1739	RET
1740	LMEMCPY_4_PAD
1741
1742/*
1743 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1744 */
1745	ldr	r2, [r1]
1746#ifdef __ARMEB__
1747	strh	r2, [r0, #0x02]
1748	mov	r3, r2, lsr #16
1749	strh	r3, [r0]
1750#else
1751	strh	r2, [r0]
1752	mov	r3, r2, lsr #16
1753	strh	r3, [r0, #0x02]
1754#endif
1755	RET
1756	LMEMCPY_4_PAD
1757
1758/*
1759 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1760 */
1761	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1762	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
1763	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1764	strh	r1, [r0]
1765#ifdef __ARMEB__
1766	mov	r2, r2, lsl #8		/* r2 = 012. */
1767	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
1768#else
1769	mov	r2, r2, lsr #24		/* r2 = ...2 */
1770	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
1771#endif
1772	strh	r2, [r0, #0x02]
1773	RET
1774	LMEMCPY_4_PAD
1775
1776/*
1777 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1778 */
1779	ldrh	r2, [r1]
1780	ldrh	r3, [r1, #0x02]
1781	strh	r2, [r0]
1782	strh	r3, [r0, #0x02]
1783	RET
1784	LMEMCPY_4_PAD
1785
1786/*
1787 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1788 */
1789	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
1790	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
1791	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
1792	strh	r1, [r0, #0x02]
1793#ifdef __ARMEB__
1794	mov	r3, r3, lsr #24		/* r3 = ...1 */
1795	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
1796#else
1797	mov	r3, r3, lsl #8		/* r3 = 321. */
1798	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
1799#endif
1800	strh	r3, [r0]
1801	RET
1802	LMEMCPY_4_PAD
1803
1804/*
1805 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1806 */
1807	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1808#ifdef __ARMEB__
1809	strb	r2, [r0, #0x03]
1810	mov	r3, r2, lsr #8
1811	mov	r1, r2, lsr #24
1812	strh	r3, [r0, #0x01]
1813	strb	r1, [r0]
1814#else
1815	strb	r2, [r0]
1816	mov	r3, r2, lsr #8
1817	mov	r1, r2, lsr #24
1818	strh	r3, [r0, #0x01]
1819	strb	r1, [r0, #0x03]
1820#endif
1821	RET
1822	LMEMCPY_4_PAD
1823
1824/*
1825 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1826 */
1827	ldrb	r2, [r1]
1828	ldrh	r3, [r1, #0x01]
1829	ldrb	r1, [r1, #0x03]
1830	strb	r2, [r0]
1831	strh	r3, [r0, #0x01]
1832	strb	r1, [r0, #0x03]
1833	RET
1834	LMEMCPY_4_PAD
1835
1836/*
1837 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1838 */
1839#ifdef __ARMEB__
1840	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
1841	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1842	strb	r3, [r0, #0x03]
1843	mov	r3, r3, lsr #8		/* r3 = ...2 */
1844	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
1845	strh	r3, [r0, #0x01]
1846	mov	r2, r2, lsr #8		/* r2 = ...0 */
1847	strb	r2, [r0]
1848#else
1849	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1850	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
1851	strb	r2, [r0]
1852	mov	r2, r2, lsr #8		/* r2 = ...1 */
1853	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
1854	strh	r2, [r0, #0x01]
1855	mov	r3, r3, lsr #8		/* r3 = ...3 */
1856	strb	r3, [r0, #0x03]
1857#endif
1858	RET
1859	LMEMCPY_4_PAD
1860
1861/*
1862 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1863 */
1864	ldrb	r2, [r1]
1865	ldrh	r3, [r1, #0x01]
1866	ldrb	r1, [r1, #0x03]
1867	strb	r2, [r0]
1868	strh	r3, [r0, #0x01]
1869	strb	r1, [r0, #0x03]
1870	RET
1871	LMEMCPY_4_PAD
1872
1873
1874/******************************************************************************
1875 * Special case for 6 byte copies
1876 */
1877#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
1878#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
1879	LMEMCPY_6_PAD
1880.Lmemcpy_6:
1881	and	r2, r1, #0x03
1882	orr	r2, r2, r0, lsl #2
1883	ands	r2, r2, #0x0f
1884	sub	r3, pc, #0x14
1885	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2
1886
1887/*
1888 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1889 */
1890	ldr	r2, [r1]
1891	ldrh	r3, [r1, #0x04]
1892	str	r2, [r0]
1893	strh	r3, [r0, #0x04]
1894	RET
1895	LMEMCPY_6_PAD
1896
1897/*
1898 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1899 */
1900	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1901	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
1902#ifdef __ARMEB__
1903	mov	r2, r2, lsl #8		/* r2 = 012. */
1904	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
1905#else
1906	mov	r2, r2, lsr #8		/* r2 = .210 */
1907	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
1908#endif
1909	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
1910	str	r2, [r0]
1911	strh	r3, [r0, #0x04]
1912	RET
1913	LMEMCPY_6_PAD
1914
1915/*
1916 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1917 */
1918	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1919	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1920#ifdef __ARMEB__
1921	mov	r1, r3, lsr #16		/* r1 = ..23 */
1922	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
1923	str	r1, [r0]
1924	strh	r3, [r0, #0x04]
1925#else
1926	mov	r1, r3, lsr #16		/* r1 = ..54 */
1927	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1928	str	r2, [r0]
1929	strh	r1, [r0, #0x04]
1930#endif
1931	RET
1932	LMEMCPY_6_PAD
1933
1934/*
1935 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1936 */
1937	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
1938	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
1939	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r3 = xxx5 */
1940#ifdef __ARMEB__
1941	mov	r2, r2, lsl #24		/* r2 = 0... */
1942	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
1943	mov	r3, r3, lsl #8		/* r3 = 234. */
1944	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
1945#else
1946	mov	r2, r2, lsr #24		/* r2 = ...0 */
1947	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1948	mov	r1, r1, lsl #8		/* r1 = xx5. */
1949	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
1950#endif
1951	str	r2, [r0]
1952	strh	r1, [r0, #0x04]
1953	RET
1954	LMEMCPY_6_PAD
1955
1956/*
1957 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1958 */
1959	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1960	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
1961	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1962	strh	r1, [r0, #0x01]
1963#ifdef __ARMEB__
1964	mov	r1, r3, lsr #24		/* r1 = ...0 */
1965	strb	r1, [r0]
1966	mov	r3, r3, lsl #8		/* r3 = 123. */
1967	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
1968#else
1969	strb	r3, [r0]
1970	mov	r3, r3, lsr #24		/* r3 = ...3 */
1971	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
1972	mov	r2, r2, lsr #8		/* r2 = ...5 */
1973#endif
1974	strh	r3, [r0, #0x03]
1975	strb	r2, [r0, #0x05]
1976	RET
1977	LMEMCPY_6_PAD
1978
1979/*
1980 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1981 */
1982	ldrb	r2, [r1]
1983	ldrh	r3, [r1, #0x01]
1984	ldrh	ip, [r1, #0x03]
1985	ldrb	r1, [r1, #0x05]
1986	strb	r2, [r0]
1987	strh	r3, [r0, #0x01]
1988	strh	ip, [r0, #0x03]
1989	strb	r1, [r0, #0x05]
1990	RET
1991	LMEMCPY_6_PAD
1992
1993/*
1994 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1995 */
1996	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1997	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
1998#ifdef __ARMEB__
1999	mov	r3, r2, lsr #8		/* r3 = ...0 */
2000	strb	r3, [r0]
2001	strb	r1, [r0, #0x05]
2002	mov	r3, r1, lsr #8		/* r3 = .234 */
2003	strh	r3, [r0, #0x03]
2004	mov	r3, r2, lsl #8		/* r3 = .01. */
2005	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
2006	strh	r3, [r0, #0x01]
2007#else
2008	strb	r2, [r0]
2009	mov	r3, r1, lsr #24
2010	strb	r3, [r0, #0x05]
2011	mov	r3, r1, lsr #8		/* r3 = .543 */
2012	strh	r3, [r0, #0x03]
2013	mov	r3, r2, lsr #8		/* r3 = ...1 */
2014	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
2015	strh	r3, [r0, #0x01]
2016#endif
2017	RET
2018	LMEMCPY_6_PAD
2019
2020/*
2021 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2022 */
2023	ldrb	r2, [r1]
2024	ldrh	r3, [r1, #0x01]
2025	ldrh	ip, [r1, #0x03]
2026	ldrb	r1, [r1, #0x05]
2027	strb	r2, [r0]
2028	strh	r3, [r0, #0x01]
2029	strh	ip, [r0, #0x03]
2030	strb	r1, [r0, #0x05]
2031	RET
2032	LMEMCPY_6_PAD
2033
2034/*
2035 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2036 */
2037#ifdef __ARMEB__
2038	ldr	r2, [r1]		/* r2 = 0123 */
2039	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
2040	mov	r1, r2, lsr #16		/* r1 = ..01 */
2041	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
2042	strh	r1, [r0]
2043	str	r3, [r0, #0x02]
2044#else
2045	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
2046	ldr	r3, [r1]		/* r3 = 3210 */
2047	mov	r2, r2, lsl #16		/* r2 = 54.. */
2048	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
2049	strh	r3, [r0]
2050	str	r2, [r0, #0x02]
2051#endif
2052	RET
2053	LMEMCPY_6_PAD
2054
2055/*
2056 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2057 */
2058	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2059	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
2060	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2061#ifdef __ARMEB__
2062	mov	r2, r2, lsr #8		/* r2 = .345 */
2063	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
2064#else
2065	mov	r2, r2, lsl #8		/* r2 = 543. */
2066	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
2067#endif
2068	strh	r1, [r0]
2069	str	r2, [r0, #0x02]
2070	RET
2071	LMEMCPY_6_PAD
2072
2073/*
2074 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2075 */
2076	ldrh	r2, [r1]
2077	ldr	r3, [r1, #0x02]
2078	strh	r2, [r0]
2079	str	r3, [r0, #0x02]
2080	RET
2081	LMEMCPY_6_PAD
2082
2083/*
2084 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2085 */
2086	ldrb	r3, [r1]		/* r3 = ...0 */
2087	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2088	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
2089#ifdef __ARMEB__
2090	mov	r3, r3, lsl #8		/* r3 = ..0. */
2091	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
2092	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
2093#else
2094	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2095	mov	r1, r1, lsl #24		/* r1 = 5... */
2096	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
2097#endif
2098	strh	r3, [r0]
2099	str	r1, [r0, #0x02]
2100	RET
2101	LMEMCPY_6_PAD
2102
2103/*
2104 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2105 */
2106	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2107	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
2108#ifdef __ARMEB__
2109	mov	r3, r2, lsr #24		/* r3 = ...0 */
2110	strb	r3, [r0]
2111	mov	r2, r2, lsl #8		/* r2 = 123. */
2112	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
2113#else
2114	strb	r2, [r0]
2115	mov	r2, r2, lsr #8		/* r2 = .321 */
2116	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
2117	mov	r1, r1, lsr #8		/* r1 = ...5 */
2118#endif
2119	str	r2, [r0, #0x01]
2120	strb	r1, [r0, #0x05]
2121	RET
2122	LMEMCPY_6_PAD
2123
2124/*
2125 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2126 */
2127	ldrb	r2, [r1]
2128	ldrh	r3, [r1, #0x01]
2129	ldrh	ip, [r1, #0x03]
2130	ldrb	r1, [r1, #0x05]
2131	strb	r2, [r0]
2132	strh	r3, [r0, #0x01]
2133	strh	ip, [r0, #0x03]
2134	strb	r1, [r0, #0x05]
2135	RET
2136	LMEMCPY_6_PAD
2137
2138/*
2139 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2140 */
2141	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2142	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
2143#ifdef __ARMEB__
2144	mov	r3, r2, lsr #8		/* r3 = ...0 */
2145	strb	r3, [r0]
2146	mov	r2, r2, lsl #24		/* r2 = 1... */
2147	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
2148#else
2149	strb	r2, [r0]
2150	mov	r2, r2, lsr #8		/* r2 = ...1 */
2151	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
2152	mov	r1, r1, lsr #24		/* r1 = ...5 */
2153#endif
2154	str	r2, [r0, #0x01]
2155	strb	r1, [r0, #0x05]
2156	RET
2157	LMEMCPY_6_PAD
2158
2159/*
2160 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2161 */
2162	ldrb	r2, [r1]
2163	ldr	r3, [r1, #0x01]
2164	ldrb	r1, [r1, #0x05]
2165	strb	r2, [r0]
2166	str	r3, [r0, #0x01]
2167	strb	r1, [r0, #0x05]
2168	RET
2169	LMEMCPY_6_PAD
2170
2171
2172/******************************************************************************
2173 * Special case for 8 byte copies
2174 */
2175#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
2176#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
2177	LMEMCPY_8_PAD
2178.Lmemcpy_8:
2179	and	r2, r1, #0x03
2180	orr	r2, r2, r0, lsl #2
2181	ands	r2, r2, #0x0f
2182	sub	r3, pc, #0x14
2183	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
2184
2185/*
2186 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2187 */
2188	ldr	r2, [r1]
2189	ldr	r3, [r1, #0x04]
2190	str	r2, [r0]
2191	str	r3, [r0, #0x04]
2192	RET
2193	LMEMCPY_8_PAD
2194
2195/*
2196 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2197 */
2198	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2199	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
2200	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2201#ifdef __ARMEB__
2202	mov	r3, r3, lsl #8		/* r3 = 012. */
2203	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
2204	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
2205#else
2206	mov	r3, r3, lsr #8		/* r3 = .210 */
2207	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
2208	mov	r1, r1, lsl #24		/* r1 = 7... */
2209	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
2210#endif
2211	str	r3, [r0]
2212	str	r2, [r0, #0x04]
2213	RET
2214	LMEMCPY_8_PAD
2215
2216/*
2217 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2218 */
2219	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2220	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2221	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2222#ifdef __ARMEB__
2223	mov	r2, r2, lsl #16		/* r2 = 01.. */
2224	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2225	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
2226#else
2227	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2228	mov	r3, r3, lsr #16		/* r3 = ..54 */
2229	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
2230#endif
2231	str	r2, [r0]
2232	str	r3, [r0, #0x04]
2233	RET
2234	LMEMCPY_8_PAD
2235
2236/*
2237 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2238 */
2239	ldrb	r3, [r1]		/* r3 = ...0 */
2240	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2241	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
2242#ifdef __ARMEB__
2243	mov	r3, r3, lsl #24		/* r3 = 0... */
2244	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
2245	mov	r2, r2, lsl #24		/* r2 = 4... */
2246	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
2247#else
2248	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2249	mov	r2, r2, lsr #24		/* r2 = ...4 */
2250	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
2251#endif
2252	str	r3, [r0]
2253	str	r2, [r0, #0x04]
2254	RET
2255	LMEMCPY_8_PAD
2256
2257/*
2258 * 0100: dst is 8-bit aligned, src is 32-bit aligned
2259 */
2260	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
2261	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
2262#ifdef __ARMEB__
2263	mov	r1, r3, lsr #24		/* r1 = ...0 */
2264	strb	r1, [r0]
2265	mov	r1, r3, lsr #8		/* r1 = .012 */
2266	strb	r2, [r0, #0x07]
2267	mov	r3, r3, lsl #24		/* r3 = 3... */
2268	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
2269#else
2270	strb	r3, [r0]
2271	mov	r1, r2, lsr #24		/* r1 = ...7 */
2272	strb	r1, [r0, #0x07]
2273	mov	r1, r3, lsr #8		/* r1 = .321 */
2274	mov	r3, r3, lsr #24		/* r3 = ...3 */
2275	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
2276#endif
2277	strh	r1, [r0, #0x01]
2278	str	r3, [r0, #0x03]
2279	RET
2280	LMEMCPY_8_PAD
2281
2282/*
2283 * 0101: dst is 8-bit aligned, src is 8-bit aligned
2284 */
2285	ldrb	r2, [r1]
2286	ldrh	r3, [r1, #0x01]
2287	ldr	ip, [r1, #0x03]
2288	ldrb	r1, [r1, #0x07]
2289	strb	r2, [r0]
2290	strh	r3, [r0, #0x01]
2291	str	ip, [r0, #0x03]
2292	strb	r1, [r0, #0x07]
2293	RET
2294	LMEMCPY_8_PAD
2295
2296/*
2297 * 0110: dst is 8-bit aligned, src is 16-bit aligned
2298 */
2299	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2300	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2301	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2302#ifdef __ARMEB__
2303	mov	ip, r2, lsr #8		/* ip = ...0 */
2304	strb	ip, [r0]
2305	mov	ip, r2, lsl #8		/* ip = .01. */
2306	orr	ip, ip, r3, lsr #24	/* ip = .012 */
2307	strb	r1, [r0, #0x07]
2308	mov	r3, r3, lsl #8		/* r3 = 345. */
2309	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
2310#else
2311	strb	r2, [r0]		/* 0 */
2312	mov	ip, r1, lsr #8		/* ip = ...7 */
2313	strb	ip, [r0, #0x07]		/* 7 */
2314	mov	ip, r2, lsr #8		/* ip = ...1 */
2315	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2316	mov	r3, r3, lsr #8		/* r3 = .543 */
2317	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
2318#endif
2319	strh	ip, [r0, #0x01]
2320	str	r3, [r0, #0x03]
2321	RET
2322	LMEMCPY_8_PAD
2323
2324/*
2325 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2326 */
2327	ldrb	r3, [r1]		/* r3 = ...0 */
2328	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2329	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
2330	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2331	strb	r3, [r0]
2332	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
2333#ifdef __ARMEB__
2334	strh	r3, [r0, #0x01]
2335	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
2336#else
2337	strh	ip, [r0, #0x01]
2338	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
2339#endif
2340	str	r2, [r0, #0x03]
2341	strb	r1, [r0, #0x07]
2342	RET
2343	LMEMCPY_8_PAD
2344
2345/*
2346 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2347 */
2348	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2349	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2350	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2351#ifdef __ARMEB__
2352	strh	r1, [r0]
2353	mov	r1, r3, lsr #16		/* r1 = ..45 */
2354	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
2355#else
2356	strh	r2, [r0]
2357	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
2358	mov	r3, r3, lsr #16		/* r3 = ..76 */
2359#endif
2360	str	r2, [r0, #0x02]
2361	strh	r3, [r0, #0x06]
2362	RET
2363	LMEMCPY_8_PAD
2364
2365/*
2366 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2367 */
2368	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2369	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2370	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
2371	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2372	strh	r1, [r0]
2373#ifdef __ARMEB__
2374	mov	r1, r2, lsl #24		/* r1 = 2... */
2375	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
2376	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
2377#else
2378	mov	r1, r2, lsr #24		/* r1 = ...2 */
2379	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
2380	mov	r3, r3, lsr #24		/* r3 = ...6 */
2381	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
2382#endif
2383	str	r1, [r0, #0x02]
2384	strh	r3, [r0, #0x06]
2385	RET
2386	LMEMCPY_8_PAD
2387
2388/*
2389 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2390 */
2391	ldrh	r2, [r1]
2392	ldr	ip, [r1, #0x02]
2393	ldrh	r3, [r1, #0x06]
2394	strh	r2, [r0]
2395	str	ip, [r0, #0x02]
2396	strh	r3, [r0, #0x06]
2397	RET
2398	LMEMCPY_8_PAD
2399
2400/*
2401 * 1011: dst is 16-bit aligned, src is 8-bit aligned (mirror of case 1001; stores issued back-to-front)
2402 */
2403	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
2404	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2405	ldrb	ip, [r1]		/* ip = ...0 */
2406	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
2407	strh	r1, [r0, #0x06]		/* store bytes 6-7 */
2408#ifdef __ARMEB__
2409	mov	r3, r3, lsr #24		/* r3 = ...5 */
2410	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
2411	mov	r2, r2, lsr #24		/* r2 = ...1 */
2412	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
2413#else
2414	mov	r3, r3, lsl #24		/* r3 = 5... */
2415	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
2416	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
2417#endif
2418	str	r3, [r0, #0x02]		/* store bytes 2-5 */
2419	strh	r2, [r0]		/* store bytes 0-1 */
2420	RET
2421	LMEMCPY_8_PAD
2422
2423/*
2424 * 1100: dst is 8-bit aligned, src is 32-bit aligned (byte + halfword + word + byte stores)
2425 */
2426	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2427	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2428	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
2429	strh	r1, [r0, #0x05]		/* store bytes 5-6 */
2430#ifdef __ARMEB__
2431	strb	r3, [r0, #0x07]		/* store byte 7 (low byte of r3) */
2432	mov	r1, r2, lsr #24		/* r1 = ...0 */
2433	strb	r1, [r0]		/* store byte 0 */
2434	mov	r2, r2, lsl #8		/* r2 = 123. */
2435	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
2436	str	r2, [r0, #0x01]		/* store bytes 1-4 */
2437#else
2438	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2439	mov	r1, r3, lsr #24		/* r1 = ...7 */
2440	strb	r1, [r0, #0x07]		/* store byte 7 */
2441	mov	r2, r2, lsr #8		/* r2 = .321 */
2442	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
2443	str	r2, [r0, #0x01]		/* store bytes 1-4 */
2444#endif
2445	RET
2446	LMEMCPY_8_PAD
2447
2448/*
2449 * 1101: dst is 8-bit aligned, src is 8-bit aligned (same offset mod 4, so interior loads are aligned)
2450 */
2451	ldrb	r3, [r1]		/* r3 = ...0 */
2452	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
2453	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2454	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2455	strb	r3, [r0]		/* store byte 0 */
2456	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
2457#ifdef __ARMEB__
2458	strh	ip, [r0, #0x05]		/* store bytes 5-6 */
2459	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
2460#else
2461	strh	r3, [r0, #0x05]		/* store bytes 5-6 */
2462	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
2463#endif
2464	str	r2, [r0, #0x01]		/* store bytes 1-4 */
2465	strb	r1, [r0, #0x07]		/* store byte 7 */
2466	RET
2467	LMEMCPY_8_PAD
2468
2469/*
2470 * 1110: dst is 8-bit aligned, src is 16-bit aligned (halfword loads recombined into byte/word/halfword stores)
2471 */
2472	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2473	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2474	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2475#ifdef __ARMEB__
2476	mov	ip, r2, lsr #8		/* ip = ...0 */
2477	strb	ip, [r0]		/* store byte 0 */
2478	mov	ip, r2, lsl #24		/* ip = 1... */
2479	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
2480	strb	r1, [r0, #0x07]		/* store byte 7 (low byte of r1) */
2481	mov	r1, r1, lsr #8		/* r1 = ...6 */
2482	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
2483#else
2484	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2485	mov	ip, r2, lsr #8		/* ip = ...1 */
2486	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2487	mov	r2, r1, lsr #8		/* r2 = ...7 */
2488	strb	r2, [r0, #0x07]		/* store byte 7 */
2489	mov	r1, r1, lsl #8		/* r1 = .76. */
2490	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
2491#endif
2492	str	ip, [r0, #0x01]		/* store bytes 1-4 */
2493	strh	r1, [r0, #0x05]		/* store bytes 5-6 */
2494	RET
2495	LMEMCPY_8_PAD
2496
2497/*
2498 * 1111: dst is 8-bit aligned, src is 8-bit aligned (same offset mod 4; no lane shuffling needed)
2499 */
2500	ldrb	r2, [r1]		/* byte 0 */
2501	ldr	ip, [r1, #0x01]		/* bytes 1-4 */
2502	ldrh	r3, [r1, #0x05]		/* bytes 5-6 */
2503	ldrb	r1, [r1, #0x07]		/* byte 7 */
2504	strb	r2, [r0]
2505	str	ip, [r0, #0x01]
2506	strh	r3, [r0, #0x05]
2507	strb	r1, [r0, #0x07]
2508	RET
2509	LMEMCPY_8_PAD
2510
2511/******************************************************************************
2512 * Special case for 12 byte copies (16 alignment cases, each padded to one 128-byte slot)
2513 */
2514#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
2515#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
2516	LMEMCPY_C_PAD
2517.Lmemcpy_c:
2518	and	r2, r1, #0x03		/* r2 = src & 3 */
2519	orr	r2, r2, r0, lsl #2	/* r2 |= dst << 2 (upper bits masked below) */
2520	ands	r2, r2, #0x0f		/* r2 = case index (dst&3)<<2 | (src&3); Z set for case 0000 */
2521	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c (pc reads as current insn + 8) */
2522	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2	/* jump into the 128-byte slot table; case 0 falls through */
2523
2524/*
2525 * 0000: dst is 32-bit aligned, src is 32-bit aligned (three straight word copies)
2526 */
2527	ldr	r2, [r1]		/* bytes 0-3 */
2528	ldr	r3, [r1, #0x04]		/* bytes 4-7 */
2529	ldr	r1, [r1, #0x08]		/* bytes 8-11 */
2530	str	r2, [r0]
2531	str	r3, [r0, #0x04]
2532	str	r1, [r0, #0x08]
2533	RET
2534	LMEMCPY_C_PAD
2535
2536/*
2537 * 0001: dst is 32-bit aligned, src is 8-bit aligned (straddling word loads shifted into aligned word stores)
2538 */
2539	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
2540	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2541	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2542	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2543#ifdef __ARMEB__
2544	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
2545	str	r2, [r0, #0x08]		/* store bytes 8-11 */
2546	mov	r2, ip, lsr #24		/* r2 = ...7 */
2547	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
2548	mov	r1, r1, lsl #8		/* r1 = 012. */
2549	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
2550#else
2551	mov	r2, r2, lsl #24		/* r2 = B... */
2552	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
2553	str	r2, [r0, #0x08]		/* store bytes 8-11 */
2554	mov	r2, ip, lsl #24		/* r2 = 7... */
2555	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
2556	mov	r1, r1, lsr #8		/* r1 = .210 */
2557	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
2558#endif
2559	str	r2, [r0, #0x04]		/* store bytes 4-7 */
2560	str	r1, [r0]		/* store bytes 0-3 */
2561	RET
2562	LMEMCPY_C_PAD
2563
2564/*
2565 * 0010: dst is 32-bit aligned, src is 16-bit aligned (halfword-shifted merges into aligned word stores)
2566 */
2567	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2568	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2569	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2570	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2571#ifdef __ARMEB__
2572	mov	r2, r2, lsl #16		/* r2 = 01.. */
2573	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2574	str	r2, [r0]		/* store bytes 0-3 */
2575	mov	r3, r3, lsl #16		/* r3 = 45.. */
2576	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
2577	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
2578#else
2579	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2580	str	r2, [r0]		/* store bytes 0-3 */
2581	mov	r3, r3, lsr #16		/* r3 = ..54 */
2582	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
2583	mov	r1, r1, lsl #16		/* r1 = BA.. */
2584	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
2585#endif
2586	str	r3, [r0, #0x04]		/* store bytes 4-7 */
2587	str	r1, [r0, #0x08]		/* store bytes 8-11 */
2588	RET
2589	LMEMCPY_C_PAD
2590
2591/*
2592 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte-shifted merges into aligned word stores)
2593 */
2594	ldrb	r2, [r1]		/* r2 = ...0 */
2595	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2596	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2597	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2598#ifdef __ARMEB__
2599	mov	r2, r2, lsl #24		/* r2 = 0... */
2600	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
2601	str	r2, [r0]		/* store bytes 0-3 */
2602	mov	r3, r3, lsl #24		/* r3 = 4... */
2603	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
2604	mov	r1, r1, lsr #8		/* r1 = .9AB */
2605	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
2606#else
2607	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
2608	str	r2, [r0]		/* store bytes 0-3 */
2609	mov	r3, r3, lsr #24		/* r3 = ...4 */
2610	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
2611	mov	r1, r1, lsl #8		/* r1 = BA9. */
2612	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
2613#endif
2614	str	r3, [r0, #0x04]		/* store bytes 4-7 */
2615	str	r1, [r0, #0x08]		/* store bytes 8-11 */
2616	RET
2617	LMEMCPY_C_PAD
2618
2619/*
2620 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2621 */
2622	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2623	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2624	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
2625	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
2626	strh	r1, [r0, #0x01]		/* store bytes 1-2 */
2627#ifdef __ARMEB__
2628	mov	r1, r2, lsr #24		/* r1 = ...0 */
2629	strb	r1, [r0]		/* store byte 0 */
2630	mov	r1, r2, lsl #24		/* r1 = 3... */
2631	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
2632	mov	r1, r3, lsl #24		/* r1 = 7... */
2633	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
2634#else
2635	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2636	mov	r1, r2, lsr #24		/* r1 = ...3 */
2637	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
2638	mov	r1, r3, lsr #24		/* r1 = ...7 */
2639	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
2640	mov	ip, ip, lsr #24		/* ip = ...B */
2641#endif
2642	str	r2, [r0, #0x03]		/* store bytes 3-6 */
2643	str	r1, [r0, #0x07]		/* store bytes 7-10 */
2644	strb	ip, [r0, #0x0b]		/* store byte 11 (BE: low byte of ip = B) */
2645	RET
2646	LMEMCPY_C_PAD
2647
2648/*
2649 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1) — same offset, no shuffling
2650 */
2651	ldrb	r2, [r1]		/* byte 0 */
2652	ldrh	r3, [r1, #0x01]		/* bytes 1-2 */
2653	ldr	ip, [r1, #0x03]		/* bytes 3-6 */
2654	strb	r2, [r0]
2655	ldr	r2, [r1, #0x07]		/* bytes 7-10 */
2656	ldrb	r1, [r1, #0x0b]		/* byte 11 */
2657	strh	r3, [r0, #0x01]
2658	str	ip, [r0, #0x03]
2659	str	r2, [r0, #0x07]
2660	strb	r1, [r0, #0x0b]
2661	RET
2662	LMEMCPY_C_PAD
2663
2664/*
2665 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2666 */
2667	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2668	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2669	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2670	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2671#ifdef __ARMEB__
2672	mov	r2, r2, ror #8		/* r2 = 1..0 */
2673	strb	r2, [r0]		/* store byte 0 */
2674	mov	r2, r2, lsr #16		/* r2 = ..1. */
2675	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
2676	strh	r2, [r0, #0x01]		/* store bytes 1-2 */
2677	mov	r2, r3, lsl #8		/* r2 = 345. */
2678	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
2679	mov	r2, ip, lsl #8		/* r2 = 789. */
2680	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
2681#else
2682	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2683	mov	r2, r2, lsr #8		/* r2 = ...1 */
2684	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2685	strh	r2, [r0, #0x01]		/* store bytes 1-2 */
2686	mov	r2, r3, lsr #8		/* r2 = .543 */
2687	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
2688	mov	r2, ip, lsr #8		/* r2 = .987 */
2689	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
2690	mov	r1, r1, lsr #8		/* r1 = ...B */
2691#endif
2692	str	r3, [r0, #0x03]		/* store bytes 3-6 */
2693	str	r2, [r0, #0x07]		/* store bytes 7-10 */
2694	strb	r1, [r0, #0x0b]		/* store byte 11 */
2695	RET
2696	LMEMCPY_C_PAD
2697
2698/*
2699 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2700 */
2701	ldrb	r2, [r1]		/* r2 = ...0 */
2702	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2703	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2704	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2705	strb	r2, [r0]		/* store byte 0 */
2706#ifdef __ARMEB__
2707	mov	r2, r3, lsr #16		/* r2 = ..12 */
2708	strh	r2, [r0, #0x01]		/* store bytes 1-2 */
2709	mov	r3, r3, lsl #16		/* r3 = 34.. */
2710	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
2711	mov	ip, ip, lsl #16		/* ip = 78.. */
2712	orr	ip, ip, r1, lsr #16	/* ip = 789A */
2713	mov	r1, r1, lsr #8		/* r1 = .9AB */
2714#else
2715	strh	r3, [r0, #0x01]		/* store bytes 1-2 */
2716	mov	r3, r3, lsr #16		/* r3 = ..43 */
2717	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
2718	mov	ip, ip, lsr #16		/* ip = ..87 */
2719	orr	ip, ip, r1, lsl #16	/* ip = A987 */
2720	mov	r1, r1, lsr #16		/* r1 = ..xB */
2721#endif
2722	str	r3, [r0, #0x03]		/* store bytes 3-6 */
2723	str	ip, [r0, #0x07]		/* store bytes 7-10 */
2724	strb	r1, [r0, #0x0b]		/* store byte 11 */
2725	RET
2726	LMEMCPY_C_PAD
2727
2728/*
2729 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2730 */
2731	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
2732	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2733	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
2734	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2735#ifdef __ARMEB__
2736	strh	r1, [r0]		/* store bytes 0-1 */
2737	mov	r1, ip, lsl #16		/* r1 = 23.. */
2738	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
2739	mov	r3, r3, lsl #16		/* r3 = 67.. */
2740	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
2741#else
2742	strh	ip, [r0]		/* store bytes 0-1 */
2743	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
2744	mov	r3, r3, lsr #16		/* r3 = ..76 */
2745	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
2746	mov	r2, r2, lsr #16		/* r2 = ..BA */
2747#endif
2748	str	r1, [r0, #0x02]		/* store bytes 2-5 */
2749	str	r3, [r0, #0x06]		/* store bytes 6-9 */
2750	strh	r2, [r0, #0x0a]		/* store bytes 10-11 */
2751	RET
2752	LMEMCPY_C_PAD
2753
2754/*
2755 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2756 */
2757	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2758	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2759	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
2760	strh	ip, [r0]		/* store bytes 0-1 */
2761	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2762	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
2763#ifdef __ARMEB__
2764	mov	r2, r2, lsl #24		/* r2 = 2... */
2765	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
2766	mov	r3, r3, lsl #24		/* r3 = 6... */
2767	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
2768	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
2769#else
2770	mov	r2, r2, lsr #24		/* r2 = ...2 */
2771	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
2772	mov	r3, r3, lsr #24		/* r3 = ...6 */
2773	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
2774	mov	r1, r1, lsl #8		/* r1 = ..B. */
2775	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
2776#endif
2777	str	r2, [r0, #0x02]		/* store bytes 2-5 */
2778	str	r3, [r0, #0x06]		/* store bytes 6-9 */
2779	strh	r1, [r0, #0x0a]		/* store bytes 10-11 */
2780	RET
2781	LMEMCPY_C_PAD
2782
2783/*
2784 * 1010: dst is 16-bit aligned, src is 16-bit aligned (no byte-lane shuffling needed)
2785 */
2786	ldrh	r2, [r1]		/* bytes 0-1 */
2787	ldr	r3, [r1, #0x02]		/* bytes 2-5 */
2788	ldr	ip, [r1, #0x06]		/* bytes 6-9 */
2789	ldrh	r1, [r1, #0x0a]		/* bytes 10-11 */
2790	strh	r2, [r0]
2791	str	r3, [r0, #0x02]
2792	str	ip, [r0, #0x06]
2793	strh	r1, [r0, #0x0a]
2794	RET
2795	LMEMCPY_C_PAD
2796
2797/*
2798 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3) — stores issued back-to-front
2799 */
2800	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
2801	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
2802	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
2803	strh	ip, [r0, #0x0a]		/* store bytes 10-11 */
2804	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2805	ldrb	r1, [r1]		/* r1 = ...0 */
2806#ifdef __ARMEB__
2807	mov	r2, r2, lsr #24		/* r2 = ...9 */
2808	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
2809	mov	r3, r3, lsr #24		/* r3 = ...5 */
2810	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
2811	mov	r1, r1, lsl #8		/* r1 = ..0. */
2812	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
2813#else
2814	mov	r2, r2, lsl #24		/* r2 = 9... */
2815	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
2816	mov	r3, r3, lsl #24		/* r3 = 5... */
2817	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
2818	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
2819#endif
2820	str	r2, [r0, #0x06]		/* store bytes 6-9 */
2821	str	r3, [r0, #0x02]		/* store bytes 2-5 */
2822	strh	r1, [r0]		/* store bytes 0-1 */
2823	RET
2824	LMEMCPY_C_PAD
2825
2826/*
2827 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2828 */
2829	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2830	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
2831	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
2832#ifdef __ARMEB__
2833	mov	r3, r2, lsr #24		/* r3 = ...0 */
2834	strb	r3, [r0]		/* store byte 0 */
2835	mov	r2, r2, lsl #8		/* r2 = 123. */
2836	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
2837	str	r2, [r0, #0x01]		/* store bytes 1-4 */
2838	mov	r2, ip, lsl #8		/* r2 = 567. */
2839	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
2840	str	r2, [r0, #0x05]		/* store bytes 5-8 */
2841	mov	r2, r1, lsr #8		/* r2 = ..9A */
2842	strh	r2, [r0, #0x09]		/* store bytes 9-10 */
2843	strb	r1, [r0, #0x0b]		/* store byte 11 (low byte of r1) */
2844#else
2845	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2846	mov	r3, r2, lsr #8		/* r3 = .321 */
2847	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
2848	str	r3, [r0, #0x01]		/* store bytes 1-4 */
2849	mov	r3, ip, lsr #8		/* r3 = .765 */
2850	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
2851	str	r3, [r0, #0x05]		/* store bytes 5-8 */
2852	mov	r1, r1, lsr #8		/* r1 = .BA9 */
2853	strh	r1, [r0, #0x09]		/* store bytes 9-10 */
2854	mov	r1, r1, lsr #16		/* r1 = ...B */
2855	strb	r1, [r0, #0x0b]		/* store byte 11 */
2856#endif
2857	RET
2858	LMEMCPY_C_PAD
2859
2860/*
2861 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1) — stores issued back-to-front
2862 */
2863	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2864	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
2865	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2866	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2867	strb	r2, [r0, #0x0b]		/* store byte 11 */
2868#ifdef __ARMEB__
2869	strh	r3, [r0, #0x09]		/* store bytes 9-10 (low half of r3) */
2870	mov	r3, r3, lsr #16		/* r3 = ..78 */
2871	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
2872	mov	ip, ip, lsr #16		/* ip = ..34 */
2873	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
2874	mov	r1, r1, lsr #16		/* r1 = ..x0 */
2875#else
2876	mov	r2, r3, lsr #16		/* r2 = ..A9 */
2877	strh	r2, [r0, #0x09]		/* store bytes 9-10 */
2878	mov	r3, r3, lsl #16		/* r3 = 87.. */
2879	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
2880	mov	ip, ip, lsl #16		/* ip = 43.. */
2881	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
2882	mov	r1, r1, lsr #8		/* r1 = .210 */
2883#endif
2884	str	r3, [r0, #0x05]		/* store bytes 5-8 */
2885	str	ip, [r0, #0x01]		/* store bytes 1-4 */
2886	strb	r1, [r0]		/* store byte 0 */
2887	RET
2888	LMEMCPY_C_PAD
2889
2890/*
2891 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2892 * (the two endiannesses use fully separate sequences here)
2893 */
2894#ifdef __ARMEB__
2895	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
2896	ldr	ip, [r1, #0x06]		/* ip = 6789 */
2897	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
2898	ldrh	r1, [r1]		/* r1 = ..01 */
2899	strb	r2, [r0, #0x0b]		/* store byte 11 (low byte of r2) */
2900	mov	r2, r2, lsr #8		/* r2 = ...A */
2901	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
2902	mov	ip, ip, lsr #8		/* ip = .678 */
2903	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
2904	mov	r3, r3, lsr #8		/* r3 = .234 */
2905	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
2906	mov	r1, r1, lsr #8		/* r1 = ...0 */
2907	strb	r1, [r0]		/* store byte 0 */
2908	str	r3, [r0, #0x01]		/* store bytes 1-4 */
2909	str	ip, [r0, #0x05]		/* store bytes 5-8 */
2910	strh	r2, [r0, #0x09]		/* store bytes 9-10 */
2911#else
2912	ldrh	r2, [r1]		/* r2 = ..10 */
2913	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
2914	ldr	ip, [r1, #0x06]		/* ip = 9876 */
2915	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
2916	strb	r2, [r0]		/* store byte 0 (low byte of r2) */
2917	mov	r2, r2, lsr #8		/* r2 = ...1 */
2918	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2919	mov	r3, r3, lsr #24		/* r3 = ...5 */
2920	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
2921	mov	ip, ip, lsr #24		/* ip = ...9 */
2922	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
2923	mov	r1, r1, lsr #8		/* r1 = ...B */
2924	str	r2, [r0, #0x01]		/* store bytes 1-4 */
2925	str	r3, [r0, #0x05]		/* store bytes 5-8 */
2926	strh	ip, [r0, #0x09]		/* store bytes 9-10 */
2927	strb	r1, [r0, #0x0b]		/* store byte 11 */
2928#endif
2929	RET
2930
2931/*
2932 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3) — same offset, no shuffling
2933 */
2934	ldrb	r2, [r1]		/* byte 0 */
2935	ldr	r3, [r1, #0x01]		/* bytes 1-4 */
2936	ldr	ip, [r1, #0x05]		/* bytes 5-8 */
2937	strb	r2, [r0]
2938	ldrh	r2, [r1, #0x09]		/* bytes 9-10 */
2939	ldrb	r1, [r1, #0x0b]		/* byte 11 */
2940	str	r3, [r0, #0x01]
2941	str	ip, [r0, #0x05]
2942	strh	r2, [r0, #0x09]
2943	strb	r1, [r0, #0x0b]
2944	RET
2945END(memcpy)
2946#endif /* _ARM_ARCH_5E */
2947
2948#ifdef GPROF
2949/* NOTE(review): empty one-instruction anchor functions; the names suggest
 * these are gprof profiling boundary symbols (user time, trap entry/exit,
 * interrupt entry/exit) — confirm against the kernel profiling code. */
2950ENTRY(user)
2951	nop
2952END(user)
2953ENTRY(btrap)
2954	nop
2955END(btrap)
2956ENTRY(etrap)
2957	nop
2958END(etrap)
2959ENTRY(bintr)
2960	nop
2961END(bintr)
2962ENTRY(eintr)
2963	nop
2964END(eintr)
2965#endif
2966