chacha-armv8.pl revision d6ebbcc6a2816c252717bea731e74e1d39d06dac
1#! /usr/bin/env perl
2# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# June 2015
18#
19# ChaCha20 for ARMv8.
20#
21# Performance in cycles per byte out of large buffer.
22#
23#			IALU/gcc-4.9    3xNEON+1xIALU	6xNEON+2xIALU
24#
25# Apple A7		5.50/+49%       3.33            1.70
26# Cortex-A53		8.40/+80%       4.72		4.72(*)
27# Cortex-A57		8.06/+43%       4.90            4.43(**)
28# Denver		4.50/+82%       2.63		2.67(*)
29# X-Gene		9.50/+46%       8.82		8.89(*)
30# Mongoose		8.00/+44%	3.64		3.25
31# Kryo			8.17/+50%	4.83		4.65
32#
33# (*)	it's expected that doubling interleave factor doesn't help
34#	all processors, only those with higher NEON latency and
35#	higher instruction issue rate;
36# (**)	expected improvement was actually higher;
37
# Command line: <flavour> <output-file>; both are forwarded verbatim to
# the arm-xlate.pl translator that post-processes everything we print.
$flavour=shift;
$output=shift;

# Locate arm-xlate.pl next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all output through the translator.  Fail loudly if the pipe
# cannot be started, instead of silently producing an empty .S file.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
48
# Catch-all for undefined subs: a call like &add_32('x0','x1','x2')
# appends the assembly line "\tadd.32\tx0,x1,x2" to $code.  The final
# post-processing loop later maps these pseudo-mnemonics to real A64.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;	# strip package prefix; '_' -> '.' (add_32 -> add.32)
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);	# last argument that round-trips through numification is an immediate: prefix '#'
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
55
# Register allocation for the scalar code path.
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));	# function arguments

my @x=map("x$_",(5..17,19..21));	# sixteen 32-bit working state words (x18 avoided)
my @d=map("x$_",(22..28,30));		# original key block, packed two 32-bit words per 64-bit register
60
# Emit four independent ChaCha quarter-rounds, 4-way interleaved for
# instruction-level parallelism.  The caller supplies one (a,b,c,d)
# index tuple; the other three are derived by rotating the index within
# its row of the 4x4 state (the &~3 / &3 arithmetic below).  Thus
# ROUND(0,4,8,12) produces all four column rounds and ROUND(0,5,10,15)
# all four diagonal rounds.  Returns a list of strings that, when
# eval'ed, append instructions to $code via AUTOLOAD.  Note the
# right-rotations: ror #16/#20/#24/#25 are the rol #16/#12/#8/#7 of the
# ChaCha specification.
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	# a += b; d ^= a; d >>>= 16
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	# c += d; b ^= c; b >>>= 20  (== rol 12)
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	# a += b; d ^= a; d >>>= 24  (== rol 8)
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	# c += d; b ^= c; b >>>= 25  (== rol 7)
	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
121
# Scalar code path: ChaCha20_ctr32(out,inp,len,key,ctr).  The whole
# 512-bit state lives in general-purpose registers (@x working words,
# @d packed key block).  For len>=192, if OPENSSL_armcap_P advertises
# NEON, control transfers to ChaCha20_neon instead.  The heredoc text
# below is emitted verbatim (modulo interpolation) and must not change.
$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
	# 10 iterations of column round + diagonal round == 20 ChaCha rounds.
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
# Feed-forward the key block, xor with input; tails shorter than 64
# bytes go byte-by-byte through a stack buffer that is wiped afterwards.
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
341
{{{
# NEON register allocation: three parallel 4x4 states (A/B/C/D rows,
# suffix 0..2) plus scratch vectors; @K holds the expanded key block
# and per-state counters; $ONE is the counter increment vector.
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
347
# Emit one NEON quarter-round over a whole 4x4 state held as four row
# vectors a,b,c,d ($t is scratch): the four columns are processed in
# parallel.  Rotates: rev32 on 16-bit lanes is rol 16; the ushr+sli
# pairs synthesize rol 12, rol 8 and rol 7.  The trailing ext's rotate
# the b/c/d rows so the next call operates on diagonals when $odd is
# false here (and back on columns when $odd is true).  Returns a list
# of strings to be eval'ed.
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
377
# 3xNEON+1xIALU code path: processes 256 bytes per outer iteration
# (three NEON states interleaved with one scalar state).  Buffers of
# 512 bytes or more divert to the 6xNEON+2xIALU routine below.  The
# heredoc text is emitted verbatim and must not change.
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	 mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	 mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	 mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	 mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	 mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	 mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	 mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	 mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	# Interleave three NEON states with the scalar state, column pass.
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	# Same interleave, diagonal pass.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	 add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	 add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	 add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	 add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	 add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	 add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	 add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	 add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	 add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	 add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	 add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
# 6xNEON+2xIALU code path: six NEON states plus one scalar state per
# outer iteration == 512 bytes.  @K doubles as scratch ($T0..$T5) here;
# the invariant part of the key block is off-loaded to the stack and
# reloaded when needed.  Heredoc text is emitted verbatim.
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	# Six NEON states interleaved with one scalar state, column pass.
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;	# NOTE(review): $diff and $i appear unused leftovers
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	# Diagonal pass.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	# Second half of the rounds for the same six NEON states while the
	# scalar state processes its next 64-byte block: column pass.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	# Diagonal pass.
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
1130
# Post-process the generated text: resolve `...` escapes, then map the
# pseudo-mnemonics used above onto real A64 syntax:
#  - "op.32 x<n>,..."          -> 32-bit form "op w<n>,..."
#  - eor/ext/mov on .4s lanes  -> .16b (bitwise ops, lane size moot)
#  - "ld1.8"/"st1.8" with .4s  -> ld1/st1 on .16b (byte-granular)
#  - ldr/str/ldp/stp of v<n>.4s -> q<n> register syntax
#  - "rev32.16" with .4s       -> rev32 on .8h (16-bit-lane swap)
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
# STDOUT is a pipe into arm-xlate.pl; buffered-write and child-exit
# errors only surface at close, so a silent close could hide a
# truncated or missing .S file.
close STDOUT or die "error closing STDOUT: $!";
1145