1f3cd074jkim#! /usr/bin/env perl
21661cedjkim# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3f3cd074jkim#
4f3cd074jkim# Licensed under the OpenSSL license (the "License").  You may not use
5f3cd074jkim# this file except in compliance with the License.  You can obtain a copy
6f3cd074jkim# in the file LICENSE in the source distribution or at
7f3cd074jkim# https://www.openssl.org/source/license.html
8f3cd074jkim
9f3cd074jkim#
10f3cd074jkim# ====================================================================
11f3cd074jkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12f3cd074jkim# project. The module is, however, dual licensed under OpenSSL and
13f3cd074jkim# CRYPTOGAMS licenses depending on where you obtain it. For further
14f3cd074jkim# details see http://www.openssl.org/~appro/cryptogams/.
15f3cd074jkim# ====================================================================
16f3cd074jkim#
17f3cd074jkim# June 2015
18f3cd074jkim#
19f3cd074jkim# ChaCha20 for ARMv8.
20f3cd074jkim#
21f3cd074jkim# Performance in cycles per byte out of large buffer.
22f3cd074jkim#
23f3cd074jkim#			IALU/gcc-4.9    3xNEON+1xIALU	6xNEON+2xIALU
24f3cd074jkim#
25f3cd074jkim# Apple A7		5.50/+49%       3.33            1.70
26f3cd074jkim# Cortex-A53		8.40/+80%       4.72		4.72(*)
27f3cd074jkim# Cortex-A57		8.06/+43%       4.90            4.43(**)
28f3cd074jkim# Denver		4.50/+82%       2.63		2.67(*)
29f3cd074jkim# X-Gene		9.50/+46%       8.82		8.89(*)
30f3cd074jkim# Mongoose		8.00/+44%	3.64		3.25
31f3cd074jkim# Kryo			8.17/+50%	4.83		4.65
32f3cd074jkim#
33f3cd074jkim# (*)	it's expected that doubling interleave factor doesn't help
34f3cd074jkim#	all processors, only those with higher NEON latency and
35f3cd074jkim#	higher instruction issue rate;
36f3cd074jkim# (**)	expected improvement was actually higher;
37f3cd074jkim
# Command line: <flavour> <output>.  Both values are forwarded unchanged to
# arm-xlate.pl on the command line built below.
$flavour=shift;
$output=shift;

# Directory part of this script's path, including the trailing separator.
# When $0 carries no directory component the match fails, so fall back to
# an explicit empty string rather than reading a stale/undefined $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The command string is the same as before; the only change is that a failed
# spawn now aborts instead of silently producing no output.
open(OUT, '|-', "\"$^X\" $xlate $flavour $output")
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
48f3cd074jkim
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{
    # Any call to an undefined sub lands here.  The sub's name becomes the
    # mnemonic: the package qualifier is dropped and the first "_" is turned
    # into "." (so add_32 is emitted as add.32).
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
    $mnemonic =~ s/_/\./;

    # Only the final operand is inspected: if it round-trips through numeric
    # conversion unchanged it is an immediate and gets a "#" prefix.
    my $last_operand = pop;
    $last_operand = "#$last_operand" if ($last_operand*1 eq $last_operand);

    # Append one tab-indented instruction line to the global $code buffer.
    $code .= "\t$mnemonic\t" . join(',', @_, $last_operand) . "\n";
}
55f3cd074jkim
# Function arguments arrive in x0..x4.
my ($out,$inp,$len,$key,$ctr) = map { sprintf("x%d", $_) } 0 .. 4;

# @x: sixteen general-purpose registers (x5-x17, x19-x21; x18 is skipped)
# holding the working state words.
my @x = map { sprintf("x%d", $_) } (5 .. 17, 19 .. 21);
# @d: eight registers (x22-x28, x30; x29 is skipped) holding the packed
# key block.
my @d = map { sprintf("x%d", $_) } (22 .. 28, 30);
60f3cd074jkim
sub ROUND {
    # Build the instruction list for four interleaved scalar quarter-rounds.
    # The arguments are the @x indices (a,b,c,d) of the first quarter-round;
    # the other three are derived by stepping each index to the next slot
    # within its group of four.  The result is a list of strings, each of
    # which is meant to be eval'ed to emit one instruction.
    my @quarter = ([@_[0 .. 3]]);
    foreach my $n (1 .. 3) {
        push @quarter, [ map { ($_ & ~3) + (($_ + 1) & 3) } @{ $quarter[$n - 1] } ];
    }

    # One row per step, as positions within (a,b,c,d) = (0,1,2,3):
    #   [ accumulator, addend, word that is xor'ed and rotated, rotate count ]
    my @step = (
        [ 0, 1, 3, 16 ],
        [ 2, 3, 1, 20 ],
        [ 0, 1, 3, 24 ],
        [ 2, 3, 1, 25 ],
    );

    my @insn;
    foreach my $s (@step) {
        my ($acc, $addend, $mix, $rot) = @$s;

        # Same operation for all four quarter-rounds before moving on, so
        # that consecutive instructions are independent of each other.
        foreach my $q (@quarter) {
            my ($sum, $term) = @x[ $q->[$acc], $q->[$addend] ];
            push @insn, "&add_32\t($sum,$sum,$term)";
        }
        foreach my $q (@quarter) {
            my ($dst, $src) = @x[ $q->[$mix], $q->[$acc] ];
            push @insn, "&eor_32\t($dst,$dst,$src)";
        }
        foreach my $q (@quarter) {
            my ($dst) = @x[ $q->[$mix] ];
            push @insn, "&ror_32\t($dst,$dst,$rot)";
        }
    }

    # Returned as a slice so that scalar context still yields the last
    # element, as a list literal would.
    return @insn[0 .. $#insn];
}
121f3cd074jkim
# Append the constant pool and the start of ChaCha20_ctr32 to $code.
# The entry point returns at once when the length is zero, and branches to
# ChaCha20_neon when the length is at least 192 and the ARMV7_NEON bit is set
# in OPENSSL_armcap_P.  Otherwise the code at .Lshort saves x19-x28, loads
# sigma, key and counter into @d, and each pass of .Loop_outer unpacks them
# into the sixteen words held in @x.  The heredoc stops just inside .Loop;
# the round instructions are appended by the statements that follow it.
122f3cd074jkim$code.=<<___;
123f3cd074jkim#include "arm_arch.h"
124f3cd074jkim
125f3cd074jkim.text
126f3cd074jkim
127f3cd074jkim.extern	OPENSSL_armcap_P
128f3cd074jkim
129f3cd074jkim.align	5
130f3cd074jkim.Lsigma:
131f3cd074jkim.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
132f3cd074jkim.Lone:
133f3cd074jkim.long	1,0,0,0
134f3cd074jkim.LOPENSSL_armcap_P:
135f3cd074jkim#ifdef	__ILP32__
136f3cd074jkim.long	OPENSSL_armcap_P-.
137f3cd074jkim#else
138f3cd074jkim.quad	OPENSSL_armcap_P-.
139f3cd074jkim#endif
140f3cd074jkim.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
141f3cd074jkim
142f3cd074jkim.globl	ChaCha20_ctr32
143f3cd074jkim.type	ChaCha20_ctr32,%function
144f3cd074jkim.align	5
145f3cd074jkimChaCha20_ctr32:
146f3cd074jkim	cbz	$len,.Labort
147f3cd074jkim	adr	@x[0],.LOPENSSL_armcap_P
148f3cd074jkim	cmp	$len,#192
149f3cd074jkim	b.lo	.Lshort
150f3cd074jkim#ifdef	__ILP32__
151f3cd074jkim	ldrsw	@x[1],[@x[0]]
152f3cd074jkim#else
153f3cd074jkim	ldr	@x[1],[@x[0]]
154f3cd074jkim#endif
155f3cd074jkim	ldr	w17,[@x[1],@x[0]]
156f3cd074jkim	tst	w17,#ARMV7_NEON
157f3cd074jkim	b.ne	ChaCha20_neon
158f3cd074jkim
159f3cd074jkim.Lshort:
160cfa887fjkim	.inst	0xd503233f			// paciasp
161f3cd074jkim	stp	x29,x30,[sp,#-96]!
162f3cd074jkim	add	x29,sp,#0
163f3cd074jkim
164f3cd074jkim	adr	@x[0],.Lsigma
165f3cd074jkim	stp	x19,x20,[sp,#16]
166f3cd074jkim	stp	x21,x22,[sp,#32]
167f3cd074jkim	stp	x23,x24,[sp,#48]
168f3cd074jkim	stp	x25,x26,[sp,#64]
169f3cd074jkim	stp	x27,x28,[sp,#80]
170f3cd074jkim	sub	sp,sp,#64
171f3cd074jkim
172f3cd074jkim	ldp	@d[0],@d[1],[@x[0]]		// load sigma
173f3cd074jkim	ldp	@d[2],@d[3],[$key]		// load key
174f3cd074jkim	ldp	@d[4],@d[5],[$key,#16]
175f3cd074jkim	ldp	@d[6],@d[7],[$ctr]		// load counter
176f3cd074jkim#ifdef	__ARMEB__
177f3cd074jkim	ror	@d[2],@d[2],#32
178f3cd074jkim	ror	@d[3],@d[3],#32
179f3cd074jkim	ror	@d[4],@d[4],#32
180f3cd074jkim	ror	@d[5],@d[5],#32
181f3cd074jkim	ror	@d[6],@d[6],#32
182f3cd074jkim	ror	@d[7],@d[7],#32
183f3cd074jkim#endif
184f3cd074jkim
185f3cd074jkim.Loop_outer:
186f3cd074jkim	mov.32	@x[0],@d[0]			// unpack key block
187f3cd074jkim	lsr	@x[1],@d[0],#32
188f3cd074jkim	mov.32	@x[2],@d[1]
189f3cd074jkim	lsr	@x[3],@d[1],#32
190f3cd074jkim	mov.32	@x[4],@d[2]
191f3cd074jkim	lsr	@x[5],@d[2],#32
192f3cd074jkim	mov.32	@x[6],@d[3]
193f3cd074jkim	lsr	@x[7],@d[3],#32
194f3cd074jkim	mov.32	@x[8],@d[4]
195f3cd074jkim	lsr	@x[9],@d[4],#32
196f3cd074jkim	mov.32	@x[10],@d[5]
197f3cd074jkim	lsr	@x[11],@d[5],#32
198f3cd074jkim	mov.32	@x[12],@d[6]
199f3cd074jkim	lsr	@x[13],@d[6],#32
200f3cd074jkim	mov.32	@x[14],@d[7]
201f3cd074jkim	lsr	@x[15],@d[7],#32
202f3cd074jkim
203f3cd074jkim	mov	$ctr,#10
204f3cd074jkim	subs	$len,$len,#64
205f3cd074jkim.Loop:
206f3cd074jkim	sub	$ctr,$ctr,#1
207f3cd074jkim___
208f3cd074jkim	foreach (&ROUND(0, 4, 8,12)) { eval; }
209f3cd074jkim	foreach (&ROUND(0, 5,10,15)) { eval; }
# Close .Loop (the counter was set to 10 before the loop) and finish
# ChaCha20_ctr32: the original key block is added back in, then the code
# either XORs a full 64-byte block of input, stores it, bumps the counter
# and loops, or takes .Ltail for a final short block.  The b.lo/b.hi
# branches consume the flags left by the "subs $len,$len,#64" issued before
# .Loop; none of the instructions emitted in between use a flag-setting
# mnemonic.  The tail path parks the keystream block on the stack, XORs it
# into the output byte by byte, and zeroes the stack copy before returning.
210f3cd074jkim$code.=<<___;
211f3cd074jkim	cbnz	$ctr,.Loop
212f3cd074jkim
213f3cd074jkim	add.32	@x[0],@x[0],@d[0]		// accumulate key block
214f3cd074jkim	add	@x[1],@x[1],@d[0],lsr#32
215f3cd074jkim	add.32	@x[2],@x[2],@d[1]
216f3cd074jkim	add	@x[3],@x[3],@d[1],lsr#32
217f3cd074jkim	add.32	@x[4],@x[4],@d[2]
218f3cd074jkim	add	@x[5],@x[5],@d[2],lsr#32
219f3cd074jkim	add.32	@x[6],@x[6],@d[3]
220f3cd074jkim	add	@x[7],@x[7],@d[3],lsr#32
221f3cd074jkim	add.32	@x[8],@x[8],@d[4]
222f3cd074jkim	add	@x[9],@x[9],@d[4],lsr#32
223f3cd074jkim	add.32	@x[10],@x[10],@d[5]
224f3cd074jkim	add	@x[11],@x[11],@d[5],lsr#32
225f3cd074jkim	add.32	@x[12],@x[12],@d[6]
226f3cd074jkim	add	@x[13],@x[13],@d[6],lsr#32
227f3cd074jkim	add.32	@x[14],@x[14],@d[7]
228f3cd074jkim	add	@x[15],@x[15],@d[7],lsr#32
229f3cd074jkim
230f3cd074jkim	b.lo	.Ltail
231f3cd074jkim
232f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
233f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
234f3cd074jkim	ldp	@x[1],@x[3],[$inp,#0]		// load input
235f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
236f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
237f3cd074jkim	ldp	@x[5],@x[7],[$inp,#16]
238f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
239f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
240f3cd074jkim	ldp	@x[9],@x[11],[$inp,#32]
241f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
242f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
243f3cd074jkim	ldp	@x[13],@x[15],[$inp,#48]
244f3cd074jkim	add	$inp,$inp,#64
245f3cd074jkim#ifdef	__ARMEB__
246f3cd074jkim	rev	@x[0],@x[0]
247f3cd074jkim	rev	@x[2],@x[2]
248f3cd074jkim	rev	@x[4],@x[4]
249f3cd074jkim	rev	@x[6],@x[6]
250f3cd074jkim	rev	@x[8],@x[8]
251f3cd074jkim	rev	@x[10],@x[10]
252f3cd074jkim	rev	@x[12],@x[12]
253f3cd074jkim	rev	@x[14],@x[14]
254f3cd074jkim#endif
255f3cd074jkim	eor	@x[0],@x[0],@x[1]
256f3cd074jkim	eor	@x[2],@x[2],@x[3]
257f3cd074jkim	eor	@x[4],@x[4],@x[5]
258f3cd074jkim	eor	@x[6],@x[6],@x[7]
259f3cd074jkim	eor	@x[8],@x[8],@x[9]
260f3cd074jkim	eor	@x[10],@x[10],@x[11]
261f3cd074jkim	eor	@x[12],@x[12],@x[13]
262f3cd074jkim	eor	@x[14],@x[14],@x[15]
263f3cd074jkim
264f3cd074jkim	stp	@x[0],@x[2],[$out,#0]		// store output
265f3cd074jkim	 add	@d[6],@d[6],#1			// increment counter
266f3cd074jkim	stp	@x[4],@x[6],[$out,#16]
267f3cd074jkim	stp	@x[8],@x[10],[$out,#32]
268f3cd074jkim	stp	@x[12],@x[14],[$out,#48]
269f3cd074jkim	add	$out,$out,#64
270f3cd074jkim
271f3cd074jkim	b.hi	.Loop_outer
272f3cd074jkim
273f3cd074jkim	ldp	x19,x20,[x29,#16]
274f3cd074jkim	add	sp,sp,#64
275f3cd074jkim	ldp	x21,x22,[x29,#32]
276f3cd074jkim	ldp	x23,x24,[x29,#48]
277f3cd074jkim	ldp	x25,x26,[x29,#64]
278f3cd074jkim	ldp	x27,x28,[x29,#80]
279f3cd074jkim	ldp	x29,x30,[sp],#96
280cfa887fjkim	.inst	0xd50323bf			// autiasp
281f3cd074jkim.Labort:
282f3cd074jkim	ret
283f3cd074jkim
284f3cd074jkim.align	4
285f3cd074jkim.Ltail:
286f3cd074jkim	add	$len,$len,#64
287f3cd074jkim.Less_than_64:
288f3cd074jkim	sub	$out,$out,#1
289f3cd074jkim	add	$inp,$inp,$len
290f3cd074jkim	add	$out,$out,$len
291f3cd074jkim	add	$ctr,sp,$len
292f3cd074jkim	neg	$len,$len
293f3cd074jkim
294f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
295f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
296f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
297f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
298f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
299f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
300f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
301f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
302f3cd074jkim#ifdef	__ARMEB__
303f3cd074jkim	rev	@x[0],@x[0]
304f3cd074jkim	rev	@x[2],@x[2]
305f3cd074jkim	rev	@x[4],@x[4]
306f3cd074jkim	rev	@x[6],@x[6]
307f3cd074jkim	rev	@x[8],@x[8]
308f3cd074jkim	rev	@x[10],@x[10]
309f3cd074jkim	rev	@x[12],@x[12]
310f3cd074jkim	rev	@x[14],@x[14]
311f3cd074jkim#endif
312f3cd074jkim	stp	@x[0],@x[2],[sp,#0]
313f3cd074jkim	stp	@x[4],@x[6],[sp,#16]
314f3cd074jkim	stp	@x[8],@x[10],[sp,#32]
315f3cd074jkim	stp	@x[12],@x[14],[sp,#48]
316f3cd074jkim
317f3cd074jkim.Loop_tail:
318f3cd074jkim	ldrb	w10,[$inp,$len]
319f3cd074jkim	ldrb	w11,[$ctr,$len]
320f3cd074jkim	add	$len,$len,#1
321f3cd074jkim	eor	w10,w10,w11
322f3cd074jkim	strb	w10,[$out,$len]
323f3cd074jkim	cbnz	$len,.Loop_tail
324f3cd074jkim
325f3cd074jkim	stp	xzr,xzr,[sp,#0]
326f3cd074jkim	stp	xzr,xzr,[sp,#16]
327f3cd074jkim	stp	xzr,xzr,[sp,#32]
328f3cd074jkim	stp	xzr,xzr,[sp,#48]
329f3cd074jkim
330f3cd074jkim	ldp	x19,x20,[x29,#16]
331f3cd074jkim	add	sp,sp,#64
332f3cd074jkim	ldp	x21,x22,[x29,#32]
333f3cd074jkim	ldp	x23,x24,[x29,#48]
334f3cd074jkim	ldp	x25,x26,[x29,#64]
335f3cd074jkim	ldp	x27,x28,[x29,#80]
336f3cd074jkim	ldp	x29,x30,[sp],#96
337cfa887fjkim	.inst	0xd50323bf			// autiasp
338f3cd074jkim	ret
339f3cd074jkim.size	ChaCha20_ctr32,.-ChaCha20_ctr32
340f3cd074jkim___
341f3cd074jkim
342f3cd074jkim{{{
# NEON register names, all viewed as four 32-bit lanes (.4s).
#   @K   : v24-v30
#   $ONE : v31
my @K = map { sprintf("v%d.4s", $_) } 24 .. 30;
my $ONE = "v31.4s";

# Three sets of state rows (A,B,C,D) in v0-v7 and v16-v19, plus four
# temporaries in v20-v23; v8-v15 are not used here.
my ($A0,$B0,$C0,$D0,
    $A1,$B1,$C1,$D1,
    $A2,$B2,$C2,$D2,
    $T0,$T1,$T2,$T3) = map { sprintf("v%d.4s", $_) } (0 .. 7, 16 .. 23);
347f3cd074jkim
sub NEONROUND {
    # Build the instruction list for one NEON round on a single block.
    # Arguments: the four row registers (a,b,c,d), a scratch register, and
    # last a flag selecting the lane shuffle emitted at the end.  Returns a
    # list of strings, each meant to be eval'ed to emit one instruction.
    my $odd = pop;
    my ($ra, $rb, $rc, $rd, $tmp) = @_;

    # Rotate-left by $n of $src into $dst, expressed as a right shift by
    # 32-$n followed by a shift-left-and-insert by $n.
    my $rotl = sub {
        my ($dst, $src, $n) = @_;
        my $right = 32 - $n;
        return ("&ushr\t\t('$dst','$src',$right)",
                "&sli\t\t('$dst','$src',$n)");
    };

    my @insn = (
        "&add\t\t('$ra','$ra','$rb')",
        "&eor\t\t('$rd','$rd','$ra')",
        "&rev32_16\t('$rd','$rd')",	# vrot ($rd,16)
    );

    push @insn, "&add\t\t('$rc','$rc','$rd')",
                "&eor\t\t('$tmp','$rb','$rc')",
                $rotl->($rb, $tmp, 12);

    push @insn, "&add\t\t('$ra','$ra','$rb')",
                "&eor\t\t('$tmp','$rd','$ra')",
                $rotl->($rd, $tmp, 8);

    push @insn, "&add\t\t('$rc','$rc','$rd')",
                "&eor\t\t('$tmp','$rb','$rc')",
                $rotl->($rb, $tmp, 7);

    # Lane shuffles.  The flag value is interpolated here, so the ?: is
    # resolved only when the string is eval'ed.
    push @insn, "&ext\t\t('$rc','$rc','$rc',8)",
                "&ext\t\t('$rd','$rd','$rd',$odd?4:12)",
                "&ext\t\t('$rb','$rb','$rb',$odd?12:4)";

    # Returned as a slice so that scalar context still yields the last
    # element, as a list literal would.
    return @insn[0 .. $#insn];
}
377f3cd074jkim
# Append the start of ChaCha20_neon to $code: the same prologue as the
# scalar routine, with an early branch to .L512_or_more_neon when the length
# is 512 or more.  Each pass of .Loop_outer_neon covers 256 bytes: three
# blocks in NEON registers (counter rows @K[3], @K[4], @K[5], each one more
# than the previous) and one block in the general-purpose registers @x.
# $ONE is loaded from the word after .Lsigma (.Lone) and then shifted left
# by 2 so that it later advances the vector counters by 4.  The heredoc
# stops just inside .Loop_neon; the round instructions follow.
378f3cd074jkim$code.=<<___;
379f3cd074jkim
380f3cd074jkim.type	ChaCha20_neon,%function
381f3cd074jkim.align	5
382f3cd074jkimChaCha20_neon:
383cfa887fjkim	.inst	0xd503233f			// paciasp
384f3cd074jkim	stp	x29,x30,[sp,#-96]!
385f3cd074jkim	add	x29,sp,#0
386f3cd074jkim
387f3cd074jkim	adr	@x[0],.Lsigma
388f3cd074jkim	stp	x19,x20,[sp,#16]
389f3cd074jkim	stp	x21,x22,[sp,#32]
390f3cd074jkim	stp	x23,x24,[sp,#48]
391f3cd074jkim	stp	x25,x26,[sp,#64]
392f3cd074jkim	stp	x27,x28,[sp,#80]
393f3cd074jkim	cmp	$len,#512
394f3cd074jkim	b.hs	.L512_or_more_neon
395f3cd074jkim
396f3cd074jkim	sub	sp,sp,#64
397f3cd074jkim
398f3cd074jkim	ldp	@d[0],@d[1],[@x[0]]		// load sigma
399f3cd074jkim	ld1	{@K[0]},[@x[0]],#16
400f3cd074jkim	ldp	@d[2],@d[3],[$key]		// load key
401f3cd074jkim	ldp	@d[4],@d[5],[$key,#16]
402f3cd074jkim	ld1	{@K[1],@K[2]},[$key]
403f3cd074jkim	ldp	@d[6],@d[7],[$ctr]		// load counter
404f3cd074jkim	ld1	{@K[3]},[$ctr]
405f3cd074jkim	ld1	{$ONE},[@x[0]]
406f3cd074jkim#ifdef	__ARMEB__
407f3cd074jkim	rev64	@K[0],@K[0]
408f3cd074jkim	ror	@d[2],@d[2],#32
409f3cd074jkim	ror	@d[3],@d[3],#32
410f3cd074jkim	ror	@d[4],@d[4],#32
411f3cd074jkim	ror	@d[5],@d[5],#32
412f3cd074jkim	ror	@d[6],@d[6],#32
413f3cd074jkim	ror	@d[7],@d[7],#32
414f3cd074jkim#endif
415f3cd074jkim	add	@K[3],@K[3],$ONE		// += 1
416f3cd074jkim	add	@K[4],@K[3],$ONE
417f3cd074jkim	add	@K[5],@K[4],$ONE
418f3cd074jkim	shl	$ONE,$ONE,#2			// 1 -> 4
419f3cd074jkim
420f3cd074jkim.Loop_outer_neon:
421f3cd074jkim	mov.32	@x[0],@d[0]			// unpack key block
422f3cd074jkim	lsr	@x[1],@d[0],#32
423f3cd074jkim	 mov	$A0,@K[0]
424f3cd074jkim	mov.32	@x[2],@d[1]
425f3cd074jkim	lsr	@x[3],@d[1],#32
426f3cd074jkim	 mov	$A1,@K[0]
427f3cd074jkim	mov.32	@x[4],@d[2]
428f3cd074jkim	lsr	@x[5],@d[2],#32
429f3cd074jkim	 mov	$A2,@K[0]
430f3cd074jkim	mov.32	@x[6],@d[3]
431f3cd074jkim	 mov	$B0,@K[1]
432f3cd074jkim	lsr	@x[7],@d[3],#32
433f3cd074jkim	 mov	$B1,@K[1]
434f3cd074jkim	mov.32	@x[8],@d[4]
435f3cd074jkim	 mov	$B2,@K[1]
436f3cd074jkim	lsr	@x[9],@d[4],#32
437f3cd074jkim	 mov	$D0,@K[3]
438f3cd074jkim	mov.32	@x[10],@d[5]
439f3cd074jkim	 mov	$D1,@K[4]
440f3cd074jkim	lsr	@x[11],@d[5],#32
441f3cd074jkim	 mov	$D2,@K[5]
442f3cd074jkim	mov.32	@x[12],@d[6]
443f3cd074jkim	 mov	$C0,@K[2]
444f3cd074jkim	lsr	@x[13],@d[6],#32
445f3cd074jkim	 mov	$C1,@K[2]
446f3cd074jkim	mov.32	@x[14],@d[7]
447f3cd074jkim	 mov	$C2,@K[2]
448f3cd074jkim	lsr	@x[15],@d[7],#32
449f3cd074jkim
450f3cd074jkim	mov	$ctr,#10
451f3cd074jkim	subs	$len,$len,#256
452f3cd074jkim.Loop_neon:
453f3cd074jkim	sub	$ctr,$ctr,#1
454f3cd074jkim___
455f3cd074jkim	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
456f3cd074jkim	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
457f3cd074jkim	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
458f3cd074jkim	my @thread3=&ROUND(0,4,8,12);
459f3cd074jkim
460f3cd074jkim	foreach (@thread0) {
461f3cd074jkim		eval;			eval(shift(@thread3));
462f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread3));
463f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread3));
464f3cd074jkim	}
465f3cd074jkim
466f3cd074jkim	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
467f3cd074jkim	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
468f3cd074jkim	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
469f3cd074jkim	@thread3=&ROUND(0,5,10,15);
470f3cd074jkim
471f3cd074jkim	foreach (@thread0) {
472f3cd074jkim		eval;			eval(shift(@thread3));
473f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread3));
474f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread3));
475f3cd074jkim	}
# Close .Loop_neon and finish ChaCha20_neon: the key block is added back to
# the scalar block and to the three NEON blocks; then, unless the
# "subs $len,$len,#256" issued before .Loop_neon borrowed, 256 bytes are
# XORed and stored (scalar block first, then the three NEON blocks), the
# counters advance by 4 and the outer loop repeats while bytes remain.
# .Ltail_neon handles a final stretch of fewer than 256 bytes: whole 64-byte
# blocks are written directly, and the keystream for a trailing partial
# block is parked on the stack and applied byte by byte at .Last_neon (or at
# .Less_than_64, a label in ChaCha20_ctr32, when fewer than 64 bytes are
# left).  The stack copy is zeroed before returning.
476f3cd074jkim$code.=<<___;
477f3cd074jkim	cbnz	$ctr,.Loop_neon
478f3cd074jkim
479f3cd074jkim	add.32	@x[0],@x[0],@d[0]		// accumulate key block
480f3cd074jkim	 add	$A0,$A0,@K[0]
481f3cd074jkim	add	@x[1],@x[1],@d[0],lsr#32
482f3cd074jkim	 add	$A1,$A1,@K[0]
483f3cd074jkim	add.32	@x[2],@x[2],@d[1]
484f3cd074jkim	 add	$A2,$A2,@K[0]
485f3cd074jkim	add	@x[3],@x[3],@d[1],lsr#32
486f3cd074jkim	 add	$C0,$C0,@K[2]
487f3cd074jkim	add.32	@x[4],@x[4],@d[2]
488f3cd074jkim	 add	$C1,$C1,@K[2]
489f3cd074jkim	add	@x[5],@x[5],@d[2],lsr#32
490f3cd074jkim	 add	$C2,$C2,@K[2]
491f3cd074jkim	add.32	@x[6],@x[6],@d[3]
492f3cd074jkim	 add	$D0,$D0,@K[3]
493f3cd074jkim	add	@x[7],@x[7],@d[3],lsr#32
494f3cd074jkim	add.32	@x[8],@x[8],@d[4]
495f3cd074jkim	 add	$D1,$D1,@K[4]
496f3cd074jkim	add	@x[9],@x[9],@d[4],lsr#32
497f3cd074jkim	add.32	@x[10],@x[10],@d[5]
498f3cd074jkim	 add	$D2,$D2,@K[5]
499f3cd074jkim	add	@x[11],@x[11],@d[5],lsr#32
500f3cd074jkim	add.32	@x[12],@x[12],@d[6]
501f3cd074jkim	 add	$B0,$B0,@K[1]
502f3cd074jkim	add	@x[13],@x[13],@d[6],lsr#32
503f3cd074jkim	add.32	@x[14],@x[14],@d[7]
504f3cd074jkim	 add	$B1,$B1,@K[1]
505f3cd074jkim	add	@x[15],@x[15],@d[7],lsr#32
506f3cd074jkim	 add	$B2,$B2,@K[1]
507f3cd074jkim
508f3cd074jkim	b.lo	.Ltail_neon
509f3cd074jkim
510f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
511f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
512f3cd074jkim	ldp	@x[1],@x[3],[$inp,#0]		// load input
513f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
514f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
515f3cd074jkim	ldp	@x[5],@x[7],[$inp,#16]
516f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
517f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
518f3cd074jkim	ldp	@x[9],@x[11],[$inp,#32]
519f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
520f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
521f3cd074jkim	ldp	@x[13],@x[15],[$inp,#48]
522f3cd074jkim	add	$inp,$inp,#64
523f3cd074jkim#ifdef	__ARMEB__
524f3cd074jkim	rev	@x[0],@x[0]
525f3cd074jkim	rev	@x[2],@x[2]
526f3cd074jkim	rev	@x[4],@x[4]
527f3cd074jkim	rev	@x[6],@x[6]
528f3cd074jkim	rev	@x[8],@x[8]
529f3cd074jkim	rev	@x[10],@x[10]
530f3cd074jkim	rev	@x[12],@x[12]
531f3cd074jkim	rev	@x[14],@x[14]
532f3cd074jkim#endif
533f3cd074jkim	ld1.8	{$T0-$T3},[$inp],#64
534f3cd074jkim	eor	@x[0],@x[0],@x[1]
535f3cd074jkim	eor	@x[2],@x[2],@x[3]
536f3cd074jkim	eor	@x[4],@x[4],@x[5]
537f3cd074jkim	eor	@x[6],@x[6],@x[7]
538f3cd074jkim	eor	@x[8],@x[8],@x[9]
539f3cd074jkim	 eor	$A0,$A0,$T0
540f3cd074jkim	eor	@x[10],@x[10],@x[11]
541f3cd074jkim	 eor	$B0,$B0,$T1
542f3cd074jkim	eor	@x[12],@x[12],@x[13]
543f3cd074jkim	 eor	$C0,$C0,$T2
544f3cd074jkim	eor	@x[14],@x[14],@x[15]
545f3cd074jkim	 eor	$D0,$D0,$T3
546f3cd074jkim	 ld1.8	{$T0-$T3},[$inp],#64
547f3cd074jkim
548f3cd074jkim	stp	@x[0],@x[2],[$out,#0]		// store output
549f3cd074jkim	 add	@d[6],@d[6],#4			// increment counter
550f3cd074jkim	stp	@x[4],@x[6],[$out,#16]
551f3cd074jkim	 add	@K[3],@K[3],$ONE		// += 4
552f3cd074jkim	stp	@x[8],@x[10],[$out,#32]
553f3cd074jkim	 add	@K[4],@K[4],$ONE
554f3cd074jkim	stp	@x[12],@x[14],[$out,#48]
555f3cd074jkim	 add	@K[5],@K[5],$ONE
556f3cd074jkim	add	$out,$out,#64
557f3cd074jkim
558f3cd074jkim	st1.8	{$A0-$D0},[$out],#64
559f3cd074jkim	ld1.8	{$A0-$D0},[$inp],#64
560f3cd074jkim
561f3cd074jkim	eor	$A1,$A1,$T0
562f3cd074jkim	eor	$B1,$B1,$T1
563f3cd074jkim	eor	$C1,$C1,$T2
564f3cd074jkim	eor	$D1,$D1,$T3
565f3cd074jkim	st1.8	{$A1-$D1},[$out],#64
566f3cd074jkim
567f3cd074jkim	eor	$A2,$A2,$A0
568f3cd074jkim	eor	$B2,$B2,$B0
569f3cd074jkim	eor	$C2,$C2,$C0
570f3cd074jkim	eor	$D2,$D2,$D0
571f3cd074jkim	st1.8	{$A2-$D2},[$out],#64
572f3cd074jkim
573f3cd074jkim	b.hi	.Loop_outer_neon
574f3cd074jkim
575f3cd074jkim	ldp	x19,x20,[x29,#16]
576f3cd074jkim	add	sp,sp,#64
577f3cd074jkim	ldp	x21,x22,[x29,#32]
578f3cd074jkim	ldp	x23,x24,[x29,#48]
579f3cd074jkim	ldp	x25,x26,[x29,#64]
580f3cd074jkim	ldp	x27,x28,[x29,#80]
581f3cd074jkim	ldp	x29,x30,[sp],#96
582cfa887fjkim	.inst	0xd50323bf			// autiasp
583f3cd074jkim	ret
584f3cd074jkim
585f3cd074jkim.Ltail_neon:
586f3cd074jkim	add	$len,$len,#256
587f3cd074jkim	cmp	$len,#64
588f3cd074jkim	b.lo	.Less_than_64
589f3cd074jkim
590f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
591f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
592f3cd074jkim	ldp	@x[1],@x[3],[$inp,#0]		// load input
593f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
594f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
595f3cd074jkim	ldp	@x[5],@x[7],[$inp,#16]
596f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
597f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
598f3cd074jkim	ldp	@x[9],@x[11],[$inp,#32]
599f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
600f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
601f3cd074jkim	ldp	@x[13],@x[15],[$inp,#48]
602f3cd074jkim	add	$inp,$inp,#64
603f3cd074jkim#ifdef	__ARMEB__
604f3cd074jkim	rev	@x[0],@x[0]
605f3cd074jkim	rev	@x[2],@x[2]
606f3cd074jkim	rev	@x[4],@x[4]
607f3cd074jkim	rev	@x[6],@x[6]
608f3cd074jkim	rev	@x[8],@x[8]
609f3cd074jkim	rev	@x[10],@x[10]
610f3cd074jkim	rev	@x[12],@x[12]
611f3cd074jkim	rev	@x[14],@x[14]
612f3cd074jkim#endif
613f3cd074jkim	eor	@x[0],@x[0],@x[1]
614f3cd074jkim	eor	@x[2],@x[2],@x[3]
615f3cd074jkim	eor	@x[4],@x[4],@x[5]
616f3cd074jkim	eor	@x[6],@x[6],@x[7]
617f3cd074jkim	eor	@x[8],@x[8],@x[9]
618f3cd074jkim	eor	@x[10],@x[10],@x[11]
619f3cd074jkim	eor	@x[12],@x[12],@x[13]
620f3cd074jkim	eor	@x[14],@x[14],@x[15]
621f3cd074jkim
622f3cd074jkim	stp	@x[0],@x[2],[$out,#0]		// store output
623f3cd074jkim	 add	@d[6],@d[6],#4			// increment counter
624f3cd074jkim	stp	@x[4],@x[6],[$out,#16]
625f3cd074jkim	stp	@x[8],@x[10],[$out,#32]
626f3cd074jkim	stp	@x[12],@x[14],[$out,#48]
627f3cd074jkim	add	$out,$out,#64
628f3cd074jkim	b.eq	.Ldone_neon
629f3cd074jkim	sub	$len,$len,#64
630f3cd074jkim	cmp	$len,#64
631f3cd074jkim	b.lo	.Less_than_128
632f3cd074jkim
633f3cd074jkim	ld1.8	{$T0-$T3},[$inp],#64
634f3cd074jkim	eor	$A0,$A0,$T0
635f3cd074jkim	eor	$B0,$B0,$T1
636f3cd074jkim	eor	$C0,$C0,$T2
637f3cd074jkim	eor	$D0,$D0,$T3
638f3cd074jkim	st1.8	{$A0-$D0},[$out],#64
639f3cd074jkim	b.eq	.Ldone_neon
640f3cd074jkim	sub	$len,$len,#64
641f3cd074jkim	cmp	$len,#64
642f3cd074jkim	b.lo	.Less_than_192
643f3cd074jkim
644f3cd074jkim	ld1.8	{$T0-$T3},[$inp],#64
645f3cd074jkim	eor	$A1,$A1,$T0
646f3cd074jkim	eor	$B1,$B1,$T1
647f3cd074jkim	eor	$C1,$C1,$T2
648f3cd074jkim	eor	$D1,$D1,$T3
649f3cd074jkim	st1.8	{$A1-$D1},[$out],#64
650f3cd074jkim	b.eq	.Ldone_neon
651f3cd074jkim	sub	$len,$len,#64
652f3cd074jkim
653f3cd074jkim	st1.8	{$A2-$D2},[sp]
654f3cd074jkim	b	.Last_neon
655f3cd074jkim
656f3cd074jkim.Less_than_128:
657f3cd074jkim	st1.8	{$A0-$D0},[sp]
658f3cd074jkim	b	.Last_neon
659f3cd074jkim.Less_than_192:
660f3cd074jkim	st1.8	{$A1-$D1},[sp]
661f3cd074jkim	b	.Last_neon
662f3cd074jkim
663f3cd074jkim.align	4
664f3cd074jkim.Last_neon:
665f3cd074jkim	sub	$out,$out,#1
666f3cd074jkim	add	$inp,$inp,$len
667f3cd074jkim	add	$out,$out,$len
668f3cd074jkim	add	$ctr,sp,$len
669f3cd074jkim	neg	$len,$len
670f3cd074jkim
671f3cd074jkim.Loop_tail_neon:
672f3cd074jkim	ldrb	w10,[$inp,$len]
673f3cd074jkim	ldrb	w11,[$ctr,$len]
674f3cd074jkim	add	$len,$len,#1
675f3cd074jkim	eor	w10,w10,w11
676f3cd074jkim	strb	w10,[$out,$len]
677f3cd074jkim	cbnz	$len,.Loop_tail_neon
678f3cd074jkim
679f3cd074jkim	stp	xzr,xzr,[sp,#0]
680f3cd074jkim	stp	xzr,xzr,[sp,#16]
681f3cd074jkim	stp	xzr,xzr,[sp,#32]
682f3cd074jkim	stp	xzr,xzr,[sp,#48]
683f3cd074jkim
684f3cd074jkim.Ldone_neon:
685f3cd074jkim	ldp	x19,x20,[x29,#16]
686f3cd074jkim	add	sp,sp,#64
687f3cd074jkim	ldp	x21,x22,[x29,#32]
688f3cd074jkim	ldp	x23,x24,[x29,#48]
689f3cd074jkim	ldp	x25,x26,[x29,#64]
690f3cd074jkim	ldp	x27,x28,[x29,#80]
691f3cd074jkim	ldp	x29,x30,[sp],#96
692cfa887fjkim	.inst	0xd50323bf			// autiasp
693f3cd074jkim	ret
694f3cd074jkim.size	ChaCha20_neon,.-ChaCha20_neon
695f3cd074jkim___
696f3cd074jkim{
# Register names for the 512-byte path.  The first six @K registers are
# given the names $T0-$T5 in this scope, and v0-v23 carry six sets of state
# rows (A,B,C,D).
my ($T0,$T1,$T2,$T3,$T4,$T5) = @K[0 .. 5];
my ($A0,$B0,$C0,$D0,
    $A1,$B1,$C1,$D1,
    $A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,
    $A4,$B4,$C4,$D4,
    $A5,$B5,$C5,$D5) = map { sprintf("v%d.4s", $_) } 0 .. 23;
700f3cd074jkim
# Append the start of ChaCha20_512_neon to $code.  ChaCha20_neon also
# branches to .L512_or_more_neon, just past this routine's own prologue.
# The frame grows by 128+64 bytes: the key block rows are off-loaded to the
# bottom of it and d8-d15 are saved at sp+128.  Each pass of
# .Loop_outer_512_neon sets up six NEON blocks (counter rows taken from
# @K[3..6], plus $D0+4 and $D1+4) alongside the block in the general-purpose
# registers.  The round counter is 5 here rather than 10, and the heredoc
# stops just inside .Loop_upper_neon; the round instructions follow.
701f3cd074jkim$code.=<<___;
702f3cd074jkim.type	ChaCha20_512_neon,%function
703f3cd074jkim.align	5
704f3cd074jkimChaCha20_512_neon:
705cfa887fjkim	.inst	0xd503233f			// paciasp
706f3cd074jkim	stp	x29,x30,[sp,#-96]!
707f3cd074jkim	add	x29,sp,#0
708f3cd074jkim
709f3cd074jkim	adr	@x[0],.Lsigma
710f3cd074jkim	stp	x19,x20,[sp,#16]
711f3cd074jkim	stp	x21,x22,[sp,#32]
712f3cd074jkim	stp	x23,x24,[sp,#48]
713f3cd074jkim	stp	x25,x26,[sp,#64]
714f3cd074jkim	stp	x27,x28,[sp,#80]
715f3cd074jkim
716f3cd074jkim.L512_or_more_neon:
717f3cd074jkim	sub	sp,sp,#128+64
718f3cd074jkim
719f3cd074jkim	ldp	@d[0],@d[1],[@x[0]]		// load sigma
720f3cd074jkim	ld1	{@K[0]},[@x[0]],#16
721f3cd074jkim	ldp	@d[2],@d[3],[$key]		// load key
722f3cd074jkim	ldp	@d[4],@d[5],[$key,#16]
723f3cd074jkim	ld1	{@K[1],@K[2]},[$key]
724f3cd074jkim	ldp	@d[6],@d[7],[$ctr]		// load counter
725f3cd074jkim	ld1	{@K[3]},[$ctr]
726f3cd074jkim	ld1	{$ONE},[@x[0]]
727f3cd074jkim#ifdef	__ARMEB__
728f3cd074jkim	rev64	@K[0],@K[0]
729f3cd074jkim	ror	@d[2],@d[2],#32
730f3cd074jkim	ror	@d[3],@d[3],#32
731f3cd074jkim	ror	@d[4],@d[4],#32
732f3cd074jkim	ror	@d[5],@d[5],#32
733f3cd074jkim	ror	@d[6],@d[6],#32
734f3cd074jkim	ror	@d[7],@d[7],#32
735f3cd074jkim#endif
736f3cd074jkim	add	@K[3],@K[3],$ONE		// += 1
737f3cd074jkim	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
738f3cd074jkim	add	@K[3],@K[3],$ONE		// not typo
739f3cd074jkim	str	@K[2],[sp,#32]
740f3cd074jkim	add	@K[4],@K[3],$ONE
741f3cd074jkim	add	@K[5],@K[4],$ONE
742f3cd074jkim	add	@K[6],@K[5],$ONE
743f3cd074jkim	shl	$ONE,$ONE,#2			// 1 -> 4
744f3cd074jkim
745f3cd074jkim	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
746f3cd074jkim	stp	d10,d11,[sp,#128+16]
747f3cd074jkim	stp	d12,d13,[sp,#128+32]
748f3cd074jkim	stp	d14,d15,[sp,#128+48]
749f3cd074jkim
750f3cd074jkim	sub	$len,$len,#512			// not typo
751f3cd074jkim
752f3cd074jkim.Loop_outer_512_neon:
753f3cd074jkim	 mov	$A0,@K[0]
754f3cd074jkim	 mov	$A1,@K[0]
755f3cd074jkim	 mov	$A2,@K[0]
756f3cd074jkim	 mov	$A3,@K[0]
757f3cd074jkim	 mov	$A4,@K[0]
758f3cd074jkim	 mov	$A5,@K[0]
759f3cd074jkim	 mov	$B0,@K[1]
760f3cd074jkim	mov.32	@x[0],@d[0]			// unpack key block
761f3cd074jkim	 mov	$B1,@K[1]
762f3cd074jkim	lsr	@x[1],@d[0],#32
763f3cd074jkim	 mov	$B2,@K[1]
764f3cd074jkim	mov.32	@x[2],@d[1]
765f3cd074jkim	 mov	$B3,@K[1]
766f3cd074jkim	lsr	@x[3],@d[1],#32
767f3cd074jkim	 mov	$B4,@K[1]
768f3cd074jkim	mov.32	@x[4],@d[2]
769f3cd074jkim	 mov	$B5,@K[1]
770f3cd074jkim	lsr	@x[5],@d[2],#32
771f3cd074jkim	 mov	$D0,@K[3]
772f3cd074jkim	mov.32	@x[6],@d[3]
773f3cd074jkim	 mov	$D1,@K[4]
774f3cd074jkim	lsr	@x[7],@d[3],#32
775f3cd074jkim	 mov	$D2,@K[5]
776f3cd074jkim	mov.32	@x[8],@d[4]
777f3cd074jkim	 mov	$D3,@K[6]
778f3cd074jkim	lsr	@x[9],@d[4],#32
779f3cd074jkim	 mov	$C0,@K[2]
780f3cd074jkim	mov.32	@x[10],@d[5]
781f3cd074jkim	 mov	$C1,@K[2]
782f3cd074jkim	lsr	@x[11],@d[5],#32
783f3cd074jkim	 add	$D4,$D0,$ONE			// +4
784f3cd074jkim	mov.32	@x[12],@d[6]
785f3cd074jkim	 add	$D5,$D1,$ONE			// +4
786f3cd074jkim	lsr	@x[13],@d[6],#32
787f3cd074jkim	 mov	$C2,@K[2]
788f3cd074jkim	mov.32	@x[14],@d[7]
789f3cd074jkim	 mov	$C3,@K[2]
790f3cd074jkim	lsr	@x[15],@d[7],#32
791f3cd074jkim	 mov	$C4,@K[2]
792f3cd074jkim	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
793f3cd074jkim	 mov	$C5,@K[2]
794f3cd074jkim	 str	@K[5],[sp,#80]
795f3cd074jkim
796f3cd074jkim	mov	$ctr,#5
797f3cd074jkim	subs	$len,$len,#512
798f3cd074jkim.Loop_upper_neon:
799f3cd074jkim	sub	$ctr,$ctr,#1
800f3cd074jkim___
801f3cd074jkim	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
802f3cd074jkim	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
803f3cd074jkim	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
804f3cd074jkim	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
805f3cd074jkim	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
806f3cd074jkim	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
807f3cd074jkim	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
808f3cd074jkim	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
809f3cd074jkim	my $i = 0;
810f3cd074jkim
811f3cd074jkim	foreach (@thread0) {
812f3cd074jkim		eval;			eval(shift(@thread67));
813f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread67));
814f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread67));
815f3cd074jkim		eval(shift(@thread3));	eval(shift(@thread67));
816f3cd074jkim		eval(shift(@thread4));	eval(shift(@thread67));
817f3cd074jkim		eval(shift(@thread5));	eval(shift(@thread67));
818f3cd074jkim	}
819f3cd074jkim
820f3cd074jkim	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
821f3cd074jkim	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
822f3cd074jkim	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
823f3cd074jkim	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
824f3cd074jkim	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
825f3cd074jkim	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
826f3cd074jkim	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
827f3cd074jkim
828f3cd074jkim	foreach (@thread0) {
829f3cd074jkim		eval;			eval(shift(@thread67));
830f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread67));
831f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread67));
832f3cd074jkim		eval(shift(@thread3));	eval(shift(@thread67));
833f3cd074jkim		eval(shift(@thread4));	eval(shift(@thread67));
834f3cd074jkim		eval(shift(@thread5));	eval(shift(@thread67));
835f3cd074jkim	}
836f3cd074jkim$code.=<<___;
837f3cd074jkim	cbnz	$ctr,.Loop_upper_neon
838f3cd074jkim
839f3cd074jkim	add.32	@x[0],@x[0],@d[0]		// accumulate key block
840f3cd074jkim	add	@x[1],@x[1],@d[0],lsr#32
841f3cd074jkim	add.32	@x[2],@x[2],@d[1]
842f3cd074jkim	add	@x[3],@x[3],@d[1],lsr#32
843f3cd074jkim	add.32	@x[4],@x[4],@d[2]
844f3cd074jkim	add	@x[5],@x[5],@d[2],lsr#32
845f3cd074jkim	add.32	@x[6],@x[6],@d[3]
846f3cd074jkim	add	@x[7],@x[7],@d[3],lsr#32
847f3cd074jkim	add.32	@x[8],@x[8],@d[4]
848f3cd074jkim	add	@x[9],@x[9],@d[4],lsr#32
849f3cd074jkim	add.32	@x[10],@x[10],@d[5]
850f3cd074jkim	add	@x[11],@x[11],@d[5],lsr#32
851f3cd074jkim	add.32	@x[12],@x[12],@d[6]
852f3cd074jkim	add	@x[13],@x[13],@d[6],lsr#32
853f3cd074jkim	add.32	@x[14],@x[14],@d[7]
854f3cd074jkim	add	@x[15],@x[15],@d[7],lsr#32
855f3cd074jkim
856f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
857f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
858f3cd074jkim	ldp	@x[1],@x[3],[$inp,#0]		// load input
859f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
860f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
861f3cd074jkim	ldp	@x[5],@x[7],[$inp,#16]
862f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
863f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
864f3cd074jkim	ldp	@x[9],@x[11],[$inp,#32]
865f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
866f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
867f3cd074jkim	ldp	@x[13],@x[15],[$inp,#48]
868f3cd074jkim	add	$inp,$inp,#64
869f3cd074jkim#ifdef	__ARMEB__
870f3cd074jkim	rev	@x[0],@x[0]
871f3cd074jkim	rev	@x[2],@x[2]
872f3cd074jkim	rev	@x[4],@x[4]
873f3cd074jkim	rev	@x[6],@x[6]
874f3cd074jkim	rev	@x[8],@x[8]
875f3cd074jkim	rev	@x[10],@x[10]
876f3cd074jkim	rev	@x[12],@x[12]
877f3cd074jkim	rev	@x[14],@x[14]
878f3cd074jkim#endif
879f3cd074jkim	eor	@x[0],@x[0],@x[1]
880f3cd074jkim	eor	@x[2],@x[2],@x[3]
881f3cd074jkim	eor	@x[4],@x[4],@x[5]
882f3cd074jkim	eor	@x[6],@x[6],@x[7]
883f3cd074jkim	eor	@x[8],@x[8],@x[9]
884f3cd074jkim	eor	@x[10],@x[10],@x[11]
885f3cd074jkim	eor	@x[12],@x[12],@x[13]
886f3cd074jkim	eor	@x[14],@x[14],@x[15]
887f3cd074jkim
888f3cd074jkim	 stp	@x[0],@x[2],[$out,#0]		// store output
889f3cd074jkim	 add	@d[6],@d[6],#1			// increment counter
890f3cd074jkim	mov.32	@x[0],@d[0]			// unpack key block
891f3cd074jkim	lsr	@x[1],@d[0],#32
892f3cd074jkim	 stp	@x[4],@x[6],[$out,#16]
893f3cd074jkim	mov.32	@x[2],@d[1]
894f3cd074jkim	lsr	@x[3],@d[1],#32
895f3cd074jkim	 stp	@x[8],@x[10],[$out,#32]
896f3cd074jkim	mov.32	@x[4],@d[2]
897f3cd074jkim	lsr	@x[5],@d[2],#32
898f3cd074jkim	 stp	@x[12],@x[14],[$out,#48]
899f3cd074jkim	 add	$out,$out,#64
900f3cd074jkim	mov.32	@x[6],@d[3]
901f3cd074jkim	lsr	@x[7],@d[3],#32
902f3cd074jkim	mov.32	@x[8],@d[4]
903f3cd074jkim	lsr	@x[9],@d[4],#32
904f3cd074jkim	mov.32	@x[10],@d[5]
905f3cd074jkim	lsr	@x[11],@d[5],#32
906f3cd074jkim	mov.32	@x[12],@d[6]
907f3cd074jkim	lsr	@x[13],@d[6],#32
908f3cd074jkim	mov.32	@x[14],@d[7]
909f3cd074jkim	lsr	@x[15],@d[7],#32
910f3cd074jkim
911f3cd074jkim	mov	$ctr,#5
912f3cd074jkim.Loop_lower_neon:
913f3cd074jkim	sub	$ctr,$ctr,#1
914f3cd074jkim___
915f3cd074jkim	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
916f3cd074jkim	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
917f3cd074jkim	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
918f3cd074jkim	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
919f3cd074jkim	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
920f3cd074jkim	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
921f3cd074jkim	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
922f3cd074jkim
923f3cd074jkim	foreach (@thread0) {
924f3cd074jkim		eval;			eval(shift(@thread67));
925f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread67));
926f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread67));
927f3cd074jkim		eval(shift(@thread3));	eval(shift(@thread67));
928f3cd074jkim		eval(shift(@thread4));	eval(shift(@thread67));
929f3cd074jkim		eval(shift(@thread5));	eval(shift(@thread67));
930f3cd074jkim	}
931f3cd074jkim
932f3cd074jkim	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
933f3cd074jkim	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
934f3cd074jkim	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
935f3cd074jkim	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
936f3cd074jkim	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
937f3cd074jkim	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
938f3cd074jkim	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
939f3cd074jkim
940f3cd074jkim	foreach (@thread0) {
941f3cd074jkim		eval;			eval(shift(@thread67));
942f3cd074jkim		eval(shift(@thread1));	eval(shift(@thread67));
943f3cd074jkim		eval(shift(@thread2));	eval(shift(@thread67));
944f3cd074jkim		eval(shift(@thread3));	eval(shift(@thread67));
945f3cd074jkim		eval(shift(@thread4));	eval(shift(@thread67));
946f3cd074jkim		eval(shift(@thread5));	eval(shift(@thread67));
947f3cd074jkim	}
948f3cd074jkim$code.=<<___;
949f3cd074jkim	cbnz	$ctr,.Loop_lower_neon
950f3cd074jkim
951f3cd074jkim	add.32	@x[0],@x[0],@d[0]		// accumulate key block
952f3cd074jkim	 ldp	@K[0],@K[1],[sp,#0]
953f3cd074jkim	add	@x[1],@x[1],@d[0],lsr#32
954f3cd074jkim	 ldp	@K[2],@K[3],[sp,#32]
955f3cd074jkim	add.32	@x[2],@x[2],@d[1]
956f3cd074jkim	 ldp	@K[4],@K[5],[sp,#64]
957f3cd074jkim	add	@x[3],@x[3],@d[1],lsr#32
958f3cd074jkim	 add	$A0,$A0,@K[0]
959f3cd074jkim	add.32	@x[4],@x[4],@d[2]
960f3cd074jkim	 add	$A1,$A1,@K[0]
961f3cd074jkim	add	@x[5],@x[5],@d[2],lsr#32
962f3cd074jkim	 add	$A2,$A2,@K[0]
963f3cd074jkim	add.32	@x[6],@x[6],@d[3]
964f3cd074jkim	 add	$A3,$A3,@K[0]
965f3cd074jkim	add	@x[7],@x[7],@d[3],lsr#32
966f3cd074jkim	 add	$A4,$A4,@K[0]
967f3cd074jkim	add.32	@x[8],@x[8],@d[4]
968f3cd074jkim	 add	$A5,$A5,@K[0]
969f3cd074jkim	add	@x[9],@x[9],@d[4],lsr#32
970f3cd074jkim	 add	$C0,$C0,@K[2]
971f3cd074jkim	add.32	@x[10],@x[10],@d[5]
972f3cd074jkim	 add	$C1,$C1,@K[2]
973f3cd074jkim	add	@x[11],@x[11],@d[5],lsr#32
974f3cd074jkim	 add	$C2,$C2,@K[2]
975f3cd074jkim	add.32	@x[12],@x[12],@d[6]
976f3cd074jkim	 add	$C3,$C3,@K[2]
977f3cd074jkim	add	@x[13],@x[13],@d[6],lsr#32
978f3cd074jkim	 add	$C4,$C4,@K[2]
979f3cd074jkim	add.32	@x[14],@x[14],@d[7]
980f3cd074jkim	 add	$C5,$C5,@K[2]
981f3cd074jkim	add	@x[15],@x[15],@d[7],lsr#32
982f3cd074jkim	 add	$D4,$D4,$ONE			// +4
983f3cd074jkim	add	@x[0],@x[0],@x[1],lsl#32	// pack
984f3cd074jkim	 add	$D5,$D5,$ONE			// +4
985f3cd074jkim	add	@x[2],@x[2],@x[3],lsl#32
986f3cd074jkim	 add	$D0,$D0,@K[3]
987f3cd074jkim	ldp	@x[1],@x[3],[$inp,#0]		// load input
988f3cd074jkim	 add	$D1,$D1,@K[4]
989f3cd074jkim	add	@x[4],@x[4],@x[5],lsl#32
990f3cd074jkim	 add	$D2,$D2,@K[5]
991f3cd074jkim	add	@x[6],@x[6],@x[7],lsl#32
992f3cd074jkim	 add	$D3,$D3,@K[6]
993f3cd074jkim	ldp	@x[5],@x[7],[$inp,#16]
994f3cd074jkim	 add	$D4,$D4,@K[3]
995f3cd074jkim	add	@x[8],@x[8],@x[9],lsl#32
996f3cd074jkim	 add	$D5,$D5,@K[4]
997f3cd074jkim	add	@x[10],@x[10],@x[11],lsl#32
998f3cd074jkim	 add	$B0,$B0,@K[1]
999f3cd074jkim	ldp	@x[9],@x[11],[$inp,#32]
1000f3cd074jkim	 add	$B1,$B1,@K[1]
1001f3cd074jkim	add	@x[12],@x[12],@x[13],lsl#32
1002f3cd074jkim	 add	$B2,$B2,@K[1]
1003f3cd074jkim	add	@x[14],@x[14],@x[15],lsl#32
1004f3cd074jkim	 add	$B3,$B3,@K[1]
1005f3cd074jkim	ldp	@x[13],@x[15],[$inp,#48]
1006f3cd074jkim	 add	$B4,$B4,@K[1]
1007f3cd074jkim	add	$inp,$inp,#64
1008f3cd074jkim	 add	$B5,$B5,@K[1]
1009f3cd074jkim
1010f3cd074jkim#ifdef	__ARMEB__
1011f3cd074jkim	rev	@x[0],@x[0]
1012f3cd074jkim	rev	@x[2],@x[2]
1013f3cd074jkim	rev	@x[4],@x[4]
1014f3cd074jkim	rev	@x[6],@x[6]
1015f3cd074jkim	rev	@x[8],@x[8]
1016f3cd074jkim	rev	@x[10],@x[10]
1017f3cd074jkim	rev	@x[12],@x[12]
1018f3cd074jkim	rev	@x[14],@x[14]
1019f3cd074jkim#endif
1020f3cd074jkim	ld1.8	{$T0-$T3},[$inp],#64
1021f3cd074jkim	eor	@x[0],@x[0],@x[1]
1022f3cd074jkim	eor	@x[2],@x[2],@x[3]
1023f3cd074jkim	eor	@x[4],@x[4],@x[5]
1024f3cd074jkim	eor	@x[6],@x[6],@x[7]
1025f3cd074jkim	eor	@x[8],@x[8],@x[9]
1026f3cd074jkim	 eor	$A0,$A0,$T0
1027f3cd074jkim	eor	@x[10],@x[10],@x[11]
1028f3cd074jkim	 eor	$B0,$B0,$T1
1029f3cd074jkim	eor	@x[12],@x[12],@x[13]
1030f3cd074jkim	 eor	$C0,$C0,$T2
1031f3cd074jkim	eor	@x[14],@x[14],@x[15]
1032f3cd074jkim	 eor	$D0,$D0,$T3
1033f3cd074jkim	 ld1.8	{$T0-$T3},[$inp],#64
1034f3cd074jkim
1035f3cd074jkim	stp	@x[0],@x[2],[$out,#0]		// store output
1036f3cd074jkim	 add	@d[6],@d[6],#7			// increment counter
1037f3cd074jkim	stp	@x[4],@x[6],[$out,#16]
1038f3cd074jkim	stp	@x[8],@x[10],[$out,#32]
1039f3cd074jkim	stp	@x[12],@x[14],[$out,#48]
1040f3cd074jkim	add	$out,$out,#64
1041f3cd074jkim	st1.8	{$A0-$D0},[$out],#64
1042f3cd074jkim
1043f3cd074jkim	ld1.8	{$A0-$D0},[$inp],#64
1044f3cd074jkim	eor	$A1,$A1,$T0
1045f3cd074jkim	eor	$B1,$B1,$T1
1046f3cd074jkim	eor	$C1,$C1,$T2
1047f3cd074jkim	eor	$D1,$D1,$T3
1048f3cd074jkim	st1.8	{$A1-$D1},[$out],#64
1049f3cd074jkim
1050f3cd074jkim	ld1.8	{$A1-$D1},[$inp],#64
1051f3cd074jkim	eor	$A2,$A2,$A0
1052f3cd074jkim	 ldp	@K[0],@K[1],[sp,#0]
1053f3cd074jkim	eor	$B2,$B2,$B0
1054f3cd074jkim	 ldp	@K[2],@K[3],[sp,#32]
1055f3cd074jkim	eor	$C2,$C2,$C0
1056f3cd074jkim	eor	$D2,$D2,$D0
1057f3cd074jkim	st1.8	{$A2-$D2},[$out],#64
1058f3cd074jkim
1059f3cd074jkim	ld1.8	{$A2-$D2},[$inp],#64
1060f3cd074jkim	eor	$A3,$A3,$A1
1061f3cd074jkim	eor	$B3,$B3,$B1
1062f3cd074jkim	eor	$C3,$C3,$C1
1063f3cd074jkim	eor	$D3,$D3,$D1
1064f3cd074jkim	st1.8	{$A3-$D3},[$out],#64
1065f3cd074jkim
1066f3cd074jkim	ld1.8	{$A3-$D3},[$inp],#64
1067f3cd074jkim	eor	$A4,$A4,$A2
1068f3cd074jkim	eor	$B4,$B4,$B2
1069f3cd074jkim	eor	$C4,$C4,$C2
1070f3cd074jkim	eor	$D4,$D4,$D2
1071f3cd074jkim	st1.8	{$A4-$D4},[$out],#64
1072f3cd074jkim
1073f3cd074jkim	shl	$A0,$ONE,#1			// 4 -> 8
1074f3cd074jkim	eor	$A5,$A5,$A3
1075f3cd074jkim	eor	$B5,$B5,$B3
1076f3cd074jkim	eor	$C5,$C5,$C3
1077f3cd074jkim	eor	$D5,$D5,$D3
1078f3cd074jkim	st1.8	{$A5-$D5},[$out],#64
1079f3cd074jkim
1080f3cd074jkim	add	@K[3],@K[3],$A0			// += 8
1081f3cd074jkim	add	@K[4],@K[4],$A0
1082f3cd074jkim	add	@K[5],@K[5],$A0
1083f3cd074jkim	add	@K[6],@K[6],$A0
1084f3cd074jkim
1085f3cd074jkim	b.hs	.Loop_outer_512_neon
1086f3cd074jkim
1087f3cd074jkim	adds	$len,$len,#512
1088f3cd074jkim	ushr	$A0,$ONE,#2			// 4 -> 1
1089f3cd074jkim
1090f3cd074jkim	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1091f3cd074jkim	ldp	d10,d11,[sp,#128+16]
1092f3cd074jkim	ldp	d12,d13,[sp,#128+32]
1093f3cd074jkim	ldp	d14,d15,[sp,#128+48]
1094f3cd074jkim
1095f3cd074jkim	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
1096f3cd074jkim	stp	@K[0],$ONE,[sp,#32]
1097f3cd074jkim	stp	@K[0],$ONE,[sp,#64]
1098f3cd074jkim
1099f3cd074jkim	b.eq	.Ldone_512_neon
1100f3cd074jkim
1101f3cd074jkim	cmp	$len,#192
1102f3cd074jkim	sub	@K[3],@K[3],$A0			// -= 1
1103f3cd074jkim	sub	@K[4],@K[4],$A0
1104f3cd074jkim	sub	@K[5],@K[5],$A0
1105f3cd074jkim	add	sp,sp,#128
1106f3cd074jkim	b.hs	.Loop_outer_neon
1107f3cd074jkim
1108f3cd074jkim	eor	@K[1],@K[1],@K[1]
1109f3cd074jkim	eor	@K[2],@K[2],@K[2]
1110f3cd074jkim	eor	@K[3],@K[3],@K[3]
1111f3cd074jkim	eor	@K[4],@K[4],@K[4]
1112f3cd074jkim	eor	@K[5],@K[5],@K[5]
1113f3cd074jkim	eor	@K[6],@K[6],@K[6]
1114f3cd074jkim	b	.Loop_outer
1115f3cd074jkim
1116f3cd074jkim.Ldone_512_neon:
1117f3cd074jkim	ldp	x19,x20,[x29,#16]
1118f3cd074jkim	add	sp,sp,#128+64
1119f3cd074jkim	ldp	x21,x22,[x29,#32]
1120f3cd074jkim	ldp	x23,x24,[x29,#48]
1121f3cd074jkim	ldp	x25,x26,[x29,#64]
1122f3cd074jkim	ldp	x27,x28,[x29,#80]
1123f3cd074jkim	ldp	x29,x30,[sp],#96
1124cfa887fjkim	.inst	0xd50323bf			// autiasp
1125f3cd074jkim	ret
1126f3cd074jkim.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1127f3cd074jkim___
1128f3cd074jkim}
1129f3cd074jkim}}}
1130f3cd074jkim
# Post-process the assembly text accumulated in $code, one line at a time,
# and print every resulting line to STDOUT.
#
# For each line:
#   1. Every backtick-quoted span is replaced by the value obtained from
#      evaluating its contents as Perl (the /e replacement runs eval on the
#      captured text).
#   2. The five parenthesised rules below are joined with low-precedence or,
#      so they are tried top to bottom and evaluation stops at the first
#      rule whose leading test succeeds.  The trailing (... or 1) inside each
#      rule keeps that rule true even when its follow-up substitution finds
#      nothing to change, so the remaining rules are still skipped.
#        - <mnemonic>.32    : drop the .32 suffix, rename every x<N> on the
#                             line to w<N>
#        - eor / ext / mov  : rewrite every .4s suffix to .16b
#        - ld1.8 / st1.8    : drop the .8 suffix, rewrite every .4s to .16b
#        - ldr/ldp/str/stp  : rewrite every v<N>.4s operand to q<N>
#        - rev32.16         : drop the .16 suffix, rewrite every .4s to .8h
#      Because the .32 rule comes first, a line such as mov.32 is completed
#      by that rule and never reaches the eor/ext/mov rule.
1131f3cd074jkimforeach (split("\n",$code)) {
1132f3cd074jkim	s/\`([^\`]*)\`/eval $1/geo;
1133f3cd074jkim
1134f3cd074jkim	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
1135f3cd074jkim	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
1136f3cd074jkim	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
1137f3cd074jkim	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
1138f3cd074jkim	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
1139f3cd074jkim
	# Disabled rule, kept for reference: it would turn q<N>#lo into d<2N>
	# and q<N>#hi into d<2N+1>.
1140f3cd074jkim	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1141f3cd074jkim
1142f3cd074jkim	print $_,"\n";
1143f3cd074jkim}
# Close STDOUT explicitly so buffered output is flushed and a failure while
# closing aborts the script with a diagnostic instead of passing silently.
# NOTE(review): STDOUT is presumably redirected earlier in the file (the
# header locates the arm-xlate.pl translator) -- confirm outside this chunk.
11441661cedjkimclose STDOUT or die "error closing STDOUT: $!";	# flush
1145