#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes less sense to implement the
# "528B" variant on Itanium 2 for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5. This
# would mean that theoretically the performance improvement couldn't be
# more than 20%. But occasionally you prove yourself wrong:-) I figured
# out a way to fold a couple of instructions and having freed yet
# another instruction slot by unrolling the loop... Resulting
# performance is 4.45 cycles per processed byte and 50% better than the
# "256B" version. On original Itanium performance should remain the
# same as the "256B" version, i.e. ~8.5 cycles.

# The last command-line argument, if present, names the output file;
# STDOUT is redirected to it so the final "print $code" lands there.
# Remaining arguments are treated as compiler flags and scanned below.
$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");

# lp64_requested(FLAGS...)
#
# Returns true if any of FLAGS contains the literal "+DD64" or "-mlp64".
# The previous pattern used a character class, /[\+DD|\-mlp]64/, which
# matches any single one of "+D|-mlp" followed by "64" and therefore
# also fired on unrelated flags such as "-m64" or "-DFOO_D64".
sub lp64_requested { return scalar(grep { /(?:\+DD|\-mlp)64/ } @_); }

# $ADDP is the mnemonic used to form addresses from pointer arguments:
# "addp4" on HP-UX unless a 64-bit flag was given, plain "add" elsewhere.
if ($^O eq "hpux") {
    $ADDP="addp4";
    $ADDP="add" if (lp64_requested(@ARGV));
} else { $ADDP="add"; }

# Endianness: explicit -DB_ENDIAN/-DL_ENDIAN flags win (the later one in
# the argument list takes precedence); otherwise probe the host on which
# this script runs.
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

# loop(LABEL, MASK_INP)
#
# Appends to $code the body of the "4-bit" multiplication loop, labelled
# LABEL and closed by "br.ctop.sptk LABEL". The body is unrolled twice
# and predicated on the rotating predicates p16..p19; the caller is
# expected to have set up ar.lc, ar.ec, pr.rot and the .rotr register
# names used here (in[], xi[], Hi[]).
#
# If MASK_INP is true, the two instructions that reference the input
# stream (the "ld1 in[0]=[inp]" load and the "xor xi[1]=xi[1],in[1]")
# are predicated on p63 instead of p16/p17, which is how the
# multiplication-only caller masks them out.
#
# The sub takes arguments, so it is declared without the former empty
# "()" prototype; existing "&loop(...)" call sites are unaffected.
sub loop {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

# gcm_gmult_4bit: single multiplication, "4-bit" table variant.
#
# This assignment (re)starts $code. It emits the register aliases shared
# with loop(), then the procedure itself: in0 is used as the Xi pointer
# and in1 as the Htbl pointer (both formed with $ADDP). The prologue
# saves ar.pfs, ar.lc and pr, locates rem_4bit relative to ip, primes
# the first table index from Xi[15], and programs ar.lc=13 / ar.ec=3 for
# the software-pipelined loop generated by loop() with input references
# masked out. After the loop the accumulated Zlo/Zhi are passed through
# mux1 \@rev (rewritten to a nop on big-endian targets by the
# post-processing at the end of this script) and stored back through
# addresses derived from Xi; pr and ar.lc are restored before returning.
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Symbolic register names interpolated into the gcm_ghash_4bit code
# below: the four input argument registers and two locals.
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# Mnemonics used for the user-mask manipulation that brackets the
# routine ("go big-endian" / "return to little-endian"); on big-endian
# targets both degrade to nop.m.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(INSN, ...)
#
# Appends eight iterations to $code; each loads one 16-byte Htable entry
# into a general register pair through r8/r9 and one into a
# floating-point register pair through r10/r11 (register numbers are
# left as `...` expressions that the final post-processing pass
# evaluates).
#
# The optional INSN strings are extra instructions woven into the spare
# slot of the second bundle, one per iteration, arranged so that the
# last argument lands in the last iteration: with N arguments the
# condition below first becomes true at iteration 8-N, and because each
# hit shifts one argument off @_ it stays true for every later one.
#
# Declared without the former empty "()" prototype since it takes
# arguments; existing "&load_htable(...)" call sites are unaffected.
sub load_htable {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

# gcm_ghash_4bit, part 1: prologue and on-stack table construction.
#
# Emits the procedure header, points r8..r11 at the caller's Htable
# ($Htbl, $Htbl+128) and loads all 16 entries via load_htable(). The
# extra instructions passed to load_htable() advance $Xip and $inp to
# their byte 15, turn $len into &inp[len], and carve a 512-byte stack
# area whose address has its low 8 bits cleared. The code that follows
# copies the table to sp+256 and stores a copy of every entry shifted
# right by 4 bits ("Hshr4") starting at sp; finally $Htbl is repointed
# at the stack copy and $rem_8bit is resolved relative to ip.
$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
# NB: the block below deliberately reuses \$i as left by the loop above
# (i.e. 6) to finish the last two entries without further loads.
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

# gcm_ghash_4bit, part 2: register assignment and pipeline ramp-up.
#
# The main loop is unrolled by hand rather than using hardware register
# rotation: @xi and @rem are two-element lists of register names that
# are swapped ("rotated") after every emitted stage, so $xi[0]/$rem[0]
# always name the register being produced and $xi[1]/$rem[1] the one
# produced one stage earlier. The (pNN) tags in the emitted comments
# name the conceptual pipeline stage each instruction belongs to.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# gcm_ghash_4bit, part 3: thirteen copies of the full steady-state stage
# (all of p16..p19 active), with @xi/@rem swapped after each copy.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

# gcm_ghash_4bit, part 4: pipeline drain. Same stage as the steady-state
# body with the (p16) loads removed first, then the (p17) work as well;
# the second fragment shifts by 4 rather than 8 bits.
$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# gcm_ghash_4bit, part 5: last reduction step, loop-back and epilogue.
#
# p6 is set while $inp is still below the end marker computed in the
# prologue; in that case the next block's first two bytes are primed
# from the freshly computed Z.lo (the [pNN]-tagged instructions) and
# control returns to .LOOP, where the pending Z.hi store is performed.
# Otherwise Z.hi is stored here, the user mask is restored via $rum, sp
# is reset from prevsp and the procedure returns.
$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	//	&Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
# Shared constant tables, emitted after the code: rem_4bit (16 eight-byte
# entries, 128 bytes, 128-byte aligned as the 'dep' trick in loop()
# requires) and rem_8bit (256 two-byte entries, 512 bytes), followed by
# an identification string.
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process and emit the accumulated assembly.
#
# On big-endian targets the mux1 ...,\@rev instructions are replaced by
# "nop.i 0x0" (the original whitespace after the mnemonic is kept).
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Every `expr` left in the text is replaced by the value of evaluating
# expr as Perl (used above for computed register numbers and indices).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# STDOUT may be a redirected file; a failed flush at close time would
# otherwise leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";