/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The basic framework for this code came from the reference
 * implementation for MD5.  That implementation is Copyright (C)
 * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
 *
 * License to copy and use this software is granted provided that it
 * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
 * Algorithm" in all material mentioning or referencing this software
 * or this function.
 *
 * License is also granted to make and use derivative works provided
 * that such works are identified as "derived from the RSA Data
 * Security, Inc. MD5 Message-Digest Algorithm" in all material
 * mentioning or referencing the derived work.
 *
 * RSA Data Security, Inc. makes no representations concerning either
 * the merchantability of this software or the suitability of this
 * software for any particular purpose. It is provided "as is"
 * without express or implied warranty of any kind.
 *
 * These notices must be retained in any copies of any part of this
 * documentation and/or software.
 *
 * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
 * standard, available at http://www.itl.nist.gov/fipspubs/fip180-1.htm
 * Not as fast as one would like -- further optimizations are encouraged
 * and appreciated.
 */

#if defined(_STANDALONE)
#include <sys/cdefs.h>
#define	_RESTRICT_KYWD	restrict
#else
#if !defined(_KERNEL) && !defined(_BOOT)
#include <stdint.h>
#include <strings.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/systeminfo.h>
#endif  /* !_KERNEL && !_BOOT */
#endif	/* _STANDALONE */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/sha1.h>
#include <sys/sha1_consts.h>

#if defined(_STANDALONE)
#include <sys/endian.h>
#define	HAVE_HTONL
#if _BYTE_ORDER == _LITTLE_ENDIAN
#undef _BIG_ENDIAN
#else
#undef _LITTLE_ENDIAN
#endif
#else
#ifdef _LITTLE_ENDIAN
#include <sys/byteorder.h>
#define	HAVE_HTONL
#endif
#endif /* _STANDALONE */

#ifdef	_BOOT
#define	bcopy(_s, _d, _l)	((void) memcpy((_d), (_s), (_l)))
#define	bzero(_m, _l)		((void) memset((_m), 0, (_l)))
#endif

static void Encode(uint8_t *, const uint32_t *, size_t);

#if	defined(__sparc)

#define	SHA1_TRANSFORM(ctx, in) \
	SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
		(ctx)->state[3], (ctx)->state[4], (ctx), (in))

static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
    SHA1_CTX *, const uint8_t *);

#elif	defined(__amd64)

#define	SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
#define	SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
		(in), (num))

void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);

#else

#define	SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))

static void SHA1Transform(SHA1_CTX *, const uint8_t *);

#endif


static uint8_t PADDING[64] = { 0x80, /* all zeros */ };

/*
 * F, G, and H are the basic SHA1 functions.
 */
#define	F(b, c, d)	(((b) & (c)) | ((~b) & (d)))
#define	G(b, c, d)	((b) ^ (c) ^ (d))
#define	H(b, c, d)	(((b) & (c)) | (((b)|(c)) & (d)))
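
/*
 * In FIPS 180-1 terms, F is the "choose" function Ch, G is the parity
 * function, and H is the majority function Maj; the form of H above relies
 * on the identity (b & c) | (b & d) | (c & d) == (b & c) | ((b | c) & d).
 */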

/*
 * ROTATE_LEFT rotates x left n bits.
 */

#if	defined(__GNUC__) && defined(_LP64)
static __inline__ uint64_t
ROTATE_LEFT(uint64_t value, uint32_t n)
{
	uint32_t t32;

	t32 = (uint32_t)value;
	return ((t32 << n) | (t32 >> (32 - n)));
}

#else

#define	ROTATE_LEFT(x, n)	\
	(((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))

#endif
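
/*
 * For example, ROTATE_LEFT(0x80000001U, 1) yields 0x00000003: the high bit
 * wraps around into bit 0 while the remaining bits shift left by one.
 */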


/*
 * SHA1Init()
 *
 * purpose: initializes the sha1 context and begins an sha1 digest operation
 *   input: SHA1_CTX *	: the context to initialize.
 *  output: void
 */

void
SHA1Init(SHA1_CTX *ctx)
{
	ctx->count[0] = ctx->count[1] = 0;

	/*
	 * load magic initialization constants. Tell lint
	 * that these constants are unsigned by using U.
	 */

	ctx->state[0] = 0x67452301U;
	ctx->state[1] = 0xefcdab89U;
	ctx->state[2] = 0x98badcfeU;
	ctx->state[3] = 0x10325476U;
	ctx->state[4] = 0xc3d2e1f0U;
}

#ifdef VIS_SHA1
#ifdef _KERNEL

#include <sys/regset.h>
#include <sys/vis.h>
#include <sys/fpu/fpusystm.h>

/* the alignment for block stores to save fp registers */
#define	VIS_ALIGN	(64)

extern int sha1_savefp(kfpu_t *, int);
extern void sha1_restorefp(kfpu_t *);

uint32_t	vis_sha1_svfp_threshold = 128;

#endif /* _KERNEL */

/*
 * VIS SHA-1 consts.
 */
static uint64_t VIS[] = {
	0x8000000080000000ULL,
	0x0002000200020002ULL,
	0x5a8279996ed9eba1ULL,
	0x8f1bbcdcca62c1d6ULL,
	0x012389ab456789abULL};

extern void SHA1TransformVIS(uint64_t *, uint32_t *, uint32_t *, uint64_t *);


/*
 * SHA1Update()
 *
 * purpose: continues an sha1 digest operation, using the message block
 *          to update the context.
 *   input: SHA1_CTX *	: the context to update
 *          void *	: the message block
 *          size_t    : the length of the message block in bytes
 *  output: void
 */

void
SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
{
	uint32_t i, buf_index, buf_len;
	uint64_t X0[40], input64[8];
	const uint8_t *input = inptr;
#ifdef _KERNEL
	int usevis = 0;
#else
	int usevis = 1;
#endif /* _KERNEL */

	/* check for noop */
	if (input_len == 0)
		return;

	/* compute number of bytes mod 64 */
	buf_index = (ctx->count[1] >> 3) & 0x3F;

	/* update number of bits */
	if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
		ctx->count[0]++;

	ctx->count[0] += (input_len >> 29);
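	/*
	 * count[1] holds the low 32 bits of the running bit count and
	 * count[0] the high 32 bits.  For example, hashing 100 bytes
	 * adds 800 bits to count[1]; (input_len >> 29) adds any carry
	 * from byte lengths of 2^29 or more into count[0], and the
	 * overflow check above handles wraparound of count[1] itself.
	 */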

	buf_len = 64 - buf_index;

	/* transform as many times as possible */
	i = 0;
	if (input_len >= buf_len) {
#ifdef _KERNEL
		kfpu_t *fpu;
		if (fpu_exists) {
			uint8_t fpua[sizeof (kfpu_t) + GSR_SIZE + VIS_ALIGN];
			uint32_t len = (input_len + buf_index) & ~0x3f;
			int svfp_ok;

			fpu = (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, 64);
			svfp_ok = ((len >= vis_sha1_svfp_threshold) ? 1 : 0);
			usevis = fpu_exists && sha1_savefp(fpu, svfp_ok);
		} else {
			usevis = 0;
		}
#endif /* _KERNEL */

		/*
		 * general optimization:
		 *
		 * only do initial bcopy() and SHA1Transform() if
		 * buf_index != 0.  if buf_index == 0, we're just
		 * wasting our time doing the bcopy() since there
		 * wasn't any data left over from a previous call to
		 * SHA1Update().
		 */

		if (buf_index) {
			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
			if (usevis) {
				SHA1TransformVIS(X0,
				    ctx->buf_un.buf32,
				    &ctx->state[0], VIS);
			} else {
				SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
			}
			i = buf_len;
		}

		/*
		 * VIS SHA-1: uses the VIS 1.0 instructions to accelerate
		 * SHA-1 processing. This is achieved by "offloading" the
		 * computation of the message schedule (MS) to the VIS units.
		 * This allows the VIS computation of the message schedule
		 * to be performed in parallel with the standard integer
		 * processing of the remainder of the SHA-1 computation.
		 * This can improve performance by up to around 1.37X,
		 * compared to an optimized integer-only implementation.
		 *
		 * The VIS implementation of SHA1Transform has a different API
		 * to the standard integer version:
		 *
		 * void SHA1TransformVIS(
		 *	 uint64_t *, // Pointer to MS for ith block
		 *	 uint32_t *, // Pointer to ith block of message data
		 *	 uint32_t *, // Pointer to SHA state i.e. ctx->state
		 *	 uint64_t *, // Pointer to various VIS constants
		 * )
		 *
		 * Note: the message data must be 4-byte aligned.
		 *
		 * Function requires VIS 1.0 support.
		 *
		 * Handling is provided to deal with arbitrary byte alignment
		 * of the input data but the performance gains are reduced
		 * for alignments other than 4-bytes.
		 */
		if (usevis) {
			if (!IS_P2ALIGNED(&input[i], sizeof (uint32_t))) {
				/*
				 * Main processing loop - input misaligned
				 */
				for (; i + 63 < input_len; i += 64) {
					bcopy(&input[i], input64, 64);
					SHA1TransformVIS(X0,
					    (uint32_t *)input64,
					    &ctx->state[0], VIS);
				}
			} else {
				/*
				 * Main processing loop - input 4-byte aligned
				 */
				for (; i + 63 < input_len; i += 64) {
					SHA1TransformVIS(X0,
					    /* LINTED E_BAD_PTR_CAST_ALIGN */
					    (uint32_t *)&input[i], /* CSTYLED */
					    &ctx->state[0], VIS);
				}

			}
#ifdef _KERNEL
			sha1_restorefp(fpu);
#endif /* _KERNEL */
		} else {
			for (; i + 63 < input_len; i += 64) {
				SHA1_TRANSFORM(ctx, &input[i]);
			}
		}

		/*
		 * general optimization:
		 *
		 * if i and input_len are the same, return now instead
		 * of calling bcopy(), since the bcopy() in this case
		 * will be an expensive nop.
		 */

		if (input_len == i)
			return;

		buf_index = 0;
	}

	/* buffer remaining input */
	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
}

#else /* VIS_SHA1 */

void
SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
{
	uint32_t i, buf_index, buf_len;
	const uint8_t *input = inptr;
#if defined(__amd64)
	uint32_t	block_count;
#endif	/* __amd64 */

	/* check for noop */
	if (input_len == 0)
		return;

	/* compute number of bytes mod 64 */
	buf_index = (ctx->count[1] >> 3) & 0x3F;

	/* update number of bits */
	if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
		ctx->count[0]++;

	ctx->count[0] += (input_len >> 29);

	buf_len = 64 - buf_index;

	/* transform as many times as possible */
	i = 0;
	if (input_len >= buf_len) {

		/*
		 * general optimization:
		 *
		 * only do initial bcopy() and SHA1Transform() if
		 * buf_index != 0.  if buf_index == 0, we're just
		 * wasting our time doing the bcopy() since there
		 * wasn't any data left over from a previous call to
		 * SHA1Update().
		 */

		if (buf_index) {
			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
			SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
			i = buf_len;
		}

#if !defined(__amd64)
		for (; i + 63 < input_len; i += 64)
			SHA1_TRANSFORM(ctx, &input[i]);
#else
		block_count = (input_len - i) >> 6;
		if (block_count > 0) {
			SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
			i += block_count << 6;
		}
#endif	/* !__amd64 */

		/*
		 * general optimization:
		 *
		 * if i and input_len are the same, return now instead
		 * of calling bcopy(), since the bcopy() in this case
		 * will be an expensive nop.
		 */

		if (input_len == i)
			return;

		buf_index = 0;
	}

	/* buffer remaining input */
	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
}

#endif /* VIS_SHA1 */

/*
 * SHA1Final()
 *
 * purpose: ends an sha1 digest operation, finalizing the message digest and
 *          zeroing the context.
 *   input: uchar_t *	: A buffer to store the digest.
 *			: The function actually uses void* because many
 *			: callers pass things other than uchar_t here.
 *          SHA1_CTX *  : the context to finalize, save, and zero
 *  output: void
 */

void
SHA1Final(void *digest, SHA1_CTX *ctx)
{
	uint8_t		bitcount_be[sizeof (ctx->count)];
	uint32_t	index = (ctx->count[1] >> 3) & 0x3f;

	/* store bit count, big endian */
	Encode(bitcount_be, ctx->count, sizeof (bitcount_be));

	/* pad out to 56 mod 64 */
	SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
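	/*
	 * For example, if index is 20, 36 bytes of PADDING bring the
	 * buffered data to 56 bytes, leaving room for the 8-byte length;
	 * if index is 60, 120 - 60 = 60 bytes of PADDING run through the
	 * end of the current block and stop at offset 56 of the next one.
	 */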

	/* append length (before padding) */
	SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));

	/* store state in digest */
	Encode(digest, ctx->state, sizeof (ctx->state));

	/* zeroize sensitive information */
	bzero(ctx, sizeof (*ctx));
}
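
/*
 * Illustrative usage of the interface above (a minimal sketch; msg, msglen,
 * and digest are hypothetical caller-side names):
 *
 *	SHA1_CTX ctx;
 *	uint8_t digest[20];
 *
 *	SHA1Init(&ctx);
 *	SHA1Update(&ctx, msg, msglen);
 *	SHA1Final(digest, &ctx);
 *
 * SHA1Update() may be called any number of times before SHA1Final(), which
 * stores the 20-byte digest and zeroes the context.
 */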


#if !defined(__amd64)

typedef uint32_t sha1word;

/*
 * sparc optimization:
 *
 * on the sparc, we can load big endian 32-bit data easily.  note that
 * special care must be taken to ensure the address is 32-bit aligned.
 * in the interest of speed, we don't check to make sure, since
 * careful programming can guarantee this for us.
 */

#if	defined(_BIG_ENDIAN)
#define	LOAD_BIG_32(addr)	(*(uint32_t *)(addr))

#elif	defined(HAVE_HTONL)
#define	LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))

#else
/* little endian -- will work on big endian, but slowly */
#define	LOAD_BIG_32(addr)	\
	(((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
#endif	/* _BIG_ENDIAN */
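
/*
 * In all three cases, LOAD_BIG_32() reads four consecutive bytes as a
 * big endian 32-bit word; e.g. the bytes 0x01 0x02 0x03 0x04 load as
 * 0x01020304 regardless of the host's byte order.
 */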

/*
 * SHA1Transform()
 */
#if	defined(W_ARRAY)
#define	W(n) w[n]
#else	/* !defined(W_ARRAY) */
#define	W(n) w_ ## n
#endif	/* !defined(W_ARRAY) */


#if	defined(__sparc)

/*
 * sparc register window optimization:
 *
 * `a', `b', `c', `d', and `e' are passed into SHA1Transform
 * explicitly since it increases the number of registers available to
 * the compiler.  under this scheme, these variables can be held in
 * %i0 - %i4, which leaves more local and out registers available.
 *
 * purpose: sha1 transformation -- updates the digest based on `block'
 *   input: uint32_t	: bytes  1 -  4 of the digest
 *          uint32_t	: bytes  5 -  8 of the digest
 *          uint32_t	: bytes  9 - 12 of the digest
 *          uint32_t	: bytes 13 - 16 of the digest
 *          uint32_t	: bytes 17 - 20 of the digest
 *          SHA1_CTX *	: the context to update
 *          uint8_t [64]: the block to use to update the digest
 *  output: void
 */

void
SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
    SHA1_CTX *ctx, const uint8_t blk[64])
{
	/*
	 * sparc optimization:
	 *
	 * while it is somewhat counter-intuitive, on sparc, it is
	 * more efficient to place all the constants used in this
	 * function in an array and load the values out of the array
	 * than to manually load the constants.  this is because
	 * setting a register to a 32-bit value takes two ops in most
	 * cases: a `sethi' and an `or', but loading a 32-bit value
	 * from memory only takes one `ld' (or `lduw' on v9).  while
	 * this increases memory usage, the compiler can find enough
	 * other things to do while waiting so that the pipeline does
	 * not stall.  additionally, it is likely that many of these
	 * constants are cached so that later accesses do not even go
	 * out to the bus.
	 *
	 * this array is declared `static' to keep the compiler from
	 * having to bcopy() this array onto the stack frame of
	 * SHA1Transform() each time it is called -- which is
	 * unacceptably expensive.
	 *
	 * the `const' is to ensure that callers are good citizens and
	 * do not try to munge the array.  since these routines are
	 * going to be called from inside multithreaded kernelland,
	 * this is a good safety check. -- `sha1_consts' will end up in
	 * .rodata.
	 *
	 * unfortunately, loading from an array in this manner hurts
	 * performance under Intel.  So, there is a macro,
	 * SHA1_CONST(), used in SHA1Transform(), that either expands to
	 * a reference to this array, or to the actual constant,
	 * depending on what platform this code is compiled for.
	 */

	static const uint32_t sha1_consts[] = {
		SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3
	};

	/*
	 * general optimization:
	 *
	 * use individual integers instead of using an array.  this is a
	 * win, although the amount it wins by seems to vary quite a bit.
	 */

	uint32_t	w_0, w_1, w_2,  w_3,  w_4,  w_5,  w_6,  w_7;
	uint32_t	w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;

	/*
	 * sparc optimization:
	 *
	 * if `block' is already aligned on a 4-byte boundary, use
	 * LOAD_BIG_32() directly.  otherwise, bcopy() into a
	 * buffer that *is* aligned on a 4-byte boundary and then do
	 * the LOAD_BIG_32() on that buffer.  benchmarks have shown
	 * that using the bcopy() is better than loading the bytes
	 * individually and doing the endian-swap by hand.
	 *
	 * even though it's quite tempting to do:
	 *
	 * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
	 *
	 * and only have one set of LOAD_BIG_32()'s, the compiler
	 * *does not* like that, so please resist the urge.
	 */

	if ((uintptr_t)blk & 0x3) {		/* not 4-byte aligned? */
		bcopy(blk, ctx->buf_un.buf32,  sizeof (ctx->buf_un.buf32));
		w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
		w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
		w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
		w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
		w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
		w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
		w_9  = LOAD_BIG_32(ctx->buf_un.buf32 +  9);
		w_8  = LOAD_BIG_32(ctx->buf_un.buf32 +  8);
		w_7  = LOAD_BIG_32(ctx->buf_un.buf32 +  7);
		w_6  = LOAD_BIG_32(ctx->buf_un.buf32 +  6);
		w_5  = LOAD_BIG_32(ctx->buf_un.buf32 +  5);
		w_4  = LOAD_BIG_32(ctx->buf_un.buf32 +  4);
		w_3  = LOAD_BIG_32(ctx->buf_un.buf32 +  3);
		w_2  = LOAD_BIG_32(ctx->buf_un.buf32 +  2);
		w_1  = LOAD_BIG_32(ctx->buf_un.buf32 +  1);
		w_0  = LOAD_BIG_32(ctx->buf_un.buf32 +  0);
	} else {
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_15 = LOAD_BIG_32(blk + 60);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_14 = LOAD_BIG_32(blk + 56);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_13 = LOAD_BIG_32(blk + 52);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_12 = LOAD_BIG_32(blk + 48);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_11 = LOAD_BIG_32(blk + 44);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_10 = LOAD_BIG_32(blk + 40);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_9  = LOAD_BIG_32(blk + 36);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_8  = LOAD_BIG_32(blk + 32);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_7  = LOAD_BIG_32(blk + 28);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_6  = LOAD_BIG_32(blk + 24);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_5  = LOAD_BIG_32(blk + 20);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_4  = LOAD_BIG_32(blk + 16);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_3  = LOAD_BIG_32(blk + 12);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_2  = LOAD_BIG_32(blk +  8);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_1  = LOAD_BIG_32(blk +  4);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		w_0  = LOAD_BIG_32(blk +  0);
	}
#else	/* !defined(__sparc) */

void /* CSTYLED */
SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
{
	/* CSTYLED */
	sha1word a = ctx->state[0];
	sha1word b = ctx->state[1];
	sha1word c = ctx->state[2];
	sha1word d = ctx->state[3];
	sha1word e = ctx->state[4];

#if	defined(W_ARRAY)
	sha1word	w[16];
#else	/* !defined(W_ARRAY) */
	sha1word	w_0, w_1, w_2,  w_3,  w_4,  w_5,  w_6,  w_7;
	sha1word	w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
#endif	/* !defined(W_ARRAY) */

	W(0)  = LOAD_BIG_32((void *)(blk +  0));
	W(1)  = LOAD_BIG_32((void *)(blk +  4));
	W(2)  = LOAD_BIG_32((void *)(blk +  8));
	W(3)  = LOAD_BIG_32((void *)(blk + 12));
	W(4)  = LOAD_BIG_32((void *)(blk + 16));
	W(5)  = LOAD_BIG_32((void *)(blk + 20));
	W(6)  = LOAD_BIG_32((void *)(blk + 24));
	W(7)  = LOAD_BIG_32((void *)(blk + 28));
	W(8)  = LOAD_BIG_32((void *)(blk + 32));
	W(9)  = LOAD_BIG_32((void *)(blk + 36));
	W(10) = LOAD_BIG_32((void *)(blk + 40));
	W(11) = LOAD_BIG_32((void *)(blk + 44));
	W(12) = LOAD_BIG_32((void *)(blk + 48));
	W(13) = LOAD_BIG_32((void *)(blk + 52));
	W(14) = LOAD_BIG_32((void *)(blk + 56));
	W(15) = LOAD_BIG_32((void *)(blk + 60));

#endif	/* !defined(__sparc) */

	/*
	 * general optimization:
	 *
	 * even though this approach is described in the standard as
	 * being slower algorithmically, it is 30-40% faster than the
	 * "faster" version under SPARC, because this version has more
	 * of the constraints specified at compile-time and uses fewer
	 * variables (and therefore has better register utilization)
	 * than its "speedier" brother.  (i've tried both, trust me)
	 *
	 * for either method given in the spec, there is an "assignment"
	 * phase where the following takes place:
	 *
	 *	tmp = (main_computation);
	 *	e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
	 *
	 * we can make the algorithm go faster by not doing this work,
	 * but just pretending that `d' is now `e', etc. this works
	 * really well and obviates the need for a temporary variable.
	 * however, we still explicitly perform the rotate action,
	 * since it is cheaper on SPARC to do it once than to have to
	 * do it over and over again.
	 */

	/* round 1 */
	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
	b = ROTATE_LEFT(b, 30);

	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
	a = ROTATE_LEFT(a, 30);

	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
	e = ROTATE_LEFT(e, 30);

	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
	d = ROTATE_LEFT(d, 30);

	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
	c = ROTATE_LEFT(c, 30);

	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
	b = ROTATE_LEFT(b, 30);

	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
	a = ROTATE_LEFT(a, 30);

	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
	e = ROTATE_LEFT(e, 30);

	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
	d = ROTATE_LEFT(d, 30);

	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
	c = ROTATE_LEFT(c, 30);

	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
	b = ROTATE_LEFT(b, 30);

	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
	a = ROTATE_LEFT(a, 30);

	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
	e = ROTATE_LEFT(e, 30);

	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
	d = ROTATE_LEFT(d, 30);

	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
	c = ROTATE_LEFT(c, 30);

	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
	b = ROTATE_LEFT(b, 30);

	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 16 */
	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
	a = ROTATE_LEFT(a, 30);

	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 17 */
	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
	e = ROTATE_LEFT(e, 30);

	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 18 */
	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
	d = ROTATE_LEFT(d, 30);

	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 19 */
	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
	c = ROTATE_LEFT(c, 30);

	/* round 2 */
	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 20 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
	b = ROTATE_LEFT(b, 30);

	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 21 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
	a = ROTATE_LEFT(a, 30);

	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 22 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
	e = ROTATE_LEFT(e, 30);

	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 23 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
	d = ROTATE_LEFT(d, 30);

	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 24 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
	c = ROTATE_LEFT(c, 30);

	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 25 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
	b = ROTATE_LEFT(b, 30);

	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 26 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
	a = ROTATE_LEFT(a, 30);

	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 27 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
	e = ROTATE_LEFT(e, 30);

	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 28 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
	d = ROTATE_LEFT(d, 30);

	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 29 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
	c = ROTATE_LEFT(c, 30);

	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 30 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
	b = ROTATE_LEFT(b, 30);

	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 31 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
	a = ROTATE_LEFT(a, 30);

	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 32 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
	e = ROTATE_LEFT(e, 30);

	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 33 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
	d = ROTATE_LEFT(d, 30);

	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 34 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
	c = ROTATE_LEFT(c, 30);

	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 35 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
	b = ROTATE_LEFT(b, 30);

	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 36 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
	a = ROTATE_LEFT(a, 30);

	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 37 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
	e = ROTATE_LEFT(e, 30);

	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 38 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
	d = ROTATE_LEFT(d, 30);

	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 39 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
	c = ROTATE_LEFT(c, 30);

	/* round 3 */
	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 40 */
	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
	b = ROTATE_LEFT(b, 30);

	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 41 */
	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
	a = ROTATE_LEFT(a, 30);

	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 42 */
	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
	e = ROTATE_LEFT(e, 30);

	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 43 */
	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
	d = ROTATE_LEFT(d, 30);

	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 44 */
	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
	c = ROTATE_LEFT(c, 30);

	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 45 */
	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
	b = ROTATE_LEFT(b, 30);

	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 46 */
	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
	a = ROTATE_LEFT(a, 30);

	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 47 */
	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
	e = ROTATE_LEFT(e, 30);

	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 48 */
	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
	d = ROTATE_LEFT(d, 30);

	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 49 */
	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
	c = ROTATE_LEFT(c, 30);

	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 50 */
	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
	b = ROTATE_LEFT(b, 30);

	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 51 */
	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
	a = ROTATE_LEFT(a, 30);

	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 52 */
	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
	e = ROTATE_LEFT(e, 30);

	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 53 */
	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
	d = ROTATE_LEFT(d, 30);

	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 54 */
	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
	c = ROTATE_LEFT(c, 30);

	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 55 */
	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
	b = ROTATE_LEFT(b, 30);

	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 56 */
	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
	a = ROTATE_LEFT(a, 30);

	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 57 */
	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
	e = ROTATE_LEFT(e, 30);

	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 58 */
	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
	d = ROTATE_LEFT(d, 30);

	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 59 */
	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
	c = ROTATE_LEFT(c, 30);

	/* round 4 */
	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 60 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
	b = ROTATE_LEFT(b, 30);

	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 61 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
	a = ROTATE_LEFT(a, 30);

	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 62 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
	e = ROTATE_LEFT(e, 30);

	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 63 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
	d = ROTATE_LEFT(d, 30);

	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 64 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
	c = ROTATE_LEFT(c, 30);

	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 65 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
	b = ROTATE_LEFT(b, 30);

	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 66 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
	a = ROTATE_LEFT(a, 30);

	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 67 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
	e = ROTATE_LEFT(e, 30);

	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 68 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
	d = ROTATE_LEFT(d, 30);

	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 69 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
	c = ROTATE_LEFT(c, 30);

	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 70 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
	b = ROTATE_LEFT(b, 30);

	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 71 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
	a = ROTATE_LEFT(a, 30);

	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 72 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
	e = ROTATE_LEFT(e, 30);

	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 73 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
	d = ROTATE_LEFT(d, 30);

	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 74 */
	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
	c = ROTATE_LEFT(c, 30);

	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 75 */
	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
	b = ROTATE_LEFT(b, 30);

	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 76 */
	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
	a = ROTATE_LEFT(a, 30);

	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 77 */
	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
	e = ROTATE_LEFT(e, 30);

	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 78 */
	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
	d = ROTATE_LEFT(d, 30);

	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 79 */

	ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
	    SHA1_CONST(3);
	ctx->state[1] += b;
	ctx->state[2] += ROTATE_LEFT(c, 30);
	ctx->state[3] += d;
	ctx->state[4] += e;

	/* zeroize sensitive information */
	W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
	W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
}
#endif	/* !__amd64 */


/*
 * Encode()
 *
 * purpose: to convert a list of 32-bit words from host byte order to a
 *          big endian byte stream
 *   input: uint8_t *	: place to store the converted big endian numbers
 *	    uint32_t *	: place to get numbers to convert from
 *          size_t	: the length of the input in bytes
 *  output: void
 */
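
/*
 * For example, with input[0] == 0x01020304 and len == 4, Encode() writes
 * the bytes 0x01, 0x02, 0x03, 0x04 to output in that order.
 */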

static void
Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
    size_t len)
{
	size_t		i, j;

#if	defined(__sparc)
	if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
		for (i = 0, j = 0; j < len; i++, j += 4) {
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			*((uint32_t *)(output + j)) = input[i];
		}
	} else {
#endif	/* little endian -- will work on big endian, but slowly */
		for (i = 0, j = 0; j < len; i++, j += 4) {
			output[j]	= (input[i] >> 24) & 0xff;
			output[j + 1]	= (input[i] >> 16) & 0xff;
			output[j + 2]	= (input[i] >>  8) & 0xff;
			output[j + 3]	= input[i] & 0xff;
		}
#if	defined(__sparc)
	}
#endif
}