/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "../arcfour.h"

/* Initialize the key stream 'key' using the key value */
void
arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
{
	uchar_t ext_keyval[256];
	uchar_t tmp;
	int i, j;

	for (i = j = 0; i < 256; i++, j++) {
		if (j == keyvallen)
			j = 0;

		ext_keyval[i] = keyval[j];
	}
	for (i = 0; i < 256; i++)
		key->arr[i] = (uchar_t)i;

	j = 0;
	for (i = 0; i < 256; i++) {
		j = (j + key->arr[i] + ext_keyval[i]) % 256;

		tmp = key->arr[i];
		key->arr[i] = key->arr[j];
		key->arr[j] = tmp;
	}
	key->i = 0;
	key->j = 0;
}

/*
 * Encipher 'in' using 'key'.
 * 'in' and 'out' can point to the same location.
 */
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
	size_t ii;
	unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
	uchar_t i, j, *base, jj, *base1, tmp;
	unsigned int tmp0, tmp1, i_accum, shift = 0, i1;
	int index;

	base = key->arr;

	index = (((uintptr_t)in) & 0x7);

	/* Get the 'in' on an 8-byte alignment */
	if (index > 0) {
		i = key->i;
		j = key->j;

		for (index = 8 - index; (index-- > 0) && len > 0;
		    len--, in++, out++) {
			i = i + 1;
			j = j + key->arr[i];

			tmp = key->arr[i];
			key->arr[i] = key->arr[j];
			key->arr[j] = tmp;

			tmp = key->arr[i] + key->arr[j];
			*out = *in ^ key->arr[tmp];
		}
		key->i = i;
		key->j = j;
	}
	if (len == 0)
		return;

	/* See if we're fortunate and 'out' got aligned as well */

	/*
	 * Niagara optimized version for
	 * the cases where the input and output buffers are aligned on
	 * an 8-byte boundary.
	 */
#ifdef	sun4v
	if ((((uintptr_t)out) & 7) != 0) {
#endif	/* sun4v */
		i = key->i;
		j = key->j;
		for (ii = 0; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
#ifdef	sun4v
	} else {
		i = key->i;
		j = key->j;

		/*
		 * Want to align base[i] on a 2B boundary -- allows updates
		 * via [i] to be performed in 2B chunks (reducing # of stores).
		 * Requires appropriate alias detection.
		 */
		if (((i+1) % 2) != 0) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			merge0 = (unsigned long long)(base[tmp0]) << 56;
			shift = 8;
			mask = 0xff;
		}

		/*
		 * Note - 'in' and 'out' may now be misaligned, so the code
		 * that updates [out] in 8B chunks has to handle this
		 * possibility.  There can also be a 1B overrun, so the main
		 * loop needs to drop out early to leave room for it.
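		 *
		 * (When the 2B-alignment step above runs, shift/mask become
		 * non-zero and merge0 carries one keystream byte across loop
		 * iterations: each 8B store combines the carried byte with
		 * seven freshly generated ones, and the last carried byte is
		 * emitted in the "overrun" step after the loop.)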
		 */
		for (ii = 0, i1 = i; ii < ((len-1) & (~7));
		    ii += 8, i1 = i1 & 0xff) {

			/*
			 * If i1 is less than 248, it won't wrap around
			 * (mod 256) within this iteration, so there is no
			 * need to mask i1 after each increment.
			 */
			if (i1 < 248) {

				/* BYTE 0 */
				i1 = (i1 + 1);

				/*
				 * Creating this base pointer reduces
				 * subsequent arithmetic ops required
				 * to load [i]
				 *
				 * N.B. don't need to check if [j] aliases.
				 * [i] and [j] end up with the same values
				 * anyway.
				 */
				base1 = &base[i1];

				tmp0 = base1[0];
				j = j + tmp0;

				tmp1 = base[j];
				/*
				 * Don't store [i] yet
				 */
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				/*
				 * Check [tmp0] doesn't alias with [i]
				 */

				/*
				 * Updating [out] in 8B chunks
				 */
				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0])
					    << 56;
				}

				/* BYTE 1 */
				tmp0 = base1[1];
				j = j + tmp0;

				/*
				 * [j] can now alias with [i] and [i-1].
				 * If it aliases, abort speculation.
				 */
				if ((i1 ^ j) < 2) {
					base1[0] = (uchar_t)i_accum;

					tmp1 = base[j];

					base1[1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					merge |= (unsigned long long)
					    (base[tmp0]) << 48;
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					/*
					 * Speculation succeeded! Update [i]
					 * in a 2B chunk.
					 */
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;
					merge |=
					    (unsigned long long)(base[tmp0])
					    << 48;
				}

				/*
				 * Too expensive to perform [i] speculation for
				 * every byte. Just need to reduce frequency
				 * of stores until store buffer full stalls
				 * are not the bottleneck.
				 */

				/* BYTE 2 */
				tmp0 = base1[2];
				j = j + tmp0;
				tmp1 = base[j];
				base1[2] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp1 += tmp0;
				tmp1 = tmp1 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp1]) << 40;

				/* BYTE 3 */
				tmp0 = base1[3];
				j = j + tmp0;
				tmp1 = base[j];
				base1[3] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				tmp0 = base1[4];
				j = j + tmp0;
				tmp1 = base[j];
				base1[4] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base1[5];
				j = j + tmp0;
				tmp1 = base[j];
				base1[5] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+6);
				tmp0 = base1[6];
				j = j + tmp0;

				tmp1 = base[j];

				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0])
					    << 8;
				}

				/* BYTE 7 */
				tmp0 = base1[7];

				/*
				 * Perform [i] speculation again. Identical
				 * to that performed for BYTE 0 and BYTE 1.
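				 *
				 * (i_accum still holds the value destined for
				 * base1[6]; deferring that store lets the
				 * non-aliasing path below fold the [i] updates
				 * for bytes 6 and 7 into one 2B store.)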
				 */
				j = j + tmp0;
				if ((i1 ^ j) < 2) {
					base1[6] = (uchar_t)i_accum;

					tmp1 = base[j];

					base1[7] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					merge |=
					    (unsigned long long)(base[tmp0]);
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;
					merge |=
					    (unsigned long long)(base[tmp0]);
				}
				i1++;
			} else {
				/*
				 * i is too close to wrap-around to allow
				 * masking to be disregarded
				 */

				/*
				 * Same old speculation for BYTE 0 and BYTE 1
				 */

				/* BYTE 0 */
				i1 = (i1 + 1) & 0xff;
				jj = (uchar_t)i1;

				tmp0 = base[i1];
				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0])
					    << 56;
				}

				/* BYTE 1 */
				tmp0 = base[i1+1];
				j = j + tmp0;

				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;

					tmp1 = base[j];

					base[i1+1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					merge |=
					    (unsigned long long)(base[tmp0])
					    << 48;
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;
					merge |=
					    (unsigned long long)(base[tmp0])
					    << 48;
				}

				/* BYTE 2 */
				/*
				 * Since i1 must be even on loop entry (to
				 * satisfy alignment), it can only wrap around
				 * on the even bytes, so the mask only needs
				 * to be applied every 2nd byte.
				 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 40;

				/* BYTE 3 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |=
				    (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+2) & 0xff;
				jj = (uchar_t)i1;
				tmp0 = base[i1];
				j = j + tmp0;

				tmp1 = base[j];

				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0])
					    << 8;
				}

				/* BYTE 7 */
				i1++;
				tmp0 = base[i1];
				j = j + tmp0;

				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;

					tmp1 = base[j];

					base[i1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					merge |=
					    (unsigned long long)(base[tmp0]);
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;
					merge |=
					    (unsigned long long)(base[tmp0]);
				}
			}

			/*
			 * Perform the update to [out].  Remember that there
			 * could be alignment issues.
			 */
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			in0 = *((unsigned long long *)(&in[ii]));
			merge1 = merge0 | (merge >> shift);
			merge0 = (merge & mask) << 56;
			in0 = in0 ^ merge1;
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			*((unsigned long long *)(&out[ii])) = in0;
		}

		i = (uchar_t)i1;

		/*
		 * Handle any overrun
		 */
		if (shift) {
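			/*
			 * shift is non-zero only if the 2B-alignment step
			 * above ran; merge0 still holds the final carried
			 * keystream byte, so XOR it with the next input
			 * byte here.
			 */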
			out[ii] = in[ii] ^ (merge0 >> 56);
			ii++;
		}

		/*
		 * Handle final few bytes
		 */
		for (; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
	}
#endif	/* sun4v */
}
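
/*
 * Illustrative usage sketch (not part of the original code): a caller seeds
 * the state with arcfour_key_init() and then streams data through
 * arcfour_crypt().  The key bytes and buffer size below are hypothetical.
 *
 *	ARCFour_key key;
 *	uchar_t keybytes[16];
 *	uchar_t buf[1024];
 *
 *	arcfour_key_init(&key, keybytes, sizeof (keybytes));
 *	arcfour_crypt(&key, buf, buf, sizeof (buf));
 *
 * 'in' and 'out' may be the same buffer, as above.  Decryption is the same
 * operation: initialize a fresh key with the same key bytes and run
 * arcfour_crypt() over the ciphertext.
 */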