1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Configuration guide
28  * -------------------
29  *
30  * There are 4 preprocessor symbols used to configure the bignum
31  * implementation.  This file contains no logic to configure based on
32  * processor; we leave that to the Makefiles to specify.
33  *
34  * USE_FLOATING_POINT
35  *   Meaning: There is support for a fast floating-point implementation of
36  *   Montgomery multiply.
37  *
38  * PSR_MUL
39  *   Meaning: There are processor-specific versions of the low level
40  *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
41  *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
42  *   for all 4 functions.  You cannot pick and choose which subset of these
43  *   functions to support; that would lead to a rat's nest of #ifdefs.
44  *
45  * HWCAP
46  *   Meaning: Call multiply support functions through a function pointer.
47  *   On x86, there are multiple implementations for different hardware
48  *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
49  *   a function is first used.  So, the support functions are called through
50  *   a function pointer.  There is no need for that on Sparc, because there
51  *   is only one implementation; support functions are called directly.
52  *   Later, if there were some new VIS instruction, or something, and a
53  *   run-time test were needed, rather than variant kernel modules and
54  *   libraries, then HWCAP would be defined for Sparc, as well.
55  *
56  * UMUL64
57  *   Meaning: It is safe to use generic C code that assumes the existence
58  *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
59  *   then the generic code for big_mul_add_vec() must necessarily be very slow,
60  *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
61  *
62  */
63 
64 
65 #include <sys/types.h>
66 #include "bignum.h"
67 
68 #ifdef	_KERNEL
69 #include <sys/ddi.h>
70 #include <sys/mdesc.h>
71 #include <sys/crypto/common.h>
72 
73 #include <sys/kmem.h>
74 #include <sys/param.h>
75 #include <sys/sunddi.h>
76 
77 #else
78 #include <stdlib.h>
79 #include <stdio.h>
80 #include <assert.h>
81 #define	ASSERT	assert
82 #endif	/* _KERNEL */
83 
84 #ifdef	_LP64 /* truncate 64-bit size_t to 32-bits */
85 #define	UI32(ui)	((uint32_t)ui)
86 #else /* size_t already 32-bits */
87 #define	UI32(ui)	(ui)
88 #endif
89 
90 
91 #ifdef	_KERNEL
92 #define	big_malloc(size)	kmem_alloc(size, KM_NOSLEEP)
93 #define	big_free(ptr, size)	kmem_free(ptr, size)
94 
95 void *
96 big_realloc(void *from, size_t oldsize, size_t newsize)
97 {
98 	void *rv;
99 
100 	rv = kmem_alloc(newsize, KM_NOSLEEP);
101 	if (rv != NULL)
102 		bcopy(from, rv, oldsize);
103 	kmem_free(from, oldsize);
104 	return (rv);
105 }
106 
107 #else	/* _KERNEL */
108 
109 #ifndef MALLOC_DEBUG
110 
111 #define	big_malloc(size)	malloc(size)
112 #define	big_free(ptr, size)	free(ptr)
113 
114 #else
115 
116 void
117 big_free(void *ptr, size_t size)
118 {
119 	printf("freed %d bytes at %p\n", size, ptr);
120 	free(ptr);
121 }
122 
123 void *
124 big_malloc(size_t size)
125 {
126 	void *rv;
127 	rv = malloc(size);
128 	printf("malloced %d bytes, addr:%p\n", size, rv);
129 	return (rv);
130 }
131 #endif /* MALLOC_DEBUG */
132 
133 #define	big_realloc(x, y, z) realloc((x), (z))
134 
135 
136 /*
137  * printbignum()
138  * Print a BIGNUM type to stdout.
139  */
140 void
141 printbignum(char *aname, BIGNUM *a)
142 {
143 	int i;
144 
145 	(void) printf("\n%s\n%d\n", aname, a->sign*a->len);
146 	for (i = a->len - 1; i >= 0; i--) {
147 #ifdef BIGNUM_CHUNK_32
148 		(void) printf("%08x ", a->value[i]);
149 		if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
150 			(void) printf("\n");
151 		}
152 #else
153 		(void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
154 		    (uint32_t)((a->value[i]) & 0xffffffff));
155 		if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
156 			(void) printf("\n");
157 		}
158 #endif
159 	}
160 	(void) printf("\n");
161 }
162 
163 #endif	/* _KERNEL */
164 
165 
166 /*
167  * big_init()
168  * Initialize and allocate memory for a BIGNUM type.
169  *
170  * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
171  *
172  * Note: call big_finish() to free memory allocated by big_init().
173  *
174  * Input:
175  * number	Uninitialized memory for BIGNUM
176  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
177  *
178  * Output:
179  * number	Initialized BIGNUM
180  *
181  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
182  */
183 BIG_ERR_CODE
184 big_init(BIGNUM *number, int size)
185 {
186 	number->value = big_malloc(BIGNUM_WORDSIZE * size);
187 	if (number->value == NULL) {
188 		return (BIG_NO_MEM);
189 	}
190 	number->size = size;
191 	number->len = 0;
192 	number->sign = 1;
193 	number->malloced = 1;
194 	return (BIG_OK);
195 }
196 
197 
198 /*
199  * big_init1()
200  * Initialize and, if needed, allocate memory for a BIGNUM type.
201  * Use the buffer passed, buf, if any, instad of allocating memory
202  * if it's at least "size" bytes.
203  *
204  * Note: call big_finish() to free memory allocated by big_init().
205  *
206  * Input:
207  * number	Uninitialized memory for BIGNUM
208  * size		Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
209  * buf		Buffer for storing a BIGNUM.
210  *		If NULL, big_init1() will allocate a buffer
211  * bufsize	Size, in BIG_CHUNK_SIZE_bit words, of buf
212  *
213  * Output:
214  * number	Initialized BIGNUM
215  *
216  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
217  */
218 BIG_ERR_CODE
219 big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
220 {
221 	if ((buf == NULL) || (size > bufsize)) {
222 		number->value = big_malloc(BIGNUM_WORDSIZE * size);
223 		if (number->value == NULL) {
224 			return (BIG_NO_MEM);
225 		}
226 		number->size = size;
227 		number->malloced = 1;
228 	} else {
229 		number->value = buf;
230 		number->size = bufsize;
231 		number->malloced = 0;
232 	}
233 	number->len = 0;
234 	number->sign = 1;
235 
236 	return (BIG_OK);
237 }
238 
239 
240 /*
241  * big_finish()
242  * Free memory, if any, allocated by big_init() or big_init1().
243  */
244 void
245 big_finish(BIGNUM *number)
246 {
247 	if (number->malloced == 1) {
248 		big_free(number->value, BIGNUM_WORDSIZE * number->size);
249 		number->malloced = 0;
250 	}
251 }
252 
253 
254 /*
255  * bn->size should be at least
256  * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE bytes
257  * converts from byte-big-endian format to bignum format (words in
258  * little endian order, but bytes within the words big endian)
259  */
260 void
261 bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
262 {
263 	int		i, j;
264 	uint32_t	offs;
265 	const uint32_t	slen = UI32(len);
266 	BIG_CHUNK_TYPE	word;
267 	uchar_t		*knwordp;
268 
269 	if (slen == 0) {
270 		bn->len = 1;
271 		bn->value[0] = 0;
272 		return;
273 	}
274 
275 	offs = slen % BIGNUM_WORDSIZE;
276 	bn->len = slen / BIGNUM_WORDSIZE;
277 
278 	for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
279 		knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
280 		word = knwordp[0];
281 		for (j = 1; j < BIGNUM_WORDSIZE; j++) {
282 			word = (word << BITSINBYTE) + knwordp[j];
283 		}
284 		bn->value[i] = word;
285 	}
286 	if (offs > 0) {
287 		word = kn[0];
288 		for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
289 		bn->value[bn->len++] = word;
290 	}
291 	while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
292 		bn->len --;
293 	}
294 }
295 
296 
297 /*
298  * copies the least significant len bytes if
299  * len < bn->len * BIGNUM_WORDSIZE
300  * converts from bignum format to byte-big-endian format.
301  * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
302  */
303 void
304 bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
305 {
306 	int		i, j;
307 	uint32_t	offs;
308 	const uint32_t	slen = UI32(len);
309 	BIG_CHUNK_TYPE	word;
310 
311 	if (len < BIGNUM_WORDSIZE * bn->len) {
312 		for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
313 			word = bn->value[i];
314 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
315 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
316 				    word & 0xff;
317 				word = word >> BITSINBYTE;
318 			}
319 		}
320 		offs = slen % BIGNUM_WORDSIZE;
321 		if (offs > 0) {
322 			word = bn->value[slen / BIGNUM_WORDSIZE];
323 			for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
324 				kn[i - 1] = word & 0xff;
325 				word = word >> BITSINBYTE;
326 			}
327 		}
328 	} else {
329 		for (i = 0; i < bn->len; i++) {
330 			word = bn->value[i];
331 			for (j = 0; j < BIGNUM_WORDSIZE; j++) {
332 				kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
333 				    word & 0xff;
334 				word = word >> BITSINBYTE;
335 			}
336 		}
337 		for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
338 			kn[i] = 0;
339 		}
340 	}
341 }
342 
343 
344 int
345 big_bitlength(BIGNUM *a)
346 {
347 	int		l = 0, b = 0;
348 	BIG_CHUNK_TYPE	c;
349 
350 	l = a->len - 1;
351 	while ((l > 0) && (a->value[l] == 0)) {
352 		l--;
353 	}
354 	b = BIG_CHUNK_SIZE;
355 	c = a->value[l];
356 	while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
357 		c = c << 1;
358 		b--;
359 	}
360 
361 	return (l * BIG_CHUNK_SIZE + b);
362 }
363 
364 
365 BIG_ERR_CODE
366 big_copy(BIGNUM *dest, BIGNUM *src)
367 {
368 	BIG_CHUNK_TYPE	*newptr;
369 	int		i, len;
370 
371 	len = src->len;
372 	while ((len > 1) && (src->value[len - 1] == 0)) {
373 		len--;
374 	}
375 	src->len = len;
376 	if (dest->size < len) {
377 		if (dest->malloced == 1) {
378 			newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
379 			    BIGNUM_WORDSIZE * dest->size,
380 			    BIGNUM_WORDSIZE * len);
381 		} else {
382 			newptr = (BIG_CHUNK_TYPE *)
383 			    big_malloc(BIGNUM_WORDSIZE * len);
384 			if (newptr != NULL) {
385 				dest->malloced = 1;
386 			}
387 		}
388 		if (newptr == NULL) {
389 			return (BIG_NO_MEM);
390 		}
391 		dest->value = newptr;
392 		dest->size = len;
393 	}
394 	dest->len = len;
395 	dest->sign = src->sign;
396 	for (i = 0; i < len; i++) {
397 		dest->value[i] = src->value[i];
398 	}
399 
400 	return (BIG_OK);
401 }
402 
403 
404 BIG_ERR_CODE
405 big_extend(BIGNUM *number, int size)
406 {
407 	BIG_CHUNK_TYPE	*newptr;
408 	int		i;
409 
410 	if (number->size >= size)
411 		return (BIG_OK);
412 	if (number->malloced) {
413 		number->value = big_realloc(number->value,
414 		    BIGNUM_WORDSIZE * number->size,
415 		    BIGNUM_WORDSIZE * size);
416 	} else {
417 		newptr = big_malloc(BIGNUM_WORDSIZE * size);
418 		if (newptr != NULL) {
419 			for (i = 0; i < number->size; i++) {
420 				newptr[i] = number->value[i];
421 			}
422 		}
423 		number->value = newptr;
424 	}
425 
426 	if (number->value == NULL) {
427 		return (BIG_NO_MEM);
428 	}
429 
430 	number->size = size;
431 	number->malloced = 1;
432 	return (BIG_OK);
433 }
434 
435 
436 /* returns 1 if n == 0 */
437 int
438 big_is_zero(BIGNUM *n)
439 {
440 	int	i, result;
441 
442 	result = 1;
443 	for (i = 0; i < n->len; i++) {
444 		if (n->value[i] != 0) {
445 			result = 0;
446 		}
447 	}
448 	return (result);
449 }
450 
451 
452 BIG_ERR_CODE
453 big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
454 {
455 	int		i, shorter, longer;
456 	BIG_CHUNK_TYPE	cy, ai;
457 	BIG_CHUNK_TYPE	*r, *a, *b, *c;
458 	BIG_ERR_CODE	err;
459 	BIGNUM		*longerarg;
460 
461 	if (aa->len > bb->len) {
462 		shorter = bb->len;
463 		longer = aa->len;
464 		longerarg = aa;
465 	} else {
466 		shorter = aa->len;
467 		longer = bb->len;
468 		longerarg = bb;
469 	}
470 	if (result->size < longer + 1) {
471 		err = big_extend(result, longer + 1);
472 		if (err != BIG_OK) {
473 			return (err);
474 		}
475 	}
476 
477 	r = result->value;
478 	a = aa->value;
479 	b = bb->value;
480 	c = longerarg->value;
481 	cy = 0;
482 	for (i = 0; i < shorter; i++) {
483 		ai = a[i];
484 		r[i] = ai + b[i] + cy;
485 		if (r[i] > ai) {
486 			cy = 0;
487 		} else if (r[i] < ai) {
488 			cy = 1;
489 		}
490 	}
491 	for (; i < longer; i++) {
492 		ai = c[i];
493 		r[i] = ai + cy;
494 		if (r[i] >= ai) {
495 			cy = 0;
496 		}
497 	}
498 	if (cy == 1) {
499 		r[i] = cy;
500 		result->len = longer + 1;
501 	} else {
502 		result->len = longer;
503 	}
504 	result->sign = 1;
505 	return (BIG_OK);
506 }
507 
508 
509 /* caller must make sure that result has at least len words allocated */
510 void
511 big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
512 {
513 	int		i;
514 	BIG_CHUNK_TYPE	cy, ai;
515 
516 	cy = 1;
517 	for (i = 0; i < len; i++) {
518 		ai = a[i];
519 		r[i] = ai + (~b[i]) + cy;
520 		if (r[i] > ai) {
521 			cy = 0;
522 		} else if (r[i] < ai) {
523 			cy = 1;
524 		}
525 	}
526 }
527 
528 
529 /* result=aa-bb  it is assumed that aa>=bb */
530 BIG_ERR_CODE
531 big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
532 {
533 	int		i, shorter;
534 	BIG_CHUNK_TYPE	cy = 1, ai;
535 	BIG_CHUNK_TYPE	*r, *a, *b;
536 	BIG_ERR_CODE	err = BIG_OK;
537 
538 	if (aa->len > bb->len) {
539 		shorter = bb->len;
540 	} else {
541 		shorter = aa->len;
542 	}
543 	if (result->size < aa->len) {
544 		err = big_extend(result, aa->len);
545 		if (err != BIG_OK) {
546 			return (err);
547 		}
548 	}
549 
550 	r = result->value;
551 	a = aa->value;
552 	b = bb->value;
553 	result->len = aa->len;
554 	cy = 1;
555 	for (i = 0; i < shorter; i++) {
556 		ai = a[i];
557 		r[i] = ai + (~b[i]) + cy;
558 		if (r[i] > ai) {
559 			cy = 0;
560 		} else if (r[i] < ai) {
561 			cy = 1;
562 		}
563 	}
564 	for (; i < aa->len; i++) {
565 		ai = a[i];
566 		r[i] = ai + (~0) + cy;
567 		if (r[i] < ai) {
568 			cy = 1;
569 		}
570 	}
571 	result->sign = 1;
572 
573 	if (cy == 0) {
574 		return (BIG_INVALID_ARGS);
575 	} else {
576 		return (BIG_OK);
577 	}
578 }
579 
580 
581 /* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
582 int
583 big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
584 {
585 	int	i;
586 
587 	if (aa->len > bb->len) {
588 		for (i = aa->len - 1; i > bb->len - 1; i--) {
589 			if (aa->value[i] > 0) {
590 				return (1);
591 			}
592 		}
593 	} else if (aa->len < bb->len) {
594 		for (i = bb->len - 1; i > aa->len - 1; i--) {
595 			if (bb->value[i] > 0) {
596 				return (-1);
597 			}
598 		}
599 	} else {
600 		i = aa->len - 1;
601 	}
602 	for (; i >= 0; i--) {
603 		if (aa->value[i] > bb->value[i]) {
604 			return (1);
605 		} else if (aa->value[i] < bb->value[i]) {
606 			return (-1);
607 		}
608 	}
609 
610 	return (0);
611 }
612 
613 
614 BIG_ERR_CODE
615 big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
616 {
617 	BIG_ERR_CODE	err;
618 
619 	if ((bb->sign == -1) && (aa->sign == 1)) {
620 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
621 			return (err);
622 		}
623 		result->sign = 1;
624 	} else if ((aa->sign == -1) && (bb->sign == 1)) {
625 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
626 			return (err);
627 		}
628 		result->sign = -1;
629 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
630 		if (big_cmp_abs(aa, bb) >= 0) {
631 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
632 				return (err);
633 			}
634 			result->sign = 1;
635 		} else {
636 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
637 				return (err);
638 			}
639 			result->sign = -1;
640 		}
641 	} else {
642 		if (big_cmp_abs(aa, bb) >= 0) {
643 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
644 				return (err);
645 			}
646 			result->sign = -1;
647 		} else {
648 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
649 				return (err);
650 			}
651 			result->sign = 1;
652 		}
653 	}
654 	return (BIG_OK);
655 }
656 
657 
658 BIG_ERR_CODE
659 big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
660 {
661 	BIG_ERR_CODE	err;
662 
663 	if ((bb->sign == -1) && (aa->sign == -1)) {
664 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
665 			return (err);
666 		}
667 		result->sign = -1;
668 	} else if ((aa->sign == 1) && (bb->sign == 1)) {
669 		if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
670 			return (err);
671 		}
672 		result->sign = 1;
673 	} else if ((aa->sign == 1) && (bb->sign == -1)) {
674 		if (big_cmp_abs(aa, bb) >= 0) {
675 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
676 				return (err);
677 			}
678 			result->sign = 1;
679 		} else {
680 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
681 				return (err);
682 			}
683 			result->sign = -1;
684 		}
685 	} else {
686 		if (big_cmp_abs(aa, bb) >= 0) {
687 			if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
688 				return (err);
689 			}
690 			result->sign = -1;
691 		} else {
692 			if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
693 				return (err);
694 			}
695 			result->sign = 1;
696 		}
697 	}
698 	return (BIG_OK);
699 }
700 
701 
702 /* result = aa/2 */
703 BIG_ERR_CODE
704 big_half_pos(BIGNUM *result, BIGNUM *aa)
705 {
706 	BIG_ERR_CODE	err;
707 	int		i;
708 	BIG_CHUNK_TYPE	cy, cy1;
709 	BIG_CHUNK_TYPE	*a, *r;
710 
711 	if (result->size < aa->len) {
712 		err = big_extend(result, aa->len);
713 		if (err != BIG_OK) {
714 			return (err);
715 		}
716 	}
717 
718 	result->len = aa->len;
719 	a = aa->value;
720 	r = result->value;
721 	cy = 0;
722 	for (i = aa->len - 1; i >= 0; i--) {
723 		cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
724 		r[i] = (cy | (a[i] >> 1));
725 		cy = cy1;
726 	}
727 	if (r[result->len - 1] == 0) {
728 		result->len--;
729 	}
730 
731 	return (BIG_OK);
732 }
733 
734 /* result  =  aa*2 */
735 BIG_ERR_CODE
736 big_double(BIGNUM *result, BIGNUM *aa)
737 {
738 	BIG_ERR_CODE	err;
739 	int		i, rsize;
740 	BIG_CHUNK_TYPE	cy, cy1;
741 	BIG_CHUNK_TYPE	*a, *r;
742 
743 	if ((aa->len > 0) &&
744 	    ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
745 		rsize = aa->len + 1;
746 	} else {
747 		rsize = aa->len;
748 	}
749 
750 	if (result->size < rsize) {
751 		err = big_extend(result, rsize);
752 		if (err != BIG_OK)
753 			return (err);
754 	}
755 
756 	a = aa->value;
757 	r = result->value;
758 	if (rsize == aa->len + 1) {
759 		r[rsize - 1] = 1;
760 	}
761 	cy = 0;
762 	for (i = 0; i < aa->len; i++) {
763 		cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
764 		r[i] = (cy | (a[i] << 1));
765 		cy = cy1;
766 	}
767 	result->len = rsize;
768 	return (BIG_OK);
769 }
770 
771 
772 /*
773  * returns aa mod b, aa must be nonneg, b must be a max
774  * (BIG_CHUNK_SIZE / 2)-bit integer
775  */
776 static uint32_t
777 big_modhalf_pos(BIGNUM *aa, uint32_t b)
778 {
779 	int		i;
780 	BIG_CHUNK_TYPE	rem;
781 
782 	if (aa->len == 0) {
783 		return (0);
784 	}
785 	rem = aa->value[aa->len - 1] % b;
786 	for (i = aa->len - 2; i >= 0; i--) {
787 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
788 		    (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
789 		rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
790 		    (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
791 	}
792 
793 	return ((uint32_t)rem);
794 }
795 
796 
797 /*
798  * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
799  * result->size should be at least aa->len at entry
800  * aa, bb, and result should be positive
801  */
802 void
803 big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
804 {
805 	int i, lendiff;
806 	BIGNUM res1, aa1;
807 
808 	lendiff = aa->len - bb->len;
809 	res1.size = result->size - lendiff;
810 	res1.malloced = 0;
811 	res1.value = result->value + lendiff;
812 	aa1.size = aa->size - lendiff;
813 	aa1.value = aa->value + lendiff;
814 	aa1.len = bb->len;
815 	aa1.sign = 1;
816 	(void) big_sub_pos(&res1, &aa1, bb);
817 	if (result->value != aa->value) {
818 		for (i = 0; i < lendiff; i++) {
819 			result->value[i] = aa->value[i];
820 		}
821 	}
822 	result->len = aa->len;
823 }
824 
825 
826 /*
827  * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
828  *					(2^BIG_CHUNK_SIZE)^lendiff * |bb|
829  * aa->len should be >= bb->len
830  */
831 int
832 big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
833 {
834 	int		lendiff;
835 	BIGNUM		aa1;
836 
837 	lendiff = aa->len - bb->len;
838 	aa1.len = bb->len;
839 	aa1.size = aa->size - lendiff;
840 	aa1.malloced = 0;
841 	aa1.value = aa->value + lendiff;
842 	return (big_cmp_abs(&aa1, bb));
843 }
844 
845 
846 /*
847  * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
848  * result should have enough space allocated.
849  */
850 static void
851 big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
852 {
853 	int		i;
854 	BIG_CHUNK_TYPE	t1, t2, ai, cy;
855 	BIG_CHUNK_TYPE	*a, *r;
856 
857 	a = aa->value;
858 	r = result->value;
859 	cy = 0;
860 	for (i = 0; i < aa->len; i++) {
861 		ai = a[i];
862 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
863 		t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
864 		    (t1 >> (BIG_CHUNK_SIZE / 2));
865 		r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
866 		    (t2 << (BIG_CHUNK_SIZE / 2));
867 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
868 	}
869 	r[i] = cy;
870 	result->len = aa->len + 1;
871 	result->sign = aa->sign;
872 }
873 
874 
875 /*
876  * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
877  * (BIG_CHUNK_SIZE / 2)-bit positive integer.
878  * result should have enough space allocated.
879  */
880 static void
881 big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
882 {
883 	int		i;
884 	BIG_CHUNK_TYPE	t1, t2, ai, cy, ri;
885 	BIG_CHUNK_TYPE	*a, *r;
886 
887 	a = aa->value;
888 	r = result->value;
889 	cy = 0;
890 	ri = 0;
891 	for (i = 0; i < aa->len; i++) {
892 		ai = a[i];
893 		t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
894 		t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
895 		    (t1 >>  (BIG_CHUNK_SIZE / 2));
896 		r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
897 		ri = t2 & BIG_CHUNK_LOWHALFBITS;
898 		cy = t2 >> (BIG_CHUNK_SIZE / 2);
899 	}
900 	r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
901 	result->len = aa->len + 1;
902 	result->sign = aa->sign;
903 }
904 
905 
906 /* it is assumed that result->size is big enough */
907 void
908 big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
909 {
910 	int		i;
911 	BIG_CHUNK_TYPE	cy, ai;
912 
913 	if (offs == 0) {
914 		if (result != aa) {
915 			(void) big_copy(result, aa);
916 		}
917 		return;
918 	}
919 	cy = 0;
920 	for (i = 0; i < aa->len; i++) {
921 		ai = aa->value[i];
922 		result->value[i] = (ai << offs) | cy;
923 		cy = ai >> (BIG_CHUNK_SIZE - offs);
924 	}
925 	if (cy != 0) {
926 		result->len = aa->len + 1;
927 		result->value[result->len - 1] = cy;
928 	} else {
929 		result->len = aa->len;
930 	}
931 	result->sign = aa->sign;
932 }
933 
934 
935 /* it is assumed that result->size is big enough */
936 void
937 big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
938 {
939 	int		 i;
940 	BIG_CHUNK_TYPE	cy, ai;
941 
942 	if (offs == 0) {
943 		if (result != aa) {
944 			(void) big_copy(result, aa);
945 		}
946 		return;
947 	}
948 	cy = aa->value[0] >> offs;
949 	for (i = 1; i < aa->len; i++) {
950 		ai = aa->value[i];
951 		result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
952 		cy = ai >> offs;
953 	}
954 	result->len = aa->len;
955 	result->value[result->len - 1] = cy;
956 	result->sign = aa->sign;
957 }
958 
959 
960 /*
961  * result = aa/bb   remainder = aa mod bb
962  * it is assumed that aa and bb are positive
963  */
964 BIG_ERR_CODE
965 big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
966 {
967 	BIG_ERR_CODE	err = BIG_OK;
968 	int		i, alen, blen, tlen, rlen, offs;
969 	BIG_CHUNK_TYPE	higha, highb, coeff;
970 	BIG_CHUNK_TYPE	*a, *b;
971 	BIGNUM		bbhigh, bblow, tresult, tmp1, tmp2;
972 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
973 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
974 	BIG_CHUNK_TYPE	tresultvalue[BIGTMPSIZE];
975 	BIG_CHUNK_TYPE	bblowvalue[BIGTMPSIZE];
976 	BIG_CHUNK_TYPE	bbhighvalue[BIGTMPSIZE];
977 
978 	a = aa->value;
979 	b = bb->value;
980 	alen = aa->len;
981 	blen = bb->len;
982 	while ((alen > 1) && (a[alen - 1] == 0)) {
983 		alen = alen - 1;
984 	}
985 	aa->len = alen;
986 	while ((blen > 1) && (b[blen - 1] == 0)) {
987 		blen = blen - 1;
988 	}
989 	bb->len = blen;
990 	if ((blen == 1) && (b[0] == 0)) {
991 		return (BIG_DIV_BY_0);
992 	}
993 
994 	if (big_cmp_abs(aa, bb) < 0) {
995 		if ((remainder != NULL) &&
996 		    ((err = big_copy(remainder, aa)) != BIG_OK)) {
997 			return (err);
998 		}
999 		if (result != NULL) {
1000 			result->len = 1;
1001 			result->sign = 1;
1002 			result->value[0] = 0;
1003 		}
1004 		return (BIG_OK);
1005 	}
1006 
1007 	if ((err = big_init1(&bblow, blen + 1,
1008 	    bblowvalue, arraysize(bblowvalue))) != BIG_OK)
1009 		return (err);
1010 
1011 	if ((err = big_init1(&bbhigh, blen + 1,
1012 	    bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
1013 		goto ret1;
1014 
1015 	if ((err = big_init1(&tmp1, alen + 2,
1016 	    tmp1value, arraysize(tmp1value))) != BIG_OK)
1017 		goto ret2;
1018 
1019 	if ((err = big_init1(&tmp2, blen + 2,
1020 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
1021 		goto ret3;
1022 
1023 	if ((err = big_init1(&tresult, alen - blen + 2,
1024 	    tresultvalue, arraysize(tresultvalue))) != BIG_OK)
1025 		goto ret4;
1026 
1027 	offs = 0;
1028 	highb = b[blen - 1];
1029 	if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
1030 		highb = highb >> (BIG_CHUNK_SIZE / 2);
1031 		offs = (BIG_CHUNK_SIZE / 2);
1032 	}
1033 	while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
1034 		highb = highb << 1;
1035 		offs++;
1036 	}
1037 
1038 	big_shiftleft(&bblow, bb, offs);
1039 
1040 	if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
1041 		big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1042 	} else {
1043 		big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1044 	}
1045 	if (bbhigh.value[bbhigh.len - 1] == 0) {
1046 		bbhigh.len--;
1047 	} else {
1048 		bbhigh.value[bbhigh.len] = 0;
1049 	}
1050 
1051 	highb = bblow.value[bblow.len - 1];
1052 
1053 	big_shiftleft(&tmp1, aa, offs);
1054 	rlen = tmp1.len - bblow.len + 1;
1055 	tresult.len = rlen;
1056 
1057 	tmp1.len++;
1058 	tlen = tmp1.len;
1059 	tmp1.value[tmp1.len - 1] = 0;
1060 	for (i = 0; i < rlen; i++) {
1061 		higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
1062 		    (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
1063 		coeff = higha / (highb + 1);
1064 		big_mulhalf_high(&tmp2, &bblow, coeff);
1065 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1066 		bbhigh.len++;
1067 		while (tmp1.value[tlen - 1] > 0) {
1068 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1069 			coeff++;
1070 		}
1071 		bbhigh.len--;
1072 		tlen--;
1073 		tmp1.len--;
1074 		while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
1075 			big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1076 			coeff++;
1077 		}
1078 		tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
1079 		higha = tmp1.value[tlen - 1];
1080 		coeff = higha / (highb + 1);
1081 		big_mulhalf_low(&tmp2, &bblow, coeff);
1082 		tmp2.len--;
1083 		big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1084 		while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
1085 			big_sub_pos_high(&tmp1, &tmp1, &bblow);
1086 			coeff++;
1087 		}
1088 		tresult.value[rlen - i - 1] =
1089 		    tresult.value[rlen - i - 1] + coeff;
1090 	}
1091 
1092 	big_shiftright(&tmp1, &tmp1, offs);
1093 
1094 	err = BIG_OK;
1095 
1096 	if ((remainder != NULL) &&
1097 	    ((err = big_copy(remainder, &tmp1)) != BIG_OK))
1098 		goto ret;
1099 
1100 	if (result != NULL)
1101 		err = big_copy(result, &tresult);
1102 
1103 ret:
1104 	big_finish(&tresult);
1105 ret4:
1106 	big_finish(&tmp1);
1107 ret3:
1108 	big_finish(&tmp2);
1109 ret2:
1110 	big_finish(&bbhigh);
1111 ret1:
1112 	big_finish(&bblow);
1113 	return (err);
1114 }
1115 
1116 
1117 /*
1118  * If there is no processor-specific integer implementation of
1119  * the lower level multiply functions, then this code is provided
1120  * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
1121  * big_sqr_vec().
1122  *
1123  * There are two generic implementations.  One that assumes that
1124  * there is hardware and C compiler support for a 32 x 32 --> 64
1125  * bit unsigned multiply, but otherwise is not specific to any
1126  * processor, platform, or ISA.
1127  *
1128  * The other makes very few assumptions about hardware capabilities.
1129  * It does not even assume that there is any implementation of a
1130  * 32 x 32 --> 64 bit multiply that is accessible to C code and
1131  * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
1132  * multiplies from 16 x 16 --> 32 bit multiplies.
1133  *
1134  */
1135 
1136 #if !defined(PSR_MUL)
1137 
1138 #ifdef UMUL64
1139 
1140 #if (BIG_CHUNK_SIZE == 32)
1141 
1142 #define	UNROLL8
1143 
1144 #define	MUL_SET_VEC_ROUND_PREFETCH(R) \
1145 	p = pf * d; \
1146 	pf = (uint64_t)a[R + 1]; \
1147 	t = p + cy; \
1148 	r[R] = (uint32_t)t; \
1149 	cy = t >> 32
1150 
1151 #define	MUL_SET_VEC_ROUND_NOPREFETCH(R) \
1152 	p = pf * d; \
1153 	t = p + cy; \
1154 	r[R] = (uint32_t)t; \
1155 	cy = t >> 32
1156 
1157 #define	MUL_ADD_VEC_ROUND_PREFETCH(R) \
1158 	t = (uint64_t)r[R]; \
1159 	p = pf * d; \
1160 	pf = (uint64_t)a[R + 1]; \
1161 	t = p + t + cy; \
1162 	r[R] = (uint32_t)t; \
1163 	cy = t >> 32
1164 
1165 #define	MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
1166 	t = (uint64_t)r[R]; \
1167 	p = pf * d; \
1168 	t = p + t + cy; \
1169 	r[R] = (uint32_t)t; \
1170 	cy = t >> 32
1171 
1172 #ifdef UNROLL8
1173 
1174 #define	UNROLL 8
1175 
1176 /*
1177  * r = a * b
1178  * where r and a are vectors; b is a single 32-bit digit
1179  */
1180 
1181 uint32_t
1182 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1183 {
1184 	uint64_t d, pf, p, t, cy;
1185 
1186 	if (len == 0)
1187 		return (0);
1188 	cy = 0;
1189 	d = (uint64_t)b;
1190 	pf = (uint64_t)a[0];
1191 	while (len > UNROLL) {
1192 		MUL_SET_VEC_ROUND_PREFETCH(0);
1193 		MUL_SET_VEC_ROUND_PREFETCH(1);
1194 		MUL_SET_VEC_ROUND_PREFETCH(2);
1195 		MUL_SET_VEC_ROUND_PREFETCH(3);
1196 		MUL_SET_VEC_ROUND_PREFETCH(4);
1197 		MUL_SET_VEC_ROUND_PREFETCH(5);
1198 		MUL_SET_VEC_ROUND_PREFETCH(6);
1199 		MUL_SET_VEC_ROUND_PREFETCH(7);
1200 		r += UNROLL;
1201 		a += UNROLL;
1202 		len -= UNROLL;
1203 	}
1204 	if (len == UNROLL) {
1205 		MUL_SET_VEC_ROUND_PREFETCH(0);
1206 		MUL_SET_VEC_ROUND_PREFETCH(1);
1207 		MUL_SET_VEC_ROUND_PREFETCH(2);
1208 		MUL_SET_VEC_ROUND_PREFETCH(3);
1209 		MUL_SET_VEC_ROUND_PREFETCH(4);
1210 		MUL_SET_VEC_ROUND_PREFETCH(5);
1211 		MUL_SET_VEC_ROUND_PREFETCH(6);
1212 		MUL_SET_VEC_ROUND_NOPREFETCH(7);
1213 		return ((uint32_t)cy);
1214 	}
1215 	while (len > 1) {
1216 		MUL_SET_VEC_ROUND_PREFETCH(0);
1217 		++r;
1218 		++a;
1219 		--len;
1220 	}
1221 	if (len > 0) {
1222 		MUL_SET_VEC_ROUND_NOPREFETCH(0);
1223 	}
1224 	return ((uint32_t)cy);
1225 }
1226 
1227 /*
1228  * r += a * b
1229  * where r and a are vectors; b is a single 32-bit digit
1230  */
1231 
1232 uint32_t
1233 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1234 {
1235 	uint64_t d, pf, p, t, cy;
1236 
1237 	if (len == 0)
1238 		return (0);
1239 	cy = 0;
1240 	d = (uint64_t)b;
1241 	pf = (uint64_t)a[0];
1242 	while (len > 8) {
1243 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1244 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1245 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1246 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1247 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1248 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1249 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1250 		MUL_ADD_VEC_ROUND_PREFETCH(7);
1251 		r += 8;
1252 		a += 8;
1253 		len -= 8;
1254 	}
1255 	if (len == 8) {
1256 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1257 		MUL_ADD_VEC_ROUND_PREFETCH(1);
1258 		MUL_ADD_VEC_ROUND_PREFETCH(2);
1259 		MUL_ADD_VEC_ROUND_PREFETCH(3);
1260 		MUL_ADD_VEC_ROUND_PREFETCH(4);
1261 		MUL_ADD_VEC_ROUND_PREFETCH(5);
1262 		MUL_ADD_VEC_ROUND_PREFETCH(6);
1263 		MUL_ADD_VEC_ROUND_NOPREFETCH(7);
1264 		return ((uint32_t)cy);
1265 	}
1266 	while (len > 1) {
1267 		MUL_ADD_VEC_ROUND_PREFETCH(0);
1268 		++r;
1269 		++a;
1270 		--len;
1271 	}
1272 	if (len > 0) {
1273 		MUL_ADD_VEC_ROUND_NOPREFETCH(0);
1274 	}
1275 	return ((uint32_t)cy);
1276 }
1277 #endif /* UNROLL8 */
1278 
1279 void
1280 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1281 {
1282 	uint32_t	*tr, *ta;
1283 	int		tlen, row, col;
1284 	uint64_t	p, s, t, t2, cy;
1285 	uint32_t	d;
1286 
1287 	tr = r + 1;
1288 	ta = a;
1289 	tlen = len - 1;
1290 	tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
1291 	while (--tlen > 0) {
1292 		tr += 2;
1293 		++ta;
1294 		tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
1295 	}
1296 	s = (uint64_t)a[0];
1297 	s = s * s;
1298 	r[0] = (uint32_t)s;
1299 	cy = s >> 32;
1300 	p = ((uint64_t)r[1] << 1) + cy;
1301 	r[1] = (uint32_t)p;
1302 	cy = p >> 32;
1303 	row = 1;
1304 	col = 2;
1305 	while (row < len) {
1306 		s = (uint64_t)a[row];
1307 		s = s * s;
1308 		p = (uint64_t)r[col] << 1;
1309 		t = p + s;
1310 		d = (uint32_t)t;
1311 		t2 = (uint64_t)d + cy;
1312 		r[col] = (uint32_t)t2;
1313 		cy = (t >> 32) + (t2 >> 32);
1314 		if (row == len - 1)
1315 			break;
1316 		p = ((uint64_t)r[col + 1] << 1) + cy;
1317 		r[col + 1] = (uint32_t)p;
1318 		cy = p >> 32;
1319 		++row;
1320 		col += 2;
1321 	}
1322 	r[col + 1] = (uint32_t)cy;
1323 }
1324 
1325 #else /* BIG_CHUNK_SIZE == 64 */
1326 
1327 /*
1328  * r = r + a * digit, r and a are vectors of length len
1329  * returns the carry digit
1330  */
1331 BIG_CHUNK_TYPE
1332 big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1333     BIG_CHUNK_TYPE digit)
1334 {
1335 	BIG_CHUNK_TYPE	cy, cy1, retcy, dlow, dhigh;
1336 	int		i;
1337 
1338 	cy1 = 0;
1339 	dlow = digit & BIG_CHUNK_LOWHALFBITS;
1340 	dhigh = digit >> (BIG_CHUNK_SIZE / 2);
1341 	for (i = 0; i < len; i++) {
1342 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1343 		    dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1344 		    (r[i] & BIG_CHUNK_LOWHALFBITS);
1345 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1346 		    dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1347 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1348 		r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
1349 		    (cy1 << (BIG_CHUNK_SIZE / 2));
1350 	}
1351 	retcy = cy1 >> (BIG_CHUNK_SIZE / 2);
1352 
1353 	cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
1354 	for (i = 0; i < len - 1; i++) {
1355 		cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1356 		    dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1357 		    (r[i] >> (BIG_CHUNK_SIZE / 2));
1358 		r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1359 		    (cy << (BIG_CHUNK_SIZE / 2));
1360 		cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1361 		    dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1362 		    (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
1363 	}
1364 	cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1365 	    dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
1366 	    (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
1367 	r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1368 	    (cy << (BIG_CHUNK_SIZE / 2));
1369 	retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
1370 	    dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;
1371 
1372 	return (retcy);
1373 }
1374 
1375 
1376 /*
1377  * r = a * digit, r and a are vectors of length len
1378  * returns the carry digit
1379  */
1380 BIG_CHUNK_TYPE
1381 big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1382     BIG_CHUNK_TYPE digit)
1383 {
1384 	int	i;
1385 
1386 	ASSERT(r != a);
1387 	for (i = 0; i < len; i++) {
1388 		r[i] = 0;
1389 	}
1390 	return (big_mul_add_vec(r, a, len, digit));
1391 }
1392 
1393 void
1394 big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
1395 {
1396 	int i;
1397 
1398 	ASSERT(r != a);
1399 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1400 	for (i = 1; i < len; ++i)
1401 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1402 }
1403 
1404 #endif /* BIG_CHUNK_SIZE == 32/64 */
1405 
1406 
1407 #else /* ! UMUL64 */
1408 
1409 #if (BIG_CHUNK_SIZE != 32)
1410 #error Don't use 64-bit chunks without defining UMUL64
1411 #endif
1412 
1413 
1414 /*
1415  * r = r + a * digit, r and a are vectors of length len
1416  * returns the carry digit
1417  */
1418 uint32_t
1419 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1420 {
1421 	uint32_t cy, cy1, retcy, dlow, dhigh;
1422 	int i;
1423 
1424 	cy1 = 0;
1425 	dlow = digit & 0xffff;
1426 	dhigh = digit >> 16;
1427 	for (i = 0; i < len; i++) {
1428 		cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
1429 		cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
1430 		r[i] = (cy & 0xffff) | (cy1 << 16);
1431 	}
1432 	retcy = cy1 >> 16;
1433 
1434 	cy1 = r[0] & 0xffff;
1435 	for (i = 0; i < len - 1; i++) {
1436 		cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
1437 		r[i] = (cy1 & 0xffff) | (cy << 16);
1438 		cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
1439 	}
1440 	cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
1441 	r[len - 1] = (cy1 & 0xffff) | (cy << 16);
1442 	retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;
1443 
1444 	return (retcy);
1445 }
1446 
1447 
1448 /*
1449  * r = a * digit, r and a are vectors of length len
1450  * returns the carry digit
1451  */
1452 uint32_t
1453 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1454 {
1455 	int	i;
1456 
1457 	ASSERT(r != a);
1458 	for (i = 0; i < len; i++) {
1459 		r[i] = 0;
1460 	}
1461 
1462 	return (big_mul_add_vec(r, a, len, digit));
1463 }
1464 
1465 void
1466 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1467 {
1468 	int i;
1469 
1470 	ASSERT(r != a);
1471 	r[len] = big_mul_set_vec(r, a, len, a[0]);
1472 	for (i = 1; i < len; ++i)
1473 		r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1474 }
1475 
1476 #endif /* UMUL64 */
1477 
1478 void
1479 big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
1480     BIG_CHUNK_TYPE *b, int blen)
1481 {
1482 	int i;
1483 
1484 	r[alen] = big_mul_set_vec(r, a, alen, b[0]);
1485 	for (i = 1; i < blen; ++i)
1486 		r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
1487 }
1488 
1489 
1490 #endif /* ! PSR_MUL */
1491 
1492 
1493 /*
1494  * result = aa * bb  result->value should be big enough to hold the result
1495  *
1496  * Implementation: Standard grammar school algorithm
1497  *
1498  */
1499 BIG_ERR_CODE
1500 big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
1501 {
1502 	BIGNUM		tmp1;
1503 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1504 	BIG_CHUNK_TYPE	*r, *t, *a, *b;
1505 	BIG_ERR_CODE	err;
1506 	int		i, alen, blen, rsize, sign, diff;
1507 
1508 	if (aa == bb) {
1509 		diff = 0;
1510 	} else {
1511 		diff = big_cmp_abs(aa, bb);
1512 		if (diff < 0) {
1513 			BIGNUM *tt;
1514 			tt = aa;
1515 			aa = bb;
1516 			bb = tt;
1517 		}
1518 	}
1519 	a = aa->value;
1520 	b = bb->value;
1521 	alen = aa->len;
1522 	blen = bb->len;
1523 	while ((alen > 1) && (a[alen - 1] == 0)) {
1524 		alen--;
1525 	}
1526 	aa->len = alen;
1527 	while ((blen > 1) && (b[blen - 1] == 0)) {
1528 		blen--;
1529 	}
1530 	bb->len = blen;
1531 
1532 	rsize = alen + blen;
1533 	ASSERT(rsize > 0);
1534 	if (result->size < rsize) {
1535 		err = big_extend(result, rsize);
1536 		if (err != BIG_OK) {
1537 			return (err);
1538 		}
1539 		/* aa or bb might be an alias to result */
1540 		a = aa->value;
1541 		b = bb->value;
1542 	}
1543 	r = result->value;
1544 
1545 	if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
1546 		result->len = 1;
1547 		result->sign = 1;
1548 		r[0] = 0;
1549 		return (BIG_OK);
1550 	}
1551 	sign = aa->sign * bb->sign;
1552 	if ((alen == 1) && (a[0] == 1)) {
1553 		for (i = 0; i < blen; i++) {
1554 			r[i] = b[i];
1555 		}
1556 		result->len = blen;
1557 		result->sign = sign;
1558 		return (BIG_OK);
1559 	}
1560 	if ((blen == 1) && (b[0] == 1)) {
1561 		for (i = 0; i < alen; i++) {
1562 			r[i] = a[i];
1563 		}
1564 		result->len = alen;
1565 		result->sign = sign;
1566 		return (BIG_OK);
1567 	}
1568 
1569 	if ((err = big_init1(&tmp1, rsize,
1570 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1571 		return (err);
1572 	}
1573 	(void) big_copy(&tmp1, aa);
1574 	t = tmp1.value;
1575 
1576 	for (i = 0; i < rsize; i++) {
1577 		t[i] = 0;
1578 	}
1579 
1580 	if (diff == 0 && alen > 2) {
1581 		BIG_SQR_VEC(t, a, alen);
1582 	} else if (blen > 0) {
1583 		BIG_MUL_VEC(t, a, alen, b, blen);
1584 	}
1585 
1586 	if (t[rsize - 1] == 0) {
1587 		tmp1.len = rsize - 1;
1588 	} else {
1589 		tmp1.len = rsize;
1590 	}
1591 
1592 	err = big_copy(result, &tmp1);
1593 
1594 	result->sign = sign;
1595 
1596 	big_finish(&tmp1);
1597 
1598 	return (err);
1599 }
1600 
1601 
1602 /*
1603  * caller must ensure that  a < n,  b < n  and  ret->size >=  2 * n->len + 1
1604  * and that ret is not n
1605  */
1606 BIG_ERR_CODE
1607 big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
1608 {
1609 	int	i, j, nlen, needsubtract;
1610 	BIG_CHUNK_TYPE	*nn, *rr;
1611 	BIG_CHUNK_TYPE	digit, c;
1612 	BIG_ERR_CODE	err;
1613 
1614 	nlen = n->len;
1615 	nn = n->value;
1616 
1617 	rr = ret->value;
1618 
1619 	if ((err = big_mul(ret, a, b)) != BIG_OK) {
1620 		return (err);
1621 	}
1622 
1623 	rr = ret->value;
1624 	for (i = ret->len; i < 2 * nlen + 1; i++) {
1625 		rr[i] = 0;
1626 	}
1627 	for (i = 0; i < nlen; i++) {
1628 		digit = rr[i];
1629 		digit = digit * n0;
1630 
1631 		c = BIG_MUL_ADD_VEC(rr + i, nn, nlen, digit);
1632 		j = i + nlen;
1633 		rr[j] += c;
1634 		while (rr[j] < c) {
1635 			rr[j + 1] += 1;
1636 			j++;
1637 			c = 1;
1638 		}
1639 	}
1640 
1641 	needsubtract = 0;
1642 	if ((rr[2 * nlen]  != 0))
1643 		needsubtract = 1;
1644 	else {
1645 		for (i = 2 * nlen - 1; i >= nlen; i--) {
1646 			if (rr[i] > nn[i - nlen]) {
1647 				needsubtract = 1;
1648 				break;
1649 			} else if (rr[i] < nn[i - nlen]) {
1650 				break;
1651 			}
1652 		}
1653 	}
1654 	if (needsubtract)
1655 		big_sub_vec(rr, rr + nlen, nn, nlen);
1656 	else {
1657 		for (i = 0; i < nlen; i++) {
1658 			rr[i] = rr[i + nlen];
1659 		}
1660 	}
1661 
1662 	/* Remove leading zeros, but keep at least 1 digit: */
1663 	for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
1664 		;
1665 	ret->len = i + 1;
1666 
1667 	return (BIG_OK);
1668 }
1669 
1670 
1671 BIG_CHUNK_TYPE
1672 big_n0(BIG_CHUNK_TYPE n)
1673 {
1674 	int		i;
1675 	BIG_CHUNK_TYPE	result, tmp;
1676 
1677 	result = 0;
1678 	tmp = BIG_CHUNK_ALLBITS;
1679 	for (i = 0; i < BIG_CHUNK_SIZE; i++) {
1680 		if ((tmp & 1) == 1) {
1681 			result = (result >> 1) | BIG_CHUNK_HIGHBIT;
1682 			tmp = tmp - n;
1683 		} else {
1684 			result = (result >> 1);
1685 		}
1686 		tmp = tmp >> 1;
1687 	}
1688 
1689 	return (result);
1690 }
1691 
1692 
1693 int
1694 big_numbits(BIGNUM *n)
1695 {
1696 	int		i, j;
1697 	BIG_CHUNK_TYPE	t;
1698 
1699 	for (i = n->len - 1; i > 0; i--) {
1700 		if (n->value[i] != 0) {
1701 			break;
1702 		}
1703 	}
1704 	t = n->value[i];
1705 	for (j = BIG_CHUNK_SIZE; j > 0; j--) {
1706 		if ((t & BIG_CHUNK_HIGHBIT) == 0) {
1707 			t = t << 1;
1708 		} else {
1709 			return (BIG_CHUNK_SIZE * i + j);
1710 		}
1711 	}
1712 	return (0);
1713 }
1714 
1715 
1716 /* caller must make sure that a < n */
1717 BIG_ERR_CODE
1718 big_mont_rr(BIGNUM *result, BIGNUM *n)
1719 {
1720 	BIGNUM		rr;
1721 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1722 	int		len, i;
1723 	BIG_ERR_CODE	err;
1724 
1725 	rr.malloced = 0;
1726 	len = n->len;
1727 
1728 	if ((err = big_init1(&rr, 2 * len + 1,
1729 	    rrvalue, arraysize(rrvalue))) != BIG_OK) {
1730 		return (err);
1731 	}
1732 
1733 	for (i = 0; i < 2 * len; i++) {
1734 		rr.value[i] = 0;
1735 	}
1736 	rr.value[2 * len] = 1;
1737 	rr.len = 2 * len + 1;
1738 	if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1739 		goto ret;
1740 	}
1741 	err = big_copy(result, &rr);
1742 ret:
1743 	big_finish(&rr);
1744 	return (err);
1745 }
1746 
1747 
1748 /* caller must make sure that a < n */
1749 BIG_ERR_CODE
1750 big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
1751     BIGNUM *n_rr)
1752 {
1753 	BIGNUM		rr;
1754 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
1755 	int		len, i;
1756 	BIG_ERR_CODE	err;
1757 
1758 	rr.malloced = 0;
1759 	len = n->len;
1760 
1761 	if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
1762 	    != BIG_OK) {
1763 		return (err);
1764 	}
1765 
1766 	if (n_rr == NULL) {
1767 		for (i = 0; i < 2 * len; i++) {
1768 			rr.value[i] = 0;
1769 		}
1770 		rr.value[2 * len] = 1;
1771 		rr.len = 2 * len + 1;
1772 		if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1773 			goto ret;
1774 		}
1775 		n_rr = &rr;
1776 	}
1777 
1778 	if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
1779 		goto ret;
1780 	}
1781 	err = big_copy(result, &rr);
1782 
1783 ret:
1784 	big_finish(&rr);
1785 	return (err);
1786 }
1787 
1788 
1789 #ifdef	USE_FLOATING_POINT
1790 #define	big_modexp_ncp_float	big_modexp_ncp_sw
1791 #else
1792 #define	big_modexp_ncp_int	big_modexp_ncp_sw
1793 #endif
1794 
1795 #define	MAX_EXP_BIT_GROUP_SIZE 6
1796 #define	APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))
1797 
1798 /* ARGSUSED */
1799 static BIG_ERR_CODE
1800 big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1801     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1802 
1803 {
1804 	BIGNUM		apowers[APOWERS_MAX_SIZE];
1805 	BIGNUM		tmp1;
1806 	BIG_CHUNK_TYPE	tmp1value[BIGTMPSIZE];
1807 	int		i, j, k, l, m, p;
1808 	uint32_t	bit, bitind, bitcount, groupbits, apowerssize;
1809 	uint32_t	nbits;
1810 	BIG_ERR_CODE	err;
1811 
1812 	nbits = big_numbits(e);
1813 	if (nbits < 50) {
1814 		groupbits = 1;
1815 		apowerssize = 1;
1816 	} else {
1817 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1818 		apowerssize = 1 << (groupbits - 1);
1819 	}
1820 
1821 
1822 	if ((err = big_init1(&tmp1, 2 * n->len + 1,
1823 	    tmp1value, arraysize(tmp1value))) != BIG_OK) {
1824 		return (err);
1825 	}
1826 
1827 	/* clear the malloced bit to help cleanup */
1828 	for (i = 0; i < apowerssize; i++) {
1829 		apowers[i].malloced = 0;
1830 	}
1831 
1832 	for (i = 0; i < apowerssize; i++) {
1833 		if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
1834 		    BIG_OK) {
1835 			goto ret;
1836 		}
1837 	}
1838 
1839 	(void) big_copy(&(apowers[0]), ma);
1840 
1841 	if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
1842 		goto ret;
1843 	}
1844 	(void) big_copy(ma, &tmp1);
1845 
1846 	for (i = 1; i < apowerssize; i++) {
1847 		if ((err = big_mont_mul(&tmp1, ma,
1848 		    &(apowers[i - 1]), n, n0)) != BIG_OK) {
1849 			goto ret;
1850 		}
1851 		(void) big_copy(&apowers[i], &tmp1);
1852 	}
1853 
1854 	bitind = nbits % BIG_CHUNK_SIZE;
1855 	k = 0;
1856 	l = 0;
1857 	p = 0;
1858 	bitcount = 0;
1859 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
1860 		for (j = bitind - 1; j >= 0; j--) {
1861 			bit = (e->value[i] >> j) & 1;
1862 			if ((bitcount == 0) && (bit == 0)) {
1863 				if ((err = big_mont_mul(tmp,
1864 				    tmp, tmp, n, n0)) != BIG_OK) {
1865 					goto ret;
1866 				}
1867 			} else {
1868 				bitcount++;
1869 				p = p * 2 + bit;
1870 				if (bit == 1) {
1871 					k = k + l + 1;
1872 					l = 0;
1873 				} else {
1874 					l++;
1875 				}
1876 				if (bitcount == groupbits) {
1877 					for (m = 0; m < k; m++) {
1878 						if ((err = big_mont_mul(tmp,
1879 						    tmp, tmp, n, n0)) !=
1880 						    BIG_OK) {
1881 							goto ret;
1882 						}
1883 					}
1884 					if ((err = big_mont_mul(tmp, tmp,
1885 					    &(apowers[p >> (l + 1)]),
1886 					    n, n0)) != BIG_OK) {
1887 						goto ret;
1888 					}
1889 					for (m = 0; m < l; m++) {
1890 						if ((err = big_mont_mul(tmp,
1891 						    tmp, tmp, n, n0)) !=
1892 						    BIG_OK) {
1893 							goto ret;
1894 						}
1895 					}
1896 					k = 0;
1897 					l = 0;
1898 					p = 0;
1899 					bitcount = 0;
1900 				}
1901 			}
1902 		}
1903 		bitind = BIG_CHUNK_SIZE;
1904 	}
1905 
1906 	for (m = 0; m < k; m++) {
1907 		if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
1908 			goto ret;
1909 		}
1910 	}
1911 	if (p != 0) {
1912 		if ((err = big_mont_mul(tmp, tmp,
1913 		    &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
1914 			goto ret;
1915 		}
1916 	}
1917 	for (m = 0; m < l; m++) {
1918 		if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
1919 			goto ret;
1920 		}
1921 	}
1922 
1923 ret:
1924 	for (i = apowerssize - 1; i >= 0; i--) {
1925 		big_finish(&(apowers[i]));
1926 	}
1927 	big_finish(&tmp1);
1928 
1929 	return (err);
1930 }
1931 
1932 
1933 #ifdef USE_FLOATING_POINT
1934 
1935 #ifdef _KERNEL
1936 
1937 #include <sys/sysmacros.h>
1938 #include <sys/regset.h>
1939 #include <sys/fpu/fpusystm.h>
1940 
1941 /* the alignment for block stores to save fp registers */
1942 #define	FPR_ALIGN	(64)
1943 
1944 extern void big_savefp(kfpu_t *);
1945 extern void big_restorefp(kfpu_t *);
1946 
1947 #endif /* _KERNEL */
1948 
1949 /*
1950  * This version makes use of floating point for performance
1951  */
1952 static BIG_ERR_CODE
1953 big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1954     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1955 {
1956 
1957 	int		i, j, k, l, m, p;
1958 	uint32_t	bit, bitind, bitcount, nlen;
1959 	double		dn0;
1960 	double		*dn, *dt, *d16r, *d32r;
1961 	uint32_t	*nint, *prod;
1962 	double		*apowers[APOWERS_MAX_SIZE];
1963 	uint32_t	nbits, groupbits, apowerssize;
1964 	BIG_ERR_CODE	err = BIG_OK;
1965 
1966 #ifdef _KERNEL
1967 	uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
1968 	kfpu_t *fpu;
1969 
1970 #ifdef DEBUG
1971 	if (!fpu_exists)
1972 		return (BIG_GENERAL_ERR);
1973 #endif
1974 
1975 	fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
1976 	big_savefp(fpu);
1977 
1978 #endif /* _KERNEL */
1979 
1980 	nbits = big_numbits(e);
1981 	if (nbits < 50) {
1982 		groupbits = 1;
1983 		apowerssize = 1;
1984 	} else {
1985 		groupbits = MAX_EXP_BIT_GROUP_SIZE;
1986 		apowerssize = 1 << (groupbits - 1);
1987 	}
1988 
1989 	nlen = (BIG_CHUNK_SIZE / 32) * n->len;
1990 	dn0 = (double)(n0 & 0xffff);
1991 
1992 	dn = dt = d16r = d32r = NULL;
1993 	nint = prod = NULL;
1994 	for (i = 0; i < apowerssize; i++) {
1995 		apowers[i] = NULL;
1996 	}
1997 
1998 	if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
1999 		err = BIG_NO_MEM;
2000 		goto ret;
2001 	}
2002 	if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
2003 		err = BIG_NO_MEM;
2004 		goto ret;
2005 	}
2006 	if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
2007 		err = BIG_NO_MEM;
2008 		goto ret;
2009 	}
2010 	if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
2011 		err = BIG_NO_MEM;
2012 		goto ret;
2013 	}
2014 	if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
2015 		err = BIG_NO_MEM;
2016 		goto ret;
2017 	}
2018 	if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
2019 		err = BIG_NO_MEM;
2020 		goto ret;
2021 	}
2022 	for (i = 0; i < apowerssize; i++) {
2023 		if ((apowers[i] = big_malloc((2 * nlen + 1) *
2024 		    sizeof (double))) == NULL) {
2025 			err = BIG_NO_MEM;
2026 			goto ret;
2027 		}
2028 	}
2029 
2030 #if (BIG_CHUNK_SIZE == 32)
2031 	for (i = 0; i < ma->len; i++) {
2032 		nint[i] = ma->value[i];
2033 	}
2034 	for (; i < nlen; i++) {
2035 		nint[i] = 0;
2036 	}
2037 #else
2038 	for (i = 0; i < ma->len; i++) {
2039 		nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
2040 		nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
2041 	}
2042 	for (i = ma->len * 2; i < nlen; i++) {
2043 		nint[i] = 0;
2044 	}
2045 #endif
2046 	conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);
2047 
2048 #if (BIG_CHUNK_SIZE == 32)
2049 	for (i = 0; i < n->len; i++) {
2050 		nint[i] = n->value[i];
2051 	}
2052 	for (; i < nlen; i++) {
2053 		nint[i] = 0;
2054 	}
2055 #else
2056 	for (i = 0; i < n->len; i++) {
2057 		nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
2058 		nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
2059 	}
2060 	for (i = n->len * 2; i < nlen; i++) {
2061 		nint[i] = 0;
2062 	}
2063 #endif
2064 	conv_i32_to_d32(dn, nint, nlen);
2065 
2066 	mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
2067 	conv_i32_to_d32(d32r, prod, nlen);
2068 	for (i = 1; i < apowerssize; i++) {
2069 		mont_mulf_noconv(prod, d32r, apowers[i - 1],
2070 		    dt, dn, nint, nlen, dn0);
2071 		conv_i32_to_d16(apowers[i], prod, nlen);
2072 	}
2073 
2074 #if (BIG_CHUNK_SIZE == 32)
2075 	for (i = 0; i < tmp->len; i++) {
2076 		prod[i] = tmp->value[i];
2077 	}
2078 	for (; i < nlen + 1; i++) {
2079 		prod[i] = 0;
2080 	}
2081 #else
2082 	for (i = 0; i < tmp->len; i++) {
2083 		prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
2084 		prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
2085 	}
2086 	for (i = tmp->len * 2; i < nlen + 1; i++) {
2087 		prod[i] = 0;
2088 	}
2089 #endif
2090 
2091 	bitind = nbits % BIG_CHUNK_SIZE;
2092 	k = 0;
2093 	l = 0;
2094 	p = 0;
2095 	bitcount = 0;
2096 	for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
2097 		for (j = bitind - 1; j >= 0; j--) {
2098 			bit = (e->value[i] >> j) & 1;
2099 			if ((bitcount == 0) && (bit == 0)) {
2100 				conv_i32_to_d32_and_d16(d32r, d16r,
2101 				    prod, nlen);
2102 				mont_mulf_noconv(prod, d32r, d16r,
2103 				    dt, dn, nint, nlen, dn0);
2104 			} else {
2105 				bitcount++;
2106 				p = p * 2 + bit;
2107 				if (bit == 1) {
2108 					k = k + l + 1;
2109 					l = 0;
2110 				} else {
2111 					l++;
2112 				}
2113 				if (bitcount == groupbits) {
2114 					for (m = 0; m < k; m++) {
2115 						conv_i32_to_d32_and_d16(d32r,
2116 						    d16r, prod, nlen);
2117 						mont_mulf_noconv(prod, d32r,
2118 						    d16r, dt, dn, nint,
2119 						    nlen, dn0);
2120 					}
2121 					conv_i32_to_d32(d32r, prod, nlen);
2122 					mont_mulf_noconv(prod, d32r,
2123 					    apowers[p >> (l + 1)],
2124 					    dt, dn, nint, nlen, dn0);
2125 					for (m = 0; m < l; m++) {
2126 						conv_i32_to_d32_and_d16(d32r,
2127 						    d16r, prod, nlen);
2128 						mont_mulf_noconv(prod, d32r,
2129 						    d16r, dt, dn, nint,
2130 						    nlen, dn0);
2131 					}
2132 					k = 0;
2133 					l = 0;
2134 					p = 0;
2135 					bitcount = 0;
2136 				}
2137 			}
2138 		}
2139 		bitind = BIG_CHUNK_SIZE;
2140 	}
2141 
2142 	for (m = 0; m < k; m++) {
2143 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2144 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2145 	}
2146 	if (p != 0) {
2147 		conv_i32_to_d32(d32r, prod, nlen);
2148 		mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
2149 		    dt, dn, nint, nlen, dn0);
2150 	}
2151 	for (m = 0; m < l; m++) {
2152 		conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2153 		mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2154 	}
2155 
2156 #if (BIG_CHUNK_SIZE == 32)
2157 	for (i = 0; i < nlen; i++) {
2158 		result->value[i] = prod[i];
2159 	}
2160 	for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
2161 		;
2162 #else
2163 	for (i = 0; i < nlen / 2; i++) {
2164 		result->value[i] = (uint64_t)(prod[2 * i]) +
2165 		    (((uint64_t)(prod[2 * i + 1])) << 32);
2166 	}
2167 	for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
2168 		;
2169 #endif
2170 	result->len = i + 1;
2171 
2172 ret:
2173 	for (i = apowerssize - 1; i >= 0; i--) {
2174 		if (apowers[i] != NULL)
2175 			big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
2176 	}
2177 	if (d32r != NULL) {
2178 		big_free(d32r, nlen * sizeof (double));
2179 	}
2180 	if (d16r != NULL) {
2181 		big_free(d16r, (2 * nlen + 1) * sizeof (double));
2182 	}
2183 	if (prod != NULL) {
2184 		big_free(prod, (nlen + 1) * sizeof (uint32_t));
2185 	}
2186 	if (nint != NULL) {
2187 		big_free(nint, nlen * sizeof (uint32_t));
2188 	}
2189 	if (dt != NULL) {
2190 		big_free(dt, (4 * nlen + 2) * sizeof (double));
2191 	}
2192 	if (dn != NULL) {
2193 		big_free(dn, nlen * sizeof (double));
2194 	}
2195 
2196 #ifdef _KERNEL
2197 	big_restorefp(fpu);
2198 #endif
2199 
2200 	return (err);
2201 }
2202 
2203 #endif /* USE_FLOATING_POINT */
2204 
2205 
2206 BIG_ERR_CODE
2207 big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
2208     big_modexp_ncp_info_t *info)
2209 {
2210 	BIGNUM		ma, tmp, rr;
2211 	BIG_CHUNK_TYPE	mavalue[BIGTMPSIZE];
2212 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2213 	BIG_CHUNK_TYPE	rrvalue[BIGTMPSIZE];
2214 	BIG_ERR_CODE	err;
2215 	BIG_CHUNK_TYPE	n0;
2216 
2217 	if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue)))	!=
2218 	    BIG_OK) {
2219 		return (err);
2220 	}
2221 	ma.len = 1;
2222 	ma.value[0] = 0;
2223 
2224 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2225 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2226 		goto ret1;
2227 	}
2228 
2229 	/* clear the malloced bit to help cleanup */
2230 	rr.malloced = 0;
2231 
2232 	if (n_rr == NULL) {
2233 		if ((err = big_init1(&rr, 2 * n->len + 1,
2234 		    rrvalue, arraysize(rrvalue))) != BIG_OK) {
2235 			goto ret2;
2236 		}
2237 		if (big_mont_rr(&rr, n) != BIG_OK) {
2238 			goto ret;
2239 		}
2240 		n_rr = &rr;
2241 	}
2242 
2243 	n0 = big_n0(n->value[0]);
2244 
2245 	if (big_cmp_abs(a, n) > 0) {
2246 		if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
2247 			goto ret;
2248 		}
2249 		err = big_mont_conv(&ma, &ma, n, n0, n_rr);
2250 	} else {
2251 		err = big_mont_conv(&ma, a, n, n0, n_rr);
2252 	}
2253 	if (err != BIG_OK) {
2254 		goto ret;
2255 	}
2256 
2257 	tmp.len = 1;
2258 	tmp.value[0] = 1;
2259 	if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
2260 		goto ret;
2261 	}
2262 
2263 	if ((info != NULL) && (info->func != NULL)) {
2264 		err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
2265 		    info->ncp, info->reqp);
2266 	} else {
2267 		err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
2268 	}
2269 	if (err != BIG_OK) {
2270 		goto ret;
2271 	}
2272 
2273 	ma.value[0] = 1;
2274 	ma.len = 1;
2275 	if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
2276 		goto ret;
2277 	}
2278 	err = big_copy(result, &tmp);
2279 
2280 ret:
2281 	if (rr.malloced) {
2282 		big_finish(&rr);
2283 	}
2284 ret2:
2285 	big_finish(&tmp);
2286 ret1:
2287 	big_finish(&ma);
2288 
2289 	return (err);
2290 }
2291 
2292 BIG_ERR_CODE
2293 big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
2294 {
2295 	return (big_modexp_ext(result, a, e, n, n_rr, NULL));
2296 }
2297 
2298 
2299 BIG_ERR_CODE
2300 big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2301     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2302     BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
2303 {
2304 	BIGNUM		ap, aq, tmp;
2305 	int		alen, biglen, sign;
2306 	BIG_ERR_CODE	err;
2307 
2308 	if (p->len > q->len) {
2309 		biglen = p->len;
2310 	} else {
2311 		biglen = q->len;
2312 	}
2313 
2314 	if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
2315 		return (err);
2316 	}
2317 	if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
2318 		goto ret1;
2319 	}
2320 	if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
2321 		goto ret2;
2322 	}
2323 
2324 	/*
2325 	 * check whether a is too short - to avoid timing attacks
2326 	 */
2327 	alen = a->len;
2328 	while ((alen > p->len) && (a->value[alen - 1] == 0)) {
2329 		alen--;
2330 	}
2331 	if (alen < p->len + q->len) {
2332 		/*
2333 		 * a is too short, add p*q to it before
2334 		 * taking it modulo p and q
2335 		 * this will also affect timing, but this difference
2336 		 * does not depend on p or q, only on a
2337 		 * (in "normal" operation, this path will never be
2338 		 * taken, so it is not a performance penalty
2339 		 */
2340 		if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
2341 			goto ret;
2342 		}
2343 		if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
2344 			goto ret;
2345 		}
2346 		if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
2347 			goto ret;
2348 		}
2349 		if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2350 			goto ret;
2351 		}
2352 	} else {
2353 		if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
2354 			goto ret;
2355 		}
2356 		if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
2357 			goto ret;
2358 		}
2359 	}
2360 
2361 	if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
2362 	    BIG_OK) {
2363 		goto ret;
2364 	}
2365 	if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
2366 	    BIG_OK) {
2367 		goto ret;
2368 	}
2369 	if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
2370 		goto ret;
2371 	}
2372 	if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
2373 		goto ret;
2374 	}
2375 	sign = tmp.sign;
2376 	tmp.sign = 1;
2377 	if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2378 		goto ret;
2379 	}
2380 	if ((sign == -1) && (!big_is_zero(&aq))) {
2381 		(void) big_sub_pos(&aq, q, &aq);
2382 	}
2383 	if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
2384 		goto ret;
2385 	}
2386 	err = big_add_abs(result, &ap, &tmp);
2387 
2388 ret:
2389 	big_finish(&tmp);
2390 ret2:
2391 	big_finish(&aq);
2392 ret1:
2393 	big_finish(&ap);
2394 
2395 	return (err);
2396 }
2397 
2398 
2399 BIG_ERR_CODE
2400 big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2401     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2402     BIGNUM *p_rr, BIGNUM *q_rr)
2403 {
2404 	return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
2405 	    p, q, pinvmodq, p_rr, q_rr, NULL));
2406 }
2407 
2408 
2409 static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
2410 BIGNUM big_One = {1, 1, 1, 0, onearr};
2411 
2412 static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
2413 BIGNUM big_Two = {1, 1, 1, 0, twoarr};
2414 
2415 static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
2416 static BIGNUM big_Four = {1, 1, 1, 0, fourarr};
2417 
2418 
2419 BIG_ERR_CODE
2420 big_sqrt_pos(BIGNUM *result, BIGNUM *n)
2421 {
2422 	BIGNUM		*high, *low, *mid, *t;
2423 	BIGNUM		t1, t2, t3, prod;
2424 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2425 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2426 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2427 	BIG_CHUNK_TYPE	prodvalue[BIGTMPSIZE];
2428 	int		i, diff;
2429 	uint32_t	nbits, nrootbits, highbits;
2430 	BIG_ERR_CODE	err;
2431 
2432 	nbits = big_numbits(n);
2433 
2434 	if ((err = big_init1(&t1, n->len + 1,
2435 	    t1value, arraysize(t1value))) != BIG_OK)
2436 		return (err);
2437 	if ((err = big_init1(&t2, n->len + 1,
2438 	    t2value, arraysize(t2value))) != BIG_OK)
2439 		goto ret1;
2440 	if ((err = big_init1(&t3, n->len + 1,
2441 	    t3value, arraysize(t3value))) != BIG_OK)
2442 		goto ret2;
2443 	if ((err = big_init1(&prod, n->len + 1,
2444 	    prodvalue, arraysize(prodvalue))) != BIG_OK)
2445 		goto ret3;
2446 
2447 	nrootbits = (nbits + 1) / 2;
2448 	t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
2449 	for (i = 0; i < t1.len; i++) {
2450 		t1.value[i] = 0;
2451 		t2.value[i] = BIG_CHUNK_ALLBITS;
2452 	}
2453 	highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
2454 	if (highbits == BIG_CHUNK_SIZE) {
2455 		t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
2456 		t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
2457 	} else {
2458 		t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
2459 		t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
2460 	}
2461 
2462 	high = &t2;
2463 	low = &t1;
2464 	mid = &t3;
2465 
2466 	if ((err = big_mul(&prod, high, high)) != BIG_OK) {
2467 		goto ret;
2468 	}
2469 	diff = big_cmp_abs(&prod, n);
2470 	if (diff <= 0) {
2471 		err = big_copy(result, high);
2472 		goto ret;
2473 	}
2474 
2475 	(void) big_sub_pos(mid, high, low);
2476 	while (big_cmp_abs(&big_One, mid) != 0) {
2477 		(void) big_add_abs(mid, high, low);
2478 		(void) big_half_pos(mid, mid);
2479 		if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
2480 			goto ret;
2481 		diff = big_cmp_abs(&prod, n);
2482 		if (diff > 0) {
2483 			t = high;
2484 			high = mid;
2485 			mid = t;
2486 		} else if (diff < 0) {
2487 			t = low;
2488 			low = mid;
2489 			mid = t;
2490 		} else {
2491 			err = big_copy(result, low);
2492 			goto ret;
2493 		}
2494 		(void) big_sub_pos(mid, high, low);
2495 	}
2496 
2497 	err = big_copy(result, low);
2498 ret:
2499 	if (prod.malloced) big_finish(&prod);
2500 ret3:
2501 	if (t3.malloced) big_finish(&t3);
2502 ret2:
2503 	if (t2.malloced) big_finish(&t2);
2504 ret1:
2505 	if (t1.malloced) big_finish(&t1);
2506 
2507 	return (err);
2508 }
2509 
2510 
2511 BIG_ERR_CODE
2512 big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
2513 {
2514 	BIGNUM		*t, *tmp2, *m, *n;
2515 	BIGNUM		t1, t2, t3;
2516 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2517 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2518 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2519 	int		len, err;
2520 
2521 	if (big_is_zero(nn) ||
2522 	    (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
2523 		*jac = 0;
2524 		return (BIG_OK);
2525 	}
2526 
2527 	if (nn->len > mm->len) {
2528 		len = nn->len;
2529 	} else {
2530 		len = mm->len;
2531 	}
2532 
2533 	if ((err = big_init1(&t1, len,
2534 	    t1value, arraysize(t1value))) != BIG_OK) {
2535 		return (err);
2536 	}
2537 	if ((err = big_init1(&t2, len,
2538 	    t2value, arraysize(t2value))) != BIG_OK) {
2539 		goto ret1;
2540 	}
2541 	if ((err = big_init1(&t3, len,
2542 	    t3value, arraysize(t3value))) != BIG_OK) {
2543 		goto ret2;
2544 	}
2545 
2546 	n = &t1;
2547 	m = &t2;
2548 	tmp2 = &t3;
2549 
2550 	(void) big_copy(n, nn);
2551 	(void) big_copy(m, mm);
2552 
2553 	*jac = 1;
2554 	while (big_cmp_abs(&big_One, m) != 0) {
2555 		if (big_is_zero(n)) {
2556 			*jac = 0;
2557 			goto ret;
2558 		}
2559 		if ((m->value[0] & 1) == 0) {
2560 			if (((n->value[0] & 7) == 3) ||
2561 			    ((n->value[0] & 7) == 5))
2562 				*jac = -*jac;
2563 			(void) big_half_pos(m, m);
2564 		} else if ((n->value[0] & 1) == 0) {
2565 			if (((m->value[0] & 7) == 3) ||
2566 			    ((m->value[0] & 7) == 5))
2567 				*jac = -*jac;
2568 			(void) big_half_pos(n, n);
2569 		} else {
2570 			if (((m->value[0] & 3) == 3) &&
2571 			    ((n->value[0] & 3) == 3)) {
2572 				*jac = -*jac;
2573 			}
2574 			if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
2575 				goto ret;
2576 			}
2577 			t = tmp2;
2578 			tmp2 = m;
2579 			m = n;
2580 			n = t;
2581 		}
2582 	}
2583 	err = BIG_OK;
2584 
2585 ret:
2586 	if (t3.malloced) big_finish(&t3);
2587 ret2:
2588 	if (t2.malloced) big_finish(&t2);
2589 ret1:
2590 	if (t1.malloced) big_finish(&t1);
2591 
2592 	return (err);
2593 }
2594 
2595 
2596 BIG_ERR_CODE
2597 big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
2598 {
2599 	int		i;
2600 	uint32_t	m, w;
2601 	BIG_CHUNK_TYPE	bit;
2602 	BIGNUM		ki, tmp, tmp2;
2603 	BIG_CHUNK_TYPE	kivalue[BIGTMPSIZE];
2604 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2605 	BIG_CHUNK_TYPE	tmp2value[BIGTMPSIZE];
2606 	BIG_ERR_CODE	err;
2607 
2608 	if (big_cmp_abs(k, &big_One) == 0) {
2609 		(void) big_copy(Lk, p);
2610 		(void) big_copy(Lkminus1, &big_Two);
2611 		return (BIG_OK);
2612 	}
2613 
2614 	if ((err = big_init1(&ki, k->len + 1,
2615 	    kivalue, arraysize(kivalue))) != BIG_OK)
2616 		return (err);
2617 
2618 	if ((err = big_init1(&tmp, 2 * n->len + 1,
2619 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK)
2620 		goto ret1;
2621 
2622 	if ((err = big_init1(&tmp2, n->len,
2623 	    tmp2value, arraysize(tmp2value))) != BIG_OK)
2624 		goto ret2;
2625 
2626 	m = big_numbits(k);
2627 	ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
2628 	w = (m - 1) / BIG_CHUNK_SIZE;
2629 	bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
2630 	for (i = 0; i < ki.len; i++) {
2631 		ki.value[i] = 0;
2632 	}
2633 	ki.value[ki.len - 1] = bit;
2634 	if (big_cmp_abs(k, &ki) != 0) {
2635 		(void) big_double(&ki, &ki);
2636 	}
2637 	(void) big_sub_pos(&ki, &ki, k);
2638 
2639 	(void) big_copy(Lk, p);
2640 	(void) big_copy(Lkminus1, &big_Two);
2641 
2642 	for (i = 0; i < m; i++) {
2643 		if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
2644 			goto ret;
2645 		}
2646 		(void) big_add_abs(&tmp, &tmp, n);
2647 		(void) big_sub_pos(&tmp, &tmp, p);
2648 		if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
2649 			goto ret;
2650 		}
2651 		if ((ki.value[w] & bit) != 0) {
2652 			if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
2653 			    BIG_OK) {
2654 				goto ret;
2655 			}
2656 			(void) big_add_abs(&tmp, &tmp, n);
2657 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2658 			if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
2659 			    BIG_OK) {
2660 				goto ret;
2661 			}
2662 			(void) big_copy(Lk, &tmp2);
2663 		} else {
2664 			if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
2665 				goto ret;
2666 			}
2667 			(void) big_add_abs(&tmp, &tmp, n);
2668 			(void) big_sub_pos(&tmp, &tmp, &big_Two);
2669 			if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
2670 				goto ret;
2671 			}
2672 			(void) big_copy(Lkminus1, &tmp2);
2673 		}
2674 		bit = bit >> 1;
2675 		if (bit == 0) {
2676 			bit = BIG_CHUNK_HIGHBIT;
2677 			w--;
2678 		}
2679 	}
2680 
2681 	err = BIG_OK;
2682 
2683 ret:
2684 	if (tmp2.malloced) big_finish(&tmp2);
2685 ret2:
2686 	if (tmp.malloced) big_finish(&tmp);
2687 ret1:
2688 	if (ki.malloced) big_finish(&ki);
2689 
2690 	return (err);
2691 }
2692 
2693 
2694 BIG_ERR_CODE
2695 big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
2696 {
2697 	BIGNUM		o, nminus1, tmp, Lkminus1, Lk;
2698 	BIG_CHUNK_TYPE	ovalue[BIGTMPSIZE];
2699 	BIG_CHUNK_TYPE	nminus1value[BIGTMPSIZE];
2700 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2701 	BIG_CHUNK_TYPE	Lkminus1value[BIGTMPSIZE];
2702 	BIG_CHUNK_TYPE	Lkvalue[BIGTMPSIZE];
2703 	BIG_ERR_CODE	err;
2704 	int		e, i, jac;
2705 
2706 	if (big_cmp_abs(n, &big_One) == 0) {
2707 		return (BIG_FALSE);
2708 	}
2709 	if (big_cmp_abs(n, &big_Two) == 0) {
2710 		return (BIG_TRUE);
2711 	}
2712 	if ((n->value[0] & 1) == 0) {
2713 		return (BIG_FALSE);
2714 	}
2715 
2716 	if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
2717 	    BIG_OK) {
2718 		return (err);
2719 	}
2720 
2721 	if ((err = big_init1(&nminus1, n->len,
2722 	    nminus1value, arraysize(nminus1value))) != BIG_OK) {
2723 		goto ret1;
2724 	}
2725 
2726 	if ((err = big_init1(&tmp, 2 * n->len,
2727 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2728 		goto ret2;
2729 	}
2730 
2731 	if ((err = big_init1(&Lkminus1, n->len,
2732 	    Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
2733 		goto ret3;
2734 	}
2735 
2736 	if ((err = big_init1(&Lk, n->len,
2737 	    Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
2738 		goto ret4;
2739 	}
2740 
2741 	(void) big_sub_pos(&o, n, &big_One);	/* cannot fail */
2742 	(void) big_copy(&nminus1, &o);		/* cannot fail */
2743 	e = 0;
2744 	while ((o.value[0] & 1) == 0) {
2745 		e++;
2746 		(void) big_half_pos(&o, &o);  /* cannot fail */
2747 	}
2748 	if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
2749 	    BIG_OK) {
2750 		goto ret;
2751 	}
2752 
2753 	i = 0;
2754 	while ((i < e) &&
2755 	    (big_cmp_abs(&tmp, &big_One) != 0) &&
2756 	    (big_cmp_abs(&tmp, &nminus1) != 0)) {
2757 		if ((err =
2758 		    big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
2759 		    BIG_OK)
2760 			goto ret;
2761 		i++;
2762 	}
2763 
2764 	if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
2765 	    ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
2766 		err = BIG_FALSE;
2767 		goto ret;
2768 	}
2769 
2770 	if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
2771 		goto ret;
2772 	}
2773 
2774 	if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
2775 		goto ret;
2776 	}
2777 	if (big_cmp_abs(&tmp, n) == 0) {
2778 		err = BIG_FALSE;
2779 		goto ret;
2780 	}
2781 
2782 	(void) big_copy(&o, &big_Two);
2783 	do {
2784 		(void) big_add_abs(&o, &o, &big_One);
2785 		if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
2786 			goto ret;
2787 		}
2788 		(void) big_sub_pos(&tmp, &tmp, &big_Four);
2789 		if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
2790 			goto ret;
2791 		}
2792 	} while (jac != -1);
2793 
2794 	(void) big_add_abs(&tmp, n, &big_One);
2795 	if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
2796 		goto ret;
2797 	}
2798 
2799 	if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
2800 	    (big_cmp_abs(&Lk, &big_Two) == 0)) {
2801 		err = BIG_TRUE;
2802 	} else {
2803 		err = BIG_FALSE;
2804 	}
2805 
2806 ret:
2807 	if (Lk.malloced) big_finish(&Lk);
2808 ret4:
2809 	if (Lkminus1.malloced) big_finish(&Lkminus1);
2810 ret3:
2811 	if (tmp.malloced) big_finish(&tmp);
2812 ret2:
2813 	if (nminus1.malloced) big_finish(&nminus1);
2814 ret1:
2815 	if (o.malloced) big_finish(&o);
2816 
2817 	return (err);
2818 }
2819 
2820 
2821 BIG_ERR_CODE
2822 big_isprime_pos(BIGNUM *n)
2823 {
2824 	return (big_isprime_pos_ext(n, NULL));
2825 }
2826 
2827 
2828 #define	SIEVESIZE 1000
2829 
2830 
2831 BIG_ERR_CODE
2832 big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
2833 {
2834 	static const uint32_t smallprimes[] = {
2835 	    3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
2836 	    51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
2837 	BIG_ERR_CODE	err;
2838 	int		sieve[SIEVESIZE];
2839 	int		i;
2840 	uint32_t	off, p;
2841 
2842 	if ((err = big_copy(result, n)) != BIG_OK) {
2843 		return (err);
2844 	}
2845 	result->value[0] |= 1;
2846 	/* CONSTCOND */
2847 	while (1) {
2848 		for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
2849 		for (i = 0;
2850 		    i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
2851 			p = smallprimes[i];
2852 			off = big_modhalf_pos(result, p);
2853 			off = p - off;
2854 			if ((off % 2) == 1) {
2855 				off = off + p;
2856 			}
2857 			off = off / 2;
2858 			while (off < SIEVESIZE) {
2859 				sieve[off] = 1;
2860 				off = off + p;
2861 			}
2862 		}
2863 
2864 		for (i = 0; i < SIEVESIZE; i++) {
2865 			if (sieve[i] == 0) {
2866 				err = big_isprime_pos_ext(result, info);
2867 				if (err != BIG_FALSE) {
2868 					if (err != BIG_TRUE) {
2869 						return (err);
2870 					} else {
2871 						goto out;
2872 					}
2873 				}
2874 
2875 			}
2876 			if ((err = big_add_abs(result, result, &big_Two)) !=
2877 			    BIG_OK) {
2878 				return (err);
2879 			}
2880 		}
2881 	}
2882 
2883 out:
2884 	return (BIG_OK);
2885 }
2886 
2887 
2888 BIG_ERR_CODE
2889 big_nextprime_pos(BIGNUM *result, BIGNUM *n)
2890 {
2891 	return (big_nextprime_pos_ext(result, n, NULL));
2892 }
2893 
2894 
2895 BIG_ERR_CODE
2896 big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
2897 {
2898 	BIG_ERR_CODE err;
2899 
2900 
2901 	if ((err = big_copy(result, n)) != BIG_OK)
2902 		return (err);
2903 	result->value[0] |= 1;
2904 	while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
2905 		if (err != BIG_FALSE)
2906 			return (err);
2907 		if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
2908 			return (err);
2909 	}
2910 	return (BIG_OK);
2911 }
2912 
2913 
2914 /*
2915  * given m and e, computes the rest in the equation
2916  * gcd(m, e) = cm * m + ce * e
2917  */
2918 BIG_ERR_CODE
2919 big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
2920 {
2921 	BIGNUM		*xi, *ri, *riminus1, *riminus2, *t;
2922 	BIGNUM		*vmi, *vei, *vmiminus1, *veiminus1;
2923 	BIGNUM		t1, t2, t3, t4, t5, t6, t7, t8, tmp;
2924 	BIG_CHUNK_TYPE	t1value[BIGTMPSIZE];
2925 	BIG_CHUNK_TYPE	t2value[BIGTMPSIZE];
2926 	BIG_CHUNK_TYPE	t3value[BIGTMPSIZE];
2927 	BIG_CHUNK_TYPE	t4value[BIGTMPSIZE];
2928 	BIG_CHUNK_TYPE	t5value[BIGTMPSIZE];
2929 	BIG_CHUNK_TYPE	t6value[BIGTMPSIZE];
2930 	BIG_CHUNK_TYPE	t7value[BIGTMPSIZE];
2931 	BIG_CHUNK_TYPE	t8value[BIGTMPSIZE];
2932 	BIG_CHUNK_TYPE	tmpvalue[BIGTMPSIZE];
2933 	BIG_ERR_CODE	err;
2934 	int		len;
2935 
2936 	if (big_cmp_abs(m, e) >= 0) {
2937 		len = m->len;
2938 	} else {
2939 		len = e->len;
2940 	}
2941 
2942 	if ((err = big_init1(&t1, len,
2943 	    t1value, arraysize(t1value))) != BIG_OK) {
2944 		return (err);
2945 	}
2946 	if ((err = big_init1(&t2, len,
2947 	    t2value, arraysize(t2value))) != BIG_OK) {
2948 		goto ret1;
2949 	}
2950 	if ((err = big_init1(&t3, len,
2951 	    t3value, arraysize(t3value))) != BIG_OK) {
2952 		goto ret2;
2953 	}
2954 	if ((err = big_init1(&t4, len,
2955 	    t4value, arraysize(t3value))) != BIG_OK) {
2956 		goto ret3;
2957 	}
2958 	if ((err = big_init1(&t5, len,
2959 	    t5value, arraysize(t5value))) != BIG_OK) {
2960 		goto ret4;
2961 	}
2962 	if ((err = big_init1(&t6, len,
2963 	    t6value, arraysize(t6value))) != BIG_OK) {
2964 		goto ret5;
2965 	}
2966 	if ((err = big_init1(&t7, len,
2967 	    t7value, arraysize(t7value))) != BIG_OK) {
2968 		goto ret6;
2969 	}
2970 	if ((err = big_init1(&t8, len,
2971 	    t8value, arraysize(t8value))) != BIG_OK) {
2972 		goto ret7;
2973 	}
2974 
2975 	if ((err = big_init1(&tmp, 2 * len,
2976 	    tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2977 		goto ret8;
2978 	}
2979 
2980 	ri = &t1;
2981 	ri->value[0] = 1;
2982 	ri->len = 1;
2983 	xi = &t2;
2984 	riminus1 = &t3;
2985 	riminus2 = &t4;
2986 	vmi = &t5;
2987 	vei = &t6;
2988 	vmiminus1 = &t7;
2989 	veiminus1 = &t8;
2990 
2991 	(void) big_copy(vmiminus1, &big_One);
2992 	(void) big_copy(vmi, &big_One);
2993 	(void) big_copy(veiminus1, &big_One);
2994 	(void) big_copy(xi, &big_One);
2995 	vei->len = 1;
2996 	vei->value[0] = 0;
2997 
2998 	(void) big_copy(riminus1, m);
2999 	(void) big_copy(ri, e);
3000 
3001 	while (!big_is_zero(ri)) {
3002 		t = riminus2;
3003 		riminus2 = riminus1;
3004 		riminus1 = ri;
3005 		ri = t;
3006 		if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
3007 			goto ret;
3008 		}
3009 		if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
3010 			goto ret;
3011 		}
3012 		t = vmiminus1;
3013 		vmiminus1 = vmi;
3014 		vmi = t;
3015 		if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
3016 			goto ret;
3017 		}
3018 		if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
3019 			goto ret;
3020 		}
3021 		t = veiminus1;
3022 		veiminus1 = vei;
3023 		vei = t;
3024 		if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
3025 		    BIG_OK) {
3026 			goto ret;
3027 		}
3028 	}
3029 	if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
3030 		goto ret;
3031 	}
3032 	if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
3033 		goto ret;
3034 	}
3035 	if (ce != NULL) {
3036 		err = big_copy(ce, vei);
3037 	}
3038 ret:
3039 	if (tmp.malloced) big_finish(&tmp);
3040 ret8:
3041 	if (t8.malloced) big_finish(&t8);
3042 ret7:
3043 	if (t7.malloced) big_finish(&t7);
3044 ret6:
3045 	if (t6.malloced) big_finish(&t6);
3046 ret5:
3047 	if (t5.malloced) big_finish(&t5);
3048 ret4:
3049 	if (t4.malloced) big_finish(&t4);
3050 ret3:
3051 	if (t3.malloced) big_finish(&t3);
3052 ret2:
3053 	if (t2.malloced) big_finish(&t2);
3054 ret1:
3055 	if (t1.malloced) big_finish(&t1);
3056 
3057 	return (err);
3058 }
3059