1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/asm_linkage.h>
26#include <sys/x86_archext.h>
27#include <sys/controlregs.h>
28
29#if defined(__lint)
30
31#include <sys/types.h>
32
/*
 * Lint stubs.
 *
 * These empty C definitions stand in for the assembly entry points in
 * the non-lint half of this file.  Each one returns 0 (or nothing); none
 * of them performs any arithmetic.
 */

uint32_t
bignum_use_sse2()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_r()
{ return (0); }

/* Not to be called by C code */
/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_r()
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
void
big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* ARGSUSED */
void
big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
{}

#if defined(MMX_MANAGE)

/*
 * Entry points that exist only when MMX_MANAGE is defined.
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/*
 * The assembly half also defines big_mul_vec_sse2_fc under MMX_MANAGE;
 * give it a stub so the lint view lists every entry point.
 */
/* ARGSUSED */
void
big_mul_vec_sse2_fc(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
{}

/* Not to be called by C code */
/* ARGSUSED */
void
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
{}

#endif	/* MMX_MANAGE */

/*
 * UMUL
 *
 */

/* ARGSUSED */
uint32_t
big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }

/* ARGSUSED */
uint32_t
big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
{ return (0); }
103
104#else	/* __lint */
105
#if defined(MMX_MANAGE)

/*
 * Helper macros for the MMX_MANAGE variants of the SSE2 entry points
 * further down in this file.  They bracket MMX use with preemption
 * control, inspect CR0_TS, and save/restore %mm0 through %mm4 in a
 * scratch area carved out of the stack.
 */

#if defined(_KERNEL)

/* Expand to calls to kpr_disable / kpr_enable. */
#define	KPREEMPT_DISABLE call kpr_disable
#define	KPREEMPT_ENABLE call kpr_enable
/*
 * Copy %cr0 into reg, execute clts, then test the CR0_TS bit of the
 * copy.  ZF is left clear (a following jnz is taken) when TS was set.
 * reg keeps the original %cr0 value; the callers in this file later
 * write that value back to %cr0.
 */
#define	TEST_TS(reg)					\
	movl	%cr0, reg;				\
	clts;						\
	testl	$CR0_TS, reg

#else	/* _KERNEL */

/* Outside the kernel the preemption macros expand to nothing. */
#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

/*
 * Non-kernel form: reg is loaded with 0, so the test always leaves ZF
 * set and a following jnz is never taken.
 */
#define	TEST_TS(reg)					\
	movl	$0, reg;				\
	testl	$CR0_TS, reg

#endif	/* _KERNEL */

/* Bytes per saved MMX register, and the alignment of the save area. */
#define	MMX_SIZE 8
#define	MMX_ALIGN 8

/*
 * Reserve stack space for nreg MMX registers and leave a pointer to the
 * save area in sreg (sreg = %esp + MMX_ALIGN, then masked).
 *
 * NOTE(review): _MUL comes from an included header; assuming it expands
 * to a plain product of its two arguments, the amount reserved is
 * MMX_SIZE * nreg + MMX_ALIGN bytes -- confirm.
 * NOTE(review): the "!" in the andl operand is presumably the
 * assembler's AND-NOT operator, which would round sreg down to an
 * MMX_ALIGN boundary -- confirm against the assembler manual.
 */
#define	SAVE_MMX_PROLOG(sreg, nreg)			\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	movl	%esp, sreg;				\
	addl	$MMX_ALIGN, sreg;			\
	andl	$-1![MMX_ALIGN-1], sreg;

/* Give back the stack space taken by SAVE_MMX_PROLOG for nreg registers. */
#define	RSTOR_MMX_EPILOG(nreg)				\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

/*
 * Reserve a save area (pointer left in sreg) and store %mm0..%mm4 into
 * it at offsets 0, 8, 16, 24 and 32.
 */
#define	SAVE_MMX_0TO4(sreg)			\
	SAVE_MMX_PROLOG(sreg, 5);		\
	movq	%mm0, 0(sreg);			\
	movq	%mm1, 8(sreg);			\
	movq	%mm2, 16(sreg);			\
	movq	%mm3, 24(sreg);			\
	movq	%mm4, 32(sreg)

/*
 * Reload %mm0..%mm4 from the save area at sreg and release the stack
 * space.  sreg must still hold the pointer produced by SAVE_MMX_0TO4.
 */
#define	RSTOR_MMX_0TO4(sreg)			\
	movq	0(sreg), %mm0;			\
	movq	8(sreg), %mm1;			\
	movq	16(sreg), %mm2;			\
	movq	24(sreg), %mm3;			\
	movq	32(sreg), %mm4;			\
	RSTOR_MMX_EPILOG(5)

#endif	/* MMX_MANAGE */
157
158/ Note: this file contains implementations for
159/	big_mul_set_vec()
160/	big_mul_add_vec()
161/	big_mul_vec()
162/	big_sqr_vec()
163/ One set of implementations is for SSE2-capable models.
164/ The other uses no MMX, SSE, or SSE2 instructions, only
165/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
166/
167/ The code for the implementations is grouped by SSE2 vs UMUL,
168/ rather than grouping pairs of implementations for each function.
169/ This is because the bignum implementation gets "imprinted"
170/ on the correct implementation, at the time of first use,
171/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
173/ the "footprint" of executed code.
174
175/ Can we use SSE2 instructions?  Return value is non-zero
176/ if we can.
177/
178/ Note:
179/   Using the cpuid instruction directly would work equally
180/   well in userland and in the kernel, but we do not use the
181/   cpuid instruction in the kernel, we use x86_featureset,
182/   instead.  This means we honor any decisions the kernel
183/   startup code may have made in setting this variable,
184/   including disabling SSE2.  It might even be a good idea
185/   to honor this kind of setting in userland, as well, but
186/   the variable, x86_featureset is not readily available to
187/   userland processes.
188/
/ uint32_t
/ bignum_use_sse2()
/
/ Takes no arguments.  The result in %eax is non-zero when SSE2 may
/ be used and zero otherwise.

	ENTRY(bignum_use_sse2)
#if defined(_KERNEL)
	/ Kernel: report the SSE2 bit of x86_featureset as 0 or 1.
	xorl	%eax, %eax		/ result = 0
	bt	$X86FSET_SSE2, x86_featureset	/ CF = SSE2 feature bit
	setc	%al			/ result = CF
#else	/* _KERNEL */
	/ Userland: ask the processor directly.  cpuid overwrites %ebx,
	/ so it is preserved around the instruction.
	pushl	%ebx
	movl	$1, %eax		/ leaf 1: feature information
	cpuid
	popl	%ebx
	andl	$CPUID_INTC_EDX_SSE2, %edx	/ isolate the SSE2 bit
	movl	%edx, %eax		/ return it
#endif	/* _KERNEL */
	ret
	SET_SIZE(bignum_use_sse2)
207
208
209/ ------------------------------------------------------------------------
210/		SSE2 Implementations
211/ ------------------------------------------------------------------------
212
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Register interface:
/   r		%edx
/   a		%ebx
/   len		%ecx
/   digit	%mm3
/   return	%eax	carry digit; 0 when len == 0
/
/ Modifies %eax, %ebx, %ecx, %edx, %mm0 and %mm1.
/ Does not touch the following registers: %esi, %edi, %mm4
/ Does not execute emms; that is left to the caller.
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, it must never be called directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
/
/ Groups of 8 digits go through the unrolled loop at .L15; the
/ remaining 1 to 7 digits are handled one at a time at .L16.

#define	UNROLL		8
#define	UNROLL32	32

	ENTRY(big_mul_set_vec_sse2_r)
	/ The common exit at .L17 returns the low 32 bits of %mm0, so the
	/ carry must be cleared before the len == 0 test.  Otherwise an
	/ empty vector would return whatever %mm0 held on entry.
	pxor	%mm0, %mm0	/ cy = 0
	testl	%ecx, %ecx	/ if (len == 0) return (0);
	jz	.L17

.L15:
	cmpl	$UNROLL, %ecx
	jl	.L16
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L17
	jmp	.L15

	/ Tail: fewer than UNROLL digits remain.  Each step handles one
	/ digit and leaves as soon as the count reaches zero.
.L16:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L17

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L17:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_set_vec_sse2_r)
368
369
/ uint32_t
/ big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Computes r = a * digit over vectors of len digits and returns the
/ carry digit.  Requires the SSE2 instruction set extensions.
/
/ The stack arguments are moved into the registers expected by
/ big_mul_set_vec_sse2_r, which does the arithmetic:
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       the caller has to ensure that MMX state does not need to be
/       saved and restored and that preemption is already disabled.

#if defined(MMX_MANAGE)

	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)			/ ebx = entry %cr0; ZF clear if TS was set
	pushl	%ebx			/ keep it for the exit path (flags unchanged)
	jnz	.setvec_ts_set

	/ TS was clear: preserve %mm0-%mm4 around the computation.
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.setvec_done

	/ TS was set: no MMX registers are saved or restored.
.setvec_ts_set:
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit

.setvec_done:
	emms
	popl	%ebx			/ entry %cr0 value
	movl	%ebx, %cr0		/ put it back
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return the carry digit
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

	/ No state management and no emms: argument shuffling only.
	ENTRY(big_mul_set_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

	/ Single entry point: shuffle the arguments, do the work, and
	/ execute emms before returning.
	ENTRY(big_mul_set_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_set_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */
472
473
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ uint32_t
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Register interface:
/   r		%edx
/   a		%ebx
/   len		%ecx
/   digit	%mm3
/   return	%eax	carry digit; 0 when len == 0
/
/ Modifies %eax, %ebx, %ecx, %edx, %mm0, %mm1 and %mm2.
/ Does not execute emms; that is left to the caller.
/
/ N.B.:
/   This is strictly for internal use.
/   The interface is very light-weight.
/   All parameters are passed in registers.
/   It does not conform to the SYSV x86 ABI.
/   So, it must never be called directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ the final code.
/
/ Groups of 8 digits go through the unrolled loop at .L25; the
/ remaining 1 to 7 digits are handled one at a time at .L26.

	ENTRY(big_mul_add_vec_sse2_r)
	/ The common exit at .L27 returns the low 32 bits of %mm0, so the
	/ carry must be cleared before the len == 0 test.  Otherwise an
	/ empty vector would return whatever %mm0 held on entry.
	pxor	%mm0, %mm0	/ cy = 0
	testl	%ecx, %ecx	/ if (len == 0) return (0);
	jz	.L27

.L25:
	cmpl	$UNROLL, %ecx
	jl	.L26
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL
	jz	.L27
	jmp	.L25

	/ Tail: fewer than UNROLL digits remain.  Each step handles one
	/ digit and leaves as soon as the count reaches zero.
.L26:
	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]
	subl	$1, %ecx
	jz	.L27

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

.L27:
	movd	%mm0, %eax	/ return (cy)
	/ no emms.  caller is responsible for emms
	ret
	SET_SIZE(big_mul_add_vec_sse2_r)
654
655
/ uint32_t
/ big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Computes r = r + a * digit over vectors of len digits and returns
/ the carry digit.  Requires the SSE2 instruction set extensions.
/
/ The stack arguments are moved into the registers expected by
/ big_mul_add_vec_sse2_r, which does the arithmetic:
/
/ r		 8(%ebp)	%edx
/ a		12(%ebp)	%ebx
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       the caller has to ensure that MMX state does not need to be
/       saved and restored and that preemption is already disabled.


#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	pushl	%esi
	KPREEMPT_DISABLE
	TEST_TS(%ebx)			/ ebx = entry %cr0; ZF clear if TS was set
	pushl	%ebx			/ keep it for the exit path (flags unchanged)
	jnz	.addvec_ts_set

	/ TS was clear: preserve %mm0-%mm4 around the computation.
	pushl	%edi
	SAVE_MMX_0TO4(%edi)
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.addvec_done

	/ TS was set: no MMX registers are saved or restored.
.addvec_ts_set:
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r
	movl	%eax, %esi		/ hold the carry digit

.addvec_done:
	emms
	popl	%ebx			/ entry %cr0 value
	movl	%ebx, %cr0		/ put it back
	KPREEMPT_ENABLE
	movl	%esi, %eax		/ return the carry digit
	popl	%esi
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

	/ No state management and no emms: argument shuffling only.
	ENTRY(big_mul_add_vec_sse2_nsv)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2_nsv)


#else	/* !defined(MMX_MANAGE) */

	/ Single entry point: shuffle the arguments, do the work, and
	/ execute emms before returning.
	ENTRY(big_mul_add_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movd	20(%ebp), %mm3		/ digit
	movl	16(%ebp), %ecx		/ len
	movl	12(%ebp), %ebx		/ a
	movl	8(%ebp), %edx		/ r
	call	big_mul_add_vec_sse2_r
	popl	%ebx
	emms
	leave
	ret
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */
752
753
/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/ {
/ 	int i;
/
/ 	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/ 	for (i = 1; i < blen; ++i)
/ 		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
/ }
/
/ With MMX_MANAGE this body is assembled under the name
/ big_mul_vec_sse2_fc, calls the _nsv variants of the two helpers,
/ and executes emms itself before returning.
/
/ No frame pointer is set up; everything is addressed off %esp.
/ After the prologue (8 bytes of locals plus four pushes) the layout is:
/	 0..12(%esp)	saved %edi, %esi, %ebp, %ebx
/	16(%esp)	local: blen
/	20(%esp)	local: b
/	24(%esp)	return address
/	28(%esp)	r
/	32(%esp)	a
/	36(%esp)	alen
/	40(%esp)	b
/	44(%esp)	blen
/
/ Register use after setup:
/	%ebx = r, %esi = a, %edi = alen, %ebp = i


#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
#else
	ENTRY(big_mul_vec_sse2)
#endif
	subl	$0x8, %esp		/ room for the two locals
	pushl	%ebx
	pushl	%ebp
	pushl	%esi
	pushl	%edi
	movl	40(%esp), %eax		/ eax = b
	movl	%eax, 20(%esp)		/ local b = b
	pushl	(%eax)			/ arg 4: b[0]
	/ Each push lowers %esp by 4, so the same 40(%esp) operand names
	/ the next lower argument every time: alen, then a, then r.
	movl	40(%esp), %edi		/ edi = alen
	pushl	%edi			/ arg 3: alen
	movl	40(%esp), %esi		/ esi = a
	pushl	%esi			/ arg 2: a
	movl	40(%esp), %ebx		/ ebx = r
	pushl	%ebx			/ arg 1: r
#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
#else
	call	big_mul_set_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four arguments
	movl	%eax, (%ebx,%edi,4)	/ r[alen] = carry
	movl	44(%esp), %eax		/ eax = blen
	movl	%eax, 16(%esp)		/ local blen = blen
	cmpl	$0x1, %eax
	jle	.mulvec_rtn		/ if (blen <= 1) done
	movl	$0x1, %ebp		/ i = 1

	.align 16
.mulvec_add:
	movl	20(%esp), %eax		/ eax = b
	pushl	(%eax,%ebp,4)		/ arg 4: b[i]
	pushl	%edi			/ arg 3: alen
	pushl	%esi			/ arg 2: a
	leal	(%ebx,%ebp,4), %eax
	pushl	%eax			/ arg 1: r + i
#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
#else
	call	big_mul_add_vec_sse2
#endif
	addl	$0x10, %esp		/ drop the four arguments
	leal	(%ebp,%edi), %ecx	/ ecx = alen + i
	movl	%eax, (%ebx,%ecx,4)	/ r[alen + i] = carry
	incl	%ebp			/ ++i
	cmpl	16(%esp), %ebp
	jl	.mulvec_add		/ while (i < blen)
.mulvec_rtn:
#if defined(MMX_MANAGE)
	emms
#endif
	popl	%edi
	popl	%esi
	popl	%ebp
	popl	%ebx
	addl	$0x8, %esp		/ release the two locals
	ret
#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
#else
	SET_SIZE(big_mul_vec_sse2)
#endif
831
#if defined(MMX_MANAGE)

/ void
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/ State-managing front end for big_mul_vec_sse2_fc.  It turns
/ preemption off, records %cr0, saves %mm0-%mm4 when TS was clear,
/ forwards its five arguments, then undoes all of that.
/
/ Locals:
/	-8(%ebp)	%cr0 value seen on entry
/	-4(%ebp)	MMX save-area pointer (written only when TS was clear)

	ENTRY(big_mul_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	subl	$8, %esp
	pushl	%edi
	KPREEMPT_DISABLE
	TEST_TS(%eax)			/ eax = entry %cr0; ZF clear if TS was set
	movl	%eax, -8(%ebp)		/ remember it (mov leaves flags alone)
	jnz	.mulvec_skip_save
	SAVE_MMX_0TO4(%edi)
	movl	%edi, -4(%ebp)		/ remember where the registers went
.mulvec_skip_save:
	pushl	24(%ebp)		/ blen
	pushl	20(%ebp)		/ b
	pushl	16(%ebp)		/ alen
	pushl	12(%ebp)		/ a
	pushl	8(%ebp)			/ r
	call	big_mul_vec_sse2_fc
	addl	$20, %esp		/ drop the five arguments
	movl	-8(%ebp), %eax		/ eax = entry %cr0
	testl	$CR0_TS, %eax
	jnz	.mulvec_skip_rstr	/ nothing was saved if TS was set
	movl	-4(%ebp), %edi
	RSTOR_MMX_0TO4(%edi)
.mulvec_skip_rstr:
	movl	%eax, %cr0		/ put %cr0 back
	KPREEMPT_ENABLE
	popl	%edi
	leave
	ret
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */
872
873
874
875#undef UNROLL
876#undef UNROLL32
877
878
/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ void
/ big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
/
/ Arguments are read through %ebp: r at 8(%ebp), a at 12(%ebp) and
/ len at 16(%ebp).  With MMX_MANAGE the body is assembled as
/ big_sqr_vec_sse2_fc, which sets up no frame of its own, so %ebp must
/ already be the frame pointer of a caller that received those three
/ arguments.
/
/ Outline:
/   1. r[1..alen] = a[0] * a[1..alen-1]          (first triangle row)
/   2. add the remaining off-diagonal products a[i] * a[j], i < j
/   3. double the triangle and add the squares a[i]**2 of the diagonal
/
/ %mm4 holds the saved count across the calls to the _r helpers.

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
#else
	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
#endif

	pushl	%ebx
	pushl	%edi
	pushl	%esi

	/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/	/* High-level vector C pseudocode */
/	for (i = 1; i < alen-1; ++i)
/		r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/	/* Same thing, but slightly lower level C-like pseudocode */
/	i = 1;
/	r = &arg_r[2*i + 1];
/	a = &arg_a[i + 1];
/	digit = arg_a[i];
/	cnt = alen - 3;
/	while (cnt != 0) {
/		r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/		r += 2;
/		++a;
/		--cnt;
/	}
/
/	/* Same thing, but even lower level
/	 * For example, pointers are raw pointers,
/	 * with no scaling by object size.
/	 */
/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3;  4*3 == 12; */
/	a = arg_a + 8;
/	digit = *(arg_a + 4);
/	cnt = alen - 3;
/	while (cnt != 0) {
/		cy = big_mul_add_vec_sse2_r();
/		*(r + 4 * cnt) = cy;
/		r += 8;
/		a += 4;
/		--cnt;
/	}
/
/	The loop below starts its count at alen - 2 and leaves for .L32
/	when the count reaches 1; the final single-product row is done
/	inline at .L32 instead of through another helper call.

	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {
.L31:
	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special
	jz	.L32
	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt
	jmp	.L31			/ }

	/ Last triangle row: one product, digit * a[i + 1].  The two words
	/ above it are written outright (carry, then zero), not added to.
.L32:
	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	pxor	%mm2, %mm2
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

	/ compute low-order corner
	/ p = a[0]**2
	/ r[0] = lo32(p)
	/ cy   = hi32(p)
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	/ p = 2 * r[1]
	/ t = p + cy
	/ r[1] = lo32(t)
	/ cy   = hi32(t)
	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

	/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	/ Each pass squares one inner digit and finishes two words of r;
	/ %edx and %ebx trail the words being written by 8 and 4 bytes.
	subl	$2, %ecx		/ cnt = alen - 2
.L34:
	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%mm2, %eax
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)
	psrlq	$32, %mm1		/ mm1 = carry out of the low word
	paddq	%mm1, %mm2		/ mm2 = hi32(t) + that carry

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
	jnz	.L34

	/ Carry from last triangle term must participate in doubling,
	/ but this step is not paired up with squaring an element
	/ of the inner diagonal.
	/ r[$-2..$-1] = 2 * r[$-2..$-1] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

	/ compute high-order corner and add it in
	/ p = a[alen - 1]**2
	/ t = p + cy
	/ r[alen + alen - 2] += lo32(t)
	/ cy = hi32(t)
	/ r[alen + alen - 1] += cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)
	movd	12(%edx), %mm3		/ mm3 = r[$-1]
	paddq	%mm3, %mm2		/ mm2 = r[$-1] + cy
	movd	%mm2, 12(%edx)		/ r[$-1] += cy

.L35:
	emms
	popl	%esi
	popl	%edi
	popl	%ebx

#if defined(MMX_MANAGE)
	ret
	SET_SIZE(big_sqr_vec_sse2_fc)
#else
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)
#endif
1078
1079
#if defined(MMX_MANAGE)

/ void
/ big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
/
/ State-managing front end for big_sqr_vec_sse2_fc.  It turns
/ preemption off, records %cr0, saves %mm0-%mm4 when TS was clear,
/ runs the squaring code, then undoes all of that.
/
/ big_sqr_vec_sse2_fc sets up no frame of its own and reads r, a and
/ len through %ebp, so no arguments are pushed for it; it sees the
/ arguments of this function directly.
/
/ The entry %cr0 value is carried in %eax and parked on the stack.
/ %eax is a scratch register and this function returns nothing, so
/ no callee-saved register is disturbed.

	ENTRY(big_sqr_vec_sse2)
	pushl	%ebp
	movl	%esp, %ebp
	KPREEMPT_DISABLE
	TEST_TS(%eax)			/ eax = entry %cr0; ZF clear if TS was set
	pushl	%eax			/ park it (push leaves the flags alone)
	jnz	.sqr_no_save
	pushl	%edi			/ %edi becomes the save-area pointer
	SAVE_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	popl	%edi
	jmp	.sqr_rtn

.sqr_no_save:
	call	big_sqr_vec_sse2_fc

.sqr_rtn:
	popl	%eax			/ entry %cr0 value
	movl	%eax, %cr0		/ put it back
	KPREEMPT_ENABLE
	leave
	ret
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */
1107
1108/ ------------------------------------------------------------------------
1109/		UMUL Implementations
1110/ ------------------------------------------------------------------------
1111
1112
/ uint32_t
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Computes r = a * digit over vectors of len digits and returns the
/ carry digit.
/
/ Fall-back implementation for x86 models that do not support the
/ PMULUDQ instruction: no MMX, SSE, or SSE2 instructions are used,
/ only the unsigned 32 X 32 -> 64 multiply instruction, MUL.
/
/ r		 8(%ebp)	%edi
/ a		12(%ebp)	%esi
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	(multiplied straight from the stack)
/ carry				%ebx

	ENTRY(big_mul_set_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ carry = 0
	testl	%ecx, %ecx
	jz	.L57			/ len == 0: return the zero carry

.L55:
	movl	(%esi), %eax		/ eax = *a
	mull	20(%ebp)		/ edx:eax = *a * digit
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += carry
	movl	%eax, (%edi)		/ *r = low word of the sum
	movl	%edx, %ebx		/ carry = high word of the sum
	addl	$4, %esi		/ ++a
	addl	$4, %edi		/ ++r
	decl	%ecx			/ --len
	jnz	.L55			/ repeat until len == 0
.L57:
	movl	%ebx, %eax		/ return the carry
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_set_vec_umul)
1160
1161
/ uint32_t
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Computes r = r + a * digit over vectors of len digits and returns
/ the carry digit.
/
/ Fall-back implementation for x86 models that do not support the
/ PMULUDQ instruction: no MMX, SSE, or SSE2 instructions are used,
/ only the unsigned 32 X 32 -> 64 multiply instruction, MUL.
/
/ r		 8(%ebp)	%edi
/ a		12(%ebp)	%esi
/ len		16(%ebp)	%ecx
/ digit		20(%ebp)	(multiplied straight from the stack)
/ carry				%ebx

	ENTRY(big_mul_add_vec_umul)
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	pushl	%edi
	pushl	%ebx
	movl	8(%ebp), %edi		/ edi = r
	movl	12(%ebp), %esi		/ esi = a
	movl	16(%ebp), %ecx		/ ecx = len
	xorl	%ebx, %ebx		/ carry = 0
	testl	%ecx, %ecx
	jz	.L67			/ len == 0: return the zero carry
	.align 4
.L65:
	movl	(%esi), %eax		/ eax = *a
	mull	20(%ebp)		/ edx:eax = *a * digit
	addl	(%edi), %eax
	adcl	$0, %edx		/ edx:eax += *r
	addl	%ebx, %eax
	adcl	$0, %edx		/ edx:eax += carry
	movl	%eax, (%edi)		/ *r = low word of the sum
	movl	%edx, %ebx		/ carry = high word of the sum
	addl	$4, %esi		/ ++a
	addl	$4, %edi		/ ++r
	decl	%ecx			/ --len
	jnz	.L65			/ repeat until len == 0
.L67:
	movl	%ebx, %eax		/ return the carry
	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	ret
	SET_SIZE(big_mul_add_vec_umul)
1211
1212#endif	/* __lint */
1213